-
-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Melanoma task #310
base: main
Are you sure you want to change the base?
Melanoma task #310
Changes from all commits
1186639
ae5709c
bb37f7b
b59e3e2
996780b
72c535e
1c0e12e
e950423
d3ceff2
9566b52
47b090b
9794efd
5f71fe9
dd8da0c
9b1c240
9cff991
46707fe
6ec3b21
9e8ebb4
a7c655d
0f9f547
10f5da9
1252718
681779e
a4908c0
3527acd
3ead917
968c035
ab7bc64
394de72
30697db
3ce80de
e6d1c9f
d6bbca2
439762c
0c259ca
cf1148e
bc0f7c8
07d3fea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,4 +14,4 @@ inst/doc | |
/doc/ | ||
/Meta/ | ||
CRAN-SUBMISSION | ||
paper/data | ||
paper/data |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
#' @title Melanoma Image classification | ||
#' @name mlr_tasks_melanoma | ||
#' @description | ||
#' Classification of melanoma tumor images. | ||
#' | ||
#' The data comes from the 2020 SIIM-ISIC challenge. | ||
#' | ||
#' @section Construction: | ||
#' ``` | ||
#' tsk("melanoma") | ||
#' ``` | ||
#' | ||
#' @template task_download | ||
#' | ||
#' @source | ||
#' \url{https://challenge2020.isic-archive.com/} | ||
#' | ||
#' @section Properties: | ||
#' `r rd_info_task_torch("melanoma", missings = FALSE)` | ||
#' | ||
#' @references | ||
#' `r format_bib("melanoma2021")` | ||
#' @examplesIf torch::torch_is_installed() | ||
#' task = tsk("melanoma") | ||
#' task | ||
NULL | ||
|
||
# Downloads the reduced ISIC 2020 melanoma data into `path` and assembles the
# table from which the melanoma task is built.
#
# @param path (`character(1)`)\cr
#   The cache_dir/datasets/melanoma folder. The tarball is downloaded into and
#   extracted inside this folder.
# @return A `data.table` with the combined train/test metadata (including a
#   `split` column) plus an `image` column holding a lazy tensor.
constructor_melanoma = function(path) {
  require_namespaces("curl")

  base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/"

  compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz"
  compressed_tarball_path = file.path(path, compressed_tarball_file_name)
  # Register the cleanup before any work is done so that a (possibly partial)
  # tarball is also removed when the download or the extraction fails.
  # `unlink()` stays silent when the file was never created.
  on.exit(unlink(compressed_tarball_path), add = TRUE)
  curl::curl_download(paste0(base_url, compressed_tarball_file_name), compressed_tarball_path)
  utils::untar(compressed_tarball_path, exdir = path)

  training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv"
  training_metadata = data.table::fread(file.path(path, training_metadata_file_name))

  test_metadata_file_name = "ISIC_2020_Test_Metadata.csv"
  # The CSV has to be read here: renaming columns and adding `split` below
  # require a data.table, not the path to the file.
  test_metadata = data.table::fread(file.path(path, test_metadata_file_name))

  training_metadata[, split := "train"]
  # Align the test column names with the training metadata before stacking.
  setnames(test_metadata,
    old = c("image", "patient", "anatom_site_general"),
    new = c("image_name", "patient_id", "anatom_site_general_challenge")
  )
  test_metadata[, split := "test"]

  # The response column needs to be filled for the test data: `fill = TRUE`
  # pads every column that exists in only one of the two tables with NA.
  # NOTE(review): a reviewer asked "what is being filled here?" -- presumably
  # the label columns that only the training metadata has; confirm against
  # the CSV files.
  metadata = rbind(training_metadata, test_metadata, fill = TRUE)

  metadata[, image_name := NULL]
  metadata[, target := NULL]
  setnames(metadata, old = "benign_malignant", new = "outcome")

  melanoma_ds_generator = torch::dataset(
    initialize = function() {
      self$.metadata = metadata
      self$.path = path
    },
    .getitem = function(idx) {
      force(idx)

      # NOTE(review): assumes the metadata has a `file_name` column with paths
      # relative to `path` -- confirm against the CSV files.
      x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name)))
      x = torchvision::transform_to_tensor(x)

      return(list(x = x))
    },
    .length = function() {
      nrow(self$.metadata)
    }
  )

  melanoma_ds = melanoma_ds_generator()

  dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128)))
  lt = lazy_tensor(dd)

  # NOTE(review): the return value of this function is cached. A reviewer
  # raised a concern about what ends up in the cache and pointed to the tiny
  # imagenet task as a reference for how it is done there.
  return(cbind(metadata, data.table(image = lt)))
}
|
||
# Builds the melanoma classification task on top of a lazily constructed
# backend and registers it under the key "melanoma".
#
# @param id (`character(1)`)\cr
#   Identifier of the resulting task.
# @return A `TaskClassif` with target `outcome`, filtered to the first
#   `n_train` rows.
load_task_melanoma = function(id = "melanoma") {
  # Row counts used for the lazy backend's row ids and for the final filter.
  # NOTE(review): presumably the sizes of the training and test metadata
  # (the constructor stacks train before test) -- confirm against the data.
  n_train = 32701L
  n_test = 10982L

  cached_constructor = function(backend) {
    data = cached(constructor_melanoma, "datasets", "melanoma")$data

    # `outcome` is referenced as a column here; wrapping it in `get()` would
    # look up an object named after the column's values and fail.
    data[, outcome := factor(outcome, levels = c("benign", "malignant"))]

    char_features = c("sex", "anatom_site_general_challenge")
    data[, (char_features) := lapply(.SD, factor), .SDcols = char_features]

    dt = cbind(
      data,
      data.table(
        ..row_id = seq_len(nrow(data))
      )
    )

    DataBackendDataTable$new(data = dt, primary_key = "..row_id")
  }

  backend = DataBackendLazy$new(
    constructor = cached_constructor,
    rownames = seq_len(n_train + n_test),
    col_info = load_col_info("melanoma"),
    primary_key = "..row_id"
  )

  task = TaskClassif$new(
    backend = backend,
    # Honour the `id` argument instead of hard-coding the default.
    id = id,
    target = "outcome",
    label = "Melanoma Classification"
  )

  task$set_col_roles("patient_id", "group")
  task$col_roles$feature = c("sex", "anatom_site_general_challenge", "age_approx", "image")

  backend$hash = task$man = "mlr3torch::mlr_tasks_melanoma"

  # Keep only the first `n_train` rows in the task.
  task$filter(seq_len(n_train))

  return(task)
}

register_task("melanoma", load_task_melanoma)
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
library(torch) | ||
library(torchvision) | ||
library(mlr3torch) | ||
library(here) | ||
|
||
library(data.table) | ||
setDTthreads(threads = 1) | ||
|
||
# Benchmark: compare `torchvision::base_loader()` against
# `magick::image_read()` for reading the melanoma training images.

# hard-coded cache directory that I use locally
cache_dir = here("cache")

# Creates a torch dataset generator over the first `n_images` rows of the
# training metadata.
#
# @param read_image (`function`)\cr
#   Takes a file path and returns an image object that is then passed to
#   `torchvision::transform_to_tensor()`.
# @param with_name (`logical(1)`)\cr
#   Whether `.getitem()` also returns the image name next to the tensor.
# @return A dataset generator that is called with `n_images`.
make_loader_dataset = function(read_image, with_name = FALSE) {
  torch::dataset(
    initialize = function(n_images) {
      metadata = fread(file.path(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))
      # `seq_len()` also behaves for `n_images = 0`, unlike `1:n_images`.
      self$.metadata = metadata[seq_len(n_images), ]
      self$.path = file.path(cache_dir, "train")
    },
    .getitem = function(idx) {
      force(idx)

      image_name = self$.metadata[idx, ]$image_name

      x = read_image(file.path(self$.path, paste0(image_name, ".jpg")))
      x = torchvision::transform_to_tensor(x)

      if (with_name) {
        return(list(x = x, image_name = image_name))
      }
      return(list(x = x))
    },
    .length = function() {
      nrow(self$.metadata)
    }
  )
}

ds_base_loader = make_loader_dataset(torchvision::base_loader)
ds_magick_loader = make_loader_dataset(magick::image_read, with_name = TRUE)

n_images = 10

ds_base = ds_base_loader(n_images)
ds_magick = ds_magick_loader(n_images)

bmr = bench::mark(
  for (i in seq_len(n_images)) ds_base$.getitem(i),
  for (i in seq_len(n_images)) ds_magick$.getitem(i),
  memory = FALSE
)

print(bmr)
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
devtools::load_all() | ||
|
||
# manually construct the task once | ||
# library(here) | ||
# library(data.table) | ||
library(data.table) | ||
withr::local_options(mlr3torch.cache = TRUE) | ||
|
||
# Downloads the reduced ISIC 2020 melanoma data into `path` and assembles the
# table that is used below to build the task.
#
# @param path (`character(1)`)\cr
#   Folder into which the tarball is downloaded and extracted.
# @return A `data.table` with the combined train/test metadata (including a
#   `split` column) plus an `image` column holding a lazy tensor.
constructor_melanoma = function(path) {
  require_namespaces("curl")

  base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/"

  compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz"
  compressed_tarball_path = file.path(path, compressed_tarball_file_name)
  # Register the cleanup first so that a (possibly partial) tarball is removed
  # even when the download or the extraction fails. `unlink()` stays silent
  # when the file was never created.
  on.exit(unlink(compressed_tarball_path), add = TRUE)
  curl::curl_download(paste0(base_url, compressed_tarball_file_name), compressed_tarball_path)
  utils::untar(compressed_tarball_path, exdir = path)

  training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv"
  training_metadata = fread(file.path(path, training_metadata_file_name))

  test_metadata_file_name = "ISIC_2020_Test_Metadata.csv"
  test_metadata = fread(file.path(path, test_metadata_file_name))

  training_metadata[, split := "train"]
  # Align the test column names with the training metadata before stacking.
  setnames(test_metadata,
    old = c("image", "patient", "anatom_site_general"),
    new = c("image_name", "patient_id", "anatom_site_general_challenge")
  )
  test_metadata[, split := "test"]

  # The response column needs to be filled for the test data: `fill = TRUE`
  # pads every column that exists in only one of the two tables with NA.
  metadata = rbind(training_metadata, test_metadata, fill = TRUE)
  metadata[, image_name := NULL]
  metadata[, target := NULL]
  setnames(metadata, old = "benign_malignant", new = "outcome")

  melanoma_ds_generator = torch::dataset(
    initialize = function() {
      self$.metadata = metadata
      self$.path = path
    },
    .getitem = function(idx) {
      force(idx)

      # NOTE(review): assumes the metadata has a `file_name` column with paths
      # relative to `path` -- confirm against the CSV files.
      x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name)))
      x = torchvision::transform_to_tensor(x)

      return(list(x = x))
    },
    .length = function() {
      nrow(self$.metadata)
    }
  )

  melanoma_ds = melanoma_ds_generator()

  dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128)))
  lt = lazy_tensor(dd)

  return(cbind(metadata, data.table(image = lt)))
}
|
||
# Construct the data once (timing the construction), build the task from it
# and store the backend's column info in inst/col_info/melanoma.rds.
bench::system_time(melanoma_dt <- constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")))
# melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma"))

# change the encoding of the outcome variable
# `outcome` is referenced as a column; wrapping it in `get()` would look up an
# object named after the column's values and fail.
melanoma_dt[, outcome := factor(outcome, levels = c("benign", "malignant"))]

char_features = c("sex", "anatom_site_general_challenge")
melanoma_dt[, (char_features) := lapply(.SD, factor), .SDcols = char_features]

tsk_melanoma = as_task_classif(melanoma_dt, target = "outcome", id = "melanoma")
tsk_melanoma$set_col_roles("patient_id", "group")
tsk_melanoma$col_roles$feature = c(char_features, "age_approx", "image")

tsk_melanoma$label = "Melanoma Classification"

ci = col_info(tsk_melanoma$backend)

saveRDS(ci, here::here("inst/col_info/melanoma.rds"))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
because curl is in suggests, we should run
mlr3misc::require_namespaces("curl")
before so users get a good error message when they don't have it installed. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But we should just write
require_namespaces()
without the mlr3misc::
right? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes!