From 1186639f24976113c13973607fe47891f395560f Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Sun, 20 Oct 2024 21:35:15 +0200 Subject: [PATCH 01/46] skeleton based on Sebastian's description and the MNIST task --- R/TaskClassif_melanoma.R | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 R/TaskClassif_melanoma.R diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R new file mode 100644 index 00000000..d1d2b5ef --- /dev/null +++ b/R/TaskClassif_melanoma.R @@ -0,0 +1,43 @@ +#' @title Melanoma Image classification +#' @name mlr_tasks_melanoma +#' @description +#' Classification of melanoma tumor images. +#' +#' More descriptive text. +#' +#' @section Construction: +#' ``` +#' tsk("melanoma") +#' ``` +#' +#' @template task_download +#' +#' @source +#' \url{https://www.kaggle.com/c/siim-isic-melanoma-classification/data} +#' +#' @section Properties: +#' `r rd_info_task_torch("mnmelanoma", missings = FALSE)` +#' +#' @references +#' `r format_bib("melanoma")` +#' @examplesIf torch::torch_is_installed() +#' task = tsk("melanoma") +#' task +NULL + +load_task_melanoma = function(id = "melanoma") { + # construct a DataBackendLazy for this large dataset + + # the DataBackendLazy implements the logic for downloading, processing, caching the dataset. + # in this caes, we only need to implement the download and processing becuase the private `cached()` function implements caching + + # the DataBackendLazy also hardcodes some metadata that will be available even before the data is downloaded. 
+ # this metadata will be stored in `.inst/col_info` + # and can be loaded using `load_column_info()` + # the code that generates this hardcoded metadata should be in `./data-raw` + + # create a TaskClassif from this DataBackendLazy + return(task) +} + +register_task("melanoma", load_task_melanoma) \ No newline at end of file From ae5709c895e0d922ee2674c0c8aa9ff73b0f73a1 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Sun, 20 Oct 2024 21:44:11 +0200 Subject: [PATCH 02/46] added initial test file, script where I will interactively try out the task --- tests/temp-TaskClassif_melanoma.R | 0 tests/testthat/test_TaskClassif_melanoma.R | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/temp-TaskClassif_melanoma.R create mode 100644 tests/testthat/test_TaskClassif_melanoma.R diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R new file mode 100644 index 00000000..e69de29b diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R new file mode 100644 index 00000000..013b2189 --- /dev/null +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -0,0 +1,18 @@ +skip_on_cran() + +test_that("melanoma task works", { + withr::local_options(mlr3torch.cache = TRUE) + task = tsk("melanoma") + # this makes the test faster + # task$row_roles$use = 1:10 + # expect_equal(task$id, "mnist") + # expect_equal(task$label, "MNIST Digit Classification") + # expect_equal(task$feature_names, "image") + # expect_equal(task$target_names, "label") + # expect_equal(task$man, "mlr3torch::mlr_tasks_mnist") + # expect_equal(task$properties, "multiclass") + + # x = materialize(task$data(task$row_ids[1:2], cols = "image")[[1L]], rbind = TRUE) + # expect_equal(x$shape, c(2, 1, 28, 28)) + # expect_equal(x$dtype, torch_float32()) +}) \ No newline at end of file From bb37f7b476ad2b7186bdf7d2be974ce55c2fcef0 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Tue, 22 Oct 2024 13:22:01 +0200 Subject: [PATCH 
03/46] added more skeleton files --- R/TaskClassif_melanoma.R | 28 +++++++++++++++++++++++++++- data-raw/melanoma.R | 11 +++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 data-raw/melanoma.R diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index d1d2b5ef..6d99f611 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -25,11 +25,30 @@ #' task NULL +# @param path (`character(1)`)\cr +# The cache_dir/datasets/melanoma folder +constructor_melanoma = function(path) { + # download data + # TODO: look at the similar code from the `torchdatasets` package and decide what you want to include + + data.table( + # image: ltsnr + # metadata cols + ) +} + load_task_melanoma = function(id = "melanoma") { # construct a DataBackendLazy for this large dataset + backend = DataBackendLazy$new( + constructor = cached_constructor, + rownames = seq_len(n_rows), # TODO: compute + col_info = load_col_info("melanoma") + primary_key = "..row_id" # TODO: explain + ) # the DataBackendLazy implements the logic for downloading, processing, caching the dataset. - # in this caes, we only need to implement the download and processing becuase the private `cached()` function implements caching + # in this case, we only need to implement the download and processing because the private `cached()` function implements caching + # TODO: find this private `cached()` function # the DataBackendLazy also hardcodes some metadata that will be available even before the data is downloaded. 
# this metadata will be stored in `.inst/col_info` @@ -37,6 +56,13 @@ load_task_melanoma = function(id = "melanoma") { # the code that generates this hardcoded metadata should be in `./data-raw` # create a TaskClassif from this DataBackendLazy + task = TaskClassif$new( + backend = backend, + id = "melanoma", + target = "class", + label = "Melanoma classification" + ) + return(task) } diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R new file mode 100644 index 00000000..b9f9c812 --- /dev/null +++ b/data-raw/melanoma.R @@ -0,0 +1,11 @@ +devtools::load_all() + +ci = col_info(get_private(tsk("melanoma")$backend)$.constructor()) + +saveRDS(ci, here::here("inst/col_ino/melanoma.rds")) + +mlr3::DataBackendCbind$new(c) + +# split + +# ci \ No newline at end of file From b59e3e2c4c80b2711798daebb25715790027641b Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 08:38:43 +0200 Subject: [PATCH 04/46] there exists code that downloads and unzips --- R/TaskClassif_melanoma.R | 76 ++++++++++++++++++++++++++----- R/download_melanoma.R | 23 ++++++++++ data-raw/melanoma.R | 2 + tests/temp-TaskClassif_melanoma.R | 15 ++++++ 4 files changed, 104 insertions(+), 12 deletions(-) create mode 100644 R/download_melanoma.R diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 6d99f611..1035bec7 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -2,23 +2,23 @@ #' @name mlr_tasks_melanoma #' @description #' Classification of melanoma tumor images. -#' +#' #' More descriptive text. 
-#' +#' #' @section Construction: #' ``` #' tsk("melanoma") #' ``` -#' +#' #' @template task_download -#' -#' @source -#' \url{https://www.kaggle.com/c/siim-isic-melanoma-classification/data} -#' +#' +#' @source +#' \url{https://challenge2020.isic-archive.com/} +#' #' @section Properties: #' `r rd_info_task_torch("mnmelanoma", missings = FALSE)` -#' -#' @references +#' +#' @references #' `r format_bib("melanoma")` #' @examplesIf torch::torch_is_installed() #' task = tsk("melanoma") @@ -29,7 +29,38 @@ NULL # The cache_dir/datasets/melanoma folder constructor_melanoma = function(path) { # download data - # TODO: look at the similar code from the `torchdatasets` package and decide what you want to include + training_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip" + training_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv" + training_metadata_v2_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv" + training_duplicate_image_list_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_Duplicates.csv" + + test_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_JPEG.zip" + test_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_Metadata.csv" + + urls = c( + training_jpeg_images_url, training_metadata_url, training_metadata_v2_url, training_duplicate_image_list_url, + test_jpeg_images_url, test_metadata_url + ) + + download_melanoma_file = function(url) { + # change if necessary + prev_options = options(timeout = 3600) + on.exit(options(prev_options)) + + download.file(url, here(cache_dir, basename(url))) + } + + mlr3misc::walk(urls, download_melanoma_file) + + unzip(here(cache_dir, basename(training_jpeg_images_url)), exdir = here(cache_dir)) + unzip(here(cache_dir, basename(test_jpeg_images_url)), exdir = here(cache_dir)) + + train_metadata = 
fread(here(path, basename(test_jpeg_images_url))) + # train_images = + + # TODO: decide whether to delete these, since there are no ground truth labels + # test_metadata = fread(here(path, basename(test_metadata_url))) + # test_images = fread(here()) data.table( # image: ltsnr @@ -38,15 +69,30 @@ constructor_melanoma = function(path) { } load_task_melanoma = function(id = "melanoma") { + cached_constructor = function(backend) { + data = cached(constructor_melanoma, "datasets", "melanoma")$data + labels = ... + + ds = dataset( + + )(data$image) + + # some preprocessing + + # TODO: determine the end dimensionality + data_descriptor = DataDescriptor$new(dataset = ds, list(image = c(NA, channel, spatial_dims))) + } + # construct a DataBackendLazy for this large dataset backend = DataBackendLazy$new( constructor = cached_constructor, rownames = seq_len(n_rows), # TODO: compute + # hard-coded info about the task (nrows, ncols) col_info = load_col_info("melanoma") primary_key = "..row_id" # TODO: explain ) - # the DataBackendLazy implements the logic for downloading, processing, caching the dataset. + # the DataBackendLazy implements the logic for downloading, processing, caching the dataset. 
# in this case, we only need to implement the download and processing because the private `cached()` function implements caching # TODO: find this private `cached()` function @@ -66,4 +112,10 @@ load_task_melanoma = function(id = "melanoma") { return(task) } -register_task("melanoma", load_task_melanoma) \ No newline at end of file +register_task("melanoma", load_task_melanoma) + +download_melanoma = function() { + download.file("https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv", + dest = "~/Downloads/metadata.csv" + ) +} diff --git a/R/download_melanoma.R b/R/download_melanoma.R new file mode 100644 index 00000000..9e27eca8 --- /dev/null +++ b/R/download_melanoma.R @@ -0,0 +1,23 @@ +training_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip" +training_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv" +training_metadata_v2_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv" +training_duplicate_image_list_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_Duplicates.csv" + +test_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_JPEG.zip" +test_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_Metadata.csv" + + +urls = c( + training_jpeg_images_url + # training_metadata_url, training_metadata_v2_url, training_duplicate_image_list_url +) + +unzip(here(cache_dir, basename(training_jpeg_images_url))) + +options(timeout = 36000) + +download_melanoma_file = function(url) { + download.file(url, here::here("cache", basename(url))) +} + +mlr3misc::walk(urls, download_melanoma_file) diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index b9f9c812..8f932124 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -1,5 +1,7 @@ devtools::load_all() +# manually construct the task once + ci = 
col_info(get_private(tsk("melanoma")$backend)$.constructor()) saveRDS(ci, here::here("inst/col_ino/melanoma.rds")) diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R index e69de29b..4fd6f019 100644 --- a/tests/temp-TaskClassif_melanoma.R +++ b/tests/temp-TaskClassif_melanoma.R @@ -0,0 +1,15 @@ +library(mlr3torch) +library(here) + +library(tidytable) + +# TODO: figure out whether we want the v2 file +ground_truth = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) + +ground_truth + +# TODO: figure out how to unzip the actual images + +# Construct lazy tensor for each image (e.g. a data table with a single ltnsr column) + +# Join with the ground truth file From 996780bbc1dd57ca36ade2c5725611c05a8b7f07 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 08:43:19 +0200 Subject: [PATCH 05/46] extra comment --- R/TaskClassif_melanoma.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 1035bec7..3daacb5d 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -3,7 +3,7 @@ #' @description #' Classification of melanoma tumor images. #' -#' More descriptive text. +#' The data comes from the 2020 ISIC challenge. 
#' #' @section Construction: #' ``` From 72c535e10ca6f64ae2feb09bfdf2b780126f6c8d Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 10:56:25 +0200 Subject: [PATCH 06/46] dataset constructs --- R/TaskClassif_melanoma.R | 6 ----- tests/temp-TaskClassif_melanoma.R | 45 ++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 3daacb5d..b53a3796 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -113,9 +113,3 @@ load_task_melanoma = function(id = "melanoma") { } register_task("melanoma", load_task_melanoma) - -download_melanoma = function() { - download.file("https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv", - dest = "~/Downloads/metadata.csv" - ) -} diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R index 4fd6f019..d577f1f7 100644 --- a/tests/temp-TaskClassif_melanoma.R +++ b/tests/temp-TaskClassif_melanoma.R @@ -1,15 +1,52 @@ +library(torch) +library(torchvision) library(mlr3torch) library(here) +library(data.table) library(tidytable) # TODO: figure out whether we want the v2 file -ground_truth = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) +# I think no, since I don't really see a "use" for the lesion ID +training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) +# training_metadata_v2 = fread(here::here("cache", "ISIC_2020_Training_GroundTruth_v2.csv")) + +cache_dir = here("cache") +# construct a torch dataset +ds = torch::dataset( + initialize = function() { + self$.metadata = fread(here("cache", "ISIC_2020_Training_GroundTruth.csv")) + self$.path = file.path(here(cache_dir), "train") + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + # TODO: decide on these transformations + x = 
torchvision::transform_to_tensor(x) %>% torchvision::transform_rgb_to_grayscale() + + # TODO: should we only return the images here? + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } +) + +melanoma_ds = ds() -ground_truth -# TODO: figure out how to unzip the actual images # Construct lazy tensor for each image (e.g. a data table with a single ltnsr column) -# Join with the ground truth file +# Join with the metadata file + +# TODO: figure out the dimensions. The images each have a different dimension. +# dd_dims = c(NA, ) +# dd = as_data_descriptor(ds, list(x = dd_dims)) + +lt = lazy_tensor(dd) + +dt_train = inner_join(training_metadata, data.table(..., x = lt), by = image_name) + +# as_task_regr(dt_train, target = "corr", id = "guess_the_correlation") \ No newline at end of file From 1c0e12e7d100de00e96484a95b02a57e9e6113be Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 12:10:27 +0200 Subject: [PATCH 07/46] benchmark code for image loaders --- .../image_loaders/benchmark_image_loaders.R | 62 +++++++++++++++++++ tests/temp-TaskClassif_melanoma.R | 18 +++--- 2 files changed, 69 insertions(+), 11 deletions(-) create mode 100644 benchmarks/image_loaders/benchmark_image_loaders.R diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R new file mode 100644 index 00000000..51c4b5e7 --- /dev/null +++ b/benchmarks/image_loaders/benchmark_image_loaders.R @@ -0,0 +1,62 @@ +library(torch) +library(torchvision) +library(mlr3torch) +library(here) + +library(data.table) +library(tidytable) + +training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) + +# hard-coded cache directory that I use locally +cache_dir = here("cache") + +# construct a torch dataset +ds_base_loader = torch::dataset( + initialize = function(n_images) { + self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))[1:n_images, ] + self$.path = 
file.path(here(cache_dir), "train") + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } +) + +ds_magick_loader = torch::dataset( + initialize = function(n_images) { + self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))[1:n_images, ] + self$.path = file.path(here(cache_dir), "train") + }, + .getitem = function(idx) { + force(idx) + + image_name = self$.metadata[idx, ]$image_name + + x = torchvision::magick_loader(file.path(self$.path, paste0(image_name, ".jpg"))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x, image_name = image_name)) + }, + .length = function() { + nrow(self$.metadata) + } +) + +n_images = 10 + +ds_base = ds_base_loader(n_images) +ds_magick = ds_magick_loader(n_images) + +bench::mark( + for (i in 1:n_images) ds_base$.getitem(i), + for (i in 1:n_images) ds_magick$.getitem(i), + memory = FALSE +) diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R index d577f1f7..539b4bd9 100644 --- a/tests/temp-TaskClassif_melanoma.R +++ b/tests/temp-TaskClassif_melanoma.R @@ -15,7 +15,7 @@ cache_dir = here("cache") # construct a torch dataset ds = torch::dataset( initialize = function() { - self$.metadata = fread(here("cache", "ISIC_2020_Training_GroundTruth.csv")) + self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv")) self$.path = file.path(here(cache_dir), "train") }, .getitem = function(idx) { @@ -23,9 +23,9 @@ ds = torch::dataset( x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) # TODO: decide on these transformations - x = torchvision::transform_to_tensor(x) %>% torchvision::transform_rgb_to_grayscale() + x = torchvision::transform_to_tensor(x) - # TODO: should we only return the images 
here? + # TODO: should we only return the images here? I think yes return(list(x = x)) }, .length = function() { @@ -35,18 +35,14 @@ ds = torch::dataset( melanoma_ds = ds() - - -# Construct lazy tensor for each image (e.g. a data table with a single ltnsr column) - -# Join with the metadata file - -# TODO: figure out the dimensions. The images each have a different dimension. # dd_dims = c(NA, ) -# dd = as_data_descriptor(ds, list(x = dd_dims)) +dd = as_data_descriptor(melanoma_ds, list(x = dd_dims)) +# Construct lazy tensor for each image (e.g. a data table with a single ltnsr column) +# TODO: confirm that this maintains the same ordering lt = lazy_tensor(dd) +# Join with the metadata file dt_train = inner_join(training_metadata, data.table(..., x = lt), by = image_name) # as_task_regr(dt_train, target = "corr", id = "guess_the_correlation") \ No newline at end of file From e9504232c4c99c77c775df14cbbb52d92571b4c1 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 12:12:45 +0200 Subject: [PATCH 08/46] idrk --- benchmarks/image_loaders/benchmark_image_loaders.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R index 51c4b5e7..41b5424e 100644 --- a/benchmarks/image_loaders/benchmark_image_loaders.R +++ b/benchmarks/image_loaders/benchmark_image_loaders.R @@ -50,7 +50,7 @@ ds_magick_loader = torch::dataset( } ) -n_images = 10 +n_images = 100 ds_base = ds_base_loader(n_images) ds_magick = ds_magick_loader(n_images) From d3ceff28b99b4ae35154cfc56226c70a83f038df Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 12:20:16 +0200 Subject: [PATCH 09/46] added resize script for melanoma dataset --- R/resize_melanoma.R | 8 ++++++++ benchmarks/image_loaders/benchmark_image_loaders.R | 2 -- 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 R/resize_melanoma.R diff --git a/R/resize_melanoma.R 
b/R/resize_melanoma.R new file mode 100644 index 00000000..336b4e14 --- /dev/null +++ b/R/resize_melanoma.R @@ -0,0 +1,8 @@ +library(torch) +library(torchvision) +library(mlr3torch) +library(here) + +library(data.table) +library(tidytable) + diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R index 41b5424e..c61ca120 100644 --- a/benchmarks/image_loaders/benchmark_image_loaders.R +++ b/benchmarks/image_loaders/benchmark_image_loaders.R @@ -4,14 +4,12 @@ library(mlr3torch) library(here) library(data.table) -library(tidytable) training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) # hard-coded cache directory that I use locally cache_dir = here("cache") -# construct a torch dataset ds_base_loader = torch::dataset( initialize = function(n_images) { self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))[1:n_images, ] From 9566b52954bd98d412f3ea58a0c97a2e4f241599 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 13:13:44 +0200 Subject: [PATCH 10/46] faijweoif --- benchmarks/image_loaders/benchmark_image_loaders.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R index c61ca120..44f05e36 100644 --- a/benchmarks/image_loaders/benchmark_image_loaders.R +++ b/benchmarks/image_loaders/benchmark_image_loaders.R @@ -4,6 +4,7 @@ library(mlr3torch) library(here) library(data.table) +setDTthreads(threads = 1) training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) @@ -55,6 +56,5 @@ ds_magick = ds_magick_loader(n_images) bench::mark( for (i in 1:n_images) ds_base$.getitem(i), - for (i in 1:n_images) ds_magick$.getitem(i), - memory = FALSE + for (i in 1:n_images) ds_magick$.getitem(i) ) From 47b090b0cc06d3b657fad534f451c2c1f225ea7b Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 25 Oct 2024 
16:36:57 +0200 Subject: [PATCH 11/46] magick 2 times as slow --- R/resize_melanoma.R | 26 ++++++++++++++++--- attic/resize_melanoma.py | 25 ++++++++++++++++++ .../image_loaders/benchmark_image_loaders.R | 7 ++--- 3 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 attic/resize_melanoma.py diff --git a/R/resize_melanoma.R b/R/resize_melanoma.R index 336b4e14..516c3486 100644 --- a/R/resize_melanoma.R +++ b/R/resize_melanoma.R @@ -1,8 +1,28 @@ library(torch) library(torchvision) -library(mlr3torch) + +library(purrr) + library(here) -library(data.table) -library(tidytable) +# change to wherever your files live +cache_dir = here("cache") + +path_to_melanoma_train = here(cache_dir, "train") +path_to_melanoma_test = here(cache_dir, "ISIC_2020_Test_Input") +path_to_output_train = here(cache_dir, "train_small") +path_to_output_test = here(cache_dir, "ISIC_2020_Test_Input_small") + +resize_to_dims = c(128, 128) + +resize_and_write = function(image_file_name, path_to_input_train, path_to_output_dir, dims) { + image = base_loader(file.path(path_to_input_train, image_file_name)) + small_image = torchvision::transform_resize(image, dims) + + output_file_name = file.path(path_to_output_dir, basename(image_file_name)) + + torch::torch_save(small_image, path_to_output_dir) +} +walk(.x = list.files(path_to_melanoma_train), .f = resize_and_write(path_to_melanoma_train, path_to_output_train, resize_to_dims), .progress = TRUE) +walk(.x = list.files(path_to_melanoma_test), .f = resize_and_write(path_to_melanoma_test, path_to_output_test), .progress = TRUE) diff --git a/attic/resize_melanoma.py b/attic/resize_melanoma.py new file mode 100644 index 00000000..857d36e2 --- /dev/null +++ b/attic/resize_melanoma.py @@ -0,0 +1,25 @@ +import torch +import os +from tqdm import tqdm +import torchvision + +PATH_TO_MLR3TORCH = "." 
+cache_dir = "cache" + +path_to_melanoma_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train/") +path_to_melanoma_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "ISIC_2020_Test_Input") + +path_to_output_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train_small") +path_to_output_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "ISIC_2020_Test_Input_small") + +tx = torchvision.transforms.Resize((128, 128)) + +for f in tqdm(os.listdir(path_to_melanoma_train)): + img = torchvision.io.read_image(path_to_melanoma_train + f) + small_img = tx(img.float()) + torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) + +for f in tqdm(os.listdir(path_to_melanoma_test)): + img = torchvision.io.read_image(path_to_melanoma_train + f) + small_img = tx(img.float()) + torchvision.utils.save_image(small_img, os.path.join(path_to_output_test, f)) diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R index 44f05e36..5b46187a 100644 --- a/benchmarks/image_loaders/benchmark_image_loaders.R +++ b/benchmarks/image_loaders/benchmark_image_loaders.R @@ -49,12 +49,13 @@ ds_magick_loader = torch::dataset( } ) -n_images = 100 +n_images = 1000 ds_base = ds_base_loader(n_images) ds_magick = ds_magick_loader(n_images) bench::mark( for (i in 1:n_images) ds_base$.getitem(i), - for (i in 1:n_images) ds_magick$.getitem(i) -) + for (i in 1:n_images) ds_magick$.getitem(i), + memory = FALSE +) \ No newline at end of file From 9794efd79ba4759bc6ea6dcc76ec8d2b9db77118 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Sun, 27 Oct 2024 17:59:53 +0100 Subject: [PATCH 12/46] jwaoeifajwoeij --- R/modify_melanoma_metadata.R | 22 ++++++++++++++++++++++ R/resize_melanoma.R | 3 ++- attic/resize_melanoma.py | 10 +++++----- 3 files changed, 29 insertions(+), 6 deletions(-) create mode 100644 R/modify_melanoma_metadata.R diff --git a/R/modify_melanoma_metadata.R b/R/modify_melanoma_metadata.R new file mode 100644 index 
00000000..cf1aeafd --- /dev/null +++ b/R/modify_melanoma_metadata.R @@ -0,0 +1,22 @@ +library(data.table) +library(purrr) + +cache_dir = here("cache") + +duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) + +metadata_file_paths = c( + here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"), + here(cache_dir, "ISIC_2020_Training_GroundTruth_v2.csv"), + here(cache_dir, "ISIC_2020_Test_Metadata.csv") +) + +metadata_dt_list = map(metadata_file_paths, fread) + +add_hf_file_name_col = function(metadata_dt, image_relative_dir) { + metadata_dt[, (file_name) := file.path(image_relative_dir, metadata_dt$image_name)] +} + +image_relative_paths = c("train", "train", "ISIC_2020_Test_Input") + +walk2(metadata_dt_list, image_relative_paths, add_hf_file_name_col) diff --git a/R/resize_melanoma.R b/R/resize_melanoma.R index 516c3486..3aa7ed7d 100644 --- a/R/resize_melanoma.R +++ b/R/resize_melanoma.R @@ -17,9 +17,10 @@ resize_to_dims = c(128, 128) resize_and_write = function(image_file_name, path_to_input_train, path_to_output_dir, dims) { image = base_loader(file.path(path_to_input_train, image_file_name)) - small_image = torchvision::transform_resize(image, dims) + small_image = torchvision::transform_resize(transform_to_tensor(image), dims) output_file_name = file.path(path_to_output_dir, basename(image_file_name)) + print(output_file_name) torch::torch_save(small_image, path_to_output_dir) } diff --git a/attic/resize_melanoma.py b/attic/resize_melanoma.py index 857d36e2..70c69357 100644 --- a/attic/resize_melanoma.py +++ b/attic/resize_melanoma.py @@ -6,7 +6,7 @@ PATH_TO_MLR3TORCH = "." 
cache_dir = "cache" -path_to_melanoma_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train/") +path_to_melanoma_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train") path_to_melanoma_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "ISIC_2020_Test_Input") path_to_output_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train_small") @@ -14,10 +14,10 @@ tx = torchvision.transforms.Resize((128, 128)) -for f in tqdm(os.listdir(path_to_melanoma_train)): - img = torchvision.io.read_image(path_to_melanoma_train + f) - small_img = tx(img.float()) - torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) +# for f in tqdm(os.listdir(path_to_melanoma_train)): +# img = torchvision.io.read_image(path_to_melanoma_train + f) +# small_img = tx(img.float()) +# torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) for f in tqdm(os.listdir(path_to_melanoma_test)): img = torchvision.io.read_image(path_to_melanoma_train + f) From 5f71fe9f0b24b157bcfa48ce90cdf6999a692667 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Sun, 27 Oct 2024 18:22:59 +0100 Subject: [PATCH 13/46] added my local cache dirs to gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b62c0c99..e48dd286 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ inst/doc /Meta/ CRAN-SUBMISSION paper/data +cache/ +benchmarks/data \ No newline at end of file From dd8da0c9a1a93a5364210c02c326540729d09621 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Tue, 29 Oct 2024 10:56:40 +0100 Subject: [PATCH 14/46] finished resizing images in hard-coded cache dir --- R/TaskClassif_melanoma.R | 50 ++++++++++++------- attic/resize_melanoma.py | 15 +++--- .../image_loaders/benchmark_image_loaders.R | 10 ++-- man/mlr_learners_torch_image.Rd | 2 +- man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd | 2 +- tests/temp-TaskClassif_melanoma.R | 15 ++---- 6 files changed, 52 insertions(+), 42 deletions(-) diff --git 
a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index b53a3796..860f8a6d 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -16,7 +16,7 @@ #' \url{https://challenge2020.isic-archive.com/} #' #' @section Properties: -#' `r rd_info_task_torch("mnmelanoma", missings = FALSE)` +#' `r rd_info_task_torch("melanoma", missings = FALSE)` #' #' @references #' `r format_bib("melanoma")` @@ -43,35 +43,46 @@ constructor_melanoma = function(path) { ) download_melanoma_file = function(url) { - # change if necessary - prev_options = options(timeout = 3600) + prev_options = options(timeout = 36000) on.exit(options(prev_options)) - download.file(url, here(cache_dir, basename(url))) + download.file(url, path) } mlr3misc::walk(urls, download_melanoma_file) - unzip(here(cache_dir, basename(training_jpeg_images_url)), exdir = here(cache_dir)) - unzip(here(cache_dir, basename(test_jpeg_images_url)), exdir = here(cache_dir)) + unzip(here(path, basename(training_jpeg_images_url)), exdir = path) + unzip(here(cache_dir, basename(test_jpeg_images_url)), exdir = path) - train_metadata = fread(here(path, basename(test_jpeg_images_url))) - # train_images = + training_metadata = fread(here(path, basename(training_metadata_url))) + + ds = torch::dataset( + initialize = function() { + self$.metadata = fread(here(path, "ISIC_2020_Training_GroundTruth.csv")) + self$.path = file.path(here(path), "train") + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } + ) - # TODO: decide whether to delete these, since there are no ground truth labels - # test_metadata = fread(here(path, basename(test_metadata_url))) - # test_images = fread(here()) + dd = as_data_descriptor(melanoma_ds, list(x = NULL)) + lt = lazy_tensor(dd) - data.table( - # image: ltsnr - # metadata 
cols - ) + return(cbind(training_metadata, data.table(x = lt))) } load_task_melanoma = function(id = "melanoma") { cached_constructor = function(backend) { data = cached(constructor_melanoma, "datasets", "melanoma")$data - labels = ... ds = dataset( @@ -79,8 +90,11 @@ load_task_melanoma = function(id = "melanoma") { # some preprocessing - # TODO: determine the end dimensionality - data_descriptor = DataDescriptor$new(dataset = ds, list(image = c(NA, channel, spatial_dims))) + dd = as_data_descriptor(melanoma_ds, list(x = NULL)) + lt = lazy_tensor(dd) + dt = cbind(training_metadata, data.table(x = lt)) + + DataBackendDataTable$new(data = dt, primary_key = ...) } # construct a DataBackendLazy for this large dataset diff --git a/attic/resize_melanoma.py b/attic/resize_melanoma.py index 70c69357..133243f2 100644 --- a/attic/resize_melanoma.py +++ b/attic/resize_melanoma.py @@ -14,12 +14,13 @@ tx = torchvision.transforms.Resize((128, 128)) -# for f in tqdm(os.listdir(path_to_melanoma_train)): -# img = torchvision.io.read_image(path_to_melanoma_train + f) -# small_img = tx(img.float()) -# torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) +for f in tqdm(os.listdir(path_to_melanoma_train)): + img = torchvision.io.read_image(os.path.join(path_to_melanoma_train, f)) + small_img = tx(img.float() / 255) + torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) for f in tqdm(os.listdir(path_to_melanoma_test)): - img = torchvision.io.read_image(path_to_melanoma_train + f) - small_img = tx(img.float()) - torchvision.utils.save_image(small_img, os.path.join(path_to_output_test, f)) + if f.endswith(".jpg"): + img = torchvision.io.read_image(os.path.join(path_to_melanoma_test, f)) + small_img = tx(img.float() / 255) + torchvision.utils.save_image(small_img, os.path.join(path_to_output_test, f)) diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R index 
5b46187a..cf56a33a 100644 --- a/benchmarks/image_loaders/benchmark_image_loaders.R +++ b/benchmarks/image_loaders/benchmark_image_loaders.R @@ -39,7 +39,7 @@ ds_magick_loader = torch::dataset( image_name = self$.metadata[idx, ]$image_name - x = torchvision::magick_loader(file.path(self$.path, paste0(image_name, ".jpg"))) + x = magick::image_read(file.path(self$.path, paste0(image_name, ".jpg"))) x = torchvision::transform_to_tensor(x) return(list(x = x, image_name = image_name)) @@ -49,13 +49,15 @@ ds_magick_loader = torch::dataset( } ) -n_images = 1000 +n_images = 10 ds_base = ds_base_loader(n_images) ds_magick = ds_magick_loader(n_images) -bench::mark( +bmr = bench::mark( for (i in 1:n_images) ds_base$.getitem(i), for (i in 1:n_images) ds_magick$.getitem(i), memory = FALSE -) \ No newline at end of file +) + +print(bmr) \ No newline at end of file diff --git a/man/mlr_learners_torch_image.Rd b/man/mlr_learners_torch_image.Rd index af2b854d..fce0bef7 100644 --- a/man/mlr_learners_torch_image.Rd +++ b/man/mlr_learners_torch_image.Rd @@ -64,7 +64,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. optimizer = NULL, loss = NULL, callbacks = list(), - packages = c("torchvision", "magick"), + packages = c("torchvision"), man, properties = NULL, predict_types = NULL diff --git a/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd b/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd index 64b3fa77..aa36c41d 100644 --- a/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd +++ b/man/mlr_pipeops_nn_adaptive_avg_pool1d.Rd @@ -27,7 +27,7 @@ Part of this documentation have been copied or adapted from the documentation of \section{Parameters}{ \itemize{ -\item \code{output_size} :: \code{integer()}\cr +\item \code{output_size} :: \code{integer(1)}\cr The target output size. A single number. 
} } diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R index 539b4bd9..3f1803eb 100644 --- a/tests/temp-TaskClassif_melanoma.R +++ b/tests/temp-TaskClassif_melanoma.R @@ -22,10 +22,8 @@ ds = torch::dataset( force(idx) x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) - # TODO: decide on these transformations x = torchvision::transform_to_tensor(x) - # TODO: should we only return the images here? I think yes return(list(x = x)) }, .length = function() { @@ -35,14 +33,9 @@ ds = torch::dataset( melanoma_ds = ds() -# dd_dims = c(NA, ) -dd = as_data_descriptor(melanoma_ds, list(x = dd_dims)) - -# Construct lazy tensor for each image (e.g. a data table with a single ltnsr column) -# TODO: confirm that this maintains the same ordering +dd = as_data_descriptor(melanoma_ds, list(x = NULL)) lt = lazy_tensor(dd) +dt_train = cbind(training_metadata, data.table(x = lt)) +# as_task_regr(dt_train, target = "corr", id = "guess_the_correlation") -# Join with the metadata file -dt_train = inner_join(training_metadata, data.table(..., x = lt), by = image_name) - -# as_task_regr(dt_train, target = "corr", id = "guess_the_correlation") \ No newline at end of file +training_duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) \ No newline at end of file From 9b1c240b91cb40fd0fd02637e20ab92718a4a7f0 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Tue, 5 Nov 2024 13:35:38 +0100 Subject: [PATCH 15/46] code to generate hf dataset, still need to check for full reproducibility --- R/TaskClassif_melanoma.R | 68 +++++++++++---- R/modify_melanoma_metadata.R | 22 ----- .../01-download_melanoma.R | 21 +++-- attic/02-resize_melanoma.py | 29 +++++++ attic/03-process_melanoma.R | 84 +++++++++++++++++++ attic/find_extensions.py | 78 +++++++++++++++++ ...ze_melanoma.py => resize_melanoma copy.py} | 0 {R => attic}/resize_melanoma.R | 0 tests/temp-TaskClassif_melanoma.R | 8 +- 9 files changed, 260 
insertions(+), 50 deletions(-) delete mode 100644 R/modify_melanoma_metadata.R rename R/download_melanoma.R => attic/01-download_melanoma.R (69%) create mode 100644 attic/02-resize_melanoma.py create mode 100644 attic/03-process_melanoma.R create mode 100644 attic/find_extensions.py rename attic/{resize_melanoma.py => resize_melanoma copy.py} (100%) rename {R => attic}/resize_melanoma.R (100%) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 860f8a6d..38c42a09 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -3,7 +3,7 @@ #' @description #' Classification of melanoma tumor images. #' -#' The data comes from the 2020 ISIC challenge. +#' The data comes from the 2020 SIIM-ISIC challenge. #' #' @section Construction: #' ``` @@ -29,12 +29,13 @@ NULL # The cache_dir/datasets/melanoma folder constructor_melanoma = function(path) { # download data + # TODO: change to Hugging Face URLs + # TODO: use the code from the attic training_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip" training_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv" training_metadata_v2_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv" training_duplicate_image_list_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_Duplicates.csv" - test_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_JPEG.zip" test_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_Metadata.csv" urls = c( @@ -50,15 +51,16 @@ constructor_melanoma = function(path) { } mlr3misc::walk(urls, download_melanoma_file) - + unzip(here(path, basename(training_jpeg_images_url)), exdir = path) unzip(here(cache_dir, basename(test_jpeg_images_url)), exdir = path) - training_metadata = fread(here(path, basename(training_metadata_url))) - - ds = torch::dataset( + 
training_metadata = fread(here(path, basename(training_metadata_v2_url))) + + # if you want some operation to be cached (e.g. if it's expensive) do it here + ds_train = torch::dataset( initialize = function() { - self$.metadata = fread(here(path, "ISIC_2020_Training_GroundTruth.csv")) + self$.metadata = fread(here(path, "ISIC_2020_Training_GroundTruth_v2.csv")) self$.path = file.path(here(path), "train") }, .getitem = function(idx) { @@ -74,8 +76,26 @@ constructor_melanoma = function(path) { } ) - dd = as_data_descriptor(melanoma_ds, list(x = NULL)) - lt = lazy_tensor(dd) + ds_test = torch::dataset( + initialize = function() { + self$.metadata = fread(here(path, "ISIC_2020_Test_Metadata.csv")) + self$.path = file.path(here(path), "ISIC_2020_Test_Input") + } + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } + ) + + dd_train = as_data_descriptor(melanoma_ds, list(x = NULL)) + lt_train = lazy_tensor(dd) return(cbind(training_metadata, data.table(x = lt))) } @@ -85,30 +105,45 @@ load_task_melanoma = function(id = "melanoma") { data = cached(constructor_melanoma, "datasets", "melanoma")$data ds = dataset( - + initialize = function() { + self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv")) + self$.path = file.path(here(cache_dir), "train") + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } )(data$image) - # some preprocessing + # TODO: some preprocessing dd = as_data_descriptor(melanoma_ds, list(x = NULL)) lt = lazy_tensor(dd) dt = cbind(training_metadata, data.table(x = lt)) - DataBackendDataTable$new(data = 
dt, primary_key = ...) + # set ..row_id = + + DataBackendDataTable$new(data = dt, primary_key = "..row_id") } # construct a DataBackendLazy for this large dataset backend = DataBackendLazy$new( constructor = cached_constructor, - rownames = seq_len(n_rows), # TODO: compute + rownames = seq_len(32701), # TODO: is it weird to have rownames different from primary_key? # hard-coded info about the task (nrows, ncols) - col_info = load_col_info("melanoma") - primary_key = "..row_id" # TODO: explain + col_info = load_col_info("melanoma"), + primary_key = "..row_id" ) # the DataBackendLazy implements the logic for downloading, processing, caching the dataset. # in this case, we only need to implement the download and processing because the private `cached()` function implements caching - # TODO: find this private `cached()` function # the DataBackendLazy also hardcodes some metadata that will be available even before the data is downloaded. # this metadata will be stored in `.inst/col_info` @@ -122,6 +157,7 @@ load_task_melanoma = function(id = "melanoma") { target = "class", label = "Melanoma classification" ) + task$set_col_roles("patient_id", roles = "group") return(task) } diff --git a/R/modify_melanoma_metadata.R b/R/modify_melanoma_metadata.R deleted file mode 100644 index cf1aeafd..00000000 --- a/R/modify_melanoma_metadata.R +++ /dev/null @@ -1,22 +0,0 @@ -library(data.table) -library(purrr) - -cache_dir = here("cache") - -duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) - -metadata_file_paths = c( - here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"), - here(cache_dir, "ISIC_2020_Training_GroundTruth_v2.csv"), - here(cache_dir, "ISIC_2020_Test_Metadata.csv") -) - -metadata_dt_list = map(metadata_file_paths, fread) - -add_hf_file_name_col = function(metadata_dt, image_relative_dir) { - metadata_dt[, (file_name) := file.path(image_relative_dir, metadata_dt$image_name)] -} - -image_relative_paths = c("train", "train", "ISIC_2020_Test_Input") 
- -walk2(metadata_dt_list, image_relative_paths, add_hf_file_name_col) diff --git a/R/download_melanoma.R b/attic/01-download_melanoma.R similarity index 69% rename from R/download_melanoma.R rename to attic/01-download_melanoma.R index 9e27eca8..80d23ea3 100644 --- a/R/download_melanoma.R +++ b/attic/01-download_melanoma.R @@ -1,3 +1,5 @@ +library(here) + training_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip" training_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv" training_metadata_v2_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv" @@ -8,16 +10,21 @@ test_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020 urls = c( - training_jpeg_images_url - # training_metadata_url, training_metadata_v2_url, training_duplicate_image_list_url + training_jpeg_images_url, + training_metadata_url, training_metadata_v2_url, training_duplicate_image_list_url, + test_jpeg_images_url, + test_metadata_url ) -unzip(here(cache_dir, basename(training_jpeg_images_url))) - -options(timeout = 36000) - +cache_dir = here("cache") download_melanoma_file = function(url) { - download.file(url, here::here("cache", basename(url))) + op = options(timeout = 36000) + on.exit(options(op)) + + download.file(url, here(cache_dir, basename(url))) } mlr3misc::walk(urls, download_melanoma_file) + +unzip(here(cache_dir, basename(training_jpeg_images_url))) +unzip(here(cache_dir, basename(test_jpeg_images_url))) diff --git a/attic/02-resize_melanoma.py b/attic/02-resize_melanoma.py new file mode 100644 index 00000000..5c54c548 --- /dev/null +++ b/attic/02-resize_melanoma.py @@ -0,0 +1,29 @@ +import torch +import os +from tqdm import tqdm +import torchvision + +PATH_TO_MLR3TORCH = "." 
+cache_dir = "cache" + +path_to_melanoma_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train") +path_to_melanoma_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "ISIC_2020_Test_Input") + +path_to_output_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "hf_dataset", "train") +path_to_output_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "hf_dataset", "ISIC_2020_Test_Input") + +os.makedirs(path_to_output_train) +os.makedirs(path_to_output_test) + +tx = torchvision.transforms.Resize((128, 128)) + +for f in tqdm(os.listdir(path_to_melanoma_train)): + img = torchvision.io.read_image(os.path.join(path_to_melanoma_train, f)) + small_img = tx(img.float() / 255) + torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) + +for f in tqdm(os.listdir(path_to_melanoma_test)): + if f.endswith(".jpg"): + img = torchvision.io.read_image(os.path.join(path_to_melanoma_test, f)) + small_img = tx(img.float() / 255) + torchvision.utils.save_image(small_img, os.path.join(path_to_output_test, f)) diff --git a/attic/03-process_melanoma.R b/attic/03-process_melanoma.R new file mode 100644 index 00000000..a4327d22 --- /dev/null +++ b/attic/03-process_melanoma.R @@ -0,0 +1,84 @@ +library(data.table) +library(tidytable) +library(purrr) + +library(here) + +library(fs) + +# this script changes the data into the format expected by Hugging Face +# It expects that you have downloaded and extracted the original data by running the download_melanoma.R script +# and that you have already resized it with PyTorch + +cache_dir = here("cache") + +duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) + +metadata_file_paths = c( + here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"), + here(cache_dir, "ISIC_2020_Training_GroundTruth_v2.csv"), + here(cache_dir, "ISIC_2020_Test_Metadata.csv") +) +metadata_dt_list = map(metadata_file_paths, fread) +metadata_dt_list[[3]] = rename(metadata_dt_list[[3]], image_name = image) + +# deduplicate the metadata 
+dedup = function(metadata_dt, duplicate_file_names) { + metadata_dt[!(image_name %in% duplicate_file_names), ] +} + +training_metadata = dedup(metadata_dt_list[[1]], duplicates$image_name_2) +training_metadata_v2 = dedup(metadata_dt_list[[2]], duplicates$image_name_2) +test_metadata = metadata_dt_list[[3]] + +hf_dataset_dir = here(cache_dir, "hf_dataset") +hf_train_dir = here(hf_dataset_dir, "train") +hf_test_dir = here(hf_dataset_dir, "ISIC_2020_Test_Input") + +train_dirnames_for_each_img = paste0("train", (training_metadata_v2[, .I] %% 4) + 1) +test_dirnames_for_each_img = paste0("ISIC_2020_Test_Input", (test_metadata[, .I] %% 2) + 1) + +# add a column that Hugging Face wants +add_hf_file_name_col = function(metadata_dt, image_relative_dirnames) { + metadata_dt[, file_name := paste0(file.path(image_relative_dirnames, metadata_dt$image_name), ".jpg")] +} + +# image_relative_paths = c("train", "train", "ISIC_2020_Test_Input") + +add_hf_file_name_col(training_metadata, train_dirnames_for_each_img) +add_hf_file_name_col(training_metadata_v2, train_dirnames_for_each_img) +add_hf_file_name_col(metadata_dt_list[[3]], test_dirnames_for_each_img) + +# delete the duplicated images +list.files(hf_train_dir) |> length() +file.remove(here(hf_train_dir, paste0(duplicates$image_name_2, ".jpg"))) +list.files(hf_train_dir) |> length() + +old_names = function(metadata_dt, dir) { + paste0(file.path(dir, metadata_dt$image_name), ".jpg") +} + +create_if_necessary = function(dirname) { + if (!dir.exists(dirname)) { + dir.create(dirname) + } +} + +walk(here(hf_dataset_dir, unique(train_dirnames_for_each_img)), create_if_necessary) +walk(here(hf_dataset_dir, unique(test_dirnames_for_each_img)), create_if_necessary) + +# file_move(old_names(training_metadata), here(hf_dataset_dir, train_dirnames_for_each_img, paste0(training_metadata$image_name, ".jpg"))) +file_move(old_names(training_metadata_v2, hf_train_dir), here(hf_dataset_dir, train_dirnames_for_each_img, 
paste0(training_metadata_v2$image_name, ".jpg"))) +file_move(old_names(test_metadata, hf_test_dir), here(hf_dataset_dir, test_dirnames_for_each_img, paste0(test_metadata$image_name, ".jpg"))) + +test_metadata = rename(test_metadata, image = image_name) + +fwrite(training_metadata, here(hf_dataset_dir, "ISIC_2020_Training_GroundTruth.csv")) +fwrite(training_metadata_v2, here(hf_dataset_dir, "ISIC_2020_Training_GroundTruth_v2.csv")) +fwrite(test_metadata, here(hf_dataset_dir, "ISIC_2020_Test_Metadata.csv")) + +# test1 = list.files(here(hf_dataset_dir, "ISIC_2020_Test_Input1")) +# test2 = list.files(here(hf_dataset_dir, "ISIC_2020_Test_Input2")) +# setdiff(test1, test2) + +# test_metadata |> filter(image_name == "ISIC_9999302") |> pull(file_name) diff --git a/attic/find_extensions.py b/attic/find_extensions.py new file mode 100644 index 00000000..76e75d05 --- /dev/null +++ b/attic/find_extensions.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +import os +from collections import defaultdict +from pathlib import Path +import argparse + + +def main(): + parser = argparse.ArgumentParser(description='Find all file extensions in a directory') + parser.add_argument('path', nargs='?', default='.', + help='Directory path to scan (default: current directory)') + parser.add_argument('-a', '--all', action='store_true', + help='Include hidden files and directories') + parser.add_argument('-c', '--count', action='store_true', + help='Show count of files per extension') + parser.add_argument('-s', '--sort-count', action='store_true', + help='Sort by count instead of alphabetically') + + args = parser.parse_args() + + # Validate directory + if not os.path.isdir(args.path): + print(f"Error: '{args.path}' is not a valid directory") + return 1 + + # Initialize counter + extensions = defaultdict(int) + + # Walk through directory + for root, dirs, files in os.walk(args.path): + # Skip hidden directories unless -a flag is used + if not args.all: + dirs[:] = [d for d in dirs if not 
d.startswith('.')] + files = [f for f in files if not f.startswith('.')] + + for file in files: + ext = Path(file).suffix.lower() + if ext: + extensions[ext[1:]] += 1 # Remove the leading dot + else: + extensions['(no extension)'] += 1 + + # No files found + if not extensions: + print("No files found in the specified directory.") + return 0 + + # Prepare for display + if args.sort_count: + # Sort by count (descending) and then by extension name + items = sorted(extensions.items(), key=lambda x: (-x[1], x[0])) + else: + # Sort alphabetically by extension + items = sorted(extensions.items()) + + # Display results + print(f"\nExtensions found in: {os.path.abspath(args.path)}") + print("-" * 40) + + if args.count: + # Show with counts + max_ext_len = max(len(ext) for ext in extensions.keys()) + for ext, count in items: + print(f"{ext:<{max_ext_len}} : {count:>5} files") + else: + # Show just extensions + for ext, _ in items: + print(ext) + + # Print summary + total_files = sum(extensions.values()) + total_extensions = len(extensions) + print("-" * 40) + print(f"Total: {total_files} files, {total_extensions} unique extensions") + + +if __name__ == "__main__": + exit(main()) diff --git a/attic/resize_melanoma.py b/attic/resize_melanoma copy.py similarity index 100% rename from attic/resize_melanoma.py rename to attic/resize_melanoma copy.py diff --git a/R/resize_melanoma.R b/attic/resize_melanoma.R similarity index 100% rename from R/resize_melanoma.R rename to attic/resize_melanoma.R diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R index 3f1803eb..2794dffd 100644 --- a/tests/temp-TaskClassif_melanoma.R +++ b/tests/temp-TaskClassif_melanoma.R @@ -7,9 +7,7 @@ library(data.table) library(tidytable) # TODO: figure out whether we want the v2 file -# I think no, since I don't really see a "use" for the lesion ID -training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) -# training_metadata_v2 = 
fread(here::here("cache", "ISIC_2020_Training_GroundTruth_v2.csv")) +training_metadata_v2 = fread(here::here("cache", "ISIC_2020_Training_GroundTruth_v2.csv")) cache_dir = here("cache") # construct a torch dataset @@ -35,7 +33,7 @@ melanoma_ds = ds() dd = as_data_descriptor(melanoma_ds, list(x = NULL)) lt = lazy_tensor(dd) -dt_train = cbind(training_metadata, data.table(x = lt)) +dt_train = cbind(training_metadata_v2, data.table(x = lt)) # as_task_regr(dt_train, target = "corr", id = "guess_the_correlation") -training_duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) \ No newline at end of file +training_duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) From 9cff991b8811ca7d6079df7a58d1f00ade8676cb Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 8 Nov 2024 12:20:20 +0100 Subject: [PATCH 16/46] looks ok with hard-coded cache, still need to test properly --- R/TaskClassif_melanoma.R | 124 +++++++-------------- attic/hfhub_test.R | 69 ++++++++++++ data-raw/melanoma.R | 70 +++++++++++- data-raw/tiny_imagenet.R | 10 +- inst/col_info/melanoma.rds | Bin 0 -> 412 bytes tests/testthat/test_TaskClassif_melanoma.R | 20 ++-- 6 files changed, 185 insertions(+), 108 deletions(-) create mode 100644 attic/hfhub_test.R create mode 100644 inst/col_info/melanoma.rds diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 38c42a09..8903a6be 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -28,45 +28,33 @@ NULL # @param path (`character(1)`)\cr # The cache_dir/datasets/melanoma folder constructor_melanoma = function(path) { - # download data - # TODO: change to Hugging Face URLs - # TODO: use the code from the attic - training_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip" - training_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv" - training_metadata_v2_url = 
"https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv" - training_duplicate_image_list_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_Duplicates.csv" - - test_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_Metadata.csv" - - urls = c( - training_jpeg_images_url, training_metadata_url, training_metadata_v2_url, training_duplicate_image_list_url, - test_jpeg_images_url, test_metadata_url + file_names = c( + "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", + "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" ) - download_melanoma_file = function(url) { - prev_options = options(timeout = 36000) - on.exit(options(prev_options)) + withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { + hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") + }) - download.file(url, path) - } - - mlr3misc::walk(urls, download_melanoma_file) - - unzip(here(path, basename(training_jpeg_images_url)), exdir = path) - unzip(here(cache_dir, basename(test_jpeg_images_url)), exdir = path) + hf_dataset_path = here(path, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") - training_metadata = fread(here(path, basename(training_metadata_v2_url))) + training_metadata = fread(here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] + test_metadata = setnames(fread(here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), + old = c("image", "patient", "anatom_site_general"), + new = c("image_name", "patient_id", "anatom_site_general_challenge") + )[, split := "test"] + metadata = rbind(training_metadata, test_metadata, fill = TRUE) - # if you want some operation to be cached (e.g. 
if it's expensive) do it here - ds_train = torch::dataset( + melanoma_ds_generator = torch::dataset( initialize = function() { - self$.metadata = fread(here(path, "ISIC_2020_Training_GroundTruth_v2.csv")) - self$.path = file.path(here(path), "train") + self$.metadata = metadata + self$.path = hf_dataset_path }, .getitem = function(idx) { force(idx) - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name))) x = torchvision::transform_to_tensor(x) return(list(x = x)) @@ -76,89 +64,57 @@ constructor_melanoma = function(path) { } ) - ds_test = torch::dataset( - initialize = function() { - self$.metadata = fread(here(path, "ISIC_2020_Test_Metadata.csv")) - self$.path = file.path(here(path), "ISIC_2020_Test_Input") - } - .getitem = function(idx) { - force(idx) - - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) - x = torchvision::transform_to_tensor(x) - - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } - ) + melanoma_ds = melanoma_ds_generator() - dd_train = as_data_descriptor(melanoma_ds, list(x = NULL)) - lt_train = lazy_tensor(dd) + dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) + lt = lazy_tensor(dd) - return(cbind(training_metadata, data.table(x = lt))) + return(cbind(metadata, data.table(image = lt))) } load_task_melanoma = function(id = "melanoma") { cached_constructor = function(backend) { data = cached(constructor_melanoma, "datasets", "melanoma")$data - ds = dataset( - initialize = function() { - self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv")) - self$.path = file.path(here(cache_dir), "train") - }, - .getitem = function(idx) { - force(idx) - - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) - x = torchvision::transform_to_tensor(x) + # 
remove irrelevant cols: image_name, target + data[, image_name := NULL] + data[, target := NULL] - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } - )(data$image) + # change the encodings of variables: diagnosis, benign_malignant + data[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] - # TODO: some preprocessing + char_features = c("sex", "anatom_site_general_challenge", "diagnosis") + data[, lapply(.SD, factor), .SDcols = char_features] - dd = as_data_descriptor(melanoma_ds, list(x = NULL)) - lt = lazy_tensor(dd) - dt = cbind(training_metadata, data.table(x = lt)) - - # set ..row_id = + dt = cbind(data, + data.table( + ..row_id = seq_along(data$benign_malignant) + ) + ) DataBackendDataTable$new(data = dt, primary_key = "..row_id") } - # construct a DataBackendLazy for this large dataset backend = DataBackendLazy$new( constructor = cached_constructor, - rownames = seq_len(32701), # TODO: is it weird to have rownames different from primary_key? - # hard-coded info about the task (nrows, ncols) + rownames = seq_len(32701 + 10982), col_info = load_col_info("melanoma"), primary_key = "..row_id" ) - # the DataBackendLazy implements the logic for downloading, processing, caching the dataset. - # in this case, we only need to implement the download and processing because the private `cached()` function implements caching - - # the DataBackendLazy also hardcodes some metadata that will be available even before the data is downloaded. 
- # this metadata will be stored in `.inst/col_info` - # and can be loaded using `load_column_info()` - # the code that generates this hardcoded metadata should be in `./data-raw` - - # create a TaskClassif from this DataBackendLazy task = TaskClassif$new( backend = backend, id = "melanoma", - target = "class", + target = "target", label = "Melanoma classification" ) + + backend$hash = task$man = "mlr3torch::mlr_tasks_melanoma" + task$set_col_roles("patient_id", roles = "group") + task$filter(1:32701) + return(task) } diff --git a/attic/hfhub_test.R b/attic/hfhub_test.R new file mode 100644 index 00000000..8c63b9d1 --- /dev/null +++ b/attic/hfhub_test.R @@ -0,0 +1,69 @@ +library(here) +library(data.table) + +devtools::load_all() + +file_names = c( + "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", + "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" +) + +hf_cache_dir = here::here("cache", "hf_downloaded") + +# withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = hf_cache_dir), { +# path <- hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") +# }) + +# print(paths) + + +hf_dataset_path = here(hf_cache_dir, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") + +constructor_melanoma = function(path) { + file_names = c( + "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", + "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" + ) + + # withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { + # hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") + # }) + + hf_dataset_path = here(path, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") + + training_metadata = fread(here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] + test_metadata = setnames(fread(here(hf_dataset_path, 
"ISIC_2020_Test_Metadata.csv")), + old = c("image", "patient", "anatom_site_general"), + new = c("image_name", "patient_id", "anatom_site_general_challenge") + )[, split := "test"] + metadata = rbind(training_metadata, test_metadata, fill = TRUE) + + # write to disk? + + melanoma_ds_generator = torch::dataset( + initialize = function() { + self$.metadata = metadata + self$.path = hf_dataset_path + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } + ) + + melanoma_ds = melanoma_ds_generator() + + dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) + lt = lazy_tensor(dd) + + return(cbind(metadata, data.table(image = lt))) +} + +melanoma_ds = constructor_melanoma(hf_cache_dir) diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index 8f932124..a7186a32 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -1,13 +1,73 @@ devtools::load_all() # manually construct the task once +library(here) +library(data.table) -ci = col_info(get_private(tsk("melanoma")$backend)$.constructor()) +hf_cache_dir = here::here("cache", "hf_downloaded") -saveRDS(ci, here::here("inst/col_ino/melanoma.rds")) +hf_dataset_path = here::here(hf_cache_dir, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") -mlr3::DataBackendCbind$new(c) +constructor_melanoma = function(path) { + file_names = c( + "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", + "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" + ) -# split + # withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { + # hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") + # }) -# ci \ No newline at end of file + hf_dataset_path = here(path, "datasets--carsonzhang--ISIC_2020_small", 
"snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") + + training_metadata = fread(here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] + test_metadata = setnames(fread(here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), + old = c("image", "patient", "anatom_site_general"), + new = c("image_name", "patient_id", "anatom_site_general_challenge") + )[, split := "test"] + metadata = rbind(training_metadata, test_metadata, fill = TRUE) + + melanoma_ds_generator = torch::dataset( + initialize = function() { + self$.metadata = metadata + self$.path = hf_dataset_path + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } + ) + + melanoma_ds = melanoma_ds_generator() + + dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) + lt = lazy_tensor(dd) + + return(cbind(metadata, data.table(image = lt))) +} + +melanoma_dt = constructor_melanoma(hf_cache_dir) + +melanoma_dt[, image_name := NULL] +melanoma_dt[, target := NULL] + +# change the encodings of variables: diagnosis, benign_malignant +melanoma_dt[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] + +char_features = c("sex", "anatom_site_general_challenge") +melanoma_dt[, (char_features) := lapply(.SD, factor), .SDcols = char_features] + +tsk_melanoma = as_task_classif(melanoma_dt, target = "benign_malignant", id = "melanoma") +tsk_melanoma$set_col_roles("patient_id", "group") +tsk_melanoma$col_roles$feature = c(char_features, "age_approx", "image") + +ci = col_info(tsk_melanoma$backend) + +saveRDS(ci, here::here("inst/col_info/melanoma.rds")) \ No newline at end of file diff --git a/data-raw/tiny_imagenet.R b/data-raw/tiny_imagenet.R index 99ad06da..a1ddce13 100644 --- a/data-raw/tiny_imagenet.R +++ b/data-raw/tiny_imagenet.R @@ 
-2,12 +2,4 @@ devtools::load_all() ci = col_info(get_private(tsk("tiny_imagenet")$backend)$.constructor()) -saveRDS(ci, here::here("inst/col_info/tiny_imagenet.rds")) - -mlr3:::DataBackendCbind$new(c) - - -split = factor(rep(c("train", "valid", "test"), times = c(100000, 10000, 10000))) - -ci = rbind(ci, data.table(id = "split", type = "factor", levels = levels(split))) -setkeyv(ci) +saveRDS(ci, here::here("inst/col_info/tiny_imagenet.rds")) \ No newline at end of file diff --git a/inst/col_info/melanoma.rds b/inst/col_info/melanoma.rds new file mode 100644 index 0000000000000000000000000000000000000000..52d1d4b0fde938d7cb28a81d7cabf5203fd329bb GIT binary patch literal 412 zcmV;N0b~9jiwFP!000001C^3ZPQx$|h217;NeiN=5<4U=K)U4woB$+t(KMN)7LFa+ zhPJG@G~1R*YWF89i!L17FP`UbJo7$cj0G$Vdn|}($)dN{m#1gartJ^LCN#z@WHHTG zNut#<<+2G zN@ck5Lcs;55-J}V@e1}fu|UI>Ji@8La+LO<1O`1>WNN`}C!ryi7E9FUw}S#QtIqFY zN=lP}{|aLP-|N((G#3Q@dkJIHmYcDS8%-WVcaZsj1wuZPn0+|mR;VTFM_gHr6}M|A zUn{E1jOxQJ2+v!CsLaeLdRG5tgkqGp`mE~{s~tk$lkbY^K@PP&5N|s4L5F@&Acgtz zhV8mN9rkJNd8nozOvj?XFL`L!wYL}u{6b+m1~K(;;!SRv;2-*Ix7*+DO_>1WZ}NGL za Date: Fri, 8 Nov 2024 15:08:20 +0100 Subject: [PATCH 17/46] caching does not seem to work --- R/TaskClassif_melanoma.R | 20 +++++++++++--------- tests/testthat/test_TaskClassif_melanoma.R | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 8903a6be..0f57630a 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -77,16 +77,17 @@ load_task_melanoma = function(id = "melanoma") { data = cached(constructor_melanoma, "datasets", "melanoma")$data # remove irrelevant cols: image_name, target - data[, image_name := NULL] - data[, target := NULL] + data[, (image_name) := NULL] + data[, (target) := NULL] # change the encodings of variables: diagnosis, benign_malignant - data[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] + data[, (benign_malignant) := 
factor(benign_malignant, levels = c("benign", "malignant"))] - char_features = c("sex", "anatom_site_general_challenge", "diagnosis") - data[, lapply(.SD, factor), .SDcols = char_features] + char_features = c("sex", "anatom_site_general_challenge") + data[, (char_features) := lapply(.SD, factor), .SDcols = char_features] - dt = cbind(data, + dt = cbind( + data, data.table( ..row_id = seq_along(data$benign_malignant) ) @@ -105,13 +106,14 @@ load_task_melanoma = function(id = "melanoma") { task = TaskClassif$new( backend = backend, id = "melanoma", - target = "target", + target = "benign_malignant", label = "Melanoma classification" ) + task$set_col_roles("patient_id", "group") + task$col_roles$feature = c("sex", "anatom_site_general_challenge", "age_approx", "image") + backend$hash = task$man = "mlr3torch::mlr_tasks_melanoma" - - task$set_col_roles("patient_id", roles = "group") task$filter(1:32701) diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index c986f6fd..0761c6b4 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -7,8 +7,8 @@ test_that("melanoma task works", { task$row_roles$use = 1:10 expect_equal(task$id, "melanoma") expect_equal(task$label, "Melanoma classification") - expect_equal(task$feature_names, "image") - expect_equal(task$target_names, "target") + expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) + expect_equal(task$target_names, "benign_malignant") expect_equal(task$man, "mlr3torch::mlr_tasks_melanoma") expect_equal(task$properties, "twoclass") From 6ec3b21a2fcf876ab18b3a9cb705ba983adaaaf2 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 8 Nov 2024 16:40:31 +0100 Subject: [PATCH 18/46] caching does not work --- data-raw/melanoma.R | 2 ++ tests/testthat/test_TaskClassif_melanoma.R | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/data-raw/melanoma.R 
b/data-raw/melanoma.R index a7186a32..411e476d 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -68,6 +68,8 @@ tsk_melanoma = as_task_classif(melanoma_dt, target = "benign_malignant", id = "m tsk_melanoma$set_col_roles("patient_id", "group") tsk_melanoma$col_roles$feature = c(char_features, "age_approx", "image") +tsk_melanoma$label = "Melanoma classification" + ci = col_info(tsk_melanoma$backend) saveRDS(ci, here::here("inst/col_info/melanoma.rds")) \ No newline at end of file diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index 0761c6b4..ffd26bc0 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -10,7 +10,7 @@ test_that("melanoma task works", { expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) expect_equal(task$target_names, "benign_malignant") expect_equal(task$man, "mlr3torch::mlr_tasks_melanoma") - expect_equal(task$properties, "twoclass") + expect_equal(task$properties, c("twoclass", "groups")) x = materialize(task$data(task$row_ids[1:2], cols = "image")[[1L]], rbind = TRUE) expect_equal(x$shape, c(2, 3, 128, 128)) From 9e8ebb4360d718d4d45dc466a8929439db43d3c0 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Sun, 10 Nov 2024 20:44:42 +0100 Subject: [PATCH 19/46] manually set a different cache dir --- DESCRIPTION | 1 + R/TaskClassif_melanoma.R | 2 +- attic/requirements-resize_melanoma.txt | 4 +++ tests/temp-TaskClassif_melanoma.R | 39 ---------------------- tests/testthat/test_TaskClassif_melanoma.R | 2 +- 5 files changed, 7 insertions(+), 41 deletions(-) create mode 100644 attic/requirements-resize_melanoma.txt delete mode 100644 tests/temp-TaskClassif_melanoma.R diff --git a/DESCRIPTION b/DESCRIPTION index 8d87a397..ea244e31 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -58,6 +58,7 @@ Suggests: callr, future, ggplot2, + hfhub, igraph, jsonlite, knitr, diff --git 
a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 0f57630a..33975721 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -89,7 +89,7 @@ load_task_melanoma = function(id = "melanoma") { dt = cbind( data, data.table( - ..row_id = seq_along(data$benign_malignant) + ..row_id = seq_along(data$lesion_id) ) ) diff --git a/attic/requirements-resize_melanoma.txt b/attic/requirements-resize_melanoma.txt new file mode 100644 index 00000000..41f33a3f --- /dev/null +++ b/attic/requirements-resize_melanoma.txt @@ -0,0 +1,4 @@ +altgraph @ file:///AppleInternal/Library/BuildRoots/226e9c8c-edb1-11ee-8f17-a65dcee5a99e/Library/Caches/com.apple.xbs/Sources/python3/altgraph-0.17.2-py2.py3-none-any.whl +torch==2.5.0 +torchvision==0.20.0 +tqdm==4.66.1 \ No newline at end of file diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R deleted file mode 100644 index 2794dffd..00000000 --- a/tests/temp-TaskClassif_melanoma.R +++ /dev/null @@ -1,39 +0,0 @@ -library(torch) -library(torchvision) -library(mlr3torch) -library(here) - -library(data.table) -library(tidytable) - -# TODO: figure out whether we want the v2 file -training_metadata_v2 = fread(here::here("cache", "ISIC_2020_Training_GroundTruth_v2.csv")) - -cache_dir = here("cache") -# construct a torch dataset -ds = torch::dataset( - initialize = function() { - self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv")) - self$.path = file.path(here(cache_dir), "train") - }, - .getitem = function(idx) { - force(idx) - - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) - x = torchvision::transform_to_tensor(x) - - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } -) - -melanoma_ds = ds() - -dd = as_data_descriptor(melanoma_ds, list(x = NULL)) -lt = lazy_tensor(dd) -dt_train = cbind(training_metadata_v2, data.table(x = lt)) -# as_task_regr(dt_train, target = "corr", id = 
"guess_the_correlation") - -training_duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index ffd26bc0..7994c5f0 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -1,7 +1,7 @@ skip_on_cran() test_that("melanoma task works", { - withr::local_options(mlr3torch.cache = TRUE) + withr::local_options(mlr3torch.cache = here::here("cache2")) task = tsk("melanoma") # this makes the test faster task$row_roles$use = 1:10 From a7c655d1d1e343acec5dabc6f2cbc3db4bcaece4 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Tue, 12 Nov 2024 12:09:04 +0100 Subject: [PATCH 20/46] using extrasmall version for testing --- R/TaskClassif_melanoma.R | 19 +++++++------ attic/temp-test_melanoma.R | 31 ++++++++++++++++++++++ data-raw/melanoma.R | 27 +++++++++++-------- tests/testthat/test_TaskClassif_melanoma.R | 13 +++++---- 4 files changed, 66 insertions(+), 24 deletions(-) create mode 100644 attic/temp-test_melanoma.R diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 33975721..f4e5eabb 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -34,14 +34,16 @@ constructor_melanoma = function(path) { ) withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { - hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") + hfhub::hub_snapshot("carsonzhang/ISIC_2020_extrasmall", repo_type = "dataset") }) - hf_dataset_path = here(path, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") + hf_dataset_parent_path = here::here(path, "raw", "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") + # there should only be a single directory whose name is a hash value, this avoids hard-coding it + hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) - training_metadata = 
fread(here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] - test_metadata = setnames(fread(here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), - old = c("image", "patient", "anatom_site_general"), + training_metadata = fread(here::here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] + test_metadata = setnames(fread(here::here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), + old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] metadata = rbind(training_metadata, test_metadata, fill = TRUE) @@ -89,7 +91,7 @@ load_task_melanoma = function(id = "melanoma") { dt = cbind( data, data.table( - ..row_id = seq_along(data$lesion_id) + ..row_id = seq_len(nrow(data)) ) ) @@ -98,7 +100,8 @@ load_task_melanoma = function(id = "melanoma") { backend = DataBackendLazy$new( constructor = cached_constructor, - rownames = seq_len(32701 + 10982), + # rownames = seq_len(32701 + 10982), + rownames = seq_len(18), col_info = load_col_info("melanoma"), primary_key = "..row_id" ) @@ -115,7 +118,7 @@ load_task_melanoma = function(id = "melanoma") { backend$hash = task$man = "mlr3torch::mlr_tasks_melanoma" - task$filter(1:32701) + # task$filter(1:32701) return(task) } diff --git a/attic/temp-test_melanoma.R b/attic/temp-test_melanoma.R new file mode 100644 index 00000000..8c22d084 --- /dev/null +++ b/attic/temp-test_melanoma.R @@ -0,0 +1,31 @@ +library(data.table) +library(here) +library(tidytable) + +withr::local_options(mlr3torch.cache = TRUE) + +load_col_info("melanoma") + +task = tsk("melanoma") +# this makes the test faster +task$row_roles$use = 1:10 +expect_equal(task$id, "melanoma") +expect_equal(task$label, "Melanoma classification") +expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) +expect_equal(task$target_names, "benign_malignant") +expect_equal(task$man, 
"mlr3torch::mlr_tasks_melanoma") +expect_equal(task$properties, c("twoclass", "groups")) + +x = materialize(task$data(task$row_ids[1:2], cols = "image")[[1L]], rbind = TRUE) +expect_equal(x$shape, c(2, 3, 128, 128)) +expect_equal(x$dtype, torch_float32()) + +training_metadata = fread(here::here("cache", "hf_dataset", "train", "ISIC_2020_Training_GroundTruth_v2.csv")) +training_metadata_extrasmall = training_metadata |> + filter(file_name %in% list.files(here("cache", "hf_dataset", "train"), pattern = ".jpg$", recursive = TRUE)) +fwrite(training_metadata_extrasmall, here("cache", "hf_dataset", "train", "ISIC_2020_Training_GroundTruth_v2.csv")) + +test_metadata = fread(here::here("cache", "hf_dataset", "ISIC_2020_Test_Input", "ISIC_2020_Test_Metadata.csv")) +test_metadata_extrasmall = test_metadata |> + filter(file_name %in% list.files(here("cache", "hf_dataset", "ISIC_2020_Test_Input"), pattern = ".jpg$", recursive = TRUE)) +fwrite(test_metadata_extrasmall, here("cache", "hf_dataset", "ISIC_2020_Test_Input", "ISIC_2020_Test_Metadata.csv")) diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index 411e476d..0b07ce07 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -4,9 +4,13 @@ devtools::load_all() library(here) library(data.table) -hf_cache_dir = here::here("cache", "hf_downloaded") +withr::local_options(mlr3torch.cache = TRUE) -hf_dataset_path = here::here(hf_cache_dir, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") +# hf_cache_dir = here::here("cache2") + +# hf_dataset_parent_path = here::here(hf_cache_dir, "raw", "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") +# there should only be a single directory whose name is a hash value, this avoids hard-coding it +# hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) constructor_melanoma = function(path) { file_names = c( @@ -14,15 +18,16 @@ constructor_melanoma = function(path) { 
"ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" ) - # withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { - # hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") - # }) - - hf_dataset_path = here(path, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") + withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { + hfhub::hub_snapshot("carsonzhang/ISIC_2020_extrasmall", repo_type = "dataset") + }) + hf_dataset_parent_path = here::here(path, "raw", "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") + # there should only be a single directory whose name is a hash value, this avoids hard-coding it + hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) training_metadata = fread(here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] - test_metadata = setnames(fread(here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), - old = c("image", "patient", "anatom_site_general"), + test_metadata = setnames(fread(here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), + old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] metadata = rbind(training_metadata, test_metadata, fill = TRUE) @@ -53,7 +58,7 @@ constructor_melanoma = function(path) { return(cbind(metadata, data.table(image = lt))) } -melanoma_dt = constructor_melanoma(hf_cache_dir) +melanoma_dt = constructor_melanoma(get_cache_dir()) melanoma_dt[, image_name := NULL] melanoma_dt[, target := NULL] @@ -72,4 +77,4 @@ tsk_melanoma$label = "Melanoma classification" ci = col_info(tsk_melanoma$backend) -saveRDS(ci, here::here("inst/col_info/melanoma.rds")) \ No newline at end of file +saveRDS(ci, here::here("inst/col_info/melanoma.rds")) diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index 7994c5f0..e0bdee1f 100644 --- 
a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -1,10 +1,10 @@ skip_on_cran() test_that("melanoma task works", { - withr::local_options(mlr3torch.cache = here::here("cache2")) + withr::local_options(mlr3torch.cache = TRUE) task = tsk("melanoma") # this makes the test faster - task$row_roles$use = 1:10 + task$filter(1:10) expect_equal(task$id, "melanoma") expect_equal(task$label, "Melanoma classification") expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) @@ -12,7 +12,10 @@ test_that("melanoma task works", { expect_equal(task$man, "mlr3torch::mlr_tasks_melanoma") expect_equal(task$properties, c("twoclass", "groups")) - x = materialize(task$data(task$row_ids[1:2], cols = "image")[[1L]], rbind = TRUE) - expect_equal(x$shape, c(2, 3, 128, 128)) - expect_equal(x$dtype, torch_float32()) + task$data() + + expect_true("datasets--carsonzhang--ISIC_2020_small" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma", "raw"))) + expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma"))) + expect_equal(task$backend$nrow, 32701 + 10982) + expect_equal(task$backend$ncol, 7) }) \ No newline at end of file From 0f9f54775a35f47dac414555c726e8d451da7cde Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Tue, 12 Nov 2024 14:18:30 +0100 Subject: [PATCH 21/46] looks like caching works --- R/TaskClassif_melanoma.R | 3 +-- data-raw/melanoma.R | 12 ++++++------ tests/testthat/test_TaskClassif_melanoma.R | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index f4e5eabb..edd7722b 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -36,8 +36,7 @@ constructor_melanoma = function(path) { withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { hfhub::hub_snapshot("carsonzhang/ISIC_2020_extrasmall", repo_type = "dataset") }) - - hf_dataset_parent_path = here::here(path, 
"raw", "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") + hf_dataset_parent_path = here::here(path, "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") # there should only be a single directory whose name is a hash value, this avoids hard-coding it hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index 0b07ce07..d2fe3134 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -1,8 +1,8 @@ devtools::load_all() # manually construct the task once -library(here) -library(data.table) +# library(here) +# library(data.table) withr::local_options(mlr3torch.cache = TRUE) @@ -21,12 +21,12 @@ constructor_melanoma = function(path) { withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { hfhub::hub_snapshot("carsonzhang/ISIC_2020_extrasmall", repo_type = "dataset") }) - hf_dataset_parent_path = here::here(path, "raw", "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") + hf_dataset_parent_path = here::here(path, "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") # there should only be a single directory whose name is a hash value, this avoids hard-coding it hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) - training_metadata = fread(here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] - test_metadata = setnames(fread(here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), + training_metadata = fread(here::here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] + test_metadata = setnames(fread(here::here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] @@ -58,7 +58,7 @@ constructor_melanoma = function(path) { return(cbind(metadata, data.table(image = lt))) } -melanoma_dt = 
constructor_melanoma(get_cache_dir()) +melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) melanoma_dt[, image_name := NULL] melanoma_dt[, target := NULL] diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index e0bdee1f..08bf75cc 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -4,7 +4,7 @@ test_that("melanoma task works", { withr::local_options(mlr3torch.cache = TRUE) task = tsk("melanoma") # this makes the test faster - task$filter(1:10) + # task$filter(1:10) expect_equal(task$id, "melanoma") expect_equal(task$label, "Melanoma classification") expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) @@ -17,5 +17,5 @@ test_that("melanoma task works", { expect_true("datasets--carsonzhang--ISIC_2020_small" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma", "raw"))) expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma"))) expect_equal(task$backend$nrow, 32701 + 10982) - expect_equal(task$backend$ncol, 7) + expect_equal(task$backend$ncol, 5) }) \ No newline at end of file From 10f5da9631552dc82dbe4cc19ecb6209b7c5e3c7 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Tue, 26 Nov 2024 11:47:26 +0100 Subject: [PATCH 22/46] looks ok but download is slooow --- DESCRIPTION | 1 + R/TaskClassif_melanoma.R | 61 +++++++++++++++------- R/bibentries.R | 9 ++++ data-raw/melanoma.R | 47 +++++++++++------ man/mlr_learners_torch_image.Rd | 2 +- tests/testthat/test_TaskClassif_melanoma.R | 9 ++-- 6 files changed, 90 insertions(+), 39 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ea244e31..dca5d370 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -127,6 +127,7 @@ Collate: 'PipeOpTorchReshape.R' 'PipeOpTorchSoftmax.R' 'TaskClassif_lazy_iris.R' + 'TaskClassif_melanoma.R' 'TaskClassif_mnist.R' 'TaskClassif_tiny_imagenet.R' 'TorchDescriptor.R' 
diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index edd7722b..e01b20b7 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -19,7 +19,7 @@ #' `r rd_info_task_torch("melanoma", missings = FALSE)` #' #' @references -#' `r format_bib("melanoma")` +#' `r format_bib("melanoma2021")` #' @examplesIf torch::torch_is_installed() #' task = tsk("melanoma") #' task @@ -28,20 +28,44 @@ NULL # @param path (`character(1)`)\cr # The cache_dir/datasets/melanoma folder constructor_melanoma = function(path) { - file_names = c( - "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", - "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" + # file_names = c( + # "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", + # "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" + # ) + withr::local_options(mlr3torch.cache = TRUE) + path = file.path(get_cache_dir(), "datasets", "melanoma") + base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" + + training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" + curl::curl_download(paste0(base_url, training_metadata_file_name), file.path(path, training_metadata_file_name)) + training_metadata = fread(here::here(path, training_metadata_file_name)) + + train_dir_names = c("train1", "train2", "train3", "train4") + for (dir in train_dir_names) { + if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) + } + + pmap( + list(paste(base_url, training_metadata$file_name, sep = ""), paste(path, "/", training_metadata$file_name, sep = "")), + curl::curl_download ) - withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { - hfhub::hub_snapshot("carsonzhang/ISIC_2020_extrasmall", repo_type = "dataset") - }) - hf_dataset_parent_path = here::here(path, "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") - # there should only be a single directory whose 
name is a hash value, this avoids hard-coding it - hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) + test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" + curl::curl_download(paste0(base_url, test_metadata_file_name), file.path(path, test_metadata_file_name)) + test_metadata = fread(here::here(path, test_metadata_file_name)) + + test_dir_names = c("ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2") + for (dir in train_dir_names) { + if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) + } + + pmap( + list(paste(base_url, test_metadata$file_name, sep = ""), paste(path, "/", test_metadata$file_name, sep = "")), + curl_download + ) - training_metadata = fread(here::here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] - test_metadata = setnames(fread(here::here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), + training_metadata = training_metadata[, split := "train"] + test_metadata = setnames(test_metadata, old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] @@ -78,11 +102,13 @@ load_task_melanoma = function(id = "melanoma") { data = cached(constructor_melanoma, "datasets", "melanoma")$data # remove irrelevant cols: image_name, target - data[, (image_name) := NULL] - data[, (target) := NULL] + print(names(data)) + # if ("image_name" %in% names(data)) data[, image_name := NULL] + data[, image_name := NULL] + data[, target := NULL] # change the encodings of variables: diagnosis, benign_malignant - data[, (benign_malignant) := factor(benign_malignant, levels = c("benign", "malignant"))] + data[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] char_features = c("sex", "anatom_site_general_challenge") data[, (char_features) := lapply(.SD, factor), .SDcols = char_features] @@ -99,8 +125,7 @@ load_task_melanoma = function(id = "melanoma") { backend = 
DataBackendLazy$new( constructor = cached_constructor, - # rownames = seq_len(32701 + 10982), - rownames = seq_len(18), + rownames = seq_len(32701 + 10982), col_info = load_col_info("melanoma"), primary_key = "..row_id" ) @@ -117,7 +142,7 @@ load_task_melanoma = function(id = "melanoma") { backend$hash = task$man = "mlr3torch::mlr_tasks_melanoma" - # task$filter(1:32701) + task$filter(1:32701) return(task) } diff --git a/R/bibentries.R b/R/bibentries.R index 68d7d4fa..f9d7fa67 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -112,6 +112,15 @@ bibentries = c(# nolint start booktitle = "Proceedings of the IEEE conference on computer vision and pattern recognition ", pages = "2818--2826 ", year = "2016 " + ), + melanoma2021 = bibentry("article", + title = "A patient-centric dataset of images and metadata for identifying melanomas using clinical context", + author = "Rotemberg, V. and Kurtansky, N. and Betz-Stablein, B. and Caffery, L. and Chousakos, E. and Codella, N. and Combalia, M. and Dusza, S. and Guitera, P. and Gutman, D. and Halpern, A. and Helba, B. and Kittler, H. and Kose, K. and Langer, S. and Lioprys, K. and Malvehy, J. and Musthaq, S. and Nanda, J. and Reiter, O. and Shih, G. and Stratigos, A. and Tschandl, P. and Weber, J. 
and Soyer, P.", + journal = "Scientific Data", + volume = "8", + pages = "34", + year = "2021", + doi = "10.1038/s41597-021-00815-z" ) ) # nolint end diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index d2fe3134..7de49ada 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -6,27 +6,40 @@ devtools::load_all() withr::local_options(mlr3torch.cache = TRUE) -# hf_cache_dir = here::here("cache2") +constructor_melanoma = function(path) { + # path = file.path(get_cache_dir(), "datasets", "melanoma") + base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" -# hf_dataset_parent_path = here::here(hf_cache_dir, "raw", "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") -# there should only be a single directory whose name is a hash value, this avoids hard-coding it -# hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) + training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" + curl::curl_download(paste0(base_url, training_metadata_file_name), file.path(path, training_metadata_file_name)) + training_metadata = fread(here::here(path, training_metadata_file_name)) -constructor_melanoma = function(path) { - file_names = c( - "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", - "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" + train_dir_names = c("train1", "train2", "train3", "train4") + for (dir in train_dir_names) { + if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) + } + + pmap( + list(paste(base_url, training_metadata$file_name, sep = ""), paste(path, "/", training_metadata$file_name, sep = "")), + curl::curl_download ) - withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { - hfhub::hub_snapshot("carsonzhang/ISIC_2020_extrasmall", repo_type = "dataset") - }) - hf_dataset_parent_path = here::here(path, "datasets--carsonzhang--ISIC_2020_extrasmall", "snapshots") - # there should only be 
a single directory whose name is a hash value, this avoids hard-coding it - hf_dataset_path = here::here(hf_dataset_parent_path, list.files(hf_dataset_parent_path)) + test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" + curl::curl_download(paste0(base_url, test_metadata_file_name), file.path(path, test_metadata_file_name)) + test_metadata = fread(here::here(path, test_metadata_file_name)) + + test_dir_names = c("ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2") + for (dir in train_dir_names) { + if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) + } + + pmap( + list(paste(base_url, test_metadata$file_name, sep = ""), paste(path, "/", test_metadata$file_name, sep = "")), + curl::curl_download + ) - training_metadata = fread(here::here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] - test_metadata = setnames(fread(here::here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), + training_metadata = training_metadata[, split := "train"] + test_metadata = setnames(test_metadata, old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] @@ -58,6 +71,7 @@ constructor_melanoma = function(path) { return(cbind(metadata, data.table(image = lt))) } +bench::system_time(melanoma_dt <- constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma"))) melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) melanoma_dt[, image_name := NULL] @@ -78,3 +92,4 @@ tsk_melanoma$label = "Melanoma classification" ci = col_info(tsk_melanoma$backend) saveRDS(ci, here::here("inst/col_info/melanoma.rds")) +saveRDS(melanoma_) diff --git a/man/mlr_learners_torch_image.Rd b/man/mlr_learners_torch_image.Rd index fce0bef7..af2b854d 100644 --- a/man/mlr_learners_torch_image.Rd +++ b/man/mlr_learners_torch_image.Rd @@ -64,7 +64,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. 
optimizer = NULL, loss = NULL, callbacks = list(), - packages = c("torchvision"), + packages = c("torchvision", "magick"), man, properties = NULL, predict_types = NULL diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index 08bf75cc..3b89ab51 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -14,8 +14,9 @@ test_that("melanoma task works", { task$data() - expect_true("datasets--carsonzhang--ISIC_2020_small" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma", "raw"))) - expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma"))) + # expect_true("datasets--carsonzhang--ISIC_2020_small" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma", "raw"))) + # expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma"))) expect_equal(task$backend$nrow, 32701 + 10982) - expect_equal(task$backend$ncol, 5) -}) \ No newline at end of file + expect_equal(task$backend$ncol, 11) + expect_equal(task$ncol, 5) +}) From 1252718e029a3e606b2133f258bc926fd06937f7 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Thu, 28 Nov 2024 22:54:13 +0100 Subject: [PATCH 23/46] looks ok --- R/TaskClassif_melanoma.R | 41 +++++------------ data-raw/melanoma.R | 52 ++++++++++------------ man/mlr_tasks_melanoma.Rd | 52 ++++++++++++++++++++++ tests/testthat/test_TaskClassif_melanoma.R | 6 +-- 4 files changed, 89 insertions(+), 62 deletions(-) create mode 100644 man/mlr_tasks_melanoma.Rd diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index e01b20b7..5bc0391c 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -28,41 +28,22 @@ NULL # @param path (`character(1)`)\cr # The cache_dir/datasets/melanoma folder constructor_melanoma = function(path) { - # file_names = c( - # "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", - # 
"ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" - # ) - withr::local_options(mlr3torch.cache = TRUE) - path = file.path(get_cache_dir(), "datasets", "melanoma") + # should happen automatically, but this is needed for curl to work + # fs::dir_create(path, recurse = TRUE) + base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" - training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" - curl::curl_download(paste0(base_url, training_metadata_file_name), file.path(path, training_metadata_file_name)) - training_metadata = fread(here::here(path, training_metadata_file_name)) + compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" - train_dir_names = c("train1", "train2", "train3", "train4") - for (dir in train_dir_names) { - if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) - } - - pmap( - list(paste(base_url, training_metadata$file_name, sep = ""), paste(path, "/", training_metadata$file_name, sep = "")), - curl::curl_download - ) + curl::curl_download(paste0(base_url, compressed_tarball_file_name), file.path(path, compressed_tarball_file_name)) - test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" - curl::curl_download(paste0(base_url, test_metadata_file_name), file.path(path, test_metadata_file_name)) - test_metadata = fread(here::here(path, test_metadata_file_name)) + utils::untar(file.path(path, compressed_tarball_file_name), exdir = path) - test_dir_names = c("ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2") - for (dir in train_dir_names) { - if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) - } + training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" + training_metadata = data.table::fread(here::here(path, training_metadata_file_name)) - pmap( - list(paste(base_url, test_metadata$file_name, sep = ""), paste(path, "/", test_metadata$file_name, sep = "")), - curl_download - ) + test_metadata_file_name = 
"ISIC_2020_Test_Metadata.csv" + test_metadata = data.table::fread(here::here(path, test_metadata_file_name)) training_metadata = training_metadata[, split := "train"] test_metadata = setnames(test_metadata, @@ -74,7 +55,7 @@ constructor_melanoma = function(path) { melanoma_ds_generator = torch::dataset( initialize = function() { self$.metadata = metadata - self$.path = hf_dataset_path + self$.path = path }, .getitem = function(idx) { force(idx) diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index 7de49ada..4aaf0e12 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -3,40 +3,34 @@ devtools::load_all() # manually construct the task once # library(here) # library(data.table) - +library(data.table) withr::local_options(mlr3torch.cache = TRUE) +unzip2 <- function(path, exdir) { + if (grepl("linux", R.version$os)) { + utils::unzip(path, exdir = exdir) + } else { + zip::unzip(path, exdir = exdir) + } +} + constructor_melanoma = function(path) { - # path = file.path(get_cache_dir(), "datasets", "melanoma") + # should happen automatically, but this is needed for curl to work + fs::dir_create(path, recurse = TRUE) + base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" - training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" - curl::curl_download(paste0(base_url, training_metadata_file_name), file.path(path, training_metadata_file_name)) - training_metadata = fread(here::here(path, training_metadata_file_name)) + compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" - train_dir_names = c("train1", "train2", "train3", "train4") - for (dir in train_dir_names) { - if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) - } + curl::curl_download(paste0(base_url, compressed_tarball_file_name), file.path(path, compressed_tarball_file_name)) - pmap( - list(paste(base_url, training_metadata$file_name, sep = ""), paste(path, "/", training_metadata$file_name, sep = "")), - curl::curl_download - ) + 
untar(file.path(path, compressed_tarball_file_name), exdir = path) - test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" - curl::curl_download(paste0(base_url, test_metadata_file_name), file.path(path, test_metadata_file_name)) - test_metadata = fread(here::here(path, test_metadata_file_name)) - - test_dir_names = c("ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2") - for (dir in train_dir_names) { - if (!dir.exists(file.path(path, dir))) dir.create(file.path(path, dir)) - } + training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" + training_metadata = data.table::fread(here::here(path, training_metadata_file_name)) - pmap( - list(paste(base_url, test_metadata$file_name, sep = ""), paste(path, "/", test_metadata$file_name, sep = "")), - curl::curl_download - ) + test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" + test_metadata = data.table::fread(here::here(path, test_metadata_file_name)) training_metadata = training_metadata[, split := "train"] test_metadata = setnames(test_metadata, @@ -48,7 +42,7 @@ constructor_melanoma = function(path) { melanoma_ds_generator = torch::dataset( initialize = function() { self$.metadata = metadata - self$.path = hf_dataset_path + self$.path = path }, .getitem = function(idx) { force(idx) @@ -71,8 +65,11 @@ constructor_melanoma = function(path) { return(cbind(metadata, data.table(image = lt))) } +# path = file.path(here::here("cache"), "datasets", "melanoma") +# fs::dir_create(path, recurse = TRUE) + bench::system_time(melanoma_dt <- constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma"))) -melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) +# melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) melanoma_dt[, image_name := NULL] melanoma_dt[, target := NULL] @@ -92,4 +89,3 @@ tsk_melanoma$label = "Melanoma classification" ci = col_info(tsk_melanoma$backend) saveRDS(ci, here::here("inst/col_info/melanoma.rds")) 
-saveRDS(melanoma_) diff --git a/man/mlr_tasks_melanoma.Rd b/man/mlr_tasks_melanoma.Rd new file mode 100644 index 00000000..55839d36 --- /dev/null +++ b/man/mlr_tasks_melanoma.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TaskClassif_melanoma.R +\name{mlr_tasks_melanoma} +\alias{mlr_tasks_melanoma} +\title{Melanoma Image classification} +\source{ +\url{https://challenge2020.isic-archive.com/} +} +\description{ +Classification of melanoma tumor images. + +The data comes from the 2020 SIIM-ISIC challenge. +} +\section{Construction}{ + + +\if{html}{\out{
}}\preformatted{tsk("melanoma") +}\if{html}{\out{
}} +} + +\section{Download}{ + +The \link[mlr3:Task]{task}'s backend is a \code{\link{DataBackendLazy}} which will download the data once it is requested. +Other meta-data is already available before that. +You can cache these datasets by setting the \code{mlr3torch.cache} option to \code{TRUE} or to a specific path to be used +as the cache directory. +} + +\section{Properties}{ + +\itemize{ +\item Task type: \dQuote{classif} +\item Properties: \dQuote{twoclass}, \dQuote{groups} +\item Has Missings: no +\item Target: \dQuote{benign_malignant} +\item Features: \dQuote{sex}, \dQuote{anatom_site_general_challenge}, \dQuote{age_approx}, \dQuote{image} +\item Data Dimension: 43683x11 +} +} + +\examples{ +\dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +task = tsk("melanoma") +task +\dontshow{\}) # examplesIf} +} +\references{ +Rotemberg, V., Kurtansky, N., Betz-Stablein, B., Caffery, L., Chousakos, E., Codella, N., Combalia, M., Dusza, S., Guitera, P., Gutman, D., Halpern, A., Helba, B., Kittler, H., Kose, K., Langer, S., Lioprys, K., Malvehy, J., Musthaq, S., Nanda, J., Reiter, O., Shih, G., Stratigos, A., Tschandl, P., Weber, J., Soyer, P. (2021). +\dQuote{A patient-centric dataset of images and metadata for identifying melanomas using clinical context.} +\emph{Scientific Data}, \bold{8}, 34. +\doi{10.1038/s41597-021-00815-z}. 
+} diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index 3b89ab51..380612f1 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -3,8 +3,7 @@ skip_on_cran() test_that("melanoma task works", { withr::local_options(mlr3torch.cache = TRUE) task = tsk("melanoma") - # this makes the test faster - # task$filter(1:10) + expect_equal(task$id, "melanoma") expect_equal(task$label, "Melanoma classification") expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) @@ -14,8 +13,7 @@ test_that("melanoma task works", { task$data() - # expect_true("datasets--carsonzhang--ISIC_2020_small" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma", "raw"))) - # expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma"))) + expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma"))) expect_equal(task$backend$nrow, 32701 + 10982) expect_equal(task$backend$ncol, 11) expect_equal(task$ncol, 5) From 681779eab055bd6f9b6b3c984b66f35781eebcfa Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Thu, 28 Nov 2024 23:03:41 +0100 Subject: [PATCH 24/46] removed manual cache dirs, references to irrelevant files in gitignore. 
also delete the tarball after done using --- .gitignore | 4 +- R/TaskClassif_melanoma.R | 11 ++-- benchmarks/dataset.R | 11 ---- .../image_loaders/benchmark_image_loaders.R | 63 ------------------- benchmarks/merge.R | 30 --------- 5 files changed, 5 insertions(+), 114 deletions(-) delete mode 100644 benchmarks/dataset.R delete mode 100644 benchmarks/image_loaders/benchmark_image_loaders.R delete mode 100644 benchmarks/merge.R diff --git a/.gitignore b/.gitignore index e48dd286..abe618df 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,4 @@ inst/doc /doc/ /Meta/ CRAN-SUBMISSION -paper/data -cache/ -benchmarks/data \ No newline at end of file +paper/data \ No newline at end of file diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 5bc0391c..17ad9dec 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -28,16 +28,13 @@ NULL # @param path (`character(1)`)\cr # The cache_dir/datasets/melanoma folder constructor_melanoma = function(path) { - # should happen automatically, but this is needed for curl to work - # fs::dir_create(path, recurse = TRUE) - base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" - - curl::curl_download(paste0(base_url, compressed_tarball_file_name), file.path(path, compressed_tarball_file_name)) - - utils::untar(file.path(path, compressed_tarball_file_name), exdir = path) + compressed_tarball_path = file.path(path, compressed_tarball_file_name) + curl::curl_download(paste0(base_url, compressed_tarball_file_name), compressed_tarball_path) + utils::untar(compressed_tarball_path, exdir = path) + file.remove(compressed_tarball_path) training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" training_metadata = data.table::fread(here::here(path, training_metadata_file_name)) diff --git a/benchmarks/dataset.R b/benchmarks/dataset.R deleted file mode 100644 index bc227ea4..00000000 --- a/benchmarks/dataset.R 
+++ /dev/null @@ -1,11 +0,0 @@ -devtools::load_all("~/mlr/mlr3") -devtools::load_all("~/mlr/mlr3torch") - -lazy_iris = tsk("lazy_iris") -dt = lazy_iris$data(cols = "x")$x -dataset = dt[[1L]][[2L]]$dataset - -dt = do.call(c, args = lapply(1:1000, function(i) dt)) - - -profvis::profvis({materialize_internal(dt, rbind = TRUE)}) diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R deleted file mode 100644 index cf56a33a..00000000 --- a/benchmarks/image_loaders/benchmark_image_loaders.R +++ /dev/null @@ -1,63 +0,0 @@ -library(torch) -library(torchvision) -library(mlr3torch) -library(here) - -library(data.table) -setDTthreads(threads = 1) - -training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) - -# hard-coded cache directory that I use locally -cache_dir = here("cache") - -ds_base_loader = torch::dataset( - initialize = function(n_images) { - self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))[1:n_images, ] - self$.path = file.path(here(cache_dir), "train") - }, - .getitem = function(idx) { - force(idx) - - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) - x = torchvision::transform_to_tensor(x) - - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } -) - -ds_magick_loader = torch::dataset( - initialize = function(n_images) { - self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))[1:n_images, ] - self$.path = file.path(here(cache_dir), "train") - }, - .getitem = function(idx) { - force(idx) - - image_name = self$.metadata[idx, ]$image_name - - x = magick::image_read(file.path(self$.path, paste0(image_name, ".jpg"))) - x = torchvision::transform_to_tensor(x) - - return(list(x = x, image_name = image_name)) - }, - .length = function() { - nrow(self$.metadata) - } -) - -n_images = 10 - -ds_base = ds_base_loader(n_images) -ds_magick = 
ds_magick_loader(n_images) - -bmr = bench::mark( - for (i in 1:n_images) ds_base$.getitem(i), - for (i in 1:n_images) ds_magick$.getitem(i), - memory = FALSE -) - -print(bmr) \ No newline at end of file diff --git a/benchmarks/merge.R b/benchmarks/merge.R deleted file mode 100644 index d03c1804..00000000 --- a/benchmarks/merge.R +++ /dev/null @@ -1,30 +0,0 @@ -# Just a simple benchmark that compares different approaches to merging tensors. - -library(torch) -n1 = 1000 -n2 = 1 -n3 = 3000 - -compare = function(..., n) { - - # generate the input tensors - tensors = lapply(seq(n), function(i) torch_randn(..., device = "cpu")) - - merge_stack = function(...) { - torch_sum(torch_stack(torch_broadcast_tensors(list(...)), dim = 1L), dim = 1L) - } - - merge_reduce = function(...) { - Reduce(torch_add, list(...)) - } - - bench::mark( - stack = do.call(merge_stack, args = tensors), - reduce = do.call(merge_reduce, args = tensors) - ) -} - -compare(100, 1, 1, 300, n = 5) -compare(30, 50, 30, n = 5) - -# the reduce approach is faster From a4908c0b2baecad5717c90edcb0bb11192eb10ff Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Thu, 28 Nov 2024 23:05:25 +0100 Subject: [PATCH 25/46] updated description. 
using curl, not hfhub --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index dca5d370..6d6754c0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -56,9 +56,9 @@ Imports: withr Suggests: callr, + curl, future, ggplot2, - hfhub, igraph, jsonlite, knitr, From 3527acd5f038816fa6c59a92598c68e424f9898b Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Tue, 29 Oct 2024 12:10:02 +0100 Subject: [PATCH 26/46] enable byte compilation --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6d6754c0..e88716cc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -75,7 +75,7 @@ Suggests: waldo Config/testthat/edition: 3 NeedsCompilation: no -ByteCompile: no +ByteCompile: yes Encoding: UTF-8 Roxygen: list(markdown = TRUE, r6 = TRUE) RoxygenNote: 7.3.2 From 3ead917b52bfb9f37a45f44cd05fa8fd45ab91da Mon Sep 17 00:00:00 2001 From: cxzhang4 Date: Fri, 8 Nov 2024 10:41:11 +0100 Subject: [PATCH 27/46] Magick to base loader (#299) --- DESCRIPTION | 1 - R/LearnerTorchImage.R | 2 +- R/utils.R | 2 +- man/mlr_learners.torchvision.Rd | 6 +++--- man/mlr_learners_torch_image.Rd | 2 +- tests/testthat/test_LearnerTorchVision.R | 1 - 6 files changed, 6 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e88716cc..0f553b69 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,7 +62,6 @@ Suggests: igraph, jsonlite, knitr, - magick, mlr3tuning (>= 1.0.0), progress, rmarkdown, diff --git a/R/LearnerTorchImage.R b/R/LearnerTorchImage.R index 1601c03e..56f3ed96 100644 --- a/R/LearnerTorchImage.R +++ b/R/LearnerTorchImage.R @@ -31,7 +31,7 @@ LearnerTorchImage = R6Class("LearnerTorchImage", #' @description #' Creates a new instance of this [R6][R6::R6Class] class. 
initialize = function(id, task_type, param_set = ps(), label, optimizer = NULL, loss = NULL, - callbacks = list(), packages = c("torchvision", "magick"), man, properties = NULL, + callbacks = list(), packages = "torchvision", man, properties = NULL, predict_types = NULL) { properties = properties %??% switch(task_type, regr = c(), diff --git a/R/utils.R b/R/utils.R index 2762d5de..2067c651 100644 --- a/R/utils.R +++ b/R/utils.R @@ -178,7 +178,7 @@ dataset_image = dataset("image_dataset", self$uris = uris }, .getitem = function(x) { - list(x = torchvision::transform_to_tensor(magick::image_read(self$uris[x]))) + list(x = torchvision::transform_to_tensor(torchvision::base_loader(self$uris[x]))) }, .length = function() { length(self$uris) diff --git a/man/mlr_learners.torchvision.Rd b/man/mlr_learners.torchvision.Rd index 87883dd9..29acfb0e 100644 --- a/man/mlr_learners.torchvision.Rd +++ b/man/mlr_learners.torchvision.Rd @@ -90,9 +90,9 @@ Krizhevsky, Alex, Sutskever, Ilya, Hinton, E. G (2017). Sandler, Mark, Howard, Andrew, Zhu, Menglong, Zhmoginov, Andrey, Chen, Liang-Chieh (2018). \dQuote{Mobilenetv2: Inverted residuals and linear bottlenecks.} In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 4510--4520. -He, Kaiming, Zhang, Xiangyu, Ren, Shaoqing, Sun, Jian (2016). -\dQuote{Deep residual learning for image recognition.} -In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 770--778. +He, Kaiming, Zhang, Xiangyu, Ren, Shaoqing, Sun, Jian (2016 ). +\dQuote{Deep residual learning for image recognition .} +In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition }, 770--778 . Simonyan, Karen, Zisserman, Andrew (2014). 
\dQuote{Very deep convolutional networks for large-scale image recognition.} \emph{arXiv preprint arXiv:1409.1556}.} diff --git a/man/mlr_learners_torch_image.Rd b/man/mlr_learners_torch_image.Rd index af2b854d..fce0bef7 100644 --- a/man/mlr_learners_torch_image.Rd +++ b/man/mlr_learners_torch_image.Rd @@ -64,7 +64,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. optimizer = NULL, loss = NULL, callbacks = list(), - packages = c("torchvision", "magick"), + packages = c("torchvision"), man, properties = NULL, predict_types = NULL diff --git a/tests/testthat/test_LearnerTorchVision.R b/tests/testthat/test_LearnerTorchVision.R index a757bfd7..546c9891 100644 --- a/tests/testthat/test_LearnerTorchVision.R +++ b/tests/testthat/test_LearnerTorchVision.R @@ -17,7 +17,6 @@ test_that("LearnerTorchVision basic checks", { vgg13$id = "a" expect_false(alexnet$phash == vgg13$phash) expect_true("torchvision" %in% alexnet$packages) - expect_true("magick" %in% alexnet$packages) alexnet = lrn("classif.alexnet", optimizer = "sgd", loss = "cross_entropy", callbacks = t_clbk("checkpoint"), epochs = 0, batch_size = 1 From 968c03583bcfee64d6691456eee55f09f9927f9d Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Fri, 8 Nov 2024 10:41:48 +0100 Subject: [PATCH 28/46] update news --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 5aa32a16..ab97ff93 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # mlr3torch dev +* perf: Use a faster image loader * feat: Add parameter `num_interop_threads` to `LearnerTorch` * feat: Add adaptive average pooling From ab7bc64076a259b2daa1a52d17b8ba7286bc4f67 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 08:55:43 +0100 Subject: [PATCH 29/46] Bump JamesIves/github-pages-deploy-action from 4.6.8 to 4.6.9 (#302) Bumps [JamesIves/github-pages-deploy-action](https://github.com/jamesives/github-pages-deploy-action) from 4.6.8 to 
4.6.9. - [Release notes](https://github.com/jamesives/github-pages-deploy-action/releases) - [Commits](https://github.com/jamesives/github-pages-deploy-action/compare/v4.6.8...v4.6.9) --- updated-dependencies: - dependency-name: JamesIves/github-pages-deploy-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yml b/.github/workflows/pkgdown.yml index fbe74a40..fa1a3a07 100644 --- a/.github/workflows/pkgdown.yml +++ b/.github/workflows/pkgdown.yml @@ -45,7 +45,7 @@ jobs: - name: Deploy if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.6.8 + uses: JamesIves/github-pages-deploy-action@v4.6.9 with: clean: false branch: gh-pages From 394de7269c04da32cd5bb4cdccce39d5d1c163f6 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Thu, 21 Nov 2024 12:03:16 +0100 Subject: [PATCH 30/46] improve docs --- R/LearnerTorch.R | 2 +- man/mlr_learners.torchvision.Rd | 6 +++--- man/mlr_learners_torch.Rd | 2 +- man/mlr_learners_torch_image.Rd | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/LearnerTorch.R b/R/LearnerTorch.R index ce2248ab..7536fd03 100644 --- a/R/LearnerTorch.R +++ b/R/LearnerTorch.R @@ -21,7 +21,7 @@ #' After loading a marshaled `LearnerTorch` into R again, you then need to call `$unmarshal()` to transform it #' into a useable state. #' -#' @section Early Stopping and Tuning: +#' @section Early Stopping and Internal Tuning: #' In order to prevent overfitting, the `LearnerTorch` class allows to use early stopping via the `patience` #' and `min_delta` parameters, see the `Learner`'s parameters. 
#' When tuning a `LearnerTorch` it is also possible to combine the explicit tuning via `mlr3tuning` diff --git a/man/mlr_learners.torchvision.Rd b/man/mlr_learners.torchvision.Rd index 29acfb0e..87883dd9 100644 --- a/man/mlr_learners.torchvision.Rd +++ b/man/mlr_learners.torchvision.Rd @@ -90,9 +90,9 @@ Krizhevsky, Alex, Sutskever, Ilya, Hinton, E. G (2017). Sandler, Mark, Howard, Andrew, Zhu, Menglong, Zhmoginov, Andrey, Chen, Liang-Chieh (2018). \dQuote{Mobilenetv2: Inverted residuals and linear bottlenecks.} In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 4510--4520. -He, Kaiming, Zhang, Xiangyu, Ren, Shaoqing, Sun, Jian (2016 ). -\dQuote{Deep residual learning for image recognition .} -In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition }, 770--778 . +He, Kaiming, Zhang, Xiangyu, Ren, Shaoqing, Sun, Jian (2016). +\dQuote{Deep residual learning for image recognition.} +In \emph{Proceedings of the IEEE conference on computer vision and pattern recognition}, 770--778. Simonyan, Karen, Zisserman, Andrew (2014). \dQuote{Very deep convolutional networks for large-scale image recognition.} \emph{arXiv preprint arXiv:1409.1556}.} diff --git a/man/mlr_learners_torch.Rd b/man/mlr_learners_torch.Rd index ebdfe53d..3c5213a9 100644 --- a/man/mlr_learners_torch.Rd +++ b/man/mlr_learners_torch.Rd @@ -29,7 +29,7 @@ After loading a marshaled \code{LearnerTorch} into R again, you then need to cal into a useable state. } -\section{Early Stopping and Tuning}{ +\section{Early Stopping and Internal Tuning}{ In order to prevent overfitting, the \code{LearnerTorch} class allows to use early stopping via the \code{patience} and \code{min_delta} parameters, see the \code{Learner}'s parameters. 
diff --git a/man/mlr_learners_torch_image.Rd b/man/mlr_learners_torch_image.Rd index fce0bef7..0797ba25 100644 --- a/man/mlr_learners_torch_image.Rd +++ b/man/mlr_learners_torch_image.Rd @@ -64,7 +64,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. optimizer = NULL, loss = NULL, callbacks = list(), - packages = c("torchvision"), + packages = "torchvision", man, properties = NULL, predict_types = NULL From 30697dbc883e1425f10a12d4b1bf19b0cdf81d25 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Mon, 25 Nov 2024 16:00:55 +0100 Subject: [PATCH 31/46] feat(tab resnet): allow numeric values for multiplier param --- R/LearnerTorchTabResNet.R | 8 ++++---- man/mlr_learners.tab_resnet.Rd | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/LearnerTorchTabResNet.R b/R/LearnerTorchTabResNet.R index 4896944f..232ace4a 100644 --- a/R/LearnerTorchTabResNet.R +++ b/R/LearnerTorchTabResNet.R @@ -18,7 +18,7 @@ #' The input and output dimension of a block. #' * `d_hidden` :: `integer(1)`\cr #' The latent dimension of a block. -#' * `d_hidden_multiplier` :: `integer(1)`\cr +#' * `d_hidden_multiplier` :: `numeric(1)`\cr #' Alternative way to specify the latent dimension as `d_block * d_hidden_multiplier`. #' * `dropout1` :: `numeric(1)`\cr #' First dropout ratio. 
@@ -85,7 +85,7 @@ PipeOpTorchTabResNetBlock = R6Class("PipeOpTorchTabResNetBlock", initialize = function(id = "nn_tab_resnet", param_vals = list()) { param_set = ps( d_hidden = p_int(1, default = NULL, tags = "train", special_vals = list(NULL)), - d_hidden_multiplier = p_int(1, default = NULL, tags = "train", special_vals = list(NULL)), + d_hidden_multiplier = p_dbl(0, default = NULL, tags = "train", special_vals = list(NULL)), dropout1 = p_dbl(0, 1, tags = c("train", "required")), dropout2 = p_dbl(0, 1, tags = c("train", "required")) ) @@ -120,8 +120,8 @@ nn_tab_resnet_block = nn_module("nn_tab_resnet_block", ) { assert_int(d_block, lower = 1L) if (is.null(d_hidden)) { - assert_int(d_hidden_multiplier, lower = 1L) - d_hidden = d_block * d_hidden_multiplier + assert_numeric(d_hidden_multiplier, lower = 0) + d_hidden = as.integer(d_block * d_hidden_multiplier) } else { assert_int(d_hidden, lower = 1L) assert_true(is.null(d_hidden_multiplier)) diff --git a/man/mlr_learners.tab_resnet.Rd b/man/mlr_learners.tab_resnet.Rd index 1d8b9e8d..37886d77 100644 --- a/man/mlr_learners.tab_resnet.Rd +++ b/man/mlr_learners.tab_resnet.Rd @@ -40,7 +40,7 @@ The number of blocks. The input and output dimension of a block. \item \code{d_hidden} :: \code{integer(1)}\cr The latent dimension of a block. -\item \code{d_hidden_multiplier} :: \code{integer(1)}\cr +\item \code{d_hidden_multiplier} :: \code{numeric(1)}\cr Alternative way to specify the latent dimension as \code{d_block * d_hidden_multiplier}. \item \code{dropout1} :: \code{numeric(1)}\cr First dropout ratio. 
From 3ce80def9256955cf92c0eedde5f4023f04dde2b Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Tue, 26 Nov 2024 11:21:24 +0100 Subject: [PATCH 32/46] feat(mlp): add n_layers parameter (#307) --- NEWS.md | 1 + R/LearnerTorchMLP.R | 13 ++++++++++++- man/mlr_learners.mlp.Rd | 2 ++ tests/testthat/test_LearnerTorchMLP.R | 20 +++++++++++++++++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index ab97ff93..f45b16c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ * perf: Use a faster image loader * feat: Add parameter `num_interop_threads` to `LearnerTorch` * feat: Add adaptive average pooling +* feat: Added `n_layers` parameter to MLP # mlr3torch 0.1.2 diff --git a/R/LearnerTorchMLP.R b/R/LearnerTorchMLP.R index e1079293..4b2a955f 100644 --- a/R/LearnerTorchMLP.R +++ b/R/LearnerTorchMLP.R @@ -22,6 +22,8 @@ #' * `neurons` :: `integer()`\cr #' The number of neurons per hidden layer. By default there is no hidden layer. #' Setting this to `c(10, 20)` would have a the first hidden layer with 10 neurons and the second with 20. +#' * `n_layers` :: `integer()`\cr +#' The number of layers. This parameter must only be set when `neurons` has length 1. #' * `p` :: `numeric(1)`\cr #' The dropout probability. Is initialized to `0.5`. #' * `shape` :: `integer()` or `NULL`\cr @@ -48,6 +50,7 @@ LearnerTorchMLP = R6Class("LearnerTorchMLP", param_set = ps( neurons = p_uty(tags = c("train", "predict"), custom_check = check_neurons), p = p_dbl(lower = 0, upper = 1, tags = "train"), + n_layers = p_int(lower = 1L, tags = "train"), activation = p_uty(tags = c("required", "train"), custom_check = check_nn_module), activation_args = p_uty(tags = c("required", "train"), custom_check = check_activation_args), shape = p_uty(tags = "train", custom_check = check_shape) @@ -127,8 +130,16 @@ single_lazy_tensor = function(task) { } # shape is (NA, x) if preesnt -make_mlp = function(task, d_in, d_out, activation, neurons = integer(0), p, activation_args, ...) 
{ +make_mlp = function(task, d_in, d_out, activation, neurons = integer(0), p, activation_args, n_layers = NULL, ...) { # This way, dropout_args will have length 0 if p is `NULL` + + if (!is.null(n_layers)) { + if (length(neurons) != 1L) { + stopf("Can only supply `n_layers` when neurons has length 1.") + } + neurons = rep(neurons, n_layers) + } + dropout_args = list() dropout_args$p = p prev_dim = d_in diff --git a/man/mlr_learners.mlp.Rd b/man/mlr_learners.mlp.Rd index 6eb586aa..510d0e97 100644 --- a/man/mlr_learners.mlp.Rd +++ b/man/mlr_learners.mlp.Rd @@ -43,6 +43,8 @@ This is intialized to an empty list. \item \code{neurons} :: \code{integer()}\cr The number of neurons per hidden layer. By default there is no hidden layer. Setting this to \code{c(10, 20)} would have a the first hidden layer with 10 neurons and the second with 20. +\item \code{n_layers} :: \code{integer()}\cr +The number of layers. This parameter must only be set when \code{neurons} has length 1. \item \code{p} :: \code{numeric(1)}\cr The dropout probability. Is initialized to \code{0.5}. 
\item \code{shape} :: \code{integer()} or \code{NULL}\cr diff --git a/tests/testthat/test_LearnerTorchMLP.R b/tests/testthat/test_LearnerTorchMLP.R index 1869e554..85ed79ce 100644 --- a/tests/testthat/test_LearnerTorchMLP.R +++ b/tests/testthat/test_LearnerTorchMLP.R @@ -52,4 +52,22 @@ test_that("works for lazy tensor", { expect_class(pred, "Prediction") }) -# TODO: More tests +test_that("neurons and n_layers", { + l1 = lrn("classif.mlp", batch_size = 32, epochs = 0L) + l2 = l1$clone(deep = TRUE) + task = tsk("iris") + l1$param_set$set_values(neurons = c(10, 10)) + l2$param_set$set_values(neurons = 10, n_layers = 2) + l1$train(task) + l2$train(task) + expect_equal(l1$network$parameters[[1]]$shape, l2$network$parameters[[1]]$shape) + expect_equal(l1$network$parameters[[3]]$shape, l2$network$parameters[[3]]$shape) + expect_equal(l1$network$parameters[[5]]$shape, l2$network$parameters[[5]]$shape) + expect_equal(l1$network$parameters[[1]]$shape, c(10, 4)) + expect_equal(l1$network$parameters[[3]]$shape, c(10, 10)) + expect_equal(l1$network$parameters[[5]]$shape, c(3, 10)) + + l1$param_set$set_values(n_layers = 2) + expect_error(l1$train(task), "Can only supply") +}) + From e6d1c9ffe6bf941624e3410fb7f7874254965363 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Tue, 26 Nov 2024 11:21:33 +0100 Subject: [PATCH 33/46] fix leanification (#306) --- DESCRIPTION | 3 ++- R/LearnerTorchTabResNet.R | 2 +- R/LearnerTorchVision.R | 2 +- R/PipeOpModule.R | 2 +- R/PipeOpTorchAdaptiveAvgPool.R | 4 +-- R/PipeOpTorchAvgPool.R | 2 +- R/PipeOpTorchBatchNorm.R | 2 +- R/PipeOpTorchBlock.R | 2 +- R/PipeOpTorchCallbacks.R | 2 +- R/PipeOpTorchConv.R | 2 +- R/PipeOpTorchConvTranspose.R | 2 +- R/PipeOpTorchDropout.R | 2 +- R/PipeOpTorchHead.R | 2 +- R/PipeOpTorchIngress.R | 2 +- R/PipeOpTorchLayerNorm.R | 2 +- R/PipeOpTorchLinear.R | 2 +- R/PipeOpTorchLoss.R | 2 +- R/PipeOpTorchMaxPool.R | 2 +- R/PipeOpTorchMerge.R | 2 +- R/PipeOpTorchModel.R | 2 +- R/PipeOpTorchOptimizer.R | 2 +- 
R/PipeOpTorchReshape.R | 2 +- R/PipeOpTorchSoftmax.R | 2 +- R/TaskClassif_lazy_iris.R | 4 +-- R/TorchCallback.R | 2 +- R/aaa.R | 36 ++++++++++++++++++++++++++ R/zzz.R | 46 ---------------------------------- 27 files changed, 64 insertions(+), 73 deletions(-) create mode 100644 R/aaa.R diff --git a/DESCRIPTION b/DESCRIPTION index 0f553b69..7e892ee3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -80,7 +80,7 @@ Roxygen: list(markdown = TRUE, r6 = TRUE) RoxygenNote: 7.3.2 Collate: 'CallbackSet.R' - 'zzz.R' + 'aaa.R' 'TorchCallback.R' 'CallbackSetCheckpoint.R' 'CallbackSetEarlyStopping.R' @@ -143,3 +143,4 @@ Collate: 'preprocess.R' 'rd_info.R' 'with_torch_settings.R' + 'zzz.R' diff --git a/R/LearnerTorchTabResNet.R b/R/LearnerTorchTabResNet.R index 232ace4a..d6c56073 100644 --- a/R/LearnerTorchTabResNet.R +++ b/R/LearnerTorchTabResNet.R @@ -120,7 +120,7 @@ nn_tab_resnet_block = nn_module("nn_tab_resnet_block", ) { assert_int(d_block, lower = 1L) if (is.null(d_hidden)) { - assert_numeric(d_hidden_multiplier, lower = 0) + assert_numeric(d_hidden_multiplier, lower = 0, null.ok = TRUE) d_hidden = as.integer(d_block * d_hidden_multiplier) } else { assert_int(d_hidden, lower = 1L) diff --git a/R/LearnerTorchVision.R b/R/LearnerTorchVision.R index 382395c4..42f8d607 100644 --- a/R/LearnerTorchVision.R +++ b/R/LearnerTorchVision.R @@ -106,7 +106,7 @@ replace_head.VGG = function(network, d_out) { network } -#' @include zzz.R +#' @include aaa.R register_learner("classif.alexnet", function(loss = NULL, optimizer = NULL, callbacks = list()) { LearnerTorchVision$new("alexnet", torchvision::model_alexnet, "AlexNet", diff --git a/R/PipeOpModule.R b/R/PipeOpModule.R index f974dac5..790e217e 100644 --- a/R/PipeOpModule.R +++ b/R/PipeOpModule.R @@ -153,5 +153,5 @@ PipeOpModule = R6Class("PipeOpModule", ) ) -#' @include zzz.R +#' @include aaa.R register_po("module", PipeOpModule) diff --git a/R/PipeOpTorchAdaptiveAvgPool.R b/R/PipeOpTorchAdaptiveAvgPool.R index cb7c7652..f3bd5c73 100644 
--- a/R/PipeOpTorchAdaptiveAvgPool.R +++ b/R/PipeOpTorchAdaptiveAvgPool.R @@ -120,7 +120,7 @@ PipeOpTorchAdaptiveAvgPool3D = R6Class("PipeOpTorchAdaptiveAvgPool3D", inherit = ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_adaptive_avg_pool1d", PipeOpTorchAdaptiveAvgPool1D) register_po("nn_adaptive_avg_pool2d", PipeOpTorchAdaptiveAvgPool2D) -register_po("nn_adaptive_avg_pool3d", PipeOpTorchAdaptiveAvgPool3D) \ No newline at end of file +register_po("nn_adaptive_avg_pool3d", PipeOpTorchAdaptiveAvgPool3D) diff --git a/R/PipeOpTorchAvgPool.R b/R/PipeOpTorchAvgPool.R index 235206a9..04d9ddb9 100644 --- a/R/PipeOpTorchAvgPool.R +++ b/R/PipeOpTorchAvgPool.R @@ -145,7 +145,7 @@ PipeOpTorchAvgPool3D = R6Class("PipeOpTorchAvgPool3D", inherit = PipeOpTorchAvgP ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_avg_pool1d", PipeOpTorchAvgPool1D) register_po("nn_avg_pool2d", PipeOpTorchAvgPool2D) register_po("nn_avg_pool3d", PipeOpTorchAvgPool3D) diff --git a/R/PipeOpTorchBatchNorm.R b/R/PipeOpTorchBatchNorm.R index ac1e715d..c2c24044 100644 --- a/R/PipeOpTorchBatchNorm.R +++ b/R/PipeOpTorchBatchNorm.R @@ -129,7 +129,7 @@ PipeOpTorchBatchNorm3D = R6Class("PipeOpTorchBatchNorm3D", inherit = PipeOpTorch ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_batch_norm1d", PipeOpTorchBatchNorm1D) register_po("nn_batch_norm2d", PipeOpTorchBatchNorm2D) register_po("nn_batch_norm3d", PipeOpTorchBatchNorm3D) diff --git a/R/PipeOpTorchBlock.R b/R/PipeOpTorchBlock.R index f1c2782a..6ca7a48f 100644 --- a/R/PipeOpTorchBlock.R +++ b/R/PipeOpTorchBlock.R @@ -116,5 +116,5 @@ PipeOpTorchBlock = R6Class("PipeOpTorchBlock", ) -#' @include zzz.R +#' @include aaa.R register_po("nn_block", PipeOpTorchBlock, metainf = list(block = as_graph(po("nop")))) diff --git a/R/PipeOpTorchCallbacks.R b/R/PipeOpTorchCallbacks.R index a1a2e2da..ca862e96 100644 --- a/R/PipeOpTorchCallbacks.R +++ b/R/PipeOpTorchCallbacks.R @@ -90,5 +90,5 @@ PipeOpTorchCallbacks = 
R6Class("PipeOpTorchCallbacks", ) ) -#' @include zzz.R +#' @include aaa.R register_po("torch_callbacks", PipeOpTorchCallbacks) diff --git a/R/PipeOpTorchConv.R b/R/PipeOpTorchConv.R index 5e5bee69..37007b53 100644 --- a/R/PipeOpTorchConv.R +++ b/R/PipeOpTorchConv.R @@ -146,7 +146,7 @@ PipeOpTorchConv3D = R6Class("PipeOpTorchConv3D", inherit = PipeOpTorchConv, ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_conv1d", PipeOpTorchConv1D) register_po("nn_conv2d", PipeOpTorchConv2D) register_po("nn_conv3d", PipeOpTorchConv3D) diff --git a/R/PipeOpTorchConvTranspose.R b/R/PipeOpTorchConvTranspose.R index fff15b79..6550f79a 100644 --- a/R/PipeOpTorchConvTranspose.R +++ b/R/PipeOpTorchConvTranspose.R @@ -153,7 +153,7 @@ PipeOpTorchConvTranspose3D = R6Class("PipeOpTorchConvTranspose3D", inherit = Pip ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_conv_transpose1d", PipeOpTorchConvTranspose1D) register_po("nn_conv_transpose2d", PipeOpTorchConvTranspose2D) register_po("nn_conv_transpose3d", PipeOpTorchConvTranspose3D) diff --git a/R/PipeOpTorchDropout.R b/R/PipeOpTorchDropout.R index be0c7389..77354b62 100644 --- a/R/PipeOpTorchDropout.R +++ b/R/PipeOpTorchDropout.R @@ -37,5 +37,5 @@ PipeOpTorchDropout = R6Class("PipeOpTorchDropout", ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_dropout", PipeOpTorchDropout) diff --git a/R/PipeOpTorchHead.R b/R/PipeOpTorchHead.R index 939ebf9a..c6af8ad8 100644 --- a/R/PipeOpTorchHead.R +++ b/R/PipeOpTorchHead.R @@ -51,5 +51,5 @@ PipeOpTorchHead = R6Class("PipeOpTorchHead", ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_head", PipeOpTorchHead) diff --git a/R/PipeOpTorchIngress.R b/R/PipeOpTorchIngress.R index 26ae455c..10ba88d1 100644 --- a/R/PipeOpTorchIngress.R +++ b/R/PipeOpTorchIngress.R @@ -213,7 +213,7 @@ PipeOpTorchIngressNumeric = R6Class("PipeOpTorchIngressNumeric", ) ) -#' @include zzz.R +#' @include aaa.R register_po("torch_ingress_num", PipeOpTorchIngressNumeric) #' @title Torch Entry 
Point for Categorical Features diff --git a/R/PipeOpTorchLayerNorm.R b/R/PipeOpTorchLayerNorm.R index 0128861e..78a74808 100644 --- a/R/PipeOpTorchLayerNorm.R +++ b/R/PipeOpTorchLayerNorm.R @@ -43,5 +43,5 @@ PipeOpTorchLayerNorm = R6Class("PipeOpTorchLayerNorm", ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_layer_norm", PipeOpTorchLayerNorm) diff --git a/R/PipeOpTorchLinear.R b/R/PipeOpTorchLinear.R index 901bbc76..1fed04d0 100644 --- a/R/PipeOpTorchLinear.R +++ b/R/PipeOpTorchLinear.R @@ -45,5 +45,5 @@ PipeOpTorchLinear = R6Class("PipeOpTorchLinear", ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_linear", PipeOpTorchLinear) diff --git a/R/PipeOpTorchLoss.R b/R/PipeOpTorchLoss.R index 603701cb..8639e308 100644 --- a/R/PipeOpTorchLoss.R +++ b/R/PipeOpTorchLoss.R @@ -66,5 +66,5 @@ PipeOpTorchLoss = R6Class("PipeOpTorchLoss", # We set an arbitrary loss, so Dict -> DT conversion works -#' @include zzz.R TorchLoss.R +#' @include aaa.R TorchLoss.R register_po("torch_loss", PipeOpTorchLoss, metainf = list(loss = t_loss("mse"))) diff --git a/R/PipeOpTorchMaxPool.R b/R/PipeOpTorchMaxPool.R index 10552b7e..6545ee26 100644 --- a/R/PipeOpTorchMaxPool.R +++ b/R/PipeOpTorchMaxPool.R @@ -148,7 +148,7 @@ PipeOpTorchMaxPool3D = R6Class("PipeOpTorchMaxPool3D", inherit = PipeOpTorchMaxP ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_max_pool1d", PipeOpTorchMaxPool1D) register_po("nn_max_pool2d", PipeOpTorchMaxPool2D) register_po("nn_max_pool3d", PipeOpTorchMaxPool3D) diff --git a/R/PipeOpTorchMerge.R b/R/PipeOpTorchMerge.R index b97d9e7b..29c4ce0b 100644 --- a/R/PipeOpTorchMerge.R +++ b/R/PipeOpTorchMerge.R @@ -255,7 +255,7 @@ nn_merge_cat = nn_module( -#' @include zzz.R +#' @include aaa.R register_po("nn_merge_sum", PipeOpTorchMergeSum) register_po("nn_merge_prod", PipeOpTorchMergeProd) register_po("nn_merge_cat", PipeOpTorchMergeCat) diff --git a/R/PipeOpTorchModel.R b/R/PipeOpTorchModel.R index 80672f4c..8b6d558c 100644 --- a/R/PipeOpTorchModel.R 
+++ b/R/PipeOpTorchModel.R @@ -187,6 +187,6 @@ PipeOpTorchModelRegr = R6Class("PipeOpTorchModelRegr", ) ) -#' @include zzz.R +#' @include aaa.R register_po("torch_model_regr", PipeOpTorchModelRegr) register_po("torch_model_classif", PipeOpTorchModelClassif) diff --git a/R/PipeOpTorchOptimizer.R b/R/PipeOpTorchOptimizer.R index 1010f778..f65c70da 100644 --- a/R/PipeOpTorchOptimizer.R +++ b/R/PipeOpTorchOptimizer.R @@ -64,5 +64,5 @@ PipeOpTorchOptimizer = R6Class("PipeOpTorchOptimizer", ) ) -#' @include zzz.R +#' @include aaa.R register_po("torch_optimizer", PipeOpTorchOptimizer) diff --git a/R/PipeOpTorchReshape.R b/R/PipeOpTorchReshape.R index b914731d..a0cbd7b8 100644 --- a/R/PipeOpTorchReshape.R +++ b/R/PipeOpTorchReshape.R @@ -252,7 +252,7 @@ nn_unsqueeze = nn_module( } ) -#' @include zzz.R +#' @include aaa.R register_po("nn_reshape", PipeOpTorchReshape) register_po("nn_unsqueeze", PipeOpTorchUnsqueeze) register_po("nn_squeeze", PipeOpTorchSqueeze) diff --git a/R/PipeOpTorchSoftmax.R b/R/PipeOpTorchSoftmax.R index bba69ec0..e5eb19b3 100644 --- a/R/PipeOpTorchSoftmax.R +++ b/R/PipeOpTorchSoftmax.R @@ -34,5 +34,5 @@ PipeOpTorchSoftmax = R6::R6Class("PipeOpTorchSoftmax", ) ) -#' @include zzz.R +#' @include aaa.R register_po("nn_softmax", PipeOpTorchSoftmax) diff --git a/R/TaskClassif_lazy_iris.R b/R/TaskClassif_lazy_iris.R index 0c3f3fac..5e14176a 100644 --- a/R/TaskClassif_lazy_iris.R +++ b/R/TaskClassif_lazy_iris.R @@ -3,7 +3,7 @@ #' @name mlr_tasks_lazy_iris #' #' @format [R6::R6Class] inheriting from [mlr3::TaskClassif]. -#' @include zzz.R +#' @include aaa.R #' #' @description #' A classification task for the popular [datasets::iris] data set. 
@@ -44,5 +44,5 @@ load_task_lazy_iris = function(id = "lazy_iris") { task } -#' @include zzz.R +#' @include aaa.R mlr3torch_tasks[["lazy_iris"]] = load_task_lazy_iris diff --git a/R/TorchCallback.R b/R/TorchCallback.R index a6780b64..ce1cf1b8 100644 --- a/R/TorchCallback.R +++ b/R/TorchCallback.R @@ -245,7 +245,7 @@ TorchCallback = R6Class("TorchCallback", #' #' @export #' @return [`TorchCallback`] -#' @include zzz.R CallbackSet.R +#' @include aaa.R CallbackSet.R #' @family Callback #' @examplesIf torch::torch_is_installed() #' custom_tcb = torch_callback("custom", diff --git a/R/aaa.R b/R/aaa.R new file mode 100644 index 00000000..99a47777 --- /dev/null +++ b/R/aaa.R @@ -0,0 +1,36 @@ +register_po = function(name, constructor, metainf = NULL) { + if (name %in% names(mlr3torch_pipeops)) stopf("pipeop %s registered twice", name) + mlr3torch_pipeops[[name]] = list(constructor = constructor, metainf = substitute(metainf)) +} + +register_learner = function(.name, .constructor, ...) { + assert_multi_class(.constructor, c("function", "R6ClassGenerator")) + if (is.function(.constructor)) { + mlr3torch_learners[[.name]] = list(fn = .constructor, prototype_args = list(...)) + return(NULL) + } + task_type = if (startsWith(.name, "classif")) "classif" else "regr" + # What I am doing here: + # The problem is that we want to set the task_type when creating the learner from the dictionary + # The initial idea was to add functions function(...) LearnerClass$new(..., task_type = "") + # This did not work because mlr3misc does not work with ... arguments (... 
arguments are not + # passed further to the initialize method) + # For this reason, we need this hacky solution here, might change in the future in mlr3misc + fn = crate(function() { + invoke(.constructor$new, task_type = task_type, .args = as.list(match.call()[-1])) + }, .constructor, task_type, .parent = topenv()) + fmls = formals(.constructor$public_methods$initialize) + fmls$task_type = NULL + formals(fn) = fmls + if (.name %in% names(mlr3torch_learners)) stopf("learner %s registered twice", .name) + mlr3torch_learners[[.name]] = list(fn = fn, prototype_args = list(...)) +} + +register_task = function(name, constructor) { + if (name %in% names(mlr3torch_tasks)) stopf("task %s registered twice", name) + mlr3torch_tasks[[name]] = constructor +} + +mlr3torch_pipeops = new.env() +mlr3torch_learners = new.env() +mlr3torch_tasks = new.env() diff --git a/R/zzz.R b/R/zzz.R index dfa8ad6b..5a89a43f 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -23,10 +23,6 @@ utils::globalVariables(c("self", "private", "super", "..")) if (FALSE) knitr::knit2pandoc if (FALSE) withr::with_seed -mlr3torch_pipeops = new.env() -mlr3torch_learners = new.env() -mlr3torch_tasks = new.env() -mlr3torch_resamplings = new.env() mlr3torch_task_generators = new.env() mlr3torch_pipeop_tags = c("torch", "activation") mlr3torch_feature_types = c(lt = "lazy_tensor") @@ -34,44 +30,6 @@ mlr3torch_feature_types = c(lt = "lazy_tensor") # silence static checker withr::with_seed -register_po = function(name, constructor, metainf = NULL) { - if (name %in% names(mlr3torch_pipeops)) stopf("pipeop %s registered twice", name) - mlr3torch_pipeops[[name]] = list(constructor = constructor, metainf = substitute(metainf)) -} - -register_resampling = function(name, constructor) { - if (name %in% names(mlr3torch_resamplings)) stopf("resampling %s registered twice", name) - mlr3torch_resamplings[[name]] = constructor -} - -register_learner = function(.name, .constructor, ...) 
{ - assert_multi_class(.constructor, c("function", "R6ClassGenerator")) - if (is.function(.constructor)) { - mlr3torch_learners[[.name]] = list(fn = .constructor, prototype_args = list(...)) - return(NULL) - } - task_type = if (startsWith(.name, "classif")) "classif" else "regr" - # What I am doing here: - # The problem is that we wan't to set the task_type when creating the learner from the dictionary - # The initial idea was to add functions function(...) LearnerClass$new(..., task_type = "") - # This did not work because mlr3misc does not work with ... arguments (... arguments are not - # passed further to the initialize method) - # For this reason, we need this hacky solution here, might change in the future in mlr3misc - fn = crate(function() { - invoke(.constructor$new, task_type = task_type, .args = as.list(match.call()[-1])) - }, .constructor, task_type, .parent = topenv()) - fmls = formals(.constructor$public_methods$initialize) - fmls$task_type = NULL - formals(fn) = fmls - if (.name %in% names(mlr3torch_learners)) stopf("learner %s registered twice", .name) - mlr3torch_learners[[.name]] = list(fn = fn, prototype_args = list(...)) -} - -register_task = function(name, constructor) { - if (name %in% names(mlr3torch_tasks)) stopf("task %s registered twice", name) - mlr3torch_tasks[[name]] = constructor -} - register_mlr3 = function() { mlr_learners = utils::getFromNamespace("mlr_learners", ns = "mlr3") iwalk(as.list(mlr3torch_learners), function(x, nm) mlr_learners$add(nm, x$fn, .prototype_args = x$prototype_args)) @@ -79,9 +37,6 @@ register_mlr3 = function() { mlr_tasks = mlr3::mlr_tasks iwalk(as.list(mlr3torch_tasks), function(task, nm) mlr_tasks$add(nm, task)) # nolint - mlr_resamplings = mlr3::mlr_resamplings - iwalk(as.list(mlr3torch_resamplings), function(resampling, nm) mlr_resamplings$add(nm, resampling)) - mlr_reflections = utils::getFromNamespace("mlr_reflections", ns = "mlr3") # nolint iwalk(as.list(mlr3torch_feature_types), function(ft, nm) 
mlr_reflections$task_feature_types[[nm]] = ft) # nolint @@ -135,7 +90,6 @@ register_mlr3pipelines = function() { .onUnload = function(libPaths) { # nolint walk(names(mlr3torch_learners), function(nm) mlr_learners$remove(nm)) - walk(names(mlr3torch_resamplings), function(nm) mlr_resamplings$remove(nm)) walk(names(mlr3torch_tasks), function(nm) mlr_tasks$remove(nm)) walk(names(mlr3torch_pipeops), function(nm) mlr_pipeops$remove(nm)) mlr_reflections$pipeops$valid_tags = setdiff(mlr_reflections$pipeops$valid_tags, mlr3torch_pipeop_tags) From d6bbca25adefa7cabf1dd19668466e6e01612dbe Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 29 Nov 2024 00:29:32 +0100 Subject: [PATCH 34/46] resolved merge conflict' --- .../image_loaders/benchmark_image_loaders.R | 63 +++++++++++++++++++ man/mlr_learners_torch_image.Rd | 2 +- tests/temp-TaskClassif_melanoma.R | 41 ++++++++++++ 3 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 benchmarks/image_loaders/benchmark_image_loaders.R create mode 100644 tests/temp-TaskClassif_melanoma.R diff --git a/benchmarks/image_loaders/benchmark_image_loaders.R b/benchmarks/image_loaders/benchmark_image_loaders.R new file mode 100644 index 00000000..cf56a33a --- /dev/null +++ b/benchmarks/image_loaders/benchmark_image_loaders.R @@ -0,0 +1,63 @@ +library(torch) +library(torchvision) +library(mlr3torch) +library(here) + +library(data.table) +setDTthreads(threads = 1) + +training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) + +# hard-coded cache directory that I use locally +cache_dir = here("cache") + +ds_base_loader = torch::dataset( + initialize = function(n_images) { + self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))[1:n_images, ] + self$.path = file.path(here(cache_dir), "train") + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + x = 
torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } +) + +ds_magick_loader = torch::dataset( + initialize = function(n_images) { + self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"))[1:n_images, ] + self$.path = file.path(here(cache_dir), "train") + }, + .getitem = function(idx) { + force(idx) + + image_name = self$.metadata[idx, ]$image_name + + x = magick::image_read(file.path(self$.path, paste0(image_name, ".jpg"))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x, image_name = image_name)) + }, + .length = function() { + nrow(self$.metadata) + } +) + +n_images = 10 + +ds_base = ds_base_loader(n_images) +ds_magick = ds_magick_loader(n_images) + +bmr = bench::mark( + for (i in 1:n_images) ds_base$.getitem(i), + for (i in 1:n_images) ds_magick$.getitem(i), + memory = FALSE +) + +print(bmr) \ No newline at end of file diff --git a/man/mlr_learners_torch_image.Rd b/man/mlr_learners_torch_image.Rd index 0797ba25..fce0bef7 100644 --- a/man/mlr_learners_torch_image.Rd +++ b/man/mlr_learners_torch_image.Rd @@ -64,7 +64,7 @@ Creates a new instance of this \link[R6:R6Class]{R6} class. 
optimizer = NULL, loss = NULL, callbacks = list(), - packages = "torchvision", + packages = c("torchvision"), man, properties = NULL, predict_types = NULL diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R new file mode 100644 index 00000000..3f1803eb --- /dev/null +++ b/tests/temp-TaskClassif_melanoma.R @@ -0,0 +1,41 @@ +library(torch) +library(torchvision) +library(mlr3torch) +library(here) + +library(data.table) +library(tidytable) + +# TODO: figure out whether we want the v2 file +# I think no, since I don't really see a "use" for the lesion ID +training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) +# training_metadata_v2 = fread(here::here("cache", "ISIC_2020_Training_GroundTruth_v2.csv")) + +cache_dir = here("cache") +# construct a torch dataset +ds = torch::dataset( + initialize = function() { + self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv")) + self$.path = file.path(here(cache_dir), "train") + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } +) + +melanoma_ds = ds() + +dd = as_data_descriptor(melanoma_ds, list(x = NULL)) +lt = lazy_tensor(dd) +dt_train = cbind(training_metadata, data.table(x = lt)) +# as_task_regr(dt_train, target = "corr", id = "guess_the_correlation") + +training_duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) \ No newline at end of file From 0c259caf7542f0e57f230fcd20e0ad462f93a7f7 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 29 Nov 2024 11:45:51 +0100 Subject: [PATCH 35/46] cleanup --- R/TaskClassif_melanoma.R | 23 +++--- attic/01-download_melanoma.R | 30 -------- attic/02-resize_melanoma.py | 29 -------- attic/03-process_melanoma.R | 84 ---------------------- attic/find_extensions.py | 78 
-------------------- attic/requirements-resize_melanoma.txt | 4 -- attic/resize_melanoma copy.py | 26 ------- attic/resize_melanoma.R | 29 -------- attic/temp-test_melanoma.R | 31 -------- data-raw/melanoma.R | 28 ++++---- tests/temp-TaskClassif_melanoma.R | 41 ----------- tests/testthat/test_TaskClassif_melanoma.R | 1 + 12 files changed, 23 insertions(+), 381 deletions(-) delete mode 100644 attic/01-download_melanoma.R delete mode 100644 attic/02-resize_melanoma.py delete mode 100644 attic/03-process_melanoma.R delete mode 100644 attic/find_extensions.py delete mode 100644 attic/requirements-resize_melanoma.txt delete mode 100644 attic/resize_melanoma copy.py delete mode 100644 attic/resize_melanoma.R delete mode 100644 attic/temp-test_melanoma.R delete mode 100644 tests/temp-TaskClassif_melanoma.R diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 17ad9dec..9b4700f9 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -20,7 +20,6 @@ #' #' @references #' `r format_bib("melanoma2021")` -#' @examplesIf torch::torch_is_installed() #' task = tsk("melanoma") #' task NULL @@ -28,26 +27,31 @@ NULL # @param path (`character(1)`)\cr # The cache_dir/datasets/melanoma folder constructor_melanoma = function(path) { + require_namespaces("curl") + base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" compressed_tarball_path = file.path(path, compressed_tarball_file_name) curl::curl_download(paste0(base_url, compressed_tarball_file_name), compressed_tarball_path) utils::untar(compressed_tarball_path, exdir = path) - file.remove(compressed_tarball_path) + on.exit({file.remove(compressed_tarball_path)}, add = TRUE) training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" - training_metadata = data.table::fread(here::here(path, training_metadata_file_name)) + training_metadata = data.table::fread(file.path(path, 
training_metadata_file_name)) test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" - test_metadata = data.table::fread(here::here(path, test_metadata_file_name)) + test_metadata = data.table::fread(file.path(path, test_metadata_file_name)) training_metadata = training_metadata[, split := "train"] test_metadata = setnames(test_metadata, old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] - metadata = rbind(training_metadata, test_metadata, fill = TRUE) + metadata = rbind(training_metadata, test_metadata) + metadata[, image_name := NULL] + metadata[, target := NULL] + metadata = setnames(metadata, old = "benign_malignant", new = "outcome") melanoma_ds_generator = torch::dataset( initialize = function() { @@ -79,13 +83,6 @@ load_task_melanoma = function(id = "melanoma") { cached_constructor = function(backend) { data = cached(constructor_melanoma, "datasets", "melanoma")$data - # remove irrelevant cols: image_name, target - print(names(data)) - # if ("image_name" %in% names(data)) data[, image_name := NULL] - data[, image_name := NULL] - data[, target := NULL] - - # change the encodings of variables: diagnosis, benign_malignant data[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] char_features = c("sex", "anatom_site_general_challenge") @@ -112,7 +109,7 @@ load_task_melanoma = function(id = "melanoma") { backend = backend, id = "melanoma", target = "benign_malignant", - label = "Melanoma classification" + label = "Melanoma Classification" ) task$set_col_roles("patient_id", "group") diff --git a/attic/01-download_melanoma.R b/attic/01-download_melanoma.R deleted file mode 100644 index 80d23ea3..00000000 --- a/attic/01-download_melanoma.R +++ /dev/null @@ -1,30 +0,0 @@ -library(here) - -training_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_JPEG.zip" -training_metadata_url = 
"https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth.csv" -training_metadata_v2_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_GroundTruth_v2.csv" -training_duplicate_image_list_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Training_Duplicates.csv" - -test_jpeg_images_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_JPEG.zip" -test_metadata_url = "https://isic-challenge-data.s3.amazonaws.com/2020/ISIC_2020_Test_Metadata.csv" - - -urls = c( - training_jpeg_images_url, - training_metadata_url, training_metadata_v2_url, training_duplicate_image_list_url, - test_jpeg_images_url, - test_metadata_url -) - -cache_dir = here("cache") -download_melanoma_file = function(url) { - op = options(timeout = 36000) - on.exit(options(op)) - - download.file(url, here(cache_dir, basename(url))) -} - -mlr3misc::walk(urls, download_melanoma_file) - -unzip(here(cache_dir, basename(training_jpeg_images_url))) -unzip(here(cache_dir, basename(test_jpeg_images_url))) diff --git a/attic/02-resize_melanoma.py b/attic/02-resize_melanoma.py deleted file mode 100644 index 5c54c548..00000000 --- a/attic/02-resize_melanoma.py +++ /dev/null @@ -1,29 +0,0 @@ -import torch -import os -from tqdm import tqdm -import torchvision - -PATH_TO_MLR3TORCH = "." 
-cache_dir = "cache" - -path_to_melanoma_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train") -path_to_melanoma_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "ISIC_2020_Test_Input") - -path_to_output_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "hf_dataset", "train") -path_to_output_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "hf_dataset", "ISIC_2020_Test_Input") - -os.makedirs(path_to_output_train) -os.makedirs(path_to_output_test) - -tx = torchvision.transforms.Resize((128, 128)) - -for f in tqdm(os.listdir(path_to_melanoma_train)): - img = torchvision.io.read_image(os.path.join(path_to_melanoma_train, f)) - small_img = tx(img.float() / 255) - torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) - -for f in tqdm(os.listdir(path_to_melanoma_test)): - if f.endswith(".jpg"): - img = torchvision.io.read_image(os.path.join(path_to_melanoma_test, f)) - small_img = tx(img.float() / 255) - torchvision.utils.save_image(small_img, os.path.join(path_to_output_test, f)) diff --git a/attic/03-process_melanoma.R b/attic/03-process_melanoma.R deleted file mode 100644 index a4327d22..00000000 --- a/attic/03-process_melanoma.R +++ /dev/null @@ -1,84 +0,0 @@ -library(data.table) -library(tidytable) -library(purrr) - -library(here) - -library(fs) - -# this script changes the data into the format expected by Hugging Face -# It expects that you have downloaded and extracted the original data by running the download_melanoma.R script -# and that you have already resized it with PyTorch - -cache_dir = here("cache") - -duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) - -metadata_file_paths = c( - here(cache_dir, "ISIC_2020_Training_GroundTruth.csv"), - here(cache_dir, "ISIC_2020_Training_GroundTruth_v2.csv"), - here(cache_dir, "ISIC_2020_Test_Metadata.csv") -) -metadata_dt_list = map(metadata_file_paths, fread) -metadata_dt_list[[3]] = rename(metadata_dt_list[[3]], image_name = image) - -# deduplicate the metadata 
-dedup = function(metadata_dt, duplicate_file_names) { - metadata_dt[!(image_name %in% duplicate_file_names), ] -} - -training_metadata = dedup(metadata_dt_list[[1]], duplicates$image_name_2) -training_metadata_v2 = dedup(metadata_dt_list[[2]], duplicates$image_name_2) -test_metadata = metadata_dt_list[[3]] - -hf_dataset_dir = here(cache_dir, "hf_dataset") -hf_train_dir = here(hf_dataset_dir, "train") -hf_test_dir = here(hf_dataset_dir, "ISIC_2020_Test_Input") - -train_dirnames_for_each_img = paste0("train", (training_metadata_v2[, .I] %% 4) + 1) -test_dirnames_for_each_img = paste0("ISIC_2020_Test_Input", (test_metadata[, .I] %% 2) + 1) - -# add a column that Hugging Face wants -add_hf_file_name_col = function(metadata_dt, image_relative_dirnames) { - metadata_dt[, file_name := paste0(file.path(image_relative_dirnames, metadata_dt$image_name), ".jpg")] -} - -# image_relative_paths = c("train", "train", "ISIC_2020_Test_Input") - -add_hf_file_name_col(training_metadata, train_dirnames_for_each_img) -add_hf_file_name_col(training_metadata_v2, train_dirnames_for_each_img) -add_hf_file_name_col(metadata_dt_list[[3]], test_dirnames_for_each_img) - -# delete the duplicated images -list.files(hf_train_dir) |> length() -file.remove(here(hf_train_dir, paste0(duplicates$image_name_2, ".jpg"))) -list.files(hf_train_dir) |> length() - -old_names = function(metadata_dt, dir) { - paste0(file.path(dir, metadata_dt$image_name), ".jpg") -} - -create_if_necessary = function(dirname) { - if (!dir.exists(dirname)) { - dir.create(dirname) - } -} - -walk(here(hf_dataset_dir, unique(train_dirnames_for_each_img)), create_if_necessary) -walk(here(hf_dataset_dir, unique(test_dirnames_for_each_img)), create_if_necessary) - -# file_move(old_names(training_metadata), here(hf_dataset_dir, train_dirnames_for_each_img, paste0(training_metadata$image_name, ".jpg"))) -file_move(old_names(training_metadata_v2, hf_train_dir), here(hf_dataset_dir, train_dirnames_for_each_img, 
paste0(training_metadata_v2$image_name, ".jpg"))) -file_move(old_names(test_metadata, hf_test_dir), here(hf_dataset_dir, test_dirnames_for_each_img, paste0(test_metadata$image_name, ".jpg"))) - -test_metadata = rename(test_metadata, image = image_name) - -fwrite(training_metadata, here(hf_dataset_dir, "ISIC_2020_Training_GroundTruth.csv")) -fwrite(training_metadata_v2, here(hf_dataset_dir, "ISIC_2020_Training_GroundTruth_v2.csv")) -fwrite(test_metadata, here(hf_dataset_dir, "ISIC_2020_Test_Metadata.csv")) - -# test1 = list.files(here(hf_dataset_dir, "ISIC_2020_Test_Input1")) -# test2 = list.files(here(hf_dataset_dir, "ISIC_2020_Test_Input2")) -# setdiff(test1, test2) - -# test_metadata |> filter(image_name == "ISIC_9999302") |> pull(file_name) diff --git a/attic/find_extensions.py b/attic/find_extensions.py deleted file mode 100644 index 76e75d05..00000000 --- a/attic/find_extensions.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 -import os -from collections import defaultdict -from pathlib import Path -import argparse - - -def main(): - parser = argparse.ArgumentParser(description='Find all file extensions in a directory') - parser.add_argument('path', nargs='?', default='.', - help='Directory path to scan (default: current directory)') - parser.add_argument('-a', '--all', action='store_true', - help='Include hidden files and directories') - parser.add_argument('-c', '--count', action='store_true', - help='Show count of files per extension') - parser.add_argument('-s', '--sort-count', action='store_true', - help='Sort by count instead of alphabetically') - - args = parser.parse_args() - - # Validate directory - if not os.path.isdir(args.path): - print(f"Error: '{args.path}' is not a valid directory") - return 1 - - # Initialize counter - extensions = defaultdict(int) - - # Walk through directory - for root, dirs, files in os.walk(args.path): - # Skip hidden directories unless -a flag is used - if not args.all: - dirs[:] = [d for d in dirs if not 
d.startswith('.')] - files = [f for f in files if not f.startswith('.')] - - for file in files: - ext = Path(file).suffix.lower() - if ext: - extensions[ext[1:]] += 1 # Remove the leading dot - else: - extensions['(no extension)'] += 1 - - # No files found - if not extensions: - print("No files found in the specified directory.") - return 0 - - # Prepare for display - if args.sort_count: - # Sort by count (descending) and then by extension name - items = sorted(extensions.items(), key=lambda x: (-x[1], x[0])) - else: - # Sort alphabetically by extension - items = sorted(extensions.items()) - - # Display results - print(f"\nExtensions found in: {os.path.abspath(args.path)}") - print("-" * 40) - - if args.count: - # Show with counts - max_ext_len = max(len(ext) for ext in extensions.keys()) - for ext, count in items: - print(f"{ext:<{max_ext_len}} : {count:>5} files") - else: - # Show just extensions - for ext, _ in items: - print(ext) - - # Print summary - total_files = sum(extensions.values()) - total_extensions = len(extensions) - print("-" * 40) - print(f"Total: {total_files} files, {total_extensions} unique extensions") - - -if __name__ == "__main__": - exit(main()) diff --git a/attic/requirements-resize_melanoma.txt b/attic/requirements-resize_melanoma.txt deleted file mode 100644 index 41f33a3f..00000000 --- a/attic/requirements-resize_melanoma.txt +++ /dev/null @@ -1,4 +0,0 @@ -altgraph @ file:///AppleInternal/Library/BuildRoots/226e9c8c-edb1-11ee-8f17-a65dcee5a99e/Library/Caches/com.apple.xbs/Sources/python3/altgraph-0.17.2-py2.py3-none-any.whl -torch==2.5.0 -torchvision==0.20.0 -tqdm==4.66.1 \ No newline at end of file diff --git a/attic/resize_melanoma copy.py b/attic/resize_melanoma copy.py deleted file mode 100644 index 133243f2..00000000 --- a/attic/resize_melanoma copy.py +++ /dev/null @@ -1,26 +0,0 @@ -import torch -import os -from tqdm import tqdm -import torchvision - -PATH_TO_MLR3TORCH = "." 
-cache_dir = "cache" - -path_to_melanoma_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train") -path_to_melanoma_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "ISIC_2020_Test_Input") - -path_to_output_train = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "train_small") -path_to_output_test = os.path.join(PATH_TO_MLR3TORCH, cache_dir, "ISIC_2020_Test_Input_small") - -tx = torchvision.transforms.Resize((128, 128)) - -for f in tqdm(os.listdir(path_to_melanoma_train)): - img = torchvision.io.read_image(os.path.join(path_to_melanoma_train, f)) - small_img = tx(img.float() / 255) - torchvision.utils.save_image(small_img, os.path.join(path_to_output_train, f)) - -for f in tqdm(os.listdir(path_to_melanoma_test)): - if f.endswith(".jpg"): - img = torchvision.io.read_image(os.path.join(path_to_melanoma_test, f)) - small_img = tx(img.float() / 255) - torchvision.utils.save_image(small_img, os.path.join(path_to_output_test, f)) diff --git a/attic/resize_melanoma.R b/attic/resize_melanoma.R deleted file mode 100644 index 3aa7ed7d..00000000 --- a/attic/resize_melanoma.R +++ /dev/null @@ -1,29 +0,0 @@ -library(torch) -library(torchvision) - -library(purrr) - -library(here) - -# change to wherever your files live -cache_dir = here("cache") - -path_to_melanoma_train = here(cache_dir, "train") -path_to_melanoma_test = here(cache_dir, "ISIC_2020_Test_Input") -path_to_output_train = here(cache_dir, "train_small") -path_to_output_test = here(cache_dir, "ISIC_2020_Test_Input_small") - -resize_to_dims = c(128, 128) - -resize_and_write = function(image_file_name, path_to_input_train, path_to_output_dir, dims) { - image = base_loader(file.path(path_to_input_train, image_file_name)) - small_image = torchvision::transform_resize(transform_to_tensor(image), dims) - - output_file_name = file.path(path_to_output_dir, basename(image_file_name)) - print(output_file_name) - - torch::torch_save(small_image, path_to_output_dir) -} - -walk(.x = list.files(path_to_melanoma_train), .f = 
resize_and_write(path_to_melanoma_train, path_to_output_train, resize_to_dims), .progress = TRUE) -walk(.x = list.files(path_to_melanoma_test), .f = resize_and_write(path_to_melanoma_test, path_to_output_test), .progress = TRUE) diff --git a/attic/temp-test_melanoma.R b/attic/temp-test_melanoma.R deleted file mode 100644 index 8c22d084..00000000 --- a/attic/temp-test_melanoma.R +++ /dev/null @@ -1,31 +0,0 @@ -library(data.table) -library(here) -library(tidytable) - -withr::local_options(mlr3torch.cache = TRUE) - -load_col_info("melanoma") - -task = tsk("melanoma") -# this makes the test faster -task$row_roles$use = 1:10 -expect_equal(task$id, "melanoma") -expect_equal(task$label, "Melanoma classification") -expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) -expect_equal(task$target_names, "benign_malignant") -expect_equal(task$man, "mlr3torch::mlr_tasks_melanoma") -expect_equal(task$properties, c("twoclass", "groups")) - -x = materialize(task$data(task$row_ids[1:2], cols = "image")[[1L]], rbind = TRUE) -expect_equal(x$shape, c(2, 3, 128, 128)) -expect_equal(x$dtype, torch_float32()) - -training_metadata = fread(here::here("cache", "hf_dataset", "train", "ISIC_2020_Training_GroundTruth_v2.csv")) -training_metadata_extrasmall = training_metadata |> - filter(file_name %in% list.files(here("cache", "hf_dataset", "train"), pattern = ".jpg$", recursive = TRUE)) -fwrite(training_metadata_extrasmall, here("cache", "hf_dataset", "train", "ISIC_2020_Training_GroundTruth_v2.csv")) - -test_metadata = fread(here::here("cache", "hf_dataset", "ISIC_2020_Test_Input", "ISIC_2020_Test_Metadata.csv")) -test_metadata_extrasmall = test_metadata |> - filter(file_name %in% list.files(here("cache", "hf_dataset", "ISIC_2020_Test_Input"), pattern = ".jpg$", recursive = TRUE)) -fwrite(test_metadata_extrasmall, here("cache", "hf_dataset", "ISIC_2020_Test_Input", "ISIC_2020_Test_Metadata.csv")) diff --git a/data-raw/melanoma.R 
b/data-raw/melanoma.R index 4aaf0e12..06cbc907 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -15,29 +15,31 @@ unzip2 <- function(path, exdir) { } constructor_melanoma = function(path) { - # should happen automatically, but this is needed for curl to work - fs::dir_create(path, recurse = TRUE) + require_namespaces("curl") base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" - - curl::curl_download(paste0(base_url, compressed_tarball_file_name), file.path(path, compressed_tarball_file_name)) - - untar(file.path(path, compressed_tarball_file_name), exdir = path) + compressed_tarball_path = file.path(path, compressed_tarball_file_name) + curl::curl_download(paste0(base_url, compressed_tarball_file_name), compressed_tarball_path) + utils::untar(compressed_tarball_path, exdir = path) + on.exit({file.remove(compressed_tarball_path)}, add = TRUE) training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" - training_metadata = data.table::fread(here::here(path, training_metadata_file_name)) + training_metadata = data.table::fread(file.path(path, training_metadata_file_name)) test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" - test_metadata = data.table::fread(here::here(path, test_metadata_file_name)) + test_metadata = file.path(path, test_metadata_file_name) training_metadata = training_metadata[, split := "train"] test_metadata = setnames(test_metadata, old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] - metadata = rbind(training_metadata, test_metadata, fill = TRUE) + metadata = rbind(training_metadata, test_metadata) + metadata[, image_name := NULL] + metadata[, target := NULL] + metadata = setnames(metadata, old = "benign_malignant", new = "outcome") melanoma_ds_generator = torch::dataset( initialize = function() { @@ -65,15 +67,9 @@ constructor_melanoma 
= function(path) { return(cbind(metadata, data.table(image = lt))) } -# path = file.path(here::here("cache"), "datasets", "melanoma") -# fs::dir_create(path, recurse = TRUE) - bench::system_time(melanoma_dt <- constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma"))) # melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) -melanoma_dt[, image_name := NULL] -melanoma_dt[, target := NULL] - # change the encodings of variables: diagnosis, benign_malignant melanoma_dt[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] @@ -84,7 +80,7 @@ tsk_melanoma = as_task_classif(melanoma_dt, target = "benign_malignant", id = "m tsk_melanoma$set_col_roles("patient_id", "group") tsk_melanoma$col_roles$feature = c(char_features, "age_approx", "image") -tsk_melanoma$label = "Melanoma classification" +tsk_melanoma$label = "Melanoma Classification" ci = col_info(tsk_melanoma$backend) diff --git a/tests/temp-TaskClassif_melanoma.R b/tests/temp-TaskClassif_melanoma.R deleted file mode 100644 index 3f1803eb..00000000 --- a/tests/temp-TaskClassif_melanoma.R +++ /dev/null @@ -1,41 +0,0 @@ -library(torch) -library(torchvision) -library(mlr3torch) -library(here) - -library(data.table) -library(tidytable) - -# TODO: figure out whether we want the v2 file -# I think no, since I don't really see a "use" for the lesion ID -training_metadata = fread(here::here("cache", "ISIC_2020_Training_GroundTruth.csv")) -# training_metadata_v2 = fread(here::here("cache", "ISIC_2020_Training_GroundTruth_v2.csv")) - -cache_dir = here("cache") -# construct a torch dataset -ds = torch::dataset( - initialize = function() { - self$.metadata = fread(here(cache_dir, "ISIC_2020_Training_GroundTruth.csv")) - self$.path = file.path(here(cache_dir), "train") - }, - .getitem = function(idx) { - force(idx) - - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$image_name, ".jpg"))) - x = 
torchvision::transform_to_tensor(x) - - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } -) - -melanoma_ds = ds() - -dd = as_data_descriptor(melanoma_ds, list(x = NULL)) -lt = lazy_tensor(dd) -dt_train = cbind(training_metadata, data.table(x = lt)) -# as_task_regr(dt_train, target = "corr", id = "guess_the_correlation") - -training_duplicates = fread(here(cache_dir, "ISIC_2020_Training_Duplicates.csv")) \ No newline at end of file diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index 380612f1..0a63ef95 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -1,4 +1,5 @@ skip_on_cran() +skip_if_not_installed("curl") test_that("melanoma task works", { withr::local_options(mlr3torch.cache = TRUE) From cf1148e84becfe3aff4282ef5e41a8f2827dba58 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 29 Nov 2024 11:50:47 +0100 Subject: [PATCH 36/46] deleted hfhub testS --- attic/hfhub_test.R | 69 ---------------------------------------------- 1 file changed, 69 deletions(-) delete mode 100644 attic/hfhub_test.R diff --git a/attic/hfhub_test.R b/attic/hfhub_test.R deleted file mode 100644 index 8c63b9d1..00000000 --- a/attic/hfhub_test.R +++ /dev/null @@ -1,69 +0,0 @@ -library(here) -library(data.table) - -devtools::load_all() - -file_names = c( - "ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", - "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" -) - -hf_cache_dir = here::here("cache", "hf_downloaded") - -# withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = hf_cache_dir), { -# path <- hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") -# }) - -# print(paths) - - -hf_dataset_path = here(hf_cache_dir, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") - -constructor_melanoma = function(path) { - file_names = c( - 
"ISIC_2020_Training_GroundTruth_v2.csv", "train1", "train2", "train3", "train4", - "ISIC_2020_Test_Metadata.csv", "ISIC_2020_Test_Input1", "ISIC_2020_Test_Input2" - ) - - # withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = path), { - # hfhub::hub_snapshot("carsonzhang/ISIC_2020_small", repo_type = "dataset") - # }) - - hf_dataset_path = here(path, "datasets--carsonzhang--ISIC_2020_small", "snapshots", "2737ff07cc2ef8bd44d692d3323472fce272fca3") - - training_metadata = fread(here(hf_dataset_path, "ISIC_2020_Training_GroundTruth_v2.csv"))[, split := "train"] - test_metadata = setnames(fread(here(hf_dataset_path, "ISIC_2020_Test_Metadata.csv")), - old = c("image", "patient", "anatom_site_general"), - new = c("image_name", "patient_id", "anatom_site_general_challenge") - )[, split := "test"] - metadata = rbind(training_metadata, test_metadata, fill = TRUE) - - # write to disk? - - melanoma_ds_generator = torch::dataset( - initialize = function() { - self$.metadata = metadata - self$.path = hf_dataset_path - }, - .getitem = function(idx) { - force(idx) - - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name))) - x = torchvision::transform_to_tensor(x) - - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } - ) - - melanoma_ds = melanoma_ds_generator() - - dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) - lt = lazy_tensor(dd) - - return(cbind(metadata, data.table(image = lt))) -} - -melanoma_ds = constructor_melanoma(hf_cache_dir) From bc0f7c8c1de6ceb710a54b5be86e99788c81707c Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 29 Nov 2024 13:31:41 +0100 Subject: [PATCH 37/46] TODO: move lazy tensor construction outside of the cache. Look at tiny_imagenet. 
Probably need to construct full file paths first --- R/TaskClassif_melanoma.R | 4 ++-- data-raw/melanoma.R | 20 ++++++-------------- inst/col_info/melanoma.rds | Bin 412 -> 410 bytes 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 9b4700f9..a9f52878 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -83,7 +83,7 @@ load_task_melanoma = function(id = "melanoma") { cached_constructor = function(backend) { data = cached(constructor_melanoma, "datasets", "melanoma")$data - data[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] + data[, outcome := factor(outcome, levels = c("benign", "malignant"))] char_features = c("sex", "anatom_site_general_challenge") data[, (char_features) := lapply(.SD, factor), .SDcols = char_features] @@ -108,7 +108,7 @@ load_task_melanoma = function(id = "melanoma") { task = TaskClassif$new( backend = backend, id = "melanoma", - target = "benign_malignant", + target = "outcome", label = "Melanoma Classification" ) diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index 06cbc907..faa3187e 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -6,14 +6,6 @@ devtools::load_all() library(data.table) withr::local_options(mlr3torch.cache = TRUE) -unzip2 <- function(path, exdir) { - if (grepl("linux", R.version$os)) { - utils::unzip(path, exdir = exdir) - } else { - zip::unzip(path, exdir = exdir) - } -} - constructor_melanoma = function(path) { require_namespaces("curl") @@ -26,17 +18,17 @@ constructor_melanoma = function(path) { on.exit({file.remove(compressed_tarball_path)}, add = TRUE) training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" - training_metadata = data.table::fread(file.path(path, training_metadata_file_name)) + training_metadata = fread(file.path(path, training_metadata_file_name)) test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" - test_metadata = file.path(path, 
test_metadata_file_name) + test_metadata = fread(file.path(path, test_metadata_file_name)) training_metadata = training_metadata[, split := "train"] test_metadata = setnames(test_metadata, old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] - metadata = rbind(training_metadata, test_metadata) + metadata = rbind(training_metadata, test_metadata, fill = TRUE) metadata[, image_name := NULL] metadata[, target := NULL] metadata = setnames(metadata, old = "benign_malignant", new = "outcome") @@ -70,13 +62,13 @@ constructor_melanoma = function(path) { bench::system_time(melanoma_dt <- constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma"))) # melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) -# change the encodings of variables: diagnosis, benign_malignant -melanoma_dt[, benign_malignant := factor(benign_malignant, levels = c("benign", "malignant"))] +# change the encodings of variables: diagnosis, outcome +melanoma_dt[, outcome := factor(outcome, levels = c("benign", "malignant"))] char_features = c("sex", "anatom_site_general_challenge") melanoma_dt[, (char_features) := lapply(.SD, factor), .SDcols = char_features] -tsk_melanoma = as_task_classif(melanoma_dt, target = "benign_malignant", id = "melanoma") +tsk_melanoma = as_task_classif(melanoma_dt, target = "outcome", id = "melanoma") tsk_melanoma$set_col_roles("patient_id", "group") tsk_melanoma$col_roles$feature = c(char_features, "age_approx", "image") diff --git a/inst/col_info/melanoma.rds b/inst/col_info/melanoma.rds index 52d1d4b0fde938d7cb28a81d7cabf5203fd329bb..9825d45accbb4ae921375362868687c839e8d8cb 100644 GIT binary patch literal 410 zcmV;L0cHLliwFP!000001D#S$Ps1<_^|o$f0|W?(8xntj?#eIV7l6dA@>;ixNRuit zALGQI=E`tKvW<@x0~_QMna29P9u7z||#!~472>l?w&*9QQnvJN4EiTn@J z)Y+Fz6(OEtMHx1Yv#$ZUM1ySW%qym>B0~;!mM>6iG8F~Vq)=EH>y?i{sWfE<>&O_Y 
z8n<*!URkrfc5I(Gw;S@bK~`jVOBi^HhK9b;iaT|Tm0_yL{T_}=%-P0hBJQ04v==zk z__oZLjNhO1hvwd6fb+F)fX}enI>vzrF=0Vi%nap^0XWh2h1?arvZGq@GQdYx+A)*1 zC`KLKH>hhr^HyFtzL{9@HcCA;%@3)qm0e5ySjk+}D-(^?s3k(fJJdnP{ir0N#cn3+ z{hBR93DAqC+VSumiT2G zN@ck5Lcs;55-J}V@e1}fu|UI>Ji@8La+LO<1O`1>WNN`}C!ryi7E9FUw}S#QtIqFY zN=lP}{|aLP-|N((G#3Q@dkJIHmYcDS8%-WVcaZsj1wuZPn0+|mR;VTFM_gHr6}M|A zUn{E1jOxQJ2+v!CsLaeLdRG5tgkqGp`mE~{s~tk$lkbY^K@PP&5N|s4L5F@&Acgtz zhV8mN9rkJNd8nozOvj?XFL`L!wYL}u{6b+m1~K(;;!SRv;2-*Ix7*+DO_>1WZ}NGL za Date: Mon, 2 Dec 2024 15:11:44 +0100 Subject: [PATCH 38/46] tests not working --- R/TaskClassif_melanoma.R | 6 ++++-- data-raw/melanoma.R | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index a9f52878..f20cde2f 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -48,7 +48,8 @@ constructor_melanoma = function(path) { old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] - metadata = rbind(training_metadata, test_metadata) + # response column needs to be filled for the test data + metadata = rbind(training_metadata, test_metadata, fill = TRUE) metadata[, image_name := NULL] metadata[, target := NULL] metadata = setnames(metadata, old = "benign_malignant", new = "outcome") @@ -83,7 +84,8 @@ load_task_melanoma = function(id = "melanoma") { cached_constructor = function(backend) { data = cached(constructor_melanoma, "datasets", "melanoma")$data - data[, outcome := factor(outcome, levels = c("benign", "malignant"))] + data[, outcome := factor(get(outcome), levels = c("benign", "malignant"))] + set(data, j = "outcome", value = factor(get(outcome), levels = c("benign", "malignant"))) char_features = c("sex", "anatom_site_general_challenge") data[, (char_features) := lapply(.SD, factor), .SDcols = char_features] diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index faa3187e..240b1927 100644 --- 
a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -28,6 +28,7 @@ constructor_melanoma = function(path) { old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") )[, split := "test"] + # response column needs to be filled for the test data metadata = rbind(training_metadata, test_metadata, fill = TRUE) metadata[, image_name := NULL] metadata[, target := NULL] @@ -63,7 +64,7 @@ bench::system_time(melanoma_dt <- constructor_melanoma(file.path(get_cache_dir() # melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) # change the encodings of variables: diagnosis, outcome -melanoma_dt[, outcome := factor(outcome, levels = c("benign", "malignant"))] +melanoma_dt[, outcome := factor(get(outcome), levels = c("benign", "malignant"))] char_features = c("sex", "anatom_site_general_challenge") melanoma_dt[, (char_features) := lapply(.SD, factor), .SDcols = char_features] From b722a3f86f7fac95f64d5010094668fabc655b60 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Tue, 3 Dec 2024 14:31:18 +0100 Subject: [PATCH 39/46] Woiefjwoeif --- R/TaskClassif_melanoma.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index f20cde2f..1b66b984 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -50,8 +50,8 @@ constructor_melanoma = function(path) { )[, split := "test"] # response column needs to be filled for the test data metadata = rbind(training_metadata, test_metadata, fill = TRUE) - metadata[, image_name := NULL] - metadata[, target := NULL] + metadata[, "image_name" := NULL] + metadata[, "target" := NULL] metadata = setnames(metadata, old = "benign_malignant", new = "outcome") melanoma_ds_generator = torch::dataset( @@ -84,8 +84,8 @@ load_task_melanoma = function(id = "melanoma") { cached_constructor = function(backend) { data = cached(constructor_melanoma, "datasets", "melanoma")$data - 
data[, outcome := factor(get(outcome), levels = c("benign", "malignant"))] - set(data, j = "outcome", value = factor(get(outcome), levels = c("benign", "malignant"))) + data[, "outcome" := factor(get("outcome"), levels = c("benign", "malignant"))] + # set(data, j = "outcome", value = factor(get(outcome), levels = c("benign", "malignant"))) char_features = c("sex", "anatom_site_general_challenge") data[, (char_features) := lapply(.SD, factor), .SDcols = char_features] From 297a34cf30ccc8c8f7645e295cd799f383e4b5f9 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Thu, 5 Dec 2024 23:47:35 +0100 Subject: [PATCH 40/46] tests pass --- R/TaskClassif_melanoma.R | 64 +++++++++++---------- data-raw/melanoma.R | 67 ++++++++++++---------- tests/testthat/test_TaskClassif_melanoma.R | 4 +- 3 files changed, 71 insertions(+), 64 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 1b66b984..17f2bbaf 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -29,6 +29,9 @@ NULL constructor_melanoma = function(path) { require_namespaces("curl") + # should happen automatically, but this is needed for curl to work + fs::dir_create(path, recurse = TRUE) + base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" @@ -41,7 +44,7 @@ constructor_melanoma = function(path) { training_metadata = data.table::fread(file.path(path, training_metadata_file_name)) test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" - test_metadata = file.path(path, test_metadata_file_name) + test_metadata = data.table::fread(file.path(path, test_metadata_file_name)) training_metadata = training_metadata[, split := "train"] test_metadata = setnames(test_metadata, @@ -52,43 +55,42 @@ constructor_melanoma = function(path) { metadata = rbind(training_metadata, test_metadata, fill = TRUE) metadata[, "image_name" := NULL] metadata[, "target" := NULL] - metadata = setnames(metadata, old = 
"benign_malignant", new = "outcome") - - melanoma_ds_generator = torch::dataset( - initialize = function() { - self$.metadata = metadata - self$.path = path - }, - .getitem = function(idx) { - force(idx) - - x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name))) - x = torchvision::transform_to_tensor(x) - - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } - ) - - melanoma_ds = melanoma_ds_generator() - - dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) - lt = lazy_tensor(dd) + setnames(metadata, old = "benign_malignant", new = "outcome") - return(cbind(metadata, data.table(image = lt))) + metadata } load_task_melanoma = function(id = "melanoma") { cached_constructor = function(backend) { - data = cached(constructor_melanoma, "datasets", "melanoma")$data + metadata = cached(constructor_melanoma, "datasets", "melanoma")$data + + melanoma_ds_generator = torch::dataset( + initialize = function(metadata, cache_dir) { + self$.metadata = metadata + self$.cache_dir = cache_dir + }, + .getitem = function(idx) { + force(idx) + + x = torchvision::base_loader(file.path(self$.cache_dir, "raw", paste0(self$.metadata[idx, ]$file_name))) + x = torchvision::transform_to_tensor(x) + + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } + ) + + melanoma_ds = melanoma_ds_generator(metadata, file.path(get_cache_dir(), "datasets", "melanoma")) + + dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) + lt = lazy_tensor(dd) - data[, "outcome" := factor(get("outcome"), levels = c("benign", "malignant"))] - # set(data, j = "outcome", value = factor(get(outcome), levels = c("benign", "malignant"))) + data = cbind(metadata, data.table(image = lt)) - char_features = c("sex", "anatom_site_general_challenge") - data[, (char_features) := lapply(.SD, factor), .SDcols = char_features] + char_vars = c("outcome", "sex", "anatom_site_general_challenge") + data[, (char_vars) := 
lapply(.SD, factor), .SDcols = char_vars] dt = cbind( data, diff --git a/data-raw/melanoma.R b/data-raw/melanoma.R index 240b1927..414835fe 100644 --- a/data-raw/melanoma.R +++ b/data-raw/melanoma.R @@ -9,11 +9,15 @@ withr::local_options(mlr3torch.cache = TRUE) constructor_melanoma = function(path) { require_namespaces("curl") + # should happen automatically, but this is needed for curl to work + fs::dir_create(path, recurse = TRUE) + base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" compressed_tarball_path = file.path(path, compressed_tarball_file_name) - curl::curl_download(paste0(base_url, compressed_tarball_file_name), compressed_tarball_path) + + curl::curl_download(url = paste0(base_url, compressed_tarball_file_name), destfile = compressed_tarball_path) utils::untar(compressed_tarball_path, exdir = path) on.exit({file.remove(compressed_tarball_path)}, add = TRUE) @@ -23,55 +27,56 @@ constructor_melanoma = function(path) { test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" test_metadata = fread(file.path(path, test_metadata_file_name)) - training_metadata = training_metadata[, split := "train"] + training_metadata = training_metadata[, "split" := "train"] test_metadata = setnames(test_metadata, old = c("image", "patient", "anatom_site_general"), new = c("image_name", "patient_id", "anatom_site_general_challenge") - )[, split := "test"] + )[, "split" := "test"] # response column needs to be filled for the test data metadata = rbind(training_metadata, test_metadata, fill = TRUE) - metadata[, image_name := NULL] - metadata[, target := NULL] + metadata[, "image_name" := NULL] + metadata[, "target" := NULL] metadata = setnames(metadata, old = "benign_malignant", new = "outcome") - melanoma_ds_generator = torch::dataset( - initialize = function() { - self$.metadata = metadata - self$.path = path - }, - .getitem = function(idx) { - force(idx) + metadata +} - x = 
torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name))) - x = torchvision::transform_to_tensor(x) +metadata = constructor_melanoma(path <- file.path(get_cache_dir(), "datasets", "melanoma")) - return(list(x = x)) - }, - .length = function() { - nrow(self$.metadata) - } - ) +melanoma_ds_generator = torch::dataset( + initialize = function() { + self$.metadata = metadata + self$.path = path + }, + .getitem = function(idx) { + force(idx) - melanoma_ds = melanoma_ds_generator() + x = torchvision::base_loader(file.path(self$.path, paste0(self$.metadata[idx, ]$file_name))) + x = torchvision::transform_to_tensor(x) - dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) - lt = lazy_tensor(dd) + return(list(x = x)) + }, + .length = function() { + nrow(self$.metadata) + } +) - return(cbind(metadata, data.table(image = lt))) -} +melanoma_ds = melanoma_ds_generator() + +dd = as_data_descriptor(melanoma_ds, list(x = c(NA, 3, 128, 128))) +lt = lazy_tensor(dd) -bench::system_time(melanoma_dt <- constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma"))) -# melanoma_dt = constructor_melanoma(file.path(get_cache_dir(), "datasets", "melanoma")) +melanoma_dt = cbind(metadata, data.table(image = lt)) # change the encodings of variables: diagnosis, outcome -melanoma_dt[, outcome := factor(get(outcome), levels = c("benign", "malignant"))] +# melanoma_dt[, "outcome" := factor(get("outcome"), levels = c("benign", "malignant"))] -char_features = c("sex", "anatom_site_general_challenge") -melanoma_dt[, (char_features) := lapply(.SD, factor), .SDcols = char_features] +char_vars = c("outcome", "sex", "anatom_site_general_challenge") +melanoma_dt[, (char_vars) := lapply(.SD, factor), .SDcols = char_vars] tsk_melanoma = as_task_classif(melanoma_dt, target = "outcome", id = "melanoma") tsk_melanoma$set_col_roles("patient_id", "group") -tsk_melanoma$col_roles$feature = c(char_features, "age_approx", "image") 
+tsk_melanoma$col_roles$feature = c("sex", "anatom_site_general_challenge", "age_approx", "image") tsk_melanoma$label = "Melanoma Classification" diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R index 0a63ef95..f7c74c17 100644 --- a/tests/testthat/test_TaskClassif_melanoma.R +++ b/tests/testthat/test_TaskClassif_melanoma.R @@ -6,9 +6,9 @@ test_that("melanoma task works", { task = tsk("melanoma") expect_equal(task$id, "melanoma") - expect_equal(task$label, "Melanoma classification") + expect_equal(task$label, "Melanoma Classification") expect_equal(task$feature_names, c("sex", "anatom_site_general_challenge", "age_approx", "image")) - expect_equal(task$target_names, "benign_malignant") + expect_equal(task$target_names, "outcome") expect_equal(task$man, "mlr3torch::mlr_tasks_melanoma") expect_equal(task$properties, c("twoclass", "groups")) From 9ddc43ba6d86ba6389ddd246d8cecb8a85d3d42c Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 6 Dec 2024 07:54:59 +0100 Subject: [PATCH 41/46] removed dep on fs --- R/TaskClassif_melanoma.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 17f2bbaf..bf626791 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -30,7 +30,7 @@ constructor_melanoma = function(path) { require_namespaces("curl") # should happen automatically, but this is needed for curl to work - fs::dir_create(path, recurse = TRUE) + if (!dir.exists(path)) dir.create(path) base_url = "https://huggingface.co/datasets/carsonzhang/ISIC_2020_small/resolve/main/" From eb659f6db2a112a81a2ed93f5c5d9114de8c87a2 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Fri, 6 Dec 2024 08:34:22 +0000 Subject: [PATCH 42/46] some fixes --- R/TaskClassif_melanoma.R | 9 ++++++--- man/mlr_tasks_melanoma.Rd | 17 ++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/R/TaskClassif_melanoma.R 
b/R/TaskClassif_melanoma.R index bf626791..c59b0da3 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -2,8 +2,11 @@ #' @name mlr_tasks_melanoma #' @description #' Classification of melanoma tumor images. +#' The data is a preprocessed version of the 2020 SIIM-ISIC challenge where +#' the images have been reshaped to size $(3, 128, 128)$. #' -#' The data comes from the 2020 SIIM-ISIC challenge. +#' By default only the training rows are active in the task, +#' but the test data (that has no targets) is also included. #' #' @section Construction: #' ``` @@ -13,7 +16,7 @@ #' @template task_download #' #' @source -#' \url{https://challenge2020.isic-archive.com/} +#' \url{https://huggingface.co/datasets/carsonzhang/ISIC_2020_small} #' #' @section Properties: #' `r rd_info_task_torch("melanoma", missings = FALSE)` @@ -36,9 +39,9 @@ constructor_melanoma = function(path) { compressed_tarball_file_name = "hf_ISIC_2020_small.tar.gz" compressed_tarball_path = file.path(path, compressed_tarball_file_name) + on.exit({file.remove(compressed_tarball_path)}, add = TRUE) curl::curl_download(paste0(base_url, compressed_tarball_file_name), compressed_tarball_path) utils::untar(compressed_tarball_path, exdir = path) - on.exit({file.remove(compressed_tarball_path)}, add = TRUE) training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" training_metadata = data.table::fread(file.path(path, training_metadata_file_name)) diff --git a/man/mlr_tasks_melanoma.Rd b/man/mlr_tasks_melanoma.Rd index 55839d36..3098495c 100644 --- a/man/mlr_tasks_melanoma.Rd +++ b/man/mlr_tasks_melanoma.Rd @@ -4,12 +4,15 @@ \alias{mlr_tasks_melanoma} \title{Melanoma Image classification} \source{ -\url{https://challenge2020.isic-archive.com/} +\url{https://huggingface.co/datasets/carsonzhang/ISIC_2020_small} } \description{ Classification of melanoma tumor images. 
+The data is a preprocessed version of the 2020 SIIM-ISIC challenge where +the images have been reshaped to size $(3, 128, 128)$. -The data comes from the 2020 SIIM-ISIC challenge. +By default only the training rows are active in the task, +but the test data (that has no targets) is also included. } \section{Construction}{ @@ -32,21 +35,17 @@ as the cache directory. \item Task type: \dQuote{classif} \item Properties: \dQuote{twoclass}, \dQuote{groups} \item Has Missings: no -\item Target: \dQuote{benign_malignant} +\item Target: \dQuote{outcome} \item Features: \dQuote{sex}, \dQuote{anatom_site_general_challenge}, \dQuote{age_approx}, \dQuote{image} \item Data Dimension: 43683x11 } } -\examples{ -\dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -task = tsk("melanoma") -task -\dontshow{\}) # examplesIf} -} \references{ Rotemberg, V., Kurtansky, N., Betz-Stablein, B., Caffery, L., Chousakos, E., Codella, N., Combalia, M., Dusza, S., Guitera, P., Gutman, D., Halpern, A., Helba, B., Kittler, H., Kose, K., Langer, S., Lioprys, K., Malvehy, J., Musthaq, S., Nanda, J., Reiter, O., Shih, G., Stratigos, A., Tschandl, P., Weber, J., Soyer, P. (2021). \dQuote{A patient-centric dataset of images and metadata for identifying melanomas using clinical context.} \emph{Scientific Data}, \bold{8}, 34. \doi{10.1038/s41597-021-00815-z}. +task = tsk("melanoma") +task } From ea7d799246650bbff63637beae2faf1344801bb5 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 6 Dec 2024 10:16:57 +0100 Subject: [PATCH 43/46] started col desc --- R/TaskClassif_melanoma.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index c59b0da3..2cdd48b8 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -7,6 +7,8 @@ #' #' By default only the training rows are active in the task, #' but the test data (that has no targets) is also included. 
+#' +#' * Column `"Id"` has been removed. #' #' @section Construction: #' ``` From 0bc0c6fad47f42aba89f5cfd5f24d1748146b91d Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 6 Dec 2024 11:10:15 +0100 Subject: [PATCH 44/46] docs --- R/TaskClassif_melanoma.R | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 2cdd48b8..25dd9e29 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -7,8 +7,14 @@ #' #' By default only the training rows are active in the task, #' but the test data (that has no targets) is also included. -#' -#' * Column `"Id"` has been removed. +#' +#' There are no labels for the test rows, so by default, these observations are inactive, +#' which means that the task uses only 32701 of the 43683 observations that are defined in the underlying data backend. +#' +#' The data backend also contains a more detailed `diagnosis` of the specific type of tumor. +#' +#' * The positive class has been set to `malignant` +#' * `anatom_site_general_challenge` is the location of the tumor on the patient's body. #' #' @section Construction: #' ``` @@ -118,6 +124,7 @@ load_task_melanoma = function(id = "melanoma") { backend = backend, id = "melanoma", target = "outcome", + positive = "malignant", label = "Melanoma Classification" ) From c7569395522d160b73c6ea795f16bd7248c1a242 Mon Sep 17 00:00:00 2001 From: Carson Zhang Date: Fri, 6 Dec 2024 12:36:27 +0100 Subject: [PATCH 45/46] added task documentation --- R/TaskClassif_melanoma.R | 9 ++++++--- man/mlr_tasks_melanoma.Rd | 13 +++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R index 25dd9e29..e78fbcfb 100644 --- a/R/TaskClassif_melanoma.R +++ b/R/TaskClassif_melanoma.R @@ -12,9 +12,12 @@ #' which means that the task uses only 32701 of the 43683 observations that are defined in the underlying data backend. 
#' #' The data backend also contains a more detailed `diagnosis` of the specific type of tumor. -#' -#' * The positive class has been set to `malignant` -#' * `anatom_site_general_challenge` is the location of the tumor on the patient's body. +#' +#' Columns: +#' * `outcome` (factor): the target variable. Whether the tumor is benign or malignant (the positive class) +#' * `anatom_site_general_challenge` (factor): the location of the tumor on the patient's body +#' * `sex` (factor): the sex of the patient +#' * `age_approx` (int): approximate age of the patient at the time of imaging #' #' @section Construction: #' ``` diff --git a/man/mlr_tasks_melanoma.Rd b/man/mlr_tasks_melanoma.Rd index 3098495c..eebf8748 100644 --- a/man/mlr_tasks_melanoma.Rd +++ b/man/mlr_tasks_melanoma.Rd @@ -13,6 +13,19 @@ the images have been reshaped to size $(3, 128, 128)$. By default only the training rows are active in the task, but the test data (that has no targets) is also included. + +There are no labels for the test rows, so by default, these observations are inactive, +which means that the task uses only 32701 of the 43683 observations that are defined in the underlying data backend. + +The data backend also contains a more detailed \code{diagnosis} of the specific type of tumor. + +Columns: +\itemize{ +\item \code{outcome} (factor): the target variable. 
Whether the tumor is benign or malignant (the positive class)
+\item \code{anatom_site_general_challenge} (factor): the location of the tumor on the patient's body
+\item \code{sex} (factor): the sex of the patient
+\item \code{age_approx} (int): approximate age of the patient at the time of imaging
+}
 }
 \section{Construction}{

From 04172f859365e312abb4aac7d6fcc12bc41a7619 Mon Sep 17 00:00:00 2001
From: Sebastian Fischer
Date: Fri, 6 Dec 2024 17:03:11 +0000
Subject: [PATCH 46/46] last fixes

---
 R/TaskClassif_melanoma.R                   | 11 ++++++++---
 man/mlr_tasks_melanoma.Rd                  |  3 +++
 tests/testthat/test_TaskClassif_melanoma.R |  3 +++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/R/TaskClassif_melanoma.R b/R/TaskClassif_melanoma.R
index e78fbcfb..3c4c5531 100644
--- a/R/TaskClassif_melanoma.R
+++ b/R/TaskClassif_melanoma.R
@@ -7,17 +7,20 @@
 #'
 #' By default only the training rows are active in the task,
 #' but the test data (that has no targets) is also included.
+#' Whether an observation is part of the train or test set is indicated by the column `"test"`.
 #'
 #' There are no labels for the test rows, so by default, these observations are inactive,
 #' which means that the task uses only 32701 of the 43683 observations that are defined in the underlying data backend.
 #'
 #' The data backend also contains a more detailed `diagnosis` of the specific type of tumor.
-#' 
+#'
 #' Columns:
 #' * `outcome` (factor): the target variable. Whether the tumor is benign or malignant (the positive class)
 #' * `anatom_site_general_challenge` (factor): the location of the tumor on the patient's body
 #' * `sex` (factor): the sex of the patient
 #' * `age_approx` (int): approximate age of the patient at the time of imaging
+#' * `image` (lazy_tensor): The image (shape $(3, 128, 128)$) of the tumor.
+#' * `split` (character): Whether the observation is part of the train or test set.
#' #' @section Construction: #' ``` @@ -43,6 +46,8 @@ NULL constructor_melanoma = function(path) { require_namespaces("curl") + lg$info("Downloading the dataset") + # should happen automatically, but this is needed for curl to work if (!dir.exists(path)) dir.create(path) @@ -55,10 +60,10 @@ constructor_melanoma = function(path) { utils::untar(compressed_tarball_path, exdir = path) training_metadata_file_name = "ISIC_2020_Training_GroundTruth_v2.csv" - training_metadata = data.table::fread(file.path(path, training_metadata_file_name)) + training_metadata = fread(file.path(path, training_metadata_file_name)) test_metadata_file_name = "ISIC_2020_Test_Metadata.csv" - test_metadata = data.table::fread(file.path(path, test_metadata_file_name)) + test_metadata = fread(file.path(path, test_metadata_file_name)) training_metadata = training_metadata[, split := "train"] test_metadata = setnames(test_metadata, diff --git a/man/mlr_tasks_melanoma.Rd b/man/mlr_tasks_melanoma.Rd index eebf8748..7ea655ee 100644 --- a/man/mlr_tasks_melanoma.Rd +++ b/man/mlr_tasks_melanoma.Rd @@ -13,6 +13,7 @@ the images have been reshaped to size $(3, 128, 128)$. By default only the training rows are active in the task, but the test data (that has no targets) is also included. +Whether an observation is part of the train or test set is indicated by the column \code{"test"}. There are no labels for the test rows, so by default, these observations are inactive, which means that the task uses only 32701 of the 43683 observations that are defined in the underlying data backend. @@ -25,6 +26,8 @@ Columns: \item \code{anatom_site_general_challenge} (factor): the location of the tumor on the patient's body \item \code{sex} (factor): the sex of the patient \item \code{age_approx} (int): approximate age of the patient at the time of imaging +\item \code{image} (lazy_tensor): The image (shape $(3, 128, 128)$) of the tumor. 
+\item \code{split} (character): Whether the observation is part of the train or test set.
 }
 }
 \section{Construction}{
diff --git a/tests/testthat/test_TaskClassif_melanoma.R b/tests/testthat/test_TaskClassif_melanoma.R
index f7c74c17..edfc71b7 100644
--- a/tests/testthat/test_TaskClassif_melanoma.R
+++ b/tests/testthat/test_TaskClassif_melanoma.R
@@ -13,6 +13,9 @@ test_that("melanoma task works", {
   expect_equal(task$properties, c("twoclass", "groups"))
   task$data()
+  x = materialize(task$data(1, "image")[[1]])[[1L]]
+  expect_class(x, "torch_tensor")
+  expect_equal(x$shape, c(3, 128, 128))
   expect_true("data.rds" %in% list.files(file.path(get_cache_dir(), "datasets", "melanoma")))
   expect_equal(task$backend$nrow, 32701 + 10982)