write and test split_dataset()

lgessl · Dec 8, 2023 · 0fdafeb · 0fdafeb
1 parent 0f2b554
commit 0fdafeb
Show file tree

Hide file tree

Showing 2 changed files with 132 additions and 0 deletions.
diff --git a/R/split_dataset.R b/R/split_dataset.R
@@ -0,0 +1,82 @@
+#' Split a data set into train and test partitions and write both to disk
+#'
+#' For each partition, the expression table, the pheno table and an adapted
+#' copy of `data_spec` are written into a sub-directory ("train" or "test")
+#' of `data_spec$directory`.
+#'
+#' @param expr_tbl Expression table. Column 1 is kept in both partitions;
+#'   column `i + 1` is selected together with row `i` of `pheno_tbl`.
+#' @param pheno_tbl Pheno table with one row per sample.
+#' @param data_spec DataSpec-like object; `name`, `directory`, `pfs_col`,
+#'   `progression_col`, `expr_fname` and `pheno_fname` are read from it.
+#' @param train_prop Proportion of samples for the train partition. Passed as
+#'   `p` to `create_data_partition()` or, for the random split, multiplied by
+#'   the number of samples and rounded.
+#' @param pfs_cut PFS threshold used to build the risk groups; required if
+#'   `based_on_pfs_cut` is `TRUE`.
+#' @param based_on_pfs_cut If `TRUE`, obtain the train indices from
+#'   `create_data_partition()` on the risk groups instead of `sample()`.
+#' @param quiet If `TRUE`, suppress progress messages.
+#' @return A list with elements "train" and "test", each a list holding
+#'   "expr", "pheno" and "data_spec".
+split_dataset <- function(
+    expr_tbl,
+    pheno_tbl,
+    data_spec,
+    train_prop,
+    pfs_cut = NULL,
+    based_on_pfs_cut = FALSE,
+    quiet = FALSE
+){
+    check_consistent_patient_ids(
+        stage = "preprocessing",
+        expr_tbl,
+        pheno_tbl,
+        data_spec
+    )
+    pfs_col <- data_spec$pfs_col
+    progression_col <- data_spec$progression_col
+    n_samples <- nrow(pheno_tbl)
+
+    # Split indices
+    if(based_on_pfs_cut){
+        if(is.null(pfs_cut)){
+            stop("pfs_cut must be specified if based_on_pfs_cut is TRUE")
+        }
+        pfs <- pheno_tbl[[pfs_col]]
+        progressed <- pheno_tbl[[progression_col]] == 1
+        # Samples matching neither rule (pfs below the cut without
+        # progression == 1, or missing values) keep the label "na"
+        risk_group <- rep("na", n_samples)
+        risk_group[pfs < pfs_cut & progressed] <- "high"
+        risk_group[pfs >= pfs_cut] <- "low"
+        risk_group <- as.factor(risk_group)
+        train_index <- create_data_partition(risk_group, p = train_prop)
+    } else{
+        n_train <- round(n_samples * train_prop)
+        # seq_len() stays empty for a table without rows, unlike 1:0
+        train_index <- sample(seq_len(n_samples), n_train, replace = FALSE)
+    }
+
+    # Split data by indices. The test partition uses positive complements:
+    # negating an empty train_index would select nothing instead of everything.
+    test_index <- setdiff(seq_len(n_samples), train_index)
+    test_cols <- setdiff(seq_len(ncol(expr_tbl)), train_index + 1)
+    split <- list(
+        "train" = list(),
+        "test" = list()
+    )
+    split[["train"]][["expr"]] <- expr_tbl[, c(1, train_index + 1)]
+    split[["train"]][["pheno"]] <- pheno_tbl[train_index, ]
+    split[["test"]][["expr"]] <- expr_tbl[, test_cols]
+    split[["test"]][["pheno"]] <- pheno_tbl[test_index, ]
+    if(!quiet){
+        message("Splitting data into train and test data sets")
+        message("Train data set has ", nrow(split[["train"]][["pheno"]]), " samples")
+        message("Test data set has ", nrow(split[["test"]][["pheno"]]), " samples")
+    }
+
+    # Generate new DataSpec and save
+    for(partition in c("train", "test")){
+        ds_partition <- data_spec
+        ds_partition$name <- stringr::str_c(data_spec$name, " ", partition)
+        ds_partition$directory <- file.path(data_spec$directory, partition)
+        split[[partition]][["data_spec"]] <- ds_partition
+        # The directory must exist for the writes below even when quiet;
+        # only the message depends on quiet
+        if(!dir.exists(ds_partition$directory)){
+            if(!quiet){
+                message("Creating directory ", ds_partition$directory)
+            }
+            dir.create(ds_partition$directory, recursive = TRUE)
+        }
+        if(!quiet){
+            message("Writing ", partition, " data to ", ds_partition$directory)
+            message("... as ", ds_partition$expr_fname)
+        }
+        readr::write_csv(
+            split[[partition]][["expr"]],
+            file.path(ds_partition$directory, ds_partition$expr_fname)
+        )
+        if(!quiet){
+            message("... as ", ds_partition$pheno_fname)
+        }
+        readr::write_csv(
+            split[[partition]][["pheno"]],
+            file.path(ds_partition$directory, ds_partition$pheno_fname)
+        )
+        if(!quiet){
+            message("... as data_spec.rds")
+        }
+        saveRDS(ds_partition, file.path(ds_partition$directory, "data_spec.rds"))
+    }
+
+    return(split)
+}
diff --git a/tests/testthat/test-split_dataset.R b/tests/testthat/test-split_dataset.R
@@ -0,0 +1,50 @@
+# For the pfs-cut-based and the random split, each partition must pass
+# qc_preprocess(), keep every row of the expression table and keep the
+# column names of the pheno table.
+test_that("split_dataset works", {
+
+  set.seed(234)
+
+  tmp_dir <- withr::local_tempdir()
+  mock <- generate_mock_data(
+    n_samples = 100,
+    n_genes = 1,
+    n_na_in_pheno = 0,
+    to_csv = tmp_dir
+  )
+  expr_tbl <- mock[["expr_tbl"]]
+  pheno_tbl <- mock[["pheno_tbl"]]
+  data_spec <- DataSpec(name = "mock", directory = tmp_dir)
+
+  for(use_pfs_cut in c(TRUE, FALSE)){
+
+    parts <- split_dataset(
+      expr_tbl = expr_tbl,
+      pheno_tbl = pheno_tbl,
+      data_spec = data_spec,
+      train_prop = 0.5,
+      pfs_cut = 2,
+      based_on_pfs_cut = use_pfs_cut
+    )
+
+    for(part_name in c("train", "test")){
+      part <- parts[[part_name]]
+      part_expr <- part[["expr"]]
+      part_pheno <- part[["pheno"]]
+
+      expect_no_error(
+        qc_preprocess(
+          expr_tbl = part_expr,
+          pheno_tbl = part_pheno,
+          data_spec = part[["data_spec"]]
+        )
+      )
+      # Splitting selects columns (samples), so the row count is unchanged
+      expect_equal(nrow(part_expr), nrow(expr_tbl))
+      expect_equal(colnames(part_pheno), colnames(pheno_tbl))
+    }
+  }
+})