Skip to content

Commit

Permalink
write and test split_dataset()
Browse files Browse the repository at this point in the history
  • Loading branch information
lgessl committed Dec 8, 2023
1 parent 0f2b554 commit 0fdafeb
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 0 deletions.
82 changes: 82 additions & 0 deletions R/split_dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#' Split a data set into a train and a test partition and write both to disk
#'
#' Validates the input via `check_consistent_patient_ids()`, draws the indices
#' of the training samples, subsets `expr_tbl` and `pheno_tbl` accordingly and
#' stores every partition in its own subdirectory (`train` and `test`) of
#' `data_spec$directory`, together with an adapted copy of `data_spec`.
#'
#' @param expr_tbl Expression table. Column 1 is kept in both partitions and
#'   column `i + 1` is treated as belonging to row `i` of `pheno_tbl`
#'   (presumably column 1 holds the gene identifiers; confirm against
#'   `check_consistent_patient_ids()`).
#' @param pheno_tbl Pheno table with one row per sample.
#' @param data_spec Data specification. This function reads its `pfs_col`,
#'   `progression_col`, `name`, `directory`, `expr_fname` and `pheno_fname`
#'   entries.
#' @param train_prop Proportion of the samples that go into the train
#'   partition.
#' @param pfs_cut Threshold on the `pfs_col` column used to form risk groups.
#'   Only needed if `based_on_pfs_cut` is `TRUE`.
#' @param based_on_pfs_cut If `TRUE`, samples are labelled `"high"` (value in
#'   `pfs_col` below `pfs_cut` and `progression_col` equal to 1), `"low"`
#'   (value in `pfs_col` at least `pfs_cut`) or `"na"` (everything else) and
#'   these labels are handed to `create_data_partition()` to obtain the
#'   training indices. If `FALSE`, the training indices are sampled uniformly
#'   without replacement.
#' @param quiet If `TRUE`, no progress messages are emitted. Files and
#'   directories are written in either case.
#' @return A list with the elements `"train"` and `"test"`, each of them a
#'   list with the elements `"expr"`, `"pheno"` and `"data_spec"`.
split_dataset <- function(
  expr_tbl,
  pheno_tbl,
  data_spec,
  train_prop,
  pfs_cut = NULL,
  based_on_pfs_cut = FALSE,
  quiet = FALSE
){
  check_consistent_patient_ids(
    stage = "preprocessing",
    expr_tbl,
    pheno_tbl,
    data_spec
  )
  pfs_col <- data_spec$pfs_col
  progression_col <- data_spec$progression_col
  n_samples <- nrow(pheno_tbl)

  # Split indices
  if(based_on_pfs_cut){
    if(is.null(pfs_cut)){
      stop("pfs_cut must be specified if based_on_pfs_cut is TRUE")
    }
    # Samples matching neither condition below keep the label "na"
    risk_group <- rep("na", n_samples)
    risk_group[pheno_tbl[[pfs_col]] < pfs_cut & pheno_tbl[[progression_col]] == 1] <- "high"
    risk_group[pheno_tbl[[pfs_col]] >= pfs_cut] <- "low"
    risk_group <- as.factor(risk_group)
    train_index <- create_data_partition(risk_group, p = train_prop)
  } else{
    n_train <- round(n_samples * train_prop)
    # sample.int() draws exactly like sample(1:n, ...) but is also safe for
    # n == 0, where 1:n would be c(1, 0)
    train_index <- sample.int(n_samples, n_train, replace = FALSE)
  }

  # Split data by indices. The complement is computed explicitly because
  # negative indexing with an empty train_index would select nothing instead
  # of everything.
  test_rows <- setdiff(seq_len(n_samples), train_index)
  test_cols <- setdiff(seq_len(ncol(expr_tbl)), train_index + 1)
  split <- list(
    "train" = list(),
    "test" = list()
  )
  # Sample i sits in row i of pheno_tbl and in column i + 1 of expr_tbl
  split[["train"]][["expr"]] <- expr_tbl[, c(1, train_index + 1)]
  split[["train"]][["pheno"]] <- pheno_tbl[train_index, ]
  split[["test"]][["expr"]] <- expr_tbl[, test_cols]
  split[["test"]][["pheno"]] <- pheno_tbl[test_rows, ]
  if(!quiet){
    message("Splitting data into train and test data sets")
    message("Train data set has ", nrow(split[["train"]][["pheno"]]), " samples")
    message("Test data set has ", nrow(split[["test"]][["pheno"]]), " samples")
  }

  # Generate new DataSpec and save
  for(partition in c("train", "test")){
    ds_partition <- data_spec
    ds_partition$name <- stringr::str_c(data_spec$name, " ", partition)
    ds_partition$directory <- file.path(data_spec$directory, partition)
    split[[partition]][["data_spec"]] <- ds_partition
    # The directory must exist before writing, no matter whether quiet is set;
    # quiet only controls the message
    if(!dir.exists(ds_partition$directory)){
      if(!quiet){
        message("Creating directory ", ds_partition$directory)
      }
      dir.create(ds_partition$directory, recursive = TRUE)
    }
    if(!quiet){
      message("Writing ", partition, " data to ", ds_partition$directory)
      message("... as ", ds_partition$expr_fname)
    }
    readr::write_csv(
      split[[partition]][["expr"]],
      file.path(ds_partition$directory, ds_partition$expr_fname)
    )
    if(!quiet){
      message("... as ", ds_partition$pheno_fname)
    }
    readr::write_csv(
      split[[partition]][["pheno"]],
      file.path(ds_partition$directory, ds_partition$pheno_fname)
    )
    if(!quiet){
      message("... as data_spec.rds")
    }
    saveRDS(ds_partition, file.path(ds_partition$directory, "data_spec.rds"))
  }

  return(split)
}
50 changes: 50 additions & 0 deletions tests/testthat/test-split_dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Runs split_dataset() once with and once without the pfs_cut-based split and
# checks for both partitions that they pass qc_preprocess(), keep all rows of
# the expression table and keep the columns of the pheno table.
test_that("split_dataset works", {

  set.seed(234)

  # Temporary directory that is removed again when the test finishes
  tmp_dir <- withr::local_tempdir()
  mock <- generate_mock_data(
    n_samples = 100,
    n_genes = 1,
    n_na_in_pheno = 0,
    to_csv = tmp_dir
  )
  full_expr <- mock[["expr_tbl"]]
  full_pheno <- mock[["pheno_tbl"]]
  mock_spec <- DataSpec(
    name = "mock",
    directory = tmp_dir
  )

  for(use_pfs_cut in c(TRUE, FALSE)){

    partitions <- split_dataset(
      expr_tbl = full_expr,
      pheno_tbl = full_pheno,
      data_spec = mock_spec,
      train_prop = 0.5,
      pfs_cut = 2,
      based_on_pfs_cut = use_pfs_cut
    )

    for(part_name in c("train", "test")){
      part <- partitions[[part_name]]
      part_expr <- part[["expr"]]
      part_pheno <- part[["pheno"]]

      # The partition itself must be a valid data set
      expect_no_error(
        qc_preprocess(
          expr_tbl = part_expr,
          pheno_tbl = part_pheno,
          data_spec = part[["data_spec"]]
        )
      )
      # Splitting happens along samples, so no expression row may get lost
      expect_equal(
        nrow(part_expr),
        nrow(full_expr)
      )
      expect_equal(
        colnames(part_pheno),
        colnames(full_pheno)
      )
    }
  }
})

0 comments on commit 0fdafeb

Please sign in to comment.