diff --git a/.Rbuildignore b/.Rbuildignore index 5c38deb..c5136c8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -27,5 +27,6 @@ inst/rmd/umccr_portal/html inst/rmd/umccr_workflows/alignment_qc/nogit inst/rmd/umccr_workflows/bcl_convert/html inst/rmd/umccr_workflows/interop/html -inst/rmd/umccr_workflows/umccrise/html +inst/rmd/umccr_workflows/sash/nogit +inst/rmd/umccr_workflows/umccrise/nogit inst/sandbox diff --git a/NAMESPACE b/NAMESPACE index b10ca1f..9f9ec02 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,8 @@ export(TsoTmbFile) export(TsoTmbTraceTsvFile) export(VCMetricsFile) export(Wf) +export(Wf_sash) +export(Wf_sash_download_tidy_write) export(Wf_tso_ctdna_tumor_only) export(Wf_umccrise) export(Wf_umccrise_download_tidy_write) diff --git a/R/Wf.R b/R/Wf.R index aa5bfae..3e00027 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -165,18 +165,15 @@ Wf <- R6::R6Class( #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - #' @param list_filter_fun Function to filter relevant files. 
download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL, - list_filter_fun = NULL) { + max_files = 1000, dryrun = FALSE, recursive = NULL) { # TODO: add envvar checker regexes <- self$regexes - assertthat::assert_that(!is.null(regexes), !is.null(list_filter_fun)) + assertthat::assert_that(!is.null(regexes)) if (self$filesystem == "gds") { d <- dr_gds_download( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, - page_size = max_files, dryrun = dryrun, recursive = recursive, - list_filter_fun = list_filter_fun + page_size = max_files, dryrun = dryrun, recursive = recursive ) if (!dryrun) { self$filesystem <- "local" @@ -185,8 +182,7 @@ Wf <- R6::R6Class( } else if (self$filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, - max_objects = max_files, dryrun = dryrun, - list_filter_fun = list_filter_fun + max_objects = max_files, dryrun = dryrun ) if (!dryrun) { self$filesystem <- "local" diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 16df2c9..da50daf 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -139,7 +139,7 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ no_recurse = no_recurse, page_token = page_token, recursive = recursive ) |> dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> + dplyr::mutate(type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes))) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> dplyr::select(dplyr::any_of(cols_sel)) @@ -155,7 +155,6 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' @param outdir Local output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param list_filter_fun Function to filter relevant GDS files. 
#' @examples #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" @@ -171,11 +170,10 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' @export dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, page_size = 100, dryrun = FALSE, - regexes = DR_FILE_REGEX, recursive = NULL, - list_filter_fun = gds_list_files_filter_relevant) { + regexes = DR_FILE_REGEX, recursive = NULL) { e <- emojifont::emoji fs::dir_create(outdir) - d <- list_filter_fun( + d <- gds_list_files_filter_relevant( gdsdir = gdsdir, pattern = pattern, regexes = regexes, token = token, page_size = page_size, include_url = FALSE, no_recurse = FALSE, page_token = NULL, diff --git a/R/fs_local.R b/R/fs_local.R index 07397aa..f681bef 100644 --- a/R/fs_local.R +++ b/R/fs_local.R @@ -47,7 +47,7 @@ local_list_files_dir <- function(localdir, max_files = NULL) { local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX, max_files = NULL) { local_list_files_dir(localdir = localdir, max_files = max_files) |> dplyr::mutate( - type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) + type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes = regexes)) ) |> dplyr::filter(!is.na(.data$type)) |> dplyr::select("type", "bname", "size", "lastmodified", localpath = "path") diff --git a/R/fs_s3.R b/R/fs_s3.R index 34c437e..eb314ce 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -78,7 +78,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, d <- d_all |> dplyr::rowwise() |> dplyr::mutate( - type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes)) + type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes)) ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> @@ -109,7 +109,6 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' @param outdir Path to output 
directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param list_filter_fun Function to filter relevant S3 files. #' @examples #' \dontrun{ #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" @@ -125,12 +124,11 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' } #' @export dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, - regexes = DR_FILE_REGEX, dryrun = FALSE, - list_filter_fun = s3_list_files_filter_relevant) { + regexes = DR_FILE_REGEX, dryrun = FALSE) { s3 <- paws.storage::s3() e <- emojifont::emoji fs::dir_create(outdir) - d <- list_filter_fun( + d <- s3_list_files_filter_relevant( s3dir = s3dir, pattern = NULL, regexes = regexes, max_objects = max_objects, presign = FALSE ) diff --git a/R/sash.R b/R/sash.R new file mode 100644 index 0000000..345fc2b --- /dev/null +++ b/R/sash.R @@ -0,0 +1,261 @@ +#' Wf_sash R6 Class +#' +#' @description +#' Reads and writes tidy versions of files from the `sash` workflow +#' +#' @examples +#' \dontrun{ +#' +#' #---- Local ----# +#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p <- normalizePath(file.path(p1, p2)) +#' SubjectID <- "SBJ05571" +#' SampleID_tumor <- "MDX240307" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' s1$list_files(max_files = 20) +#' s1$list_files_filter_relevant(max_files = 300) +#' d <- s1$download_files(max_files = 1000, dryrun = F) +#' d_tidy <- s1$tidy_files(d) +#' d_write <- s1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = glue("{SubjectID}_{SampleID_tumor}"), +#' format = "tsv" +#' ) +#' +#' #---- S3 ----# +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p <- file.path(p1, p2) +#' SubjectID <- 
"SBJ05571" +#' SampleID_tumor <- "MDX240307" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' s1$list_files(max_files = 20) +#' s1$list_files_filter_relevant() +#' outdir <- sub("s3:/", "~/s3", p) +#' d <- s1$download_files(outdir = outdir, max_files = 1000, dryrun = F) +#' d_tidy <- s1$tidy_files(d) +#' d_write <- s1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) +#' } +#' +#' @export +Wf_sash <- R6::R6Class( + "Wf_sash", + inherit = Wf, + public = list( + #' @field SubjectID The SubjectID of the sample (needed for path lookup). + #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + SubjectID = NULL, + SampleID_tumor = NULL, + #' @description Create a new Wf_sash object. + #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' local filesystem). + #' @param SubjectID The SubjectID of the sample (needed for path lookup). + #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). 
+ initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { + wname <- "sash" + pref <- glue("{SubjectID}_{SampleID_tumor}") + crep <- "cancer_report/cancer_report_tables" + regexes <- tibble::tribble( + ~regex, ~fun, + glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", + glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", + glue("{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrd_dragen", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", + glue("{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgr_json" + ) |> + dplyr::mutate(fun = paste0("read_", .data$fun)) + + super$initialize(path = path, wname = wname, regexes = regexes) + self$SubjectID <- SubjectID + self$SampleID_tumor <- SampleID_tumor + }, + #' @description Print details about the Workflow. + #' @param ... (ignored). + print = function(...) { + res <- tibble::tribble( + ~var, ~value, + "path", self$path, + "wname", self$wname, + "filesystem", self$filesystem, + "SubjectID", self$SubjectID, + "SampleID_tumor", self$SampleID_tumor + ) + print(res) + invisible(self) + }, + #' @description Read `pcgr.json.gz` file. + #' @param x Path to file. 
+ read_pcgr_json = function(x) { + j <- read_jsongz_jsonlite(x) + tmb <- + j[["content"]][["tmb"]][["variant_statistic"]] %||% + j[["content"]][["tmb"]][["v_stat"]] %||% + list(tmb_estimate = NA, n_tmb = NA) + tmb <- purrr::flatten(tmb) |> + tibble::as_tibble_row() |> + dplyr::select("tmb_estimate", "n_tmb") + msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] + # handle nulls + msi <- msi %||% list(fracIndels = NA, predicted_class = NA) + msi <- purrr::flatten(msi) |> + tibble::as_tibble_row() |> + dplyr::select("fracIndels", "predicted_class") + metrics <- dplyr::bind_cols(msi, tmb) + return(metrics) + }, + #' @description Read `dragen.tsv.gz` cancer report hrd file. + #' @param x Path to file. + read_hrd_dragen = function(x) { + ct <- readr::cols(.default = "d", Sample = "c") + read_tsvgz(x, col_types = ct) + }, + #' @description Read `chord.tsv.gz` cancer report hrd file. + #' @param x Path to file. + read_hrd_chord = function(x) { + ct <- readr::cols_only( + p_hrd = "d", + hr_status = "c", + hrd_type = "c", + p_BRCA1 = "d", + p_BRCA2 = "d" + ) + read_tsvgz(x, col_types = ct) + }, + #' @description Read `hrdetect.tsv.gz` cancer report hrd file. + #' @param x Path to file. + read_hrd_hrdetect = function(x) { + ct <- readr::cols( + .default = "d", + sample = "c" + ) + read_tsvgz(x, col_types = ct) |> + dplyr::select(-c("sample")) + }, + #' @description Read signature cancer report file. + #' @param x Path to file. + read_sigstsv = function(x) { + ct <- readr::cols( + .default = "d", + Signature = "c" + ) + read_tsvgz(x, col_types = ct) + }, + #' @description Read `snv_2015.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigs_snv2015 = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `snv_2020.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigs_snv2020 = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `dbs.tsv.gz` sigs cancer report file. + #' @param x Path to file. 
+ read_sigs_dbs = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `indel.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigs_indel = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `qc_summary.tsv.gz` cancer report file. + #' @param x Path to file. + read_qcsum = function(x) { + d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) + d |> + dplyr::select("variable", "value") |> + tidyr::pivot_wider(names_from = "variable", values_from = "value") |> + dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |> + dplyr::mutate( + purity_hmf = sub("(.*) \\(.*\\)", "\\1", .data$Purity) |> as.numeric(), + ploidy_hmf = sub("(.*) \\(.*\\)", "\\1", .data$Ploidy) |> as.numeric(), + msi_mb_hmf = sub(".* \\((.*)\\)", "\\1", .data$MSI_mb_tmp) |> as.numeric(), + contamination_hmf = as.numeric(.data$Contamination), + deleted_genes_hmf = as.numeric(.data$DeletedGenes), + msi_hmf = sub("(.*) \\(.*\\)", "\\1", .data$MSI_mb_tmp), + tmb_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TMB) |> as.numeric(), + tml_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TML) |> as.numeric(), + hypermutated = ifelse("Hypermutated" %in% d$variable, .data[["Hypermutated"]], NA) |> as.character() + ) |> + dplyr::select( + qc_status_hmf = "QC_Status", + sex_hmf = "Gender", + "purity_hmf", "ploidy_hmf", "msi_hmf", "msi_mb_hmf", + "contamination_hmf", + "deleted_genes_hmf", "tmb_hmf", "tml_hmf", + wgd_hmf = "WGD", + "hypermutated" + ) + } + ) # end public +) + +#' sash Download Tidy and Write +#' +#' Downloads files from the `sash` workflow and writes them in a tidy format. +#' +#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' local filesystem). +#' @param SubjectID The SubjectID of the sample (needed for path lookup). +#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param outdir Path to output directory. +#' @param format Format of output files. +#' @param max_files Max number of files to list. 
+#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). +#' @return List where each element is a tidy tibble of a sash file. +#' +#' @examples +#' \dontrun{ +#' SubjectID <- "SBJ05571" +#' SampleID_tumor <- "MDX240307" +#' p1_s3 <- glue("s3://org.umccr.data.oncoanalyser/analysis_data/{SubjectID}/sash") +#' p <- file.path(p1_s3, "202408270b93455e/L2401308_L2401307") +#' outdir <- file.path(sub("s3:/", "~/s3", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' d <- Wf_sash_download_tidy_write( +#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, +#' outdir = outdir, +#' dryrun = FALSE +#' ) +#' } +#' @export +Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, + outdir, format = "rds", max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE) { + s <- Wf_sash$new( + path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + ) + d_dl <- s$download_files( + outdir = outdir, ica_token = ica_token, + max_files = max_files, dryrun = dryrun + ) + if (!dryrun) { + d_tidy <- s$tidy_files(d_dl) + d_write <- s$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = format + ) + return(d_write) + } + return(d_dl) +} diff --git a/R/umccrise.R b/R/umccrise.R index ffbe5c5..efcd5fe 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -64,17 +64,19 @@ Wf_umccrise <- R6::R6Class( #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). 
initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { wname <- "umccrise" + pref <- glue("{SubjectID}__{SampleID_tumor}") + crep <- "cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - "-chord\\.tsv\\.gz$", "chordtsv", - "-hrdetect\\.tsv\\.gz$", "hrdetecttsv", - "-snv_2015\\.tsv\\.gz$", "sigssnv2015tsv", - "-snv_2020\\.tsv\\.gz$", "sigssnv2020tsv", - "-dbs\\.tsv\\.gz$", "sigsdbstsv", - "-indel\\.tsv\\.gz$", "sigsindeltsv", - "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", - "multiqc_conpair\\.txt$", "conpairmultiqc", - "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" + glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", + glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", + glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpairmultiqc", + glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgr_json" ) |> dplyr::mutate(fun = paste0("read_", .data$fun)) @@ -96,42 +98,9 @@ Wf_umccrise <- R6::R6Class( print(res) invisible(self) }, - #' @description List dracarys files under given path - #' @param max_files Max number of files to list (for gds/s3 only). - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param ... Passed on to the `gds_list_files_filter_relevant` or - #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) 
{ - path <- self$path - dir_final <- file.path(path, glue("{self$SubjectID}__{self$SampleID_tumor}")) - dir_work <- file.path(path, "work", glue("{self$SubjectID}__{self$SampleID_tumor}")) - dir_work_pcgr <- file.path(dir_work, "pcgr") # for pcgr json - f1 <- super$list_files_filter_relevant(path = dir_final, max_files = 300, ica_token = ica_token) - f2 <- super$list_files_filter_relevant(path = dir_work_pcgr, max_files = 50, ica_token = ica_token) - f_all <- dplyr::bind_rows(f1, f2) - return(f_all) - }, - #' @description Download files from GDS/S3 to local filesystem. - #' @param outdir Path to output directory. - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param max_files Max number of files to list. - #' @param dryrun If TRUE, just list the files that will be downloaded (don't - #' download them). - #' @param recursive Should files be returned recursively _in and under_ the specified - #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - #' @param list_filter_fun Function to filter relevant files. - download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL) { - super$download_files( - outdir = outdir, ica_token = ica_token, max_files = max_files, - dryrun = dryrun, recursive = recursive, - list_filter_fun = self$list_files_filter_relevant - ) - }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. - read_pcgrjson = function(x) { + read_pcgr_json = function(x) { j <- read_jsongz_jsonlite(x) tmb <- j[["content"]][["tmb"]][["variant_statistic"]] %||% @@ -151,7 +120,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `chord.tsv.gz` cancer report file. #' @param x Path to file. 
- read_chordtsv = function(x) { + read_hrd_chord = function(x) { ct <- readr::cols_only( p_hrd = "d", hr_status = "c", @@ -163,7 +132,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `hrdetect.tsv.gz` cancer report file. #' @param x Path to file. - read_hrdetecttsv = function(x) { + read_hrd_hrdetect = function(x) { ct <- readr::cols( .default = "d", sample = "c" @@ -182,27 +151,27 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `snv_2015.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigssnv2015tsv = function(x) { + read_sigs_snv2015 = function(x) { self$read_sigstsv(x) }, #' @description Read `snv_2020.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigssnv2020tsv = function(x) { + read_sigs_snv2020 = function(x) { self$read_sigstsv(x) }, #' @description Read `dbs.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigsdbstsv = function(x) { + read_sigs_dbs = function(x) { self$read_sigstsv(x) }, #' @description Read `indel.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigsindeltsv = function(x) { + read_sigs_indel = function(x) { self$read_sigstsv(x) }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. - read_qcsummarytsv = function(x) { + read_qcsum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) d |> dplyr::select("variable", "value") |> @@ -278,11 +247,11 @@ Wf_umccrise <- R6::R6Class( #' @param SubjectID The SubjectID of the sample (needed for path lookup). #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). #' @param outdir Path to output directory. +#' @param format Format of output files. #' @param max_files Max number of files to list. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param format Format of output files. 
#' @return List where each element is a tidy tibble of a umccrise file. #' #' @examples diff --git a/inst/rmd/umccr_workflows/sash/.gitignore b/inst/rmd/umccr_workflows/sash/.gitignore new file mode 100644 index 0000000..0182e8f --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/.gitignore @@ -0,0 +1,3 @@ +nogit + +/.quarto/ diff --git a/inst/rmd/umccr_workflows/sash/dl_and_tidy.R b/inst/rmd/umccr_workflows/sash/dl_and_tidy.R new file mode 100755 index 0000000..9fcfcb1 --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/dl_and_tidy.R @@ -0,0 +1,80 @@ +#!/usr/bin/env Rscript + +{ + require(dplyr) + require(assertthat, include.only = "assert_that") + require(dracarys, include.only = "Wf_sash_download_tidy_write") + require(glue, include.only = "glue") + require(here, include.only = "here") + require(rportal, include.only = c("portaldb_query_workflow")) + require(tidyr, include.only = "separate_wider_delim") +} + +query_workflow_sash <- function(start_date, end_date) { + q1 <- glue( + "WHERE \"type_name\" = 'sash' ", + "AND \"start\" >= date(\'{start_date}\') ", + "AND \"end\" <= date(\'{end_date}\') ", + "ORDER BY \"start\" DESC;" + ) + rportal::portaldb_query_workflow(q1) +} + +query_limsrow_libids <- function(libids) { + assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) + libids <- unique(libids) |> + paste(collapse = "|") + q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');") + rportal::portaldb_query_limsrow(q1) +} + +# first read in the workflows table, extract metadata, then join with lims +start_date <- "2024-08-29" +end_date <- "2024-09-07" +meta_raw <- query_workflow_sash(start_date, end_date) +meta <- meta_raw |> + rportal::meta_sash() +lims_raw <- query_limsrow_libids(meta$LibraryID_tumor) +lims <- lims_raw |> + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" + ) |> + select( + subject_id, library_id, sample_id, sample_name, + external_subject_id, 
external_sample_id, + project_name, project_owner, + source, quality, workflow + ) |> + distinct() +table(lims$library_id %in% meta$LibraryID_tumor) # double-check + +meta_lims <- meta |> + left_join(lims, by = c("LibraryID_tumor" = "library_id")) |> + mutate(rownum = row_number()) |> + select( + rownum, wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal, + SampleID_tumor, SampleID_normal, s3_outdir_sash, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +meta_lims |> + saveRDS(here(glue("inst/rmd/umccr_workflows/sash/nogit/meta/{start_date}_{end_date}.rds"))) + +d <- meta_lims |> + rowwise() |> + mutate( + indir = .data$s3_outdir_sash, + outdir = file.path(sub("s3://", "", .data$indir)), + outdir = file.path(normalizePath("~/s3"), .data$outdir), + indir = outdir, # debug override: read the local mirror (comment out to pull from S3) + res = list( + dracarys::Wf_sash_download_tidy_write( + path = .data$indir, SubjectID = .data$SubjectID, SampleID_tumor = .data$SampleID_tumor, + outdir = .data$outdir, max_files = 1000, dryrun = FALSE + ) + ) + ) |> + ungroup() + +d |> + saveRDS(here(glue("inst/rmd/umccr_workflows/sash/nogit/results_{start_date}_{end_date}.rds"))) diff --git a/inst/rmd/umccr_workflows/sash/render.sh b/inst/rmd/umccr_workflows/sash/render.sh new file mode 100644 index 0000000..e1c6efe --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/render.sh @@ -0,0 +1,9 @@ +date_start="2024-08-29" +date_end="2024-09-07" +out="sash_${date_start}_${date_end}.html" + +quarto render summary_sash.qmd \ + -P date_start:${date_start} \ + -P date_end:${date_end} \ + -o ${out} \ + --output-dir nogit/html diff --git a/inst/rmd/umccr_workflows/sash/summary_sash.qmd b/inst/rmd/umccr_workflows/sash/summary_sash.qmd new file mode 100644 index 0000000..ec231d1 --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/summary_sash.qmd @@ -0,0 +1,392 @@ +--- +title: "{{< meta params.title >}}" +subtitle: "Period: `r 
paste(params$date_start, ' to ', params$date_end)`" +author: "UMCCR - Genomics Platform Group" +date: now +date-format: "YYYY-MM-DD HH:mm Z" +execute: + echo: false +format: + html: + toc: true + toc-expand: 1 + toc-title: Contents + toc-location: body + highlight-style: github + number-sections: false + link-external-icon: true + link-external-newwindow: true + embed-resources: true + code-copy: true + code-link: true + code-fold: true + code-block-border-left: true + smooth-scroll: true + grid: + body-width: 1300px +params: + title: "UMCCR sash Workflow Summary" + date_start: "2024-08-29" + date_end: "2024-09-07" +--- + +```{r} +#| label: pkg_load +#| message: false +{ + require(dplyr) # import all dplyr funcs + require(readr, include.only = c("read_rds")) + require(purrr, include.only = c("map")) + require(tidyr, include.only = c("unnest_wider")) + require(dracarys, include.only = c("session_info_kable")) + require(glue, include.only = "glue") + require(here, include.only = "here") + require(knitr, include.only = "kable") + require(reactable, include.only = "reactable") + require(ggplot2, include.only = c("ggplot", "aes")) + require(lubridate, include.only = c("as_datetime")) + require(plotly, include.only = c("ggplotly")) + require(patchwork, include.only = c("plot_layout")) +} +set.seed(42) +``` + +```{r} +#| label: load_data +date_start <- params$date_start +date_end <- params$date_end +nogit <- "inst/rmd/umccr_workflows/sash/nogit" +myriad <- paste0("SBJ0", c("0695", "0847", "0920", "2397", "2456", "2743", "3186", "3242", "4187", "4221")) +d_raw <- here(glue("{nogit}/results_{date_start}_{date_end}.rds")) |> + readr::read_rds() |> + arrange(desc(SubjectID), desc(LibraryID_tumor)) |> + mutate( + rownum = row_number(), + is_myriad = SubjectID %in% myriad + ) |> + relocate(rownum) |> + relocate(is_myriad, .after = rownum) +``` + +```{r} +#| label: funcs + +tab_view <- function(x, id, ...) 
{ + htmltools::browsable( + htmltools::tagList( + htmltools::tags$button( + htmltools::tagList(fontawesome::fa("download"), "CSV"), + onclick = glue("Reactable.downloadDataCSV('{id}', '{id}.csv')") + ), + x |> + reactable::reactable( + bordered = TRUE, + compact = TRUE, + filterable = TRUE, + # fullWidth = TRUE, + height = 800, + highlight = TRUE, + pagination = TRUE, + showPagination = TRUE, + defaultPageSize = nrow(x), + showPageSizeOptions = TRUE, + pageSizeOptions = c(20, 50, nrow(x)), + resizable = TRUE, + searchable = TRUE, + sortable = TRUE, + striped = TRUE, + wrap = FALSE, + elementId = id, + columns = list( + SubjectID = reactable::colDef( + sticky = "left", + # Add a right border style to visually distinguish the sticky column + style = list(borderRight = "1px solid #eee"), + headerStyle = list(borderRight = "1px solid #eee") + ) + ), + defaultColDef = reactable::colDef( + minWidth = 170, + headerStyle = list(background = "#f7f7f8") + ), + theme = reactable::reactableTheme( + borderColor = "#dfe2e5", + stripedColor = "#f6f8fa", + highlightColor = "#f0f5f9", + style = list( + fontFamily = "Monaco" + ) + ), + ... 
+ ) + ) + ) +} +``` + +## Metadata + +```{r} +#| label: metadata +meta <- d_raw |> + select( + rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +tab_view(meta, id = "metadata") +``` + +## Results + +```{r} +#| label: process +# one row per file type - not all samples have sigsdbstsv +d <- d_raw |> + tidyr::unnest_longer(res, indices_to = "filetype") +# main_cols <- c("rownum", "portal_run_id", "SubjectID", "LibraryID_tumor") +main_cols <- c("portal_run_id") +``` + +```{r} +#| label: qcsum +qcsum <- d |> + filter(filetype == "qcsum") |> + select(all_of(main_cols), res) |> + unnest_wider(res) +``` + +```{r} +#| label: pcgr +pcgr <- d |> + filter(filetype == "pcgr_json") |> + select(all_of(main_cols), res) |> + unnest_wider(res) |> + rename( + msi_fraction_indels_pcgr = "fracIndels", + msi_pcgr = "predicted_class", + tmb_pcgr = "tmb_estimate", + n_tmb_pcgr = "n_tmb" + ) |> + mutate(msi_pcgr = sub(" \\(.*\\)", "", msi_pcgr)) +``` + +```{r} +#| label: hrd +hrd_chord <- d |> + filter(filetype == "hrd_chord") |> + unnest_wider(res) |> + select(all_of(main_cols), hrd_chord = "p_hrd") +hrd_hrdetect <- d |> + filter(filetype == "hrd_hrdetect") |> + unnest_wider(res) |> + select(all_of(main_cols), hrd_hrdetect = "Probability") +hrd_dragen <- d |> + filter(filetype == "hrd_dragen") |> + select(all_of(main_cols), res) |> + unnest_wider(res) |> + select(all_of(main_cols), hrd_dragen = "HRD") +hrd_all <- hrd_dragen |> + left_join(hrd_chord, by = "portal_run_id") |> + left_join(hrd_hrdetect, by = "portal_run_id") |> + select(portal_run_id, hrd_dragen, hrd_chord, hrd_hrdetect) +``` + +```{r} +#| label: sigs +sigs_snv2015 <- d |> + filter(filetype == "sigs_snv2015") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_snv2020 <- d |> + filter(filetype == 
"sigs_snv2020") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_dbs <- d |> + filter(filetype == "sigs_dbs") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_indel <- d |> + filter(filetype == "sigs_indel") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +dsig <- bind_rows( + list( + snv2015 = sigs_snv2015, snv2020 = sigs_snv2020, dbs = sigs_dbs, indel = sigs_indel + ), + .id = "Sig_group" +) + +# keep top two ranked sigs +dsig_filt <- dsig |> + group_by(Sig_group, portal_run_id) |> + mutate(tot_sig_vars = sum(Contribution)) |> + arrange(Rank) |> + slice_head(n = 2) |> + # some sigs have same Rank so use explicit sig_rank + mutate(sig_rank = row_number()) |> + ungroup() |> + mutate( + sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})") + ) |> + select(Sig_group, portal_run_id, sig_rank, sig_summary) |> + tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |> + mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |> + select(Sig_group, portal_run_id, sig_top2) |> + tidyr::pivot_wider(names_from = Sig_group, values_from = sig_top2) |> + select(portal_run_id, snv2015, snv2020, dbs, indel) +``` + +```{r} +#| label: qc_all +dall <- d_raw |> + select( + rownum, + is_myriad, + date_analysed = "start", portal_run_id, + SubjectID, LibraryID_tumor, SampleID_tumor, + external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) |> + left_join(hrd_all, by = "portal_run_id") |> + left_join(qcsum, by = "portal_run_id") |> + left_join(pcgr, by = "portal_run_id") +``` + +### Summary Metrics + +```{r} +#| label: summary_metrics +tab_view(dall, "summary_metrics") +``` + +### HRD Plot + +Showing 
2-way relationships between DRAGEN, CHORD and HRDetect. + +```{r} +#| label: hrd_plot +#| fig-width: 15 +#| fig-height: 22 + +pdat <- dall |> + mutate(sbj = glue("{SubjectID}_{LibraryID_tumor}")) |> + select(sbj, dragen = hrd_dragen, chord = hrd_chord, hrdetect = hrd_hrdetect) +p1 <- pdat |> + ggplot2::ggplot(aes(x = chord, y = hrdetect, label = sbj)) + + ggplot2::geom_point(colour = "#00bfc4") + + ggplot2::theme_bw() +psub1 <- plotly::ggplotly(p1) +p2 <- pdat |> + ggplot2::ggplot(aes(x = chord, y = dragen, label = sbj)) + + ggplot2::geom_point(colour = "#f8766d") + + ggplot2::theme_bw() +psub2 <- plotly::ggplotly(p2) +p3 <- pdat |> + ggplot2::ggplot(aes(x = hrdetect, y = dragen, label = sbj)) + + ggplot2::geom_point(colour = "#7cae00") + + ggplot2::theme_bw() +psub3 <- plotly::ggplotly(p3) +# p_all <- p1 + p2 + p3 + plot_layout(ncol = 1) +plotly::subplot(psub1, psub2, psub3, nrows = 3, titleX = TRUE, titleY = TRUE, margin = c(0.02, 0.02, 0.04, 0.04)) +``` + +### Signatures + +#### All (SNV, Indel, DBS) + +```{r} +#| label: sig_results_all +dsig |> + left_join(meta |> select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + tab_view("sig_results_all") +``` + +#### Top 2 + +```{r} +#| label: sig_results_top2 +dsig_filt |> + left_join(meta |> select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + arrange(rownum) |> + tab_view("sig_results_top2") +``` + +#### Top 3 SNV2015 + +```{r} +#| label: sig_results +#| fig-width: 15 +#| fig-height: 65 + +sig_order2015 <- paste0("Sig", 1:30) +# sig_order2020 <- paste0( +# "SBS", +# c( +# 1:6, +# paste0(7, c("a", "b", "c", "d")), +# 8:9, +# paste0(10, c("a", "b", "c", "d")), +# 11:16, +# paste0(17, c("a", "b")), +# 18:60, +# 84:94 +# ) +# ) + +p2_prep <- dsig |> + 
filter( + Sig_group == "snv2015", + Rank %in% c(1:3) + ) |> + left_join(dall |> select(portal_run_id, date_analysed, SubjectID, LibraryID_tumor), by = "portal_run_id") |> + mutate(sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}"))) |> + select(date_analysed, sbj, Sig_group, Rank, Signature, Contribution, RelFreq) |> + mutate(Signature = factor(Signature, levels = sig_order2015)) +p2 <- p2_prep |> + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + + ggplot2::geom_bar(position = "fill", stat = "identity") + + ggplot2::theme_bw(base_size = 7) + +plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) +``` + +## Metadata Summary + +::: {.panel-tabset .nav-pills} + +### Project Name/Owner + +```{r} +#| label: project_owner_name +dall |> + count(project_name, project_owner) |> + knitr::kable() +``` + +### Source / Quality + +```{r} +#| label: source_quality +count(dall, source, quality) |> knitr::kable() +``` + +### Workflow + +```{r} +#| label: workflow_summary +count(dall, workflow) |> knitr::kable() +``` + +::: + + diff --git a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R index 3b5ede3..fb17721 100755 --- a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R @@ -12,9 +12,9 @@ query_workflow_umccrise <- function(start_date, end_date) { q1 <- glue( - "WHERE \"type_name\" = 'umccrise'", - "AND \"start\" >= date(\'{start_date}\')", - "AND \"end\" <= date(\'{end_date}\')", + "WHERE \"type_name\" = 'umccrise' ", + "AND \"start\" >= date(\'{start_date}\') ", + "AND \"end\" <= date(\'{end_date}\') ", "ORDER BY \"start\" DESC;" ) rportal::portaldb_query_workflow(q1) diff --git a/man/Wf.Rd b/man/Wf.Rd index f785399..3005725 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -202,8 +202,7 @@ Download files from GDS/S3 to local filesystem. 
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, - recursive = NULL, - list_filter_fun = NULL + recursive = NULL )}\if{html}{\out{}} } @@ -223,8 +222,6 @@ download them).} \item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} - -\item{\code{list_filter_fun}}{Function to filter relevant files.} } \if{html}{\out{}} } diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd new file mode 100644 index 0000000..1caa51a --- /dev/null +++ b/man/Wf_sash.Rd @@ -0,0 +1,320 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sash.R +\name{Wf_sash} +\alias{Wf_sash} +\title{Wf_sash R6 Class} +\description{ +Reads and writes tidy versions of files from the \code{sash} workflow +} +\examples{ +\dontrun{ + +#---- Local ----# +p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +p2 <- "202408270b93455e/L2401308_L2401307" +p <- normalizePath(file.path(p1, p2)) +SubjectID <- "SBJ05571" +SampleID_tumor <- "MDX240307" +prefix <- glue("{SubjectID}__{SampleID_tumor}") +s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +s1$list_files(max_files = 20) +s1$list_files_filter_relevant(max_files = 300) +d <- s1$download_files(max_files = 1000, dryrun = F) +d_tidy <- s1$tidy_files(d) +d_write <- s1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = glue("{SubjectID}_{SampleID_tumor}"), + format = "tsv" +) + +#---- S3 ----# +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +p2 <- "202408270b93455e/L2401308_L2401307" +p <- file.path(p1, p2) +SubjectID <- "SBJ05571" +SampleID_tumor <- "MDX240307" +prefix <- glue("{SubjectID}__{SampleID_tumor}") +s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +s1$list_files(max_files = 20) +s1$list_files_filter_relevant() +outdir <- sub("s3:/", "~/s3", p) +d 
<- s1$download_files(outdir = outdir, max_files = 1000, dryrun = F) +d_tidy <- s1$tidy_files(d) +d_write <- s1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) +} + +} +\section{Super class}{ +\code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_sash} +} +\section{Public fields}{ +\if{html}{\out{
dracarys::Wf$download_files()
dracarys::Wf$list_files()
dracarys::Wf$list_files_filter_relevant()
dracarys::Wf$tidy_files()
dracarys::Wf$write()