From 8364918628e2c2b87a22d5f9ac566d114ef5e5b6 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 14 Sep 2024 00:49:12 +1000 Subject: [PATCH 1/8] use path instead of bname for regex matching --- R/fs_icav1.R | 2 +- R/fs_s3.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 16df2c9..9ab22f8 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -139,7 +139,7 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ no_recurse = no_recurse, page_token = page_token, recursive = recursive ) |> dplyr::rowwise() |> - dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |> + dplyr::mutate(type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes))) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> dplyr::select(dplyr::any_of(cols_sel)) diff --git a/R/fs_s3.R b/R/fs_s3.R index 34c437e..d1a57c0 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -78,7 +78,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, d <- d_all |> dplyr::rowwise() |> dplyr::mutate( - type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes)) + type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes)) ) |> dplyr::ungroup() |> dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |> From 27de1ca0432b8abc16ccbb1b1c41250edb7a7859 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 14 Sep 2024 01:34:13 +1000 Subject: [PATCH 2/8] add sash support --- NAMESPACE | 2 + R/fs_local.R | 2 +- R/sash.R | 287 ++++++++++++++++++++++ R/umccrise.R | 3 +- man/Wf_sash.Rd | 379 +++++++++++++++++++++++++++++ man/Wf_sash_download_tidy_write.Rd | 57 +++++ man/Wf_umccrise.Rd | 2 - 7 files changed, 727 insertions(+), 5 deletions(-) create mode 100644 R/sash.R create mode 100644 man/Wf_sash.Rd create mode 100644 man/Wf_sash_download_tidy_write.Rd diff --git a/NAMESPACE b/NAMESPACE index b10ca1f..9f9ec02 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,6 +30,8 @@ export(TsoTmbFile) export(TsoTmbTraceTsvFile) export(VCMetricsFile) export(Wf) +export(Wf_sash) +export(Wf_sash_download_tidy_write) export(Wf_tso_ctdna_tumor_only) export(Wf_umccrise) export(Wf_umccrise_download_tidy_write) diff --git a/R/fs_local.R b/R/fs_local.R index 07397aa..f681bef 100644 --- a/R/fs_local.R +++ b/R/fs_local.R @@ -47,7 +47,7 @@ local_list_files_dir <- function(localdir, max_files = NULL) { local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX, max_files = NULL) { local_list_files_dir(localdir = localdir, max_files = max_files) |> dplyr::mutate( - type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes)) + type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes = regexes)) ) |> dplyr::filter(!is.na(.data$type)) |> dplyr::select("type", "bname", "size", "lastmodified", localpath = "path") diff --git a/R/sash.R b/R/sash.R new file mode 100644 index 0000000..ad42b0e --- /dev/null +++ b/R/sash.R @@ -0,0 +1,287 @@ +#' Wf_sash R6 Class +#' +#' @description +#' Reads and writes tidy versions of files from the `sash` workflow +#' +#' @examples +#' \dontrun{ +#' +#' #---- Local ----# +#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p <- normalizePath(file.path(p1, p2)) +#' SubjectID <- "SBJ05571" +#' SampleID_tumor <- "MDX240307" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' s1$list_files(max_files = 20) +#' s1$list_files_filter_relevant(max_files = 300) +#' d <- s1$download_files(max_files = 1000, dryrun = F) +#' d_tidy <- s1$tidy_files(d) +#' d_write <- s1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = glue("{SubjectID}_{SampleID_tumor}"), +#' format = "tsv" +#' ) +#' +#' #---- S3 ----# +#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +#' p2 <- "202408270b93455e/L2401308_L2401307" +#' p <- file.path(p1, p2) +#' SubjectID <- "SBJ05571" +#' SampleID_tumor <- "MDX240307" +#' prefix <- glue("{SubjectID}__{SampleID_tumor}") +#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +#' s1$list_files(max_files = 20) +#' s1$list_files_filter_relevant() +#' outdir <- sub("s3:/", "~/s3", p) +#' d <- s1$download_files(outdir = outdir, max_files = 1000, dryrun = F) +#' d_tidy <- s1$tidy_files(d) +#' d_write <- s1$write( +#' d_tidy, +#' outdir = file.path(p, "dracarys_tidy"), +#' prefix = glue("{SubjectID}__{SampleID_tumor}"), +#' format = "tsv" +#' ) +#' } +#' +#' @export +Wf_sash <- R6::R6Class( + "Wf_sash", + inherit = Wf, + public = list( + #' @field SubjectID The SubjectID of the sample (needed for path lookup). + #' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + SubjectID = NULL, + SampleID_tumor = NULL, + #' @description Create a new Wf_sash object. + #' @param path Path to directory with raw workflow results (from GDS, S3, or + #' local filesystem). + #' @param SubjectID The SubjectID of the sample (needed for path lookup). + #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). + initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { + wname <- "sash" + regexes <- tibble::tribble( + ~regex, ~fun, + "cancer_report/cancer_report_tables/hrd/.*-chord\\.tsv\\.gz$", "hrdchordtsv", + "cancer_report/cancer_report_tables/hrd/.*-hrdetect\\.tsv\\.gz$", "hrdetecttsv", + "cancer_report/cancer_report_tables/hrd/.*-dragen\\.tsv\\.gz$", "hrddragentsv", + "cancer_report/cancer_report_tables/sigs/.*-snv_2015\\.tsv\\.gz$", "sigssnv2015tsv", + "cancer_report/cancer_report_tables/sigs/.*-snv_2020\\.tsv\\.gz$", "sigssnv2020tsv", + "cancer_report/cancer_report_tables/sigs/.*-dbs\\.tsv\\.gz$", "sigsdbstsv", + "cancer_report/cancer_report_tables/sigs/.*-indel\\.tsv\\.gz$", "sigsindeltsv", + "cancer_report/cancer_report_tables/.*-qc_summary\\.tsv\\.gz$", "qcsummarytsv", + "smlv_somatic/report/pcgr/.*\\.pcgr_acmg\\.grch38\\.json\\.gz$", "pcgrjson" + ) |> + dplyr::mutate(fun = paste0("read_", .data$fun)) + + super$initialize(path = path, wname = wname, regexes = regexes) + self$SubjectID <- SubjectID + self$SampleID_tumor <- SampleID_tumor + }, + #' @description Print details about the Workflow. + #' @param ... (ignored). + print = function(...) { + res <- tibble::tribble( + ~var, ~value, + "path", self$path, + "wname", self$wname, + "filesystem", self$filesystem, + "SubjectID", self$SubjectID, + "SampleID_tumor", self$SampleID_tumor + ) + print(res) + invisible(self) + }, + #' @description List dracarys files under given path + #' @param max_files Max number of files to list (for gds/s3 only). + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param ... Passed on to the `gds_list_files_filter_relevant` or + #' the `s3_list_files_filter_relevant` function. + list_files_filter_relevant = function(max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { + path <- self$path + dir1 <- file.path(path, glue("{self$SubjectID}_{self$SampleID_tumor}")) + f1 <- super$list_files_filter_relevant(path = dir1, max_files = 500) + return(f1) + }, + #' @description Download files from GDS/S3 to local filesystem. + #' @param outdir Path to output directory. + #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). + #' @param max_files Max number of files to list. + #' @param dryrun If TRUE, just list the files that will be downloaded (don't + #' download them). + #' @param recursive Should files be returned recursively _in and under_ the specified + #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). + download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, dryrun = FALSE, recursive = NULL) { + super$download_files( + outdir = outdir, ica_token = ica_token, max_files = max_files, + dryrun = dryrun, recursive = recursive, + list_filter_fun = self$list_files_filter_relevant + ) + }, + #' @description Read `pcgr.json.gz` file. + #' @param x Path to file. + read_pcgrjson = function(x) { + j <- read_jsongz_jsonlite(x) + tmb <- + j[["content"]][["tmb"]][["variant_statistic"]] %||% + j[["content"]][["tmb"]][["v_stat"]] %||% + list(tmb_estimate = NA, n_tmb = NA) + tmb <- purrr::flatten(tmb) |> + tibble::as_tibble_row() |> + dplyr::select("tmb_estimate", "n_tmb") + msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]] + # handle nulls + msi <- msi %||% list(fracIndels = NA, predicted_class = NA) + msi <- purrr::flatten(msi) |> + tibble::as_tibble_row() |> + dplyr::select("fracIndels", "predicted_class") + metrics <- dplyr::bind_cols(msi, tmb) + return(metrics) + }, + #' @description Read `dragen.tsv.gz` cancer report hrd file. + #' @param x Path to file. + read_hrddragentsv = function(x) { + ct <- readr::cols(.default = "d", Sample = "c") + read_tsvgz(x, col_types = ct) + }, + #' @description Read `chord.tsv.gz` cancer report hrd file. + #' @param x Path to file. + read_hrdchordtsv = function(x) { + ct <- readr::cols_only( + p_hrd = "d", + hr_status = "c", + hrd_type = "c", + p_BRCA1 = "d", + p_BRCA2 = "d" + ) + read_tsvgz(x, col_types = ct) + }, + #' @description Read `hrdetect.tsv.gz` cancer report hrd file. + #' @param x Path to file. + read_hrdetecttsv = function(x) { + ct <- readr::cols( + .default = "d", + sample = "c" + ) + read_tsvgz(x, col_types = ct) |> + dplyr::select(-c("sample")) + }, + #' @description Read signature cancer report file. + #' @param x Path to file. + read_sigstsv = function(x) { + ct <- readr::cols( + .default = "d", + Signature = "c" + ) + read_tsvgz(x, col_types = ct) + }, + #' @description Read `snv_2015.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2015tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `snv_2020.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigssnv2020tsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `dbs.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsdbstsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `indel.tsv.gz` sigs cancer report file. + #' @param x Path to file. + read_sigsindeltsv = function(x) { + self$read_sigstsv(x) + }, + #' @description Read `qc_summary.tsv.gz` cancer report file. + #' @param x Path to file. + read_qcsummarytsv = function(x) { + d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) + d |> + dplyr::select("variable", "value") |> + tidyr::pivot_wider(names_from = "variable", values_from = "value") |> + dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |> + dplyr::mutate( + purity_hmf = sub("(.*) \\(.*\\)", "\\1", .data$Purity) |> as.numeric(), + ploidy_hmf = sub("(.*) \\(.*\\)", "\\1", .data$Ploidy) |> as.numeric(), + msi_mb_hmf = sub(".* \\((.*)\\)", "\\1", .data$MSI_mb_tmp) |> as.numeric(), + contamination_hmf = as.numeric(.data$Contamination), + deleted_genes_hmf = as.numeric(.data$DeletedGenes), + msi_hmf = sub("(.*) \\(.*\\)", "\\1", .data$MSI_mb_tmp), + tmb_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TMB) |> as.numeric(), + tml_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TML) |> as.numeric(), + hypermutated = ifelse("Hypermutated" %in% d$variable, .data[["Hypermutated"]], NA) |> as.character() + ) |> + dplyr::select( + qc_status_hmf = "QC_Status", + sex_hmf = "Gender", + "purity_hmf", "ploidy_hmf", "msi_hmf", "msi_mb_hmf", + "contamination_hmf", + "deleted_genes_hmf", "tmb_hmf", "tml_hmf", + wgd_hmf = "WGD", + "hypermutated" + ) + } + ) # end public +) + +#' sash Download Tidy and Write +#' +#' Downloads files from the `sash` workflow and writes them in a tidy format. +#' +#' @param path Path to directory with raw workflow results (from GDS, S3, or +#' local filesystem). +#' @param SubjectID The SubjectID of the sample (needed for path lookup). +#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). +#' @param outdir Path to output directory. +#' @param format Format of output files. +#' @param max_files Max number of files to list. +#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). +#' @param dryrun If TRUE, just list the files that will be downloaded (don't +#' download them). +#' @return List where each element is a tidy tibble of a sash file. +#' +#' @examples +#' \dontrun{ +#' SubjectID <- "SBJ03043" +#' SampleID_tumor <- "PRJ230004" +#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") +#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +#' outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +#' token <- Sys.getenv("ICA_ACCESS_TOKEN") +#' d <- Wf_sash_download_tidy_write( +#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, +#' outdir = outdir, +#' dryrun = F +#' ) +#' } +#' @export +Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor, + outdir, format = "rds", max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE) { + s <- Wf_sash$new( + path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor + ) + d_dl <- s$download_files( + outdir = outdir, ica_token = ica_token, + max_files = max_files, dryrun = dryrun + ) + if (!dryrun) { + d_tidy <- s$tidy_files(d_dl) + d_write <- s$write( + d_tidy, + outdir = file.path(outdir, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = format + ) + return(d_write) + } + return(d_dl) +} diff --git a/R/umccrise.R b/R/umccrise.R index ffbe5c5..2ba73e6 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -120,7 +120,6 @@ Wf_umccrise <- R6::R6Class( #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - #' @param list_filter_fun Function to filter relevant files. download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, recursive = NULL) { super$download_files( @@ -278,11 +277,11 @@ Wf_umccrise <- R6::R6Class( #' @param SubjectID The SubjectID of the sample (needed for path lookup). #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). #' @param outdir Path to output directory. +#' @param format Format of output files. #' @param max_files Max number of files to list. #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param format Format of output files. #' @return List where each element is a tidy tibble of a umccrise file. #' #' @examples diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd new file mode 100644 index 0000000..d1f9c4d --- /dev/null +++ b/man/Wf_sash.Rd @@ -0,0 +1,379 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sash.R +\name{Wf_sash} +\alias{Wf_sash} +\title{Wf_sash R6 Class} +\description{ +Reads and writes tidy versions of files from the \code{sash} workflow +} +\examples{ +\dontrun{ + +#---- Local ----# +p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +p2 <- "202408270b93455e/L2401308_L2401307" +p <- normalizePath(file.path(p1, p2)) +SubjectID <- "SBJ05571" +SampleID_tumor <- "MDX240307" +prefix <- glue("{SubjectID}__{SampleID_tumor}") +s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +s1$list_files(max_files = 20) +s1$list_files_filter_relevant(max_files = 300) +d <- s1$download_files(max_files = 1000, dryrun = F) +d_tidy <- s1$tidy_files(d) +d_write <- s1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = glue("{SubjectID}_{SampleID_tumor}"), + format = "tsv" +) + +#---- S3 ----# +p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash" +p2 <- "202408270b93455e/L2401308_L2401307" +p <- file.path(p1, p2) +SubjectID <- "SBJ05571" +SampleID_tumor <- "MDX240307" +prefix <- glue("{SubjectID}__{SampleID_tumor}") +s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor) +s1$list_files(max_files = 20) +s1$list_files_filter_relevant() +outdir <- sub("s3:/", "~/s3", p) +d <- s1$download_files(outdir = outdir, max_files = 1000, dryrun = F) +d_tidy <- s1$tidy_files(d) +d_write <- s1$write( + d_tidy, + outdir = file.path(p, "dracarys_tidy"), + prefix = glue("{SubjectID}__{SampleID_tumor}"), + format = "tsv" +) +} + +} +\section{Super class}{ +\code{\link[dracarys:Wf]{dracarys::Wf}} -> \code{Wf_sash} +} +\section{Public fields}{ +\if{html}{\out{
}} +\describe{ +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +} +\if{html}{\out{
}} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-Wf_sash-new}{\code{Wf_sash$new()}} +\item \href{#method-Wf_sash-print}{\code{Wf_sash$print()}} +\item \href{#method-Wf_sash-list_files_filter_relevant}{\code{Wf_sash$list_files_filter_relevant()}} +\item \href{#method-Wf_sash-download_files}{\code{Wf_sash$download_files()}} +\item \href{#method-Wf_sash-read_pcgrjson}{\code{Wf_sash$read_pcgrjson()}} +\item \href{#method-Wf_sash-read_hrddragentsv}{\code{Wf_sash$read_hrddragentsv()}} +\item \href{#method-Wf_sash-read_hrdchordtsv}{\code{Wf_sash$read_hrdchordtsv()}} +\item \href{#method-Wf_sash-read_hrdetecttsv}{\code{Wf_sash$read_hrdetecttsv()}} +\item \href{#method-Wf_sash-read_sigstsv}{\code{Wf_sash$read_sigstsv()}} +\item \href{#method-Wf_sash-read_sigssnv2015tsv}{\code{Wf_sash$read_sigssnv2015tsv()}} +\item \href{#method-Wf_sash-read_sigssnv2020tsv}{\code{Wf_sash$read_sigssnv2020tsv()}} +\item \href{#method-Wf_sash-read_sigsdbstsv}{\code{Wf_sash$read_sigsdbstsv()}} +\item \href{#method-Wf_sash-read_sigsindeltsv}{\code{Wf_sash$read_sigsindeltsv()}} +\item \href{#method-Wf_sash-read_qcsummarytsv}{\code{Wf_sash$read_qcsummarytsv()}} +\item \href{#method-Wf_sash-clone}{\code{Wf_sash$clone()}} +} +} +\if{html}{\out{ +
Inherited methods + +
+}} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-new}{}}} +\subsection{Method \code{new()}}{ +Create a new Wf_sash object. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$new(path = NULL, SubjectID = NULL, SampleID_tumor = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{path}}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} + +\item{\code{SubjectID}}{The SubjectID of the sample (needed for path lookup).} + +\item{\code{SampleID_tumor}}{The SampleID of the tumor sample (needed for path lookup).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-print}{}}} +\subsection{Method \code{print()}}{ +Print details about the Workflow. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$print(...)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{...}}{(ignored).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-list_files_filter_relevant}{}}} +\subsection{Method \code{list_files_filter_relevant()}}{ +List dracarys files under given path +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$list_files_filter_relevant( + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + ... +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or +the \code{s3_list_files_filter_relevant} function.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-download_files}{}}} +\subsection{Method \code{download_files()}}{ +Download files from GDS/S3 to local filesystem. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$download_files( + outdir, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + max_files = 1000, + dryrun = FALSE, + recursive = NULL +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{outdir}}{Path to output directory.} + +\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{\code{max_files}}{Max number of files to list.} + +\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't +download them).} + +\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified +GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgrjson}{}}} +\subsection{Method \code{read_pcgrjson()}}{ +Read \code{pcgr.json.gz} file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_pcgrjson(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrddragentsv}{}}} +\subsection{Method \code{read_hrddragentsv()}}{ +Read \code{dragen.tsv.gz} cancer report hrd file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrddragentsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdchordtsv}{}}} +\subsection{Method \code{read_hrdchordtsv()}}{ +Read \code{chord.tsv.gz} cancer report hrd file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdchordtsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdetecttsv}{}}} +\subsection{Method \code{read_hrdetecttsv()}}{ +Read \code{hrdetect.tsv.gz} cancer report hrd file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdetecttsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigstsv}{}}} +\subsection{Method \code{read_sigstsv()}}{ +Read signature cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigssnv2015tsv}{}}} +\subsection{Method \code{read_sigssnv2015tsv()}}{ +Read \code{snv_2015.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigssnv2015tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigssnv2020tsv}{}}} +\subsection{Method \code{read_sigssnv2020tsv()}}{ +Read \code{snv_2020.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigssnv2020tsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigsdbstsv}{}}} +\subsection{Method \code{read_sigsdbstsv()}}{ +Read \code{dbs.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigsdbstsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigsindeltsv}{}}} +\subsection{Method \code{read_sigsindeltsv()}}{ +Read \code{indel.tsv.gz} sigs cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigsindeltsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_qcsummarytsv}{}}} +\subsection{Method \code{read_qcsummarytsv()}}{ +Read \code{qc_summary.tsv.gz} cancer report file. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$read_qcsummarytsv(x)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{x}}{Path to file.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{Wf_sash$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/Wf_sash_download_tidy_write.Rd b/man/Wf_sash_download_tidy_write.Rd new file mode 100644 index 0000000..a806fdd --- /dev/null +++ b/man/Wf_sash_download_tidy_write.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sash.R +\name{Wf_sash_download_tidy_write} +\alias{Wf_sash_download_tidy_write} +\title{sash Download Tidy and Write} +\usage{ +Wf_sash_download_tidy_write( + path, + SubjectID, + SampleID_tumor, + outdir, + format = "rds", + max_files = 1000, + ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), + dryrun = FALSE +) +} +\arguments{ +\item{path}{Path to directory with raw workflow results (from GDS, S3, or +local filesystem).} + +\item{SubjectID}{The SubjectID of the sample (needed for path lookup).} + +\item{SampleID_tumor}{The SampleID of the tumor sample (needed for path lookup).} + +\item{outdir}{Path to output directory.} + +\item{format}{Format of output files.} + +\item{max_files}{Max number of files to list.} + +\item{ica_token}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} + +\item{dryrun}{If TRUE, just list the files that will be downloaded (don't +download them).} +} +\value{ +List where each element is a tidy tibble of a sash file. +} +\description{ +Downloads files from the \code{sash} workflow and writes them in a tidy format. +} +\examples{ +\dontrun{ +SubjectID <- "SBJ03043" +SampleID_tumor <- "PRJ230004" +p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise") +p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063") +outdir <- file.path(sub("gds:/", "~/icav1/g", p)) +token <- Sys.getenv("ICA_ACCESS_TOKEN") +d <- Wf_sash_download_tidy_write( + path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor, + outdir = outdir, + dryrun = F +) +} +} diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 5fd5ade..7753970 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -187,8 +187,6 @@ download them).} \item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} - -\item{\code{list_filter_fun}}{Function to filter relevant files.} } \if{html}{\out{}} } From 91f45c0b1b5c3c95cc11073e9b8df54f1a30d558 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 14 Sep 2024 12:05:16 +1000 Subject: [PATCH 3/8] Wf.R: rm list_filter_fun req from download_files --- R/Wf.R | 12 ++++-------- R/fs_icav1.R | 6 ++---- R/fs_s3.R | 6 ++---- man/Wf.Rd | 5 +---- man/dr_gds_download.Rd | 5 +---- man/dr_s3_download.Rd | 5 +---- 6 files changed, 11 insertions(+), 28 deletions(-) diff --git a/R/Wf.R b/R/Wf.R index aa5bfae..3e00027 100644 --- a/R/Wf.R +++ b/R/Wf.R @@ -165,18 +165,15 @@ Wf <- R6::R6Class( #' download them). #' @param recursive Should files be returned recursively _in and under_ the specified #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - #' @param list_filter_fun Function to filter relevant files. download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL, - list_filter_fun = NULL) { + max_files = 1000, dryrun = FALSE, recursive = NULL) { # TODO: add envvar checker regexes <- self$regexes - assertthat::assert_that(!is.null(regexes), !is.null(list_filter_fun)) + assertthat::assert_that(!is.null(regexes)) if (self$filesystem == "gds") { d <- dr_gds_download( gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token, - page_size = max_files, dryrun = dryrun, recursive = recursive, - list_filter_fun = list_filter_fun + page_size = max_files, dryrun = dryrun, recursive = recursive ) if (!dryrun) { self$filesystem <- "local" @@ -185,8 +182,7 @@ Wf <- R6::R6Class( } else if (self$filesystem == "s3") { d <- dr_s3_download( s3dir = path, outdir = outdir, regexes = regexes, - max_objects = max_files, dryrun = dryrun, - list_filter_fun = list_filter_fun + max_objects = max_files, dryrun = dryrun ) if (!dryrun) { self$filesystem <- "local" diff --git a/R/fs_icav1.R b/R/fs_icav1.R index 9ab22f8..da50daf 100644 --- a/R/fs_icav1.R +++ b/R/fs_icav1.R @@ -155,7 +155,6 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' @param outdir Local output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param list_filter_fun Function to filter relevant GDS files. #' @examples #' \dontrun{ #' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565" @@ -171,11 +170,10 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_ #' @export dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"), pattern = NULL, page_size = 100, dryrun = FALSE, - regexes = DR_FILE_REGEX, recursive = NULL, - list_filter_fun = gds_list_files_filter_relevant) { + regexes = DR_FILE_REGEX, recursive = NULL) { e <- emojifont::emoji fs::dir_create(outdir) - d <- list_filter_fun( + d <- gds_list_files_filter_relevant( gdsdir = gdsdir, pattern = pattern, regexes = regexes, token = token, page_size = page_size, include_url = FALSE, no_recurse = FALSE, page_token = NULL, diff --git a/R/fs_s3.R b/R/fs_s3.R index d1a57c0..eb314ce 100644 --- a/R/fs_s3.R +++ b/R/fs_s3.R @@ -109,7 +109,6 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' @param outdir Path to output directory. #' @param dryrun If TRUE, just list the files that will be downloaded (don't #' download them). -#' @param list_filter_fun Function to filter relevant S3 files. #' @examples #' \dontrun{ #' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash" @@ -125,12 +124,11 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL, #' } #' @export dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL, - regexes = DR_FILE_REGEX, dryrun = FALSE, - list_filter_fun = s3_list_files_filter_relevant) { + regexes = DR_FILE_REGEX, dryrun = FALSE) { s3 <- paws.storage::s3() e <- emojifont::emoji fs::dir_create(outdir) - d <- list_filter_fun( + d <- s3_list_files_filter_relevant( s3dir = s3dir, pattern = NULL, regexes = regexes, max_objects = max_objects, presign = FALSE ) diff --git a/man/Wf.Rd b/man/Wf.Rd index f785399..3005725 100644 --- a/man/Wf.Rd +++ b/man/Wf.Rd @@ -202,8 +202,7 @@ Download files from GDS/S3 to local filesystem. ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), max_files = 1000, dryrun = FALSE, - recursive = NULL, - list_filter_fun = NULL + recursive = NULL )}\if{html}{\out{}} } @@ -223,8 +222,6 @@ download them).} \item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} - -\item{\code{list_filter_fun}}{Function to filter relevant files.} } \if{html}{\out{}} } diff --git a/man/dr_gds_download.Rd b/man/dr_gds_download.Rd index 61aa7f8..1faaf8a 100644 --- a/man/dr_gds_download.Rd +++ b/man/dr_gds_download.Rd @@ -12,8 +12,7 @@ dr_gds_download( page_size = 100, dryrun = FALSE, regexes = DR_FILE_REGEX, - recursive = NULL, - list_filter_fun = gds_list_files_filter_relevant + recursive = NULL ) } \arguments{ @@ -34,8 +33,6 @@ download them).} \item{recursive}{Should files be returned recursively \emph{in and under} the specified GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} - -\item{list_filter_fun}{Function to filter relevant GDS files.} } \description{ Download only GDS files that can be processed by dracarys. diff --git a/man/dr_s3_download.Rd b/man/dr_s3_download.Rd index 674ed93..95136f0 100644 --- a/man/dr_s3_download.Rd +++ b/man/dr_s3_download.Rd @@ -10,8 +10,7 @@ dr_s3_download( max_objects = 100, pattern = NULL, regexes = DR_FILE_REGEX, - dryrun = FALSE, - list_filter_fun = s3_list_files_filter_relevant + dryrun = FALSE ) } \arguments{ @@ -27,8 +26,6 @@ dr_s3_download( \item{dryrun}{If TRUE, just list the files that will be downloaded (don't download them).} - -\item{list_filter_fun}{Function to filter relevant S3 files.} } \description{ Download only S3 files that can be processed by dracarys. From 492369236ebf6b01353279562d0ac6db175c616d Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 14 Sep 2024 12:06:33 +1000 Subject: [PATCH 4/8] sash: hardcode file paths --- R/sash.R | 66 +++++++-------------- man/Wf_sash.Rd | 153 +++++++++++++++---------------------------------- 2 files changed, 67 insertions(+), 152 deletions(-) diff --git a/R/sash.R b/R/sash.R index ad42b0e..345fc2b 100644 --- a/R/sash.R +++ b/R/sash.R @@ -62,17 +62,19 @@ Wf_sash <- R6::R6Class( #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { wname <- "sash" + pref <- glue("{SubjectID}_{SampleID_tumor}") + crep <- "cancer_report/cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - "cancer_report/cancer_report_tables/hrd/.*-chord\\.tsv\\.gz$", "hrdchordtsv", - "cancer_report/cancer_report_tables/hrd/.*-hrdetect\\.tsv\\.gz$", "hrdetecttsv", - "cancer_report/cancer_report_tables/hrd/.*-dragen\\.tsv\\.gz$", "hrddragentsv", - "cancer_report/cancer_report_tables/sigs/.*-snv_2015\\.tsv\\.gz$", "sigssnv2015tsv", - "cancer_report/cancer_report_tables/sigs/.*-snv_2020\\.tsv\\.gz$", "sigssnv2020tsv", - "cancer_report/cancer_report_tables/sigs/.*-dbs\\.tsv\\.gz$", "sigsdbstsv", - "cancer_report/cancer_report_tables/sigs/.*-indel\\.tsv\\.gz$", "sigsindeltsv", - "cancer_report/cancer_report_tables/.*-qc_summary\\.tsv\\.gz$", "qcsummarytsv", - "smlv_somatic/report/pcgr/.*\\.pcgr_acmg\\.grch38\\.json\\.gz$", "pcgrjson" + glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", + glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", + glue("{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrd_dragen", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", + glue("{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgr_json" ) |> dplyr::mutate(fun = paste0("read_", .data$fun)) @@ -94,37 +96,9 @@ Wf_sash <- R6::R6Class( print(res) invisible(self) }, - #' @description List dracarys files under given path - #' @param max_files Max number of files to list (for gds/s3 only). - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param ... Passed on to the `gds_list_files_filter_relevant` or - #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - path <- self$path - dir1 <- file.path(path, glue("{self$SubjectID}_{self$SampleID_tumor}")) - f1 <- super$list_files_filter_relevant(path = dir1, max_files = 500) - return(f1) - }, - #' @description Download files from GDS/S3 to local filesystem. - #' @param outdir Path to output directory. - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param max_files Max number of files to list. - #' @param dryrun If TRUE, just list the files that will be downloaded (don't - #' download them). - #' @param recursive Should files be returned recursively _in and under_ the specified - #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL) { - super$download_files( - outdir = outdir, ica_token = ica_token, max_files = max_files, - dryrun = dryrun, recursive = recursive, - list_filter_fun = self$list_files_filter_relevant - ) - }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. - read_pcgrjson = function(x) { + read_pcgr_json = function(x) { j <- read_jsongz_jsonlite(x) tmb <- j[["content"]][["tmb"]][["variant_statistic"]] %||% @@ -144,13 +118,13 @@ Wf_sash <- R6::R6Class( }, #' @description Read `dragen.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrddragentsv = function(x) { + read_hrd_dragen = function(x) { ct <- readr::cols(.default = "d", Sample = "c") read_tsvgz(x, col_types = ct) }, #' @description Read `chord.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrdchordtsv = function(x) { + read_hrd_chord = function(x) { ct <- readr::cols_only( p_hrd = "d", hr_status = "c", @@ -162,7 +136,7 @@ Wf_sash <- R6::R6Class( }, #' @description Read `hrdetect.tsv.gz` cancer report hrd file. #' @param x Path to file. - read_hrdetecttsv = function(x) { + read_hrd_hrdetect = function(x) { ct <- readr::cols( .default = "d", sample = "c" @@ -181,27 +155,27 @@ Wf_sash <- R6::R6Class( }, #' @description Read `snv_2015.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigssnv2015tsv = function(x) { + read_sigs_snv2015 = function(x) { self$read_sigstsv(x) }, #' @description Read `snv_2020.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigssnv2020tsv = function(x) { + read_sigs_snv2020 = function(x) { self$read_sigstsv(x) }, #' @description Read `dbs.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigsdbstsv = function(x) { + read_sigs_dbs = function(x) { self$read_sigstsv(x) }, #' @description Read `indel.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigsindeltsv = function(x) { + read_sigs_indel = function(x) { self$read_sigstsv(x) }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. - read_qcsummarytsv = function(x) { + read_qcsum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) d |> dplyr::select("variable", "value") |> diff --git a/man/Wf_sash.Rd b/man/Wf_sash.Rd index d1f9c4d..1caa51a 100644 --- a/man/Wf_sash.Rd +++ b/man/Wf_sash.Rd @@ -67,25 +67,25 @@ d_write <- s1$write( \itemize{ \item \href{#method-Wf_sash-new}{\code{Wf_sash$new()}} \item \href{#method-Wf_sash-print}{\code{Wf_sash$print()}} -\item \href{#method-Wf_sash-list_files_filter_relevant}{\code{Wf_sash$list_files_filter_relevant()}} -\item \href{#method-Wf_sash-download_files}{\code{Wf_sash$download_files()}} -\item \href{#method-Wf_sash-read_pcgrjson}{\code{Wf_sash$read_pcgrjson()}} -\item \href{#method-Wf_sash-read_hrddragentsv}{\code{Wf_sash$read_hrddragentsv()}} -\item \href{#method-Wf_sash-read_hrdchordtsv}{\code{Wf_sash$read_hrdchordtsv()}} -\item \href{#method-Wf_sash-read_hrdetecttsv}{\code{Wf_sash$read_hrdetecttsv()}} +\item \href{#method-Wf_sash-read_pcgr_json}{\code{Wf_sash$read_pcgr_json()}} +\item \href{#method-Wf_sash-read_hrd_dragen}{\code{Wf_sash$read_hrd_dragen()}} +\item \href{#method-Wf_sash-read_hrd_chord}{\code{Wf_sash$read_hrd_chord()}} +\item \href{#method-Wf_sash-read_hrd_hrdetect}{\code{Wf_sash$read_hrd_hrdetect()}} \item \href{#method-Wf_sash-read_sigstsv}{\code{Wf_sash$read_sigstsv()}} -\item \href{#method-Wf_sash-read_sigssnv2015tsv}{\code{Wf_sash$read_sigssnv2015tsv()}} -\item \href{#method-Wf_sash-read_sigssnv2020tsv}{\code{Wf_sash$read_sigssnv2020tsv()}} -\item \href{#method-Wf_sash-read_sigsdbstsv}{\code{Wf_sash$read_sigsdbstsv()}} -\item \href{#method-Wf_sash-read_sigsindeltsv}{\code{Wf_sash$read_sigsindeltsv()}} -\item \href{#method-Wf_sash-read_qcsummarytsv}{\code{Wf_sash$read_qcsummarytsv()}} +\item \href{#method-Wf_sash-read_sigs_snv2015}{\code{Wf_sash$read_sigs_snv2015()}} +\item \href{#method-Wf_sash-read_sigs_snv2020}{\code{Wf_sash$read_sigs_snv2020()}} +\item \href{#method-Wf_sash-read_sigs_dbs}{\code{Wf_sash$read_sigs_dbs()}} +\item \href{#method-Wf_sash-read_sigs_indel}{\code{Wf_sash$read_sigs_indel()}} +\item \href{#method-Wf_sash-read_qcsum}{\code{Wf_sash$read_qcsum()}} \item \href{#method-Wf_sash-clone}{\code{Wf_sash$clone()}} } } \if{html}{\out{
Inherited methods @@ -131,71 +131,12 @@ Print details about the Workflow. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-list_files_filter_relevant}{}}} -\subsection{Method \code{list_files_filter_relevant()}}{ -List dracarys files under given path -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$list_files_filter_relevant( - max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - ... -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} - -\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or -the \code{s3_list_files_filter_relevant} function.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-download_files}{}}} -\subsection{Method \code{download_files()}}{ -Download files from GDS/S3 to local filesystem. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$download_files( - outdir, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, - dryrun = FALSE, - recursive = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{outdir}}{Path to output directory.} - -\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{\code{max_files}}{Max number of files to list.} - -\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't -download them).} - -\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgrjson}{}}} -\subsection{Method \code{read_pcgrjson()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_pcgr_json}{}}} +\subsection{Method \code{read_pcgr_json()}}{ Read \code{pcgr.json.gz} file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_pcgrjson(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_pcgr_json(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -207,12 +148,12 @@ Read \code{pcgr.json.gz} file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrddragentsv}{}}} -\subsection{Method \code{read_hrddragentsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_dragen}{}}} +\subsection{Method \code{read_hrd_dragen()}}{ Read \code{dragen.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrddragentsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_dragen(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -224,12 +165,12 @@ Read \code{dragen.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdchordtsv}{}}} -\subsection{Method \code{read_hrdchordtsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_chord}{}}} +\subsection{Method \code{read_hrd_chord()}}{ Read \code{chord.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdchordtsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_chord(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -241,12 +182,12 @@ Read \code{chord.tsv.gz} cancer report hrd file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrdetecttsv}{}}} -\subsection{Method \code{read_hrdetecttsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_hrd_hrdetect}{}}} +\subsection{Method \code{read_hrd_hrdetect()}}{ Read \code{hrdetect.tsv.gz} cancer report hrd file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_hrdetecttsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_hrd_hrdetect(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -275,12 +216,12 @@ Read signature cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigssnv2015tsv}{}}} -\subsection{Method \code{read_sigssnv2015tsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_snv2015}{}}} +\subsection{Method \code{read_sigs_snv2015()}}{ Read \code{snv_2015.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigssnv2015tsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_snv2015(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -292,12 +233,12 @@ Read \code{snv_2015.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigssnv2020tsv}{}}} -\subsection{Method \code{read_sigssnv2020tsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_snv2020}{}}} +\subsection{Method \code{read_sigs_snv2020()}}{ Read \code{snv_2020.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigssnv2020tsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_snv2020(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -309,12 +250,12 @@ Read \code{snv_2020.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigsdbstsv}{}}} -\subsection{Method \code{read_sigsdbstsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_dbs}{}}} +\subsection{Method \code{read_sigs_dbs()}}{ Read \code{dbs.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigsdbstsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_dbs(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -326,12 +267,12 @@ Read \code{dbs.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigsindeltsv}{}}} -\subsection{Method \code{read_sigsindeltsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_sigs_indel}{}}} +\subsection{Method \code{read_sigs_indel()}}{ Read \code{indel.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_sigsindeltsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_sigs_indel(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -343,12 +284,12 @@ Read \code{indel.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_sash-read_qcsummarytsv}{}}} -\subsection{Method \code{read_qcsummarytsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_sash-read_qcsum}{}}} +\subsection{Method \code{read_qcsum()}}{ Read \code{qc_summary.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_sash$read_qcsummarytsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_sash$read_qcsum(x)}\if{html}{\out{
}} } \subsection{Arguments}{ From 281a084d6be2e9a0c2790a2be1823840712045ef Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 14 Sep 2024 12:06:52 +1000 Subject: [PATCH 5/8] umccrise: hardcode file paths --- R/umccrise.R | 68 ++++++--------------- man/Wf_umccrise.Rd | 143 +++++++++++++-------------------------------- 2 files changed, 61 insertions(+), 150 deletions(-) diff --git a/R/umccrise.R b/R/umccrise.R index 2ba73e6..efcd5fe 100644 --- a/R/umccrise.R +++ b/R/umccrise.R @@ -64,17 +64,19 @@ Wf_umccrise <- R6::R6Class( #' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup). initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) { wname <- "umccrise" + pref <- glue("{SubjectID}__{SampleID_tumor}") + crep <- "cancer_report_tables" regexes <- tibble::tribble( ~regex, ~fun, - "-chord\\.tsv\\.gz$", "chordtsv", - "-hrdetect\\.tsv\\.gz$", "hrdetecttsv", - "-snv_2015\\.tsv\\.gz$", "sigssnv2015tsv", - "-snv_2020\\.tsv\\.gz$", "sigssnv2020tsv", - "-dbs\\.tsv\\.gz$", "sigsdbstsv", - "-indel\\.tsv\\.gz$", "sigsindeltsv", - "-qc_summary\\.tsv\\.gz$", "qcsummarytsv", - "multiqc_conpair\\.txt$", "conpairmultiqc", - "-somatic\\.pcgr\\.json\\.gz$", "pcgrjson" + glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord", + glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect", + glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015", + glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020", + glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs", + glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel", + glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum", + glue("{pref}/{pref}-multiqc_report_data/multiqc_conpair\\.txt$"), "conpairmultiqc", + glue("work/{pref}/pcgr/{pref}-somatic\\.pcgr\\.json\\.gz$"), "pcgr_json" ) |> dplyr::mutate(fun = paste0("read_", .data$fun)) @@ -96,41 +98,9 @@ Wf_umccrise <- R6::R6Class( print(res) invisible(self) }, - #' @description List dracarys files under given path - #' @param max_files Max number of files to list (for gds/s3 only). - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param ... Passed on to the `gds_list_files_filter_relevant` or - #' the `s3_list_files_filter_relevant` function. - list_files_filter_relevant = function(max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), ...) { - path <- self$path - dir_final <- file.path(path, glue("{self$SubjectID}__{self$SampleID_tumor}")) - dir_work <- file.path(path, "work", glue("{self$SubjectID}__{self$SampleID_tumor}")) - dir_work_pcgr <- file.path(dir_work, "pcgr") # for pcgr json - f1 <- super$list_files_filter_relevant(path = dir_final, max_files = 300, ica_token = ica_token) - f2 <- super$list_files_filter_relevant(path = dir_work_pcgr, max_files = 50, ica_token = ica_token) - f_all <- dplyr::bind_rows(f1, f2) - return(f_all) - }, - #' @description Download files from GDS/S3 to local filesystem. - #' @param outdir Path to output directory. - #' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var). - #' @param max_files Max number of files to list. - #' @param dryrun If TRUE, just list the files that will be downloaded (don't - #' download them). - #' @param recursive Should files be returned recursively _in and under_ the specified - #' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API). - download_files = function(outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, dryrun = FALSE, recursive = NULL) { - super$download_files( - outdir = outdir, ica_token = ica_token, max_files = max_files, - dryrun = dryrun, recursive = recursive, - list_filter_fun = self$list_files_filter_relevant - ) - }, #' @description Read `pcgr.json.gz` file. #' @param x Path to file. - read_pcgrjson = function(x) { + read_pcgr_json = function(x) { j <- read_jsongz_jsonlite(x) tmb <- j[["content"]][["tmb"]][["variant_statistic"]] %||% @@ -150,7 +120,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `chord.tsv.gz` cancer report file. #' @param x Path to file. - read_chordtsv = function(x) { + read_hrd_chord = function(x) { ct <- readr::cols_only( p_hrd = "d", hr_status = "c", @@ -162,7 +132,7 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `hrdetect.tsv.gz` cancer report file. #' @param x Path to file. - read_hrdetecttsv = function(x) { + read_hrd_hrdetect = function(x) { ct <- readr::cols( .default = "d", sample = "c" @@ -181,27 +151,27 @@ Wf_umccrise <- R6::R6Class( }, #' @description Read `snv_2015.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigssnv2015tsv = function(x) { + read_sigs_snv2015 = function(x) { self$read_sigstsv(x) }, #' @description Read `snv_2020.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigssnv2020tsv = function(x) { + read_sigs_snv2020 = function(x) { self$read_sigstsv(x) }, #' @description Read `dbs.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigsdbstsv = function(x) { + read_sigs_dbs = function(x) { self$read_sigstsv(x) }, #' @description Read `indel.tsv.gz` sigs cancer report file. #' @param x Path to file. - read_sigsindeltsv = function(x) { + read_sigs_indel = function(x) { self$read_sigstsv(x) }, #' @description Read `qc_summary.tsv.gz` cancer report file. #' @param x Path to file. - read_qcsummarytsv = function(x) { + read_qcsum = function(x) { d <- read_tsvgz(x, col_types = readr::cols(.default = "c")) d |> dplyr::select("variable", "value") |> diff --git a/man/Wf_umccrise.Rd b/man/Wf_umccrise.Rd index 7753970..131a62e 100644 --- a/man/Wf_umccrise.Rd +++ b/man/Wf_umccrise.Rd @@ -69,17 +69,15 @@ d_write <- um2$write( \itemize{ \item \href{#method-Wf_umccrise-new}{\code{Wf_umccrise$new()}} \item \href{#method-Wf_umccrise-print}{\code{Wf_umccrise$print()}} -\item \href{#method-Wf_umccrise-list_files_filter_relevant}{\code{Wf_umccrise$list_files_filter_relevant()}} -\item \href{#method-Wf_umccrise-download_files}{\code{Wf_umccrise$download_files()}} -\item \href{#method-Wf_umccrise-read_pcgrjson}{\code{Wf_umccrise$read_pcgrjson()}} -\item \href{#method-Wf_umccrise-read_chordtsv}{\code{Wf_umccrise$read_chordtsv()}} -\item \href{#method-Wf_umccrise-read_hrdetecttsv}{\code{Wf_umccrise$read_hrdetecttsv()}} +\item \href{#method-Wf_umccrise-read_pcgr_json}{\code{Wf_umccrise$read_pcgr_json()}} +\item \href{#method-Wf_umccrise-read_hrd_chord}{\code{Wf_umccrise$read_hrd_chord()}} +\item \href{#method-Wf_umccrise-read_hrd_hrdetect}{\code{Wf_umccrise$read_hrd_hrdetect()}} \item \href{#method-Wf_umccrise-read_sigstsv}{\code{Wf_umccrise$read_sigstsv()}} -\item \href{#method-Wf_umccrise-read_sigssnv2015tsv}{\code{Wf_umccrise$read_sigssnv2015tsv()}} -\item \href{#method-Wf_umccrise-read_sigssnv2020tsv}{\code{Wf_umccrise$read_sigssnv2020tsv()}} -\item \href{#method-Wf_umccrise-read_sigsdbstsv}{\code{Wf_umccrise$read_sigsdbstsv()}} -\item \href{#method-Wf_umccrise-read_sigsindeltsv}{\code{Wf_umccrise$read_sigsindeltsv()}} -\item \href{#method-Wf_umccrise-read_qcsummarytsv}{\code{Wf_umccrise$read_qcsummarytsv()}} +\item \href{#method-Wf_umccrise-read_sigs_snv2015}{\code{Wf_umccrise$read_sigs_snv2015()}} +\item \href{#method-Wf_umccrise-read_sigs_snv2020}{\code{Wf_umccrise$read_sigs_snv2020()}} +\item \href{#method-Wf_umccrise-read_sigs_dbs}{\code{Wf_umccrise$read_sigs_dbs()}} +\item \href{#method-Wf_umccrise-read_sigs_indel}{\code{Wf_umccrise$read_sigs_indel()}} +\item \href{#method-Wf_umccrise-read_qcsum}{\code{Wf_umccrise$read_qcsum()}} \item \href{#method-Wf_umccrise-read_conpairmultiqc}{\code{Wf_umccrise$read_conpairmultiqc()}} \item \href{#method-Wf_umccrise-clone}{\code{Wf_umccrise$clone()}} } @@ -87,7 +85,9 @@ d_write <- um2$write( \if{html}{\out{
Inherited methods @@ -133,71 +133,12 @@ Print details about the Workflow. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-list_files_filter_relevant}{}}} -\subsection{Method \code{list_files_filter_relevant()}}{ -List dracarys files under given path -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$list_files_filter_relevant( - max_files = 1000, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - ... -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{max_files}}{Max number of files to list (for gds/s3 only).} - -\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{\code{...}}{Passed on to the \code{gds_list_files_filter_relevant} or -the \code{s3_list_files_filter_relevant} function.} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-download_files}{}}} -\subsection{Method \code{download_files()}}{ -Download files from GDS/S3 to local filesystem. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$download_files( - outdir, - ica_token = Sys.getenv("ICA_ACCESS_TOKEN"), - max_files = 1000, - dryrun = FALSE, - recursive = NULL -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{outdir}}{Path to output directory.} - -\item{\code{ica_token}}{ICA access token (def: $ICA_ACCESS_TOKEN env var).} - -\item{\code{max_files}}{Max number of files to list.} - -\item{\code{dryrun}}{If TRUE, just list the files that will be downloaded (don't -download them).} - -\item{\code{recursive}}{Should files be returned recursively \emph{in and under} the specified -GDS directory, or \emph{only directly in} the specified GDS directory (def: TRUE via ICA API).} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgrjson}{}}} -\subsection{Method \code{read_pcgrjson()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_pcgr_json}{}}} +\subsection{Method \code{read_pcgr_json()}}{ Read \code{pcgr.json.gz} file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgrjson(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_pcgr_json(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -209,12 +150,12 @@ Read \code{pcgr.json.gz} file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_chordtsv}{}}} -\subsection{Method \code{read_chordtsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrd_chord}{}}} +\subsection{Method \code{read_hrd_chord()}}{ Read \code{chord.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_chordtsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrd_chord(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -226,12 +167,12 @@ Read \code{chord.tsv.gz} cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrdetecttsv}{}}} -\subsection{Method \code{read_hrdetecttsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_hrd_hrdetect}{}}} +\subsection{Method \code{read_hrd_hrdetect()}}{ Read \code{hrdetect.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrdetecttsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_hrd_hrdetect(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -260,12 +201,12 @@ Read signature cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2015tsv}{}}} -\subsection{Method \code{read_sigssnv2015tsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_snv2015}{}}} +\subsection{Method \code{read_sigs_snv2015()}}{ Read \code{snv_2015.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2015tsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_snv2015(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -277,12 +218,12 @@ Read \code{snv_2015.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigssnv2020tsv}{}}} -\subsection{Method \code{read_sigssnv2020tsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_snv2020}{}}} +\subsection{Method \code{read_sigs_snv2020()}}{ Read \code{snv_2020.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigssnv2020tsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_snv2020(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -294,12 +235,12 @@ Read \code{snv_2020.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsdbstsv}{}}} -\subsection{Method \code{read_sigsdbstsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_dbs}{}}} +\subsection{Method \code{read_sigs_dbs()}}{ Read \code{dbs.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsdbstsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_dbs(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -311,12 +252,12 @@ Read \code{dbs.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigsindeltsv}{}}} -\subsection{Method \code{read_sigsindeltsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_sigs_indel}{}}} +\subsection{Method \code{read_sigs_indel()}}{ Read \code{indel.tsv.gz} sigs cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigsindeltsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_sigs_indel(x)}\if{html}{\out{
}} } \subsection{Arguments}{ @@ -328,12 +269,12 @@ Read \code{indel.tsv.gz} sigs cancer report file. } } \if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcsummarytsv}{}}} -\subsection{Method \code{read_qcsummarytsv()}}{ +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Wf_umccrise-read_qcsum}{}}} +\subsection{Method \code{read_qcsum()}}{ Read \code{qc_summary.tsv.gz} cancer report file. \subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcsummarytsv(x)}\if{html}{\out{
}} +\if{html}{\out{
}}\preformatted{Wf_umccrise$read_qcsum(x)}\if{html}{\out{
}} } \subsection{Arguments}{ From 0ccf417736c4470943098e60bf3b2c2b24bdcebf Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 14 Sep 2024 15:55:15 +1000 Subject: [PATCH 6/8] sash: download and tidy --- .Rbuildignore | 3 +- inst/rmd/umccr_workflows/sash/.gitignore | 3 + inst/rmd/umccr_workflows/sash/dl_and_tidy.R | 79 +++++++++++++++++++ .../umccr_workflows/umccrise/dl_and_tidy.R | 6 +- 4 files changed, 87 insertions(+), 4 deletions(-) create mode 100644 inst/rmd/umccr_workflows/sash/.gitignore create mode 100755 inst/rmd/umccr_workflows/sash/dl_and_tidy.R diff --git a/.Rbuildignore b/.Rbuildignore index 5c38deb..c5136c8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -27,5 +27,6 @@ inst/rmd/umccr_portal/html inst/rmd/umccr_workflows/alignment_qc/nogit inst/rmd/umccr_workflows/bcl_convert/html inst/rmd/umccr_workflows/interop/html -inst/rmd/umccr_workflows/umccrise/html +inst/rmd/umccr_workflows/sash/nogit +inst/rmd/umccr_workflows/umccrise/nogit inst/sandbox diff --git a/inst/rmd/umccr_workflows/sash/.gitignore b/inst/rmd/umccr_workflows/sash/.gitignore new file mode 100644 index 0000000..0182e8f --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/.gitignore @@ -0,0 +1,3 @@ +nogit + +/.quarto/ diff --git a/inst/rmd/umccr_workflows/sash/dl_and_tidy.R b/inst/rmd/umccr_workflows/sash/dl_and_tidy.R new file mode 100755 index 0000000..48a5a3a --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/dl_and_tidy.R @@ -0,0 +1,79 @@ +#!/usr/bin/env Rscript + +{ + require(dplyr) + require(assertthat, include.only = "assert_that") + require(dracarys, include.only = "Wf_sash_download_tidy_write") + require(glue, include.only = "glue") + require(here, include.only = "here") + require(rportal, include.only = c("portaldb_query_workflow")) + require(tidyr, include.only = "separate_wider_delim") +} + +query_workflow_sash <- function(start_date, end_date) { + q1 <- glue( + "WHERE \"type_name\" = 'sash' ", + "AND \"start\" >= date(\'{start_date}\') ", + "AND \"end\" <= date(\'{end_date}\') ", + "ORDER BY \"start\" DESC;" + ) + rportal::portaldb_query_workflow(q1) +} + +query_limsrow_libids <- function(libids) { + assertthat::assert_that(!is.null(libids), all(grepl("^L", libids))) + libids <- unique(libids) |> + paste(collapse = "|") + q1 <- glue("WHERE REGEXP_LIKE(\"library_id\", '{libids}');") + rportal::portaldb_query_limsrow(q1) +} + +# first read in the workflows table, extract metadata, then join with lims +start_date <- "2024-08-29" +end_date <- "2024-09-07" +meta_raw <- query_workflow_sash(start_date, end_date) +meta <- meta_raw |> + rportal::meta_sash() +lims_raw <- query_limsrow_libids(meta$LibraryID_tumor) +lims <- lims_raw |> + tidyr::separate_wider_delim( + library_id, + delim = "_", names = c("library_id", "topup_or_rerun"), too_few = "align_start" + ) |> + select( + subject_id, library_id, sample_id, sample_name, + external_subject_id, external_sample_id, + project_name, project_owner, + source, quality, workflow + ) |> + distinct() +table(lims$library_id %in% meta$LibraryID_tumor) # double-check + +meta_lims <- meta |> + left_join(lims, by = c("LibraryID_tumor" = "library_id")) |> + mutate(rownum = row_number()) |> + select( + rownum, wfr_id, version, end_status, start, end, portal_run_id, SubjectID, LibraryID_tumor, LibraryID_normal, + SampleID_tumor, SampleID_normal, s3_outdir_sash, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +meta_lims |> + saveRDS(here(glue("inst/rmd/umccr_workflows/sash/nogit/meta/{start_date}_{end_date}.rds"))) + +d <- meta_lims |> + rowwise() |> + mutate( + indir = .data$s3_outdir_sash, + outdir = file.path(sub("s3://", "", .data$indir)), + outdir = file.path(normalizePath("~/s3"), .data$outdir), + res = list( + dracarys::Wf_sash_download_tidy_write( + path = .data$indir, SubjectID = .data$SubjectID, SampleID_tumor = .data$SampleID_tumor, + outdir = .data$outdir, max_files = 300, dryrun = FALSE + ) + ) + ) |> + ungroup() + +d |> + saveRDS(here(glue("inst/rmd/umccr_workflows/sash/nogit/results_{start_date}_{end_date}.rds"))) diff --git a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R index 3b5ede3..fb17721 100755 --- a/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/umccrise/dl_and_tidy.R @@ -12,9 +12,9 @@ query_workflow_umccrise <- function(start_date, end_date) { q1 <- glue( - "WHERE \"type_name\" = 'umccrise'", - "AND \"start\" >= date(\'{start_date}\')", - "AND \"end\" <= date(\'{end_date}\')", + "WHERE \"type_name\" = 'umccrise' ", + "AND \"start\" >= date(\'{start_date}\') ", + "AND \"end\" <= date(\'{end_date}\') ", "ORDER BY \"start\" DESC;" ) rportal::portaldb_query_workflow(q1) From d39f620a3ac9e637a4d9133d2e273661e33ee043 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 15 Sep 2024 15:47:11 +1000 Subject: [PATCH 7/8] sash: add summary report --- inst/rmd/umccr_workflows/sash/dl_and_tidy.R | 5 +- inst/rmd/umccr_workflows/sash/render.sh | 9 + .../rmd/umccr_workflows/sash/summary_sash.qmd | 378 ++++++++++++++++++ 3 files changed, 390 insertions(+), 2 deletions(-) create mode 100644 inst/rmd/umccr_workflows/sash/render.sh create mode 100644 inst/rmd/umccr_workflows/sash/summary_sash.qmd diff --git a/inst/rmd/umccr_workflows/sash/dl_and_tidy.R b/inst/rmd/umccr_workflows/sash/dl_and_tidy.R index 48a5a3a..9fcfcb1 100755 --- a/inst/rmd/umccr_workflows/sash/dl_and_tidy.R +++ b/inst/rmd/umccr_workflows/sash/dl_and_tidy.R @@ -63,13 +63,14 @@ meta_lims |> d <- meta_lims |> rowwise() |> mutate( - indir = .data$s3_outdir_sash, + # indir = .data$s3_outdir_sash, outdir = file.path(sub("s3://", "", .data$indir)), outdir = file.path(normalizePath("~/s3"), .data$outdir), + indir = outdir, # for when debugging locally res = list( dracarys::Wf_sash_download_tidy_write( path = .data$indir, SubjectID = .data$SubjectID, SampleID_tumor = .data$SampleID_tumor, - outdir = .data$outdir, max_files = 300, dryrun = FALSE + outdir = .data$outdir, max_files = 1000, dryrun = FALSE ) ) ) |> diff --git a/inst/rmd/umccr_workflows/sash/render.sh b/inst/rmd/umccr_workflows/sash/render.sh new file mode 100644 index 0000000..e1c6efe --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/render.sh @@ -0,0 +1,9 @@ +date_start="2024-08-29" +date_end="2024-09-07" +out="sash_${date_start}_${date_end}.html" + +quarto render summary_sash.qmd \ + -P date_start:${date_start} \ + -P date_end:${date_end} \ + -o ${out} \ + --output-dir nogit/html diff --git a/inst/rmd/umccr_workflows/sash/summary_sash.qmd b/inst/rmd/umccr_workflows/sash/summary_sash.qmd new file mode 100644 index 0000000..24b0903 --- /dev/null +++ b/inst/rmd/umccr_workflows/sash/summary_sash.qmd @@ -0,0 +1,378 @@ +--- +title: "{{< meta params.title >}}" +subtitle: "Period: `r paste(params$date_start, ' to ', params$date_end)`" +author: "UMCCR - Genomics Platform Group" +date: now +date-format: "YYYY-MM-DD HH:mm Z" +execute: + echo: false +format: + html: + toc: true + toc-expand: 1 + toc-title: Contents + toc-location: body + highlight-style: github + number-sections: false + link-external-icon: true + link-external-newwindow: true + embed-resources: true + code-copy: true + code-link: true + code-fold: true + code-block-border-left: true + smooth-scroll: true + grid: + body-width: 1300px +params: + title: "UMCCR sash Workflow Summary" + date_start: "XXXX-XX-XX" + date_end: "XXXX-XX-XX" +--- + +```{r} +#| label: pkg_load +#| message: false +{ + require(dplyr) # import all dplyr funcs + require(readr, include.only = c("read_rds")) + require(purrr, include.only = c("map")) + require(tidyr, include.only = c("unnest_wider")) + require(dracarys, include.only = c("session_info_kable")) + require(glue, include.only = "glue") + require(here, include.only = "here") + require(knitr, include.only = "kable") + require(reactable, include.only = "reactable") + require(ggplot2, include.only = c("ggplot", "aes")) + require(lubridate, include.only = c("as_datetime")) + require(plotly, include.only = c("ggplotly")) + require(patchwork, include.only = c("plot_layout")) +} +set.seed(42) +``` + +```{r} +#| label: load_data +date_start <- params$date_start +date_end <- params$date_end +nogit <- "inst/rmd/umccr_workflows/sash/nogit" +d_raw <- here(glue("{nogit}/results_{date_start}_{date_end}.rds")) |> + readr::read_rds() |> + arrange(desc(SubjectID), desc(LibraryID_tumor)) |> + mutate(rownum = row_number()) |> + relocate(rownum) +myriad <- paste0("SBJ0", c("0695", "0847", "0920", "2397", "2456", "2743", "3186", "3242", "4187", "4221")) +``` + +```{r} +#| label: funcs + +# pal <- colorRamp(c("white", "lightgreen")) +# rgb(pal(0.5), maxColorValue = 255) + +tab_view <- function(x, id, ...) { + htmltools::browsable( + htmltools::tagList( + htmltools::tags$button( + htmltools::tagList(fontawesome::fa("download"), "CSV"), + onclick = glue("Reactable.downloadDataCSV('{id}', '{id}.csv')") + ), + x |> + reactable::reactable( + bordered = TRUE, + compact = TRUE, + filterable = TRUE, + fullWidth = TRUE, + height = 800, + highlight = TRUE, + pagination = TRUE, + showPagination = TRUE, + defaultPageSize = nrow(x), + showPageSizeOptions = TRUE, + pageSizeOptions = c(20, 50, nrow(x)), + resizable = TRUE, + searchable = TRUE, + sortable = TRUE, + striped = TRUE, + wrap = FALSE, + elementId = id, + theme = reactable::reactableTheme( + borderColor = "#dfe2e5", + stripedColor = "#f6f8fa", + highlightColor = "#f0f5f9", + style = list( + fontFamily = "Monaco" + ) + ), + ... + ) + ) + ) +} +``` + +## Metadata + +```{r} +#| label: metadata +meta <- d_raw |> + select( + rownum, portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) +tab_view(meta, id = "metadata") +``` + +## Results + +```{r} +#| label: process +# one row per file type - not all samples have sigsdbstsv +d <- d_raw |> + tidyr::unnest_longer(res, indices_to = "filetype") +# main_cols <- c("rownum", "portal_run_id", "SubjectID", "LibraryID_tumor") +main_cols <- c("portal_run_id") +``` + +```{r} +#| label: qcsum +qcsum <- d |> + filter(filetype == "qcsum") |> + select(all_of(main_cols), res) |> + unnest_wider(res) +``` + +```{r} +#| label: pcgr +pcgr <- d |> + filter(filetype == "pcgr_json") |> + select(all_of(main_cols), res) |> + unnest_wider(res) |> + rename( + msi_fraction_indels_pcgr = "fracIndels", + msi_pcgr = "predicted_class", + tmb_pcgr = "tmb_estimate", + n_tmb_pcgr = "n_tmb" + ) |> + mutate(msi_pcgr = sub(" \\(.*\\)", "", msi_pcgr)) +``` + +```{r} +#| label: hrd +hrd_chord <- d |> + filter(filetype == "hrd_chord") |> + unnest_wider(res) |> + select(all_of(main_cols), hrd_chord = "p_hrd") +hrd_hrdetect <- d |> + filter(filetype == "hrd_hrdetect") |> + unnest_wider(res) |> + select(all_of(main_cols), hrd_hrdetect = "Probability") +hrd_dragen <- d |> + filter(filetype == "hrd_dragen") |> + select(all_of(main_cols), res) |> + unnest_wider(res) |> + select(all_of(main_cols), hrd_dragen = "HRD") +hrd_all <- hrd_dragen |> + left_join(hrd_chord, by = "portal_run_id") |> + left_join(hrd_hrdetect, by = "portal_run_id") |> + select(portal_run_id, hrd_dragen, hrd_chord, hrd_hrdetect) +``` + +```{r} +#| label: sigs +sigs_snv2015 <- d |> + filter(filetype == "sigs_snv2015") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_snv2020 <- d |> + filter(filetype == "sigs_snv2020") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_dbs <- d |> + filter(filetype == "sigs_dbs") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +sigs_indel <- d |> + filter(filetype == "sigs_indel") |> + select(all_of(main_cols), res) |> + tidyr::unnest_wider(res) |> + tidyr::unnest_longer(col = c(Rank, Signature, Contribution, RelFreq)) +dsig <- bind_rows( + list( + snv2015 = sigs_snv2015, snv2020 = sigs_snv2020, dbs = sigs_dbs, indel = sigs_indel + ), + .id = "Sig_group" +) + +# keep top two ranked sigs +dsig_filt <- dsig |> + group_by(Sig_group, portal_run_id) |> + mutate(tot_sig_vars = sum(Contribution)) |> + arrange(Rank) |> + slice_head(n = 2) |> + # some sigs have same Rank so use explicit sig_rank + mutate(sig_rank = row_number()) |> + ungroup() |> + mutate( + sig_summary = glue("{Signature} ({RelFreq} = {Contribution} / {tot_sig_vars})") + ) |> + select(Sig_group, portal_run_id, sig_rank, sig_summary) |> + tidyr::pivot_wider(names_from = sig_rank, values_from = sig_summary, names_prefix = "rank") |> + mutate(sig_top2 = paste(rank1, rank2, sep = ", ")) |> + select(Sig_group, portal_run_id, sig_top2) |> + tidyr::pivot_wider(names_from = Sig_group, values_from = sig_top2) |> + select(portal_run_id, snv2015, snv2020, dbs, indel) +``` + +```{r} +#| label: qc_all +dall <- d_raw |> + select( + rownum, + date_analysed = "start", portal_run_id, + SubjectID, LibraryID_tumor, SampleID_tumor, + external_subject_id, external_sample_id, + project_owner, project_name, source, quality, workflow + ) |> + left_join(hrd_all, by = "portal_run_id") |> + left_join(qcsum, by = "portal_run_id") |> + left_join(pcgr, by = "portal_run_id") +``` + +### Summary Metrics + +```{r} +#| label: summary_metrics +tab_view(dall, "summary_metrics") +``` + +### HRD Plot + +```{r} +#| label: hrd_plot +#| fig-width: 15 +#| fig-height: 15 + +pdat <- dall |> + mutate(sbj = glue("{SubjectID}_{LibraryID_tumor}")) |> + select(sbj, dragen = hrd_dragen, chord = hrd_chord, hrdetect = hrd_hrdetect) +p1 <- pdat |> + ggplot2::ggplot(aes(x = chord, y = hrdetect)) + + ggplot2::geom_point(colour = "#00bfc4") + + ggplot2::theme_bw() +psub1 <- plotly::ggplotly(p1) +p2 <- pdat |> + ggplot2::ggplot(aes(x = chord, y = dragen)) + + ggplot2::geom_point(colour = "#f8766d") + + ggplot2::theme_bw() +psub2 <- plotly::ggplotly(p2) +p3 <- pdat |> + ggplot2::ggplot(aes(x = hrdetect, y = dragen)) + + ggplot2::geom_point(colour = "#7cae00") + + ggplot2::theme_bw() +psub3 <- plotly::ggplotly(p3) +# p_all <- p1 + p2 + p3 + plot_layout(ncol = 1) +plotly::subplot(psub1, psub2, psub3, nrows = 3, titleX = TRUE, titleY = TRUE) +``` + +### Signatures + +#### All (SNV, Indel, DBS) + +Don't show. + +```{r eval=FALSE} +#| label: sig_results_all +dsig |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + tab_view("sig_results_all") +``` + +#### Top 2 + +```{r} +#| label: sig_results_top2 +dsig_filt |> + left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + by = "portal_run_id" + ) |> + select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + arrange(rownum) |> + tab_view("sig_results_top2") +``` + +#### Top 3 SNV2015 + +```{r} +#| label: sig_results +#| fig-width: 15 +#| fig-height: 65 + +sig_order2015 <- paste0("Sig", 1:30) +# sig_order2020 <- paste0( +# "SBS", +# c( +# 1:6, +# paste0(7, c("a", "b", "c", "d")), +# 8:9, +# paste0(10, c("a", "b", "c", "d")), +# 11:16, +# paste0(17, c("a", "b")), +# 18:60, +# 84:94 +# ) +# ) + +p2_prep <- dsig |> + filter( + Sig_group == "snv2015", + Rank %in% c(1:3) + ) |> + left_join(dall |> select(portal_run_id, date_analysed, SubjectID, LibraryID_tumor), by = "portal_run_id") |> + mutate(sbj = as.character(glue("{SubjectID}_{LibraryID_tumor}"))) |> + select(date_analysed, sbj, Sig_group, Rank, Signature, Contribution, RelFreq) |> + mutate(Signature = factor(Signature, levels = sig_order2015)) +p2 <- p2_prep |> + ggplot2::ggplot(aes(x = Contribution, y = sbj, fill = Signature, text = sbj)) + + ggplot2::geom_bar(position = "fill", stat = "identity") + + ggplot2::theme_bw(base_size = 7) + +plotly::ggplotly(p2, tooltip = c("x", "text", "fill")) +``` + +## Metadata Summary + +::: {.panel-tabset .nav-pills} + +### Project Name/Owner + +```{r} +#| label: project_owner_name +dall |> + count(project_name, project_owner) |> + knitr::kable() +``` + +### Source / Quality + +```{r} +#| label: source_quality +count(dall, source, quality) |> knitr::kable() +``` + +### Workflow + +```{r} +#| label: workflow_summary +count(dall, workflow) |> knitr::kable() +``` + +::: + + From b143b5424d3806e9ba4c01680e5adcceb5a92e69 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 15 Sep 2024 22:55:20 +1000 Subject: [PATCH 8/8] sash: fix reactable; include myriad --- .../rmd/umccr_workflows/sash/summary_sash.qmd | 58 ++++++++++++------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/inst/rmd/umccr_workflows/sash/summary_sash.qmd b/inst/rmd/umccr_workflows/sash/summary_sash.qmd index 24b0903..ec231d1 100644 --- a/inst/rmd/umccr_workflows/sash/summary_sash.qmd +++ b/inst/rmd/umccr_workflows/sash/summary_sash.qmd @@ -26,8 +26,8 @@ format: body-width: 1300px params: title: "UMCCR sash Workflow Summary" - date_start: "XXXX-XX-XX" - date_end: "XXXX-XX-XX" + date_start: "2024-08-29" + date_end: "2024-09-07" --- ```{r} @@ -56,20 +56,21 @@ set.seed(42) date_start <- params$date_start date_end <- params$date_end nogit <- "inst/rmd/umccr_workflows/sash/nogit" +myriad <- paste0("SBJ0", c("0695", "0847", "0920", "2397", "2456", "2743", "3186", "3242", "4187", "4221")) d_raw <- here(glue("{nogit}/results_{date_start}_{date_end}.rds")) |> readr::read_rds() |> arrange(desc(SubjectID), desc(LibraryID_tumor)) |> - mutate(rownum = row_number()) |> - relocate(rownum) -myriad <- paste0("SBJ0", c("0695", "0847", "0920", "2397", "2456", "2743", "3186", "3242", "4187", "4221")) + mutate( + rownum = row_number(), + is_myriad = SubjectID %in% myriad + ) |> + relocate(rownum) |> + relocate(is_myriad, .after = rownum) ``` ```{r} #| label: funcs -# pal <- colorRamp(c("white", "lightgreen")) -# rgb(pal(0.5), maxColorValue = 255) - tab_view <- function(x, id, ...) { htmltools::browsable( htmltools::tagList( @@ -82,7 +83,7 @@ tab_view <- function(x, id, ...) { bordered = TRUE, compact = TRUE, filterable = TRUE, - fullWidth = TRUE, + # fullWidth = TRUE, height = 800, highlight = TRUE, pagination = TRUE, @@ -96,6 +97,18 @@ tab_view <- function(x, id, ...) { striped = TRUE, wrap = FALSE, elementId = id, + columns = list( + SubjectID = reactable::colDef( + sticky = "left", + # Add a right border style to visually distinguish the sticky column + style = list(borderRight = "1px solid #eee"), + headerStyle = list(borderRight = "1px solid #eee") + ) + ), + defaultColDef = reactable::colDef( + minWidth = 170, + headerStyle = list(background = "#f7f7f8") + ), theme = reactable::reactableTheme( borderColor = "#dfe2e5", stripedColor = "#f6f8fa", @@ -117,7 +130,7 @@ tab_view <- function(x, id, ...) { #| label: metadata meta <- d_raw |> select( - rownum, portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, + rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, project_owner, project_name, source, quality, workflow ) tab_view(meta, id = "metadata") @@ -232,6 +245,7 @@ dsig_filt <- dsig |> dall <- d_raw |> select( rownum, + is_myriad, date_analysed = "start", portal_run_id, SubjectID, LibraryID_tumor, SampleID_tumor, external_subject_id, external_sample_id, @@ -251,46 +265,46 @@ tab_view(dall, "summary_metrics") ### HRD Plot +Showing 2-way relationships between DRAGEN, CHORD and HRDetect. + ```{r} #| label: hrd_plot #| fig-width: 15 -#| fig-height: 15 +#| fig-height: 22 pdat <- dall |> mutate(sbj = glue("{SubjectID}_{LibraryID_tumor}")) |> select(sbj, dragen = hrd_dragen, chord = hrd_chord, hrdetect = hrd_hrdetect) p1 <- pdat |> - ggplot2::ggplot(aes(x = chord, y = hrdetect)) + + ggplot2::ggplot(aes(x = chord, y = hrdetect, label = sbj)) + ggplot2::geom_point(colour = "#00bfc4") + ggplot2::theme_bw() psub1 <- plotly::ggplotly(p1) p2 <- pdat |> - ggplot2::ggplot(aes(x = chord, y = dragen)) + + ggplot2::ggplot(aes(x = chord, y = dragen, label = sbj)) + ggplot2::geom_point(colour = "#f8766d") + ggplot2::theme_bw() psub2 <- plotly::ggplotly(p2) p3 <- pdat |> - ggplot2::ggplot(aes(x = hrdetect, y = dragen)) + + ggplot2::ggplot(aes(x = hrdetect, y = dragen, label = sbj)) + ggplot2::geom_point(colour = "#7cae00") + ggplot2::theme_bw() psub3 <- plotly::ggplotly(p3) # p_all <- p1 + p2 + p3 + plot_layout(ncol = 1) -plotly::subplot(psub1, psub2, psub3, nrows = 3, titleX = TRUE, titleY = TRUE) +plotly::subplot(psub1, psub2, psub3, nrows = 3, titleX = TRUE, titleY = TRUE, margin = c(0.02, 0.02, 0.04, 0.04)) ``` ### Signatures #### All (SNV, Indel, DBS) -Don't show. - -```{r eval=FALSE} +```{r} #| label: sig_results_all dsig |> - left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + left_join(meta |> select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor), by = "portal_run_id" ) |> - select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> tab_view("sig_results_all") ``` @@ -299,10 +313,10 @@ dsig |> ```{r} #| label: sig_results_top2 dsig_filt |> - left_join(meta |> select(rownum, portal_run_id, SubjectID, LibraryID_tumor), + left_join(meta |> select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor), by = "portal_run_id" ) |> - select(rownum, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> + select(rownum, is_myriad, portal_run_id, SubjectID, LibraryID_tumor, everything()) |> arrange(rownum) |> tab_view("sig_results_top2") ```