Skip to content

Commit

Permalink
Merge pull request #129 from umccr/sash_tidy
Browse files Browse the repository at this point in the history
Sash support
  • Loading branch information
pdiakumis authored Sep 16, 2024
2 parents 620d795 + b143b54 commit 43c99db
Show file tree
Hide file tree
Showing 19 changed files with 1,205 additions and 189 deletions.
3 changes: 2 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ inst/rmd/umccr_portal/html
inst/rmd/umccr_workflows/alignment_qc/nogit
inst/rmd/umccr_workflows/bcl_convert/html
inst/rmd/umccr_workflows/interop/html
inst/rmd/umccr_workflows/umccrise/html
inst/rmd/umccr_workflows/sash/nogit
inst/rmd/umccr_workflows/umccrise/nogit
inst/sandbox
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ export(TsoTmbFile)
export(TsoTmbTraceTsvFile)
export(VCMetricsFile)
export(Wf)
export(Wf_sash)
export(Wf_sash_download_tidy_write)
export(Wf_tso_ctdna_tumor_only)
export(Wf_umccrise)
export(Wf_umccrise_download_tidy_write)
Expand Down
12 changes: 4 additions & 8 deletions R/Wf.R
Original file line number Diff line number Diff line change
Expand Up @@ -165,18 +165,15 @@ Wf <- R6::R6Class(
#' download them).
#' @param recursive Should files be returned recursively _in and under_ the specified
#' GDS directory, or _only directly in_ the specified GDS directory (def: TRUE via ICA API).
#' @param list_filter_fun Function to filter relevant files.
download_files = function(path = self$path, outdir, ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
max_files = 1000, dryrun = FALSE, recursive = NULL,
list_filter_fun = NULL) {
max_files = 1000, dryrun = FALSE, recursive = NULL) {
# TODO: add envvar checker
regexes <- self$regexes
assertthat::assert_that(!is.null(regexes), !is.null(list_filter_fun))
assertthat::assert_that(!is.null(regexes))
if (self$filesystem == "gds") {
d <- dr_gds_download(
gdsdir = path, outdir = outdir, regexes = regexes, token = ica_token,
page_size = max_files, dryrun = dryrun, recursive = recursive,
list_filter_fun = list_filter_fun
page_size = max_files, dryrun = dryrun, recursive = recursive
)
if (!dryrun) {
self$filesystem <- "local"
Expand All @@ -185,8 +182,7 @@ Wf <- R6::R6Class(
} else if (self$filesystem == "s3") {
d <- dr_s3_download(
s3dir = path, outdir = outdir, regexes = regexes,
max_objects = max_files, dryrun = dryrun,
list_filter_fun = list_filter_fun
max_objects = max_files, dryrun = dryrun
)
if (!dryrun) {
self$filesystem <- "local"
Expand Down
8 changes: 3 additions & 5 deletions R/fs_icav1.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_
no_recurse = no_recurse, page_token = page_token, recursive = recursive
) |>
dplyr::rowwise() |>
dplyr::mutate(type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))) |>
dplyr::mutate(type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes))) |>
dplyr::ungroup() |>
dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |>
dplyr::select(dplyr::any_of(cols_sel))
Expand All @@ -155,7 +155,6 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_
#' @param outdir Local output directory.
#' @param dryrun If TRUE, just list the files that will be downloaded (don't
#' download them).
#' @param list_filter_fun Function to filter relevant GDS files.
#' @examples
#' \dontrun{
#' gdsdir <- "gds://production/analysis_data/SBJ01155/umccrise/202408300c218043/L2101566__L2101565"
Expand All @@ -171,11 +170,10 @@ gds_list_files_filter_relevant <- function(gdsdir, pattern = NULL, regexes = DR_
#' @export
dr_gds_download <- function(gdsdir, outdir, token = Sys.getenv("ICA_ACCESS_TOKEN"),
pattern = NULL, page_size = 100, dryrun = FALSE,
regexes = DR_FILE_REGEX, recursive = NULL,
list_filter_fun = gds_list_files_filter_relevant) {
regexes = DR_FILE_REGEX, recursive = NULL) {
e <- emojifont::emoji
fs::dir_create(outdir)
d <- list_filter_fun(
d <- gds_list_files_filter_relevant(
gdsdir = gdsdir, pattern = pattern, regexes = regexes,
token = token, page_size = page_size, include_url = FALSE,
no_recurse = FALSE, page_token = NULL,
Expand Down
2 changes: 1 addition & 1 deletion R/fs_local.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ local_list_files_dir <- function(localdir, max_files = NULL) {
local_list_files_filter_relevant <- function(localdir, regexes = DR_FILE_REGEX, max_files = NULL) {
local_list_files_dir(localdir = localdir, max_files = max_files) |>
dplyr::mutate(
type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes = regexes))
type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes = regexes))
) |>
dplyr::filter(!is.na(.data$type)) |>
dplyr::select("type", "bname", "size", "lastmodified", localpath = "path")
Expand Down
8 changes: 3 additions & 5 deletions R/fs_s3.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL,
d <- d_all |>
dplyr::rowwise() |>
dplyr::mutate(
type = purrr::map_chr(.data$bname, \(x) match_regex(x, regexes))
type = purrr::map_chr(.data$path, \(x) match_regex(x, regexes))
) |>
dplyr::ungroup() |>
dplyr::filter(!is.na(.data$type), grepl(pattern, .data$type)) |>
Expand Down Expand Up @@ -109,7 +109,6 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL,
#' @param outdir Path to output directory.
#' @param dryrun If TRUE, just list the files that will be downloaded (don't
#' download them).
#' @param list_filter_fun Function to filter relevant S3 files.
#' @examples
#' \dontrun{
#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05373/sash"
Expand All @@ -125,12 +124,11 @@ s3_list_files_filter_relevant <- function(s3dir, pattern = NULL,
#' }
#' @export
dr_s3_download <- function(s3dir, outdir, max_objects = 100, pattern = NULL,
regexes = DR_FILE_REGEX, dryrun = FALSE,
list_filter_fun = s3_list_files_filter_relevant) {
regexes = DR_FILE_REGEX, dryrun = FALSE) {
s3 <- paws.storage::s3()
e <- emojifont::emoji
fs::dir_create(outdir)
d <- list_filter_fun(
d <- s3_list_files_filter_relevant(
s3dir = s3dir, pattern = NULL, regexes = regexes,
max_objects = max_objects, presign = FALSE
)
Expand Down
261 changes: 261 additions & 0 deletions R/sash.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
#' Wf_sash R6 Class
#'
#' @description
#' Reads and writes tidy versions of files from the `sash` workflow
#'
#' @examples
#' \dontrun{
#'
#' #---- Local ----#
#' p1 <- "~/s3/org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash"
#' p2 <- "202408270b93455e/L2401308_L2401307"
#' p <- normalizePath(file.path(p1, p2))
#' SubjectID <- "SBJ05571"
#' SampleID_tumor <- "MDX240307"
#' prefix <- glue("{SubjectID}__{SampleID_tumor}")
#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor)
#' s1$list_files(max_files = 20)
#' s1$list_files_filter_relevant(max_files = 300)
#' d <- s1$download_files(max_files = 1000, dryrun = F)
#' d_tidy <- s1$tidy_files(d)
#' d_write <- s1$write(
#' d_tidy,
#' outdir = file.path(p, "dracarys_tidy"),
#' prefix = glue("{SubjectID}_{SampleID_tumor}"),
#' format = "tsv"
#' )
#'
#' #---- S3 ----#
#' p1 <- "s3://org.umccr.data.oncoanalyser/analysis_data/SBJ05571/sash"
#' p2 <- "202408270b93455e/L2401308_L2401307"
#' p <- file.path(p1, p2)
#' SubjectID <- "SBJ05571"
#' SampleID_tumor <- "MDX240307"
#' prefix <- glue("{SubjectID}__{SampleID_tumor}")
#' s1 <- Wf_sash$new(path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor)
#' s1$list_files(max_files = 20)
#' s1$list_files_filter_relevant()
#' outdir <- sub("s3:/", "~/s3", p)
#' d <- s1$download_files(outdir = outdir, max_files = 1000, dryrun = F)
#' d_tidy <- s1$tidy_files(d)
#' d_write <- s1$write(
#' d_tidy,
#' outdir = file.path(p, "dracarys_tidy"),
#' prefix = glue("{SubjectID}__{SampleID_tumor}"),
#' format = "tsv"
#' )
#' }
#'
#' @export
Wf_sash <- R6::R6Class(
"Wf_sash",
inherit = Wf,
public = list(
#' @field SubjectID The SubjectID of the sample (needed for path lookup).
#' @field SampleID_tumor The SampleID of the tumor sample (needed for path lookup).
SubjectID = NULL,
SampleID_tumor = NULL,
#' @description Create a new Wf_sash object.
#' @param path Path to directory with raw workflow results (from GDS, S3, or
#' local filesystem).
#' @param SubjectID The SubjectID of the sample (needed for path lookup).
#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup).
initialize = function(path = NULL, SubjectID = NULL, SampleID_tumor = NULL) {
wname <- "sash"
pref <- glue("{SubjectID}_{SampleID_tumor}")
crep <- "cancer_report/cancer_report_tables"
regexes <- tibble::tribble(
~regex, ~fun,
glue("{pref}/{crep}/hrd/{pref}-chord\\.tsv\\.gz$"), "hrd_chord",
glue("{pref}/{crep}/hrd/{pref}-hrdetect\\.tsv\\.gz$"), "hrd_hrdetect",
glue("{pref}/{crep}/hrd/{pref}-dragen\\.tsv\\.gz$"), "hrd_dragen",
glue("{pref}/{crep}/sigs/{pref}-snv_2015\\.tsv\\.gz$"), "sigs_snv2015",
glue("{pref}/{crep}/sigs/{pref}-snv_2020\\.tsv\\.gz$"), "sigs_snv2020",
glue("{pref}/{crep}/sigs/{pref}-dbs\\.tsv\\.gz$"), "sigs_dbs",
glue("{pref}/{crep}/sigs/{pref}-indel\\.tsv\\.gz$"), "sigs_indel",
glue("{pref}/{crep}/{pref}-qc_summary\\.tsv\\.gz$"), "qcsum",
glue("{pref}/smlv_somatic/report/pcgr/{SampleID_tumor}\\.pcgr_acmg\\.grch38\\.json\\.gz$"), "pcgr_json"
) |>
dplyr::mutate(fun = paste0("read_", .data$fun))

super$initialize(path = path, wname = wname, regexes = regexes)
self$SubjectID <- SubjectID
self$SampleID_tumor <- SampleID_tumor
},
#' @description Print details about the Workflow.
#' @param ... (ignored).
print = function(...) {
res <- tibble::tribble(
~var, ~value,
"path", self$path,
"wname", self$wname,
"filesystem", self$filesystem,
"SubjectID", self$SubjectID,
"SampleID_tumor", self$SampleID_tumor
)
print(res)
invisible(self)
},
#' @description Read `pcgr.json.gz` file.
#' @param x Path to file.
read_pcgr_json = function(x) {
j <- read_jsongz_jsonlite(x)
tmb <-
j[["content"]][["tmb"]][["variant_statistic"]] %||%
j[["content"]][["tmb"]][["v_stat"]] %||%
list(tmb_estimate = NA, n_tmb = NA)
tmb <- purrr::flatten(tmb) |>
tibble::as_tibble_row() |>
dplyr::select("tmb_estimate", "n_tmb")
msi <- j[["content"]][["msi"]][["prediction"]][["msi_stats"]]
# handle nulls
msi <- msi %||% list(fracIndels = NA, predicted_class = NA)
msi <- purrr::flatten(msi) |>
tibble::as_tibble_row() |>
dplyr::select("fracIndels", "predicted_class")
metrics <- dplyr::bind_cols(msi, tmb)
return(metrics)
},
#' @description Read `dragen.tsv.gz` cancer report hrd file.
#' @param x Path to file.
read_hrd_dragen = function(x) {
ct <- readr::cols(.default = "d", Sample = "c")
read_tsvgz(x, col_types = ct)
},
#' @description Read `chord.tsv.gz` cancer report hrd file.
#' @param x Path to file.
read_hrd_chord = function(x) {
ct <- readr::cols_only(
p_hrd = "d",
hr_status = "c",
hrd_type = "c",
p_BRCA1 = "d",
p_BRCA2 = "d"
)
read_tsvgz(x, col_types = ct)
},
#' @description Read `hrdetect.tsv.gz` cancer report hrd file.
#' @param x Path to file.
read_hrd_hrdetect = function(x) {
ct <- readr::cols(
.default = "d",
sample = "c"
)
read_tsvgz(x, col_types = ct) |>
dplyr::select(-c("sample"))
},
#' @description Read signature cancer report file.
#' @param x Path to file.
read_sigstsv = function(x) {
ct <- readr::cols(
.default = "d",
Signature = "c"
)
read_tsvgz(x, col_types = ct)
},
#' @description Read `snv_2015.tsv.gz` sigs cancer report file.
#' @param x Path to file.
read_sigs_snv2015 = function(x) {
self$read_sigstsv(x)
},
#' @description Read `snv_2020.tsv.gz` sigs cancer report file.
#' @param x Path to file.
read_sigs_snv2020 = function(x) {
self$read_sigstsv(x)
},
#' @description Read `dbs.tsv.gz` sigs cancer report file.
#' @param x Path to file.
read_sigs_dbs = function(x) {
self$read_sigstsv(x)
},
#' @description Read `indel.tsv.gz` sigs cancer report file.
#' @param x Path to file.
read_sigs_indel = function(x) {
self$read_sigstsv(x)
},
#' @description Read `qc_summary.tsv.gz` cancer report file.
#' @param x Path to file.
read_qcsum = function(x) {
d <- read_tsvgz(x, col_types = readr::cols(.default = "c"))
d |>
dplyr::select("variable", "value") |>
tidyr::pivot_wider(names_from = "variable", values_from = "value") |>
dplyr::rename(MSI_mb_tmp = "MSI (indels/Mb)") |>
dplyr::mutate(
purity_hmf = sub("(.*) \\(.*\\)", "\\1", .data$Purity) |> as.numeric(),
ploidy_hmf = sub("(.*) \\(.*\\)", "\\1", .data$Ploidy) |> as.numeric(),
msi_mb_hmf = sub(".* \\((.*)\\)", "\\1", .data$MSI_mb_tmp) |> as.numeric(),
contamination_hmf = as.numeric(.data$Contamination),
deleted_genes_hmf = as.numeric(.data$DeletedGenes),
msi_hmf = sub("(.*) \\(.*\\)", "\\1", .data$MSI_mb_tmp),
tmb_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TMB) |> as.numeric(),
tml_hmf = sub("(.*) \\(.*\\)", "\\1", .data$TML) |> as.numeric(),
hypermutated = ifelse("Hypermutated" %in% d$variable, .data[["Hypermutated"]], NA) |> as.character()
) |>
dplyr::select(
qc_status_hmf = "QC_Status",
sex_hmf = "Gender",
"purity_hmf", "ploidy_hmf", "msi_hmf", "msi_mb_hmf",
"contamination_hmf",
"deleted_genes_hmf", "tmb_hmf", "tml_hmf",
wgd_hmf = "WGD",
"hypermutated"
)
}
) # end public
)

#' sash Download Tidy and Write
#'
#' Downloads files from the `sash` workflow and writes them in a tidy format.
#'
#' @param path Path to directory with raw workflow results (from GDS, S3, or
#' local filesystem).
#' @param SubjectID The SubjectID of the sample (needed for path lookup).
#' @param SampleID_tumor The SampleID of the tumor sample (needed for path lookup).
#' @param outdir Path to output directory.
#' @param format Format of output files.
#' @param max_files Max number of files to list.
#' @param ica_token ICA access token (def: $ICA_ACCESS_TOKEN env var).
#' @param dryrun If TRUE, just list the files that will be downloaded (don't
#' download them).
#' @return List where each element is a tidy tibble of a sash file.
#'
#' @examples
#' \dontrun{
#' SubjectID <- "SBJ03043"
#' SampleID_tumor <- "PRJ230004"
#' p1_gds <- glue("gds://production/analysis_data/{SubjectID}/umccrise")
#' p <- file.path(p1_gds, "20240830ec648f40/L2300064__L2300063")
#' outdir <- file.path(sub("gds:/", "~/icav1/g", p))
#' token <- Sys.getenv("ICA_ACCESS_TOKEN")
#' d <- Wf_sash_download_tidy_write(
#' path = p, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor,
#' outdir = outdir,
#' dryrun = F
#' )
#' }
#' @export
Wf_sash_download_tidy_write <- function(path, SubjectID, SampleID_tumor,
outdir, format = "rds", max_files = 1000,
ica_token = Sys.getenv("ICA_ACCESS_TOKEN"),
dryrun = FALSE) {
s <- Wf_sash$new(
path = path, SubjectID = SubjectID, SampleID_tumor = SampleID_tumor
)
d_dl <- s$download_files(
outdir = outdir, ica_token = ica_token,
max_files = max_files, dryrun = dryrun
)
if (!dryrun) {
d_tidy <- s$tidy_files(d_dl)
d_write <- s$write(
d_tidy,
outdir = file.path(outdir, "dracarys_tidy"),
prefix = glue("{SubjectID}__{SampleID_tumor}"),
format = format
)
return(d_write)
}
return(d_dl)
}
Loading

0 comments on commit 43c99db

Please sign in to comment.