Skip to content

Commit

Permalink
Merge pull request #105 from umccr/align_qc_refactor
Browse files Browse the repository at this point in the history
Alignment QC reporter
  • Loading branch information
pdiakumis authored Dec 12, 2023
2 parents b196829 + 621e63c commit 8209cca
Show file tree
Hide file tree
Showing 9 changed files with 894 additions and 142 deletions.
81 changes: 50 additions & 31 deletions R/dragen.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ WgsContigMeanCovFile <- R6::R6Class(
main_chrom1 <- c(1:22, "X", "Y")
main_chrom2 <- c(paste0("chr", main_chrom1))
main_chrom <- c(main_chrom1, main_chrom2, "Autosomal regions")
min_cvg <- 0.000001

d <- d |>
dplyr::mutate(
Expand Down Expand Up @@ -103,6 +104,7 @@ WgsContigMeanCovFile <- R6::R6Class(
coverage = dplyr::if_else(.data$alt_group == "bottom", .data$mean_cov, .data$coverage)
) |>
dplyr::distinct() |>
dplyr::filter(coverage > min_cvg) |>
dplyr::ungroup() |>
dplyr::select("chrom", "coverage", "panel")

Expand Down Expand Up @@ -174,21 +176,7 @@ WgsCoverageMetricsFile <- R6::R6Class(
"Aligned bases in genome" = "bases_aligned_in_genome_dragen",
"Average alignment coverage over genome" = "cov_alignment_avg_over_genome_dragen",
"Uniformity of coverage (PCT > 0.2*mean) over genome" = "cov_uniformity_over_genome_pct_gt02mean_dragen",
"PCT of genome with coverage [100x: inf)" = "cov_genome_pct_100x_inf_dragen",
"PCT of genome with coverage [ 50x: inf)" = "cov_genome_pct_50x_inf_dragen",
"PCT of genome with coverage [ 20x: inf)" = "cov_genome_pct_20x_inf_dragen",
"PCT of genome with coverage [ 15x: inf)" = "cov_genome_pct_15x_inf_dragen",
"PCT of genome with coverage [ 10x: inf)" = "cov_genome_pct_10x_inf_dragen",
"PCT of genome with coverage [ 3x: inf)" = "cov_genome_pct_3x_inf_dragen",
"PCT of genome with coverage [ 1x: inf)" = "cov_genome_pct_1x_inf_dragen",
"PCT of genome with coverage [ 0x: inf)" = "cov_genome_pct_0x_inf_dragen",
"PCT of genome with coverage [ 50x:100x)" = "cov_genome_pct_50x_100x_dragen",
"PCT of genome with coverage [ 20x: 50x)" = "cov_genome_pct_20x_50x_dragen",
"PCT of genome with coverage [ 15x: 20x)" = "cov_genome_pct_15x_20x_dragen",
"PCT of genome with coverage [ 10x: 15x)" = "cov_genome_pct_10x_15x_dragen",
"PCT of genome with coverage [ 3x: 10x)" = "cov_genome_pct_3x_10x_dragen",
"PCT of genome with coverage [ 1x: 3x)" = "cov_genome_pct_1x_3x_dragen",
"PCT of genome with coverage [ 0x: 1x)" = "cov_genome_pct_0x_1x_dragen",
"Uniformity of coverage (PCT > 0.4*mean) over genome" = "cov_uniformity_over_genome_pct_gt04mean_dragen",
"Average chr X coverage over genome" = "cov_avg_x_over_genome_dragen",
"Average chr Y coverage over genome" = "cov_avg_y_over_genome_dragen",
"Average mitochondrial coverage over genome" = "cov_avg_mt_over_genome_dragen",
Expand All @@ -203,19 +191,36 @@ WgsCoverageMetricsFile <- R6::R6Class(
raw <- readr::read_lines(x)
assertthat::assert_that(grepl("COVERAGE SUMMARY", raw[1]))

raw |>
res <- raw |>
tibble::as_tibble_col(column_name = "value") |>
tidyr::separate_wider_delim(
"value",
delim = ",", too_few = "align_start",
names = c("category", "dummy1", "var", "value", "pct")
)
# split to rename the
# "PCT of genome with coverage [100x: inf)" values
res1 <- res |>
# pct just shows 100% for a couple rows
dplyr::filter(!grepl("PCT of genome with coverage", .data$var)) |>
dplyr::select("var", "value")
res2 <- res |>
dplyr::filter(grepl("PCT of genome with coverage", .data$var)) |>
dplyr::mutate(
var = sub("PCT of genome with coverage ", "", .data$var),
var = gsub("\\[|\\]|\\(|\\)| ", "", .data$var),
var = gsub("x", "", .data$var),
var = gsub("inf", "Inf", .data$var)
) |>
tidyr::separate_wider_delim("var", names = c("start", "end"), delim = ":") |>
dplyr::mutate(var = as.character(glue("cov_genome_pct_{start}_{end}_dragen"))) |>
dplyr::select("var", "value")
res <- dplyr::bind_rows(res1, res2)
res |>
dplyr::mutate(
var = dplyr::recode(.data$var, !!!abbrev_nm),
value = as.numeric(.data$value)
value = as.numeric(.data$value),
var = dplyr::recode(.data$var, !!!abbrev_nm)
) |>
# pct just shows 100% for a couple rows
dplyr::select("var", "value") |>
tidyr::pivot_wider(names_from = "var", values_from = "value")
},
#' @description
Expand Down Expand Up @@ -451,6 +456,15 @@ MappingMetricsFile <- R6::R6Class(
"Reads with mate sequenced" = "reads_w_mate_seq_dragen",
"Reads without mate sequenced" = "reads_wo_mate_seq_dragen",
"QC-failed reads" = "reads_qcfail_dragen",
"Mapped reads adjusted for excluded mapping" = "reads_mapped_adjexcl_dragen",
"Mapped reads adjusted for filtered and excluded mapping" = "reads_mapped_adjfiltexcl_dragen",
"Unmapped reads adjusted for excluded mapping" = "reads_unmapped_adjexcl_dragen",
"Unmapped reads adjusted for filtered and excluded mapping" = "reads_unmapped_adjfiltexcl_dragen",
"Reads mapping to multiple locations" = "reads_map_multiloc_dragen",
"Hard-clipped bases R1" = "bases_hardclip_r1_dragen",
"Hard-clipped bases R2" = "bases_hardclip_r2_dragen",
"Soft-clipped bases" = "bases_softclip_dragen",
"Hard-clipped bases" = "bases_hardclip_dragen",
"Mapped reads" = "reads_mapped_dragen",
"Mapped reads adjusted for filtered mapping" = "reads_mapped_adjfilt_dragen",
"Mapped reads R1" = "reads_mapped_r1_dragen",
Expand All @@ -459,6 +473,7 @@ MappingMetricsFile <- R6::R6Class(
"Unmapped reads" = "reads_unmapped_dragen",
"Unmapped reads adjusted for filtered mapping" = "reads_unmapped_adjfilt_dragen",
"Adjustment of reads matching non-reference decoys" = "reads_match_nonref_decoys_adj_dragen",
"Adjustment of reads matching filter contigs" = "reads_match_filt_contig_adj_dragen",
"Singleton reads (itself mapped; mate unmapped)" = "reads_singleton_dragen",
"Paired reads (itself & mate mapped)" = "reads_paired_dragen",
"Properly paired reads" = "reads_paired_proper_dragen",
Expand All @@ -473,9 +488,11 @@ MappingMetricsFile <- R6::R6Class(
"Reads with MAPQ NA (Unmapped reads)" = "reads_mapq_na_unmapped_dragen",
"Reads with indel R1" = "reads_indel_r1_dragen",
"Reads with indel R2" = "reads_indel_r2_dragen",
"Reads with splice junction" = "reads_splicejunc_dragen",
"Total bases" = "bases_tot_dragen",
"Total bases R1" = "bases_tot_r1_dragen",
"Total bases R2" = "bases_tot_r2_dragen",
"Mapped bases" = "bases_mapped_dragen",
"Mapped bases R1" = "bases_mapped_r1_dragen",
"Mapped bases R2" = "bases_mapped_r2_dragen",
"Soft-clipped bases R1" = "bases_softclip_r1_dragen",
Expand All @@ -502,7 +519,8 @@ MappingMetricsFile <- R6::R6Class(
"Estimated sample contamination" = "contamination_est_dragen",
"DRAGEN mapping rate [mil. reads/second]" = "mapping_rate_dragen_milreads_per_sec_dragen",
"Number of duplicate marked and mate reads removed" = "reads_num_dupmarked_mate_reads_removed_dragen",
"Total reads in RG" = "reads_tot_rg_dragen"
"Total reads in RG" = "reads_tot_rg_dragen",
"Filtered rRNA reads" = "reads_rrna_filtered_dragen"
)
x <- self$path
raw <- readr::read_lines(x)
Expand Down Expand Up @@ -581,15 +599,15 @@ PloidyEstimationMetricsFile <- R6::R6Class(
"Autosomal median coverage" = "cov_auto_median_dragen",
"X median coverage" = "cov_x_median_dragen",
"Y median coverage" = "cov_y_median_dragen",
"1 median / Autosomal median" = "cov_1_div_auto_medians_dragen",
"2 median / Autosomal median" = "cov_2_div_auto_medians_dragen",
"3 median / Autosomal median" = "cov_3_div_auto_medians_dragen",
"4 median / Autosomal median" = "cov_4_div_auto_medians_dragen",
"5 median / Autosomal median" = "cov_5_div_auto_medians_dragen",
"6 median / Autosomal median" = "cov_6_div_auto_medians_dragen",
"7 median / Autosomal median" = "cov_7_div_auto_medians_dragen",
"8 median / Autosomal median" = "cov_8_div_auto_medians_dragen",
"9 median / Autosomal median" = "cov_9_div_auto_medians_dragen",
"1 median / Autosomal median" = "cov_1_div_auto_median_dragen",
"2 median / Autosomal median" = "cov_2_div_auto_median_dragen",
"3 median / Autosomal median" = "cov_3_div_auto_median_dragen",
"4 median / Autosomal median" = "cov_4_div_auto_median_dragen",
"5 median / Autosomal median" = "cov_5_div_auto_median_dragen",
"6 median / Autosomal median" = "cov_6_div_auto_median_dragen",
"7 median / Autosomal median" = "cov_7_div_auto_median_dragen",
"8 median / Autosomal median" = "cov_8_div_auto_median_dragen",
"9 median / Autosomal median" = "cov_9_div_auto_median_dragen",
"10 median / Autosomal median" = "cov_10_div_auto_median_dragen",
"11 median / Autosomal median" = "cov_11_div_auto_median_dragen",
"12 median / Autosomal median" = "cov_12_div_auto_median_dragen",
Expand Down Expand Up @@ -1070,7 +1088,8 @@ WgsHistFile <- R6::R6Class(
dplyr::mutate(
start = as.numeric(.data$start),
end = as.numeric(.data$end),
pct = round(.data$pct, 2)
pct = round(.data$pct, 2),
cumsum = cumsum(.data$pct)
)
},
#' @description
Expand Down
5 changes: 4 additions & 1 deletion R/dragen_fastqc.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,10 @@ read_fastqc_metrics <- function(x) {
delim = ",",
names = c("section", "mate", "metric", "value"),
) |>
dplyr::mutate(value = as.numeric(.data$value))
dplyr::mutate(
value = dplyr::na_if(.data$value, "NA"),
value = as.numeric(.data$value)
)

pos_base_cont <- d |>
dplyr::filter(.data$section == "POSITIONAL BASE CONTENT") |>
Expand Down
29 changes: 0 additions & 29 deletions inst/cli/multiqc.R

This file was deleted.

2 changes: 1 addition & 1 deletion inst/cli/tidy.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ tidy_add_args <- function(subp) {
tidy$add_argument("-p", "--prefix", help = glue("{emoji('violin')} Prefix string used for all results."), required = TRUE)
tidy$add_argument("-t", "--token", help = glue("{emoji('see_no_evil')} ICA access token. Default: ICA_ACCESS_TOKEN env var."), default = Sys.getenv("ICA_ACCESS_TOKEN"))
tidy$add_argument("-g", "--gds_local_dir", help = glue("{emoji('inbox_tray')} If input is a GDS directory, download the recognisable files to this directory. Default: '<out_dir>/dracarys_gds_sync'."))
tidy$add_argument("-f", "--format", help = glue("{emoji('art')} Format of output. Default: %(default)s."), default = "tsv", choices = c("tsv", "parquet", "both"))
tidy$add_argument("-f", "--format", help = glue("{emoji('art')} Format of output. Default: %(default)s."), default = "tsv")
tidy$add_argument("-n", "--dryrun", help = glue("{emoji('camel')} Dry run - just show files to be tidied."), action = "store_true")
tidy$add_argument("-q", "--quiet", help = glue("{emoji('sleeping')} Shush all the logs."), action = "store_true")
}
Expand Down
72 changes: 0 additions & 72 deletions inst/cli/tso.R

This file was deleted.

36 changes: 28 additions & 8 deletions inst/rmd/umccr_portal/portal_summary.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,24 @@ get_sbj_url <- function(x, colour = NULL, account = "pro") {
sbj_url <- glue("<a href={sbj_url}>{x}</a>")
sbj_url
}
add_totals <- function(x) {
x |>
dplyr::ungroup() %>%
dplyr::bind_rows(
dplyr::summarise(
., dplyr::across(dplyr::where(is.numeric), sum),
dplyr::across(dplyr::where(is.character), ~"Total")
)
)
}
```

```{r vars}
# Get metadata for workflows run within the date range
# options(width = 150)
fmt1 <- "%Y-%m-%dT%H:%M:%S"
date_start <- as.POSIXct("2023-12-02T00:00:01", format = fmt1)
date_end <- as.POSIXct("2023-12-03T23:59:59", format = fmt1)
date_start <- as.POSIXct("2023-12-08T00:00:01", format = fmt1)
date_end <- as.POSIXct("2023-12-10T23:59:59", format = fmt1)
wf_order <- c(
"bcl_convert",
"tso_ctdna_tumor_only",
Expand All @@ -140,7 +150,7 @@ lims_rds <- here(glue("nogit/data_portal/lims/{as.Date(date_end)}.rds"))
# saveRDS(lims_raw, file = lims_rds)
lims_raw <- readr::read_rds(lims_rds)
pmeta_rds <- here(glue("nogit/data_portal/workflows/{as.Date(date_end)}.rds"))
# pmeta_raw <- dracarys::portal_meta_read(rows = 150, account = "prod")
# pmeta_raw <- dracarys::portal_meta_read(rows = 200, account = "prod")
# saveRDS(pmeta_raw, file = pmeta_rds)
pmeta_raw <- readr::read_rds(pmeta_rds)
pmeta <- pmeta_raw |>
Expand All @@ -158,7 +168,7 @@ lims <- lims_raw |>
```{r data_setup}
pmeta_sumy <- pmeta |>
dplyr::select(
start, end, wfr_name, type_name, wfr_id, portal_run_id
start, end, wfr_name, type_name, wfr_id, portal_run_id, end_status
) |>
dplyr::rowwise() |>
dplyr::mutate(
Expand All @@ -179,13 +189,23 @@ pmeta_sumy_count <- pmeta_sumy |>
dplyr::count(type_name, .drop = FALSE) |>
tibble::deframe()
chunks1 <- as.list(pmeta_sumy_count > 0)
pmeta_status_count <- pmeta_sumy |>
dplyr::count(type_name, end_status, .drop = FALSE) |>
add_totals()
```

```{r show_sumy_tbls}
pmeta_sumy_count |>
tibble::enframe(name = "workflow", value = "count") |>
pmeta_status_count |>
dplyr::mutate(type_name = dplyr::if_else(is.na(.data$type_name), "", .data$type_name)) |>
kableExtra::kbl(caption = "Workflow Type Count", row.names = TRUE) |>
kableExtra::kable_classic(full_width = FALSE, position = "float_left")
kableExtra::kable_classic(full_width = FALSE, position = "float_left") |>
kableExtra::column_spec(
3,
color = ifelse(
pmeta_status_count$end_status == "Succeeded", "green",
ifelse(pmeta_status_count$end_status == "Failed", "red", "black")
)
)
sbj_sumy <- pmeta_sumy |>
dplyr::count(sbjid, name = "n_wf") |>
dplyr::filter(grepl("SBJ", .data$sbjid)) |>
Expand Down Expand Up @@ -217,7 +237,7 @@ blank_lines()

## RuntimeVis

```{r vistime, fig.width=15, fig.height = 20}
```{r vistime, fig.width=15, fig.height = 30}
p1 <- pmeta_sumy |>
dplyr::arrange(sbj_lib, type_name) |>
dplyr::group_by(sbj_lib, type_name) |>
Expand Down
1 change: 1 addition & 0 deletions inst/rmd/umccr_workflows/alignment_qc/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/nogit/
Loading

0 comments on commit 8209cca

Please sign in to comment.