Skip to content

Commit

Permalink
Fix issue found on another dataset (#9)
Browse files Browse the repository at this point in the history
  • Loading branch information
grst authored Aug 15, 2024
1 parent 1d2ef60 commit 714b66d
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 30 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: PersonalisIO
Title: Read Personalis data into MultiAssayExperiment objects
Version: 0.3.0
Version: 0.3.1
Authors@R:
person("Gregor", "Sturm", , "[email protected]", role = c("aut", "cre"))
Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
Expand Down
43 changes: 23 additions & 20 deletions R/personalis.R
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,10 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
mutate(sample = sample_name) |>
# in older versions, the "Chromosome" column is called "Sequence"
rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`))
mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`)) |>
mutate(`Variant ID` = as.character(`Variant ID`)) |>
mutate(`dbSNP Build` = as.character(`dbSNP Build`))


variant_table
}
Expand Down Expand Up @@ -331,29 +334,29 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality))
table_number <- if_else(sample_type == "somatic", 1, 2)
columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total")

tables <- read_html(html_file) |>
html_elements(html_section) |>
html_elements("table") |>
html_table(na.strings = "N/A")

if (!length(tables)) {
return(tibble())
} else {
tes <- tables[table_number] |>
lapply(function(df) {
colnames(df) <- make.names(colnames(df))
colnames(df)[1] <- "metric"
df |>
mutate(across(all_of(columns_to_fix), fix_thousands_separator))
}) |>
bind_rows() |>
pivot_longer(-metric, names_to = "mut_type", values_to = "value") |>
mutate(sample = sample_name) |>
mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |>
select(sample, var_name, value) |>
pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
mutate(across(contains("Number"), fix_thousands_separator))
tes <- tables[table_number] |>
lapply(function(df) {
colnames(df) <- make.names(colnames(df))
colnames(df)[1] <- "metric"
df |>
mutate(across(all_of(columns_to_fix), fix_thousands_separator))
}) |>
bind_rows() |>
pivot_longer(-metric, names_to = "mut_type", values_to = "value") |>
mutate(sample = sample_name) |>
mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |>
select(sample, var_name, value) |>
pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
mutate(across(contains("Number"), fix_thousands_separator))
}
}

Expand Down Expand Up @@ -393,7 +396,7 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
col_data <- col_data |>
tibble::column_to_rownames("sample")
}

all_variants <- map(variant_list, "vcf_data") |> bind_rows()
row_data <- all_variants |>
select(
Expand Down Expand Up @@ -454,9 +457,9 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ
sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), "vcf.gz")
)
)

if (nrow(variant_table)) {
variant_table <- variant_table |>
variant_table <- variant_table |>
mutate(sample = sample_name) |>
mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
}
Expand Down
21 changes: 12 additions & 9 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -127,23 +127,26 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
#' @importFrom tibble as_tibble
parse_vcf_to_df <- function(path) {
# parse VCF file
vcf_content <- tryCatch({
read.vcfR(path)
}, error = function(e) {
read.vcfR(str_replace(path, "vcf.gz", "vcf"))
vcf_content <- tryCatch(
{
read.vcfR(path, verbose = FALSE)
},
error = function(e) {
read.vcfR(str_replace(path, "vcf.gz", "vcf"), verbose = FALSE)
}
)


tidy_vcf <- vcfR2tidy(vcf_content, verbose = FALSE)
# fixed field content to data frame
fixed_df <- vcfR2tidy(vcf_content)$fix
fixed_df <- tidy_vcf$fix

# GT content to data frame
gt_df <- vcfR2tidy(vcf_content)$gt
gt_df <- tidy_vcf$gt

# create addition column with observed nucleotides in order to avoid collisions when we do the left_join
gt_df <- gt_df |>
dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2))

# next use ChromKey, POS and ALT for joining vcf content data frames
joined_vcf_df <- fixed_df |>
dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))
Expand Down

0 comments on commit 714b66d

Please sign in to comment.