Skip to content

WGS Benchmark Validation

Mike Lloyd edited this page Mar 22, 2022 · 1 revision

Mouse WGS Comparision

Library Load

library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(data.table)
Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last

Import Data

Truth Set

Mm_WGS_truth <- as.data.frame(fread('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/WGS/gold_truth_vcf/Mus_musculus.GRCm38_simVar_10x_ALLchr_golden_sorted.vcf'))
colnames(Mm_WGS_truth)[1] <- 'CHROM'
nrow(Mm_WGS_truth)

2684167

Mm_WGS_truth <- Mm_WGS_truth %>% mutate(CHROM = gsub(pattern = 'chr', replacement = '', CHROM))
head(Mm_WGS_truth)
A data.frame: 6 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 3000631 . T A . PASS WP=1
2 1 3002045 . T C . PASS WP=0
3 1 3003499 . G A . PASS WP=0
4 1 3006043 . C T . PASS WP=0
5 1 3006569 . G A . PASS WP=1
6 1 3006907 . A G . PASS WP=1

Test Set

Mm_WGS_test <- read.delim('/projects/omics_share/mouse/GRCm38/supporting_files/benchmarking_data/RESULTS/WGS_mouse_nxf_bench/Mus_musculus.GRCm38_simVar_10x_allChr/Mus_musculus.GRCm38_simVar_10x_allChr_snpsift_finalTable.txt', stringsAsFactors = F)
nrow(Mm_WGS_test)

2237090

head(Mm_WGS_test)
A data.frame: 6 × 14
CHROM POS REF ALT ID FILTER QUAL FILTER.1 AF SNPEFF_FUNCTIONAL_CLASS SNPEFF_GENE_NAME SNPEFF_AMINO_ACID_CHANGE SNPEFF_EFFECT SNPEFF_TRANSCRIPT_ID
<chr> <int> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 3000631 T A LowCoverage 191.64 LowCoverage 0.5 NONE INTERGENIC
2 1 3003499 G A LowCoverage 350.64 LowCoverage 0.5 NONE INTERGENIC
3 1 3006043 C T LowCoverage 145.64 LowCoverage 0.5 NONE INTERGENIC
4 1 3006907 A G LowCoverage 138.64 LowCoverage 0.5 NONE INTERGENIC
5 1 3006972 A G LowCoverage 215.64 LowCoverage 0.5 NONE INTERGENIC
6 1 3008253 C G LowCoverage 135.64 LowCoverage 0.5 NONE INTERGENIC
Mm_WGS_test_short <- Mm_WGS_test %>% dplyr::select(CHROM, POS, ID, REF, ALT, QUAL, FILTER) %>% distinct(CHROM, POS, ID, REF, ALT, QUAL, FILTER, .keep_all = TRUE)

Compare Truth to Test

joined_data <- dplyr::left_join(Mm_WGS_truth, Mm_WGS_test_short, by = c('CHROM' = 'CHROM', 'POS' = 'POS'))
head(joined_data, n = 15)
A data.frame: 15 × 13
CHROM POS ID.x REF.x ALT.x QUAL.x FILTER.x INFO ID.y REF.y ALT.y QUAL.y FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
1 1 3000631 . T A . PASS WP=1 T A 191.64 LowCoverage
2 1 3002045 . T C . PASS WP=0 NA NA NA NA NA
3 1 3003499 . G A . PASS WP=0 G A 350.64 LowCoverage
4 1 3006043 . C T . PASS WP=0 C T 145.64 LowCoverage
5 1 3006569 . G A . PASS WP=1 NA NA NA NA NA
6 1 3006907 . A G . PASS WP=1 A G 138.64 LowCoverage
7 1 3006972 . A G . PASS WP=1 A G 215.64 LowCoverage
8 1 3008253 . C G . PASS WP=1 C G 135.64 LowCoverage
9 1 3008649 . A C . PASS WP=0 NA NA NA NA NA
10 1 3008781 . A G . PASS WP=1 A G 136.64 LowCoverage
11 1 3008970 . AAC A . PASS WP=0 NA NA NA NA NA
12 1 3009871 . T G . PASS WP=0 NA NA NA NA NA
13 1 3010151 . A G . PASS WP=1 A G 99.64 LowCoverage
14 1 3010794 . C T . PASS WP=1 C T 265.64 LowCoverage
15 1 3011900 . T C . PASS WP=1 T C 255.64 LowCoverage
TP_hits <- joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x == ALT.y) 

# filter cases where the position is called, but the 'ALT' call is wrong. These calls are false positive, as a call was made, but it was incorrect. 
head(TP_hits, n = 15)
A data.frame: 15 × 13
CHROM POS ID.x REF.x ALT.x QUAL.x FILTER.x INFO ID.y REF.y ALT.y QUAL.y FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
1 1 3000631 . T A . PASS WP=1 T A 191.64 LowCoverage
2 1 3003499 . G A . PASS WP=0 G A 350.64 LowCoverage
3 1 3006043 . C T . PASS WP=0 C T 145.64 LowCoverage
4 1 3006907 . A G . PASS WP=1 A G 138.64 LowCoverage
5 1 3006972 . A G . PASS WP=1 A G 215.64 LowCoverage
6 1 3008253 . C G . PASS WP=1 C G 135.64 LowCoverage
7 1 3008781 . A G . PASS WP=1 A G 136.64 LowCoverage
8 1 3010151 . A G . PASS WP=1 A G 99.64 LowCoverage
9 1 3010794 . C T . PASS WP=1 C T 265.64 LowCoverage
10 1 3011900 . T C . PASS WP=1 T C 255.64 LowCoverage
11 1 3012344 . G A . PASS WP=1 G A 286.64 LowCoverage
12 1 3012377 . C T . PASS WP=0 C T 118.64 LowCoverage
13 1 3012598 . T C . PASS WP=0 T C 179.64 LowCoverage
14 1 3012643 . T G . PASS WP=0 T G 353.64 LowCoverage
15 1 3015081 . C T . PASS WP=0 C T 133.64 LowCoverage
TP <- nrow(TP_hits)
FN_hits <- joined_data %>% filter(!complete.cases(.))
head(FN_hits)
A data.frame: 6 × 13
CHROM POS ID.x REF.x ALT.x QUAL.x FILTER.x INFO ID.y REF.y ALT.y QUAL.y FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
1 1 3002045 . T C . PASS WP=0 NA NA NA NA NA
2 1 3006569 . G A . PASS WP=1 NA NA NA NA NA
3 1 3008649 . A C . PASS WP=0 NA NA NA NA NA
4 1 3008970 . AAC A . PASS WP=0 NA NA NA NA NA
5 1 3009871 . T G . PASS WP=0 NA NA NA NA NA
6 1 3024632 . A G . PASS WP=1 NA NA NA NA NA
FN <- nrow(FN_hits)
FP_hits <- dplyr::left_join(Mm_WGS_test,Mm_WGS_truth, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% dplyr::select(CHROM, POS, REF.x, ALT.x, REF.y, ALT.y, FILTER.x, FILTER.y) %>% filter(!complete.cases(.))
FP <- nrow(FP_hits) + nrow(joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x != ALT.y))
table(TP_hits$FILTER.y)
           LowCoverage LowCoverage;SnpCluster                   PASS 
               2217259                    216                    212 
table(FP_hits$FILTER.x)
LowCoverage        PASS 
      29441           3 
#Precision = TruePositives / (TruePositives + FalsePositives)

precision = TP / (TP + FP)
#Recall = TruePositives / (TruePositives + FalseNegatives)

recall = TP / (TP + FN)
round(precision, digits = 4)

0.9866

round(recall, digits = 4)

0.8264

NOTE: That while precision is high. The majority of calls are 'LowCoverage'

Human WGS Comparison

Import Data

Truth Set

Hs_WGS_truth <- as.data.frame(fread('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/WGS/gold_truth_vcf/Homo_sapiens.GRCh38_simVar_10x_ALLchr_golden_sorted.vcf'))

colnames(Hs_WGS_truth)[1] <- 'CHROM'
nrow(Hs_WGS_truth)

2965993

head(Hs_WGS_truth)
A data.frame: 6 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr1 12665 . C CAGTATC . PASS WP=1
2 chr1 14767 . C G . PASS WP=0
3 chr1 15214 . T A . PASS WP=0
4 chr1 15364 . G A . PASS WP=1
5 chr1 15900 . G A . PASS WP=1
6 chr1 17123 . G A . PASS WP=1
ind <- duplicated(Hs_WGS_truth[,1:8])
table(ind)
ind
  FALSE    TRUE 
2965700     293 
head(Hs_WGS_truth[ind,])
A data.frame: 6 × 8
CHROM POS ID REF ALT QUAL FILTER INFO
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1526 chr1 1669007 . A G . PASS WP=1
1758 chr1 1878417 . C T . PASS WP=1
3158 chr1 3336712 . G A . PASS WP=1
11400 chr1 11463573 . G A . PASS WP=1
23240 chr1 23311086 . A T . PASS WP=1
26258 chr1 26333715 . T C . PASS WP=1

Test Set

Hs_WGS_test <- read.delim('/projects/omics_share/human/GRCh38/supporting_files/benchmarking_data/RESULTS/WGS_human_nxf_bench/Homo_sapiens.GRCh38_simVar_10x_allChr/Homo_sapiens.GRCh38_simVar_10x_allChr_snpsift_finalTable.txt', stringsAsFactors = F)
head(Hs_WGS_test)
A data.frame: 6 × 45
CHROM POS ID REF ALT QUAL FILTER ANN....ALLELE ANN....EFFECT ANN....IMPACT dbNSFP_Polyphen2_HDIV_score dbNSFP_MutationAssessor_score dbNSFP_phyloP100way_vertebrate dbNSFP_1000Gp3_AF dbNSFP_1000Gp3_AFR_AF dbNSFP_1000Gp3_EUR_AF dbNSFP_1000Gp3_AMR_AF dbNSFP_1000Gp3_EAS_AF dbNSFP_ESP6500_AA_AF dbNSFP_ESP6500_EA_AF
<chr> <int> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 chr1 15214 T A 92.64 LowCoverage A downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
2 chr1 15214 T A 92.64 LowCoverage A downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
3 chr1 15214 T A 92.64 LowCoverage A downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
4 chr1 15214 T A 92.64 LowCoverage A downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
5 chr1 15214 T A 92.64 LowCoverage A downstream_gene_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
6 chr1 15214 T A 92.64 LowCoverage A intron_variant MODIFIER NA NA NA NA NA NA NA NA NA NA
Hs_WGS_test_short <- Hs_WGS_test %>% dplyr::select(CHROM, POS, ID, REF, ALT, QUAL, FILTER) %>% distinct(CHROM, POS, ID, REF, ALT, QUAL, FILTER, .keep_all = TRUE)
head(Hs_WGS_test_short)
A data.frame: 6 × 7
CHROM POS ID REF ALT QUAL FILTER
<chr> <int> <chr> <chr> <chr> <dbl> <chr>
1 chr1 15214 T A 92.64 LowCoverage
2 chr1 15364 rs1477841392 G A 117.64 LowCoverage
3 chr1 15900 G A 111.64 LowCoverage
4 chr1 30882 C A 211.64 LowCoverage
5 chr1 40807 T C 97.14 LowCoverage
6 chr1 46391 A G 69.65 LowCoverage
nrow(Hs_WGS_test_short)

2465227

Compare Truth to Test

joined_data <- dplyr::left_join(Hs_WGS_truth, Hs_WGS_test_short, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% dplyr::select(CHROM, POS, REF.x, ALT.x, REF.y, ALT.y, FILTER.x, FILTER.y)
head(joined_data)
A data.frame: 6 × 8
CHROM POS REF.x ALT.x REF.y ALT.y FILTER.x FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr1 12665 C CAGTATC NA NA PASS NA
2 chr1 14767 C G NA NA PASS NA
3 chr1 15214 T A T A PASS LowCoverage
4 chr1 15364 G A G A PASS LowCoverage
5 chr1 15900 G A G A PASS LowCoverage
6 chr1 17123 G A NA NA PASS NA
TP_hits <- joined_data %>% filter(ALT.x == ALT.y) 

# filter cases where the position is called, but the 'ALT' call is wrong. These calls are false positive, as a call was made, but it was incorrect. 
head(TP_hits)
A data.frame: 6 × 8
CHROM POS REF.x ALT.x REF.y ALT.y FILTER.x FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr1 15214 T A T A PASS LowCoverage
2 chr1 15364 G A G A PASS LowCoverage
3 chr1 15900 G A G A PASS LowCoverage
4 chr1 30882 C A C A PASS LowCoverage
5 chr1 40807 T C T C PASS LowCoverage
6 chr1 46391 A G A G PASS LowCoverage
TP <- nrow(TP_hits)
TP

2441101

FN_hits <- joined_data %>% filter(!complete.cases(.))
FN <- nrow(FN_hits)
FP_hits <- dplyr::left_join(Hs_WGS_test, Hs_WGS_truth, by = c('CHROM' = 'CHROM', 'POS' = 'POS')) %>% dplyr::select(CHROM, POS, REF.x, ALT.x, REF.y, ALT.y, FILTER.x, FILTER.y) %>% filter(!complete.cases(.))
head(FP_hits)
A data.frame: 6 × 8
CHROM POS REF.x ALT.x REF.y ALT.y FILTER.x FILTER.y
<chr> <int> <chr> <chr> <chr> <chr> <chr> <chr>
1 chr1 54429 T TATCGGA NA NA LowCoverage NA
2 chr1 108912 C CA NA NA LowCoverage NA
3 chr1 530700 TTTG T NA NA LowCoverage NA
4 chr1 683193 G GACTTAGT NA NA LowCoverage NA
5 chr1 683193 G GACTTAGT NA NA LowCoverage NA
6 chr1 683193 G GACTTAGT NA NA LowCoverage NA
FP <- nrow(FP_hits) + nrow(joined_data %>% filter(complete.cases(.)) %>% filter(ALT.x != ALT.y))
table(TP_hits$FILTER.y)
           LowCoverage LowCoverage;SnpCluster                   PASS 
               2440666                    259                    176 
table(FP_hits$FILTER.x)
LowCoverage        PASS 
      87504          10 
#Precision = TruePositives / (TruePositives + FalsePositives)

precision = TP / (TP + FP)
#Recall = TruePositives / (TruePositives + FalseNegatives)

recall = TP / (TP + FN)
round(precision, digits = 4)

0.9652

round(recall, digits = 4)

0.8232

NOTE: That while precision is high. The majority of calls are 'LowCoverage'

Clone this wiki locally