Merge pull request #273 from replikation/feature_add-AF-to_VCFs

Feature add af to vc fs
replikation · Jul 24, 2024 · 77f9053 · 77f9053
2 parents adc44b0 + 752e20b
commit 77f9053
Show file tree

Hide file tree

Showing 9 changed files with 293 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -262,3 +262,5 @@ Hardware|Default settings
 
 # 8. Credits
 The key steps of poreCov are carried out using the [ARTIC Network field bioinformatics pipeline](https://github.com/artic-network/fieldbioinformatics). Kudos to all amazing developers for your incredible efforts during this pandemic! Many thanks to all others who have helped out and contributed to poreCov as well. 
+
+The script [convert_VCF_info_fields.py](bin/convert_VCF_info_fields.py) was originally developed by the Intergalactic Utilities Commission in the [Tool Shed repository](https://github.com/galaxyproject/tools-iuc) under a MIT license.
diff --git a/bin/convert_VCF_info_fields.py b/bin/convert_VCF_info_fields.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+
+# -----------------------------------------------------------------------------
+# The MIT License (MIT)
+
+# Copyright (c) 2014 galaxy-iuc
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+# -----------------------------------------------------------------------------
+
+# Takes in VCF file annotated with medaka tools annotate and converts
+#
+# Usage statement:
+# python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf
+
+# 10/21/2020 - Nathan P. Roach, [email protected]
+
+# adapted June 2024 - Marie Lataretu, [email protected]
+# source for this scripts:
+# https://github.com/galaxyproject/tools-iuc/blob/fd148a124034b44d0d61db3eec32ff991d8c152c/tools/medaka/convert_VCF_info_fields.py
+
+import re
+import sys
+from collections import OrderedDict
+from math import log10
+
+import scipy.stats
+
+
+def pval_to_phredqual(pval):
+    try:
+        ret = round(-10 * log10(pval))
+    except ValueError:
+        ret = 2147483647  # transform pval of 0.0 to max signed 32 bit int
+    return ret
+
+
+def parseInfoField(info):
+    info_fields = info.split(";")
+    info_dict = OrderedDict()
+    for info_field in info_fields:
+        code, val = info_field.split("=")
+        info_dict[code] = val
+    return info_dict
+
+
+def annotateVCF(in_vcf_filepath, out_vcf_filepath):
+    """Postprocess output of medaka tools annotate.
+
+    Splits multiallelic sites into separate records.
+    Replaces medaka INFO fields that might represent information of the ref
+    and multiple alternate alleles with simple ref, alt allele counterparts.
+    """
+
+    in_vcf = open(in_vcf_filepath, "r")
+    # medaka INFO fields that do not make sense after splitting of
+    # multi-allelic records
+    # DP will be overwritten with the value of DPSP because medaka tools
+    # annotate currently only calculates the latter correctly
+    # (https://github.com/nanoporetech/medaka/issues/192).
+    # DPS, which is as unreliable as DP, gets skipped and the code
+    # calculates the spanning reads equivalent DPSPS instead.
+    to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"}
+    struct_meta_pat = re.compile("##(.+)=<ID=([^,]+)(,.+)?>")
+    header_lines = []
+    contig_ids = set()
+    contig_ids_simple = set()
+    # parse the metadata lines of the input VCF and drop:
+    # - duplicate lines
+    # - INFO lines declaring keys we are not going to write
+    # - redundant contig information
+    while True:
+        line = in_vcf.readline()
+        if line[:2] != "##":
+            assert line.startswith("#CHROM")
+            break
+        if line in header_lines:
+            # the annotate tool may generate lines already written by
+            # medaka variant again (example: medaka version line)
+            continue
+        match = struct_meta_pat.match(line)
+        if match:
+            match_type, match_id, match_misc = match.groups()
+            if match_type == "INFO":
+                if match_id == "DPSP":
+                    line = line.replace("DPSP", "DP")
+                elif match_id in to_skip:
+                    continue
+            elif match_type == "contig":
+                contig_ids.add(match_id)
+                if not match_misc:
+                    # the annotate tools writes its own contig info,
+                    # which is redundant with contig info generated by
+                    # medaka variant, but lacks a length value.
+                    # We don't need the incomplete line.
+                    contig_ids_simple.add(match_id)
+                    continue
+        header_lines.append(line)
+    # Lets check the above assumption about each ID-only contig line
+    # having a more complete counterpart.
+    assert not (contig_ids_simple - contig_ids)
+    header_lines.insert(1, "##convert_VCF_info_fields=0.2\n")
+    header_lines += [
+        '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
+        '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
+        '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
+        '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
+        '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
+        line,
+    ]
+
+    with open(out_vcf_filepath, "w") as out_vcf:
+        out_vcf.writelines(header_lines)
+        for line in in_vcf:
+            fields = line.split("\t")
+            info_dict = parseInfoField(fields[7])
+            sr_list = [int(x) for x in info_dict["SR"].split(",")]
+            sc_list = [int(x) for x in info_dict["SC"].split(",")]
+            if len(sr_list) != len(sc_list):
+                print("WARNING - SR and SC are different lengths, " "skipping variant")
+                print(line.strip())  # Print the line for debugging purposes
+                continue
+            variant_list = fields[4].split(",")
+            dpsp = int(info_dict["DPSP"])
+            ref_fwd, ref_rev = 0, 1
+            dpspf, dpspr = (int(x) for x in info_dict["AR"].split(","))
+            for i in range(0, len(sr_list), 2):
+                dpspf += sr_list[i]
+                dpspr += sr_list[i + 1]
+            for j, i in enumerate(range(2, len(sr_list), 2)):
+                dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
+                dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
+                _, p_val = scipy.stats.fisher_exact(dp2x2)
+                sb = pval_to_phredqual(p_val)
+
+                as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])
+
+                info = []
+                for code in info_dict:
+                    if code in to_skip:
+                        continue
+                    val = info_dict[code]
+                    info.append("%s=%s" % (code, val))
+
+                info.append("DP=%d" % dpsp)
+                info.append("DPSPS=%d,%d" % (dpspf, dpspr))
+
+                if dpsp == 0:
+                    info.append("AF=.")
+                else:
+                    af = (dp4[2] + dp4[3]) / dpsp
+                    info.append("AF=%.6f" % af)
+                if dpspf == 0:
+                    info.append("FAF=.")
+                else:
+                    faf = dp4[2] / dpspf
+                    info.append("FAF=%.6f" % faf)
+                if dpspr == 0:
+                    info.append("RAF=.")
+                else:
+                    raf = dp4[3] / dpspr
+                    info.append("RAF=%.6f" % raf)
+                info.append("SB=%d" % sb)
+                info.append("DP4=%d,%d,%d,%d" % dp4)
+                info.append("AS=%d,%d,%d,%d" % as_)
+                new_info = ";".join(info)
+                fields[4] = variant_list[j]
+                fields[7] = new_info
+                out_vcf.write("\t".join(fields))
+    in_vcf.close()
+
+
+if __name__ == "__main__":
+    annotateVCF(sys.argv[1], sys.argv[2])
diff --git a/bin/summary_report.py b/bin/summary_report.py
@@ -640,6 +640,9 @@ def unclass_markup(value):
 
         self.add_col_description(f'Read classification was determined against a database containing only SARS-CoV-2 and human with <a href="https://ccb.jhu.edu/software/kraken2/">Kraken2</a> (v{self.tool_versions["kraken2"]}).')
 
+    def add_mixed_sites_results(self, mixed_site_results):
+        res_data = pd.read_csv(mixed_site_results, index_col='sample', dtype={'sample': str})
+        self.add_column_raw('num_mixed_sites', res_data['num_mixed_sites'])
 
     def add_coverage_plots(self, coverage_plots):
         for coverage_plot in sorted(coverage_plots.split(',')):
@@ -826,6 +829,7 @@ def get_lineage_status(self, lineage):
     parser.add_argument("-n", "--nextclade_results", help="nextclade results")
     parser.add_argument("-q", "--president_results", help="president results")
     parser.add_argument("-k", "--kraken2_results", help="kraken2 results")
+    parser.add_argument("-m", "--mixed_sites_results", help="mixed sites statisics")
     parser.add_argument("-c", "--coverage_plots", help="coverage plots (comma separated)")
     parser.add_argument("-s", "--samples", help="sample ids (comma separated)")
     args = parser.parse_args()
@@ -852,6 +856,8 @@ def get_lineage_status(self, lineage):
         report.add_pangolin_results(args.pangolin_results)
     if args.nextclade_results:
         report.add_nextclade_results(args.nextclade_results)
+    if args.mixed_sites_results:
+        report.add_mixed_sites_results(args.mixed_sites_results)
     if args.coverage_plots:
         report.add_coverage_plots(args.coverage_plots)
 

diff --git a/modules/add_alt_allele_ratio_vcf.nf b/modules/add_alt_allele_ratio_vcf.nf
@@ -0,0 +1,46 @@
+process add_alt_allele_ratio_vcf {
+    label 'artic'
+    publishDir "${params.output}/${params.lineagedir}/${name}/", mode: 'copy'
+    input:
+        tuple val(name), path(bam), path(bai), path(vcf), path(failed_vcf)
+        path(external_scheme) // primer scheme dir as input
+    output:
+       tuple val(name), path("${name}_all-vars-with-aar.vcf"), emit: vcf
+       tuple val(name), path("mixed_sites_stats.csv"), emit: stats
+    script:
+    primer_version_tag = params.primerV.toString().contains(".bed") ? 'V_custom' : "${params.primerV}"
+    primer_dir = params.primerV.toString().contains(".bed") ? "${external_scheme}" : "${external_scheme}/nCoV-2019"
+    """
+    # reheader failed VCF and change FILTER
+    echo '##FILTER=<ID=ARTICFAIL,Description="ARTIC filter failed">' > add-to-hdr.txt
+    # -c and --rename-annots add a header with a default/wrong description
+    bcftools annotate -h add-to-hdr.txt -c "FILTER/ARTICFAIL:=FILTER/PASS" ${failed_vcf} | sed '/##FILTER=<ID=ARTICFAIL,Description="All filters passed">/d' > tmp_failed_updated-filter.vcf
+
+    # concat failed and passed VCF
+    bcftools concat ${vcf} tmp_failed_updated-filter.vcf | bcftools sort -o tmp_merged.vcf
+
+    # call medaka tools annotate for each pool and add the alternate allele ratio
+    for pool in `cut -f5 ${primer_dir}/${primer_version_tag}/nCoV-2019.scheme.bed | sort | uniq`; do
+        bcftools view -i 'INFO/Pool="'\$pool'"' tmp_merged.vcf -o tmp_\$pool.vcf
+        medaka tools annotate --dpsp --pad 25 --RG \$pool tmp_\$pool.vcf ${primer_dir}/${primer_version_tag}/nCoV-2019.reference.fasta ${bam} tmp_annotated_\$pool.vcf
+        convert_VCF_info_fields.py tmp_annotated_\$pool.vcf tmp_aar_\$pool.vcf
+    done
+
+    # merge pool VCF and sort VCF
+    bcftools concat tmp_aar_*.vcf | bcftools sort -o ${name}_all-vars-with-aar.vcf
+
+    # count mixed sites
+    # thresholds for human geotyping: 0.35 <= x <= 0.65 
+    NUM_MIXED_SITES=\$(bcftools view -H -i 'INFO/DP>${params.min_depth} & INFO/AF>=0.3 & INFO/AF<=0.8' ${name}_all-vars-with-aar.vcf | wc -l)
+    echo sample,num_mixed_sites > mixed_sites_stats.csv
+    echo ${name},\$NUM_MIXED_SITES >> mixed_sites_stats.csv
+
+    # remove intermediate files
+    rm tmp_*
+    """
+    stub:
+	"""
+	touch ${name}_all-vars-with-aar.vcf mixed_sites_stats.csv
+	"""
+
+}
diff --git a/poreCov.nf b/poreCov.nf
@@ -309,6 +309,7 @@ include { get_fasta } from './modules/get_fasta_test_data.nf'
 include { align_to_reference } from './modules/align_to_reference.nf'
 include { split_fasta } from './modules/split_fasta.nf'
 include { filter_fastq_by_length } from './modules/filter_fastq_by_length.nf'
+include { add_alt_allele_ratio_vcf } from './modules/add_alt_allele_ratio_vcf.nf'
 
 /************************** 
 * Workflows
@@ -360,7 +361,16 @@ workflow {
             else if (!params.nanopolish) { 
                 artic_ncov_wf(filtered_reads_ch, params.artic_normalize) 
                 fasta_input_ch = artic_ncov_wf.out.assembly
+
+                // add alternative allele ratio to the VCF
+                if (params.primerV.toString().contains(".bed")) {
+                    external_primer_schemes = artic_ncov_wf.out.primer_dir
+                }
+                else {
+                    external_primer_schemes = file(workflow.projectDir + "/data/external_primer_schemes", checkIfExists: true, type: 'dir' )
                 }
+                add_alt_allele_ratio_vcf(artic_ncov_wf.out.trimmed_bam.join(artic_ncov_wf.out.vcf).join(artic_ncov_wf.out.failed_vcf), external_primer_schemes)
+            }
         }
         // fastq input via dir and or files
         if ( (params.fastq || params.fastq_pass) || workflow.profile.contains('test_fastq')) { 
@@ -392,7 +402,16 @@ workflow {
             else if (!params.nanopolish) { 
                 artic_ncov_wf(filtered_reads_ch, params.artic_normalize)
                 fasta_input_ch = artic_ncov_wf.out.assembly
+
+                // add alternative allele ratio to the VCF
+                if (params.primerV.toString().contains(".bed")) {
+                    external_primer_schemes = artic_ncov_wf.out.primer_dir
+                }
+                else {
+                    external_primer_schemes = file(workflow.projectDir + "/data/external_primer_schemes", checkIfExists: true, type: 'dir' )
                 }
+                add_alt_allele_ratio_vcf(artic_ncov_wf.out.trimmed_bam.join(artic_ncov_wf.out.vcf).join(artic_ncov_wf.out.failed_vcf), external_primer_schemes)
+            }
         }
 
     // 2. Genome quality, lineages, clades and mutations
@@ -416,6 +435,7 @@ workflow {
         if (params.fasta || workflow.profile.contains('test_fasta')) {
             taxonomic_read_classification_ch = Channel.from( ['deactivated', 'deactivated', 'deactivated'] ).collect()
             alignments_ch = Channel.from( ['deactivated'] )
+
         } else {
             taxonomic_read_classification_ch = read_classification_wf.out.kraken
             if (params.screen_reads) {
@@ -428,6 +448,11 @@ workflow {
             }
             alignments_ch = align_to_reference(filtered_reads_ch.combine(reference_for_qc_input_ch))
         }
+        if (params.fasta || workflow.profile.contains('test_fasta') || params.nanopolish ) {
+            alt_allele_ratio_ch = Channel.from( ['deactivated'] )
+        } else {
+            alt_allele_ratio_ch = add_alt_allele_ratio_vcf.out.stats
+        }
 
 /*
         if (params.samples) {
@@ -436,7 +461,7 @@ workflow {
         else { samples_table_ch = Channel.from( ['deactivated'] ) }
 */
         create_summary_report_wf(determine_lineage_wf.out, genome_quality_wf.out[0], determine_mutations_wf.out,
-                                taxonomic_read_classification_ch, alignments_ch, samples_file_ch)
+                                taxonomic_read_classification_ch, alt_allele_ratio_ch, alignments_ch, samples_file_ch)
 
 }
 

diff --git a/workflows/artic_nanopore_nCov19.nf b/workflows/artic_nanopore_nCov19.nf
@@ -15,6 +15,10 @@ workflow artic_ncov_wf {
             artic_medaka_custom_bed(fastq.combine(external_primer_schemes).combine(primerBed), normalise_threshold)
             assembly = artic_medaka_custom_bed.out.fasta
             binary_alignment = artic_medaka_custom_bed.out.fullbam
+            trimmed_bam = artic_medaka_custom_bed.out.reference_bam
+            vcf = artic_medaka_custom_bed.out.vcf
+            failed_vcf = artic_medaka_custom_bed.out.vcf_fail
+            primer_dir = artic_medaka_custom_bed.out.primer_dir
 
             // plot amplicon coverage
             covarplot_custom_bed(artic_medaka_custom_bed.out.covarplot.combine(primerBed))
@@ -27,6 +31,10 @@ workflow artic_ncov_wf {
             artic_medaka(fastq.combine(external_primer_schemes), normalise_threshold)
             assembly = artic_medaka.out.fasta
             binary_alignment = artic_medaka.out.fullbam
+            trimmed_bam = artic_medaka.out.reference_bam
+            vcf = artic_medaka.out.vcf
+            failed_vcf = artic_medaka.out.vcf_fail
+            primer_dir = Channel.empty()
 
             // plot amplicon coverage
             covarplot(artic_medaka.out.covarplot.combine(external_primer_schemes))
@@ -40,6 +48,10 @@ workflow artic_ncov_wf {
     emit:   
         assembly
         binary_alignment
+        trimmed_bam
+        vcf
+        primer_dir
+        failed_vcf
 }
 
 workflow artic_ncov_np_wf {
Original file line number	Diff line number	Diff line change
Expand Up		@@ -262,3 +262,5 @@ Hardware\|Default settings

		# 8. Credits
		The key steps of poreCov are carried out using the [ARTIC Network field bioinformatics pipeline](https://github.com/artic-network/fieldbioinformatics). Kudos to all amazing developers for your incredible efforts during this pandemic! Many thanks to all others who have helped out and contributed to poreCov as well.

		The script [convert_VCF_info_fields.py](bin/convert_VCF_info_fields.py) was originally developed by the Intergalactic Utilities Commission in the [Tool Shed repository](https://github.com/galaxyproject/tools-iuc) under a MIT license.