diff --git a/.dockstore.yml b/.dockstore.yml
index aad9015e..b27670c7 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -70,6 +70,21 @@ workflows:
     primaryDescriptorPath: /workflows/wf_titan_fasta.wdl
     testParameterFiles:
       - empty.json
+  - name: Titan_WWVC
+    subclass: WDL
+    primaryDescriptorPath: /workflows/wf_titan_wwvc.wdl
+    testParameterFiles:
+      - empty.json
+  - name: Freyja_FASTQ
+    subclass: WDL
+    primaryDescriptorPath: /workflows/wf_freyja_fastq.wdl
+    testParameterFiles:
+      - empty.json
+  - name: Freyja_Plot
+    subclass: WDL
+    primaryDescriptorPath: /workflows/wf_freyja_plot.wdl
+    testParameterFiles:
+      - empty.json
   - name: TheiaCoV_Validate
     subclass: WDL
     primaryDescriptorPath: /workflows/wf_theiacov_validate.wdl
diff --git a/README.md b/README.md
index b72f7465..41ee00b2 100644
--- a/README.md
+++ b/README.md
@@ -4,4 +4,5 @@ Bioinformatics workflows for genomic characterization, submission preparation, a
 ### Contributors & Influence
 * Based on collaborative work with Andrew Lang, PhD & his [Genomic Analysis WDL workflows](https://github.com/AndrewLangvt/genomic_analyses)
 * Workflows and task development influenced by The Broad's [Viral Pipes](https://github.com/broadinstitute/viral-pipelines)
-* Titan Genomic Characterization workflows influenced by UPHL's [Cecret](https://github.com/UPHL-BioNGS/Cecret) & StaPH-B's [Monroe](https://staph-b.github.io/staphb_toolkit/workflow_docs/monroe/)
+* Titan workflows for genomic characterization influenced by UPHL's [Cecret](https://github.com/UPHL-BioNGS/Cecret) & StaPH-B's [Monroe](https://staph-b.github.io/staphb_toolkit/workflow_docs/monroe/)
+* The Titan workflow for wastewater variant calling (Titan_WWVC) incorporates a modified version of [CDPHE's WasteWaterVariantCalling WDL workflow](https://github.com/CDPHE/WasteWaterVariantCalling).
diff --git a/tasks/task_alignment.wdl b/tasks/task_alignment.wdl
index b6dcfdee..4c7307ad 100644
--- a/tasks/task_alignment.wdl
+++ b/tasks/task_alignment.wdl
@@ -6,26 +6,34 @@ task bwa {
     File read1
     File? read2
     String samplename
-    String? reference_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta"
+    File? reference_genome
     Int? cpus=6
   }
-  command {
+  command <<<
     # date and version control
     date | tee DATE
     echo "BWA $(bwa 2>&1 | grep Version )" | tee BWA_VERSION
    samtools --version | head -n1 | tee SAMTOOLS_VERSION
+
+    # set reference genome
-z "~{reference_genome}" ]]; then + echo "User reference identified; ~{reference_genome} will be utilized for alignement" + # move to primer_schemes dir; bwa fails if reference file not in this location + cp "~{reference_genome}" "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" + fi # Map with BWA MEM + echo "Running bwa mem -t ~{cpus} bwa_reference.bwa ~{read1} ~{read2} | samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam " bwa mem \ - -t ${cpus} \ - ${reference_genome} \ - ${read1} ${read2} |\ - samtools sort | samtools view -F 4 -o ${samplename}.sorted.bam + -t ~{cpus} \ + "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" \ + ~{read1} ~{read2} |\ + samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam # index BAMs - samtools index ${samplename}.sorted.bam - } + samtools index ~{samplename}.sorted.bam + >>> output { String bwa_version = read_string("BWA_VERSION") @@ -40,7 +48,6 @@ task bwa { cpu: 2 disks: "local-disk 100 SSD" preemptible: 0 - maxRetries: 3 } } diff --git a/tasks/task_taxonID.wdl b/tasks/task_taxonID.wdl index 40bdd9d6..ae3c34ac 100644 --- a/tasks/task_taxonID.wdl +++ b/tasks/task_taxonID.wdl @@ -309,3 +309,62 @@ task nextclade_output_parser_one_sample { String nextclade_aa_dels = read_string("NEXTCLADE_AADELS") } } + +task freyja_one_sample { + input { + File primer_trimmed_bam + String samplename + File reference_genome + File? freyja_usher_barcodes + Boolean update_db = true + String docker = "staphb/freyja:1.2" + } + command <<< + # configure barcode settings and capture version + #if [[ ! -z "~{freyja_usher_barcodes}" ]]; then + # #capture database info + # azfreyja_usher_barcode_version=$(basename -- "~{freyja_usher_barcodes}") + # echo "here" + # #set environment with user-defined db + # mv ~{freyja_usher_barcodes} /opt/conda/envs/freyja-env/lib/python3.7/site-packages/freyja/data/usher_barcodes.csv + #else + # # update db if specified + # if ~{update_db}; then + # freyja update + # freyja_usher_barcode_version="freyja update: $(date +"%Y-%m-%d")" + # else + # freyja_usher_barcode_version="unmodified from freyja container: ~{docker}" + # fi + #fi + + # always update freyja barcodes until v1.3.1 release (will allow user-defined ref files) + freyja update + freyja_usher_barcode_version="freyja update: $(date +"%Y-%m-%d")" + + echo ${freyja_usher_barcode_version} | tee FREYJA_BARCODES + + # Call variants and capture sequencing depth information + freyja variants ~{primer_trimmed_bam} --variants ~{samplename}_freyja_variants.tsv --depths ~{samplename}_freyja_depths.tsv --ref ~{reference_genome} + + # Demix variants + freyja demix ~{samplename}_freyja_variants.tsv ~{samplename}_freyja_depths.tsv --output ~{samplename}_freyja_demixed.tmp + # Adjust output header + echo -e "\t/~{samplename}" > ~{samplename}_freyja_demixed.tsv + tail -n+2 ~{samplename}_freyja_demixed.tmp >> ~{samplename}_freyja_demixed.tsv + + + >>> + runtime { + memory: "4 GB" + cpu: 2 + docker: "~{docker}" + disks: "local-disk 100 HDD" + } + output { + File freyja_variants = "~{samplename}_freyja_variants.tsv" + File freyja_depths = "~{samplename}_freyja_depths.tsv" + File freyja_demixed = "~{samplename}_freyja_demixed.tsv" + String freyja_barcode_version = read_string("FREYJA_BARCODES") + } + +} diff --git a/tasks/task_versioning.wdl b/tasks/task_versioning.wdl index fae4aba9..d6cd8957 100644 --- a/tasks/task_versioning.wdl +++ b/tasks/task_versioning.wdl @@ -8,7 +8,7 @@ task version_capture { volatile: true } command <<< - 
PHVG_Version="PHVG v1.5.3" + PHVG_Version="PHVG v1.6.0-dev" ~{default='' 'export TZ=' + timezone} date +"%Y-%m-%d" > TODAY echo $PHVG_Version > PHVG_VERSION diff --git a/workflows/WasteWaterVariantCalling_modified.wdl b/workflows/WasteWaterVariantCalling_modified.wdl new file mode 100644 index 00000000..ce656f49 --- /dev/null +++ b/workflows/WasteWaterVariantCalling_modified.wdl @@ -0,0 +1,433 @@ +version 1.0 + +workflow WasteWaterVariantCalling { + meta { + description: "Modified version of the CDPHE's WasteWaterVariantCalling WDL Worfklow to performs variant calling on SARS-CoV-2 in waster water samples and identifies mutations in the Spike gene associated with known VOCs and VUIs: https://github.com/CDPHE/WasteWaterVariantCalling)." + author: "Kevin Libuit" + email: "kevin.libuit@theiagen.com" + } + input { + Array[File] sorted_bam + File covid_genome + File spike_bed + File spike_annotations + Array[String] sample_id + } + + scatter (id_bam in zip(sample_id, sorted_bam)) { + call add_RG { + input: + sample_id = id_bam.left, + bam = id_bam.right + } + call variant_calling { + input: + bam = add_RG.rgbam, + ref = covid_genome, + sample_id = id_bam.left + } + call sort_vcf { + input: + vcf = variant_calling.vcf, + sample_id = id_bam.left + } + call sample_spike { + input: + vcf = sort_vcf.sorted_vcf, + bed = spike_bed, + sample_id = id_bam.left + } + call vcf2tsv { + input: + vcf = sample_spike.sample_spike_vcf, + sample_id = id_bam.left, + bed = spike_bed + } + call fill_NA { + input: + tsv = vcf2tsv.sample_spike_tsv, + sample_id = id_bam.left, + spike_bed = spike_bed + } + call allele_freq { + input: + tsv = fill_NA.fill_NA_tsv, + sample_id = id_bam.left + } + call reformat_tsv { + input: + tsv = allele_freq.allele_freq_tsv, + sample_id = id_bam.left + } + call summary_prep { + input: + tsv = reformat_tsv.reformat_tsv_tsv, + sample_id = id_bam.left, + spike_annotations = spike_annotations + } + } + call dashboard_tsv { + input: + tsv = summary_prep.sample_spike_tsv_summary, + tsv_dash = summary_prep.sample_spike_tsv_dash, + tsv_counts = summary_prep.sample_spike_tsv_counts, + spike_annotations = spike_annotations + } + call summary_tsv { + input: + tsv = dashboard_tsv.spike_summary_temp + } + + output { + Array[File] addrg_bam = add_RG.rgbam + Array[File] variants = variant_calling.vcf + Array[File] sorted_vcf = sort_vcf.sorted_vcf + Array[File] sample_spike_vcf = sample_spike.sample_spike_vcf + Array[File] sample_spike_tsv = vcf2tsv.sample_spike_tsv + Array[File] sample_spike_tsv_summary = summary_prep.sample_spike_tsv_summary + Array[File] sample_spike_tsv_dash = summary_prep.sample_spike_tsv_dash + Array[File] fill_NA_tsv = fill_NA.fill_NA_tsv + Array[File] allele_freq_tsv = allele_freq.allele_freq_tsv + Array[File] reformat_tsv_tsv = reformat_tsv.reformat_tsv_tsv + Array[File] sample_spike_tsv_counts = summary_prep.sample_spike_tsv_counts + File spike_summary_temp = dashboard_tsv.spike_summary_temp + File spike_summary = summary_tsv.spike_summary + File spike_dashboard = dashboard_tsv.spike_dashboard + File spike_counts = dashboard_tsv.spike_counts + + } +} + +task add_RG { + input { + String sample_id + File bam + } + + command <<< + + samtools addreplacerg -r ID:~{sample_id} -r LB:L1 -r SM:~{sample_id} -o ~{sample_id}_addRG.bam ~{bam} + + >>> + + output { + File rgbam = "${sample_id}_addRG.bam" + } + + runtime { + docker: "staphb/samtools:1.10" + memory: "8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + } +} + +task variant_calling { + input { + String sample_id + File bam + File 
+    File ref
+  }
+  command <<<
+    freebayes -f ~{ref} --haplotype-length 0 --min-alternate-count 3 --min-alternate-fraction 0.05 --min-mapping-quality 20 --min-base-quality 20 --min-coverage 10 --use-duplicate-reads --report-monomorphic --pooled-continuous ~{bam} > ~{sample_id}_variants.vcf
+  >>>
+  output {
+    File vcf = "${sample_id}_variants.vcf"
+  }
+  runtime {
+    docker: "wgspipeline/freebayes:v0.0.1"
+    memory: "32 GB"
+    cpu: 8
+    disks: "local-disk 200 SSD"
+  }
+}
+
+task sort_vcf {
+  input {
+    String sample_id
+    File vcf
+  }
+  command <<<
+    bgzip -c ~{vcf} > ~{sample_id}_variants.vcf.gz
+    tabix -p vcf ~{sample_id}_variants.vcf.gz
+    bcftools sort -O z ~{sample_id}_variants.vcf.gz > ~{sample_id}_sorted.vcf.gz
+  >>>
+  output {
+    File sorted_vcf = "${sample_id}_sorted.vcf.gz"
+  }
+  runtime {
+    docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
+    memory: "8 GB"
+    cpu: 2
+    disks: "local-disk 100 SSD"
+  }
+}
+
+task sample_spike {
+  input {
+    File vcf
+    File bed
+    String sample_id
+  }
+  command <<<
+    tabix -p vcf ~{vcf}
+    bcftools view --regions-file ~{bed} --output-type v --output-file ~{sample_id}_spike_mutations.vcf ~{vcf}
+  >>>
+  output {
+    File sample_spike_vcf = "${sample_id}_spike_mutations.vcf"
+  }
+  runtime {
+    docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
+    memory: "16 GB"
+    cpu: 4
+    disks: "local-disk 100 SSD"
+  }
+}
+
+task vcf2tsv {
+  input {
+    File vcf
+    File bed
+    String sample_id
+  }
+  command <<<
+    bgzip -c ~{vcf} > ~{sample_id}_spike_mutations.vcf.gz
+    tabix -p vcf ~{sample_id}_spike_mutations.vcf.gz
+    bcftools query --regions-file ~{bed} --format '%CHROM\t%POS\t%REF\t%ALT[\t%DP\t%RO\t%AO]\n' ~{sample_id}_spike_mutations.vcf.gz > ~{sample_id}_spike_mutations.tsv
+  >>>
+  output {
+    File sample_spike_tsv = "${sample_id}_spike_mutations.tsv"
+  }
+  runtime {
+    docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
+    memory: "16 GB"
+    cpu: 4
+    disks: "local-disk 100 SSD"
+  }
+}
+
+task fill_NA {
+  input {
+    File tsv
+    String sample_id
+    File spike_bed
+  }
+  command <<<
+    # create key of unique locations
+    cat ~{spike_bed} | cut -f 1,2 | tr "\t" "_" | sort | uniq > keys.txt
+
+    # add headers to tsv and use key to fill in missing values
+    echo -e "CHROM\tPOS\tREF\t~{sample_id}_ALT\t~{sample_id}_DP\t~{sample_id}_RO\t~{sample_id}_AO" | cat - ~{tsv} | sed 's/\t/_/' | sort -t $'\t' -k1,1 > ~{sample_id}_spike_mutations_temp1.tsv
+
+    # get the filled columns we want
+    join -t $'\t' -e NA -a 1 -1 1 -2 1 -o "1.1,2.3,2.4,2.6" keys.txt "~{sample_id}_spike_mutations_temp1.tsv" > ~{sample_id}_spike_fill_NA.tsv
+  >>>
+  output {
+    File fill_NA_tsv = "${sample_id}_spike_fill_NA.tsv"
+  }
+  runtime {
+    docker: "quay.io/theiagen/utility:1.1"
+    memory: "32 GB"
+    cpu: 8
+    disks: "local-disk 2500 HDD"
+  }
+}
+
+task allele_freq {
+  input {
+    File tsv
+    String sample_id
+  }
+  command <<<
+    # separate the comma-separated alleles into separate rows (might need to fix delimiters)
+    awk '{split($2,a,","); split($4,b,","); for(i in a){print $1,a[i],$3,b[i]}}' ~{tsv} > ~{sample_id}_spike_mutations_temp2.tsv
+
+    # use AO and DP fields to calculate ALT allele frequency, fix delimiters, change -nan allele frequencies to NA
+    awk '$3~"^NA"||$4~"^NA"{$5="NA";print;next}{$5=$4/$3}1' ~{sample_id}_spike_mutations_temp2.tsv | sed 's/ /\t/g' | awk '$5 == "-nan" {$5="NA"} 1' OFS="\t" > ~{sample_id}_spike_allele_freq.tsv
+  >>>
+  output {
+    File allele_freq_tsv = "${sample_id}_spike_allele_freq.tsv"
+  }
+  runtime {
"quay.io/theiagen/utility:1.1" + memory: "32 GB" + cpu: 8 + disks: "local-disk 2500 HDD" + } +} + +task reformat_tsv { + input { + File tsv + String sample_id + } + + command <<< + + # combine the rows based on matching nucl location + + awk '{f2[$1]=f2[$1] sep[$1] $2; + f3[$1]=f3[$1] sep[$1] $3; + f4[$1]=f4[$1] sep[$1] $4; + f5[$1]=f5[$1] sep[$1] $5; + sep[$1]=";"} + END {for(k in f2) print k,f2[k],f3[k],f4[k],f5[k]}' ~{tsv} > ~{sample_id}_spike_mutations_temp3.tsv + + # fix delimiters, add a column containing the sample ids + sed 's/ /\t/g' ~{sample_id}_spike_mutations_temp3.tsv | awk 'NF=NF+1{$NF="~{sample_id}"}1' > ~{sample_id}_spike_mutations_temp4.tsv + + # fix the column headers, convert from space to tab delimited and then sort by col1 + echo -e "CHROMPOS ~{sample_id}_ALT ~{sample_id}_DP ~{sample_id}_AO ~{sample_id}_ALTfreq sample_id" | cat - ~{sample_id}_spike_mutations_temp4.tsv | sed 's/ /\t/g' | sort -t $'\t' -k 1,1 -V > ~{sample_id}_spike_reformat.tsv + + >>> + + output { + File reformat_tsv_tsv = "${sample_id}_spike_reformat.tsv" + } + + runtime { + docker: "quay.io/theiagen/utility:1.1" + memory: "32 GB" + cpu: 8 + disks: "local-disk 2500 HDD" + } +} + +task summary_prep { + input { + File tsv + String sample_id + File spike_annotations + } + + command <<< + + # cut the columns we want for the results summary and make output file + cut -f2,5 ~{tsv} > ~{sample_id}_spike_mutations_forsummary.tsv + + # cut the columns we want for the dashboard summary + awk '{print $6 "\t" $2 "\t" $5}' ~{tsv} > ~{sample_id}_spike_mutations_temp5.tsv + + # add annotations to the dashboard summary, reorder the dashboard summary columns, fix the dashboard summary column headers and make output file + paste ~{spike_annotations} ~{sample_id}_spike_mutations_temp5.tsv | awk '{print $4 "\t" $1 "\t" $2 "\t" $3 "\t" $5 "\t" $6}' | awk 'BEGIN{FS=OFS="\t"; print "sample_id", "AA_change", "Nucl_change", "Lineages", "ALT", "ALTfreq"} NR>1{print $1, $2, $3, $4, $5, $6}' > ~{sample_id}_spike_mutations_fordash.tsv + + # cut the columns we want for the counts summary + awk '{print $6 "\t" $2 "\t" $3 "\t" $4}' ~{tsv} > ~{sample_id}_spike_mutations_temp6.tsv + + # add annotations to the counts summary, reorder the dashboard summary columns, fix the dashboard summary column headers and make output file + paste ~{spike_annotations} ~{sample_id}_spike_mutations_temp6.tsv | awk '{print $4 "\t" $1 "\t" $2 "\t" $3 "\t" $5 "\t" $6 "\t" $7}' | awk 'BEGIN{FS=OFS="\t"; print "sample_id", "AA_change", "Nucl_change", "Lineages", "ALT", "Total_count", "ALT_count"} NR>1{print $1, $2, $3, $4, $5, $6, $7}' > ~{sample_id}_spike_mutations_counts.tsv + + >>> + + output { + File sample_spike_tsv_summary = "${sample_id}_spike_mutations_forsummary.tsv" + File sample_spike_tsv_dash = "${sample_id}_spike_mutations_fordash.tsv" + File sample_spike_tsv_counts = "${sample_id}_spike_mutations_counts.tsv" + } + + runtime { + docker: "quay.io/theiagen/utility:1.1" + memory: "32 GB" + cpu: 8 + disks: "local-disk 2500 HDD" + } +} + +task dashboard_tsv { + input { + Array[File] tsv + Array[File] tsv_dash + Array[File] tsv_counts + File spike_annotations + } + + command <<< + + # concatenate the tsvs and make the dashboard summary output + awk 'FNR==1 && NR!=1{next;}{print}' ~{sep=' ' tsv_dash} >> spike_mutations_dashboard.tsv + + # concatenate the tsvs and make the dashboard summary output + awk 'FNR==1 && NR!=1{next;}{print}' ~{sep=' ' tsv_counts} >> spike_mutations_counts.tsv + + # fix delimiters in annotations file + sed 's/ /\t/g' 
+    sed 's/ /\t/g' ~{spike_annotations} > spike_annotations.tsv
+
+    # concatenate tsvs for sequencing and bioinformatics team summary file and make output
+    paste spike_annotations.tsv ~{sep=' ' tsv} > spike_mutations_summary_temp.tsv
+  >>>
+  output {
+    File spike_summary_temp = "spike_mutations_summary_temp.tsv"
+    File spike_dashboard = "spike_mutations_dashboard.tsv"
+    File spike_counts = "spike_mutations_counts.tsv"
+  }
+  runtime {
+    docker: "quay.io/theiagen/utility:1.1"
+    memory: "16 GB"
+    cpu: 4
+    disks: "local-disk 200 SSD"
+  }
+}
+
+task summary_tsv {
+  input {
+    File tsv
+  }
+  command <<<
+    # datamash to transpose results summary
+    datamash -H transpose < ~{tsv} > spike_mutations_summary.tsv
+  >>>
+  output {
+    File spike_summary = "spike_mutations_summary.tsv"
+  }
+  runtime {
+    docker: "rapatsky/debian"
+    memory: "16 GB"
+    cpu: 4
+    disks: "local-disk 200 SSD"
+  }
+}
diff --git a/workflows/wf_freyja_fastq.wdl b/workflows/wf_freyja_fastq.wdl
new file mode 100644
index 00000000..f051c035
--- /dev/null
+++ b/workflows/wf_freyja_fastq.wdl
@@ -0,0 +1,90 @@
+version 1.0
+
+import "../tasks/task_taxonID.wdl" as taxon_id
+import "wf_read_QC_trim.wdl" as read_qc
+import "../tasks/task_alignment.wdl" as align
+import "../tasks/task_consensus_call.wdl" as consensus_call
+import "../tasks/task_versioning.wdl" as versioning
+
+workflow freyja_fastq {
+  input {
+    File read1_raw
+    File read2_raw
+    File primer_bed
+    File reference_genome
+    Int trimmomatic_minlen = 25
+    String samplename
+  }
+  call read_qc.read_QC_trim {
+    input:
+      samplename = samplename,
+      read1_raw = read1_raw,
+      read2_raw = read2_raw,
+      trimmomatic_minlen = trimmomatic_minlen
+  }
+  call align.bwa {
+    input:
+      samplename = samplename,
+      reference_genome = reference_genome,
+      read1 = read_QC_trim.read1_clean,
+      read2 = read_QC_trim.read2_clean
+  }
+  call consensus_call.primer_trim {
+    input:
+      samplename = samplename,
+      primer_bed = primer_bed,
+      bamfile = bwa.sorted_bam
+  }
+  call taxon_id.freyja_one_sample as freyja {
+    input:
+      primer_trimmed_bam = primer_trim.trim_sorted_bam,
+      samplename = samplename,
+      reference_genome = reference_genome
+  }
+  call versioning.version_capture {
+    input:
+  }
+  output {
+    String freyja_fastq_wf_version = version_capture.phvg_version
+    String freyja_fastq_wf_analysis_date = version_capture.date
+
+    File read1_dehosted = read_QC_trim.read1_dehosted
+    File read2_dehosted = read_QC_trim.read2_dehosted
+    File read1_clean = read_QC_trim.read1_clean
+    File read2_clean = read_QC_trim.read2_clean
+    Int fastq_scan_raw1 = read_QC_trim.fastq_scan_raw1
+    Int fastq_scan_raw2 = read_QC_trim.fastq_scan_raw2
+    String fastq_scan_raw_pairs = read_QC_trim.fastq_scan_raw_pairs
+    String fastq_scan_version = read_QC_trim.fastq_scan_version
+
+    Int fastq_scan_clean1 = read_QC_trim.fastq_scan_clean1
+    Int fastq_scan_clean2 = read_QC_trim.fastq_scan_clean2
+    String fastq_scan_clean_pairs = read_QC_trim.fastq_scan_clean_pairs
+    String trimmomatic_version = read_QC_trim.trimmomatic_version
+    String bbduk_docker = read_QC_trim.bbduk_docker
+
+    String kraken_version = read_QC_trim.kraken_version
+    Float kraken_human = read_QC_trim.kraken_human
+    Float kraken_sc2 = read_QC_trim.kraken_sc2
+    String kraken_report = read_QC_trim.kraken_report
+    Float kraken_human_dehosted = read_QC_trim.kraken_human_dehosted
+    Float kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted
+    String kraken_report_dehosted = read_QC_trim.kraken_report_dehosted
+
+    String bwa_version = bwa.bwa_version
+    String samtools_version = bwa.sam_version
alignment_method = "~{bwa.bwa_version}; ~{primer_trim.ivar_version}" + + File aligned_bam = primer_trim.trim_sorted_bam + File aligned_bai = primer_trim.trim_sorted_bai + Float primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent + String ivar_version_primtrim = primer_trim.ivar_version + String samtools_version_primtrim = primer_trim.samtools_version + String primer_bed_name = primer_trim.primer_bed_name + + File freyja_variants = freyja.freyja_variants + File freyja_depths = freyja.freyja_depths + File freyja_demixed = freyja.freyja_demixed + String freyja_barcode_version = freyja.freyja_barcode_version + } +} \ No newline at end of file diff --git a/workflows/wf_freyja_plot.wdl b/workflows/wf_freyja_plot.wdl new file mode 100644 index 00000000..6037b9e2 --- /dev/null +++ b/workflows/wf_freyja_plot.wdl @@ -0,0 +1,112 @@ +version 1.0 + +import "../tasks/task_versioning.wdl" as versioning + +workflow freyja_plot { + input { + Array[String] samplename + Array[File] freyja_demixed + Array[String]? collection_date + String freyja_plot_name + } + call freyja_plot_task { + input: + samplename = samplename, + freyja_demixed = freyja_demixed, + collection_date = collection_date, + freyja_plot_name = freyja_plot_name + } + call versioning.version_capture{ + input: + } + output { + String freyja_plot_wf_version = version_capture.phvg_version + String freyja_plot_wf_analysis_date = version_capture.date + + File freyja_plot = freyja_plot_task.freyja_plot + File freyja_demixed_aggregate = freyja_plot_task.demixed_aggregate + File? freyja_plot_metadata = freyja_plot_task.freyja_plot_metadata + } +} + +task freyja_plot_task { + input { + Array[String] samplename + Array[File] freyja_demixed + Array[String]? collection_date + Boolean plot_lineages=false + Boolean plot_time=false + String plot_time_interval="MS" + Int plot_day_window=14 + String freyja_plot_name + String docker = "staphb/freyja:1.2" + } + command <<< + freyja_demixed_array="~{sep=' ' freyja_demixed}" + samplename_array=(~{sep=' ' samplename}) + samplename_array_len=$(echo "${#samplename_array[@]}") + + if ~{plot_time}; then + # create timedate metadata sheet + collection_date_array=(~{sep=' ' collection_date}) + collection_date_array_len=$(echo "${#collection_date_array[@]}") + + if [ "$samplename_array_len" -ne "$collection_date_array_len" ]; then + echo "ERROR: Missing collection date. Samplename array (length: $samplename_array_len) and collection date array (length: $collection_date_array_len) are of unequal length." >&2 + exit 1 + else + echo "Samplename array (length: $samplename_array_len) and collection date array (length: $collection_date_array_len) are of equal length." >&2. + fi + + echo "Sample,sample_collection_datetime" > freyja_times_metadata.csv + + for index in ${!samplename_array[@]}; do + samplename=${samplename_array[$index]} + collection_date=${collection_date_array[$index]} + echo "${samplename},${collection_date}" >> freyja_times_metadata.csv + done + + plot_options="--times freyja_times_metadata.csv" + + if [ ~{plot_time_interval} == "D" ]; then + plot_options="${plot_options} --interval D --windowsize ~{plot_day_window}" + elif [ ~{plot_time_interval} == "MS" ]; then + plot_options="${plot_options} --interval MS" + else + echo "ERROR: plot time interval value (~{plot_time_interval}) not recognized. 
+        echo "ERROR: plot time interval value (~{plot_time_interval}) not recognized. Must be either \"D\" (days) or \"MS\" (months)" >&2
+        exit 1
+      fi
+    fi
+
+    # move all demixed outputs into a single directory and aggregate files
+    mkdir ./demixed_files/
+    echo "mv ${freyja_demixed_array[@]} demixed_files/"
+    mv ${freyja_demixed_array[@]} ./demixed_files/
+
+    freyja aggregate \
+      ./demixed_files/ \
+      --output demixed_aggregate.tsv
+
+    # create freyja plot
+    echo "Running: freyja plot demixed_aggregate.tsv --output ~{freyja_plot_name}.pdf ${plot_options}"
+    freyja plot \
+      ~{true='--lineages' false='' plot_lineages} \
+      demixed_aggregate.tsv \
+      --output ~{freyja_plot_name}.pdf \
+      ${plot_options}
+  >>>
+  output {
+    File freyja_plot = "~{freyja_plot_name}.pdf"
+    File demixed_aggregate = "demixed_aggregate.tsv"
+    File? freyja_plot_metadata = "freyja_times_metadata.csv"
+  }
+  runtime {
+    memory: "4 GB"
+    cpu: 2
+    docker: "~{docker}"
+    disks: "local-disk 100 HDD"
+  }
+}
\ No newline at end of file
diff --git a/workflows/wf_titan_wwvc.wdl b/workflows/wf_titan_wwvc.wdl
new file mode 100644
index 00000000..4f93ed64
--- /dev/null
+++ b/workflows/wf_titan_wwvc.wdl
@@ -0,0 +1,81 @@
+version 1.0
+
+import "wf_read_QC_trim.wdl" as read_qc
+import "../tasks/task_alignment.wdl" as align
+import "../tasks/task_consensus_call.wdl" as consensus_call
+import "../tasks/task_versioning.wdl" as versioning
+import "../workflows/WasteWaterVariantCalling_modified.wdl" as wastewater
+
+workflow titan_illumina_wwvc {
+  meta {
+    description: "Reference-based variant calling of viral amplicon sequencing data from wastewater samples"
+  }
+  input {
+    Array[String] samplename
+    Array[File] read1_raw
+    Array[File] read2_raw
+    File primer_bed
+    File reference_genome
+    File spike_bed
+    File spike_annotations
+    Int trimmomatic_minlen = 25
+  }
+  scatter (r1_r2 in zip(read1_raw, read2_raw)) {
+    call read_qc.read_QC_trim {
+      input:
+        samplename = "wastewater_sample",
+        read1_raw = r1_r2.left,
+        read2_raw = r1_r2.right,
+        trimmomatic_minlen = trimmomatic_minlen
+    }
+    call align.bwa {
+      input:
+        samplename = "wastewater_sample",
+        read1 = read_QC_trim.read1_clean,
+        read2 = read_QC_trim.read2_clean
+    }
+    call consensus_call.primer_trim {
+      input:
+        samplename = "wastewater_sample",
+        primer_bed = primer_bed,
+        bamfile = bwa.sorted_bam
+    }
+  }
+  call wastewater.WasteWaterVariantCalling {
+    input:
+      sorted_bam = primer_trim.trim_sorted_bam,
+      covid_genome = reference_genome,
+      spike_bed = spike_bed,
+      spike_annotations = spike_annotations,
+      sample_id = samplename
+  }
+  call versioning.version_capture {
+    input:
+  }
+  output {
+    String titan_wwvc_version = version_capture.phvg_version
+    String titan_wwvc_date = version_capture.date
+
+    Array[File] addrg_bam = WasteWaterVariantCalling.addrg_bam
+    Array[File] variants = WasteWaterVariantCalling.variants
+    Array[File] sorted_vcf = WasteWaterVariantCalling.sorted_vcf
+    Array[File] sample_spike_vcf = WasteWaterVariantCalling.sample_spike_vcf
+    Array[File] sample_spike_tsv = WasteWaterVariantCalling.sample_spike_tsv
+    Array[File] sample_spike_tsv_summary = WasteWaterVariantCalling.sample_spike_tsv_summary
+    Array[File] sample_spike_tsv_dash = WasteWaterVariantCalling.sample_spike_tsv_dash
+    Array[File] fill_NA_tsv = WasteWaterVariantCalling.fill_NA_tsv
+    Array[File] allele_freq_tsv = WasteWaterVariantCalling.allele_freq_tsv
+    Array[File] reformat_tsv_tsv = WasteWaterVariantCalling.reformat_tsv_tsv
+    Array[File] sample_spike_tsv_counts = WasteWaterVariantCalling.sample_spike_tsv_counts
+    Array[File] alignment_files = primer_trim.trim_sorted_bam
+    File spike_summary_temp = WasteWaterVariantCalling.spike_summary_temp
+    File spike_summary = WasteWaterVariantCalling.spike_summary
+    File spike_dashboard = WasteWaterVariantCalling.spike_dashboard
+    File spike_counts = WasteWaterVariantCalling.spike_counts
+  }
+}
\ No newline at end of file