diff --git a/.dockstore.yml b/.dockstore.yml index b27670c7..a86612e4 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -1,23 +1,23 @@ version: 1.2 workflows: - - name: Titan_ClearLabs + - name: TheiaCoV_ClearLabs subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_clearlabs.wdl + primaryDescriptorPath: /workflows/wf_theiacov_clearlabs.wdl testParameterFiles: - empty.json - - name: Titan_ONT + - name: TheiaCoV_ONT subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_ont.wdl + primaryDescriptorPath: /workflows/wf_theiacov_ont.wdl testParameterFiles: - empty.json - - name: Titan_Illumina_PE + - name: TheiaCoV_Illumina_PE subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_illumina_pe.wdl + primaryDescriptorPath: /workflows/wf_theiacov_illumina_pe.wdl testParameterFiles: - empty.json - - name: Titan_Illumina_SE + - name: TheiaCoV_Illumina_SE subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_illumina_se.wdl + primaryDescriptorPath: /workflows/wf_theiacov_illumina_se.wdl testParameterFiles: - empty.json - name: Mercury_PE_Prep @@ -35,14 +35,14 @@ workflows: primaryDescriptorPath: /workflows/wf_mercury_batch.wdl testParameterFiles: - empty.json - - name: Titan_Augur_Prep + - name: TheiaCoV_Augur_Prep subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_augur_prep.wdl + primaryDescriptorPath: /workflows/wf_theiacov_augur_prep.wdl testParameterFiles: - empty.json - - name: Titan_Augur_Run + - name: TheiaCoV_Augur_Run subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_augur_run.wdl + primaryDescriptorPath: /workflows/wf_theiacov_augur_run.wdl testParameterFiles: - empty.json - name: Pangolin_Update @@ -65,14 +65,14 @@ workflows: primaryDescriptorPath: /workflows/wf_ncbi_scrub_pe.wdl testParameterFiles: - empty.json - - name: Titan_FASTA + - name: TheiaCoV_FASTA subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_fasta.wdl + primaryDescriptorPath: /workflows/wf_theiacov_fasta.wdl testParameterFiles: - empty.json - - name: Titan_WWVC + - name: TheiaCoV_WWVC subclass: WDL - primaryDescriptorPath: /workflows/wf_titan_wwvc.wdl + primaryDescriptorPath: /workflows/wf_theiacov_wwvc.wdl testParameterFiles: - empty.json - name: Freyja_FASTQ diff --git a/.github/workflows/theiacov-gc.yml b/.github/workflows/theiacov-gc.yml new file mode 100644 index 00000000..69d41e9b --- /dev/null +++ b/.github/workflows/theiacov-gc.yml @@ -0,0 +1,58 @@ +name: theiacov-gc +on: [push, pull_request] + +jobs: + theiacov-gc: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + wf: ['clearlabs', 'illumina_pe', 'illumina_se', 'ont'] + defaults: + run: + shell: bash -l {0} + steps: + - name: Checkout PHVG + uses: actions/checkout@v2 + + - name: Free up Disk Space + run: bash ${GITHUB_WORKSPACE}/.github/helpers/free-disk-space.sh + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: theiacov-gc + auto-activate-base: false + + - name: Setup TheiaCoV CI Environment + run: | + conda install -y -c conda-forge -c bioconda cromwell 'python>=3.7' pytest pytest-workflow wget + chmod 755 bin/* + cp bin/* ${CONDA_PREFIX}/bin + THEIACOV_GC_VERSION=$(grep "PHVG_Version=" tasks/task_versioning.wdl | sed -E 's/.*="PHVG v(.*)"/\1/') + echo "THEIACOV_GC_VERSION=${THEIACOV_GC_VERSION}" >> $GITHUB_ENV + THEIACOV_SHARE="${CONDA_PREFIX}/share/theiacov-gc-${THEIACOV_GC_VERSION}" + mkdir -p ${THEIACOV_SHARE} + mv conf/ tasks/ workflows/ ${THEIACOV_SHARE} + + - name: Environment Information + run: uname -a && env && theiacov-gc -h + + - 
name: Test TheiaCoV-GC Workflows + run: | + mkdir -p theiacov/${{ matrix.wf }} + theiacov-gc-prepare.py tests/data/fastqs/${{ matrix.wf }} ${{ matrix.wf }} tests/data/primers/artic-v3.primers.bed > theiacov/${{ matrix.wf }}.json + TMPDIR=~ pytest --symlink --kwdof --tag theiacov_${{ matrix.wf }} + rm -rf theiacov/${{ matrix.wf }} + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v2 + with: + name: logs-${{ matrix.wf }} + path: | + /home/runner/pytest_workflow_*/*/theiacov/ + /home/runner/pytest_workflow_*/*/log.out + /home/runner/pytest_workflow_*/*/log.err + !/home/runner/pytest_workflow_*/*/theiacov/*/alignments/*.bam* + !/home/runner/pytest_workflow_*/*/theiacov/*/dehosted_reads/*.fastq.gz diff --git a/.github/workflows/titan-gc.yml b/.github/workflows/titan-gc.yml deleted file mode 100644 index 31ba0bc8..00000000 --- a/.github/workflows/titan-gc.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: titan-gc -on: [push, pull_request] - -jobs: - titan-gc: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - wf: ['clearlabs', 'illumina_pe', 'illumina_se', 'ont'] - defaults: - run: - shell: bash -l {0} - steps: - - name: Checkout PHVG - uses: actions/checkout@v2 - - - name: Free up Disk Space - run: bash ${GITHUB_WORKSPACE}/.github/helpers/free-disk-space.sh - - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - activate-environment: titan-gc - auto-activate-base: false - - - name: Setup Titan CI Environment - run: | - conda install -y -c conda-forge -c bioconda cromwell 'python>=3.7' pytest pytest-workflow wget - chmod 755 bin/* - cp bin/* ${CONDA_PREFIX}/bin - TITAN_GC_VERSION=$(grep "PHVG_Version=" tasks/task_versioning.wdl | sed -E 's/.*="PHVG v(.*)"/\1/') - echo "TITAN_GC_VERSION=${TITAN_GC_VERSION}" >> $GITHUB_ENV - TITAN_SHARE="${CONDA_PREFIX}/share/titan-gc-${TITAN_GC_VERSION}" - mkdir -p ${TITAN_SHARE} - mv conf/ tasks/ workflows/ ${TITAN_SHARE} - - - name: Environment Information - run: uname -a && env && titan-gc -h - - - name: Test Titan-GC Workflows - run: | - mkdir -p titan/${{ matrix.wf }} - titan-gc-prepare.py tests/data/fastqs/${{ matrix.wf }} ${{ matrix.wf }} tests/data/primers/artic-v3.primers.bed > titan/${{ matrix.wf }}.json - TMPDIR=~ pytest --symlink --kwdof --tag titan_${{ matrix.wf }} - rm -rf titan/${{ matrix.wf }} - - - name: Upload logs on failure - if: failure() - uses: actions/upload-artifact@v2 - with: - name: logs-${{ matrix.wf }} - path: | - /home/runner/pytest_workflow_*/*/titan/ - /home/runner/pytest_workflow_*/*/log.out - /home/runner/pytest_workflow_*/*/log.err - !/home/runner/pytest_workflow_*/*/titan/*/alignments/*.bam* - !/home/runner/pytest_workflow_*/*/titan/*/dehosted_reads/*.fastq.gz diff --git a/README.md b/README.md index 41ee00b2..03e8001f 100644 --- a/README.md +++ b/README.md @@ -4,5 +4,72 @@ Bioinformatics workflows for genomic characterization, submission preparation, a ### Contributors & Influence * Based on collaborative work with Andrew Lang, PhD & his [Genomic Analysis WDL workflows](https://github.com/AndrewLangvt/genomic_analyses) * Workflows and task development influenced by The Broad's [Viral Pipes](https://github.com/broadinstitute/viral-pipelines) -* Titan workflows for genomic characterization influenced by UPHL's [Cecret](https://github.com/UPHL-BioNGS/Cecret) & StaPH-B's [Monroe](https://staph-b.github.io/staphb_toolkit/workflow_docs/monroe/) -* The Titan workflow for waste water variant calling (Titan_WWVC) incorporates a modified version of the [CDPHE's 
WasteWaterVariantCalling WDL Worfklow](https://github.com/CDPHE/WasteWaterVariantCalling).
+* TheiaCoV workflows for genomic characterization influenced by UPHL's [Cecret](https://github.com/UPHL-BioNGS/Cecret) & StaPH-B's [Monroe](https://staph-b.github.io/staphb_toolkit/workflow_docs/monroe/)
+* The TheiaCoV workflow for waste water variant calling (TheiaCoV_WWVC) incorporates a modified version of the [CDPHE's WasteWaterVariantCalling WDL Workflow](https://github.com/CDPHE/WasteWaterVariantCalling).
+
+### Repository Style Guide
+2-space indents (no tabs), braces on the same line, single space when defining input/output variables & runtime attributes, single-line breaks between non-indented constructs, and task commands enclosed with triple angle brackets (`<<< ... >>>`).
+
+E.g.:
+```
+workflow w {
+  input {
+    String input_str
+  }
+  call task_01 {
+    input:
+      input_str = input_str
+  }
+  call task_02 {
+    input:
+      input_str = input_str
+  }
+  output {
+    File task_01_out = task_01.outfile
+    File task_02_out = task_02.outfile
+  }
+}
+
+task task_01 {
+  input {
+    String input_str
+    String docker = "theiagen/utility:1.1"
+  }
+  command <<<
+    echo '~{input_str}' > output.txt
+  >>>
+  output {
+    File outfile = "output.txt"
+  }
+  runtime {
+    docker: docker
+    memory: "8 GB"
+    cpu: 2
+    disks: "local-disk 100 SSD"
+    preemptible: 0
+    maxRetries: 0
+  }
+}
+
+task task_02 {
+  input {
+    String input_str
+    String docker = "theiagen/utility:1.1"
+  }
+  command <<<
+    echo '~{input_str}' > output.txt
+  >>>
+  output {
+    File outfile = "output.txt"
+  }
+  runtime {
+    docker: docker
+    memory: "8 GB"
+    cpu: 2
+    disks: "local-disk 100 SSD"
+    preemptible: 0
+    maxRetries: 0
+  }
+}
+```
+Style guide inspired by [scottfrazer](https://gist.github.com/scottfrazer)'s [WDL Best Practices Style Guide](https://gist.github.com/scottfrazer/aa4ab1945a6a4c331211)
diff --git a/bin/titan-gc b/bin/theiacov-gc
similarity index 64%
rename from bin/titan-gc
rename to bin/theiacov-gc
index 54f42e1e..2c664ead 100755
--- a/bin/titan-gc
+++ b/bin/theiacov-gc
@@ -1,7 +1,7 @@
 #! /bin/bash
-# usage: titan-gc [-h] [-i STR] [--inputs STR] [-o STR] [--outdir STR] [--options STR] [--verbose]
+# usage: theiacov-gc [-h] [-i STR] [--inputs STR] [-o STR] [--outdir STR] [--options STR] [--verbose]
 #
-# titan-gc - Run Titan GC on a set of samples.
+# theiacov-gc - Run TheiaCoV GC on a set of samples.
 #
 # required arguments:
 #   -i STR, --inputs STR  The JSON file to be used with Cromwell for inputs.
@@ -11,29 +11,29 @@ # -h, --help show this help message and exit # # --options STR JSON file containing Cromwell options -# --verbose Print out all STDOUT from Cromwell and titan-organize +# --verbose Print out all STDOUT from Cromwell and theiacov-organize set -e set -u OPTIONS="0" QUIET="0" PROFILE="docker" CONFIG="0" -TITAN_PATH=$(which titan-gc | sed 's=bin/titan-gc==') -TITAN_SHARE=${TITAN_PATH}/share/titan-gc-${TITAN_GC_VERSION} +THEIACOV_PATH=$(which theiacov-gc | sed 's=bin/theiacov-gc==') +THEIACOV_SHARE=${THEIACOV_PATH}/share/theiacov-gc-${THEIACOV_GC_VERSION} CROMWELL_JAR=$(which cromwell | sed 's=bin/cromwell=share/cromwell/cromwell.jar=') LOG_LEVEL=ERROR -SINGULARITY_CACHE="${HOME}/.singularity/titan-cache" -CROMWELL_OPTS="${TITAN_SHARE}/conf/options.json" +SINGULARITY_CACHE="${HOME}/.singularity/theiacov-cache" +CROMWELL_OPTS="${THEIACOV_SHARE}/conf/options.json" version() { - echo "titan-gc ${TITAN_GC_VERSION}" + echo "theiacov-gc ${THEIACOV_GC_VERSION}" exit 0 } usage() { - echo "usage: titan-gc [-h] [-i STR] [--inputs STR] [-o STR] [--outdir STR] [--options STR] [--quiet]" + echo "usage: theiacov-gc [-h] [-i STR] [--inputs STR] [-o STR] [--outdir STR] [--options STR] [--quiet]" echo "" - echo "titan-gc - Run Titan GC on a set of samples." + echo "theiacov-gc - Run TheiaCoV GC on a set of samples." echo "" echo "required arguments:" echo " -i STR, --inputs STR The JSON file to be used with Cromwell for inputs." @@ -46,7 +46,7 @@ usage() { echo " --profile STR The backend profile to use [options: docker, singularity]" echo " --config STR Custom backend profile to use" echo " --cromwell_jar STR Path to cromwell.jar (Default use conda install)" - echo " --quiet Silence all STDOUT from Cromwell and titan-gc-organize" + echo " --quiet Silence all STDOUT from Cromwell and theiacov-gc-organize" if [ -n "$1" ]; then exit "$1" @@ -83,9 +83,9 @@ if [[ "${CONFIG}" == "0" ]]; then # Use built in config if [[ "${PROFILE}" == "docker" ]]; then # Default - CONFIG_PATH="${TITAN_SHARE}/conf/docker.config" + CONFIG_PATH="${THEIACOV_SHARE}/conf/docker.config" elif [[ "${PROFILE}" == "singularity" ]]; then - CONFIG_PATH="${TITAN_SHARE}/conf/singularity.config" + CONFIG_PATH="${THEIACOV_SHARE}/conf/singularity.config" else echo "Uknown profile: ${PROFILE}, exiting..." 
        usage 1
@@ -97,23 +97,23 @@ if [[ "${OPTIONS}" != "0" ]]; then
 fi

 mkdir -p ${OUTDIR}
-echo "Running Titan GC (use --quiet to quiet things down a bit)" 1>&2
+echo "Running TheiaCoV GC (use --quiet to quiet things down a bit)" 1>&2
 if [[ ${QUIET} == "0" ]]; then
     java -Dconfig.file=${CONFIG_PATH} -jar ${CROMWELL_JAR} run \
         -i ${INPUTS} \
-        -m ${OUTDIR}/titan-metadata.json \
+        -m ${OUTDIR}/theiacov-metadata.json \
         -o ${CROMWELL_OPTS} \
-        ${TITAN_SHARE}/workflows/wf_titan_gc.wdl 2> ${OUTDIR}/cromwell-stderr.txt | tee ${OUTDIR}/cromwell-stdout.txt
+        ${THEIACOV_SHARE}/workflows/wf_theiacov_gc.wdl 2> ${OUTDIR}/cromwell-stderr.txt | tee ${OUTDIR}/cromwell-stdout.txt
 else
     java -Dconfig.file=${CONFIG_PATH} -jar ${CROMWELL_JAR} run \
         -i ${INPUTS} \
-        -m ${OUTDIR}/titan-metadata.json \
-        -o ${CROMWELL_OPTS} ${TITAN_SHARE}/workflows/wf_titan_gc.wdl > ${OUTDIR}/cromwell-stdout.txt 2> ${OUTDIR}/cromwell-stderr.txt
+        -m ${OUTDIR}/theiacov-metadata.json \
+        -o ${CROMWELL_OPTS} ${THEIACOV_SHARE}/workflows/wf_theiacov_gc.wdl > ${OUTDIR}/cromwell-stdout.txt 2> ${OUTDIR}/cromwell-stderr.txt
 fi

-if [[ -f "${OUTDIR}/titan-metadata.json" ]]; then
-    echo "Titan GC complete, organizing outputs" 1>&2
-    titan-gc-organize.py ${OUTDIR}/titan-metadata.json --outdir ${OUTDIR}
+if [[ -f "${OUTDIR}/theiacov-metadata.json" ]]; then
+    echo "TheiaCoV GC complete, organizing outputs" 1>&2
+    theiacov-gc-organize.py ${OUTDIR}/theiacov-metadata.json --outdir ${OUTDIR}
 else
-    echo "Titan GC did not complete sucessfully, please review the logs (${OUTDIR}/cromwell-std{err|out}.txt)" 1>&2
+    echo "TheiaCoV GC did not complete successfully, please review the logs (${OUTDIR}/cromwell-std{err|out}.txt)" 1>&2
 fi
diff --git a/bin/titan-gc-organize.py b/bin/theiacov-gc-organize.py
similarity index 76%
rename from bin/titan-gc-organize.py
rename to bin/theiacov-gc-organize.py
index b5dd890b..28136a82 100755
--- a/bin/titan-gc-organize.py
+++ b/bin/theiacov-gc-organize.py
@@ -1,18 +1,18 @@
 #! /usr/bin/env python3
 """
-usage: titan-gc-organize [-h] [--outdir STR] [--debug] METADATA_JSON
+usage: theiacov-gc-organize [-h] [--outdir STR] [--debug] METADATA_JSON

-titan-gc-organize- Read Cromwell metadata to organize files into readable structure
+theiacov-gc-organize- Read Cromwell metadata to organize files into readable structure

 positional arguments:
-  METADATA_JSON  The metadata.json output (-m) from the Titan GC run.
+  METADATA_JSON  The metadata.json output (-m) from the TheiaCoV GC run.

 optional arguments:
   -h, --help     show this help message and exit
-  --outdir STR   Directory to copy files to. (Default: ./titan-gc).
+  --outdir STR   Directory to copy files to. (Default: ./theiacov-gc).
--debug Print helpful information """ -# Known file extensions from Titan-GC outputs +# Known file extensions from TheiaCoV-GC outputs OUTPUTS = { 'aligned_bam': {'ext': ['.primertrim.sorted.bam', '.primertrimmed.rg.sorted.bam'], 'folder': 'alignments'}, 'aligned_bai': {'ext': ['.primertrim.sorted.bam.bai', '.primertrimmed.rg.sorted.bam.bai'], 'folder': 'alignments'}, @@ -36,7 +36,7 @@ def mkdir(path): from pathlib import Path Path(path).mkdir(parents=True, exist_ok=True) -def read_titan_results(tsv, is_json=False): +def read_theiacov_results(tsv, is_json=False): results = {} with open(tsv, 'rt') as tsv_fh: for line in tsv_fh: @@ -60,16 +60,16 @@ def read_titan_results(tsv, is_json=False): import sys parser = ap.ArgumentParser( - prog='titan-gc-organize', + prog='theiacov-gc-organize', conflict_handler='resolve', description=( - f'titan-gc-organize- Read Cromwell metadata to organize files into readable structure' + f'theiacov-gc-organize- Read Cromwell metadata to organize files into readable structure' ) ) parser.add_argument('metadata', metavar="METADATA_JSON", type=str, - help='The metadata.json output (-m) from the Titan GC run.') - parser.add_argument('--outdir', metavar='STR', type=str, default="./titan-gc", - help='Directory to copy files to. (Default: ./titan-gc).') + help='The metadata.json output (-m) from the TheiaCoV GC run.') + parser.add_argument('--outdir', metavar='STR', type=str, default="./theiacov-gc", + help='Directory to copy files to. (Default: ./theiacov-gc).') parser.add_argument('--debug', action='store_true', help='Print helpful information') if len(sys.argv) == 1: parser.print_help() @@ -105,41 +105,41 @@ def read_titan_results(tsv, is_json=False): """ Start moving files: metadata["outputs"] Output keys: - "titan_gc.reads_dehosted" - "titan_gc.kraken_report" - "titan_gc.pango_lineage_report" - "titan_gc.nextclade_json" - "titan_gc.consensus_flagstat" - "titan_gc.kraken_report_dehosted" - "titan_gc.vadr_alerts_list" - "titan_gc.aligned_bam" - "titan_gc.assembly_fasta" - "titan_gc.nextclade_tsv" - "titan_gc.consensus_stats" - "titan_gc.auspice_json" - "titan_gc.aligned_bai" - "titan_gc.json_summary" - - The merged summary is under: "titan_gc.summaries_tsv" and "titan_gc.summaries_json" + "theiacov_gc.reads_dehosted" + "theiacov_gc.kraken_report" + "theiacov_gc.pango_lineage_report" + "theiacov_gc.nextclade_json" + "theiacov_gc.consensus_flagstat" + "theiacov_gc.kraken_report_dehosted" + "theiacov_gc.vadr_alerts_list" + "theiacov_gc.aligned_bam" + "theiacov_gc.assembly_fasta" + "theiacov_gc.nextclade_tsv" + "theiacov_gc.consensus_stats" + "theiacov_gc.auspice_json" + "theiacov_gc.aligned_bai" + "theiacov_gc.json_summary" + + The merged summary is under: "theiacov_gc.summaries_tsv" and "theiacov_gc.summaries_json" Output files should start with the sample name (sample01.file or sample01_file) """ mkdir(f"{args.outdir}") - titan_results = None - titan_json = None + theiacov_results = None + theiacov_json = None for key, outputs in metadata["outputs"].items(): - task_name = key.replace("titan_gc.", "") + task_name = key.replace("theiacov_gc.", "") if args.debug: print(f"Working on {task_name} outputs", file=sys.stderr) if task_name == "summaries_tsv": if args.debug: - print(f"Copying {outputs} to {args.outdir}/complete-titan-results.tsv", file=sys.stderr) - copy2(outputs, f"{args.outdir}/titan-results.tsv") - titan_results = read_titan_results(outputs) + print(f"Copying {outputs} to {args.outdir}/complete-theiacov-results.tsv", file=sys.stderr) + copy2(outputs, 
f"{args.outdir}/theiacov-results.tsv") + theiacov_results = read_theiacov_results(outputs) elif task_name == "summaries_json": if args.debug: - print(f"Copying {outputs} to {args.outdir}/complete-titan-results.json", file=sys.stderr) - copy2(outputs, f"{args.outdir}/titan-results.json") - titan_json = read_titan_results(outputs, is_json=True) + print(f"Copying {outputs} to {args.outdir}/complete-theiacov-results.json", file=sys.stderr) + copy2(outputs, f"{args.outdir}/theiacov-results.json") + theiacov_json = read_theiacov_results(outputs, is_json=True) else: for output in outputs: samplename = os.path.basename(output) diff --git a/bin/titan-gc-prepare.py b/bin/theiacov-gc-prepare.py similarity index 85% rename from bin/titan-gc-prepare.py rename to bin/theiacov-gc-prepare.py index bdda36f8..572bfa21 100755 --- a/bin/titan-gc-prepare.py +++ b/bin/theiacov-gc-prepare.py @@ -1,17 +1,17 @@ #! /usr/bin/env python3 """ -usage: titan-gc-prepare [-h] [-f STR] [--fastq_separator STR] [--fastq_pattern STR] [--pe1_pattern STR] [--pe2_pattern STR] +usage: theiacov-gc-prepare [-h] [-f STR] [--fastq_separator STR] [--fastq_pattern STR] [--pe1_pattern STR] [--pe2_pattern STR] [-r] [--prefix STR] [--tsv] [--pangolin_docker STR] [--clearlabs_normalise INT] [--ont_normalise INT] [--seq_method STR] FASTQ_PATH WORKFLOW PRIMER -titan-gc-prepare - Read a directory and prepare a JSON for input to Titan GC +theiacov-gc-prepare - Read a directory and prepare a JSON for input to TheiaCoV GC optional arguments: -h, --help show this help message and exit -Titan-GC Prepare Parameters: +TheiaCoV-GC Prepare Parameters: FASTQ_PATH Directory where FASTQ files are stored - WORKFLOW The TItan-GC workflow to use for anlaysis. Options: clearlabs, illumina_pe, illumina_se, ont + WORKFLOW The TheiaCoV-GC workflow to use for anlaysis. Options: clearlabs, illumina_pe, illumina_se, ont PRIMERS A file containing primers (bed format) used during sequencing. -f STR, --fastq_ext STR Extension of the FASTQs. Default: .fastq.gz @@ -24,7 +24,7 @@ --prefix STR Replace the absolute path with a given string. Default: Use absolute path --tsv Output FOFN as a TSV (Default JSON) -Optional Titan-GC Workflow Parameters: +Optional TheiaCoV-GC Workflow Parameters: --pangolin_docker STR Docker image used to run Pangolin --clearlabs_normalise INT @@ -60,18 +60,18 @@ def get_path(fastq, abspath, prefix): import sys parser = ap.ArgumentParser( - prog='titan-gc-prepare', + prog='theiacov-gc-prepare', conflict_handler='resolve', description=( - f'titan-gc-prepare - Read a directory and prepare a JSON for input to Titan GC' + f'theiacov-gc-prepare - Read a directory and prepare a JSON for input to TheiaCoV GC' ) ) - group1 = parser.add_argument_group('Titan-GC Prepare Parameters') + group1 = parser.add_argument_group('TheiaCoV-GC Prepare Parameters') group1.add_argument('path', metavar="FASTQ_PATH", type=str, help='Directory where FASTQ files are stored') group1.add_argument( 'workflow', metavar='WORKFLOW', type=str, choices=['clearlabs', 'illumina_pe', 'illumina_se', 'ont'], - help='The Titan-GC workflow to use for analysis. Options: clearlabs, illumina_pe, illumina_se, ont' + help='The TheiaCoV-GC workflow to use for analysis. 
Options: clearlabs, illumina_pe, illumina_se, ont' ) group1.add_argument( 'primers', metavar='PRIMER', type=str, default="", @@ -107,7 +107,7 @@ def get_path(fastq, abspath, prefix): ) group1.add_argument('--tsv', action='store_true', help='Output FOFN as a TSV (Default JSON)') - group2 = parser.add_argument_group('Optional Titan-GC Workflow Parameters') + group2 = parser.add_argument_group('Optional TheiaCoV-GC Workflow Parameters') group2.add_argument( '--pangolin_docker', metavar='STR', type=str, help='Docker image used to run Pangolin (takes priority over --params)' @@ -123,9 +123,9 @@ def get_path(fastq, abspath, prefix): args = parser.parse_args() abspath = os.path.abspath(args.path) SAMPLES = {} - EMPTY_FASTQ = f"{str(Path.home())}/.titan/EMPTY.fastq.gz" + EMPTY_FASTQ = f"{str(Path.home())}/.theiacov/EMPTY.fastq.gz" if not os.path.exists(EMPTY_FASTQ): - Path(f"{str(Path.home())}/.titan").mkdir(parents=True, exist_ok=True) + Path(f"{str(Path.home())}/.theiacov").mkdir(parents=True, exist_ok=True) with open(EMPTY_FASTQ, 'a'): pass @@ -206,7 +206,7 @@ def get_path(fastq, abspath, prefix): FOFN.append({ 'sample': sample, - 'titan_wf': args.workflow, + 'theiacov_wf': args.workflow, 'r1': r1, 'r2': r2, 'primers': get_path(Path(args.primers), abspath, args.prefix) @@ -217,12 +217,12 @@ def get_path(fastq, abspath, prefix): needs_header = True for f in FOFN: if needs_header: - print("\t".join(['sample', 'titan_wf', 'r1', 'r2', 'primers'])) + print("\t".join(['sample', 'theiacov_wf', 'r1', 'r2', 'primers'])) needs_header = False - print("\t".join([f['sample'], f['titan_wf'], f['r1'], f['r2'], f['primers']])) + print("\t".join([f['sample'], f['theiacov_wf'], f['r1'], f['r2'], f['primers']])) else: inputs_json = { - "titan_gc.samples": FOFN + "theiacov_gc.samples": FOFN } params_json = {} @@ -233,10 +233,10 @@ def get_path(fastq, abspath, prefix): params_json = json.load(json_fh) if args.pangolin_docker: - params_json['titan_gc.titan_clearlabs.pangolin3.pangolin_docker_image'] = args.pangolin_docker - params_json['titan_gc.titan_illumina_pe.pangolin3.pangolin_docker_image'] = args.pangolin_docker - params_json['titan_gc.titan_illumina_se.pangolin3.pangolin_docker_image'] = args.pangolin_docker - params_json['titan_gc.titan_ont.pangolin3.pangolin_docker_image'] = args.pangolin_docker + params_json['theiacov_gc.theiacov_clearlabs.pangolin3.pangolin_docker_image'] = args.pangolin_docker + params_json['theiacov_gc.theiacov_illumina_pe.pangolin3.pangolin_docker_image'] = args.pangolin_docker + params_json['theiacov_gc.theiacov_illumina_se.pangolin3.pangolin_docker_image'] = args.pangolin_docker + params_json['theiacov_gc.theiacov_ont.pangolin3.pangolin_docker_image'] = args.pangolin_docker print(json.dumps({**inputs_json, **params_json}, indent = 4)) else: diff --git a/conf/params.json b/conf/params.json index e7cc0c2a..80872af2 100644 --- a/conf/params.json +++ b/conf/params.json @@ -1,131 +1,131 @@ { - "titan_gc.pangolin_docker_image": "staphb/pangolin:3.1.3-pangolearn-2021-06-15", + "theiacov_gc.pangolin_docker_image": "staphb/pangolin:3.1.3-pangolearn-2021-06-15", - "titan_gc.titan_clearlabs.nextclade_one_sample.docker": "nextstrain/nextclade:0.14.4", - "titan_gc.titan_illumina_pe.nextclade_one_sample.docker": "nextstrain/nextclade:0.14.4", - "titan_gc.titan_illumina_se.nextclade_one_sample.docker": "nextstrain/nextclade:0.14.4", - "titan_gc.titan_ont.nextclade_one_sample.docker": "nextstrain/nextclade:0.14.4", + "theiacov_gc.theiacov_clearlabs.nextclade_one_sample.docker": 
"nextstrain/nextclade:0.14.4", + "theiacov_gc.theiacov_illumina_pe.nextclade_one_sample.docker": "nextstrain/nextclade:0.14.4", + "theiacov_gc.theiacov_illumina_se.nextclade_one_sample.docker": "nextstrain/nextclade:0.14.4", + "theiacov_gc.theiacov_ont.nextclade_one_sample.docker": "nextstrain/nextclade:0.14.4", - "titan_gc.titan_clearlabs.ncbi_scrub_se.docker": "ncbi/sra-human-scrubber:1.0.2021-05-05", - "titan_gc.titan_illumina_pe.read_QC_trim.ncbi_scrub_pe.docker": "ncbi/sra-human-scrubber:1.0.2021-05-05", - "titan_gc.titan_ont.ncbi_scrub_se.docker": "ncbi/sra-human-scrubber:1.0.2021-05-05", + "theiacov_gc.theiacov_clearlabs.ncbi_scrub_se.docker": "ncbi/sra-human-scrubber:1.0.2021-05-05", + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.ncbi_scrub_pe.docker": "ncbi/sra-human-scrubber:1.0.2021-05-05", + "theiacov_gc.theiacov_ont.ncbi_scrub_se.docker": "ncbi/sra-human-scrubber:1.0.2021-05-05", - "titan_gc.titan_clearlabs.consensus.docker": "staphb/artic-ncov2019:1.3.0", - "titan_gc.titan_ont.consensus.docker": "staphb/artic-ncov2019:1.3.0", + "theiacov_gc.theiacov_clearlabs.consensus.docker": "staphb/artic-ncov2019:1.3.0", + "theiacov_gc.theiacov_ont.consensus.docker": "staphb/artic-ncov2019:1.3.0", - "titan_gc.titan_illumina_pe.read_QC_trim.bbduk.docker": "staphb/bbtools:38.76", - "titan_gc.titan_illumina_se.read_QC_trim.bbduk_se.docker": "staphb/bbtools:38.76", + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.bbduk.docker": "staphb/bbtools:38.76", + "theiacov_gc.theiacov_illumina_se.read_QC_trim.bbduk_se.docker": "staphb/bbtools:38.76", - "titan_gc.titan_illumina_se.read_QC_trim.trimmomatic_se.docker": "staphb/trimmomatic:0.39", - "titan_gc.titan_illumina_pe.read_QC_trim.trimmomatic.docker": "staphb/trimmomatic:0.39", + "theiacov_gc.theiacov_illumina_se.read_QC_trim.trimmomatic_se.docker": "staphb/trimmomatic:0.39", + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.trimmomatic.docker": "staphb/trimmomatic:0.39", - "titan_gc.titan_clearlabs.vadr.docker": "staphb/vadr:1.2.1", - "titan_gc.titan_illumina_pe.vadr.docker": "staphb/vadr:1.2.1", - "titan_gc.titan_illumina_se.vadr.docker": "staphb/vadr:1.2.1", - "titan_gc.titan_ont.vadr.docker": "staphb/vadr:1.2.1", + "theiacov_gc.theiacov_clearlabs.vadr.docker": "staphb/vadr:1.2.1", + "theiacov_gc.theiacov_illumina_pe.vadr.docker": "staphb/vadr:1.2.1", + "theiacov_gc.theiacov_illumina_se.vadr.docker": "staphb/vadr:1.2.1", + "theiacov_gc.theiacov_ont.vadr.docker": "staphb/vadr:1.2.1", - "titan_gc.titan_clearlabs.consensus.cpu": 8, - "titan_gc.titan_clearlabs.consensus.medaka_model": "r941_min_high_g360", - "titan_gc.titan_clearlabs.kraken2_dehosted.cpus": 4, - "titan_gc.titan_clearlabs.kraken2_raw.cpus": 4, - "titan_gc.titan_clearlabs.normalise": 20000, - "titan_gc.titan_clearlabs.pangolin3.inference_engine": "usher", - "titan_gc.titan_clearlabs.pangolin3.max_ambig": 0.5, - "titan_gc.titan_clearlabs.pangolin3.min_length": 10000, - "titan_gc.titan_clearlabs.seq_method": "ONT via Clear Labs WGS", - "titan_gc.titan_clearlabs.vadr.maxlen": 30000, - "titan_gc.titan_clearlabs.vadr.minlen": 50, - "titan_gc.titan_clearlabs.vadr.skip_length": 10000, - "titan_gc.titan_clearlabs.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/", + "theiacov_gc.theiacov_clearlabs.consensus.cpu": 8, + "theiacov_gc.theiacov_clearlabs.consensus.medaka_model": "r941_min_high_g360", + "theiacov_gc.theiacov_clearlabs.kraken2_dehosted.cpus": 4, + 
"theiacov_gc.theiacov_clearlabs.kraken2_raw.cpus": 4, + "theiacov_gc.theiacov_clearlabs.normalise": 20000, + "theiacov_gc.theiacov_clearlabs.pangolin3.inference_engine": "usher", + "theiacov_gc.theiacov_clearlabs.pangolin3.max_ambig": 0.5, + "theiacov_gc.theiacov_clearlabs.pangolin3.min_length": 10000, + "theiacov_gc.theiacov_clearlabs.seq_method": "ONT via Clear Labs WGS", + "theiacov_gc.theiacov_clearlabs.vadr.maxlen": 30000, + "theiacov_gc.theiacov_clearlabs.vadr.minlen": 50, + "theiacov_gc.theiacov_clearlabs.vadr.skip_length": 10000, + "theiacov_gc.theiacov_clearlabs.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/", - "titan_gc.titan_illumina_pe.bwa.cpus": 6, - "titan_gc.titan_illumina_pe.bwa.reference_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", - "titan_gc.titan_illumina_pe.consensus.char_unknown": "N", - "titan_gc.titan_illumina_pe.consensus.count_orphans": true, - "titan_gc.titan_illumina_pe.consensus.disable_baq": true, - "titan_gc.titan_illumina_pe.consensus.max_depth": "600000", - "titan_gc.titan_illumina_pe.consensus.min_bq": "0", - "titan_gc.titan_illumina_pe.consensus.min_depth": "10", - "titan_gc.titan_illumina_pe.consensus.min_freq": "0.6", - "titan_gc.titan_illumina_pe.consensus.min_qual": "20", - "titan_gc.titan_illumina_pe.consensus.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", - "titan_gc.titan_illumina_pe.consensus.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", - "titan_gc.titan_illumina_pe.pangolin3.inference_engine": "usher", - "titan_gc.titan_illumina_pe.pangolin3.max_ambig": 0.5, - "titan_gc.titan_illumina_pe.pangolin3.min_length": 10000, - "titan_gc.titan_illumina_pe.primer_trim.keep_noprimer_reads": true, - "titan_gc.titan_illumina_pe.read_QC_trim.kraken2_dehosted.cpus": 4, - "titan_gc.titan_illumina_pe.read_QC_trim.kraken2_raw.cpus": 4, - "titan_gc.titan_illumina_pe.read_QC_trim.trimmomatic.threads": 4, - "titan_gc.titan_illumina_pe.read_QC_trim.trimmomatic_minlen": 75, - "titan_gc.titan_illumina_pe.read_QC_trim.trimmomatic_quality_trim_score": 30, - "titan_gc.titan_illumina_pe.read_QC_trim.trimmomatic_window_size": 4, - "titan_gc.titan_illumina_pe.seq_method": "Illumina paired-end", - "titan_gc.titan_illumina_pe.vadr.maxlen": 30000, - "titan_gc.titan_illumina_pe.vadr.minlen": 50, - "titan_gc.titan_illumina_pe.vadr.skip_length": 10000, - "titan_gc.titan_illumina_pe.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/", - "titan_gc.titan_illumina_pe.variant_call.count_orphans": true, - "titan_gc.titan_illumina_pe.variant_call.disable_baq": true, - "titan_gc.titan_illumina_pe.variant_call.max_depth": "600000", - "titan_gc.titan_illumina_pe.variant_call.min_bq": "0", - "titan_gc.titan_illumina_pe.variant_call.min_depth": "10", - "titan_gc.titan_illumina_pe.variant_call.min_freq": "0.6", - "titan_gc.titan_illumina_pe.variant_call.min_qual": "20", - "titan_gc.titan_illumina_pe.variant_call.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", - "titan_gc.titan_illumina_pe.variant_call.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", + "theiacov_gc.theiacov_illumina_pe.bwa.cpus": 6, + "theiacov_gc.theiacov_illumina_pe.bwa.reference_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", + 
"theiacov_gc.theiacov_illumina_pe.consensus.char_unknown": "N", + "theiacov_gc.theiacov_illumina_pe.consensus.count_orphans": true, + "theiacov_gc.theiacov_illumina_pe.consensus.disable_baq": true, + "theiacov_gc.theiacov_illumina_pe.consensus.max_depth": "600000", + "theiacov_gc.theiacov_illumina_pe.consensus.min_bq": "0", + "theiacov_gc.theiacov_illumina_pe.consensus.min_depth": "10", + "theiacov_gc.theiacov_illumina_pe.consensus.min_freq": "0.6", + "theiacov_gc.theiacov_illumina_pe.consensus.min_qual": "20", + "theiacov_gc.theiacov_illumina_pe.consensus.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", + "theiacov_gc.theiacov_illumina_pe.consensus.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", + "theiacov_gc.theiacov_illumina_pe.pangolin3.inference_engine": "usher", + "theiacov_gc.theiacov_illumina_pe.pangolin3.max_ambig": 0.5, + "theiacov_gc.theiacov_illumina_pe.pangolin3.min_length": 10000, + "theiacov_gc.theiacov_illumina_pe.primer_trim.keep_noprimer_reads": true, + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.kraken2_dehosted.cpus": 4, + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.kraken2_raw.cpus": 4, + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.trimmomatic.threads": 4, + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.trimmomatic_minlen": 75, + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.trimmomatic_quality_trim_score": 30, + "theiacov_gc.theiacov_illumina_pe.read_QC_trim.trimmomatic_window_size": 4, + "theiacov_gc.theiacov_illumina_pe.seq_method": "Illumina paired-end", + "theiacov_gc.theiacov_illumina_pe.vadr.maxlen": 30000, + "theiacov_gc.theiacov_illumina_pe.vadr.minlen": 50, + "theiacov_gc.theiacov_illumina_pe.vadr.skip_length": 10000, + "theiacov_gc.theiacov_illumina_pe.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/", + "theiacov_gc.theiacov_illumina_pe.variant_call.count_orphans": true, + "theiacov_gc.theiacov_illumina_pe.variant_call.disable_baq": true, + "theiacov_gc.theiacov_illumina_pe.variant_call.max_depth": "600000", + "theiacov_gc.theiacov_illumina_pe.variant_call.min_bq": "0", + "theiacov_gc.theiacov_illumina_pe.variant_call.min_depth": "10", + "theiacov_gc.theiacov_illumina_pe.variant_call.min_freq": "0.6", + "theiacov_gc.theiacov_illumina_pe.variant_call.min_qual": "20", + "theiacov_gc.theiacov_illumina_pe.variant_call.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", + "theiacov_gc.theiacov_illumina_pe.variant_call.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", - "titan_gc.titan_illumina_se.bwa.cpus": 6, - "titan_gc.titan_illumina_se.bwa.reference_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", - "titan_gc.titan_illumina_se.consensus.char_unknown": "N", - "titan_gc.titan_illumina_se.consensus.count_orphans": true, - "titan_gc.titan_illumina_se.consensus.disable_baq": true, - "titan_gc.titan_illumina_se.consensus.max_depth": "600000", - "titan_gc.titan_illumina_se.consensus.min_bq": "0", - "titan_gc.titan_illumina_se.consensus.min_depth": "10", - "titan_gc.titan_illumina_se.consensus.min_freq": "0.6", - "titan_gc.titan_illumina_se.consensus.min_qual": "20", - "titan_gc.titan_illumina_se.consensus.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", - "titan_gc.titan_illumina_se.consensus.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", - 
"titan_gc.titan_illumina_se.pangolin3.inference_engine": "usher", - "titan_gc.titan_illumina_se.pangolin3.max_ambig": 0.5, - "titan_gc.titan_illumina_se.pangolin3.min_length": 10000, - "titan_gc.titan_illumina_se.primer_trim.keep_noprimer_reads": true, - "titan_gc.titan_illumina_se.read_QC_trim.kraken2_raw.cpus": 4, - "titan_gc.titan_illumina_se.read_QC_trim.trimmomatic_minlen": 25, - "titan_gc.titan_illumina_se.read_QC_trim.trimmomatic_quality_trim_score": 30, - "titan_gc.titan_illumina_se.read_QC_trim.trimmomatic_se.threads": 4, - "titan_gc.titan_illumina_se.read_QC_trim.trimmomatic_window_size": 4, - "titan_gc.titan_illumina_se.seq_method": "Illumina single-end", - "titan_gc.titan_illumina_se.vadr.maxlen": 30000, - "titan_gc.titan_illumina_se.vadr.minlen": 50, - "titan_gc.titan_illumina_se.vadr.skip_length": 10000, - "titan_gc.titan_illumina_se.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/", - "titan_gc.titan_illumina_se.variant_call.count_orphans": true, - "titan_gc.titan_illumina_se.variant_call.disable_baq": true, - "titan_gc.titan_illumina_se.variant_call.max_depth": "600000", - "titan_gc.titan_illumina_se.variant_call.min_bq": "0", - "titan_gc.titan_illumina_se.variant_call.min_depth": "10", - "titan_gc.titan_illumina_se.variant_call.min_freq": "0.6", - "titan_gc.titan_illumina_se.variant_call.min_qual": "20", - "titan_gc.titan_illumina_se.variant_call.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", - "titan_gc.titan_illumina_se.variant_call.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", + "theiacov_gc.theiacov_illumina_se.bwa.cpus": 6, + "theiacov_gc.theiacov_illumina_se.bwa.reference_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", + "theiacov_gc.theiacov_illumina_se.consensus.char_unknown": "N", + "theiacov_gc.theiacov_illumina_se.consensus.count_orphans": true, + "theiacov_gc.theiacov_illumina_se.consensus.disable_baq": true, + "theiacov_gc.theiacov_illumina_se.consensus.max_depth": "600000", + "theiacov_gc.theiacov_illumina_se.consensus.min_bq": "0", + "theiacov_gc.theiacov_illumina_se.consensus.min_depth": "10", + "theiacov_gc.theiacov_illumina_se.consensus.min_freq": "0.6", + "theiacov_gc.theiacov_illumina_se.consensus.min_qual": "20", + "theiacov_gc.theiacov_illumina_se.consensus.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", + "theiacov_gc.theiacov_illumina_se.consensus.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", + "theiacov_gc.theiacov_illumina_se.pangolin3.inference_engine": "usher", + "theiacov_gc.theiacov_illumina_se.pangolin3.max_ambig": 0.5, + "theiacov_gc.theiacov_illumina_se.pangolin3.min_length": 10000, + "theiacov_gc.theiacov_illumina_se.primer_trim.keep_noprimer_reads": true, + "theiacov_gc.theiacov_illumina_se.read_QC_trim.kraken2_raw.cpus": 4, + "theiacov_gc.theiacov_illumina_se.read_QC_trim.trimmomatic_minlen": 25, + "theiacov_gc.theiacov_illumina_se.read_QC_trim.trimmomatic_quality_trim_score": 30, + "theiacov_gc.theiacov_illumina_se.read_QC_trim.trimmomatic_se.threads": 4, + "theiacov_gc.theiacov_illumina_se.read_QC_trim.trimmomatic_window_size": 4, + "theiacov_gc.theiacov_illumina_se.seq_method": "Illumina single-end", + "theiacov_gc.theiacov_illumina_se.vadr.maxlen": 30000, + "theiacov_gc.theiacov_illumina_se.vadr.minlen": 50, + "theiacov_gc.theiacov_illumina_se.vadr.skip_length": 10000, + 
"theiacov_gc.theiacov_illumina_se.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/", + "theiacov_gc.theiacov_illumina_se.variant_call.count_orphans": true, + "theiacov_gc.theiacov_illumina_se.variant_call.disable_baq": true, + "theiacov_gc.theiacov_illumina_se.variant_call.max_depth": "600000", + "theiacov_gc.theiacov_illumina_se.variant_call.min_bq": "0", + "theiacov_gc.theiacov_illumina_se.variant_call.min_depth": "10", + "theiacov_gc.theiacov_illumina_se.variant_call.min_freq": "0.6", + "theiacov_gc.theiacov_illumina_se.variant_call.min_qual": "20", + "theiacov_gc.theiacov_illumina_se.variant_call.ref_genome": "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta", + "theiacov_gc.theiacov_illumina_se.variant_call.ref_gff": "/reference/GCF_009858895.2_ASM985889v3_genomic.gff", - "titan_gc.titan_ont.consensus.cpu": 8, - "titan_gc.titan_ont.consensus.medaka_model": "r941_min_high_g360", - "titan_gc.titan_ont.kraken2_dehosted.cpus": 4, - "titan_gc.titan_ont.kraken2_raw.cpus": 4, - "titan_gc.titan_ont.normalise": 200, - "titan_gc.titan_ont.pangolin3.inference_engine": "usher", - "titan_gc.titan_ont.pangolin3.max_ambig": 0.5, - "titan_gc.titan_ont.pangolin3.min_length": 10000, - "titan_gc.titan_ont.read_filtering.cpu": 8, - "titan_gc.titan_ont.read_filtering.max_length": 700, - "titan_gc.titan_ont.read_filtering.min_length": 400, - "titan_gc.titan_ont.read_filtering.run_prefix": "artic_ncov2019", - "titan_gc.titan_ont.seq_method": "ONT", - "titan_gc.titan_ont.vadr.maxlen": 30000, - "titan_gc.titan_ont.vadr.minlen": 50, - "titan_gc.titan_ont.vadr.skip_length": 10000, - "titan_gc.titan_ont.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/" + "theiacov_gc.theiacov_ont.consensus.cpu": 8, + "theiacov_gc.theiacov_ont.consensus.medaka_model": "r941_min_high_g360", + "theiacov_gc.theiacov_ont.kraken2_dehosted.cpus": 4, + "theiacov_gc.theiacov_ont.kraken2_raw.cpus": 4, + "theiacov_gc.theiacov_ont.normalise": 200, + "theiacov_gc.theiacov_ont.pangolin3.inference_engine": "usher", + "theiacov_gc.theiacov_ont.pangolin3.max_ambig": 0.5, + "theiacov_gc.theiacov_ont.pangolin3.min_length": 10000, + "theiacov_gc.theiacov_ont.read_filtering.cpu": 8, + "theiacov_gc.theiacov_ont.read_filtering.max_length": 700, + "theiacov_gc.theiacov_ont.read_filtering.min_length": 400, + "theiacov_gc.theiacov_ont.read_filtering.run_prefix": "artic_ncov2019", + "theiacov_gc.theiacov_ont.seq_method": "ONT", + "theiacov_gc.theiacov_ont.vadr.maxlen": 30000, + "theiacov_gc.theiacov_ont.vadr.minlen": 50, + "theiacov_gc.theiacov_ont.vadr.skip_length": 10000, + "theiacov_gc.theiacov_ont.vadr.vadr_opts": "--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/" } diff --git a/conf/singularity.config b/conf/singularity.config index aaadc569..dc31a255 100644 --- a/conf/singularity.config +++ b/conf/singularity.config @@ -8,7 +8,7 @@ backend { runtime-attributes = "String docker" submit-docker = """ if [ -z $SINGULARITY_CACHEDIR ]; then - CACHE_DIR=$HOME/.singularity/titan-cache + CACHE_DIR=$HOME/.singularity/theiacov-cache else CACHE_DIR=$SINGULARITY_CACHEDIR fi diff --git a/docs/source/conf.py b/docs/source/conf.py index c4a070ee..501cfa29 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -22,7 +22,7 @@ author = 'Kevin G. 
Libuit' # The full version, including alpha/beta/rc tags -release = '1.4.3' +release = '2.0.0' # -- General configuration --------------------------------------------------- @@ -56,4 +56,3 @@ def setup(app): app.add_css_file('my_theme.css') - diff --git a/docs/source/images/Titan_ClearLabs.png b/docs/source/images/TheiaCoV_ClearLabs.png similarity index 100% rename from docs/source/images/Titan_ClearLabs.png rename to docs/source/images/TheiaCoV_ClearLabs.png diff --git a/docs/source/images/Titan_FASTA.png b/docs/source/images/TheiaCoV_FASTA.png similarity index 100% rename from docs/source/images/Titan_FASTA.png rename to docs/source/images/TheiaCoV_FASTA.png diff --git a/docs/source/images/Titan_Illumina_PE.png b/docs/source/images/TheiaCoV_Illumina_PE.png similarity index 100% rename from docs/source/images/Titan_Illumina_PE.png rename to docs/source/images/TheiaCoV_Illumina_PE.png diff --git a/docs/source/images/Titan_Illumina_SE.png b/docs/source/images/TheiaCoV_Illumina_SE.png similarity index 100% rename from docs/source/images/Titan_Illumina_SE.png rename to docs/source/images/TheiaCoV_Illumina_SE.png diff --git a/docs/source/images/Titan_ONT.png b/docs/source/images/TheiaCoV_ONT.png similarity index 100% rename from docs/source/images/Titan_ONT.png rename to docs/source/images/TheiaCoV_ONT.png diff --git a/docs/source/index.rst b/docs/source/index.rst index 27d8b57a..b8a11583 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -13,7 +13,7 @@ Contents :maxdepth: 2 overview - titan_workflows + theiacov_workflows mercury_workflows terra_resources license diff --git a/docs/source/tables/titan_workflows/titan_augur_prep_required_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_augur_prep_required_inputs.csv similarity index 100% rename from docs/source/tables/titan_workflows/titan_augur_prep_required_inputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_augur_prep_required_inputs.csv diff --git a/docs/source/tables/titan_workflows/titan_augur_run_required_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_augur_run_required_inputs.csv similarity index 100% rename from docs/source/tables/titan_workflows/titan_augur_run_required_inputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_augur_run_required_inputs.csv diff --git a/docs/source/tables/titan_workflows/titan_clearlabs_optional_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_clearlabs_optional_inputs.csv similarity index 85% rename from docs/source/tables/titan_workflows/titan_clearlabs_optional_inputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_clearlabs_optional_inputs.csv index a2fa4429..31757372 100644 --- a/docs/source/tables/titan_workflows/titan_clearlabs_optional_inputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_clearlabs_optional_inputs.csv @@ -19,14 +19,14 @@ pangolin3,docker,String,Docker tag used for running Pangolin,staphb/pangolin:3.1 pangolin3,inference_engine,String,pangolin inference engine for lineage designations (usher or pangolarn),usher pangolin3,min_length,Int,Minimum query length allowed for pangolin to attempt assignment,10000 pangolin3,max_ambig,Float,Maximum proportion of Ns allowed for pangolin to attempt assignment,0.5 -titan_clearlabs,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 -titan_clearlabs,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 -titan_clearlabs,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z 
-titan_clearlabs,normalise,Int,Value to normalize read counts,200 -titan_clearlabs,seq_method,String,Description of the sequencing methodology used to generate the input read data,ONT via Clear Labs WGS +theiacov_clearlabs,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 +theiacov_clearlabs,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 +theiacov_clearlabs,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z +theiacov_clearlabs,normalise,Int,Value to normalize read counts,200 +theiacov_clearlabs,seq_method,String,Description of the sequencing methodology used to generate the input read data,ONT via Clear Labs WGS vadr,docker,String,Docker tag used for running VADR,staphb/vadr:1.2.1 vadr,maxlen,Int,Maximum length for the fasta-trim-terminal-ambigs.pl VADR script,30000 vadr,minlen,Int,Minimum length subsequence to possibly replace Ns for the fasta-trim-terminal-ambigs.pl VADR script,50 vadr,skip_length,Int,Minimum assembly length (unambiguous) to run vadr,10000 vadr,vadr_opts,String,Options for the v-annotate.pl VADR script,"--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/" -version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None \ No newline at end of file +version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None diff --git a/docs/source/tables/titan_workflows/titan_clearlabs_outputs.csv b/docs/source/tables/theiacov_workflows/theiacov_clearlabs_outputs.csv similarity index 94% rename from docs/source/tables/titan_workflows/titan_clearlabs_outputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_clearlabs_outputs.csv index a1769127..734c8185 100644 --- a/docs/source/tables/titan_workflows/titan_clearlabs_outputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_clearlabs_outputs.csv @@ -1,7 +1,7 @@ Output Name,Data Type,Description aligned_bai,File,Index companion file to the bam file generated during the consensus assembly process aligned_bam,File,Primer-trimmed BAM file; generated during conensus assembly process -artic_version,String,Version of the Artic software utilized for read trimming and conesnsus genome assembly +artic_version,String,Version of the Artic software utilized for read trimming and conesnsus genome assembly assembly_fasta,File,Consensus genome assembly assembly_length_unambiguous,Int,Number of unambiguous basecalls within the SC2 consensus assembly assembly_mean_coverage,Float,Mean sequencing depth throughout the conesnsus assembly generated after performing primer trimming--calculated using the SAMtools coverage command @@ -34,18 +34,18 @@ number_Total,Int,Total number of nucleotides within the consensus assembly pango_lineage,String,Pango lineage as detremined by Pangolin pango_lineage_report,File,Full Pango lineage report generated by Pangolin pangolin_assignment_version,String,Version of the pangolin software (e.g. 
PANGO or PUSHER) used for lineage asignment -pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin +pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin pangolin_docker,String,Docker image used to run Pangolin pangolin_notes,String,Lineage notes as deteremined by Pangolin -pangolin_versions,String,All Pangolin software and database versions +pangolin_versions,String,All Pangolin software and database versions percent_reference_coverage,Float,"Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of reference genome (SC2: 29,903) x 100" primer_bed_name,String,Name of the primer bed files used for primer trimming reads_dehosted,File,De-hosted read files samtools_version,String,Version of SAMtools used to sort and index the alignment file seq_platform,String,Description of the sequencing methodology used to generate the input read data -titan_clearlabs_analysis_date,String,Date of analysis -titan_clearlabs_version,String,Version of the Public Health Viral Genomics (PHVG) repository used +theiacov_clearlabs_analysis_date,String,Date of analysis +theiacov_clearlabs_version,String,Version of the Public Health Viral Genomics (PHVG) repository used vadr_alerts_list,File,File containing all of the fatal alerts as determined by VADR vadr_docker,String,Docker image used to run VADR vadr_num_alerts,String,Number of fatal alerts as determined by VADR -variants_from_ref_vcf,File,Number of variants relative to the reference genome \ No newline at end of file +variants_from_ref_vcf,File,Number of variants relative to the reference genome diff --git a/docs/source/tables/theiacov_workflows/theiacov_clearlabs_required_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_clearlabs_required_inputs.csv new file mode 100644 index 00000000..486f8922 --- /dev/null +++ b/docs/source/tables/theiacov_workflows/theiacov_clearlabs_required_inputs.csv @@ -0,0 +1,4 @@ +Task,Input Variable ,Data Type ,Description  +theiacov_clearlabs,clear_lab_fastq ,File ,Clear Labs FASTQ read files +theiacov_clearlabs,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format +theiacov_clearlabs,samplename ,String ,Name of the sample being analyzed diff --git a/docs/source/tables/titan_workflows/titan_fasta_optional_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_fasta_optional_inputs.csv similarity index 97% rename from docs/source/tables/titan_workflows/titan_fasta_optional_inputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_fasta_optional_inputs.csv index 544dc978..e91c4baa 100644 --- a/docs/source/tables/titan_workflows/titan_fasta_optional_inputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_fasta_optional_inputs.csv @@ -13,4 +13,4 @@ vadr,maxlen,Int,Maximum length for the fasta-trim-terminal-ambigs.pl VADR script vadr,minlen,Int,Minimum length subsequence to possibly replace Ns for the fasta-trim-terminal-ambigs.pl VADR script,50 vadr,skip_length,Int,Minimum assembly length (unambiguous) to run vadr,10000 vadr,vadr_opts,String,Options for the v-annotate.pl VADR script,"--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/" -version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None \ No newline at end of file +version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. 
America/New_York),None
diff --git a/docs/source/tables/titan_workflows/titan_fasta_outputs.csv b/docs/source/tables/theiacov_workflows/theiacov_fasta_outputs.csv
similarity index 92%
rename from docs/source/tables/titan_workflows/titan_fasta_outputs.csv
rename to docs/source/tables/theiacov_workflows/theiacov_fasta_outputs.csv
index 67f22c46..44345754 100644
--- a/docs/source/tables/titan_workflows/titan_fasta_outputs.csv
+++ b/docs/source/tables/theiacov_workflows/theiacov_fasta_outputs.csv
@@ -14,14 +14,14 @@ number_Total,Int,Total number of nucleotides within the consensus assembly
 pango_lineage,String,Pango lineage as detremined by Pangolin
 pango_lineage_report,File,Full Pango lineage report generated by Pangolin
 pangolin_assignment_version,String,Version of the pangolin software (e.g. PANGO or PUSHER) used for lineage asignment
-pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin 
+pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin
 pangolin_docker,String,Docker image used to run Pangolin
 pangolin_notes,String,Lineage notes as deteremined by Pangolin
-pangolin_versions,String,All Pangolin software and database versions 
+pangolin_versions,String,All Pangolin software and database versions
 percent_reference_coverage,Float,Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of reference genome (SC2: 29,903) x 100
 seq_platform,String,Description of the sequencing methodology used to generate the input read data
-titan_fasta_analysis_date,String,Date of analysis
-titan_fasta_version,String,Version of the Public Health Viral Genomics (PHVG) repository used
+theiacov_fasta_analysis_date,String,Date of analysis
+theiacov_fasta_version,String,Version of the Public Health Viral Genomics (PHVG) repository used
 vadr_alerts_list,File,File containing all of the fatal alerts as determined by VADR
 vadr_docker,String,Docker image used to run VADR
 vadr_num_alerts,String,Number of fatal alerts as determined by VADR
diff --git a/docs/source/tables/theiacov_workflows/theiacov_fasta_required_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_fasta_required_inputs.csv
new file mode 100644
index 00000000..b4f00413
--- /dev/null
+++ b/docs/source/tables/theiacov_workflows/theiacov_fasta_required_inputs.csv
@@ -0,0 +1,5 @@
+Task,Input Variable,Data Type,Description
+theiacov_fasta,assembly_fasta,File,SARS-CoV-2 assembly file in fasta format
+theiacov_fasta,input_assembly_method,String,Description of the method utilized to generate the input assembly fasta file; if unknown "NA" will be accepted
+theiacov_fasta,samplename,String,Name of the sample being analyzed
+theiacov_fasta,seq_method,String,Description of the sequencing method utilized to generate the raw sequencing data; if unknown "NA" will be accepted
diff --git a/docs/source/tables/titan_workflows/titan_illumina_pe_optional_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_illumina_pe_optional_inputs.csv
similarity index 91%
rename from docs/source/tables/titan_workflows/titan_illumina_pe_optional_inputs.csv
rename to docs/source/tables/theiacov_workflows/theiacov_illumina_pe_optional_inputs.csv
index fc0b9461..f7a996ff 100644
--- a/docs/source/tables/titan_workflows/titan_illumina_pe_optional_inputs.csv
+++ b/docs/source/tables/theiacov_workflows/theiacov_illumina_pe_optional_inputs.csv
@@ -22,10 +22,10 @@ read_QC_trim,bbduk_mem,Int,Memory allocated to the BBDuk VM,8
read_QC_trim,trimmomatic_minlen,Int,Specifies the minimum length of reads to be kept for Trimmomatic,25 read_QC_trim,trimmomatic_quality_trim_score,Int,Specifies the average quality required for Trimmomatic,30 read_QC_trim,trimmomatic_window_size,Int,Specifies the number of bases to average across for Trimmomatic,4 -titan_illumina_pe,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 -titan_illumina_pe,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 -titan_illumina_pe,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z -titan_illumina_pe,seq_method,String,Description of the sequencing methodology used to generate the input read data,Illumina paired-end +theiacov_illumina_pe,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 +theiacov_illumina_pe,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 +theiacov_illumina_pe,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z +theiacov_illumina_pe,seq_method,String,Description of the sequencing methodology used to generate the input read data,Illumina paired-end vadr,docker,String,Docker tag used for running VADR,staphb/vadr:1.2.1 vadr,maxlen,Int,Maximum length for the fasta-trim-terminal-ambigs.pl VADR script,30000 vadr,minlen,Int,Minimum length subsequence to possibly replace Ns for the fasta-trim-terminal-ambigs.pl VADR script,50 @@ -40,4 +40,4 @@ variant_call,min_freq,Float,Minimum frequency threshold(0 - 1) to call variants variant_call,min_qual,Int,Minimum quality threshold for sliding window to pass for iVar variants,20 variant_call,ref_gff,String,Path to the general feature format of the reference genome within the staphb/ivar:1.2.2_artic20200528 Docker container,/reference/GCF_009858895.2_ASM985889v3_genomic.gff variant_call,ref_genome,String,Path to the reference genome within the staphb/ivar:1.2.2_artic20200528 Docker container,/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta -version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None \ No newline at end of file +version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None diff --git a/docs/source/tables/titan_workflows/titan_illumina_pe_outputs.csv b/docs/source/tables/theiacov_workflows/theiacov_illumina_pe_outputs.csv similarity index 96% rename from docs/source/tables/titan_workflows/titan_illumina_pe_outputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_illumina_pe_outputs.csv index 8b7dbf76..6cd58303 100644 --- a/docs/source/tables/titan_workflows/titan_illumina_pe_outputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_illumina_pe_outputs.csv @@ -43,7 +43,7 @@ number_Total,Int,Total number of nucleotides within the consensus assembly pango_lineage,String,Pango lineage as detremined by Pangolin pango_lineage_report,File,Full Pango lineage report generated by Pangolin pangolin_assignment_version,String,Version of the pangolin software (e.g. 
PANGO or PUSHER) used for lineage asignment -pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin +pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin pangolin_docker,String,Docker image used to run Pangolin pangolin_notes,String,Lineage notes as deteremined by Pangolin pangolin_versions,String,All Pangolin software and database version @@ -59,9 +59,9 @@ samtools_version_consensus,String,Version of SAMtools used to create the pileup samtools_version_primtrim,String,Version of SAMtools used to create the pileup before running iVar trim samtools_version_stats,String,Version of SAMtools used to assess quality of read mapping seq_platform,String,Description of the sequencing methodology used to generate the input read data -titan_illumina_pe_analysis_date,String,Date of analysis -titan_illumina_pe_version,String,Version of the Public Health Viral Genomics (PHVG) repository used +theiacov_illumina_pe_analysis_date,String,Date of analysis +theiacov_illumina_pe_version,String,Version of the Public Health Viral Genomics (PHVG) repository used trimmomatic_version,String,Version of Trimmomatic used vadr_alerts_list,File,File containing all of the fatal alerts as determined by VADR vadr_docker,String,Docker image used to run VADR -vadr_num_alerts,String,Number of fatal alerts as determined by VADR \ No newline at end of file +vadr_num_alerts,String,Number of fatal alerts as determined by VADR diff --git a/docs/source/tables/theiacov_workflows/theiacov_illumina_pe_required_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_illumina_pe_required_inputs.csv new file mode 100644 index 00000000..334c691a --- /dev/null +++ b/docs/source/tables/theiacov_workflows/theiacov_illumina_pe_required_inputs.csv @@ -0,0 +1,5 @@ +Task,Input Variable ,Data Type ,Description  +theiacov_illumina_pe,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format +theiacov_illumina_pe,read1_raw ,File ,Forward Illumina read in FASTQ file format +theiacov_illumina_pe,read2_raw ,File ,Reverse Illumina read in FASTQ file format +theiacov_illumina_pe,samplename ,String ,Name of the sample being analyzed diff --git a/docs/source/tables/titan_workflows/titan_illumina_se_optional_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_illumina_se_optional_inputs.csv similarity index 91% rename from docs/source/tables/titan_workflows/titan_illumina_se_optional_inputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_illumina_se_optional_inputs.csv index 2d6bc32a..c76dbc78 100644 --- a/docs/source/tables/titan_workflows/titan_illumina_se_optional_inputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_illumina_se_optional_inputs.csv @@ -23,10 +23,10 @@ read_QC_trim,bbduk_mem,Int,Memory allocated to the BBDuk VM,8 read_QC_trim,trimmomatic_minlen,Int,Specifies the minimum length of reads to be kept for Trimmomatic,25 read_QC_trim,trimmomatic_quality_trim_score,Int,Specifies the average quality required for Trimmomatic,30 read_QC_trim,trimmomatic_window_size,Int,Specifies the number of bases to average across for Trimmomatic,4 -titan_illumina_se,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 -titan_illumina_se,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 -titan_illumina_se,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z -titan_illumina_se,seq_method,String,Description of the sequencing methodology used to generate the input read data,Illumina paired-end 
+theiacov_illumina_se,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 +theiacov_illumina_se,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 +theiacov_illumina_se,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z +theiacov_illumina_se,seq_method,String,Description of the sequencing methodology used to generate the input read data,Illumina paired-end vadr,docker,String,Docker tag used for running VADR,staphb/vadr:1.2.1 vadr,maxlen,Int,Maximum length for the fasta-trim-terminal-ambigs.pl VADR script,30000 vadr,minlen,Int,Minimum length subsequence to possibly replace Ns for the fasta-trim-terminal-ambigs.pl VADR script,50 @@ -41,4 +41,4 @@ variant_call,min_freq,Float,Minimum frequency threshold(0 - 1) to call variants variant_call,min_qual,Int,Minimum quality threshold for sliding window to pass for iVar variants,20 variant_call,ref_gff,String,Path to the general feature format of the reference genome within the staphb/ivar:1.2.2_artic20200528 Docker container,/reference/GCF_009858895.2_ASM985889v3_genomic.gff variant_call,ref_genome,String,Path to the reference genome within the staphb/ivar:1.2.2_artic20200528 Docker container,/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta -version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None \ No newline at end of file +version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None diff --git a/docs/source/tables/titan_workflows/titan_illumina_se_outputs.csv b/docs/source/tables/theiacov_workflows/theiacov_illumina_se_outputs.csv similarity index 96% rename from docs/source/tables/titan_workflows/titan_illumina_se_outputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_illumina_se_outputs.csv index 78ef8f47..075d2ee4 100644 --- a/docs/source/tables/titan_workflows/titan_illumina_se_outputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_illumina_se_outputs.csv @@ -36,7 +36,7 @@ number_Total,Int,Total number of nucleotides within the consensus assembly pango_lineage,String,Pango lineage as detremined by Pangolin pango_lineage_report,File,Full Pango lineage report generated by Pangolin pangolin_assignment_version,String,Version of the pangolin software (e.g. 
PANGO or PUSHER) used for lineage asignment -pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin +pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin pangolin_docker,String,Docker image used to run Pangolin pangolin_notes,String,Lineage notes as deteremined by Pangolin pangolin_versions,String,All Pangolin software and database version @@ -49,9 +49,9 @@ samtools_version_consensus,String,Version of SAMtools used to create the pileup samtools_version_primtrim,String,Version of SAMtools used to create the pileup before running iVar trim samtools_version_stats,String,Version of SAMtools used to assess quality of read mapping seq_platform,String,Description of the sequencing methodology used to generate the input read data -titan_illumina_se_analysis_date,String,Date of analysis -titan_illumina_se_version,String,Version of the Public Health Viral Genomics (PHVG) repository used +theiacov_illumina_se_analysis_date,String,Date of analysis +theiacov_illumina_se_version,String,Version of the Public Health Viral Genomics (PHVG) repository used trimmomatic_version,String,Version of Trimmomatic used vadr_alerts_list,File,File containing all of the fatal alerts as determined by VADR vadr_docker,String,Docker image used to run VADR -vadr_num_alerts,String,Number of fatal alerts as determined by VADR \ No newline at end of file +vadr_num_alerts,String,Number of fatal alerts as determined by VADR diff --git a/docs/source/tables/theiacov_workflows/theiacov_illumina_se_required_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_illumina_se_required_inputs.csv new file mode 100644 index 00000000..db2795fe --- /dev/null +++ b/docs/source/tables/theiacov_workflows/theiacov_illumina_se_required_inputs.csv @@ -0,0 +1,4 @@ +Task,Input Variable ,Data Type ,Description  +theiacov_illumina_pe,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format +theiacov_illumina_pe,read1_raw ,File ,Single-end Illumina read in FASTQ file format +theiacov_illumina_pe,samplename ,String ,Name of the sample being analyzed diff --git a/docs/source/tables/titan_workflows/titan_ont_optional_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_ont_optional_inputs.csv similarity index 82% rename from docs/source/tables/titan_workflows/titan_ont_optional_inputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_ont_optional_inputs.csv index 67e1e86f..39a2f17d 100644 --- a/docs/source/tables/titan_workflows/titan_ont_optional_inputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_ont_optional_inputs.csv @@ -23,16 +23,16 @@ read_filtering,cpu,Int,CPU resources allocated to the read filtering task (Artic read_filtering,max_length,Int,Maximum sequence length ,700 read_filtering,min_length,Int,Minimum sequence length,400 read_filtering,run_prefix,String,Run name,artic_ncov2019 -titan_ont,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 -titan_ont,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 -titan_ont,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z -titan_ont,artic_primer_version,String,Version of the Artic PCR protocol used to generate input read data ,V3 -titan_ont,normalise,Int,Value to normalize read counts,200 -titan_ont,seq_method,String,Description of the sequencing methodology used to generate the input read data,ONT -titan_ont,pangolin_docker_image,String,Docker tag used for running Pangolin,staphb/pangolin:2.4.2-pangolearn-2021-05-19 
+theiacov_ont,nextclade_dataset_name,String,Nextclade organism dataset,sars-cov-2 +theiacov_ont,nextclade_dataset_reference,String,Nextclade reference genome,MN908947 +theiacov_ont,nextclade_dataset_tag,Nextclade dataset tag,2021-06-25T00:00:00Z +theiacov_ont,artic_primer_version,String,Version of the Artic PCR protocol used to generate input read data ,V3 +theiacov_ont,normalise,Int,Value to normalize read counts,200 +theiacov_ont,seq_method,String,Description of the sequencing methodology used to generate the input read data,ONT +theiacov_ont,pangolin_docker_image,String,Docker tag used for running Pangolin,staphb/pangolin:2.4.2-pangolearn-2021-05-19 vadr,docker,String,Docker tag used for running VADR,staphb/vadr:1.2.1 vadr,maxlen,Int,Maximum length for the fasta-trim-terminal-ambigs.pl VADR script,30000 vadr,minlen,Int,Minimum length subsequence to possibly replace Ns for the fasta-trim-terminal-ambigs.pl VADR script,50 vadr,vadr_opts,String,Options for the v-annotate.pl VADR script,"--glsearch -s -r --nomisc --mkey sarscov2 --alt_fail lowscore,fstukcnf,insertnn,deletinn --mdir /opt/vadr/vadr-models/" vadr,skip_length,Int,Minimum assembly length (unambiguous) to run vadr,10000 -version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None \ No newline at end of file +version_capture,timezone,String,User time zone in valid Unix TZ string (e.g. America/New_York),None diff --git a/docs/source/tables/titan_workflows/titan_ont_outputs.csv b/docs/source/tables/theiacov_workflows/theiacov_ont_outputs.csv similarity index 94% rename from docs/source/tables/titan_workflows/titan_ont_outputs.csv rename to docs/source/tables/theiacov_workflows/theiacov_ont_outputs.csv index d49addd3..abc45d0c 100644 --- a/docs/source/tables/titan_workflows/titan_ont_outputs.csv +++ b/docs/source/tables/theiacov_workflows/theiacov_ont_outputs.csv @@ -2,7 +2,7 @@ Output Name,Data Type,Description aligned_bai,File,Index companion file to the bam file generated during the consensus assembly process aligned_bam,File,Primer-trimmed BAM file; generated during conensus assembly process amp_coverage,File,Sequence coverage per amplicon -artic_version,String,Version of the Artic software utilized for read trimming and conesnsus genome assembly +artic_version,String,Version of the Artic software utilized for read trimming and conesnsus genome assembly assembly_fasta,File,Consensus genome assembly assembly_length_unambiguous,Int,Number of unambiguous basecalls within the SC2 consensus assembly assembly_mean_coverage,Float,Mean sequencing depth throughout the conesnsus assembly generated after performing primer trimming--calculated using the SAMtools coverage command @@ -36,19 +36,19 @@ number_Total,Int,Total number of nucleotides within the consensus assembly pango_lineage,String,Pango lineage as detremined by Pangolin pango_lineage_report,File,Full Pango lineage report generated by Pangolin pangolin_assignment_version,String,Version of the pangolin software (e.g. 
PANGO or PUSHER) used for lineage asignment -pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin +pangolin_conflicts,String,Number of lineage conflicts as deteremed by Pangolin pangolin_docker,String,Docker image used to run Pangolin pangolin_notes,String,Lineage notes as deteremined by Pangolin -pangolin_versions,String,All Pangolin software and database versions +pangolin_versions,String,All Pangolin software and database versions percent_reference_coverage,Float,"Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of reference genome (SC2: 29,903) x 100" primer_bed_name,String,Name of the primer bed files used for primer trimming pangolin_versions,String,All Pangolin software and database versions -reads_dehosted,File,De-hosted read files +reads_dehosted,File,De-hosted read files samtools_version,String,Version of SAMtools used to sort and index the alignment file seq_platform,String,Description of the sequencing methodology used to generate the input read data -titan_ont_analysis_date,String,Date of analysis -titan_ont_version,String,Version of the Public Health Viral Genomics (PHVG) repository used +theiacov_ont_analysis_date,String,Date of analysis +theiacov_ont_version,String,Version of the Public Health Viral Genomics (PHVG) repository used vadr_alerts_list,File,File containing all of the fatal alerts as determined by VADR vadr_docker,String,Docker image used to run VADR vadr_num_alerts,String,Number of fatal alerts as determined by VADR -variants_from_ref_vcf,File,Number of variants relative to the reference genome \ No newline at end of file +variants_from_ref_vcf,File,Number of variants relative to the reference genome diff --git a/docs/source/tables/theiacov_workflows/theiacov_ont_required_inputs.csv b/docs/source/tables/theiacov_workflows/theiacov_ont_required_inputs.csv new file mode 100644 index 00000000..f4ba0875 --- /dev/null +++ b/docs/source/tables/theiacov_workflows/theiacov_ont_required_inputs.csv @@ -0,0 +1,4 @@ +Task,Input Variable ,Data Type ,Description  +theiacov_ont,demultiplexed_reads ,File ,Basecalled and demultiplexed ONT read data (single FASTQ file per sample) +theiacov_ont,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format +theiacov_ont,samplename ,String ,Name of the sample being analyzed diff --git a/docs/source/tables/titan_workflows/titan_clearlabs_required_inputs.csv b/docs/source/tables/titan_workflows/titan_clearlabs_required_inputs.csv deleted file mode 100644 index 7a2ebfe4..00000000 --- a/docs/source/tables/titan_workflows/titan_clearlabs_required_inputs.csv +++ /dev/null @@ -1,4 +0,0 @@ -Task,Input Variable ,Data Type ,Description  -titan_clearlabs,clear_lab_fastq ,File ,Clear Labs FASTQ read files -titan_clearlabs,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format -titan_clearlabs,samplename ,String ,Name of the sample being analyzed \ No newline at end of file diff --git a/docs/source/tables/titan_workflows/titan_fasta_required_inputs.csv b/docs/source/tables/titan_workflows/titan_fasta_required_inputs.csv deleted file mode 100644 index a3fd7223..00000000 --- a/docs/source/tables/titan_workflows/titan_fasta_required_inputs.csv +++ /dev/null @@ -1,6 +0,0 @@ -Task,Input Variable,Data Type,Description -titan_fasta,assembly_fasta,File,SARS-CoV-2 assemly file in fasta format -titan_fasta,input_assembly_method,String,Description of the method utilized to generate the 
input assembly fasta file; if unknown "NA" will be accepted -titan_fasta,samplename,String,Name of the sample being analyzed -titan_fasta,seq_method,String,Description of the sequencing method utilized to generate the raw sequencing data; if unknown "NA" will be accepted - diff --git a/docs/source/tables/titan_workflows/titan_illumina_pe_required_inputs.csv b/docs/source/tables/titan_workflows/titan_illumina_pe_required_inputs.csv deleted file mode 100644 index 797b7c5a..00000000 --- a/docs/source/tables/titan_workflows/titan_illumina_pe_required_inputs.csv +++ /dev/null @@ -1,5 +0,0 @@ -Task,Input Variable ,Data Type ,Description  -titan_illumina_pe,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format -titan_illumina_pe,read1_raw ,File ,Forward Illumina read in FASTQ file format -titan_illumina_pe,read2_raw ,File ,Reverse Illumina read in FASTQ file format -titan_illumina_pe,samplename ,String ,Name of the sample being analyzed \ No newline at end of file diff --git a/docs/source/tables/titan_workflows/titan_illumina_se_required_inputs.csv b/docs/source/tables/titan_workflows/titan_illumina_se_required_inputs.csv deleted file mode 100644 index 8b1d6006..00000000 --- a/docs/source/tables/titan_workflows/titan_illumina_se_required_inputs.csv +++ /dev/null @@ -1,4 +0,0 @@ -Task,Input Variable ,Data Type ,Description  -titan_illumina_pe,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format -titan_illumina_pe,read1_raw ,File ,Single-end Illumina read in FASTQ file format -titan_illumina_pe,samplename ,String ,Name of the sample being analyzed \ No newline at end of file diff --git a/docs/source/tables/titan_workflows/titan_ont_required_inputs.csv b/docs/source/tables/titan_workflows/titan_ont_required_inputs.csv deleted file mode 100644 index a93f9033..00000000 --- a/docs/source/tables/titan_workflows/titan_ont_required_inputs.csv +++ /dev/null @@ -1,4 +0,0 @@ -Task,Input Variable ,Data Type ,Description  -titan_ont,demultiplexed_reads ,File ,Basecalled and demultiplexed ONT read data (single FASTQ file per sample) -titan_ont,primer_bed ,File ,Primer sequence coordinates of the PCR scheme utilized in BED file format -titan_ont,samplename ,String ,Name of the sample being analyzed \ No newline at end of file diff --git a/docs/source/theiacov_workflows.rst b/docs/source/theiacov_workflows.rst new file mode 100644 index 00000000..506e7da1 --- /dev/null +++ b/docs/source/theiacov_workflows.rst @@ -0,0 +1,347 @@ +====================== +TheiaCoV Workflow Series +====================== + +The TheiaCoV Workflow Series is a collection of WDL workflows developed for performing genomic characterization and genomic epidemiology of SARS-CoV-2 samples to support public health decision-making. + +TheiaCoV Workflows for Genomic Characterization +-------------------------------------------- +Genomic characterization, *i.e.* generating consensus assemblies (FASTA format) from next-generation sequencing (NGS) read data (FASTQ format) to assign samples with relevant nomenclature designation (e.g. PANGO lineage and NextClade clades) is an increasingly critical function to public health laboratories around the world. 
+ +The TheiaCoV Genomic Characterization Series includes four separate WDL workflows (TheiaCoV_Illumina_PE, TheiaCoV_Illumina_SE, TheiaCoV_ClearLabs, and TheiaCoV_ONT) that process NGS read data from four different sequencing approaches (Illumina paired-end, Illumina single-end, Clear Labs, and Oxford Nanopore Technology (ONT)) to generate consensus assemblies, produce relevant quality-control metrics for both the input read data and the generated assembly, and assign samples with a lineage and clade designation using Pangolin and NextClade, respectively. + +All four TheiaCoV workflows for genomic characterization will generate a viral assembly by mapping input read data to a reference genome, removing primer reads from that alignment, and then calling the consensus assembly based on the primer-trimmed alignment. These consensus assemblies are then fed into the Pangolin and NextClade CLI tools for lineage and clade assignments. + +The major difference between each of these TheiaCoV Genomic Characterization workflows is in how the read mapping, primer trimming, and consensus genome calling are performed. More information on the technical details of these processes and information on how to utilize and apply these workflows for public health investigations is available below. + +A fifth WDL workflow, TheiaCoV_FASTA, was added to take in assembled SC2 genomes, perform basic QC (e.g. number of Ns), and assign samples with a lineage and clade designation using Pangolin and NextClade, respectively. + +A series of introductory training videos that provide conceptual overviews of methodologies and walkthrough tutorials on how to utilize these TheiaCoV workflows through Terra are available on the Theiagen Genomics YouTube page: + +.. raw:: html + + + +| + +.. note:: Titan workflows in the video have since been renamed to TheiaCoV. + +TheiaCoV_Illumina_PE +====================== +The TheiaCoV_Illumina_PE workflow was written to process Illumina paired-end (PE) read data. Input reads are assumed to be the product of sequencing tiled PCR-amplicons designed for the SARS-CoV-2 genome. The most common read data analyzed by the TheiaCoV_Illumina_PE workflow are generated with the Artic V3 protocol. Alternative primer schemes such as the Qiaseq Primer Panel, the Swift Amplicon SARS-CoV-2 Panel, and the Artic V4 Amplicon Sequencing Panel, however, can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the raw paired-end Illumina read data in BED and FASTQ file formats, respectively. + +.. note:: + By default, this workflow will assume that input reads were generated using a 300-cycle kit (i.e. 2 x 150 bp reads). Modifications to the optional parameter for trimmomatic_minlen may be required to accommodate shorter read data, such as 2 x 75bp reads generated using a 150-cycle kit. + +Upon initiating a TheiaCoV_Illumina_PE job, the input primer scheme coordinates and raw paired-end Illumina read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the TheiaCoV_Illumina_PE data workflow below. +
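For orientation, the sketch below shows one hypothetical way a TheiaCoV_Illumina_PE job could be launched locally with Cromwell. It assumes the workflow block in wf_theiacov_illumina_pe.wdl is named theiacov_illumina_pe (matching the input tables in this section) and that a cromwell.jar is available; the sample name, read files, and primer BED path are placeholders. On Terra, the same inputs are typically supplied through the workspace data table rather than a JSON file.

.. code-block:: bash

   # Hypothetical local launch of TheiaCoV_Illumina_PE; all paths and names are placeholders.
   cat > theiacov_illumina_pe_inputs.json <<'EOF'
   {
     "theiacov_illumina_pe.samplename": "sample01",
     "theiacov_illumina_pe.read1_raw": "sample01_R1.fastq.gz",
     "theiacov_illumina_pe.read2_raw": "sample01_R2.fastq.gz",
     "theiacov_illumina_pe.primer_bed": "artic-v3.primers.bed"
   }
   EOF

   # Run with Cromwell (any WDL 1.0 executor should behave similarly).
   java -jar cromwell.jar run workflows/wf_theiacov_illumina_pe.wdl --inputs theiacov_illumina_pe_inputs.json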
+.. figure:: images/TheiaCoV_Illumina_PE.png + :width: 800 + :alt: TheiaCoV_Illumina_PE workflow + :figclass: align-center + + **TheiaCoV_Illumina_PE Data Workflow** + +Consensus genome assembly with the TheiaCoV_Illumina_PE workflow is performed by first de-hosting read data with the NCBI SRA-Human-Scrubber tool, then trimming low-quality reads with Trimmomatic and removing adapter sequences with BBDuk. These cleaned read data are then aligned to the Wuhan-1 reference genome with BWA to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file using the iVar Trim sub-command. The iVar consensus sub-command is then utilized to generate a consensus assembly in FASTA format. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI's VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. + +More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by TheiaCoV_Illumina_PE are outlined below. + +Required User Inputs +******************** +Download CSV: :download:`TheiaCoV_Illumina_PE_required_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_illumina_pe_required_inputs.csv + :widths: 20, 20, 20, 40 + :header-rows: 1 + +| + +Optional User Inputs +******************** + +Download CSV: :download:`TheiaCoV_Illumina_PE_optional_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_illumina_pe_optional_inputs.csv + :widths: 10, 10, 10, 10, 20 + :header-rows: 1 + +| + +Outputs +******************** +Download CSV: :download:`TheiaCoV_Illumina_PE_default_outputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_illumina_pe_outputs.csv + :widths: 20, 20, 60 + :header-rows: 1 + +| + +TheiaCoV_Illumina_SE +====================== +The TheiaCoV_Illumina_SE workflow was written to process Illumina single-end (SE) read data. Input reads are assumed to be the product of sequencing tiled PCR-amplicons designed for the SARS-CoV-2 genome. The most common read data analyzed by the TheiaCoV_Illumina_SE workflow are generated with the Artic V3 protocol. Alternative primer schemes such as the Qiaseq Primer Panel, however, can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the raw single-end Illumina read data in BED and FASTQ file formats, respectively. + +.. note:: + By default, this workflow will assume that input reads were generated using a 35-cycle kit (i.e. 1 x 35 bp reads). Modifications to the optional parameter for trimmomatic_minlen may be required to accommodate longer read data. + +Upon initiating a TheiaCoV_Illumina_SE job, the input primer scheme coordinates and raw single-end Illumina read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the TheiaCoV_Illumina_SE data workflow below. + +.. figure:: images/TheiaCoV_Illumina_SE.png + :width: 800 + :alt: TheiaCoV_Illumina_SE workflow + :figclass: align-center + + **TheiaCoV_Illumina_SE Data Workflow** + +Consensus genome assembly with the TheiaCoV_Illumina_SE workflow is performed by first trimming low-quality reads with Trimmomatic and removing adapter sequences with BBDuk.
These cleaned read data are then aligned to the Wuhan-1 reference genome with BWA to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file using the iVar Trim sub-command. The iVar consensus sub-command is then utilized to generate a consensus assembly in FASTA format. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI's VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. + +More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by TheiaCoV_Illumina_SE are outlined below. + +Required User Inputs +******************** +Download CSV: :download:`TheiaCoV_Illumina_SE_required_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_illumina_se_required_inputs.csv + :widths: 20, 20, 20, 40 + :header-rows: 1 + +| + +Optional User Inputs +******************** + +Download CSV: :download:`TheiaCoV_Illumina_SE_optional_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_illumina_se_optional_inputs.csv + :widths: 10, 10, 10, 10, 20 + :header-rows: 1 + +| + +Outputs +******************** +Download CSV: :download:`TheiaCoV_Illumina_SE_default_outputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_illumina_se_outputs.csv + :widths: 20, 20, 60 + :header-rows: 1 + +| + +TheiaCoV_ClearLabs +==================== +The TheiaCoV_ClearLabs workflow was written to process ClearLabs WGS read data for SARS-CoV-2 amplicon sequencing. Currently, Clear Labs sequencing is performed with the Artic V3 protocol. If alternative primer schemes such as the Qiaseq Primer Panel, the Swift Amplicon SARS-CoV-2 Panel and the Artic V4 Amplicon Sequencing Panel become available on the platform, these data can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the raw Clear Labs read data in BED and FASTQ file formats, respectively. + +Upon initiating a TheiaCoV_ClearLabs run, input ClearLabs read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the TheiaCoV_ClearLabs data workflow below. + +.. figure:: images/TheiaCoV_ClearLabs.png + :width: 800 + :alt: TheiaCoV_ClearLabs workflow + :figclass: align-center + + **TheiaCoV_ClearLabs Data Workflow** + +Consensus genome assembly with the TheiaCoV_ClearLabs workflow is performed by first de-hosting read data with the NCBI SRA-Human-Scrubber tool, then following the `Artic nCoV-2019 novel coronavirus bioinformatics protocol `. Briefly, input reads are aligned to the Wuhan-1 reference genome with minimap2 to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file and a consensus assembly file is generated using the Artic medaka command, as sketched in the example below. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI's VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. + +.. note:: + Read-trimming is performed on raw read data generated on the ClearLabs instrument and is thus not a required step in the TheiaCoV_ClearLabs workflow.
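As a rough, assumption-laden illustration of the consensus steps described above, the commands below approximate the ARTIC medaka route for a single de-hosted sample; the scheme directory, primer-scheme version, normalisation value, and file names are placeholders, and the workflow's own task definitions remain the source of truth.

.. code-block:: bash

   # Illustrative only: ARTIC-style consensus calling for one de-hosted ClearLabs sample.
   # Scheme directory, scheme version, thread count, and file names are placeholders.
   artic minion --medaka \
     --normalise 200 \
     --threads 8 \
     --scheme-directory /artic-ncov2019/primer_schemes \
     --read-file sample01_dehosted.fastq.gz \
     nCoV-2019/V3 sample01

   # The resulting consensus FASTA (e.g. sample01.consensus.fasta) is what is passed on to Pangolin, NextClade, and VADR.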
+ + +More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by TheiaCoV_ClearLabs are outlined below. + +Required User Inputs +******************** +Download CSV: :download:`TheiaCoV_ClearLabs_required_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_clearlabs_required_inputs.csv + :widths: 20, 20, 20, 40 + :header-rows: 1 + +| + +Optional User Inputs +******************** + +Download CSV: :download:`TheiaCoV_ClearLabs_optional_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_clearlabs_optional_inputs.csv + :widths: 10, 10, 10, 10, 20 + :header-rows: 1 + +| + +Outputs +******************** +Download CSV: :download:`TheiaCoV_ClearLabs_default_outputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_clearlabs_outputs.csv + :widths: 20, 20, 60 + :header-rows: 1 + +| + +TheiaCoV_ONT +============== +The TheiaCoV_ONT workflow was written to process basecalled and demultiplexed Oxford Nanopore Technology (ONT) read data. The most common read data analyzed by the TheiaCoV_ONT workflow are generated with the Artic V3 protocol. Alternative primer schemes such as the Qiaseq Primer Panel, the Swift Amplicon SARS-CoV-2 Panel, and the Artic V4 Amplicon Sequencing Panel, however, can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the basecalled and demultiplexed ONT read data in BED and FASTQ file formats, respectively. + +Upon initiating a TheiaCoV_ONT run, input ONT read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the TheiaCoV_ONT data workflow below. + +.. figure:: images/TheiaCoV_ONT.png + :width: 800 + :alt: TheiaCoV_ONT workflow + :figclass: align-center + + **TheiaCoV_ONT Data Workflow** + +Consensus genome assembly with the TheiaCoV_ONT workflow is performed by first de-hosting read data with the NCBI SRA-Human-Scrubber tool, then following the `Artic nCoV-2019 novel coronavirus bioinformatics protocol `. Briefly, input reads are filtered by size (min-length: 400bp; max-length: 700bp) with the Artic guppyplex command. These size-selected read data are aligned to the Wuhan-1 reference genome with minimap2 to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file and a consensus assembly file is generated using the Artic medaka command. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI's VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. + +More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by TheiaCoV_ONT are outlined below. + +Required User Inputs +******************** +Download CSV: :download:`TheiaCoV_ONT_required_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_ont_required_inputs.csv + :widths: 20, 20, 20, 40 + :header-rows: 1 + +| + +Optional User Inputs +******************** + +Download CSV: :download:`TheiaCoV_ONT_optional_inputs.csv ` + +..
csv-table:: + :file: tables/theiacov_workflows/theiacov_ont_optional_inputs.csv + :widths: 10, 10, 10, 10, 20 + :header-rows: 1 + +| + +Outputs +******************** +Download CSV: :download:`TheiaCoV_ONT_default_outputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_ont_outputs.csv + :widths: 20, 20, 60 + :header-rows: 1 + +| + +TheiaCoV_FASTA +================ +The TheiaCoV_FASTA workflow was written to process SARS-CoV-2 assembly files to infer the quality of the input assembly and assign SARS-CoV-2 lineage and clade types as outlined in the TheiaCoV_FASTA data workflow below. + +.. figure:: images/TheiaCoV_FASTA.png + :width: 800 + :alt: TheiaCoV_FASTA workflow + :figclass: align-center + + **TheiaCoV_FASTA Data Workflow** + +The quality of input SARS-CoV-2 genome assemblies is assessed by the TheiaCoV_FASTA workflow using a series of bash shell scripts. Input assemblies are then used to assign lineage and clade designations with Pangolin and NextClade. NCBI's VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. + +More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by TheiaCoV_FASTA are outlined below. + +Required User Inputs +******************** +Download CSV: :download:`TheiaCoV_FASTA_required_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_fasta_required_inputs.csv + :widths: 20, 20, 20, 40 + :header-rows: 1 + +| + +Optional User Inputs +******************** + +Download CSV: :download:`TheiaCoV_FASTA_optional_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_fasta_optional_inputs.csv + :widths: 10, 10, 10, 10, 20 + :header-rows: 1 + +| + +Outputs +******************** +Download CSV: :download:`TheiaCoV_FASTA_default_outputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_fasta_outputs.csv + :widths: 20, 20, 60 + :header-rows: 1 + +| + +TheiaCoV Workflows for Genomic Epidemiology +--------------------------------------------- + +Genomic Epidemiology, i.e. generating phylogenetic trees from a set of consensus assemblies (FASTA format) to track the spread and evolution of viruses on a local, national or global scale, has been an important methodological approach in the effort to mitigate disease transmission. + +The TheiaCoV Genomic Epidemiology Series contains two separate WDL workflows (TheiaCoV_Augur_Prep and TheiaCoV_Augur_Run) that process a set of viral genomic assemblies to generate phylogenetic trees (JSON format) and metadata files which can be used to assign epidemiological data to each assembly for subsequent analyses. + +The two TheiaCoV workflows for genomic epidemiology must be run sequentially to first prepare the data for phylogenetic analysis and second to generate the phylogenetic trees. More information on the technical details of these processes and information on how to utilize and apply these workflows for public health investigations is available below. + +TheiaCoV_Augur_Prep +===================== +The TheiaCoV_Augur_Prep workflow was written to process consensus assemblies (FASTA format) and the associated metadata in preparation for running the TheiaCoV_Augur_Run workflow. Input assemblies should be of similar quality (percent reference coverage, number of ambiguous bases, etc.). Inputs with highly discordant quality metrics may result in inaccurate inference of genetic relatedness. +
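Because highly discordant assembly quality can distort the inferred relationships, it may be worth pre-screening candidate assemblies before preparing an Augur run. The bash sketch below is a hypothetical pre-filter, not part of TheiaCoV_Augur_Prep itself; the 5,000-base cutoff is an arbitrary example threshold.

.. code-block:: bash

   # Hypothetical pre-screen: flag assemblies with many ambiguous (non-ACGT) bases.
   # MAX_AMBIG is an arbitrary example threshold; adjust to your own QC criteria.
   MAX_AMBIG=5000
   for fasta in assemblies/*.fasta; do
     ambig=$(grep -v '^>' "$fasta" | tr -d 'ACGTacgt\n' | wc -c)
     if [ "$ambig" -gt "$MAX_AMBIG" ]; then
       echo "EXCLUDE $fasta ($ambig ambiguous bases)"
     else
       echo "KEEP    $fasta ($ambig ambiguous bases)"
     fi
   done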
+.. note:: + There must be some sequence diversity in the input set of assemblies to be analyzed. As a rule of thumb, the smaller the input set, the more sequence diversity will be required to make any sort of genomic inference. If a small (~10) set of viral genomic assemblies is used as the input, then it may be necessary to add one significantly divergent assembly. + +Upon initiating a TheiaCoV_Augur_Prep run, input assembly/consensus files and associated metadata will be used to produce the array of assembly/consensus files and the array of metadata files to be used as inputs for the TheiaCoV_Augur_Run workflow. + +Metadata files are prepared with the Augur_Prep workflow by using BASH commands to first de-identify, and then to parse the headers of the input assembly files. + +Required User Inputs +******************** +Download CSV: :download:`TheiaCoV_Augur_Prep_required_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_augur_prep_required_inputs.csv + :widths: 20, 20, 20, 40 + :header-rows: 1 + +| + +TheiaCoV_Augur_Run +==================== +The TheiaCoV_Augur_Run workflow was written to process an array of assembly/consensus files (FASTA format) and an array of sample metadata files (TSV format) using a modified version of The Broad Institute's sarscov2_nextstrain WDL workflow to create an Auspice JSON file; output from the modified sarscov2_nextstrain workflow will also be used to infer SNP distances and create a static PDF report. + +Upon initiating a TheiaCoV_Augur_Run run, the input assembly/consensus file array and the associated metadata file array will be used to generate a JSON file that is compatible with phylogenetic tree building software. This JSON can then be used in Auspice or Nextstrain to view the phylogenetic tree. This phylogenetic tree can be used in genomic epidemiological analysis to visualize the genetic relatedness of a set of samples. The associated metadata can then be used to add context to the phylogenetic visualization. + +Required User Inputs +******************** +Download CSV: :download:`TheiaCoV_Augur_Run_required_inputs.csv ` + +.. csv-table:: + :file: tables/theiacov_workflows/theiacov_augur_run_required_inputs.csv + :widths: 20, 20, 20, 40 + :header-rows: 1 + +| diff --git a/docs/source/titan_workflows.rst b/docs/source/titan_workflows.rst deleted file mode 100644 index 59d87ffd..00000000 --- a/docs/source/titan_workflows.rst +++ /dev/null @@ -1,345 +0,0 @@ -====================== -Titan Workflow Series -====================== - -The Titan Workflow Series is a collection of WDL workflows developed for performing genomic characterization and genomic epidemiology of SARS-CoV-2 samples to support public health decision-making. - -Titan Workflows for Genomic Characterization --------------------------------------------- -Genomic characterization, *i.e.* generating consensus assemblies (FASTA format) from next-generation sequencing (NGS) read data (FASTQ format) to assign samples with relevant nomenclature designation (e.g. PANGO lineage and NextClade clades) is an increasingly critical function to public health laboratories around the world.
- -The Titan Genomic Characterization Series includes four separate WDL workflows (Titan_Illumina_PE, Titan_Illumina_SE, Titan_ClearLabs, and Titan_ONT) that process NGS read data from four different sequencing approaches: Illumina paired-end, Illumina single-end, Clear Labs, and Oxford Nanopore Technology (ONT)) to generate consensus assemblies, produce relevant quality-control metrics for both the input read data and the generated assembly, and assign samples with a lineage and clade designation using Pangolin and NextClade, respectively. - -All four Titan workflows for genomic characterization will generate a viral assembly by mapping input read data to a reference genome, removing primer reads from that alignment, and then calling the consensus assembly based on the primer-trimmed alignment. These consensus assemblies are then fed into the Pangolin and NextClade CLI tools for lineage and clade assignments. - -The major difference between each of these Titan Genomic Characterization workflows is in how the read mapping, primer trimming, and consensus genome calling is performed. More information on the technical details of these processes and information on how to utilize and apply these workflows for public health investigations is available below. - -A fifth WDL workflow, Titan_FASTA, was added to take in assembled SC2 genomes, perform basic QC (e.g. number of Ns), and assign samples with a lineage and clade designation using Pangolin and NextClade, respectively. - -A series of introductory training videos that provide conceptual overviews of methodologies and walkthrough tutorials on how to utilize these Titan workflows through Terra are available on the Theiagen Genomics YouTube page: - -.. raw:: html - - - -| - -Titan_Illumina_PE -================= -The Titan_Illumina_PE workflow was written to process Illumina paired-end (PE) read data. Input reads are assumed to be the product of sequencing tiled PCR-amplicons designed for the SARS-CoV-2 genome. The most common read data analyzed by the Titan_Illumina_PE workflow are generated with the Artic V3 protocol. Alternative primer schemes such as the Qiaseq Primer Panel, the Swift Amplicon SARS-CoV-2 Panel and the Artic V4 Amplicon Sequencing Panel however, can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the raw paired-end Illumina read data in BED and FASTQ file formats, respectively. - -.. note:: - By default, this workflow will assume that input reads were generated using a 300-cycle kit (i.e. 2 x 150 bp reads). Modifications to the optional parameter for trimmomatic_minlen may be required to accommodate for shorter read data, such as 2 x 75bp reads generated using a 150-cycle kit. - -Upon initiating a Titan_Illumina_PE job, the input primer scheme coordinates and raw paired-end Illumina read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the Titan_Illumina_PE data workflow below. - -.. figure:: images/Titan_Illumina_PE.png - :width: 800 - :alt: Titan_Illumina_PE workflow - :figclass: align-center - - **Titan_Illumina_PE Data Workflow** - -Consensus genome assembly with the Titan_Illumina_PE workflow is performed by first de-hosting read data with the NCBI SRA-Human-Scrubber tool then trimming low-quality reads with Trimmomatic and removing adapter sequences with BBDuk. 
These cleaned read data are then aligned to the Wuhan-1 reference genome with BWA to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file using the iVar Trim sub-command. The iVar consensus sub-command is then utilized to generate a consensus assembly in FASTA format. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI’S VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. - -More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by Titan_Illumina_PE are outlined below. - -Required User Inputs -******************** -Download CSV: :download:`Titan_Illumina_PE_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_illumina_pe_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 - -| - -Optional User Inputs -******************** - -Download CSV: :download:`Titan_Illumina_PE_optional_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_illumina_pe_optional_inputs.csv - :widths: 10, 10, 10, 10, 20 - :header-rows: 1 - -| - -Outputs -******************** -Download CSV: :download:`Titan_Illumina_PE_default_outputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_illumina_pe_outputs.csv - :widths: 20, 20, 60 - :header-rows: 1 - -| - -Titan_Illumina_SE -================= -The Titan_Illumina_SE workflow was written to process Illumina single-end (SE) read data. Input reads are assumed to be the product of sequencing tiled PCR-amplicons designed for the SARS-CoV-2 genome. The most common read data analyzed by the Titan_Illumina_SE workflow are generated with the Artic V3 protocol. Alternative primer schemes such as the Qiaseq Primer Panel, however, can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the raw paired-end Illumina read data in BED and FASTQ file formats, respectively. - -.. note:: - By default, this workflow will assume that input reads were generated using a 35-cycle kit (i.e. 1 x 35 bp reads). Modifications to the optional parameter for trimmomatic_minlen may be required to accommodate for longer read data. - -Upon initiating a Titan_Illumina_SE job, the input primer scheme coordinates and raw paired-end Illumina read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the Titan_Illumina_PE data workflow below. - -.. figure:: images/Titan_Illumina_SE.png - :width: 800 - :alt: Titan_Illumina_SE workflow - :figclass: align-center - - **Titan_Illumina_SE Data Workflow** - -Consensus genome assembly with the Titan_Illumina_SE workflow is performed by first trimming low-quality reads with Trimmomatic and removing adapter sequences with BBDuk. These cleaned read data are then aligned to the Wuhan-1 reference genome with BWA to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file using the iVar Trim sub-command. The iVar consensus sub-command is then utilized to generate a consensus assembly in FASTA format. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI’S VADR tool is also employed to screen for potentially errant features (e.g. 
erroneous frame-shift mutations) in the consensus assembly. - -More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by Titan_Illumina_SE are outlined below. - -Required User Inputs -******************** -Download CSV: :download:`Titan_Illumina_SE_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_illumina_se_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 - -| - -Optional User Inputs -******************** - -Download CSV: :download:`Titan_Illumina_SE_optional_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_illumina_se_optional_inputs.csv - :widths: 10, 10, 10, 10, 20 - :header-rows: 1 - -| - -Outputs -******************** -Download CSV: :download:`Titan_Illumina_SE_default_outputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_illumina_se_outputs.csv - :widths: 20, 20, 60 - :header-rows: 1 - -| - -Titan_ClearLabs -================= -The Titan_ClearLabs workflow was written to process ClearLabs WGS read data for SARS-CoV-2 amplicon sequencing. Currently, Clear Labs sequencing is performed with the Artic V3 protocol. If alternative primer schemes such as the Qiaseq Primer Panel, the Swift Amplicon SARS-CoV-2 Panel and the Artic V4 Amplicon Sequencing Panel become avaialble on the platform, these data can can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the raw Clear Labs read data must be provided in BED and FASTQ file formats, respectively. - -Upon initiating a Titan_ClearLabs run, input ClearLabs read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the Titan_ClearLabs data workflow below. - -.. figure:: images/Titan_ClearLabs.png - :width: 800 - :alt: Titan_ClearLabs workflow - :figclass: align-center - - **Titan_ClearLabs Data Workflow** - -Consensus genome assembly with the Titan_ClearLabs workflow is performed by first de-hosting read data with the NCBI SRA-Human-Scrubber tool then following the `Artic nCoV-2019 novel coronavirs bioinformatics protocol `. Briefly, input reads are aligned to the Wuhan-1 reference genome with minimap2 to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file and a consensus assembly file is generated using the Artic medaka command. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI’S VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. - -.. note:: - Read-trimming is performed on raw read data generated on the ClearLabs instrument and thus not a required step in the Titan_ClearLabs workflow. - - -More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by Titan_CLearLabs are outlined below. - -Required User Inputs -******************** -Download CSV: :download:`Titan_ClearLabs_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_clearlabs_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 - -| - -Optional User Inputs -******************** - -Download CSV: :download:`Titan_ClearLabs_optional_inputs.csv ` - -.. 
csv-table:: - :file: tables/titan_workflows/titan_clearlabs_optional_inputs.csv - :widths: 10, 10, 10, 10, 20 - :header-rows: 1 - -| - -Outputs -******************** -Download CSV: :download:`Titan_ClearLabs_default_outputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_clearlabs_outputs.csv - :widths: 20, 20, 60 - :header-rows: 1 - -| - -Titan_ONT -========= -The Titan_ONT workflow was written to process basecalled and demultiplexed Oxford Nanopore Technology (ONT) read data. The most common read data analyzed by the Titan_ONT workflow are generated with the Artic V3 protocol. Alternative primer schemes such as the Qiaseq Primer Panel, the Swift Amplicon SARS-CoV-2 Panel and the Artic V4 Amplicon Sequencing Panel however, can also be analysed with this workflow since the primer sequence coordinates of the PCR scheme utilized must be provided along with the raw paired-end Illumina read data in BED and FASTQ file formats, respectively. - -Upon initiating a Titan_ONT run, input ONT read data provided for each sample will be processed to perform consensus genome assembly, infer the quality of both raw read data and the generated consensus genome, and assign SARS-CoV-2 lineage and clade types as outlined in the Titan_ONT data workflow below. - -.. figure:: images/Titan_ONT.png - :width: 800 - :alt: Titan_ONT workflow - :figclass: align-center - - **Titan_ONT Data Workflow** - -Consensus genome assembly with the Titan_ONT workflow is performed performed by first de-hosting read data with the NCBI SRA-Human-Scrubber tool then following then following `Artic nCoV-2019 novel coronavirs bioinformatics protocol `. Briefly, input reads are filtered by size (min-length: 400bp; max-length: 700bp) with the Aritc guppyplex command. These size-selected read data are aligned to the Wuhan-1 reference genome with minimap2 to generate a Binary Alignment Mapping (BAM) file. Primer sequences are then removed from the BAM file and a consensus assembly file is generated using the Artic medaka command. This assembly is then used to assign lineage and clade designations with Pangolin and NextClade. NCBI’S VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. - -More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by Titan_ONT are outlined below. - -Required User Inputs -******************** -Download CSV: :download:`Titan_ONT_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_ont_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 - -| - -Optional User Inputs -******************** - -Download CSV: :download:`Titan_ONT_optional_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_ont_optional_inputs.csv - :widths: 10, 10, 10, 10, 20 - :header-rows: 1 - -| - -Outputs -******************** -Download CSV: :download:`Titan_ONT_default_outputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_ont_outputs.csv - :widths: 20, 20, 60 - :header-rows: 1 - -| - -Titan_FASTA -=========== -The Titan_FASTA workflow was written to process SARS-CoV-2 assembly files to infer the quality of the input assembly and assign SARS-CoV-2 lineage and clade types as outlined in the Titan_FASTA data workflow below. - -.. 
figure:: images/Titan_FASTA.png - :width: 800 - :alt: Titan_FASTA workflow - :figclass: align-center - - **Titan_FASTA Data Workflow** - -The quality of input SARS-CoV-2 genome assemblies are assessed by the Titan_FASTA workflow using a series of bash shell scripts. Input assemblies are then used to assign lineage and clade designations with Pangolin and NextClade. NCBI’S VADR tool is also employed to screen for potentially errant features (e.g. erroneous frame-shift mutations) in the consensus assembly. - -More information on required user inputs, optional user inputs, default tool parameters and the outputs generated by Titan_FASTA are outlined below. - -Required User Inputs -******************** -Download CSV: :download:`Titan_FASTA_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_fasta_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 - -| - -Optional User Inputs -******************** - -Download CSV: :download:`Titan_FASTA_optional_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_fasta_optional_inputs.csv - :widths: 10, 10, 10, 10, 20 - :header-rows: 1 - -| - -Outputs -******************** -Download CSV: :download:`Titan_FASTA_default_outputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_fasta_outputs.csv - :widths: 20, 20, 60 - :header-rows: 1 - -| - -Titan Workflows for Genomic Epidemiology ----------------------------------------- - -Genomic Epidemiology, i.e. generating phylogenetic trees from a set of consensus assemblies (FASTA format) to track the spread and evolution of viruses on a local, national or global scale, has been an important methodological approach in the effort to mitigate disease transmission. - -The Titan Genomic Epidemiology Series contains two seperate WDL workflows (Titan_Augur_Prep and Titan_Augur_Run) that process a set of viral genomic assemblies to generate phylogenetic trees (JSON format) and metadata files which can be used to assign epidemiological data to each assembly for subsequent analyses. - -The two Titan workflows for genomic epidemiology must be run sequentially to first prepare the data for phylogenetic analysis and second to generate the phylogenetic trees. More information on the technical details of these processes and information on how to utilize and apply these workflows for public health investigations is available below. - -Required User Inputs -******************** -Download CSV: :download:`Titan_Augur_Prep_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_augur_prep_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 -| - -Titan_Augur_Prep -================ -The Titan_Augur_Prep workflow was written to process consensus assemblies (FASTA format) and the associated metadata in preparation for running the Titan_Augur_Run. Input assemblies should be of similar quality (percent reference coverage, number of ambiguous bases, etc.). Inputs with highly discordant quality metrics may result in innacurate inference of genetic relatedness. - -.. note:: - There must be some sequence diversity in the input set of assemblies to be analyzed. As a rule of thumb, the smaller the input set, the more sequence diversity will be required to make any sort of genomic inference. If a small (~10) set of viral genomic assemblies is used as the input then it may be necessary to add one significantly divergent assembly. 
- -Upon initiating a Titan_Augur_Prep run, input assembly/consensus files and associated metadata will be used to produce the array of assembly/consensus files and the array of metadata files to be used as inputs for the Titan_Augur_Run workflow. - -Metadata files are prepared with the Augur_Prep workflow by using BASH commands to first de-identify and then parse the headers of the input assembly files. - -Required User Inputs -******************** -Download CSV: :download:`Titan_Augur_Prep_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_augur_prep_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 -| - -Titan_Augur_Run -=============== -The Titan_Augur_Run workflow was written to process an array of assembly/consensus files (FASTA format) and an array of sample metadata files (TSV format) using a modified version of The Broad Institute's sarscov2_nextstrain WDL workflow to create an Auspice JSON file; output from the modified sarscov2_nextstrain workflow will also be used to infer SNP distances and create a static PDF report. - -Upon initiating a Titan_Augur_Run run, the input assembly/consensus file array and the associated metadata file array will be used to generate a JSON file that is compatible with phylogenetic tree-building software. This JSON can then be used in Auspice or Nextstrain to view the phylogenetic tree. This phylogenetic tree can be used in genomic epidemiological analysis to visualize the genetic relatedness of a set of samples. The associated metadata can then be used to add context to the phylogenetic visualization. - -Required User Inputs -******************** -Download CSV: :download:`Titan_Augur_Run_required_inputs.csv ` - -.. csv-table:: - :file: tables/titan_workflows/titan_augur_run_required_inputs.csv - :widths: 20, 20, 20, 40 - :header-rows: 1 - -| diff --git a/tasks/task_alignment.wdl b/tasks/task_alignment.wdl index 4c7307ad..76c740b6 100644 --- a/tasks/task_alignment.wdl +++ b/tasks/task_alignment.wdl @@ -1,85 +1,81 @@ version 1.0 task bwa { - input { - File read1 - File? read2 - String samplename - File? reference_genome - Int? cpus=6 + File read1 + File? read2 + String samplename + File? reference_genome + Int cpu = 6 } - command <<< # date and version control date | tee DATE echo "BWA $(bwa 2>&1 | grep Version )" | tee BWA_VERSION samtools --version | head -n1 | tee SAMTOOLS_VERSION - + # set reference genome if [[ ! 
-z "~{reference_genome}" ]]; then echo "User reference identified; ~{reference_genome} will be utilized for alignement" + ref_genome="~{reference_genome}" + bwa index "~{reference_genome}" # move to primer_schemes dir; bwa fails if reference file not in this location - cp "~{reference_genome}" "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" + else + ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" fi # Map with BWA MEM - echo "Running bwa mem -t ~{cpus} bwa_reference.bwa ~{read1} ~{read2} | samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam " + echo "Running bwa mem -t ~{cpu} ${ref_genome} ~{read1} ~{read2} | samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam " bwa mem \ - -t ~{cpus} \ - "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" \ + -t ~{cpu} \ + "${ref_genome}" \ ~{read1} ~{read2} |\ samtools sort | samtools view -F 4 -o ~{samplename}.sorted.bam # index BAMs samtools index ~{samplename}.sorted.bam >>> - output { - String bwa_version = read_string("BWA_VERSION") - String sam_version = read_string("SAMTOOLS_VERSION") - File sorted_bam = "${samplename}.sorted.bam" - File sorted_bai = "${samplename}.sorted.bam.bai" + String bwa_version = read_string("BWA_VERSION") + String sam_version = read_string("SAMTOOLS_VERSION") + File sorted_bam = "${samplename}.sorted.bam" + File sorted_bai = "${samplename}.sorted.bam.bai" } - runtime { - docker: "quay.io/staphb/ivar:1.3.1-titan" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 + docker: "quay.io/staphb/ivar:1.3.1-titan" + memory: "8 GB" + cpu: cpu + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task mafft { - input { - Array[File] genomes - String? cpus = 16 + Array[File] genomes + Int cpu = 16 } - - command{ + command <<< # date and version control date | tee DATE mafft_vers=$(mafft --version) echo Mafft $(mafft_vers) | tee VERSION - cat ${sep=" " genomes} | sed 's/Consensus_//;s/.consensus_threshold.*//' > assemblies.fasta - mafft --thread -${cpus} assemblies.fasta > msa.fasta - } - + cat ~{sep=" " genomes} | sed 's/Consensus_//;s/.consensus_threshold.*//' > assemblies.fasta + mafft --thread -~{cpu} assemblies.fasta > msa.fasta + >>> output { - String date = read_string("DATE") - String version = read_string("VERSION") - File msa = "msa.fasta" + String date = read_string("DATE") + String version = read_string("VERSION") + File msa = "msa.fasta" } - runtime { - docker: "quay.io/staphb/mafft:7.450" - memory: "32 GB" - cpu: 16 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/mafft:7.450" + memory: "32 GB" + cpu: cpu + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } diff --git a/tasks/task_assembly_metrics.wdl b/tasks/task_assembly_metrics.wdl index 1a4b8d26..895ee0b3 100644 --- a/tasks/task_assembly_metrics.wdl +++ b/tasks/task_assembly_metrics.wdl @@ -1,14 +1,12 @@ -version 1.0 +version 1.0 task stats_n_coverage { - input { - File bamfile - String samplename - Int s_gene_start=21563 - Int s_gene_stop=25384 + File bamfile + String samplename + Int s_gene_start = 21563 + Int s_gene_stop = 25384 } - command <<< date | tee DATE samtools --version | head -n1 | tee VERSION @@ -18,13 +16,13 @@ task stats_n_coverage { samtools coverage ~{bamfile} -m -o ~{samplename}.cov.hist samtools coverage ~{bamfile} -o ~{samplename}.cov.txt samtools flagstat ~{bamfile} > ~{samplename}.flagstat.txt - + coverage=$(cut -f 6 ~{samplename}.cov.txt | tail -n 1) 
depth=$(cut -f 7 ~{samplename}.cov.txt | tail -n 1) meanbaseq=$(cut -f 8 ~{samplename}.cov.txt | tail -n 1) meanmapq=$(cut -f 9 ~{samplename}.cov.txt | tail -n 1) - - samtools index ~{bamfile} + + samtools index ~{bamfile} chr=$(samtools idxstats ~{bamfile} | cut -f 1 | head -1) samtools coverage -r "${chr}:~{s_gene_start}-~{s_gene_stop}" ~{bamfile} >> ~{samplename}.cov.txt s_gene_depth=$(cut -f 7 ~{samplename}.cov.txt | tail -n 1) @@ -36,32 +34,30 @@ task stats_n_coverage { if [ -z "$meanmapq" ] ; then meanmapq="0" ; fi echo $coverage | tee COVERAGE - echo $depth | tee DEPTH + echo $depth | tee DEPTH echo $s_gene_depth | tee S_GENE_DEPTH - echo $meanbaseq | tee MEANBASEQ - echo $meanmapq | tee MEANMAPQ + echo $meanbaseq | tee MEANBASEQ + echo $meanmapq | tee MEANMAPQ >>> - output { - String date = read_string("DATE") - String samtools_version = read_string("VERSION") - File stats = "~{samplename}.stats.txt" - File cov_hist = "~{samplename}.cov.hist" - File cov_stats = "~{samplename}.cov.txt" - File flagstat = "~{samplename}.flagstat.txt" - Float coverage = read_string("COVERAGE") - Float depth = read_string("DEPTH") - Float s_gene_depth = read_string("S_GENE_DEPTH") - Float meanbaseq = read_string("MEANBASEQ") - Float meanmapq = read_string("MEANMAPQ") + String date = read_string("DATE") + String samtools_version = read_string("VERSION") + File stats = "~{samplename}.stats.txt" + File cov_hist = "~{samplename}.cov.hist" + File cov_stats = "~{samplename}.cov.txt" + File flagstat = "~{samplename}.flagstat.txt" + Float coverage = read_string("COVERAGE") + Float depth = read_string("DEPTH") + Float s_gene_depth = read_string("S_GENE_DEPTH") + Float meanbaseq = read_string("MEANBASEQ") + Float meanmapq = read_string("MEANMAPQ") } - runtime { - docker: "quay.io/staphb/samtools:1.10" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/samtools:1.10" + memory: "8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } diff --git a/tasks/task_consensus_call.wdl b/tasks/task_consensus_call.wdl index 3845044e..f2a4bbb8 100644 --- a/tasks/task_consensus_call.wdl +++ b/tasks/task_consensus_call.wdl @@ -1,183 +1,199 @@ version 1.0 task primer_trim { - input { - File bamfile - String samplename - File primer_bed - Boolean? keep_noprimer_reads=true + File bamfile + String samplename + File primer_bed + Boolean? 
keep_noprimer_reads = true } String primer_name = basename(primer_bed) - - command { + command <<< # date and version control - echo "${primer_name}" | tee PRIMER_NAME + echo "~{primer_name}" | tee PRIMER_NAME date | tee DATE ivar version | head -n1 | tee IVAR_VERSION samtools --version | head -n1 | tee SAMTOOLS_VERSION # trimming primers ivar trim \ - ${true="-e" false="" keep_noprimer_reads} \ - -i ${bamfile} \ - -b ${primer_bed} \ - -p ${samplename}.primertrim | tee IVAR_OUT + ~{true="-e" false="" keep_noprimer_reads} \ + -i ~{bamfile} \ + -b ~{primer_bed} \ + -p ~{samplename}.primertrim | tee IVAR_OUT # sorting and indexing the trimmed bams samtools sort \ - ${samplename}.primertrim.bam \ - -o ${samplename}.primertrim.sorted.bam + ~{samplename}.primertrim.bam \ + -o ~{samplename}.primertrim.sorted.bam - samtools index ${samplename}.primertrim.sorted.bam + samtools index ~{samplename}.primertrim.sorted.bam PCT=$(grep "Trimmed primers from" IVAR_OUT | perl -lape 's/Trimmed primers from (\S+)%.*/$1/') echo $PCT if [[ $PCT = -* ]]; then echo 0; else echo $PCT; fi > IVAR_TRIM_PCT - } - + >>> output { - File trimmed_bam = "${samplename}.primertrim.bam" - File trim_sorted_bam = "${samplename}.primertrim.sorted.bam" - File trim_sorted_bai = "${samplename}.primertrim.sorted.bam.bai" + File trimmed_bam = "~{samplename}.primertrim.bam" + File trim_sorted_bam = "~{samplename}.primertrim.sorted.bam" + File trim_sorted_bai = "~{samplename}.primertrim.sorted.bam.bai" String ivar_version = read_string("IVAR_VERSION") String samtools_version = read_string("SAMTOOLS_VERSION") String pipeline_date = read_string("DATE") - Float primer_trimmed_read_percent = read_float("IVAR_TRIM_PCT") + Float primer_trimmed_read_percent = read_float("IVAR_TRIM_PCT") String primer_bed_name = read_string("PRIMER_NAME") } - runtime { - docker: "quay.io/staphb/ivar:1.3.1-titan" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/ivar:1.3.1-titan" + memory:"8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task variant_call { - input { - File bamfile - String samplename - String? ref_genome = "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" - String? ref_gff = "/reference/GCF_009858895.2_ASM985889v3_genomic.gff" + File bamfile + String samplename + File? reference_genome + File? reference_gff Boolean? count_orphans = true - Int? max_depth = "600000" + Int? max_depth = "600000" Boolean? disable_baq = true - Int? min_bq = "0" - Int? min_qual = "20" - Float? min_freq = "0.6" - Int? min_depth = "10" + Int? min_bq = "0" + Int? min_qual = "20" + Float? min_freq = "0.6" + Int? min_depth = "100" } - - command { + command <<< # date and version control date | tee DATE ivar version | head -n1 | tee IVAR_VERSION samtools --version | head -n1 | tee SAMTOOLS_VERSION + # set reference genome + if [[ ! -z "~{reference_genome}" ]]; then + echo "User reference identified; ~{reference_genome} will be utilized for alignement" + ref_genome="~{reference_genome}" + bwa index "~{reference_genome}" + # move to primer_schemes dir; bwa fails if reference file not in this location + else + ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" + fi + + # set reference gff + if [[ ! 
-z "~{reference_gff}" ]]; then + echo "User reference identified; ~{reference_genome} will be utilized for alignement" + ref_gff="~{reference_gff}" + # move to primer_schemes dir; bwa fails if reference file not in this location + else + ref_gff="/reference/GCF_009858895.2_ASM985889v3_genomic.gff" + fi + # call variants samtools mpileup \ - ${true = "-A" false = "" count_orphans} \ - -d ${max_depth} \ - ${true = "-B" false = "" disable_baq} \ - -Q ${min_bq} \ + ~{true = "-A" false = "" count_orphans} \ + -d ~{max_depth} \ + ~{true = "-B" false = "" disable_baq} \ + -Q ~{min_bq} \ --reference ${ref_genome} \ - ${bamfile} | \ + ~{bamfile} | \ ivar variants \ - -p ${samplename}.variants \ - -q ${min_qual} \ - -t ${min_freq} \ - -m ${min_depth} \ + -p ~{samplename}.variants \ + -q ~{min_qual} \ + -t ~{min_freq} \ + -m ~{min_depth} \ -r ${ref_genome} \ -g ${ref_gff} # Convert TSV to VCF - ivar_variants_to_vcf.py ${samplename}.variants.tsv ${samplename}.variants.vcf + ivar_variants_to_vcf.py ~{samplename}.variants.tsv ~{samplename}.variants.vcf - variants_num=$(grep "TRUE" ${samplename}.variants.tsv | wc -l) + variants_num=$(grep "TRUE" ~{samplename}.variants.tsv | wc -l) if [ -z "$variants_num" ] ; then variants_num="0" ; fi echo $variants_num | tee VARIANT_NUM - } - + >>> output { - Int variant_num = read_string("VARIANT_NUM") - File sample_variants_tsv = "${samplename}.variants.tsv" - File sample_variants_vcf = "${samplename}.variants.vcf" - String ivar_version = read_string("IVAR_VERSION") - String samtools_version = read_string("SAMTOOLS_VERSION") - String pipeline_date = read_string("DATE") + Int variant_num = read_string("VARIANT_NUM") + File sample_variants_tsv = "~{samplename}.variants.tsv" + File sample_variants_vcf = "~{samplename}.variants.vcf" + String ivar_version = read_string("IVAR_VERSION") + String samtools_version = read_string("SAMTOOLS_VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "quay.io/staphb/ivar:1.3.1-titan" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/ivar:1.3.1-titan" + memory: "8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task consensus { - input { - File bamfile - String samplename - String? ref_genome = "/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" - String? ref_gff = "/reference/GCF_009858895.2_ASM985889v3_genomic.gff" - Boolean? count_orphans = true - Int? max_depth = "600000" - Boolean? disable_baq = true - Int? min_bq = "0" - Int? min_qual = "20" - Float? min_freq = "0.6" - Int? min_depth = "10" - String? char_unknown = "N" + File bamfile + String samplename + File? reference_genome + Boolean? count_orphans = true + Int? max_depth = "600000" + Boolean? disable_baq = true + Int? min_bq = "0" + Int? min_qual = "20" + Float? min_freq = "0.6" + Int? min_depth = "100" + String? char_unknown = "N" } - - command { + command <<< # date and version control date | tee DATE ivar version | head -n1 | tee IVAR_VERSION samtools --version | head -n1 | tee SAMTOOLS_VERSION + # set reference genome + if [[ ! 
-z "~{reference_genome}" ]]; then + echo "User reference identified; ~{reference_genome} will be utilized for alignement" + ref_genome="~{reference_genome}" + bwa index "~{reference_genome}" + # move to primer_schemes dir; bwa fails if reference file not in this location + else + ref_genome="/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" + fi + # call consensus samtools mpileup \ - ${true = "--count-orphans" false = "" count_orphans} \ - -d ${max_depth} \ - ${true = "--no-BAQ" false = "" disable_baq} \ - -Q ${min_bq} \ + ~{true = "--count-orphans" false = "" count_orphans} \ + -d ~{max_depth} \ + ~{true = "--no-BAQ" false = "" disable_baq} \ + -Q ~{min_bq} \ --reference ${ref_genome} \ - ${bamfile} | \ + ~{bamfile} | \ ivar consensus \ - -p ${samplename}.consensus \ - -q ${min_qual} \ - -t ${min_freq} \ - -m ${min_depth} \ - -n ${char_unknown} + -p ~{samplename}.consensus \ + -q ~{min_qual} \ + -t ~{min_freq} \ + -m ~{min_depth} \ + -n ~{char_unknown} # clean up fasta header - echo ">${samplename}" > ${samplename}.ivar.consensus.fasta - grep -v ">" ~{samplename}.consensus.fa >> ${samplename}.ivar.consensus.fasta - } - + echo ">~{samplename}" > ~{samplename}.ivar.consensus.fasta + grep -v ">" ~{samplename}.consensus.fa >> ~{samplename}.ivar.consensus.fasta + >>> output { - File consensus_seq = "${samplename}.ivar.consensus.fasta" - String ivar_version = read_string("IVAR_VERSION") - String samtools_version = read_string("SAMTOOLS_VERSION") - String pipeline_date = read_string("DATE") + File consensus_seq = "~{samplename}.ivar.consensus.fasta" + String ivar_version = read_string("IVAR_VERSION") + String samtools_version = read_string("SAMTOOLS_VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "quay.io/staphb/ivar:1.3.1-titan" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/ivar:1.3.1-titan" + memory: "8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } diff --git a/tasks/task_data_vis.wdl b/tasks/task_data_vis.wdl deleted file mode 100644 index a0e403be..00000000 --- a/tasks/task_data_vis.wdl +++ /dev/null @@ -1,64 +0,0 @@ -version 1.0 - -task cluster_render { - - input { - File snp_matrix - File ml_tree - String cluster_name - File? render_template - } - - command{ - # date and version control - date | tee DATE - Rscript --version | tee RSCRIPT_VERSION - R --version | head -n1 | sed 's/).*/)/' | tee R_VERSION - - cp ${snp_matrix} snp_matrix.tsv - cp ${ml_tree} ml_tree.tree - if ! [[ -z "${render_template}" ]]; then cp ${render_template} render_template.Rmd; - else cp /reports/sc2_report_template.Rmd render_template.Rmd; fi - - R --no-save < "~{out_base}.vadr.alerts.tsv" cat "~{out_base}.vadr.alerts.tsv" | wc -l > NUM_ALERTS - + else echo "VADR skipped due to poor assembly; assembly length (unambiguous) = ~{assembly_length_unambiguous}" > NUM_ALERTS @@ -48,8 +47,8 @@ task vadr { >>> output { - File? feature_tbl = "~{out_base}/~{out_base}.vadr.pass.tbl" - String num_alerts = read_string("NUM_ALERTS") + File? feature_tbl = "~{out_base}/~{out_base}.vadr.pass.tbl" + String num_alerts = read_string("NUM_ALERTS") File? alerts_list = "~{out_base}/~{out_base}.vadr.alt.list" File? 
outputs_tgz = "~{out_base}.vadr.tar.gz" String vadr_docker = docker diff --git a/tasks/task_nextclade_output_parser.wdl b/tasks/task_nextclade_output_parser.wdl deleted file mode 100644 index edbda4fe..00000000 --- a/tasks/task_nextclade_output_parser.wdl +++ /dev/null @@ -1,43 +0,0 @@ -version 1.0 - -task nextclade_output_parser_one_sample { - meta { - description: "Python and bash codeblocks for parsing the output files from Nextclade." - } - input { - File nextclade_tsv - String docker = "quay.io/theiagen/utility:1.1" - } - command { - python3 < NEXTCLADE_CLADE - grep ^aaSubstitutions transposed.tsv | cut -f 2 | grep -v aaSubstitutions | sed 's/,/|/g' > NEXTCLADE_AASUBS - grep ^aaDeletions transposed.tsv | cut -f 2 | grep -v aaDeletions | sed 's/,/|/g' > NEXTCLADE_AADELS - fi - } - runtime { - docker: "~{docker}" - memory: "4 GB" - cpu: 2 - disks: "local-disk 50 HDD" - dx_instance_type: "mem1_ssd1_v2_x2" - } - output { - String nextclade_clade = read_string("NEXTCLADE_CLADE") - String nextclade_aa_subs = read_string("NEXTCLADE_AASUBS") - String nextclade_aa_dels = read_string("NEXTCLADE_AADELS") - } -} diff --git a/tasks/task_ont_medaka.wdl b/tasks/task_ont_medaka.wdl index 6381989b..9bbfcab3 100644 --- a/tasks/task_ont_medaka.wdl +++ b/tasks/task_ont_medaka.wdl @@ -1,93 +1,82 @@ version 1.0 - task demultiplexing { - input { Array[File] basecalled_reads - String? run_prefix="artic_ncov2019" - Int? normalise=200 - Int? cpu=8 - } - - command{ - guppy_barcoder -t \$cpus --require_barcodes_both_ends -i . -s . --arrangements_files "barcode_arrs_nb12.cfg barcode_arrs_nb24.cfg barcode_arrs_nb96.cfg" -q 0 -r - + String? run_prefix = "artic_ncov2019" + Int? normalise = 200 + Int? cpu = 8 } + command <<< + guppy_barcoder -t \~cpu --require_barcodes_both_ends -i . -s . --arrangements_files "barcode_arrs_nb12.cfg barcode_arrs_nb24.cfg barcode_arrs_nb96.cfg" -q 0 -r + >>> output { - Array[File] demultiplexed_reads = glob("*.fq.gz") + Array[File] demultiplexed_reads = glob("*.fq.gz") } - runtime { - docker: "genomicpariscentre/guppy" - memory: "16 GB" - cpu: 8 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "genomicpariscentre/guppy" + memory: "16 GB" + cpu: 8 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task read_filtering { - input { - File demultiplexed_reads - String samplename - String? run_prefix="artic_ncov2019" - Int? min_length=400 - Int? max_length=700 - Int? cpu=8 + File demultiplexed_reads + String samplename + String? run_prefix = "artic_ncov2019" + Int? min_length = 400 + Int? max_length = 700 + Int? 
cpu = 8 } - - command{ + command <<< # date and version control mkdir ~{samplename} cp ~{demultiplexed_reads} ~{samplename}/ echo "DIRNAME: $(dirname)" - artic guppyplex --min-length ${min_length} --max-length ${max_length} --directory ~{samplename} --prefix ${run_prefix} - - } + artic guppyplex --min-length ~{min_length} --max-length ~{max_length} --directory ~{samplename} --prefix ~{run_prefix} + >>> output { - File filtered_reads = "${run_prefix}_~{samplename}.fastq" + File filtered_reads = "~{run_prefix}_~{samplename}.fastq" } - runtime { - - docker: "quay.io/staphb/artic-ncov2019:1.3.0" - memory: "16 GB" - cpu: 8 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/artic-ncov2019:1.3.0-medaka-1.4.3" + memory: "16 GB" + cpu: cpu + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task consensus { - ## Need to output multiple directories input { - String samplename - File filtered_reads - File primer_bed - File? reference_genome - Int? normalise=20000 - Int? cpu=8 - String medaka_model="r941_min_high_g360" - String docker="quay.io/staphb/artic-ncov2019-epi2me" + String samplename + File filtered_reads + File primer_bed + File? reference_genome + Int? normalise = 20000 + Int? cpu = 8 + String medaka_model = "r941_min_high_g360" + String docker = "quay.io/staphb/artic-ncov2019-epi2me" } String primer_name = basename(primer_bed) - command <<< # setup custom primer scheme (/V is required by Artic) mkdir -p ./primer-schemes/SARS-CoV-2/Vuser - - ## set reference genome - if [[ ! -z "~{reference_genome}" ]]; then + + ## set reference genome + if [[ ! -z "~{reference_genome}" ]]; then ref_genome="~{reference_genome}" else - if [[ -d "/fieldbioinformatics" ]]; then - ref_genome="/fieldbioinformatics/artic-ncov2019/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta" + # use reference file in docker--different paths depending on image specified + if [[ -d "/fieldbioinformatics" ]]; then + ref_genome=$(find /fieldbioinformatics/*/primer*schemes/nCoV-2019/V3/ -name "nCoV-2019.reference.fasta") else ref_genome=$(find /wf-artic*/data/primer_schemes/SARS-CoV-2/V4/ -name "SARS-CoV-2.reference.fasta") fi @@ -95,7 +84,7 @@ task consensus { fi head -n1 "${ref_genome}" | sed 's/>//' | tee REFERENCE_GENOME cp "${ref_genome}" ./primer-schemes/SARS-CoV-2/Vuser/SARS-CoV-2.reference.fasta - + ## set primers cp ~{primer_bed} ./primer-schemes/SARS-CoV-2/Vuser/SARS-CoV-2.scheme.bed @@ -109,25 +98,23 @@ task consensus { echo ">~{samplename}" > ~{samplename}.medaka.consensus.fasta grep -v ">" ~{samplename}.consensus.fasta >> ~{samplename}.medaka.consensus.fasta >>> - output { - File consensus_seq = "~{samplename}.medaka.consensus.fasta" - File sorted_bam = "~{samplename}.trimmed.rg.sorted.bam" - File trim_sorted_bam = "~{samplename}.primertrimmed.rg.sorted.bam" - File trim_sorted_bai = "~{samplename}.primertrimmed.rg.sorted.bam.bai" - File medaka_pass_vcf = "~{samplename}.pass.vcf" - String medaka_reference = read_string("REFERENCE_GENOME") - String artic_pipeline_version = read_string("VERSION") - String artic_pipeline_docker = docker - String primer_bed_name = read_string("PRIMER_NAME") + File consensus_seq = "~{samplename}.medaka.consensus.fasta" + File sorted_bam = "~{samplename}.trimmed.rg.sorted.bam" + File trim_sorted_bam = "~{samplename}.primertrimmed.rg.sorted.bam" + File trim_sorted_bai = "~{samplename}.primertrimmed.rg.sorted.bam.bai" + File medaka_pass_vcf = "~{samplename}.pass.vcf" + String medaka_reference = read_string("REFERENCE_GENOME") + String 
artic_pipeline_version = read_string("VERSION") + String artic_pipeline_docker = docker + String primer_bed_name = read_string("PRIMER_NAME") } - runtime { - docker: "~{docker}" - memory: "16 GB" - cpu: 8 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "16 GB" + cpu: cpu + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } diff --git a/tasks/task_pe_pub_repo_submission.wdl b/tasks/task_pe_pub_repo_submission.wdl deleted file mode 100644 index 946ced71..00000000 --- a/tasks/task_pe_pub_repo_submission.wdl +++ /dev/null @@ -1,267 +0,0 @@ -version 1.0 - -task deidentify { - - input { - String samplename - String submission_id - File sequence - - String docker_image = "quay.io/staphb/seqyclean:1.10.09" - Int mem_size_gb = 3 - Int CPUs = 1 - Int disk_size = 100 - Int preemptible_tries = 0 - } - - command { - # de-identified consensus/assembly sequence - echo ">${submission_id}" > ${submission_id}.fasta - grep -v ">" ${sequence} >> ${submission_id}.fasta - - num_N=$( grep -v ">" ${sequence} | grep -o 'N' | wc -l ) - if [ -z "$num_N" ] ; then num_N="0" ; fi - echo $num_N | tee NUM_N - - num_ACTG=$( grep -v ">" ${sequence} | grep -o -E "C|A|T|G" | wc -l ) - if [ -z "$num_ACTG" ] ; then num_ACTG="0" ; fi - echo $num_ACTG | tee NUM_ACTG - - num_total=$( grep -v ">" ${sequence} | grep -o -E '[A-Z]' | wc -l ) - if [ -z "$num_total" ] ; then num_total="0" ; fi - echo $num_total | tee NUM_TOTAL - } - - output { - File deID_assembly = "${submission_id}.fasta" - Int number_N = read_string("NUM_N") - Int number_ATCG = read_string("NUM_ACTG") - Int number_Total = read_string("NUM_TOTAL") - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - -task gisaid { - - input { - String samplename - String submission_id - String collection_date - File sequence - String iso_host - String iso_country - - String gisaid_submitter - String iso_state - String iso_continent - String seq_platform - String assembly_method - String originating_lab - String origLab_address - String submitting_lab - String subLab_address - String Authors - - String passage_details="Original" - String gender="unknown" - String patient_age="unknown" - String patient_status="unknown" - String specimen_source="" - String outbreak="" - String last_vaccinated="" - String treatment="" - String iso_county = "" - - String docker_image = "quay.io/staphb/seqyclean:1.10.09" - Int mem_size_gb = 3 - Int CPUs = 1 - Int disk_size = 10 - Int preemptible_tries = 0 - } - - command { - # de-identified consensus/assembly sequence - year=$(echo ${collection_date} | cut -f 1 -d '-') - echo ">hCoV-19/${iso_country}/${submission_id}/$year" > ${submission_id}.gisaid.fa - grep -v ">" ${sequence} >> ${submission_id}.gisaid.fa - - - echo submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,covv_location,covv_add_location,covv_host,covv_add_host_info,covv_sampling_strategy,covv_gender,covv_patient_age,covv_patient_status,covv_specimen,covv_outbreak,covv_last_vaccinated,covv_treatment,covv_seq_technology,covv_assembly_method,covv_coverage,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type > ${submission_id}.gisaidMeta.csv - - echo Submitter,FASTA filename,Virus name,Type,Passage details/history,Collection date,Location,Additional location information,Host,Additional host 
information,Sampling Strategy,Gender,Patient age,Patient status,Specimen source,Outbreak,Last vaccinated,Treatment,Sequencing technology,Assembly method,Coverage,Originating lab,Address,Sample ID given by the sample provider,Submitting lab,Address,Sample ID given by the submitting laboratory,Authors,Comment,Comment Icon >> ${submission_id}.gisaidMeta.csv - - echo "\"${gisaid_submitter}\",\"${submission_id}.gisaid.fa\",\"hCoV-19/${iso_country}/${submission_id}/$year\",\"betacoronavirus\",\"${passage_details}\",\"${collection_date}\",\"${iso_continent} / ${iso_country} / ${iso_state} / ${iso_county}\" ,,\"${iso_host}\",,,\"${gender}\",\"${patient_age}\",\"${patient_status}\",\"${specimen_source}\",\"${outbreak}\",\"${last_vaccinated}\",\"${treatment}\",\"${seq_platform}\",\"${assembly_method}\",,\"${originating_lab}\",\"${origLab_address}\",,\"${submitting_lab}\",\"${subLab_address}\",,\"${Authors}\"",, >> ${submission_id}.gisaidMeta.csv - - } - - output { - File gisaid_assembly = "${submission_id}.gisaid.fa" - File gisaid_metadata = "${submission_id}.gisaidMeta.csv" - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - -task genbank { - - input { - String samplename - String submission_id - String collection_date - File sequence - String organism - String iso_org - String iso_host - String iso_country - String specimen_source - String BioProject - - String docker_image = "quay.io/theiagen/utility:1.1" - Int mem_size_gb = 3 - Int CPUs = 1 - Int disk_size = 10 - Int preemptible_tries = 0 - } - - command <<< - year=$(echo ~{collection_date} | cut -f 1 -d '-') - isolate=$(echo ~{submission_id} | awk 'BEGIN { FS = "-" } ; {$1=$2=""; print $0}' | sed 's/^ *//g') - - # removing leading Ns, folding sequencing to 75 bp wide, and adding metadata for genbank submissions - echo ">~{submission_id} [organism=~{organism}][isolate=~{iso_org}/~{iso_host}/~{iso_country}/~{submission_id}/$year)][host=~{iso_host}][country=~{iso_country}][collection_date=~{collection_date}]" > ~{submission_id}.genbank.fa - grep -v ">" ~{sequence} | sed 's/^N*N//g' | fold -w 75 >> ~{submission_id}.genbank.fa - - echo Sequence_ID,Country,Host,Isolate,Collection Date, BioProject Accession > ~{submission_id}.genbankMeta.csv - - echo "\"~{submission_id}\",\"~{iso_country}\",\"~{iso_host}\",\"~{submission_id}\",\"~{collection_date}\",\"~{BioProject}\"" >> ~{submission_id}.genbankMeta.csv - - >>> - - output { - File genbank_assembly = "~{submission_id}.genbank.fa" - File genbank_metadata = "~{submission_id}.genbankMeta.csv" - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - -task sra { - - input { - String submission_id - File read1 - File? read2 - - String docker_image = "quay.io/staphb/seqyclean:1.10.09" - Int mem_size_gb = 1 - Int CPUs = 1 - Int disk_size = 25 - Int preemptible_tries = 0 - } - - command { - if ! [ -z ${read2} ]; then - cp ${read1} ${submission_id}.R1.fastq.gz - cp ${read2} ${submission_id}.R2.fastq.gz - else - cp ${read1} ${submission_id}.fastq.gz - fi - } - - output { - File? read1_submission = "${submission_id}.R1.fastq.gz" - File? read2_submission = "${submission_id}.R2.fastq.gz" - File? 
SE_read_submission = "${submission_id}.fastq.gz" - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - - -task compile { -input { - Array[File] single_submission_fasta - Array[File] single_submission_meta - Array[Int] vadr_num_alerts - Int vadr_threshold=0 - String repository - String docker_image = "quay.io/theiagen/utility:1.1" - Int mem_size_gb = 1 - Int CPUs = 1 - Int disk_size = 25 - Int preemptible_tries = 0 -} - - command <<< - - assembly_array=(~{sep=' ' single_submission_fasta}) - meta_array=(~{sep=' ' single_submission_meta}) - vadr_array=(~{sep=' ' vadr_num_alerts}) - - # remove samples that excede vadr threshold - for index in ${!assembly_array[@]}; do - assembly=${assembly_array[$index]} - meta=${meta_array[$index]} - vadr=${vadr_array[$index]} - - if [ "${vadr}" -gt "~{vadr_threshold}" ]; then - assembly_array=( "${assembly_array[@]/$assembly}" ) - meta_array=( "${meta_array[@]/$meta}" ) - fi - done - - - head -n -1 ${meta_array[1]} > ~{repository}_upload_meta.csv - for i in ${meta_array[*]}; do - echo $i - tail -n1 $i >> ~{repository}_upload_meta.csv - done - - cat ${assembly_array[*]} > ~{repository}_upload.fasta - - >>> - - output { - File upload_meta = "${repository}_upload_meta.csv" - File upload_fasta = "${repository}_upload.fasta" - - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} diff --git a/tasks/task_phylo.wdl b/tasks/task_phylo.wdl index d79f90f2..4edeed4f 100644 --- a/tasks/task_phylo.wdl +++ b/tasks/task_phylo.wdl @@ -1,72 +1,63 @@ version 1.0 task snp_dists { - input { - File alignment - String cluster_name + File alignment + String cluster_name } - - command{ + command <<< # date and version control date | tee DATE snp-dists -v | tee VERSION - snp-dists ${alignment} > ${cluster_name}_snp_distance_matrix.tsv - -} - + snp-dists ~{alignment} > ~{cluster_name}_snp_distance_matrix.tsv + >>> output { - String date = read_string("DATE") - String version = read_string("VERSION") - File snp_matrix = "${cluster_name}_snp_distance_matrix.tsv" + String date = read_string("DATE") + String version = read_string("VERSION") + File snp_matrix = "~{cluster_name}_snp_distance_matrix.tsv" } - runtime { - docker: "quay.io/staphb/snp-dists:0.6.2" - memory: "2 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/snp-dists:0.6.2" + memory: "2 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task iqtree { - input { - File alignment - String cluster_name - String? iqtree_model = "GTR+G4" - String? iqtree_bootstraps = 1000 + File alignment + String cluster_name + String? iqtree_model = "GTR+G4" + String? 
iqtree_bootstraps = 1000 } - - command{ + command <<< # date and version control date | tee DATE iqtree --version | grep version | sed 's/.*version/version/;s/ for Linux.*//' | tee VERSION - numGenomes=`grep -o '>' ${alignment} | wc -l` + numGenomes=`grep -o '>' ~{alignment} | wc -l` if [ $numGenomes -gt 3 ] then - cp ${alignment} msa.fasta - iqtree -nt AUTO -s msa.fasta -m ${iqtree_model} -bb ${iqtree_bootstraps} - cp msa.fasta.contree ${cluster_name}_msa.tree + cp ~{alignment} msa.fasta + iqtree -nt AUTO -s msa.fasta -m ~{iqtree_model} -bb ~{iqtree_bootstraps} + cp msa.fasta.contree ~{cluster_name}_msa.tree fi - } - + >>> output { - String date = read_string("DATE") - String version = read_string("VERSION") - File ml_tree = "${cluster_name}_msa.tree" + String date = read_string("DATE") + String version = read_string("VERSION") + File ml_tree = "~{cluster_name}_msa.tree" } - runtime { - docker: "quay.io/staphb/iqtree:1.6.7" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/iqtree:1.6.7" + memory: "8 GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } diff --git a/tasks/task_pub_repo_prep.wdl b/tasks/task_pub_repo_prep.wdl index 2f946ce3..613184ac 100644 --- a/tasks/task_pub_repo_prep.wdl +++ b/tasks/task_pub_repo_prep.wdl @@ -6,7 +6,6 @@ task ncbi_prep_one_sample { File assembly_fasta File read1_dehosted File read2_dehosted - #required metadata String assembly_method String bioproject_accession @@ -30,7 +29,6 @@ task ncbi_prep_one_sample { String seq_platform String state String submission_id - #optional metadata String? amplicon_primer_scheme String? amplicon_size @@ -43,15 +41,13 @@ task ncbi_prep_one_sample { String? purpose_of_sequencing String? submitter_email String? treatment - #GenBank formatting Int minlen = 50 Int maxlen = 30000 - #runtime String docker_image = "quay.io/staphb/vadr:1.3" - Int mem_size_gb = 1 - Int CPUs = 1 + Int memory = 1 + Int cpu = 1 Int disk_size = 25 Int preemptible_tries = 0 } @@ -100,7 +96,6 @@ task ncbi_prep_one_sample { echo -e "~{submission_id}\t~{country}\t~{host_sci_name}\t${isolate}\t~{collection_date}\t~{isolation_source}\t~{biosample_accession}\t~{bioproject_accession}" >> ~{submission_id}_genbank_modifier.tsv >>> - output { File biosample_attributes = "~{submission_id}_biosample_attributes.tsv" File sra_metadata = "~{submission_id}_sra_metadata.tsv" @@ -110,22 +105,21 @@ task ncbi_prep_one_sample { File sra_read2 = "~{submission_id}_R2.fastq.gz" Array[File] sra_reads = ["~{submission_id}_R1.fastq.gz","~{submission_id}_R2.fastq.gz"] } - runtime { - docker: "~{docker_image}" - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 + docker: "~{docker_image}" + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk ~{disk_size} SSD" + preemptible: preemptible_tries + maxRetries: 3 } } + task ncbi_prep_one_sample_se { input { #required files File assembly_fasta File reads_dehosted - #required metadata String assembly_method String bioproject_accession @@ -149,7 +143,6 @@ task ncbi_prep_one_sample_se { String seq_platform String state String submission_id - #optional metadata String? amplicon_primer_scheme String? amplicon_size @@ -162,15 +155,13 @@ task ncbi_prep_one_sample_se { String? purpose_of_sequencing String? submitter_email String? 
treatment - #GenBank formatting Int minlen = 50 Int maxlen = 30000 - #runtime - String docker_image = "quay.io/staphb/vadr:1.3" - Int mem_size_gb = 1 - Int CPUs = 1 + String docker = "quay.io/staphb/vadr:1.3" + Int memory = 1 + Int cpu = 1 Int disk_size = 25 Int preemptible_tries = 0 } @@ -216,9 +207,7 @@ task ncbi_prep_one_sample_se { ##GenBank modifier echo -e "Sequence_ID\tcountry\thost\tisolate\tcollection-date\tisolation-source\tBioSample\tBioProject\tnote" > ~{submission_id}_genbank_modifier.tsv echo -e "~{submission_id}\t~{country}\t~{host_sci_name}\t${isolate}\t~{collection_date}\t~{isolation_source}\t~{biosample_accession}\t~{bioproject_accession}" >> ~{submission_id}_genbank_modifier.tsv - >>> - output { File biosample_attributes = "~{submission_id}_biosample_attributes.tsv" File sra_metadata = "~{submission_id}_sra_metadata.tsv" @@ -226,21 +215,20 @@ task ncbi_prep_one_sample_se { File genbank_modifier = "~{submission_id}_genbank_modifier.tsv" File sra_reads = "~{submission_id}_R1.fastq.gz" } - runtime { - docker: "~{docker_image}" - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 + docker: "~{docker}" + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk ~{disk_size} SSD" + preemptible: preemptible_tries + maxRetries: 3 } } + task gisaid_prep_one_sample { input { #required files File assembly_fasta - #required metadata String authors String assembly_method @@ -259,7 +247,6 @@ task gisaid_prep_one_sample { String submitting_lab String submitting_lab_address String type="betacoronavirus" - #optional metadata String? county String? patient_gender = "unknown" @@ -271,13 +258,12 @@ task gisaid_prep_one_sample { String? outbreak String? specimen_source String? treatment - #runtime - String docker_image = "quay.io/theiagen/utility:1.1" - Int mem_size_gb = 1 - Int CPUs = 1 + String docker = "quay.io/theiagen/utility:1.1" + Int memory = 1 + Int cpu = 1 Int disk_size = 25 - Int preemptible_tries = 0 + Int preemptible = 0 } command <<< #Check date format @@ -301,44 +287,38 @@ task gisaid_prep_one_sample { echo "Submitter,FASTA filename,Virus name,Type,Passage details/history,Collection date,Location,Additional location information,Host,Additional host information,Sampling Strategy,Gender,Patient age,Patient status,Specimen source,Outbreak,Last vaccinated,Treatment,Sequencing technology,Assembly method,Coverage,Originating lab,Address,Sample ID given by the sample provider,Submitting lab,Address,Sample ID given by the submitting laboratory,Authors,Comment,Comment Icon" >> ~{submission_id}_gisaid_metadata.csv echo "\"~{gisaid_submitter}\",\"~{submission_id}.gisaid.fa\",\"~{organism}/~{country}/~{submission_id}/$year\",\"~{type}\",\"~{passage_details}\",\"~{collection_date}\",\"~{continent}/~{country}/~{state}/~{county}\",,\"~{host}\",,\"~{purpose_of_sequencing}\",\"~{patient_gender}\",\"~{patient_age}\",\"~{patient_status}\",\"~{specimen_source}\",\"~{outbreak}\",\"~{last_vaccinated}\",\"~{treatment}\",\"~{seq_platform}\",\"~{assembly_method}\",\"~{assembly_mean_coverage}\",\"~{collecting_lab}\",\"~{collecting_lab_address}\",,\"~{submitting_lab}\",\"~{submitting_lab_address}\",,\"~{authors}\",," >> ~{submission_id}_gisaid_metadata.csv - >>> - output { File gisaid_assembly = "~{submission_id}_gisaid.fasta" File gisaid_metadata = "~{submission_id}_gisaid_metadata.csv" } - runtime { - docker: "~{docker_image}" - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: 
preemptible_tries - maxRetries: 3 + docker: "~{docker}" + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk ~{disk_size} SSD" + preemptible: preemptible + maxRetries: 3 } } - task compile_assembly_n_meta { - input { Array[File] single_submission_fasta Array[File] single_submission_meta Array[String] samplename Array[String] submission_id Array[String] vadr_num_alerts - Int vadr_threshold=0 + Int vadr_threshold = 0 String repository String file_ext String date - String docker_image = "quay.io/theiagen/utility:1.1" - Int mem_size_gb = 8 - Int CPUs = 4 + String docker = "quay.io/theiagen/utility:1.1" + Int memory = 8 + Int cpu = 4 Int disk_size = 100 - Int preemptible_tries = 0 + Int preemptible = 0 } - command <<< assembly_array=(~{sep=' ' single_submission_fasta}) assembly_array_len=$(echo "${#assembly_array[@]}") @@ -419,26 +399,23 @@ task compile_assembly_n_meta { done cat ${passed_assemblies[*]} > ~{repository}_upload_~{date}.fasta - >>> - output { - File? upload_meta = "${repository}_upload_meta_~{date}.~{file_ext}" - File? upload_fasta = "${repository}_upload_~{date}.fasta" - File batched_samples = "${repository}_batched_samples_~{date}.~{file_ext}" - File excluded_samples = "${repository}_excluded_samples_~{date}.~{file_ext}" - + File? upload_meta = "${repository}_upload_meta_~{date}.~{file_ext}" + File? upload_fasta = "${repository}_upload_~{date}.fasta" + File batched_samples = "${repository}_batched_samples_~{date}.~{file_ext}" + File excluded_samples = "${repository}_excluded_samples_~{date}.~{file_ext}" } - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 + docker: docker + memory: "~{memory} GB" + cpu: cpu + disks: "local-disk ~{disk_size} SSD" + preemptible: preemptible + maxRetries: 3 } } + task compile_biosamp_n_sra { input { Array[File] single_submission_biosample_attirbutes @@ -446,14 +423,12 @@ input { Array[String] single_submission_sra_reads String date String? gcp_bucket - - String docker_image = "quay.io/theiagen/utility:1.1" - Int mem_size_gb = 16 - Int CPUs = 4 - Int disk_size = 100 - Int preemptible_tries = 0 + String docker = "quay.io/theiagen/utility:1.1" + Int memory = 16 + Int cpu = 4 + Int disk_size = 100 + Int preemptible = 0 } - command <<< biosample_attributes_array=(~{sep=' ' single_submission_biosample_attirbutes}) biosample_attributes_array_len=$(echo "${#biosample_attributes_array[@]}") @@ -507,11 +482,11 @@ input { File? sra_zipped = "sra_reads_~{date}.zip" } runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs + docker: docker + memory: "~{memory} GB" + cpu: cpu disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries + preemptible: preemptible maxRetries: 3 } } diff --git a/tasks/task_qc_utils.wdl b/tasks/task_qc_utils.wdl index be61b2e2..1a248382 100644 --- a/tasks/task_qc_utils.wdl +++ b/tasks/task_qc_utils.wdl @@ -1,27 +1,25 @@ version 1.0 task fastqc { - input { - File read1 - File read2 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") - String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") - Int? cpus = 2 + File read1 + File read2 + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") + String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") + Int? 
cpu = 2 } - - command { + command <<< # capture date and version date | tee DATE fastqc --version | grep FastQC | tee VERSION - fastqc --outdir $PWD --threads ${cpus} ${read1} ${read2} + fastqc --outdir $PWD --threads ~{cpu} ~{read1} ~{read2} - unzip -p ${read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS - unzip -p ${read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ2_SEQS + unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS + unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ2_SEQS - READ1_SEQS=$(unzip -p ${read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) - READ2_SEQS=$(unzip -p ${read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) + READ1_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) + READ2_SEQS=$(unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) if [ $READ1_SEQS == $READ2_SEQS ]; then read_pairs=$READ1_SEQS @@ -29,95 +27,89 @@ task fastqc { read_pairs="Uneven pairs: R1=$READ1_SEQS, R2=$READ2_SEQS" fi echo $read_pairs | tee READ_PAIRS - } - + >>> output { - File fastqc1_html = "${read1_name}_fastqc.html" - File fastqc1_zip = "${read1_name}_fastqc.zip" - File fastqc2_html = "${read2_name}_fastqc.html" - File fastqc2_zip = "${read2_name}_fastqc.zip" - Int read1_seq = read_string("READ1_SEQS") - Int read2_seq = read_string("READ2_SEQS") - String read_pairs = read_string("READ_PAIRS") - String version = read_string("VERSION") - String pipeline_date = read_string("DATE") + File fastqc1_html = "~{read1_name}_fastqc.html" + File fastqc1_zip = "~{read1_name}_fastqc.zip" + File fastqc2_html = "~{read2_name}_fastqc.html" + File fastqc2_zip = "~{read2_name}_fastqc.zip" + Int read1_seq = read_string("READ1_SEQS") + Int read2_seq = read_string("READ2_SEQS") + String read_pairs = read_string("READ_PAIRS") + String version = read_string("VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "quay.io/staphb/fastqc:0.11.9" - memory: "4 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/fastqc:0.11.9" + memory: "4 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } -task fastqc_se { +task fastqc_se { input { - File read1 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") - Int? cpus = 2 + File read1 + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") + Int? 
cpu = 2 } - - command { + command <<< # capture date and version date | tee DATE fastqc --version | grep FastQC | tee VERSION - fastqc --outdir $PWD --threads ${cpus} ${read1} + fastqc --outdir $PWD --threads ~{cpu} ~{read1} - unzip -p ${read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS + unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS - READ_SEQS=$(unzip -p ${read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) + READ_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) echo $read_pairs | tee READ_PAIRS - } - + >>> output { - File fastqc_html = "${read1_name}_fastqc.html" - File fastqc_zip = "${read1_name}_fastqc.zip" - Int number_reads = read_string("READ1_SEQS") - String version = read_string("VERSION") - String pipeline_date = read_string("DATE") + File fastqc_html = "~{read1_name}_fastqc.html" + File fastqc_zip = "~{read1_name}_fastqc.zip" + Int number_reads = read_string("READ1_SEQS") + String version = read_string("VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "quay.io/staphb/fastqc:0.11.8" - memory: "4 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/fastqc:0.11.8" + memory: "4 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } -task fastq_scan { +task fastq_scan { input { - File read1 - File read2 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") - String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") + File read1 + File read2 + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") + String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") } - command <<< # capture date and version date | tee DATE fastq-scan -v | tee VERSION # set cat command based on compression - if [[ "~{read1}" == *".gz" ]] ; then + if [[ "~{read1}" == *".gz" ]] ; then cat_reads="zcat" else cat_reads="cat" fi - + # capture forward read stats eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS) read1_seqs=$(cat READ1_SEQS) eval "${cat_reads} ~{read2}" | fastq-scan | tee ~{read2_name}_fastq-scan.json >(jq .qc_stats.read_total > READ2_SEQS) - read2_seqs=$(cat READ2_SEQS) - + read2_seqs=$(cat READ2_SEQS) + # capture number of read pairs if [ ${read1_seqs} == $read2_seqs]; then read_pairs=${read1_seqs} @@ -127,78 +119,69 @@ task fastq_scan { echo $read_pairs | tee READ_PAIRS >>> - output { - File read1_fastq_scan_report = "~{read1_name}_fastq-scan.json" - File read2_fastq_scan_report = "~{read2_name}_fastq-scan.json" - Int read1_seq = read_string("READ1_SEQS") - Int read2_seq = read_string("READ2_SEQS") - String read_pairs = read_string("READ_PAIRS") - String version = read_string("VERSION") - String pipeline_date = read_string("DATE") + File read1_fastq_scan_report = "~{read1_name}_fastq-scan.json" + File read2_fastq_scan_report = "~{read2_name}_fastq-scan.json" + Int read1_seq = read_string("READ1_SEQS") + Int read2_seq = read_string("READ2_SEQS") + String read_pairs = read_string("READ_PAIRS") + String version = read_string("VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" - memory: "2 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: 
"quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" + memory: "2 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } -task fastq_scan_se { +task fastq_scan_se { input { - File read1 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") + File read1 + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") } - command <<< # capture date and version date | tee DATE fastq-scan -v | tee VERSION # set cat command based on compression - if [[ "~{read1}" == *".gz" ]] ; then + if [[ "~{read1}" == *".gz" ]] ; then cat_reads="zcat" else cat_reads="cat" fi - + # capture forward read stats eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS) - - >>> - output { - File fastq_scan_report = "~{read1_name}_fastq-scan.json" - Int read1_seq = read_string("READ1_SEQS") - String version = read_string("VERSION") - String pipeline_date = read_string("DATE") + File fastq_scan_report = "~{read1_name}_fastq-scan.json" + Int read1_seq = read_string("READ1_SEQS") + String version = read_string("VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" - memory: "2 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" + memory: "2 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } -task consensus_qc { +task consensus_qc { input { - File assembly_fasta - + File assembly_fasta } - command <<< # capture date and version date | tee DATE - + num_N=$( grep -v ">" ~{assembly_fasta} | grep -o 'N' | wc -l ) if [ -z "$num_N" ] ; then num_N="0" ; fi echo $num_N | tee NUM_N @@ -218,20 +201,18 @@ task consensus_qc { if [ -z "$num_total" ] ; then num_total="0" ; fi echo $num_total | tee NUM_TOTAL >>> - output { - Int number_N = read_string("NUM_N") - Int number_ATCG = read_string("NUM_ACTG") - Int number_Degenerate = read_string("NUM_DEGENERATE") - Int number_Total = read_string("NUM_TOTAL") - Float percent_reference_coverage = read_string("PERCENT_REF_COVERAGE") + Int number_N = read_string("NUM_N") + Int number_ATCG = read_string("NUM_ACTG") + Int number_Degenerate = read_string("NUM_DEGENERATE") + Int number_Total = read_string("NUM_TOTAL") + Float percent_reference_coverage = read_string("PERCENT_REF_COVERAGE") } - runtime { - docker: "quay.io/theiagen/utility:1.1" - memory: "2 GB" - cpu: 1 - disks: "local-disk 100 SSD" - preemptible: 0 + docker: "quay.io/theiagen/utility:1.1" + memory: "2 GB" + cpu: 1 + disks: "local-disk 100 SSD" + preemptible: 0 } } diff --git a/tasks/task_read_clean.wdl b/tasks/task_read_clean.wdl index f7f979d0..76e4f3c0 100644 --- a/tasks/task_read_clean.wdl +++ b/tasks/task_read_clean.wdl @@ -2,15 +2,13 @@ version 1.0 task ncbi_scrub_pe { input { - File read1 - File read2 - String samplename - String docker = "gcr.io/ncbi-sys-gcr-public-research/sra-human-scrubber@sha256:b7dba71079344daea4ea3363e1a67fa54edb7ec65459d039669c68a66d38b140" - + File read1 + File read2 + String samplename + String docker = "gcr.io/ncbi-sys-gcr-public-research/sra-human-scrubber@sha256:b7dba71079344daea4ea3363e1a67fa54edb7ec65459d039669c68a66d38b140" } String r1_filename = basename(read1) String r2_filename = basename(read2) - command <<< # date and version control date | tee DATE @@ -44,38 +42,32 @@ task ncbi_scrub_pe { /opt/scrubber/scripts/scrub.sh -n ${read2_unzip} |& tail -n1 | awk -F" " '{print 
$1}' > REV_SPOTS_REMOVED # gzip dehosted reads - gzip ${read2_unzip}.clean -c > ~{samplename}_R2_dehosted.fastq.gz - - + gzip ${read2_unzip}.clean -c > ~{samplename}_R2_dehosted.fastq.gz >>> - output { - File read1_dehosted = "~{samplename}_R1_dehosted.fastq.gz" - File read2_dehosted = "~{samplename}_R2_dehosted.fastq.gz" - Int read1_human_spots_removed = read_int("FWD_SPOTS_REMOVED") - Int read2_human_spots_removed = read_int("REV_SPOTS_REMOVED") + File read1_dehosted = "~{samplename}_R1_dehosted.fastq.gz" + File read2_dehosted = "~{samplename}_R2_dehosted.fastq.gz" + Int read1_human_spots_removed = read_int("FWD_SPOTS_REMOVED") + Int read2_human_spots_removed = read_int("REV_SPOTS_REMOVED") String ncbi_scrub_docker = docker } - runtime { - docker: "~{docker}" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "8 GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task ncbi_scrub_se { input { - File read1 - String samplename - String docker = "gcr.io/ncbi-sys-gcr-public-research/sra-human-scrubber@sha256:b7dba71079344daea4ea3363e1a67fa54edb7ec65459d039669c68a66d38b140" - + File read1 + String samplename + String docker = "gcr.io/ncbi-sys-gcr-public-research/sra-human-scrubber@sha256:b7dba71079344daea4ea3363e1a67fa54edb7ec65459d039669c68a66d38b140" } String r1_filename = basename(read1) - command <<< # date and version control date | tee DATE @@ -94,93 +86,85 @@ task ncbi_scrub_se { # gzip dehosted reads gzip ${read1_unzip}.clean -c > ~{samplename}_R1_dehosted.fastq.gz - >>> - output { - File read1_dehosted = "~{samplename}_R1_dehosted.fastq.gz" - Int read1_human_spots_removed = read_int("FWD_SPOTS_REMOVED") - String ncbi_scrub_docker = docker - + File read1_dehosted = "~{samplename}_R1_dehosted.fastq.gz" + Int read1_human_spots_removed = read_int("FWD_SPOTS_REMOVED") + String ncbi_scrub_docker = docker } - runtime { - docker: "~{docker}" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "8 GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task seqyclean { input { - File read1 - File read2 - String samplename - String? adapters = "/Adapters_plus_PhiX_174.fasta" - Int? seqyclean_minlen = 15 - String? seqyclean_qual = "20 20" - Boolean? compress = true - Boolean? seqyclean_dup = false - Boolean? seqyclean_no_adapter_trim = false - Int? cpus = 16 + File read1 + File read2 + String samplename + String? adapters = "/Adapters_plus_PhiX_174.fasta" + Int? seqyclean_minlen = 15 + String? seqyclean_qual = "20 20" + Boolean? compress = true + Boolean? seqyclean_dup = false + Boolean? seqyclean_no_adapter_trim = false + Int? 
cpu = 16 } - - command { + command <<< # date and version control date | tee DATE echo "Seqyclean $(seqyclean -h | grep Version)" | tee VERSION seqyclean \ - -minlen ${seqyclean_minlen} \ - -qual ${seqyclean_qual} \ - -c ${adapters} \ - ${true="-dup" false="" seqyclean_dup} \ - ${true="-no_adapter_trim" false="" seqyclean_no_adapter_trim} \ - ${true="-gz" false="" compress} \ - -t ${cpus} \ - -1 ${read1} \ - -2 ${read2} \ - -o ${samplename} + -minlen ~{seqyclean_minlen} \ + -qual ~{seqyclean_qual} \ + -c ~{adapters} \ + ~{true="-dup" false="" seqyclean_dup} \ + ~{true="-no_adapter_trim" false="" seqyclean_no_adapter_trim} \ + ~{true="-gz" false="" compress} \ + -t ~{cpu} \ + -1 ~{read1} \ + -2 ~{read2} \ + -o ~{samplename} # Capture metrics for summary file - cut -f 58 ${samplename}_SummaryStatistics.tsv | grep -v "PairsKept" | head -n 1 | tee PAIRS_KEPT - cut -f 59 ${samplename}_SummaryStatistics.tsv | grep -v "Perc_Kept" | head -n 1 | tee PERCENT_KEPT - } - + cut -f 58 ~{samplename}_SummaryStatistics.tsv | grep -v "PairsKept" | head -n 1 | tee PAIRS_KEPT + cut -f 59 ~{samplename}_SummaryStatistics.tsv | grep -v "Perc_Kept" | head -n 1 | tee PERCENT_KEPT + >>> output { - File read1_clean = "${samplename}_PE1.fastq.gz" - File read2_clean = "${samplename}_PE2.fastq.gz" - String version = read_string("VERSION") - String pipeline_date = read_string("DATE") - Int seqy_pairs = read_string("PAIRS_KEPT") - Float seqy_percent = read_string("PERCENT_KEPT") + File read1_clean = "~{samplename}_PE1.fastq.gz" + File read2_clean = "~{samplename}_PE2.fastq.gz" + String version = read_string("VERSION") + String pipeline_date = read_string("DATE") + Int seqy_pairs = read_string("PAIRS_KEPT") + Float seqy_percent = read_string("PERCENT_KEPT") } - runtime { - docker: "quay.io/staphb/seqyclean:1.10.09" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/seqyclean:1.10.09" + memory: "8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task trimmomatic { input { - File read1 - File read2 - String samplename - String docker="quay.io/staphb/trimmomatic:0.39" - Int? trimmomatic_minlen = 75 - Int? trimmomatic_window_size=4 - Int? trimmomatic_quality_trim_score=30 - Int? threads = 4 + File read1 + File read2 + String samplename + String docker = "quay.io/staphb/trimmomatic:0.39" + Int? trimmomatic_minlen = 75 + Int? trimmomatic_window_size=4 + Int? trimmomatic_quality_trim_score=30 + Int? threads = 4 } - command <<< # date and version control date | tee DATE @@ -194,35 +178,33 @@ task trimmomatic { MINLEN:~{trimmomatic_minlen} > ~{samplename}.trim.stats.txt >>> - output { - File read1_trimmed = "${samplename}_1P.fastq.gz" - File read2_trimmed = "${samplename}_2P.fastq.gz" - File trimmomatic_stats = "${samplename}.trim.stats.txt" - String version = read_string("VERSION") - String pipeline_date = read_string("DATE") + File read1_trimmed = "~{samplename}_1P.fastq.gz" + File read2_trimmed = "~{samplename}_2P.fastq.gz" + File trimmomatic_stats = "~{samplename}.trim.stats.txt" + String version = read_string("VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "~{docker}" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "8 GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } + task trimmomatic_se { input { - File read1 - String samplename - String docker="quay.io/staphb/trimmomatic:0.39" - Int? 
trimmomatic_minlen = 25 - Int? trimmomatic_window_size=4 - Int? trimmomatic_quality_trim_score=30 - Int? threads = 4 + File read1 + String samplename + String docker = "quay.io/staphb/trimmomatic:0.39" + Int? trimmomatic_minlen = 25 + Int? trimmomatic_window_size = 4 + Int? trimmomatic_quality_trim_score = 30 + Int? threads = 4 } - command <<< # date and version control date | tee DATE @@ -234,34 +216,31 @@ task trimmomatic_se { ~{samplename}_trimmed.fastq.gz \ SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \ MINLEN:~{trimmomatic_minlen} > ~{samplename}.trim.stats.txt - >>> - output { - File read1_trimmed = "${samplename}_trimmed.fastq.gz" - File trimmomatic_stats = "${samplename}.trim.stats.txt" - String version = read_string("VERSION") - String pipeline_date = read_string("DATE") + File read1_trimmed = "~{samplename}_trimmed.fastq.gz" + File trimmomatic_stats = "~{samplename}.trim.stats.txt" + String version = read_string("VERSION") + String pipeline_date = read_string("DATE") } - runtime { - docker: "~{docker}" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "8 GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } + task bbduk { input { - File read1_trimmed - File read2_trimmed - String samplename - Int mem_size_gb = 8 - String docker="quay.io/staphb/bbtools:38.76" + File read1_trimmed + File read2_trimmed + String samplename + Int memory = 8 + String docker = "quay.io/staphb/bbtools:38.76" } - command <<< # date and version control date | tee DATE @@ -271,35 +250,32 @@ task bbduk { bbduk.sh in1=~{samplename}.paired_1.fastq.gz in2=~{samplename}.paired_2.fastq.gz out1=~{samplename}.rmadpt_1.fastq.gz out2=~{samplename}.rmadpt_2.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 tpe tbo bbduk.sh in1=~{samplename}.rmadpt_1.fastq.gz in2=~{samplename}.rmadpt_2.fastq.gz out1=~{samplename}_1.clean.fastq.gz out2=~{samplename}_2.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt - >>> - output { - File read1_clean = "${samplename}_1.clean.fastq.gz" - File read2_clean = "${samplename}_2.clean.fastq.gz" - File adapter_stats = "${samplename}.adapters.stats.txt" - File phiX_stats = "${samplename}.phix.stats.txt" - String bbduk_docker = docker - String pipeline_date = read_string("DATE") + File read1_clean = "${samplename}_1.clean.fastq.gz" + File read2_clean = "${samplename}_2.clean.fastq.gz" + File adapter_stats = "${samplename}.adapters.stats.txt" + File phiX_stats = "${samplename}.phix.stats.txt" + String bbduk_docker = docker + String pipeline_date = read_string("DATE") } - runtime { - docker: "~{docker}" - memory: "~{mem_size_gb} GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "~{memory} GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } + task bbduk_se { input { - File read1_trimmed - String samplename - String docker="quay.io/staphb/bbtools:38.76" - Int mem_size_gb = 8 + File read1_trimmed + String samplename + String docker = "quay.io/staphb/bbtools:38.76" + Int memory = 8 } - command <<< # date and version control date | tee DATE @@ -307,23 +283,20 @@ task bbduk_se { bbduk.sh in1=~{read1_trimmed} out1=~{samplename}.rmadpt_1.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 
tpe tbo bbduk.sh in1=~{read1_trimmed} out1=~{samplename}_1.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt - >>> - output { - File read1_clean = "${samplename}_1.clean.fastq.gz" - File adapter_stats = "${samplename}.adapters.stats.txt" - File phiX_stats = "${samplename}.phix.stats.txt" - String bbduk_docker = docker - String pipeline_date = read_string("DATE") + File read1_clean = "${samplename}_1.clean.fastq.gz" + File adapter_stats = "${samplename}.adapters.stats.txt" + File phiX_stats = "${samplename}.phix.stats.txt" + String bbduk_docker = docker + String pipeline_date = read_string("DATE") } - runtime { - docker: "~{docker}" - memory: "~{mem_size_gb} GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "~{memory} GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } diff --git a/tasks/task_se_pub_repo_submission.wdl b/tasks/task_se_pub_repo_submission.wdl deleted file mode 100644 index e4c96dc0..00000000 --- a/tasks/task_se_pub_repo_submission.wdl +++ /dev/null @@ -1,314 +0,0 @@ -version 1.0 - -task deidentify { - - input { - String samplename - String submission_id - File sequence - - String docker_image = "quay.io/staphb/seqyclean:1.10.09" - Int mem_size_gb = 3 - Int CPUs = 1 - Int disk_size = 100 - Int preemptible_tries = 0 - } - - command { - # de-identified consensus/assembly sequence - echo ">${submission_id}" > ${submission_id}.fasta - grep -v ">" ${sequence} >> ${submission_id}.fasta - - num_N=$( grep -v ">" ${sequence} | grep -o 'N' | wc -l ) - if [ -z "$num_N" ] ; then num_N="0" ; fi - echo $num_N | tee NUM_N - - num_ACTG=$( grep -v ">" ${sequence} | grep -o -E "C|A|T|G" | wc -l ) - if [ -z "$num_ACTG" ] ; then num_ACTG="0" ; fi - echo $num_ACTG | tee NUM_ACTG - - num_total=$( grep -v ">" ${sequence} | grep -o -E '[A-Z]' | wc -l ) - if [ -z "$num_total" ] ; then num_total="0" ; fi - echo $num_total | tee NUM_TOTAL - } - - output { - File deID_assembly = "${submission_id}.fasta" - Int number_N = read_string("NUM_N") - Int number_ATCG = read_string("NUM_ACTG") - Int number_Total = read_string("NUM_TOTAL") - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - -task gisaid { - - input { - String samplename - String submission_id - String collection_date - File sequence - String iso_host - String iso_country - - String gisaid_submitter - String iso_state - String iso_continent - String seq_platform - String assembly_method - String originating_lab - String origLab_address - String submitting_lab - String subLab_address - String Authors - - String passage_details="Original" - String gender="unknown" - String patient_age="unknown" - String patient_status="unknown" - String specimen_source="" - String outbreak="" - String last_vaccinated="" - String treatment="" - String iso_county = "" - - String docker_image = "quay.io/staphb/seqyclean:1.10.09" - Int mem_size_gb = 3 - Int CPUs = 1 - Int disk_size = 10 - Int preemptible_tries = 0 - } - - command { - # de-identified consensus/assembly sequence - year=$(echo ${collection_date} | cut -f 1 -d '-') - echo ">hCoV-19/${iso_country}/${submission_id}/$year" > ${submission_id}.gisaid.fa - grep -v ">" ${sequence} >> ${submission_id}.gisaid.fa - - - echo 
submitter,fn,covv_virus_name,covv_type,covv_passage,covv_collection_date,covv_location,covv_add_location,covv_host,covv_add_host_info,covv_sampling_strategy,covv_gender,covv_patient_age,covv_patient_status,covv_specimen,covv_outbreak,covv_last_vaccinated,covv_treatment,covv_seq_technology,covv_assembly_method,covv_coverage,covv_orig_lab,covv_orig_lab_addr,covv_provider_sample_id,covv_subm_lab,covv_subm_lab_addr,covv_subm_sample_id,covv_authors,covv_comment,comment_type > ${submission_id}.gisaidMeta.csv - - echo Submitter,FASTA filename,Virus name,Type,Passage details/history,Collection date,Location,Additional location information,Host,Additional host information,Sampling Strategy,Gender,Patient age,Patient status,Specimen source,Outbreak,Last vaccinated,Treatment,Sequencing technology,Assembly method,Coverage,Originating lab,Address,Sample ID given by the sample provider,Submitting lab,Address,Sample ID given by the submitting laboratory,Authors,Comment,Comment Icon >> ${submission_id}.gisaidMeta.csv - - echo "\"${gisaid_submitter}\",\"${submission_id}.gisaid.fa\",\"hCoV-19/${iso_country}/${submission_id}/$year\",\"betacoronavirus\",\"${passage_details}\",\"${collection_date}\",\"${iso_continent} / ${iso_country} / ${iso_state} / ${iso_county}\" ,,\"${iso_host}\",,,\"${gender}\",\"${patient_age}\",\"${patient_status}\",\"${specimen_source}\",\"${outbreak}\",\"${last_vaccinated}\",\"${treatment}\",\"${seq_platform}\",\"${assembly_method}\",,\"${originating_lab}\",\"${origLab_address}\",,\"${submitting_lab}\",\"${subLab_address}\",,\"${Authors}\"",, >> ${submission_id}.gisaidMeta.csv - - } - - output { - File gisaid_assembly = "${submission_id}.gisaid.fa" - File gisaid_metadata = "${submission_id}.gisaidMeta.csv" - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - -task genbank { - - input { - String samplename - String submission_id - String collection_date - File sequence - String organism - String iso_org - String iso_host - String iso_country - String specimen_source - String BioProject - - String docker_image = "quay.io/theiagen/utility:1.1" - Int mem_size_gb = 3 - Int CPUs = 1 - Int disk_size = 10 - Int preemptible_tries = 0 - } - - command <<< - year=$(echo ~{collection_date} | cut -f 1 -d '-') - isolate=$(echo ~{submission_id} | awk 'BEGIN { FS = "-" } ; {$1=$2=""; print $0}' | sed 's/^ *//g') - - # removing leading Ns, folding sequencing to 75 bp wide, and adding metadata for genbank submissions - echo ">~{submission_id} [organism=~{organism}][isolate=~{iso_org}/~{iso_host}/~{iso_country}/~{submission_id}/~year)][host=~{iso_host}][country=~{iso_country}][collection_date=~{collection_date}]" > ~{submission_id}.genbank.fa - grep -v ">" ~{sequence} | sed 's/^N*N//g' | fold -w 75 >> ~{submission_id}.genbank.fa - - echo Sequence_ID,Country,Host,Isolate,Collection Date, BioProject Accession > ~{submission_id}.genbankMeta.csv - - echo "\"~{submission_id}\",\"~{iso_country}\",\"~{iso_host}\",\"~{submission_id}\",\"~{collection_date}\",\"~{BioProject}\"" >> ~{submission_id}.genbankMeta.csv - - >>> - - output { - File genbank_assembly = "~{submission_id}.genbank.fa" - File genbank_metadata = "~{submission_id}.genbankMeta.csv" - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - -task sra { - - input { - String submission_id - File reads - - 
String docker_image = "quay.io/staphb/seqyclean:1.10.09" - Int mem_size_gb = 1 - Int CPUs = 1 - Int disk_size = 25 - Int preemptible_tries = 0 - } - - command { - cp ${reads} ${submission_id}.fastq.gz - } - - output { - File? reads_submission = "${submission_id}.fastq.gz" - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} - - -task compile { - - input { - Array[File] single_submission_fasta - Array[File] single_submission_meta - Array[String] samplename - Array[String] submission_id - Array[String] vadr_num_alerts - Int vadr_threshold=0 - String repository - String docker_image = "quay.io/theiagen/utility:1.1" - Int mem_size_gb = 1 - Int CPUs = 1 - Int disk_size = 25 - Int preemptible_tries = 0 - } - - command <<< - assembly_array=(~{sep=' ' single_submission_fasta}) - assembly_array_len=$(echo "${#assembly_array[@]}") - meta_array=(~{sep=' ' single_submission_meta}) - meta_array_len=$(echo "${#meta_array[@]}") - vadr_string="~{sep=',' vadr_num_alerts}" - IFS=',' read -r -a vadr_array <<< ${vadr_string} - vadr_array_len=$(echo "${#vadr_array[@]}") - samplename_array=(~{sep=' ' samplename}) - submission_id_array=(~{sep=' ' submission_id}) - submission_id_array_len=$(echo "${#submission_id_array[@]}") - vadr_array_len=$(echo "${#vadr_array[@]}") - passed_assemblies="" - passed_meta="" - - #Create files to capture batched and excluded samples - echo -e "~{repository} Identifier\tSamplename\tNumber of Vadr Alerts\tNote" > ~{repository}_batched_samples.tsv - echo -e "~{repository} Identifier\tSamplename\tNumber of Vadr Alerts\tNote" > ~{repository}_excluded_samples.tsv - - # Ensure assembly, meta, and vadr arrays are of equal length - if [ "$submission_id_array" -ne "$vadr_array_len" ]; then - echo "Submission_id array (length: $assembly_array_len) and vadr array (length: $vadr_array_len) are of unequal length." >&2 - exit 1 - fi - - # remove samples that excede vadr threshold - for index in ${!submission_id_array[@]}; do - submission_id=${submission_id_array[$index]} - samplename=${samplename_array[$index]} - vadr=${vadr_array[$index]} - batch_note="" - - # check if the sample has submittable assembly file; if so remove those that excede vadr thresholds - assembly=$(printf '%s\n' "${assembly_array[@]}" | grep "${submission_id}") - metadata=$(printf '%s\n' "${meta_array[@]}" | grep "${submission_id}") - - echo -e "Submission_ID: ${submission_id}\n\tAssembly: ${assembly}\n\tMetadata: ${metadata}\n\tVADR: ${vadr}" - - if [ \( ! -z "${assembly}" \) -a \( ! -z "{$metadata}" \) ]; then - repository_identifier=$(grep -e ">" ${assembly} | sed 's/\s.*$//' | sed 's/>//g' ) - re='^[0-9]+$' - if ! 
[[ "${vadr}" =~ $re ]] ; then - batch_note="No VADR value to evaluate" - echo -e "\t$submission_id removed: ${batch_note}" - echo -e "$repository_identifier\t$samplename\t$vadr\t$batch_note" >> ~{repository}_excluded_samples.tsv - elif [ "${vadr}" -le "~{vadr_threshold}" ] ; then - passed_assemblies=( "${passed_assemblies[@]}" "${assembly}") - passed_meta=( "${passed_meta[@]}" "${metadata}") - echo -e "\t$submission_id added to batch" - echo -e "$repository_identifier\t$samplename\t$vadr\t$batch_note" >> ~{repository}_batched_samples.tsv - else - batch_note="Number of vadr alerts (${vadr}) exceeds threshold ~{vadr_threshold}" - echo -e "\t$submission_id removed: ${batch_note}" - echo -e "$repository_identifier\t$samplename\t$vadr\t$batch_note" >> ~{repository}_excluded_samples.tsv - fi - else - batch_note="Assembly or metadata file missing" - repository_identifier="NA" - echo -e "\t$submission_id removed: ${batch_note}" - echo -e "$repository_identifier\t$samplename\t$vadr\t$batch_note" >> ~{repository}_excluded_samples.tsv - fi - - done - - count=0 - for i in ${passed_meta[*]}; do - # grab header from first sample in meta_array - while [ "$count" -lt 1 ]; do - head -n -1 $i > ~{repository}_upload_meta.csv - count+=1 - done - #populate csv with each samples metadata - sed 's+",\".*\.gisaid\.fa+\",\"GISAID_upload.fasta+g' $i | tail -n1 >> ~{repository}_upload_meta.csv - done - - cat ${passed_assemblies[*]} > ~{repository}_upload.fasta - - >>> - - output { - File? upload_meta = "${repository}_upload_meta.csv" - File? upload_fasta = "${repository}_upload.fasta" - File batched_samples = "${repository}_batched_samples.tsv" - File excluded_samples = "${repository}_excluded_samples.tsv" - - } - - runtime { - docker: docker_image - memory: "~{mem_size_gb} GB" - cpu: CPUs - disks: "local-disk ~{disk_size} SSD" - preemptible: preemptible_tries - maxRetries: 3 - } -} diff --git a/tasks/task_taxonID.wdl b/tasks/task_taxonID.wdl index ae3c34ac..5c2d6b3c 100644 --- a/tasks/task_taxonID.wdl +++ b/tasks/task_taxonID.wdl @@ -2,66 +2,62 @@ version 1.0 task kraken2 { input { - File read1 - File? read2 - String samplename - String? kraken2_db = "/kraken2-db" - Int? cpus=4 + File read1 + File? read2 + String samplename + String? kraken2_db = "/kraken2-db" + Int? cpu = 4 } - - command{ + command <<< # date and version control date | tee DATE kraken2 --version | head -n1 | tee VERSION num_reads=$(ls *fastq.gz 2> /dev/nul | wc -l) - if ! [ -z ${read2} ]; then + if ! 
[ -z ~{read2} ]; then mode="--paired" fi echo $mode kraken2 $mode \ - --threads ${cpus} \ - --db ${kraken2_db} \ - ${read1} ${read2} \ - --report ${samplename}_kraken2_report.txt >/dev/null + --threads ~{cpu} \ + --db ~{kraken2_db} \ + ~{read1} ~{read2} \ + --report ~{samplename}_kraken2_report.txt >/dev/null - percentage_human=$(grep "Homo sapiens" ${samplename}_kraken2_report.txt | cut -f 1) + percentage_human=$(grep "Homo sapiens" ~{samplename}_kraken2_report.txt | cut -f 1) # | tee PERCENT_HUMAN - percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ${samplename}_kraken2_report.txt | cut -f1 ) + percentage_sc2=$(grep "Severe acute respiratory syndrome coronavirus 2" ~{samplename}_kraken2_report.txt | cut -f1 ) # | tee PERCENT_COV if [ -z "$percentage_human" ] ; then percentage_human="0" ; fi if [ -z "$percentage_sc2" ] ; then percentage_sc2="0" ; fi echo $percentage_human | tee PERCENT_HUMAN echo $percentage_sc2 | tee PERCENT_SC2 - } - + >>> output { - String date = read_string("DATE") - String version = read_string("VERSION") - File kraken_report = "${samplename}_kraken2_report.txt" - Float percent_human = read_string("PERCENT_HUMAN") - Float percent_sc2 = read_string("PERCENT_SC2") + String date = read_string("DATE") + String version = read_string("VERSION") + File kraken_report = "~{samplename}_kraken2_report.txt" + Float percent_human = read_string("PERCENT_HUMAN") + Float percent_sc2 = read_string("PERCENT_SC2") } - runtime { - docker: "quay.io/staphb/kraken2:2.0.8-beta_hv" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/staphb/kraken2:2.0.8-beta_hv" + memory: "8 GB" + cpu: cpu + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } task pangolin3 { input { - File fasta - String samplename - Int min_length=10000 - Float max_ambig=0.5 - String docker="quay.io/staphb/pangolin:3.1.17-pangolearn-2022-01-05" - String inference_engine="usher" + File fasta + String samplename + Int min_length = 10000 + Float max_ambig = 0.5 + String docker = "quay.io/staphb/pangolin:3.1.20-pangolearn-2022-02-02" + String inference_engine = "usher" } - command <<< set -e # set inference inference_engine @@ -104,29 +100,27 @@ task pangolin3 { with open("PANGOLIN_NOTES", 'wt') as lineage: lineage.write(line["note"]) CODE - >>> - output { - String date = read_string("DATE") - String pangolin_lineage = read_string("PANGOLIN_LINEAGE") - String pangolin_conflicts = read_string("PANGOLIN_CONFLICTS") - String pangolin_notes = read_string("PANGOLIN_NOTES") - String pangolin_assignment_version = read_string("PANGO_ASSIGNMENT_VERSION") - String pangolin_versions = read_string("VERSION_PANGOLIN_ALL") - String pangolin_docker = docker - File pango_lineage_report = "${samplename}.pangolin_report.csv" + String date = read_string("DATE") + String pangolin_lineage = read_string("PANGOLIN_LINEAGE") + String pangolin_conflicts = read_string("PANGOLIN_CONFLICTS") + String pangolin_notes = read_string("PANGOLIN_NOTES") + String pangolin_assignment_version = read_string("PANGO_ASSIGNMENT_VERSION") + String pangolin_versions = read_string("VERSION_PANGOLIN_ALL") + String pangolin_docker = docker + File pango_lineage_report = "~{samplename}.pangolin_report.csv" } - runtime { - docker: "~{docker}" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "~{docker}" + memory: "8 GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } + task pangolin_update_log { input { String 
samplename @@ -139,9 +133,8 @@ task pangolin_update_log { String updated_pangolin_assignment_version String updated_pangolin_versions String? timezone - File? lineage_log + File? lineage_log } - command <<< # set timezone for date outputs ~{default='' 'export TZ=' + timezone} @@ -178,21 +171,18 @@ task pangolin_update_log { echo -e "${DATE}\t${UPDATE_STATUS}\t~{current_lineage}\t~{current_pangolin_docker}\t~{current_pangolin_assignment_version}\t~{current_pangolin_versions}\t~{updated_lineage}\t~{updated_pangolin_docker}\t~{updated_pangolin_assignment_version}\t~{updated_pangolin_versions}" >> "${lineage_log_file}" echo "${UPDATE_STATUS} (${DATE})" | tee PANGOLIN_UPDATE - >>> - output { - String pangolin_updates = read_string("PANGOLIN_UPDATE") - File pango_lineage_log = "~{samplename}_pango_lineage_log.tsv" + String pangolin_updates = read_string("PANGOLIN_UPDATE") + File pango_lineage_log = "~{samplename}_pango_lineage_log.tsv" } - runtime { - docker: "quay.io/theiagen/utility:1.1" - memory: "8 GB" - cpu: 4 - disks: "local-disk 100 SSD" - preemptible: 0 - maxRetries: 3 + docker: "quay.io/theiagen/utility:1.1" + memory: "8 GB" + cpu: 4 + disks: "local-disk 100 SSD" + preemptible: 0 + maxRetries: 3 } } @@ -201,20 +191,20 @@ task nextclade_one_sample { description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults." } input { - File genome_fasta - File? root_sequence - File? auspice_reference_tree_json - File? qc_config_json - File? gene_annotations_json - File? pcr_primers_csv - File? virus_properties - String docker = "nextstrain/nextclade:1.10.2" - String dataset_name - String dataset_reference - String dataset_tag + File genome_fasta + File? root_sequence + File? auspice_reference_tree_json + File? qc_config_json + File? gene_annotations_json + File? pcr_primers_csv + File? virus_properties + String docker = "nextstrain/nextclade:1.10.3" + String dataset_name + String dataset_reference + String dataset_tag } String basename = basename(genome_fasta, ".fasta") - command { + command <<< NEXTCLADE_VERSION="$(nextclade --version)" echo $NEXTCLADE_VERSION > NEXTCLADE_VERSION @@ -232,33 +222,33 @@ task nextclade_one_sample { --output-tsv "~{basename}".nextclade.tsv \ --output-tree "~{basename}".nextclade.auspice.json \ --verbose - } + >>> runtime { - docker: "~{docker}" - memory: "4 GB" - cpu: 2 - disks: "local-disk 50 HDD" - dx_instance_type: "mem1_ssd1_v2_x2" - maxRetries: 3 + docker: "~{docker}" + memory: "4 GB" + cpu: 2 + disks: "local-disk 50 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 3 } output { - String nextclade_version = read_string("NEXTCLADE_VERSION") - File nextclade_json = "~{basename}.nextclade.json" - File auspice_json = "~{basename}.nextclade.auspice.json" - File nextclade_tsv = "~{basename}.nextclade.tsv" - String nextclade_docker = docker + String nextclade_version = read_string("NEXTCLADE_VERSION") + File nextclade_json = "~{basename}.nextclade.json" + File auspice_json = "~{basename}.nextclade.auspice.json" + File nextclade_tsv = "~{basename}.nextclade.tsv" + String nextclade_docker = docker } } task nextclade_output_parser_one_sample { meta { - description: "Python and bash codeblocks for parsing the output files from Nextclade." + description: "Python and bash codeblocks for parsing the output files from Nextclade." 
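Note for reviewers: the TSV written by nextclade_one_sample is the input expected by nextclade_output_parser_one_sample below. The following is a minimal caller sketch, illustrative only and not part of this changeset; the relative import path and the dataset_name / dataset_reference / dataset_tag values are assumptions chosen for the example.

version 1.0

# Minimal sketch of chaining the two Nextclade tasks (assumed import path and dataset values)
import "../tasks/task_taxonID.wdl" as taxon_id

workflow nextclade_example {
  input {
    File assembly_fasta
  }
  call taxon_id.nextclade_one_sample {
    input:
      genome_fasta = assembly_fasta,
      dataset_name = "sars-cov-2",           # assumed example value
      dataset_reference = "MN908947",        # assumed example value
      dataset_tag = "2022-01-18T12:00:00Z"   # assumed example value
  }
  call taxon_id.nextclade_output_parser_one_sample {
    input:
      nextclade_tsv = nextclade_one_sample.nextclade_tsv
  }
  output {
    String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade
    String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs
    String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels
  }
}

The fragment only illustrates the data flow between the two tasks; it does not change any behavior in this diff.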
  }
  input {
-    File nextclade_tsv
-    String docker = "python:slim"
+    File nextclade_tsv
+    String docker = "python:slim"
  }
-  command {
+  command <<<
    # Set WDL input variable to input.tsv file
    cat "~{nextclade_tsv}" > input.tsv
    # Parse outputs using python3
@@ -294,19 +284,19 @@
        nc_aa_dels=nc_aa_dels
        Nextclade_AA_Dels.write(nc_aa_dels)
    CODE
-  }
+  >>>
  runtime {
-    docker: "~{docker}"
-    memory: "4 GB"
-    cpu: 2
-    disks: "local-disk 50 HDD"
-    dx_instance_type: "mem1_ssd1_v2_x2"
-    maxRetries: 3
+    docker: "~{docker}"
+    memory: "4 GB"
+    cpu: 2
+    disks: "local-disk 50 HDD"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    maxRetries: 3
  }
  output {
-    String nextclade_clade = read_string("NEXTCLADE_CLADE")
-    String nextclade_aa_subs = read_string("NEXTCLADE_AASUBS")
-    String nextclade_aa_dels = read_string("NEXTCLADE_AADELS")
+    String nextclade_clade = read_string("NEXTCLADE_CLADE")
+    String nextclade_aa_subs = read_string("NEXTCLADE_AASUBS")
+    String nextclade_aa_dels = read_string("NEXTCLADE_AADELS")
  }
}
@@ -316,43 +306,53 @@ task freyja_one_sample {
    String samplename
    File reference_genome
    File? freyja_usher_barcodes
-    Boolean update_db = true
-    String docker = "staphb/freyja:1.2"
+    File? freyja_lineage_metadata
+    Float? eps=0.001
+    Boolean update_db = false
+    String docker = "quay.io/staphb/freyja:1.3.2"
  }
  command <<<
-    # configure barcode settings and capture version
-    #if [[ ! -z "~{freyja_usher_barcodes}" ]]; then
-    #  #capture database info
-    #  azfreyja_usher_barcode_version=$(basename -- "~{freyja_usher_barcodes}")
-    #  echo "here"
-    #  #set environment with user-defined db
-    #  mv ~{freyja_usher_barcodes} /opt/conda/envs/freyja-env/lib/python3.7/site-packages/freyja/data/usher_barcodes.csv
-    #else
-    #  # update db if specified
-    #  if ~{update_db}; then
-    #    freyja update
-    #    freyja_usher_barcode_version="freyja update: $(date +"%Y-%m-%d")"
-    #  else
-    #    freyja_usher_barcode_version="unmodified from freyja container: ~{docker}"
-    #  fi
-    #fi
-
-    # always update freyja barcodes until v1.3.1 release (will allow user-defined ref files)
-    freyja update
-    freyja_usher_barcode_version="freyja update: $(date +"%Y-%m-%d")"
-
+    # update freyja reference files if specified
+    if ~{update_db}; then
+      freyja update
+      # can't update barcodes in freyja 1.3.2; will update when the known issue is closed (https://github.com/andersen-lab/Freyja/issues/33)
+      freyja_usher_barcode_version="unmodified from freyja container: ~{docker}"
+      freyja_barcode=""
+      freyja_metadata_version="freyja update: $(date +"%Y-%m-%d")"
+      freyja_metadata=""
+    else
+      # configure barcode
+      if [[ ! -z "~{freyja_usher_barcodes}" ]]; then
+        echo "User freyja usher barcodes identified; ~{freyja_usher_barcodes} will be utilized for freyja demixing"
+        freyja_barcode="--barcodes ~{freyja_usher_barcodes}"
+        freyja_usher_barcode_version=$(basename -- "~{freyja_usher_barcodes}")
+      else
+        freyja_barcode=""
+        freyja_usher_barcode_version="unmodified from freyja container: ~{docker}"
+      fi
+      # configure lineage metadata
+      if [[ !
-z "~{freyja_lineage_metadata}" ]]; then + echo "User lineage metadata; ~{freyja_lineage_metadata} will be utilized fre freyja demixing" + freyja_metadata="--meta ~{freyja_lineage_metadata}" + freyja_metadata_version=$(basename -- "~{freyja_lineage_metadata}") + else + freyja_metadata="" + freyja_metadata_version="unmodified from freyja container: ~{docker}" + fi + fi + # Capture reference file versions echo ${freyja_usher_barcode_version} | tee FREYJA_BARCODES - + echo ${freyja_metadata_version} | tee FREYJA_METADATA + echo $PWD # Call variants and capture sequencing depth information + echo "Running: freyja variants ~{primer_trimmed_bam} --variants ~{samplename}_freyja_variants.tsv --depths ~{samplename}_freyja_depths.tsv --ref ~{reference_genome}" freyja variants ~{primer_trimmed_bam} --variants ~{samplename}_freyja_variants.tsv --depths ~{samplename}_freyja_depths.tsv --ref ~{reference_genome} - # Demix variants - freyja demix ~{samplename}_freyja_variants.tsv ~{samplename}_freyja_depths.tsv --output ~{samplename}_freyja_demixed.tmp + echo "Running: freyja demix --eps ~{eps} ${freyja_barcode} ${freyja_metadata} ~{samplename}_freyja_variants.tsv ~{samplename}_freyja_depths.tsv --output ~{samplename}_freyja_demixed.tmp" + freyja demix --eps ~{eps} ${freyja_barcode} ${freyja_metadata} ~{samplename}_freyja_variants.tsv ~{samplename}_freyja_depths.tsv --output ~{samplename}_freyja_demixed.tmp # Adjust output header echo -e "\t/~{samplename}" > ~{samplename}_freyja_demixed.tsv tail -n+2 ~{samplename}_freyja_demixed.tmp >> ~{samplename}_freyja_demixed.tsv - - >>> runtime { memory: "4 GB" @@ -365,6 +365,6 @@ task freyja_one_sample { File freyja_depths = "~{samplename}_freyja_depths.tsv" File freyja_demixed = "~{samplename}_freyja_demixed.tsv" String freyja_barcode_version = read_string("FREYJA_BARCODES") + String freyja_metadata_version = read_string("FREYJA_METADATA") } - } diff --git a/tasks/task_theiacov_summary.wdl b/tasks/task_theiacov_summary.wdl new file mode 100644 index 00000000..8fbfacc3 --- /dev/null +++ b/tasks/task_theiacov_summary.wdl @@ -0,0 +1,189 @@ +version 1.0 + +task theiacov_summary { + input { + String samplename + String theiacov_workflow + String theiacov_version + String theiacov_analysis_date + String seq_platform + String primer_bed_name + Float percent_reference_coverage + Float? s_gene_mean_coverage + Int number_N + String pango_lineage + String pangolin_conflicts + String pangolin_notes + String pangolin_assignment_version + String pangolin_docker + String pangolin_versions + String nextclade_clade + String nextclade_aa_subs + String nextclade_aa_dels + String vadr_num_alerts + Int assembly_length_unambiguous + Float assembly_mean_coverage + String assembly_method + Int number_Degenerate + Int number_Total + Float meanbaseq_trim + Float meanmapq_trim + Int num_reads_clean1 + String? num_reads_clean2 = "" + String? num_reads_clean_pairs = "" + Int num_reads_raw1 + String? num_reads_raw2 = "" + String? num_reads_raw_pairs = "" + Float kraken_human + String? kraken_human_dehosted = "" + Float kraken_sc2 + String? kraken_sc2_dehosted = "" + String? primer_trimmed_read_percent + String? artic_version + String? artic_docker + String? medaka_reference + String? bbduk_docker = "" + String? bwa_version = "" + String? fastq_scan_version = "" + String? ivar_variant_version = "" + String? ivar_version_consensus = "" + String? ivar_version_primtrim = "" + String? kraken_version = "" + String? nextclade_version = "" + String? 
nextclade_docker = "" + String samtools_version + String? samtools_version_consensus = "" + String? samtools_version_primtrim = "" + String? samtools_version_stats = "" + String? trimmomatic_version = "" + String? vadr_docker = "" + } + command <<< + python3<>> + output { + File summary = '~{samplename}.results.json' + } + runtime { + docker: "python:slim" + memory: "1 GB" + cpu: 1 + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 3 + } +} + +task merge_theiacov_summary { + input { + Array[File?] clearlabs_summaries + Array[File?] illumina_pe_summaries + Array[File?] illumina_se_summaries + Array[File?] ont_summaries + } + Array[File] clearlabs = select_all(clearlabs_summaries) + Array[File] illumina_pe = select_all(illumina_pe_summaries) + Array[File] illumina_se = select_all(illumina_se_summaries) + Array[File] ont = select_all(ont_summaries) + command <<< + python3<>> + output { + File summaries_tsv = "theiacov-results.tsv" + File summaries_json = "theiacov-results.json" + } + runtime { + docker: "python:3.9.5-slim" + memory: "1 GB" + cpu: 1 + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 3 + } +} diff --git a/tasks/task_titan_summary.wdl b/tasks/task_titan_summary.wdl deleted file mode 100644 index 53080518..00000000 --- a/tasks/task_titan_summary.wdl +++ /dev/null @@ -1,194 +0,0 @@ -version 1.0 - -task titan_summary { - input { - String samplename - String titan_workflow - String titan_version - String titan_analysis_date - String seq_platform - String primer_bed_name - Float percent_reference_coverage - Float? s_gene_mean_coverage - Int number_N - String pango_lineage - String pangolin_conflicts - String pangolin_notes - String pangolin_assignment_version - String pangolin_docker - String pangolin_versions - String nextclade_clade - String nextclade_aa_subs - String nextclade_aa_dels - String vadr_num_alerts - Int assembly_length_unambiguous - Float assembly_mean_coverage - String assembly_method - Int number_Degenerate - Int number_Total - Float meanbaseq_trim - Float meanmapq_trim - Int fastq_scan_clean1 - String? fastq_scan_clean2 = "" - String? fastq_scan_clean_pairs = "" - Int fastq_scan_raw1 - String? fastq_scan_raw2 = "" - String? fastq_scan_raw_pairs = "" - Float kraken_human - String? kraken_human_dehosted = "" - Float kraken_sc2 - String? kraken_sc2_dehosted = "" - String? primer_trimmed_read_percent - String? artic_version - String? artic_docker - String? medaka_reference - String? bbduk_docker = "" - String? bwa_version = "" - String? fastq_scan_version = "" - String? ivar_variant_version = "" - String? ivar_version_consensus = "" - String? ivar_version_primtrim = "" - String? kraken_version = "" - String? nextclade_version = "" - String? nextclade_docker = "" - String samtools_version - String? samtools_version_consensus = "" - String? samtools_version_primtrim = "" - String? samtools_version_stats = "" - String? trimmomatic_version = "" - String? vadr_docker = "" - } - - command <<< - python3<>> - - output { - File summary = '~{samplename}.results.json' - } - - runtime { - docker: "python:slim" - memory: "1 GB" - cpu: 1 - dx_instance_type: "mem1_ssd1_v2_x2" - maxRetries: 3 - } -} - -task merge_titan_summary { - input { - Array[File?] clearlabs_summaries - Array[File?] illumina_pe_summaries - Array[File?] illumina_se_summaries - Array[File?] 
ont_summaries - } - Array[File] clearlabs = select_all(clearlabs_summaries) - Array[File] illumina_pe = select_all(illumina_pe_summaries) - Array[File] illumina_se = select_all(illumina_se_summaries) - Array[File] ont = select_all(ont_summaries) - - command <<< - python3<>> - output { - File summaries_tsv = "titan-results.tsv" - File summaries_json = "titan-results.json" - } - - runtime { - docker: "python:3.9.5-slim" - memory: "1 GB" - cpu: 1 - dx_instance_type: "mem1_ssd1_v2_x2" - maxRetries: 3 - } -} diff --git a/tasks/task_validate.wdl b/tasks/task_validate.wdl index 48a40c33..8bf8d0c8 100644 --- a/tasks/task_validate.wdl +++ b/tasks/task_validate.wdl @@ -1,52 +1,51 @@ version 1.0 task export_two_tsvs { - input { - String terra_project - String terra_workspace - String datatable1 - String datatable2 + String terra_project + String terra_workspace + String datatable1 + String datatable2 } - command { + command <<< python3 /scripts/export_large_tsv/export_large_tsv.py --project ~{terra_project} --workspace ~{terra_workspace} --entity_type ~{datatable1} --tsv_filename ~{datatable1} python3 /scripts/export_large_tsv/export_large_tsv.py --project ~{terra_project} --workspace ~{terra_workspace} --entity_type ~{datatable2} --tsv_filename ~{datatable2} - } + >>> runtime { - docker: "broadinstitute/terra-tools:tqdm" - memory: "1 GB" - cpu: 1 - disks: "local-disk 10 HDD" - dx_instance_type: "mem1_ssd1_v2_x2" - maxRetries: 3 + docker: "broadinstitute/terra-tools:tqdm" + memory: "1 GB" + cpu: 1 + disks: "local-disk 10 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 3 } output { - File datatable1_tsv = "~{datatable1}" - File datatable2_tsv = "~{datatable2}" + File datatable1_tsv = "~{datatable1}" + File datatable2_tsv = "~{datatable2}" } } task compare_two_tsvs { input { - File datatable1_tsv - File datatable2_tsv + File datatable1_tsv + File datatable2_tsv String? out_dir String? 
out_prefix } - command{ + command <<< compare-data-tables.py ~{datatable1_tsv} ~{datatable2_tsv} --outdir ~{out_dir} --prefix ~{out_prefix} - } + >>> runtime { - docker: "quay.io/theiagen/utility:1.2" - memory: "4 GB" - cpu: 2 - disks: "local-disk 50 HDD" - dx_instance_type: "mem1_ssd1_v2_x2" - maxRetries: 3 + docker: "quay.io/theiagen/utility:1.2" + memory: "4 GB" + cpu: 2 + disks: "local-disk 50 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 3 } output { - File pdf_report = "~{out_dir}/~{out_prefix}.pdf" - File xl_report = "~{out_dir}/~{out_prefix}.xlsx" + File pdf_report = "~{out_dir}/~{out_prefix}.pdf" + File xl_report = "~{out_dir}/~{out_prefix}.xlsx" } } diff --git a/tasks/task_versioning.wdl b/tasks/task_versioning.wdl index d6cd8957..394f7e29 100644 --- a/tasks/task_versioning.wdl +++ b/tasks/task_versioning.wdl @@ -8,7 +8,7 @@ task version_capture { volatile: true } command <<< - PHVG_Version="PHVG v1.6.0-dev" + PHVG_Version="PHVG v2.0.0" ~{default='' 'export TZ=' + timezone} date +"%Y-%m-%d" > TODAY echo $PHVG_Version > PHVG_VERSION diff --git a/tests/workflows/theiacov-gc/test_clearlabs.yml b/tests/workflows/theiacov-gc/test_clearlabs.yml new file mode 100644 index 00000000..51aa7f5b --- /dev/null +++ b/tests/workflows/theiacov-gc/test_clearlabs.yml @@ -0,0 +1,32 @@ +- name: theiacov_clearlabs + command: theiacov-gc -i theiacov/clearlabs.json -o theiacov/clearlabs + tags: + - theiacov_gc + - theiacov_clearlabs + files: + - path: theiacov/clearlabs/alignments/clearlabs.flagstat.txt + md5sum: b5af0b1c2721c004f8f476d9ca540a35 + - path: theiacov/clearlabs/alignments/clearlabs.pass.vcf + contains: ["VCF", "medaka", "CHROM", "PASS"] + - path: theiacov/clearlabs/alignments/clearlabs.primertrimmed.rg.sorted.bam + - path: theiacov/clearlabs/alignments/clearlabs.primertrimmed.rg.sorted.bam.bai + - path: theiacov/clearlabs/alignments/clearlabs.stats.txt + - path: theiacov/clearlabs/assemblies/clearlabs.medaka.consensus.fasta + md5sum: ff8bf729b73fd7d34200100957b6f92d + - path: theiacov/clearlabs/cromwell-stderr.txt + - path: theiacov/clearlabs/cromwell-stdout.txt + - path: theiacov/clearlabs/dehosted_reads/clearlabs_R1_dehosted.fastq.gz + - path: theiacov/clearlabs/kraken2_reports/clearlabs_dehosted_kraken2_report.txt + md5sum: 35841fa2d77ec202c275b1de548b8d98 + - path: theiacov/clearlabs/kraken2_reports/clearlabs_kraken2_report.txt + md5sum: 35841fa2d77ec202c275b1de548b8d98 + - path: theiacov/clearlabs/nextclade/clearlabs.medaka.consensus.nextclade.auspice.json + - path: theiacov/clearlabs/nextclade/clearlabs.medaka.consensus.nextclade.json + - path: theiacov/clearlabs/nextclade/clearlabs.medaka.consensus.nextclade.tsv + - path: theiacov/clearlabs/pangolin_reports/clearlabs.pangolin_report.csv + - path: theiacov/clearlabs/results/clearlabs.results.json + - path: theiacov/clearlabs/theiacov-metadata.json + - path: theiacov/clearlabs/theiacov-results.json + - path: theiacov/clearlabs/theiacov-results.tsv + - path: theiacov/clearlabs/vadr_alerts/clearlabs.medaka.consensus.vadr.alt.list + md5sum: 3d858fa350d36dbcdfc1a52180ec9d87 diff --git a/tests/workflows/theiacov-gc/test_illumina_pe.yml b/tests/workflows/theiacov-gc/test_illumina_pe.yml new file mode 100644 index 00000000..7273d063 --- /dev/null +++ b/tests/workflows/theiacov-gc/test_illumina_pe.yml @@ -0,0 +1,33 @@ +- name: theiacov_illumina_pe + command: theiacov-gc -i theiacov/illumina_pe.json -o theiacov/illumina_pe + tags: + - theiacov_gc + - theiacov_illumina_pe + files: + - path: 
theiacov/illumina_pe/alignments/SRR13687078.flagstat.txt + md5sum: 7cd08dcf0ccb7cf5b1f445852673fe94 + - path: theiacov/illumina_pe/alignments/SRR13687078.primertrim.sorted.bam + - path: theiacov/illumina_pe/alignments/SRR13687078.primertrim.sorted.bam.bai + - path: theiacov/illumina_pe/alignments/SRR13687078.stats.txt + - path: theiacov/illumina_pe/alignments/SRR13687078.variants.vcf + md5sum: 285abef3eca1928504afcdaed801a7ad + - path: theiacov/illumina_pe/assemblies/SRR13687078.ivar.consensus.fasta + md5sum: 6923b89d36bc66e1cdfabfd1dedbea71 + - path: theiacov/illumina_pe/cromwell-stderr.txt + - path: theiacov/illumina_pe/cromwell-stdout.txt + - path: theiacov/illumina_pe/dehosted_reads/SRR13687078_R1_dehosted.fastq.gz + - path: theiacov/illumina_pe/dehosted_reads/SRR13687078_R2_dehosted.fastq.gz + - path: theiacov/illumina_pe/kraken2_reports/SRR13687078_dehosted_kraken2_report.txt + md5sum: 3544d9ca35d45093c03cdead46677765 + - path: theiacov/illumina_pe/kraken2_reports/SRR13687078_kraken2_report.txt + md5sum: 3544d9ca35d45093c03cdead46677765 + - path: theiacov/illumina_pe/nextclade/SRR13687078.ivar.consensus.nextclade.auspice.json + - path: theiacov/illumina_pe/nextclade/SRR13687078.ivar.consensus.nextclade.json + - path: theiacov/illumina_pe/nextclade/SRR13687078.ivar.consensus.nextclade.tsv + - path: theiacov/illumina_pe/pangolin_reports/SRR13687078.pangolin_report.csv + - path: theiacov/illumina_pe/results/SRR13687078.results.json + - path: theiacov/illumina_pe/theiacov-metadata.json + - path: theiacov/illumina_pe/theiacov-results.json + - path: theiacov/illumina_pe/theiacov-results.tsv + - path: theiacov/illumina_pe/vadr_alerts/SRR13687078.ivar.consensus.vadr.alt.list + md5sum: fc0e4e38d8484f8547c4cf265f41f7ad diff --git a/tests/workflows/theiacov-gc/test_illumina_se.yml b/tests/workflows/theiacov-gc/test_illumina_se.yml new file mode 100644 index 00000000..8a74a356 --- /dev/null +++ b/tests/workflows/theiacov-gc/test_illumina_se.yml @@ -0,0 +1,29 @@ +- name: theiacov_illumina_se + command: theiacov-gc -i theiacov/illumina_se.json -o theiacov/illumina_se + tags: + - theiacov_gc + - theiacov_illumina_se + files: + - path: theiacov/illumina_se/alignments/ERR6319327.flagstat.txt + md5sum: 63f3429dcb99b85bcc0e525ef19c8471 + - path: theiacov/illumina_se/alignments/ERR6319327.primertrim.sorted.bam + - path: theiacov/illumina_se/alignments/ERR6319327.primertrim.sorted.bam.bai + - path: theiacov/illumina_se/alignments/ERR6319327.stats.txt + - path: theiacov/illumina_se/alignments/ERR6319327.variants.vcf + md5sum: 9c0f25e6f486f8b8731ac87aed283d23 + - path: theiacov/illumina_se/assemblies/ERR6319327.ivar.consensus.fasta + md5sum: cb2c2f900d2942b30b762bf3b7a0a17a + - path: theiacov/illumina_se/cromwell-stderr.txt + - path: theiacov/illumina_se/cromwell-stdout.txt + - path: theiacov/illumina_se/kraken2_reports/ERR6319327_kraken2_report.txt + md5sum: 7e4fc05efbbc3937b99420e6193be061 + - path: theiacov/illumina_se/nextclade/ERR6319327.ivar.consensus.nextclade.auspice.json + - path: theiacov/illumina_se/nextclade/ERR6319327.ivar.consensus.nextclade.json + - path: theiacov/illumina_se/nextclade/ERR6319327.ivar.consensus.nextclade.tsv + - path: theiacov/illumina_se/pangolin_reports/ERR6319327.pangolin_report.csv + - path: theiacov/illumina_se/results/ERR6319327.results.json + - path: theiacov/illumina_se/theiacov-metadata.json + - path: theiacov/illumina_se/theiacov-results.json + - path: theiacov/illumina_se/theiacov-results.tsv + - path: 
theiacov/illumina_se/vadr_alerts/ERR6319327.ivar.consensus.vadr.alt.list + md5sum: fc0e4e38d8484f8547c4cf265f41f7ad diff --git a/tests/workflows/theiacov-gc/test_ont.yml b/tests/workflows/theiacov-gc/test_ont.yml new file mode 100644 index 00000000..b506817c --- /dev/null +++ b/tests/workflows/theiacov-gc/test_ont.yml @@ -0,0 +1,32 @@ +- name: theiacov_ont + command: theiacov-gc -i theiacov/ont.json -o theiacov/ont + tags: + - theiacov_gc + - theiacov_ont + files: + - path: theiacov/ont/alignments/ont.flagstat.txt + md5sum: 85c7b4e3985735fd2a45f7f5a09b529f + - path: theiacov/ont/alignments/ont.pass.vcf + contains: ["VCF", "medaka", "CHROM", "PASS"] + - path: theiacov/ont/alignments/ont.primertrimmed.rg.sorted.bam + - path: theiacov/ont/alignments/ont.primertrimmed.rg.sorted.bam.bai + - path: theiacov/ont/alignments/ont.stats.txt + - path: theiacov/ont/assemblies/ont.medaka.consensus.fasta + md5sum: 02a04e9cf297acc8fc03667edab41fde + - path: theiacov/ont/cromwell-stderr.txt + - path: theiacov/ont/cromwell-stdout.txt + - path: theiacov/ont/dehosted_reads/ont_R1_dehosted.fastq.gz + - path: theiacov/ont/kraken2_reports/ont_dehosted_kraken2_report.txt + md5sum: 5780753ae61523a3621fbe2635b04b70 + - path: theiacov/ont/kraken2_reports/ont_kraken2_report.txt + md5sum: f2c6f26b1ef2786d124eae2ab3fe80c1 + - path: theiacov/ont/nextclade/ont.medaka.consensus.nextclade.auspice.json + - path: theiacov/ont/nextclade/ont.medaka.consensus.nextclade.json + - path: theiacov/ont/nextclade/ont.medaka.consensus.nextclade.tsv + - path: theiacov/ont/pangolin_reports/ont.pangolin_report.csv + - path: theiacov/ont/results/ont.results.json + - path: theiacov/ont/theiacov-metadata.json + - path: theiacov/ont/theiacov-results.json + - path: theiacov/ont/theiacov-results.tsv + - path: theiacov/ont/vadr_alerts/ont.medaka.consensus.vadr.alt.list + md5sum: fc0e4e38d8484f8547c4cf265f41f7ad diff --git a/tests/workflows/titan-gc/test_clearlabs.yml b/tests/workflows/titan-gc/test_clearlabs.yml deleted file mode 100644 index a234d422..00000000 --- a/tests/workflows/titan-gc/test_clearlabs.yml +++ /dev/null @@ -1,32 +0,0 @@ -- name: titan_clearlabs - command: titan-gc -i titan/clearlabs.json -o titan/clearlabs - tags: - - titan_gc - - titan_clearlabs - files: - - path: titan/clearlabs/alignments/clearlabs.flagstat.txt - md5sum: 81aaa3b9972d4610e07967553805e65d - - path: titan/clearlabs/alignments/clearlabs.pass.vcf - contains: ["VCF", "medaka", "CHROM", "PASS"] - - path: titan/clearlabs/alignments/clearlabs.primertrimmed.rg.sorted.bam - - path: titan/clearlabs/alignments/clearlabs.primertrimmed.rg.sorted.bam.bai - - path: titan/clearlabs/alignments/clearlabs.stats.txt - - path: titan/clearlabs/assemblies/clearlabs.medaka.consensus.fasta - md5sum: ff8bf729b73fd7d34200100957b6f92d - - path: titan/clearlabs/cromwell-stderr.txt - - path: titan/clearlabs/cromwell-stdout.txt - - path: titan/clearlabs/dehosted_reads/clearlabs_R1_dehosted.fastq.gz - - path: titan/clearlabs/kraken2_reports/clearlabs_dehosted_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 - - path: titan/clearlabs/kraken2_reports/clearlabs_kraken2_report.txt - md5sum: 35841fa2d77ec202c275b1de548b8d98 - - path: titan/clearlabs/nextclade/clearlabs.medaka.consensus.nextclade.auspice.json - - path: titan/clearlabs/nextclade/clearlabs.medaka.consensus.nextclade.json - - path: titan/clearlabs/nextclade/clearlabs.medaka.consensus.nextclade.tsv - - path: titan/clearlabs/pangolin_reports/clearlabs.pangolin_report.csv - - path: 
titan/clearlabs/results/clearlabs.results.json - - path: titan/clearlabs/titan-metadata.json - - path: titan/clearlabs/titan-results.json - - path: titan/clearlabs/titan-results.tsv - - path: titan/clearlabs/vadr_alerts/clearlabs.medaka.consensus.vadr.alt.list - md5sum: 3d858fa350d36dbcdfc1a52180ec9d87 diff --git a/tests/workflows/titan-gc/test_illumina_pe.yml b/tests/workflows/titan-gc/test_illumina_pe.yml deleted file mode 100644 index c0f44134..00000000 --- a/tests/workflows/titan-gc/test_illumina_pe.yml +++ /dev/null @@ -1,33 +0,0 @@ -- name: titan_illumina_pe - command: titan-gc -i titan/illumina_pe.json -o titan/illumina_pe - tags: - - titan_gc - - titan_illumina_pe - files: - - path: titan/illumina_pe/alignments/SRR13687078.flagstat.txt - md5sum: 7cd08dcf0ccb7cf5b1f445852673fe94 - - path: titan/illumina_pe/alignments/SRR13687078.primertrim.sorted.bam - - path: titan/illumina_pe/alignments/SRR13687078.primertrim.sorted.bam.bai - - path: titan/illumina_pe/alignments/SRR13687078.stats.txt - - path: titan/illumina_pe/alignments/SRR13687078.variants.vcf - md5sum: 19e716097dcbad1a20f177d8a2590ef5 - - path: titan/illumina_pe/assemblies/SRR13687078.ivar.consensus.fasta - md5sum: 5ef30a51925f1b2569f998ae12e1e186 - - path: titan/illumina_pe/cromwell-stderr.txt - - path: titan/illumina_pe/cromwell-stdout.txt - - path: titan/illumina_pe/dehosted_reads/SRR13687078_R1_dehosted.fastq.gz - - path: titan/illumina_pe/dehosted_reads/SRR13687078_R2_dehosted.fastq.gz - - path: titan/illumina_pe/kraken2_reports/SRR13687078_dehosted_kraken2_report.txt - md5sum: 3544d9ca35d45093c03cdead46677765 - - path: titan/illumina_pe/kraken2_reports/SRR13687078_kraken2_report.txt - md5sum: 3544d9ca35d45093c03cdead46677765 - - path: titan/illumina_pe/nextclade/SRR13687078.ivar.consensus.nextclade.auspice.json - - path: titan/illumina_pe/nextclade/SRR13687078.ivar.consensus.nextclade.json - - path: titan/illumina_pe/nextclade/SRR13687078.ivar.consensus.nextclade.tsv - - path: titan/illumina_pe/pangolin_reports/SRR13687078.pangolin_report.csv - - path: titan/illumina_pe/results/SRR13687078.results.json - - path: titan/illumina_pe/titan-metadata.json - - path: titan/illumina_pe/titan-results.json - - path: titan/illumina_pe/titan-results.tsv - - path: titan/illumina_pe/vadr_alerts/SRR13687078.ivar.consensus.vadr.alt.list - md5sum: fc0e4e38d8484f8547c4cf265f41f7ad diff --git a/tests/workflows/titan-gc/test_illumina_se.yml b/tests/workflows/titan-gc/test_illumina_se.yml deleted file mode 100644 index 98635b9c..00000000 --- a/tests/workflows/titan-gc/test_illumina_se.yml +++ /dev/null @@ -1,29 +0,0 @@ -- name: titan_illumina_se - command: titan-gc -i titan/illumina_se.json -o titan/illumina_se - tags: - - titan_gc - - titan_illumina_se - files: - - path: titan/illumina_se/alignments/ERR6319327.flagstat.txt - md5sum: 63f3429dcb99b85bcc0e525ef19c8471 - - path: titan/illumina_se/alignments/ERR6319327.primertrim.sorted.bam - - path: titan/illumina_se/alignments/ERR6319327.primertrim.sorted.bam.bai - - path: titan/illumina_se/alignments/ERR6319327.stats.txt - - path: titan/illumina_se/alignments/ERR6319327.variants.vcf - md5sum: 01d93a6a37b9598c594a780a6996d08e - - path: titan/illumina_se/assemblies/ERR6319327.ivar.consensus.fasta - md5sum: 45a6c6f18d5fa448e0aa811bdf934bab - - path: titan/illumina_se/cromwell-stderr.txt - - path: titan/illumina_se/cromwell-stdout.txt - - path: titan/illumina_se/kraken2_reports/ERR6319327_kraken2_report.txt - md5sum: 7e4fc05efbbc3937b99420e6193be061 - - path: 
titan/illumina_se/nextclade/ERR6319327.ivar.consensus.nextclade.auspice.json - - path: titan/illumina_se/nextclade/ERR6319327.ivar.consensus.nextclade.json - - path: titan/illumina_se/nextclade/ERR6319327.ivar.consensus.nextclade.tsv - - path: titan/illumina_se/pangolin_reports/ERR6319327.pangolin_report.csv - - path: titan/illumina_se/results/ERR6319327.results.json - - path: titan/illumina_se/titan-metadata.json - - path: titan/illumina_se/titan-results.json - - path: titan/illumina_se/titan-results.tsv - - path: titan/illumina_se/vadr_alerts/ERR6319327.ivar.consensus.vadr.alt.list - md5sum: fc0e4e38d8484f8547c4cf265f41f7ad diff --git a/tests/workflows/titan-gc/test_ont.yml b/tests/workflows/titan-gc/test_ont.yml deleted file mode 100644 index 5101da1b..00000000 --- a/tests/workflows/titan-gc/test_ont.yml +++ /dev/null @@ -1,32 +0,0 @@ -- name: titan_ont - command: titan-gc -i titan/ont.json -o titan/ont - tags: - - titan_gc - - titan_ont - files: - - path: titan/ont/alignments/ont.flagstat.txt - md5sum: 85c7b4e3985735fd2a45f7f5a09b529f - - path: titan/ont/alignments/ont.pass.vcf - contains: ["VCF", "medaka", "CHROM", "PASS"] - - path: titan/ont/alignments/ont.primertrimmed.rg.sorted.bam - - path: titan/ont/alignments/ont.primertrimmed.rg.sorted.bam.bai - - path: titan/ont/alignments/ont.stats.txt - - path: titan/ont/assemblies/ont.medaka.consensus.fasta - md5sum: 02a04e9cf297acc8fc03667edab41fde - - path: titan/ont/cromwell-stderr.txt - - path: titan/ont/cromwell-stdout.txt - - path: titan/ont/dehosted_reads/ont_R1_dehosted.fastq.gz - - path: titan/ont/kraken2_reports/ont_dehosted_kraken2_report.txt - md5sum: 5780753ae61523a3621fbe2635b04b70 - - path: titan/ont/kraken2_reports/ont_kraken2_report.txt - md5sum: f2c6f26b1ef2786d124eae2ab3fe80c1 - - path: titan/ont/nextclade/ont.medaka.consensus.nextclade.auspice.json - - path: titan/ont/nextclade/ont.medaka.consensus.nextclade.json - - path: titan/ont/nextclade/ont.medaka.consensus.nextclade.tsv - - path: titan/ont/pangolin_reports/ont.pangolin_report.csv - - path: titan/ont/results/ont.results.json - - path: titan/ont/titan-metadata.json - - path: titan/ont/titan-results.json - - path: titan/ont/titan-results.tsv - - path: titan/ont/vadr_alerts/ont.medaka.consensus.vadr.alt.list - md5sum: fc0e4e38d8484f8547c4cf265f41f7ad diff --git a/workflows/WasteWaterVariantCalling_modified.wdl b/workflows/WasteWaterVariantCalling_modified.wdl deleted file mode 100644 index ce656f49..00000000 --- a/workflows/WasteWaterVariantCalling_modified.wdl +++ /dev/null @@ -1,433 +0,0 @@ -version 1.0 - -workflow WasteWaterVariantCalling { - meta { - description: "Modified version of the CDPHE's WasteWaterVariantCalling WDL Worfklow to performs variant calling on SARS-CoV-2 in waster water samples and identifies mutations in the Spike gene associated with known VOCs and VUIs: https://github.com/CDPHE/WasteWaterVariantCalling)." 
- author: "Kevin Libuit" - email: "kevin.libuit@theiagen.com" - } - input { - Array[File] sorted_bam - File covid_genome - File spike_bed - File spike_annotations - Array[String] sample_id - } - - scatter (id_bam in zip(sample_id, sorted_bam)) { - call add_RG { - input: - sample_id = id_bam.left, - bam = id_bam.right - } - call variant_calling { - input: - bam = add_RG.rgbam, - ref = covid_genome, - sample_id = id_bam.left - } - call sort_vcf { - input: - vcf = variant_calling.vcf, - sample_id = id_bam.left - } - call sample_spike { - input: - vcf = sort_vcf.sorted_vcf, - bed = spike_bed, - sample_id = id_bam.left - } - call vcf2tsv { - input: - vcf = sample_spike.sample_spike_vcf, - sample_id = id_bam.left, - bed = spike_bed - } - call fill_NA { - input: - tsv = vcf2tsv.sample_spike_tsv, - sample_id = id_bam.left, - spike_bed = spike_bed - } - call allele_freq { - input: - tsv = fill_NA.fill_NA_tsv, - sample_id = id_bam.left - } - call reformat_tsv { - input: - tsv = allele_freq.allele_freq_tsv, - sample_id = id_bam.left - } - call summary_prep { - input: - tsv = reformat_tsv.reformat_tsv_tsv, - sample_id = id_bam.left, - spike_annotations = spike_annotations - } - } - call dashboard_tsv { - input: - tsv = summary_prep.sample_spike_tsv_summary, - tsv_dash = summary_prep.sample_spike_tsv_dash, - tsv_counts = summary_prep.sample_spike_tsv_counts, - spike_annotations = spike_annotations - } - call summary_tsv { - input: - tsv = dashboard_tsv.spike_summary_temp - } - - output { - Array[File] addrg_bam = add_RG.rgbam - Array[File] variants = variant_calling.vcf - Array[File] sorted_vcf = sort_vcf.sorted_vcf - Array[File] sample_spike_vcf = sample_spike.sample_spike_vcf - Array[File] sample_spike_tsv = vcf2tsv.sample_spike_tsv - Array[File] sample_spike_tsv_summary = summary_prep.sample_spike_tsv_summary - Array[File] sample_spike_tsv_dash = summary_prep.sample_spike_tsv_dash - Array[File] fill_NA_tsv = fill_NA.fill_NA_tsv - Array[File] allele_freq_tsv = allele_freq.allele_freq_tsv - Array[File] reformat_tsv_tsv = reformat_tsv.reformat_tsv_tsv - Array[File] sample_spike_tsv_counts = summary_prep.sample_spike_tsv_counts - File spike_summary_temp = dashboard_tsv.spike_summary_temp - File spike_summary = summary_tsv.spike_summary - File spike_dashboard = dashboard_tsv.spike_dashboard - File spike_counts = dashboard_tsv.spike_counts - - } -} - -task add_RG { - input { - String sample_id - File bam - } - - command <<< - - samtools addreplacerg -r ID:~{sample_id} -r LB:L1 -r SM:~{sample_id} -o ~{sample_id}_addRG.bam ~{bam} - - >>> - - output { - File rgbam = "${sample_id}_addRG.bam" - } - - runtime { - docker: "staphb/samtools:1.10" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - } -} - -task variant_calling { - input { - String sample_id - File bam - File ref - - } - - command <<< - - freebayes -f ~{ref} --haplotype-length 0 --min-alternate-count 3 --min-alternate-fraction 0.05 --min-mapping-quality 20 --min-base-quality 20 --min-coverage 10 --use-duplicate-reads --report-monomorphic --pooled-continuous ~{bam} > ~{sample_id}_variants.vcf - - >>> - - output { - File vcf = "${sample_id}_variants.vcf" - } - - runtime { - docker: "wgspipeline/freebayes:v0.0.1" - memory: "32 GB" - cpu: 8 - disks: "local-disk 200 SSD" - } -} - -task sort_vcf { - input { - String sample_id - File vcf - - } - - command <<< - - bgzip -c ~{vcf} > ~{sample_id}_variants.vcf.gz - - tabix -p vcf ~{sample_id}_variants.vcf.gz - - bcftools sort -O z ~{sample_id}_variants.vcf.gz > ~{sample_id}_sorted.vcf.gz - - >>> - - 
output { - File sorted_vcf = "${sample_id}_sorted.vcf.gz" - } - - runtime { - docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" - memory: "8 GB" - cpu: 2 - disks: "local-disk 100 SSD" - } -} - -task sample_spike { - input { - File vcf - File bed - String sample_id - } - - command <<< - - tabix -p vcf ~{vcf} - - bcftools view --regions-file ~{bed} --output-type v --output-file ~{sample_id}_spike_mutations.vcf ~{vcf} - - >>> - - output { - File sample_spike_vcf = "${sample_id}_spike_mutations.vcf" - } - - runtime { - docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" - memory: "16 GB" - cpu: 4 - disks: "local-disk 100 SSD" - } -} - -task vcf2tsv { - input { - File vcf - File bed - String sample_id - } - - command <<< - - bgzip -c ~{vcf} > ~{sample_id}_spike_mutations.vcf.gz - - tabix -p vcf ~{sample_id}_spike_mutations.vcf.gz - - bcftools query --regions-file ~{bed} --format '%CHROM\t%POS\t%REF\t%ALT[\t%DP\t%RO\t%AO]\n' ~{sample_id}_spike_mutations.vcf.gz > ~{sample_id}_spike_mutations.tsv - - >>> - - output { - File sample_spike_tsv = "${sample_id}_spike_mutations.tsv" - } - - runtime { - docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" - memory: "16 GB" - cpu: 4 - disks: "local-disk 100 SSD" - } -} - -task fill_NA { - input { - File tsv - String sample_id - File spike_bed - } - - command <<< - - # create key of unique locations - cat ~{spike_bed} | cut -f 1,2 | tr "\t" "_" | sort | uniq > keys.txt - - # add headers to tsv and use key to fill in missing values - echo -e "CHROM\tPOS\tREF\t~{sample_id}_ALT\t~{sample_id}_DP\t~{sample_id}_RO\t~{sample_id}_AO" | cat - ~{tsv} | sed 's/\t/_/' | sort -t $'\t' -k1,1 > ~{sample_id}_spike_mutations_temp1.tsv - - # get the filled columns we want - join -t $'\t' -e NA -a 1 -1 1 -2 1 -o "1.1,2.3,2.4,2.6" keys.txt "~{sample_id}_spike_mutations_temp1.tsv" > ~{sample_id}_spike_fill_NA.tsv - - >>> - - output { - File fill_NA_tsv = "${sample_id}_spike_fill_NA.tsv" - } - - runtime { - docker: "quay.io/theiagen/utility:1.1" - memory: "32 GB" - cpu: 8 - disks: "local-disk 2500 HDD" - } -} - -task allele_freq { - input { - File tsv - String sample_id - } - - command <<< - - # separate the comma separated alleles into separate rows (might need to fix delimiters) - awk '{split($2,a,","); split($4,b,","); for(i in a){print $1,a[i],$3,b[i]}}' ~{tsv} > ~{sample_id}_spike_mutations_temp2.tsv - - # use AO and DP fields to calculate ALT allele frequency, fix delimiters, change -nan allele frequencies to NA - awk '$3~"^NA"||$4~"^NA"{$5="NA";print;next}{$5=$4/$3}1' ~{sample_id}_spike_mutations_temp2.tsv | sed 's/ /\t/g' | awk '$5 == "-nan" {$5="NA"} 1' OFS="\t" > ~{sample_id}_spike_allele_freq.tsv - - >>> - - output { - File allele_freq_tsv = "${sample_id}_spike_allele_freq.tsv" - } - - runtime { - docker: "quay.io/theiagen/utility:1.1" - memory: "32 GB" - cpu: 8 - disks: "local-disk 2500 HDD" - } -} - -task reformat_tsv { - input { - File tsv - String sample_id - } - - command <<< - - # combine the rows based on matching nucl location - - awk '{f2[$1]=f2[$1] sep[$1] $2; - f3[$1]=f3[$1] sep[$1] $3; - f4[$1]=f4[$1] sep[$1] $4; - f5[$1]=f5[$1] sep[$1] $5; - sep[$1]=";"} - END {for(k in f2) print k,f2[k],f3[k],f4[k],f5[k]}' ~{tsv} > ~{sample_id}_spike_mutations_temp3.tsv - - # fix delimiters, add a column containing the sample ids - sed 's/ /\t/g' ~{sample_id}_spike_mutations_temp3.tsv | awk 'NF=NF+1{$NF="~{sample_id}"}1' > ~{sample_id}_spike_mutations_temp4.tsv - - # fix the column headers, convert from space to tab delimited and then sort by 
col1 - echo -e "CHROMPOS ~{sample_id}_ALT ~{sample_id}_DP ~{sample_id}_AO ~{sample_id}_ALTfreq sample_id" | cat - ~{sample_id}_spike_mutations_temp4.tsv | sed 's/ /\t/g' | sort -t $'\t' -k 1,1 -V > ~{sample_id}_spike_reformat.tsv - - >>> - - output { - File reformat_tsv_tsv = "${sample_id}_spike_reformat.tsv" - } - - runtime { - docker: "quay.io/theiagen/utility:1.1" - memory: "32 GB" - cpu: 8 - disks: "local-disk 2500 HDD" - } -} - -task summary_prep { - input { - File tsv - String sample_id - File spike_annotations - } - - command <<< - - # cut the columns we want for the results summary and make output file - cut -f2,5 ~{tsv} > ~{sample_id}_spike_mutations_forsummary.tsv - - # cut the columns we want for the dashboard summary - awk '{print $6 "\t" $2 "\t" $5}' ~{tsv} > ~{sample_id}_spike_mutations_temp5.tsv - - # add annotations to the dashboard summary, reorder the dashboard summary columns, fix the dashboard summary column headers and make output file - paste ~{spike_annotations} ~{sample_id}_spike_mutations_temp5.tsv | awk '{print $4 "\t" $1 "\t" $2 "\t" $3 "\t" $5 "\t" $6}' | awk 'BEGIN{FS=OFS="\t"; print "sample_id", "AA_change", "Nucl_change", "Lineages", "ALT", "ALTfreq"} NR>1{print $1, $2, $3, $4, $5, $6}' > ~{sample_id}_spike_mutations_fordash.tsv - - # cut the columns we want for the counts summary - awk '{print $6 "\t" $2 "\t" $3 "\t" $4}' ~{tsv} > ~{sample_id}_spike_mutations_temp6.tsv - - # add annotations to the counts summary, reorder the dashboard summary columns, fix the dashboard summary column headers and make output file - paste ~{spike_annotations} ~{sample_id}_spike_mutations_temp6.tsv | awk '{print $4 "\t" $1 "\t" $2 "\t" $3 "\t" $5 "\t" $6 "\t" $7}' | awk 'BEGIN{FS=OFS="\t"; print "sample_id", "AA_change", "Nucl_change", "Lineages", "ALT", "Total_count", "ALT_count"} NR>1{print $1, $2, $3, $4, $5, $6, $7}' > ~{sample_id}_spike_mutations_counts.tsv - - >>> - - output { - File sample_spike_tsv_summary = "${sample_id}_spike_mutations_forsummary.tsv" - File sample_spike_tsv_dash = "${sample_id}_spike_mutations_fordash.tsv" - File sample_spike_tsv_counts = "${sample_id}_spike_mutations_counts.tsv" - } - - runtime { - docker: "quay.io/theiagen/utility:1.1" - memory: "32 GB" - cpu: 8 - disks: "local-disk 2500 HDD" - } -} - -task dashboard_tsv { - input { - Array[File] tsv - Array[File] tsv_dash - Array[File] tsv_counts - File spike_annotations - } - - command <<< - - # concatenate the tsvs and make the dashboard summary output - awk 'FNR==1 && NR!=1{next;}{print}' ~{sep=' ' tsv_dash} >> spike_mutations_dashboard.tsv - - # concatenate the tsvs and make the dashboard summary output - awk 'FNR==1 && NR!=1{next;}{print}' ~{sep=' ' tsv_counts} >> spike_mutations_counts.tsv - - # fix delimiters in annotations file - sed 's/ /\t/g' ~{spike_annotations} > spike_annotations.tsv - - # concatentate tsvs for sequencing and bioinformatics team summary file and make output - paste spike_annotations.tsv ~{sep=' ' tsv} > spike_mutations_summary_temp.tsv - - >>> - - output { - File spike_summary_temp = "spike_mutations_summary_temp.tsv" - File spike_dashboard = "spike_mutations_dashboard.tsv" - File spike_counts = "spike_mutations_counts.tsv" - } - - runtime { - docker: "quay.io/theiagen/utility:1.1" - memory: "16 GB" - cpu: 4 - disks: "local-disk 200 SSD" - } -} - -task summary_tsv { - input { - File tsv - } - - command <<< - - # datamash to tranpose results summary - datamash -H transpose < ~{tsv} > spike_mutations_summary.tsv - - >>> - - output { - File spike_summary = 
"spike_mutations_summary.tsv" - } - - runtime { - docker: "rapatsky/debian" - memory: "16 GB" - cpu: 4 - disks: "local-disk 200 SSD" - } -} diff --git a/workflows/wf_WasteWaterVariantCalling_modified.wdl b/workflows/wf_WasteWaterVariantCalling_modified.wdl new file mode 100644 index 00000000..7e4861fc --- /dev/null +++ b/workflows/wf_WasteWaterVariantCalling_modified.wdl @@ -0,0 +1,394 @@ +version 1.0 + +workflow WasteWaterVariantCalling { + meta { + description: "Modified version of the CDPHE's WasteWaterVariantCalling WDL Worfklow to performs variant calling on SARS-CoV-2 in waster water samples and identifies mutations in the Spike gene associated with known VOCs and VUIs: https://github.com/CDPHE/WasteWaterVariantCalling)." + author: "Kevin Libuit" + email: "kevin.libuit@theiagen.com" + } + input { + Array[File] sorted_bam + File covid_genome + File spike_bed + File spike_annotations + Array[String] sample_id + } + scatter (id_bam in zip(sample_id, sorted_bam)) { + call add_RG { + input: + sample_id = id_bam.left, + bam = id_bam.right + } + call variant_calling { + input: + bam = add_RG.rgbam, + ref = covid_genome, + sample_id = id_bam.left + } + call sort_vcf { + input: + vcf = variant_calling.vcf, + sample_id = id_bam.left + } + call sample_spike { + input: + vcf = sort_vcf.sorted_vcf, + bed = spike_bed, + sample_id = id_bam.left + } + call vcf2tsv { + input: + vcf = sample_spike.sample_spike_vcf, + sample_id = id_bam.left, + bed = spike_bed + } + call fill_NA { + input: + tsv = vcf2tsv.sample_spike_tsv, + sample_id = id_bam.left, + spike_bed = spike_bed + } + call allele_freq { + input: + tsv = fill_NA.fill_NA_tsv, + sample_id = id_bam.left + } + call reformat_tsv { + input: + tsv = allele_freq.allele_freq_tsv, + sample_id = id_bam.left + } + call summary_prep { + input: + tsv = reformat_tsv.reformat_tsv_tsv, + sample_id = id_bam.left, + spike_annotations = spike_annotations + } + } + call dashboard_tsv { + input: + tsv = summary_prep.sample_spike_tsv_summary, + tsv_dash = summary_prep.sample_spike_tsv_dash, + tsv_counts = summary_prep.sample_spike_tsv_counts, + spike_annotations = spike_annotations + } + call summary_tsv { + input: + tsv = dashboard_tsv.spike_summary_temp + } + output { + Array[File] addrg_bam = add_RG.rgbam + Array[File] variants = variant_calling.vcf + Array[File] sorted_vcf = sort_vcf.sorted_vcf + Array[File] sample_spike_vcf = sample_spike.sample_spike_vcf + Array[File] sample_spike_tsv = vcf2tsv.sample_spike_tsv + Array[File] sample_spike_tsv_summary = summary_prep.sample_spike_tsv_summary + Array[File] sample_spike_tsv_dash = summary_prep.sample_spike_tsv_dash + Array[File] fill_NA_tsv = fill_NA.fill_NA_tsv + Array[File] allele_freq_tsv = allele_freq.allele_freq_tsv + Array[File] reformat_tsv_tsv = reformat_tsv.reformat_tsv_tsv + Array[File] sample_spike_tsv_counts = summary_prep.sample_spike_tsv_counts + File spike_summary_temp = dashboard_tsv.spike_summary_temp + File spike_summary = summary_tsv.spike_summary + File spike_dashboard = dashboard_tsv.spike_dashboard + File spike_counts = dashboard_tsv.spike_counts + } +} + +task add_RG { + input { + String sample_id + File bam + } + command <<< + + samtools addreplacerg -r ID:~{sample_id} -r LB:L1 -r SM:~{sample_id} -o ~{sample_id}_addRG.bam ~{bam} + + >>> + output { + File rgbam = "~{sample_id}_addRG.bam" + } + runtime { + docker: "quay.io/staphb/samtools:1.10" + memory: "8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + } +} + +task variant_calling { + input { + String sample_id + File bam + File ref + } + 
command <<< + + freebayes -f ~{ref} --haplotype-length 0 --min-alternate-count 3 --min-alternate-fraction 0.05 --min-mapping-quality 20 --min-base-quality 20 --min-coverage 10 --use-duplicate-reads --report-monomorphic --pooled-continuous ~{bam} > ~{sample_id}_variants.vcf + + >>> + output { + File vcf = "${sample_id}_variants.vcf" + } + runtime { + docker: "wgspipeline/freebayes:v0.0.1" + memory: "32 GB" + cpu: 8 + disks: "local-disk 200 SSD" + } +} + +task sort_vcf { + input { + String sample_id + File vcf + } + command <<< + + bgzip -c ~{vcf} > ~{sample_id}_variants.vcf.gz + + tabix -p vcf ~{sample_id}_variants.vcf.gz + + bcftools sort -O z ~{sample_id}_variants.vcf.gz > ~{sample_id}_sorted.vcf.gz + + >>> + output { + File sorted_vcf = "${sample_id}_sorted.vcf.gz" + } + runtime { + docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" + memory: "8 GB" + cpu: 2 + disks: "local-disk 100 SSD" + } +} + +task sample_spike { + input { + File vcf + File bed + String sample_id + } + command <<< + + tabix -p vcf ~{vcf} + + bcftools view --regions-file ~{bed} --output-type v --output-file ~{sample_id}_spike_mutations.vcf ~{vcf} + + >>> + output { + File sample_spike_vcf = "${sample_id}_spike_mutations.vcf" + } + runtime { + docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" + memory: "16 GB" + cpu: 4 + disks: "local-disk 100 SSD" + } +} + +task vcf2tsv { + input { + File vcf + File bed + String sample_id + } + command <<< + + bgzip -c ~{vcf} > ~{sample_id}_spike_mutations.vcf.gz + + tabix -p vcf ~{sample_id}_spike_mutations.vcf.gz + + bcftools query --regions-file ~{bed} --format '%CHROM\t%POS\t%REF\t%ALT[\t%DP\t%RO\t%AO]\n' ~{sample_id}_spike_mutations.vcf.gz > ~{sample_id}_spike_mutations.tsv + + >>> + output { + File sample_spike_tsv = "${sample_id}_spike_mutations.tsv" + } + runtime { + docker: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" + memory: "16 GB" + cpu: 4 + disks: "local-disk 100 SSD" + } +} + +task fill_NA { + input { + File tsv + String sample_id + File spike_bed + } + command <<< + + # create key of unique locations + cat ~{spike_bed} | cut -f 1,2 | tr "\t" "_" | sort | uniq > keys.txt + + # add headers to tsv and use key to fill in missing values + echo -e "CHROM\tPOS\tREF\t~{sample_id}_ALT\t~{sample_id}_DP\t~{sample_id}_RO\t~{sample_id}_AO" | cat - ~{tsv} | sed 's/\t/_/' | sort -t $'\t' -k1,1 > ~{sample_id}_spike_mutations_temp1.tsv + + # get the filled columns we want + join -t $'\t' -e NA -a 1 -1 1 -2 1 -o "1.1,2.3,2.4,2.6" keys.txt "~{sample_id}_spike_mutations_temp1.tsv" > ~{sample_id}_spike_fill_NA.tsv + + >>> + output { + File fill_NA_tsv = "${sample_id}_spike_fill_NA.tsv" + } + runtime { + docker: "quay.io/theiagen/utility:1.1" + memory: "32 GB" + cpu: 8 + disks: "local-disk 200 HDD" + } +} + +task allele_freq { + input { + File tsv + String sample_id + } + command <<< + + # separate the comma separated alleles into separate rows (might need to fix delimiters) + awk '{split($2,a,","); split($4,b,","); for(i in a){print $1,a[i],$3,b[i]}}' ~{tsv} > ~{sample_id}_spike_mutations_temp2.tsv + + # use AO and DP fields to calculate ALT allele frequency, fix delimiters, change -nan allele frequencies to NA + awk '$3~"^NA"||$4~"^NA"{$5="NA";print;next}{$5=$4/$3}1' ~{sample_id}_spike_mutations_temp2.tsv | sed 's/ /\t/g' | awk '$5 == "-nan" {$5="NA"} 1' OFS="\t" > ~{sample_id}_spike_allele_freq.tsv + + >>> + output { + File allele_freq_tsv = "${sample_id}_spike_allele_freq.tsv" + } + runtime { + docker: "quay.io/theiagen/utility:1.1" + memory: "32 GB" + cpu: 8 
+ disks: "local-disk 200 HDD" + } +} + +task reformat_tsv { + input { + File tsv + String sample_id + } + command <<< + + # combine the rows based on matching nucl location + + awk '{f2[$1]=f2[$1] sep[$1] $2; + f3[$1]=f3[$1] sep[$1] $3; + f4[$1]=f4[$1] sep[$1] $4; + f5[$1]=f5[$1] sep[$1] $5; + sep[$1]=";"} + END {for(k in f2) print k,f2[k],f3[k],f4[k],f5[k]}' ~{tsv} > ~{sample_id}_spike_mutations_temp3.tsv + + # fix delimiters, add a column containing the sample ids + sed 's/ /\t/g' ~{sample_id}_spike_mutations_temp3.tsv | awk 'NF=NF+1{$NF="~{sample_id}"}1' > ~{sample_id}_spike_mutations_temp4.tsv + + # fix the column headers, convert from space to tab delimited and then sort by col1 + echo -e "CHROMPOS ~{sample_id}_ALT ~{sample_id}_DP ~{sample_id}_AO ~{sample_id}_ALTfreq sample_id" | cat - ~{sample_id}_spike_mutations_temp4.tsv | sed 's/ /\t/g' | sort -t $'\t' -k 1,1 -V > ~{sample_id}_spike_reformat.tsv + + >>> + output { + File reformat_tsv_tsv = "${sample_id}_spike_reformat.tsv" + } + runtime { + docker: "quay.io/theiagen/utility:1.1" + memory: "32 GB" + cpu: 8 + disks: "local-disk 200 HDD" + } +} + +task summary_prep { + input { + File tsv + String sample_id + File spike_annotations + } + command <<< + + # cut the columns we want for the results summary and make output file + cut -f2,5 ~{tsv} > ~{sample_id}_spike_mutations_forsummary.tsv + + # cut the columns we want for the dashboard summary + awk '{print $6 "\t" $2 "\t" $5}' ~{tsv} > ~{sample_id}_spike_mutations_temp5.tsv + + # add annotations to the dashboard summary, reorder the dashboard summary columns, fix the dashboard summary column headers and make output file + paste ~{spike_annotations} ~{sample_id}_spike_mutations_temp5.tsv | awk '{print $4 "\t" $1 "\t" $2 "\t" $3 "\t" $5 "\t" $6}' | awk 'BEGIN{FS=OFS="\t"; print "sample_id", "AA_change", "Nucl_change", "Lineages", "ALT", "ALTfreq"} NR>1{print $1, $2, $3, $4, $5, $6}' > ~{sample_id}_spike_mutations_fordash.tsv + + # cut the columns we want for the counts summary + awk '{print $6 "\t" $2 "\t" $3 "\t" $4}' ~{tsv} > ~{sample_id}_spike_mutations_temp6.tsv + + # add annotations to the counts summary, reorder the dashboard summary columns, fix the dashboard summary column headers and make output file + paste ~{spike_annotations} ~{sample_id}_spike_mutations_temp6.tsv | awk '{print $4 "\t" $1 "\t" $2 "\t" $3 "\t" $5 "\t" $6 "\t" $7}' | awk 'BEGIN{FS=OFS="\t"; print "sample_id", "AA_change", "Nucl_change", "Lineages", "ALT", "Total_count", "ALT_count"} NR>1{print $1, $2, $3, $4, $5, $6, $7}' > ~{sample_id}_spike_mutations_counts.tsv + + >>> + output { + File sample_spike_tsv_summary = "${sample_id}_spike_mutations_forsummary.tsv" + File sample_spike_tsv_dash = "${sample_id}_spike_mutations_fordash.tsv" + File sample_spike_tsv_counts = "${sample_id}_spike_mutations_counts.tsv" + } + runtime { + docker: "quay.io/theiagen/utility:1.1" + memory: "32 GB" + cpu: 8 + disks: "local-disk 200 HDD" + } +} +task dashboard_tsv { + input { + Array[File] tsv + Array[File] tsv_dash + Array[File] tsv_counts + File spike_annotations + } + command <<< + + # concatenate the tsvs and make the dashboard summary output + awk 'FNR==1 && NR!=1{next;}{print}' ~{sep=' ' tsv_dash} >> spike_mutations_dashboard.tsv + + # concatenate the tsvs and make the dashboard summary output + awk 'FNR==1 && NR!=1{next;}{print}' ~{sep=' ' tsv_counts} >> spike_mutations_counts.tsv + + # fix delimiters in annotations file + sed 's/ /\t/g' ~{spike_annotations} > spike_annotations.tsv + + # concatentate tsvs for sequencing and 
bioinformatics team summary file and make output + paste spike_annotations.tsv ~{sep=' ' tsv} > spike_mutations_summary_temp.tsv + + >>> + output { + File spike_summary_temp = "spike_mutations_summary_temp.tsv" + File spike_dashboard = "spike_mutations_dashboard.tsv" + File spike_counts = "spike_mutations_counts.tsv" + } + runtime { + docker: "quay.io/theiagen/utility:1.1" + memory: "16 GB" + cpu: 4 + disks: "local-disk 200 SSD" + } +} + +task summary_tsv { + input { + File tsv + } + command <<< + + # datamash to transpose results summary + datamash -H transpose < ~{tsv} > spike_mutations_summary.tsv + + >>> + output { + File spike_summary = "spike_mutations_summary.tsv" + } + runtime { + docker: "rapatsky/debian" + memory: "16 GB" + cpu: 4 + disks: "local-disk 200 SSD" + } +} diff --git a/workflows/wf_freyja_fastq.wdl b/workflows/wf_freyja_fastq.wdl index f051c035..cca8377e 100644 --- a/workflows/wf_freyja_fastq.wdl +++ b/workflows/wf_freyja_fastq.wdl @@ -45,24 +45,25 @@ workflow freyja_fastq { input: } output { + # Version Capture String freyja_fastq_wf_version = version_capture.phvg_version String freyja_fastq_wf_analysis_date = version_capture.date - + # Raw Read QC File read1_dehosted = read_QC_trim.read1_dehosted File read2_dehosted = read_QC_trim.read2_dehosted File read1_clean = read_QC_trim.read1_clean File read2_clean = read_QC_trim.read2_clean - Int fastq_scan_raw1 = read_QC_trim.fastq_scan_raw1 - Int fastq_scan_raw2 = read_QC_trim.fastq_scan_raw2 - String fastq_scan_raw_pairs = read_QC_trim.fastq_scan_raw_pairs + Int num_reads_raw1 = read_QC_trim.fastq_scan_raw1 + Int num_reads_raw2 = read_QC_trim.fastq_scan_raw2 + String num_reads_raw_pairs = read_QC_trim.fastq_scan_raw_pairs String fastq_scan_version = read_QC_trim.fastq_scan_version - - Int fastq_scan_clean1 = read_QC_trim.fastq_scan_clean1 - Int fastq_scan_clean2 = read_QC_trim.fastq_scan_clean2 - String fastq_scan_clean_pairs = read_QC_trim.fastq_scan_clean_pairs + # Read Trim + Int num_reads_clean1 = read_QC_trim.fastq_scan_clean1 + Int num_reads_clean2 = read_QC_trim.fastq_scan_clean2 + String num_reads_clean_pairs = read_QC_trim.fastq_scan_clean_pairs String trimmomatic_version = read_QC_trim.trimmomatic_version String bbduk_docker = read_QC_trim.bbduk_docker - + # Contaminant Check String kraken_version = read_QC_trim.kraken_version Float kraken_human = read_QC_trim.kraken_human Float kraken_sc2 = read_QC_trim.kraken_sc2 @@ -70,21 +71,21 @@ workflow freyja_fastq { Float kraken_human_dehosted = read_QC_trim.kraken_human_dehosted Float kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted String kraken_report_dehosted = read_QC_trim.kraken_report_dehosted - + # Mapping and Alignment String bwa_version = bwa.bwa_version String samtools_version = bwa.sam_version String alignment_method = "~{bwa.bwa_version}; ~{primer_trim.ivar_version}" - File aligned_bam = primer_trim.trim_sorted_bam File aligned_bai = primer_trim.trim_sorted_bai Float primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent String ivar_version_primtrim = primer_trim.ivar_version String samtools_version_primtrim = primer_trim.samtools_version String primer_bed_name = primer_trim.primer_bed_name - + # Freyja Analysis File freyja_variants = freyja.freyja_variants File freyja_depths = freyja.freyja_depths File freyja_demixed = freyja.freyja_demixed String freyja_barcode_version = freyja.freyja_barcode_version + String freyja_metadata_version = freyja.freyja_metadata_version } -} \ No newline at end of file +} diff --git 
a/workflows/wf_freyja_plot.wdl b/workflows/wf_freyja_plot.wdl index 6037b9e2..222648ec 100644 --- a/workflows/wf_freyja_plot.wdl +++ b/workflows/wf_freyja_plot.wdl @@ -14,19 +14,20 @@ workflow freyja_plot { samplename = samplename, freyja_demixed = freyja_demixed, collection_date = collection_date, - freyja_plot_name = freyja_plot_name + freyja_plot_name = freyja_plot_name } call versioning.version_capture{ input: } output { + # Version Capture String freyja_plot_wf_version = version_capture.phvg_version String freyja_plot_wf_analysis_date = version_capture.date - + # Freyja Plot Visualization File freyja_plot = freyja_plot_task.freyja_plot File freyja_demixed_aggregate = freyja_plot_task.demixed_aggregate File? freyja_plot_metadata = freyja_plot_task.freyja_plot_metadata - } + } } task freyja_plot_task { @@ -37,9 +38,9 @@ task freyja_plot_task { Boolean plot_lineages=false Boolean plot_time=false String plot_time_interval="MS" - Int plot_day_window=14 + Int plot_day_window=14 String freyja_plot_name - String docker = "staphb/freyja:1.2" + String docker = "quay.io/staphb/freyja:1.3.2" } command <<< freyja_demixed_array="~{sep=' ' freyja_demixed}" @@ -54,20 +55,20 @@ task freyja_plot_task { if [ "$samplename_array_len" -ne "$collection_date_array_len" ]; then echo "ERROR: Missing collection date. Samplename array (length: $samplename_array_len) and collection date array (length: $collection_date_array_len) are of unequal length." >&2 exit 1 - else + else echo "Samplename array (length: $samplename_array_len) and collection date array (length: $collection_date_array_len) are of equal length." >&2. fi - + echo "Sample,sample_collection_datetime" > freyja_times_metadata.csv - + for index in ${!samplename_array[@]}; do samplename=${samplename_array[$index]} collection_date=${collection_date_array[$index]} echo "${samplename},${collection_date}" >> freyja_times_metadata.csv done - + plot_options="--times freyja_times_metadata.csv" - + if [ ~{plot_time_interval} == "D" ]; then plot_options="${plot_options} --interval D --windowsize ~{plot_day_window}" elif [ ~{plot_time_interval} == "MS" ]; then @@ -76,27 +77,27 @@ task freyja_plot_task { echo "ERROR: plot time interval value (~{plot_time_interval}) not recognized. Must be either \"D\" (days) or \"MS\" (months)" >&2 exit 1 fi - + fi - + # move all assemblies into single directory and aggregate files mkdir ./demixed_files/ echo "mv ${freyja_demixed_array[@]} demixed_files/" mv ${freyja_demixed_array[@]} ./demixed_files/ - + freyja aggregate \ ./demixed_files/ \ --output demixed_aggregate.tsv - - # create freya plot + + # create freya plot echo "Running: freyja plot demixed_aggregate.tsv --output ~{freyja_plot_name}.pdf ${plot_options}" freyja plot \ ~{true='--lineages' false ='' plot_lineages} \ demixed_aggregate.tsv \ --output ~{freyja_plot_name}.pdf \ ${plot_options} - - + + >>> output { File freyja_plot = "~{freyja_plot_name}.pdf" @@ -109,4 +110,4 @@ task freyja_plot_task { docker: "~{docker}" disks: "local-disk 100 HDD" } -} \ No newline at end of file +} diff --git a/workflows/wf_mercury_batch.wdl b/workflows/wf_mercury_batch.wdl index 7c3a8f1e..3f574af2 100644 --- a/workflows/wf_mercury_batch.wdl +++ b/workflows/wf_mercury_batch.wdl @@ -16,9 +16,9 @@ workflow mercury_batch { Array[String] submission_id Array[String] vadr_num_alerts Int vadr_threshold = 0 - Int CPUs = 4 + Int cpu = 4 Int disk_size = 100 - Int mem_size_gb = 8 + Int memory = 8 String? 
gcp_bucket } call submission_prep.compile_assembly_n_meta as genbank_compile { @@ -32,9 +32,9 @@ workflow mercury_batch { vadr_threshold = vadr_threshold, submission_id = submission_id, date = version_capture.date, - CPUs = CPUs, + cpu = cpu, disk_size = disk_size, - mem_size_gb = mem_size_gb + memory = memory } call submission_prep.compile_assembly_n_meta as gisaid_compile { input: @@ -47,10 +47,10 @@ workflow mercury_batch { vadr_threshold = vadr_threshold, submission_id = submission_id, date = version_capture.date, - CPUs = CPUs, + cpu = cpu, disk_size = disk_size, - mem_size_gb = mem_size_gb - } + memory = memory + } call submission_prep.compile_biosamp_n_sra { input: single_submission_biosample_attirbutes = biosample_attributes, @@ -58,30 +58,31 @@ workflow mercury_batch { single_submission_sra_reads = sra_reads, gcp_bucket = gcp_bucket, date = version_capture.date, - CPUs = CPUs, + cpu = cpu, disk_size = disk_size, - mem_size_gb = mem_size_gb - } - call versioning.version_capture{ - input: - } - output { - String mercury_batch_version = version_capture.phvg_version - String mercury_batch_analysis_date = version_capture.date - - File? GenBank_modifier = genbank_compile.upload_meta - File? GenBank_assembly = genbank_compile.upload_fasta - File GenBank_batched_samples = genbank_compile.batched_samples - File GenBank_excluded_samples = genbank_compile.excluded_samples - - File? GISAID_metadata = gisaid_compile.upload_meta - File? GISAID_assembly = gisaid_compile.upload_fasta - File GISAID_batched_samples = gisaid_compile.batched_samples - File GISAID_excluded_samples = gisaid_compile.excluded_samples - - File BioSample_attributes = compile_biosamp_n_sra.biosample_attributes - File SRA_metadata = compile_biosamp_n_sra.sra_metadata - File? SRA_zipped_reads = compile_biosamp_n_sra.sra_zipped - String? SRA_gcp_bucket = gcp_bucket - } + memory = memory + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String mercury_batch_version = version_capture.phvg_version + String mercury_batch_analysis_date = version_capture.date + # GenBank Submission Files + File? GenBank_modifier = genbank_compile.upload_meta + File? GenBank_assembly = genbank_compile.upload_fasta + File GenBank_batched_samples = genbank_compile.batched_samples + File GenBank_excluded_samples = genbank_compile.excluded_samples + # GISAID Submission Files + File? GISAID_metadata = gisaid_compile.upload_meta + File? GISAID_assembly = gisaid_compile.upload_fasta + File GISAID_batched_samples = gisaid_compile.batched_samples + File GISAID_excluded_samples = gisaid_compile.excluded_samples + # BioSample and SRA Submission Files + File BioSample_attributes = compile_biosamp_n_sra.biosample_attributes + File SRA_metadata = compile_biosamp_n_sra.sra_metadata + File? SRA_zipped_reads = compile_biosamp_n_sra.sra_zipped + String? 
SRA_gcp_bucket = gcp_bucket + } } diff --git a/workflows/wf_mercury_pe_prep.wdl b/workflows/wf_mercury_pe_prep.wdl index 413b9927..8fc6283e 100644 --- a/workflows/wf_mercury_pe_prep.wdl +++ b/workflows/wf_mercury_pe_prep.wdl @@ -5,16 +5,14 @@ import "../tasks/task_pub_repo_prep.wdl" as submission_prep workflow mercury_pe_prep { input { - #required files + # Required Files File assembly_fasta File read1_dehosted File read2_dehosted - - #required metadata (titan gc outputs) + # Required Metadata (TheiaCoV GC Outputs) String assembly_method Float assembly_mean_coverage - - #required metadata (user inputs) + # Required Metadata (User Inputs) String authors String bioproject_accession String collecting_lab @@ -42,9 +40,8 @@ workflow mercury_pe_prep { String state String submission_id String submitting_lab - String submitting_lab_address - - #optional metadata + String submitting_lab_address + # Optional Metadata String? amplicon_primer_scheme String? amplicon_size String? biosample_accession @@ -56,14 +53,12 @@ workflow mercury_pe_prep { String? purpose_of_sequencing String? submitter_email String? treatment - - # Optional user-defined thresholds for generating submission files + # Optional User-Defined Thresholds for Generating Submission Files Int number_N_threshold = 5000 } - if (number_N <= number_N_threshold) { call submission_prep.ncbi_prep_one_sample { - input: + input: amplicon_primer_scheme = amplicon_primer_scheme, amplicon_size = amplicon_size, assembly_fasta = assembly_fasta, @@ -99,10 +94,10 @@ workflow mercury_pe_prep { state = state, submission_id = submission_id, submitter_email = submitter_email, - treatment = treatment + treatment = treatment } call submission_prep.gisaid_prep_one_sample { - input: + input: assembly_fasta = assembly_fasta, authors = authors, assembly_method = assembly_method, @@ -127,14 +122,14 @@ workflow mercury_pe_prep { treatment = treatment } } - call versioning.version_capture{ input: } output { + # Version Capture String mercury_pe_prep_version = version_capture.phvg_version String mercury_pe_prep_analysis_date = version_capture.date - + # NCBI Submission Files File? biosample_attributes = ncbi_prep_one_sample.biosample_attributes File? sra_metadata = ncbi_prep_one_sample.sra_metadata File? genbank_assembly = ncbi_prep_one_sample.genbank_assembly @@ -142,11 +137,8 @@ workflow mercury_pe_prep { File? sra_read1 = ncbi_prep_one_sample.sra_read1 File? sra_read2 = ncbi_prep_one_sample.sra_read2 Array[File]? sra_reads = ncbi_prep_one_sample.sra_reads - + # GISAID Submission Files File? gisaid_assembly = gisaid_prep_one_sample.gisaid_assembly File? 
gisaid_metadata = gisaid_prep_one_sample.gisaid_metadata } } - - -#coverage >= coverage_gisaid && number_N <= number_N_gisaid && diff --git a/workflows/wf_mercury_se_prep.wdl b/workflows/wf_mercury_se_prep.wdl index f3d02b36..c1ff2a94 100644 --- a/workflows/wf_mercury_se_prep.wdl +++ b/workflows/wf_mercury_se_prep.wdl @@ -5,15 +5,13 @@ import "../tasks/task_pub_repo_prep.wdl" as submission_prep workflow mercury_se_prep { input { - #required files + # Required Files File assembly_fasta File reads_dehosted - - #required metadata (titan gc outputs) + # Required Metadata (TheiaCoV GC Outputs) String assembly_method Float assembly_mean_coverage - - #required metadata (user inputs) + # Required Metadata (User Inputs) String authors String bioproject_accession String collecting_lab @@ -41,9 +39,8 @@ workflow mercury_se_prep { String state String submission_id String submitting_lab - String submitting_lab_address - - #optional metadata + String submitting_lab_address + # Optional Metadata String? amplicon_primer_scheme String? amplicon_size String? biosample_accession @@ -55,14 +52,12 @@ workflow mercury_se_prep { String? purpose_of_sequencing String? submitter_email String? treatment - - # Optional user-defined thresholds for generating submission files + # Optional User-Defined Thresholds for Generating Submission Files Int number_N_threshold = 5000 } - if (number_N <= number_N_threshold) { call submission_prep.ncbi_prep_one_sample_se { - input: + input: amplicon_primer_scheme = amplicon_primer_scheme, amplicon_size = amplicon_size, assembly_fasta = assembly_fasta, @@ -97,10 +92,10 @@ workflow mercury_se_prep { state = state, submission_id = submission_id, submitter_email = submitter_email, - treatment = treatment + treatment = treatment } call submission_prep.gisaid_prep_one_sample { - input: + input: assembly_fasta = assembly_fasta, authors = authors, assembly_method = assembly_method, @@ -125,20 +120,20 @@ workflow mercury_se_prep { treatment = treatment } } - call versioning.version_capture{ input: } output { + # Version Capture String mercury_pe_prep_version = version_capture.phvg_version String mercury_pe_prep_analysis_date = version_capture.date - + # NCBI Submission Files File? biosample_attributes = ncbi_prep_one_sample_se.biosample_attributes File? sra_metadata = ncbi_prep_one_sample_se.sra_metadata File? genbank_assembly = ncbi_prep_one_sample_se.genbank_assembly File? genbank_modifier = ncbi_prep_one_sample_se.genbank_modifier File? sra_reads = ncbi_prep_one_sample_se.sra_reads - + # GISAID Submission Files File? gisaid_assembly = gisaid_prep_one_sample.gisaid_assembly File? 
gisaid_metadata = gisaid_prep_one_sample.gisaid_metadata } diff --git a/workflows/wf_ncbi_scrub_pe.wdl b/workflows/wf_ncbi_scrub_pe.wdl index 9c9f8b2b..83cc5b24 100644 --- a/workflows/wf_ncbi_scrub_pe.wdl +++ b/workflows/wf_ncbi_scrub_pe.wdl @@ -5,39 +5,37 @@ import "../tasks/task_taxonID.wdl" as taxonID import "../tasks/task_versioning.wdl" as versioning workflow dehost_pe { - input { - String samplename - File read1 - File read2 - } - - call read_clean.ncbi_scrub_pe { + input { + String samplename + File read1 + File read2 + } + call read_clean.ncbi_scrub_pe { input: samplename = samplename, read1 = read1, read2 = read2 } - call taxonID.kraken2 { - input: - samplename = samplename, - read1 = ncbi_scrub_pe.read1_dehosted, - read2 = ncbi_scrub_pe.read2_dehosted - } - call versioning.version_capture{ - input: - } - output { - String ncbi_scrub_pe_version = version_capture.phvg_version - String ncbi_scrub_se_analysis_date = version_capture.date - File read1_dehosted = ncbi_scrub_pe.read1_dehosted - File read2_dehosted = ncbi_scrub_pe.read2_dehosted - Int read1_human_spots_removed = ncbi_scrub_pe.read1_human_spots_removed - Int read2_human_spots_removed = ncbi_scrub_pe.read2_human_spots_removed - String ncbi_scrub_docker = ncbi_scrub_pe.ncbi_scrub_docker - - Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 - String kraken_report_dehosted = kraken2.kraken_report - String kraken_version_dehosted = kraken2.version - } + call taxonID.kraken2 { + input: + samplename = samplename, + read1 = ncbi_scrub_pe.read1_dehosted, + read2 = ncbi_scrub_pe.read2_dehosted + } + call versioning.version_capture{ + input: + } + output { + String ncbi_scrub_pe_version = version_capture.phvg_version + String ncbi_scrub_se_analysis_date = version_capture.date + File read1_dehosted = ncbi_scrub_pe.read1_dehosted + File read2_dehosted = ncbi_scrub_pe.read2_dehosted + Int read1_human_spots_removed = ncbi_scrub_pe.read1_human_spots_removed + Int read2_human_spots_removed = ncbi_scrub_pe.read2_human_spots_removed + String ncbi_scrub_docker = ncbi_scrub_pe.ncbi_scrub_docker + Float kraken_human_dehosted = kraken2.percent_human + Float kraken_sc2_dehosted = kraken2.percent_sc2 + String kraken_report_dehosted = kraken2.kraken_report + String kraken_version_dehosted = kraken2.version + } } diff --git a/workflows/wf_ncbi_scrub_se.wdl b/workflows/wf_ncbi_scrub_se.wdl index 2dada73a..d6f51f36 100644 --- a/workflows/wf_ncbi_scrub_se.wdl +++ b/workflows/wf_ncbi_scrub_se.wdl @@ -5,34 +5,32 @@ import "../tasks/task_taxonID.wdl" as taxonID import "../tasks/task_versioning.wdl" as versioning workflow dehost_se { - input { - String samplename - File reads - } - - call read_clean.ncbi_scrub_se { + input { + String samplename + File reads + } + call read_clean.ncbi_scrub_se { input: samplename = samplename, read1 = reads } - call taxonID.kraken2 { - input: - samplename = samplename, - read1 = ncbi_scrub_se.read1_dehosted - } - call versioning.version_capture{ - input: - } - output { - String ncbi_scrub_se_version = version_capture.phvg_version - String ncbi_scrub_se_analysis_date = version_capture.date - File reads_dehosted = ncbi_scrub_se.read1_dehosted - String ncbi_scrub_docker = ncbi_scrub_se.ncbi_scrub_docker - Int human_spots_removed = ncbi_scrub_se.read1_human_spots_removed - - Float kraken_human_dehosted = kraken2.percent_human - Float kraken_sc2_dehosted = kraken2.percent_sc2 - String kraken_version_dehosted = kraken2.version - String kraken_report_dehosted = 
kraken2.kraken_report - } + call taxonID.kraken2 { + input: + samplename = samplename, + read1 = ncbi_scrub_se.read1_dehosted + } + call versioning.version_capture { + input: + } + output { + String ncbi_scrub_se_version = version_capture.phvg_version + String ncbi_scrub_se_analysis_date = version_capture.date + File reads_dehosted = ncbi_scrub_se.read1_dehosted + String ncbi_scrub_docker = ncbi_scrub_se.ncbi_scrub_docker + Int human_spots_removed = ncbi_scrub_se.read1_human_spots_removed + Float kraken_human_dehosted = kraken2.percent_human + Float kraken_sc2_dehosted = kraken2.percent_sc2 + String kraken_version_dehosted = kraken2.version + String kraken_report_dehosted = kraken2.kraken_report + } } diff --git a/workflows/wf_pangolin_update.wdl b/workflows/wf_pangolin_update.wdl index e7bca470..fa87a366 100644 --- a/workflows/wf_pangolin_update.wdl +++ b/workflows/wf_pangolin_update.wdl @@ -4,55 +4,55 @@ import "../tasks/task_taxonID.wdl" as taxon_ID import "../tasks/task_versioning.wdl" as versioning workflow pangolin_update { - input { - String samplename - File assembly - String current_lineage - String current_pangolin_docker - String current_pangolin_assignment_version - String current_pangolin_versions - String updated_pangolin_docker - String? timezone - File? lineage_log - } - - call taxon_ID.pangolin3 { - input: - samplename = samplename, - fasta = assembly, - docker = updated_pangolin_docker - } - call taxon_ID.pangolin_update_log { - input: - samplename = samplename, - current_lineage = current_lineage, - current_pangolin_docker = current_pangolin_docker, - current_pangolin_assignment_version = current_pangolin_assignment_version, - current_pangolin_versions = current_pangolin_versions, - updated_lineage = pangolin3.pangolin_lineage, - updated_pangolin_docker = pangolin3.pangolin_docker, - updated_pangolin_assignment_version = pangolin3.pangolin_assignment_version, - updated_pangolin_versions = pangolin3.pangolin_versions, - timezone = timezone, - lineage_log = lineage_log - } - call versioning.version_capture{ - input: - timezone = timezone - } - output { - String pangolin_update_version = version_capture.phvg_version - String pangolin_update_analysis_date = version_capture.date - - String pango_lineage = pangolin3.pangolin_lineage - String pangolin_conflicts = pangolin3.pangolin_conflicts - String pangolin_notes = pangolin3.pangolin_notes - String pangolin_assignment_version = pangolin3.pangolin_assignment_version - String pangolin_versions = pangolin3.pangolin_versions - File pango_lineage_report = pangolin3.pango_lineage_report - String pangolin_docker = pangolin3.pangolin_docker - - String pangolin_updates = pangolin_update_log.pangolin_updates - File pango_lineage_log = pangolin_update_log.pango_lineage_log - } + input { + String samplename + File assembly + String current_lineage + String current_pangolin_docker + String current_pangolin_assignment_version + String current_pangolin_versions + String updated_pangolin_docker + String? timezone + File? 
lineage_log + } + call taxon_ID.pangolin3 { + input: + samplename = samplename, + fasta = assembly, + docker = updated_pangolin_docker + } + call taxon_ID.pangolin_update_log { + input: + samplename = samplename, + current_lineage = current_lineage, + current_pangolin_docker = current_pangolin_docker, + current_pangolin_assignment_version = current_pangolin_assignment_version, + current_pangolin_versions = current_pangolin_versions, + updated_lineage = pangolin3.pangolin_lineage, + updated_pangolin_docker = pangolin3.pangolin_docker, + updated_pangolin_assignment_version = pangolin3.pangolin_assignment_version, + updated_pangolin_versions = pangolin3.pangolin_versions, + timezone = timezone, + lineage_log = lineage_log + } + call versioning.version_capture{ + input: + timezone = timezone + } + output { + # Version Capture + String pangolin_update_version = version_capture.phvg_version + String pangolin_update_analysis_date = version_capture.date + # Pangolin Assignments + String pango_lineage = pangolin3.pangolin_lineage + String pangolin_conflicts = pangolin3.pangolin_conflicts + String pangolin_notes = pangolin3.pangolin_notes + String pangolin_assignment_version = pangolin3.pangolin_assignment_version + String pangolin_versions = pangolin3.pangolin_versions + File pango_lineage_report = pangolin3.pango_lineage_report + String pangolin_docker = pangolin3.pangolin_docker + # Update Log + String pangolin_updates = pangolin_update_log.pangolin_updates + File pango_lineage_log = pangolin_update_log.pango_lineage_log + } } diff --git a/workflows/wf_read_QC_trim.wdl b/workflows/wf_read_QC_trim.wdl index 6ab5cf3f..bfe5f7e5 100644 --- a/workflows/wf_read_QC_trim.wdl +++ b/workflows/wf_read_QC_trim.wdl @@ -8,22 +8,21 @@ workflow read_QC_trim { meta { description: "Runs basic QC (FastQC), trimming (SeqyClean), and taxonomic ID (Kraken2) on illumina PE reads" } - input { - String samplename - File read1_raw - File read2_raw - Int? trimmomatic_minlen = 75 - Int? trimmomatic_quality_trim_score = 30 - Int? trimmomatic_window_size = 4 - Int bbduk_mem = 8 + String samplename + File read1_raw + File read2_raw + Int? trimmomatic_minlen = 75 + Int? trimmomatic_quality_trim_score = 30 + Int? 
trimmomatic_window_size = 4 + Int bbduk_mem = 8 } call read_clean.ncbi_scrub_pe { input: samplename = samplename, read1 = read1_raw, read2 = read2_raw - } + } call read_clean.trimmomatic { input: samplename = samplename, @@ -38,7 +37,7 @@ workflow read_QC_trim { samplename = samplename, read1_trimmed = trimmomatic.read1_trimmed, read2_trimmed = trimmomatic.read2_trimmed, - mem_size_gb = bbduk_mem + memory = bbduk_mem } call qc_utils.fastq_scan as fastq_scan_raw { input: @@ -63,32 +62,27 @@ workflow read_QC_trim { read2 = ncbi_scrub_pe.read2_dehosted } output { - File read1_dehosted = ncbi_scrub_pe.read1_dehosted - File read2_dehosted = ncbi_scrub_pe.read2_dehosted - Int read1_human_spots_removed = ncbi_scrub_pe.read1_human_spots_removed - Int read2_human_spots_removed = ncbi_scrub_pe.read2_human_spots_removed - - File read1_clean = bbduk.read1_clean - File read2_clean = bbduk.read2_clean - - Int fastq_scan_raw1 = fastq_scan_raw.read1_seq - Int fastq_scan_raw2 = fastq_scan_raw.read2_seq - String fastq_scan_raw_pairs = fastq_scan_raw.read_pairs - - Int fastq_scan_clean1 = fastq_scan_clean.read1_seq - Int fastq_scan_clean2 = fastq_scan_clean.read2_seq - String fastq_scan_clean_pairs = fastq_scan_clean.read_pairs - - String kraken_version = kraken2_raw.version - Float kraken_human = kraken2_raw.percent_human - Float kraken_sc2 = kraken2_raw.percent_sc2 - String kraken_report = kraken2_raw.kraken_report - Float kraken_human_dehosted = kraken2_dehosted.percent_human - Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 - String kraken_report_dehosted = kraken2_dehosted.kraken_report - - String fastq_scan_version = fastq_scan_raw.version - String bbduk_docker = bbduk.bbduk_docker - String trimmomatic_version = trimmomatic.version + File read1_dehosted = ncbi_scrub_pe.read1_dehosted + File read2_dehosted = ncbi_scrub_pe.read2_dehosted + Int read1_human_spots_removed = ncbi_scrub_pe.read1_human_spots_removed + Int read2_human_spots_removed = ncbi_scrub_pe.read2_human_spots_removed + File read1_clean = bbduk.read1_clean + File read2_clean = bbduk.read2_clean + Int fastq_scan_raw1 = fastq_scan_raw.read1_seq + Int fastq_scan_raw2 = fastq_scan_raw.read2_seq + String fastq_scan_raw_pairs = fastq_scan_raw.read_pairs + Int fastq_scan_clean1 = fastq_scan_clean.read1_seq + Int fastq_scan_clean2 = fastq_scan_clean.read2_seq + String fastq_scan_clean_pairs = fastq_scan_clean.read_pairs + String kraken_version = kraken2_raw.version + Float kraken_human = kraken2_raw.percent_human + Float kraken_sc2 = kraken2_raw.percent_sc2 + String kraken_report = kraken2_raw.kraken_report + Float kraken_human_dehosted = kraken2_dehosted.percent_human + Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 + String kraken_report_dehosted = kraken2_dehosted.kraken_report + String fastq_scan_version = fastq_scan_raw.version + String bbduk_docker = bbduk.bbduk_docker + String trimmomatic_version = trimmomatic.version } } diff --git a/workflows/wf_read_QC_trim_se.wdl b/workflows/wf_read_QC_trim_se.wdl index 7f1d9070..50f6d298 100644 --- a/workflows/wf_read_QC_trim_se.wdl +++ b/workflows/wf_read_QC_trim_se.wdl @@ -8,7 +8,6 @@ workflow read_QC_trim { meta { description: "Runs basic QC (fastq-scan), trimming (SeqyClean), and taxonomic ID (Kraken2) on illumina PE reads" } - input { String samplename File read1_raw @@ -17,6 +16,7 @@ workflow read_QC_trim { Int? 
trimmomatic_window_size = 4 Int bbduk_mem = 8 } +# Commented out as NCBI SCRUB not currently compatible with 75bp SE data used in SC2 sequencing # call read_clean.ncbi_scrub_se { # input: # samplename = samplename, @@ -34,7 +34,7 @@ workflow read_QC_trim { input: samplename = samplename, read1_trimmed = trimmomatic_se.read1_trimmed, - mem_size_gb = bbduk_mem + memory = bbduk_mem } call qc_utils.fastq_scan_se as fastq_scan_raw { input: @@ -54,23 +54,19 @@ workflow read_QC_trim { # samplename = samplename, # read1 = ncbi_scrub_se.read1_dehosted # } - output { - File read1_clean = bbduk_se.read1_clean - - Int fastq_scan_number_reads = fastq_scan_raw.read1_seq - Int fastq_scan_clean_number_reads = fastq_scan_clean.read1_seq - - String kraken_version = kraken2_raw.version - Float kraken_human = kraken2_raw.percent_human - Float kraken_sc2 = kraken2_raw.percent_sc2 - String kraken_report = kraken2_raw.kraken_report + File read1_clean = bbduk_se.read1_clean + Int fastq_scan_number_reads = fastq_scan_raw.read1_seq + Int fastq_scan_clean_number_reads = fastq_scan_clean.read1_seq + String kraken_version = kraken2_raw.version + Float kraken_human = kraken2_raw.percent_human + Float kraken_sc2 = kraken2_raw.percent_sc2 + String kraken_report = kraken2_raw.kraken_report # Float kraken_human_dehosted = kraken2_dehosted.percent_human # Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 # String kraken_report_dehosted = kraken2_dehosted.kraken_report - - String fastq_scan_version = fastq_scan_raw.version - String bbduk_docker = bbduk_se.bbduk_docker - String trimmomatic_version = trimmomatic_se.version + String fastq_scan_version = fastq_scan_raw.version + String bbduk_docker = bbduk_se.bbduk_docker + String trimmomatic_version = trimmomatic_se.version } } diff --git a/workflows/wf_sarscov2_nextstrain_modified.wdl b/workflows/wf_sarscov2_nextstrain_modified.wdl index 328732d9..3ab71242 100644 --- a/workflows/wf_sarscov2_nextstrain_modified.wdl +++ b/workflows/wf_sarscov2_nextstrain_modified.wdl @@ -6,217 +6,193 @@ import "../tasks/tasks_intrahost.wdl" as intrahost import "../tasks/tasks_utils.wdl" as utils workflow sarscov2_nextstrain { - meta { - description: "Modified version of the Broad's sars_cov2_nextstrain WDL Worfklow to align assemblies, build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/" - author: "Kevin Libuit" - email: "kevin.libuit@theiagen.com" - } - - input { - Array[File]+ assembly_fastas - Array[File]+ sample_metadata_tsvs - String tree_root_seq_id = "Wuhan-Hu-1/2019" - - String build_name - File? builds_yaml - - Array[String]? ancestral_traits_to_infer - - File? auspice_config - File? ref_fasta - File? clades_tsv - File? lat_longs_tsv - Float? clock_rate - Float? clock_std_dev - Int mafft_cpu=64 - Int mafft_mem_size=500 - - Int min_unambig_genome = 27000 - } - - parameter_meta { - assembly_fastas: { - description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.", - patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"] - } - sample_metadata_tsvs: { - description: "Tab-separated metadata file that contain binning variables and values. 
Must contain all samples: output will be filtered to the IDs present in this file.", - patterns: ["*.txt", "*.tsv"] - } - ref_fasta: { - description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.", - patterns: ["*.fasta", "*.fa"] - } - min_unambig_genome: { - description: "Minimum number of called bases in genome to pass prefilter." - } - ancestral_traits_to_infer: { - description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata." - } - clades_tsv: { - description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades", - patterns: ["*.tsv", "*.txt"] - } - } - - call nextstrain.nextstrain_ncov_defaults - - #### mafft_and_snp - - call utils.zcat { - input: - infiles = assembly_fastas, - output_name = "all_samples_combined_assembly.fasta" - } - - call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs { + meta { + description: "Modified version of the Broad's sars_cov2_nextstrain WDL Workflow to align assemblies, build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/" + author: "Kevin Libuit" + email: "kevin.libuit@theiagen.com" + } + input { + Array[File]+ assembly_fastas + Array[File]+ sample_metadata_tsvs + String tree_root_seq_id = "Wuhan-Hu-1/2019" + String build_name + File? builds_yaml + Array[String]? ancestral_traits_to_infer + File? auspice_config + File? ref_fasta + File? clades_tsv + File? lat_longs_tsv + Float? clock_rate + Float? clock_std_dev + Int mafft_cpu=64 + Int mafft_mem_size=500 + Int min_unambig_genome = 27000 + } + parameter_meta { + assembly_fastas: { + description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.", + patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"] + } + sample_metadata_tsvs: { + description: "Tab-separated metadata file that contains binning variables and values. Must contain all samples: output will be filtered to the IDs present in this file.", + patterns: ["*.txt", "*.tsv"] + } + ref_fasta: { + description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.", + patterns: ["*.fasta", "*.fa"] + } + min_unambig_genome: { + description: "Minimum number of called bases in genome to pass prefilter." + } + ancestral_traits_to_infer: { + description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata." 
+ } + clades_tsv: { + description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades", + patterns: ["*.tsv", "*.txt"] + } + } + call nextstrain.nextstrain_ncov_defaults + #### mafft_and_snp + call utils.zcat { input: - sequences_fasta = zcat.combined - } - - call utils.filter_sequences_by_length { - input: - sequences_fasta = dedup_seqs.sequences_deduplicated_fasta, - min_non_N = min_unambig_genome - } - - call nextstrain.mafft_one_chr_chunked as mafft { - input: - sequences = filter_sequences_by_length.filtered_fasta, - ref_fasta = select_first([ref_fasta, nextstrain_ncov_defaults.reference_fasta]), - basename = "all_samples_aligned.fasta" - - } - - #### merge metadata, compute derived cols - if(length(sample_metadata_tsvs)>1) { - call utils.tsv_join { - input: - input_tsvs = sample_metadata_tsvs, - id_col = 'strain', - out_basename = "metadata-merged" - } - } - call nextstrain.derived_cols { - input: - metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs])) - } - - ## Subsample if builds.yaml file provided - if(defined(builds_yaml)) { - call nextstrain.nextstrain_build_subsample as subsample { - input: - alignment_msa_fasta = mafft.aligned_sequences, - sample_metadata_tsv = derived_cols.derived_metadata, - build_name = build_name, - builds_yaml = builds_yaml - } - } - - call utils.fasta_to_ids { - input: - sequences_fasta = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) - } - - call nextstrain.snp_sites { - input: - msa_fasta = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) - } - - #### augur_from_msa - - call nextstrain.augur_mask_sites { - input: - sequences = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) - } - call nextstrain.draft_augur_tree { - input: - msa_or_vcf = augur_mask_sites.masked_sequences - } - - call nextstrain.refine_augur_tree { - input: - raw_tree = draft_augur_tree.aligned_tree, - msa_or_vcf = select_first([subsample.subsampled_msa, augur_mask_sites.masked_sequences]), - metadata = derived_cols.derived_metadata, - clock_rate = clock_rate, - clock_std_dev = clock_std_dev, - root = tree_root_seq_id - } - if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) { - call nextstrain.ancestral_traits { - input: - tree = refine_augur_tree.tree_refined, - metadata = derived_cols.derived_metadata, - columns = select_first([ancestral_traits_to_infer,[]]) - } - } - call nextstrain.tip_frequencies { - input: - tree = refine_augur_tree.tree_refined, - metadata = derived_cols.derived_metadata, - min_date = 2020.0, - pivot_interval = 1, - pivot_interval_units = "weeks", - narrow_bandwidth = 0.05, - proportion_wide = 0.0, - out_basename = "auspice-~{build_name}" - } - call nextstrain.ancestral_tree { - input: - tree = refine_augur_tree.tree_refined, - msa_or_vcf = select_first([subsample.subsampled_msa, augur_mask_sites.masked_sequences]) - } - call nextstrain.translate_augur_tree { - input: - tree = refine_augur_tree.tree_refined, - nt_muts = ancestral_tree.nt_muts_json, - genbank_gb = nextstrain_ncov_defaults.reference_gb - } - call nextstrain.assign_clades_to_nodes { - input: - tree_nwk = refine_augur_tree.tree_refined, - nt_muts_json = ancestral_tree.nt_muts_json, - aa_muts_json = translate_augur_tree.aa_muts_json, - ref_fasta = select_first([ref_fasta, nextstrain_ncov_defaults.reference_fasta]), - clades_tsv = select_first([clades_tsv, 
nextstrain_ncov_defaults.clades_tsv]) - } - call nextstrain.export_auspice_json { - input: - tree = refine_augur_tree.tree_refined, - sample_metadata = derived_cols.derived_metadata, - lat_longs_tsv = select_first([lat_longs_tsv, nextstrain_ncov_defaults.lat_longs_tsv]), - node_data_jsons = select_all([ - refine_augur_tree.branch_lengths, - ancestral_traits.node_data_json, - ancestral_tree.nt_muts_json, - translate_augur_tree.aa_muts_json, - assign_clades_to_nodes.node_clade_data_json]), - auspice_config = select_first([auspice_config, nextstrain_ncov_defaults.auspice_config]), - out_basename = "auspice-~{build_name}" - } - - output { - File combined_assemblies = filter_sequences_by_length.filtered_fasta - File multiple_alignment = mafft.aligned_sequences - File unmasked_snps = snp_sites.snps_vcf - File masked_alignment = augur_mask_sites.masked_sequences - - File metadata_merged = derived_cols.derived_metadata - File keep_list = fasta_to_ids.ids_txt - File mafft_alignment = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) - - - File ml_tree = draft_augur_tree.aligned_tree - File time_tree = refine_augur_tree.tree_refined - Array[File] node_data_jsons = select_all([ - refine_augur_tree.branch_lengths, - ancestral_traits.node_data_json, - ancestral_tree.nt_muts_json, - translate_augur_tree.aa_muts_json, - assign_clades_to_nodes.node_clade_data_json]) - File tip_frequencies_json = tip_frequencies.node_data_json - File root_sequence_json = export_auspice_json.root_sequence_json - File auspice_input_json = export_auspice_json.virus_json - } + infiles = assembly_fastas, + output_name = "all_samples_combined_assembly.fasta" + } + call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs { + input: + sequences_fasta = zcat.combined + } + call utils.filter_sequences_by_length { + input: + sequences_fasta = dedup_seqs.sequences_deduplicated_fasta, + min_non_N = min_unambig_genome + } + call nextstrain.mafft_one_chr_chunked as mafft { + input: + sequences = filter_sequences_by_length.filtered_fasta, + ref_fasta = select_first([ref_fasta, nextstrain_ncov_defaults.reference_fasta]), + basename = "all_samples_aligned.fasta" + } + #### merge metadata, compute derived cols + if(length(sample_metadata_tsvs)>1) { + call utils.tsv_join { + input: + input_tsvs = sample_metadata_tsvs, + id_col = 'strain', + out_basename = "metadata-merged" + } + } + call nextstrain.derived_cols { + input: + metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs])) + } + ## Subsample if builds.yaml file provided + if(defined(builds_yaml)) { + call nextstrain.nextstrain_build_subsample as subsample { + input: + alignment_msa_fasta = mafft.aligned_sequences, + sample_metadata_tsv = derived_cols.derived_metadata, + build_name = build_name, + builds_yaml = builds_yaml + } + } + call utils.fasta_to_ids { + input: + sequences_fasta = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) + } + call nextstrain.snp_sites { + input: + msa_fasta = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) + } + #### augur_from_msa + call nextstrain.augur_mask_sites { + input: + sequences = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) + } + call nextstrain.draft_augur_tree { + input: + msa_or_vcf = augur_mask_sites.masked_sequences + } + call nextstrain.refine_augur_tree { + input: + raw_tree = draft_augur_tree.aligned_tree, + msa_or_vcf = select_first([subsample.subsampled_msa, augur_mask_sites.masked_sequences]), + metadata = 
derived_cols.derived_metadata, + clock_rate = clock_rate, + clock_std_dev = clock_std_dev, + root = tree_root_seq_id + } + if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) { + call nextstrain.ancestral_traits { + input: + tree = refine_augur_tree.tree_refined, + metadata = derived_cols.derived_metadata, + columns = select_first([ancestral_traits_to_infer,[]]) + } + } + call nextstrain.tip_frequencies { + input: + tree = refine_augur_tree.tree_refined, + metadata = derived_cols.derived_metadata, + min_date = 2020.0, + pivot_interval = 1, + pivot_interval_units = "weeks", + narrow_bandwidth = 0.05, + proportion_wide = 0.0, + out_basename = "auspice-~{build_name}" + } + call nextstrain.ancestral_tree { + input: + tree = refine_augur_tree.tree_refined, + msa_or_vcf = select_first([subsample.subsampled_msa, augur_mask_sites.masked_sequences]) + } + call nextstrain.translate_augur_tree { + input: + tree = refine_augur_tree.tree_refined, + nt_muts = ancestral_tree.nt_muts_json, + genbank_gb = nextstrain_ncov_defaults.reference_gb + } + call nextstrain.assign_clades_to_nodes { + input: + tree_nwk = refine_augur_tree.tree_refined, + nt_muts_json = ancestral_tree.nt_muts_json, + aa_muts_json = translate_augur_tree.aa_muts_json, + ref_fasta = select_first([ref_fasta, nextstrain_ncov_defaults.reference_fasta]), + clades_tsv = select_first([clades_tsv, nextstrain_ncov_defaults.clades_tsv]) + } + call nextstrain.export_auspice_json { + input: + tree = refine_augur_tree.tree_refined, + sample_metadata = derived_cols.derived_metadata, + lat_longs_tsv = select_first([lat_longs_tsv, nextstrain_ncov_defaults.lat_longs_tsv]), + node_data_jsons = select_all([ + refine_augur_tree.branch_lengths, + ancestral_traits.node_data_json, + ancestral_tree.nt_muts_json, + translate_augur_tree.aa_muts_json, + assign_clades_to_nodes.node_clade_data_json]), + auspice_config = select_first([auspice_config, nextstrain_ncov_defaults.auspice_config]), + out_basename = "auspice-~{build_name}" + } + output { + File combined_assemblies = filter_sequences_by_length.filtered_fasta + File multiple_alignment = mafft.aligned_sequences + File unmasked_snps = snp_sites.snps_vcf + File masked_alignment = augur_mask_sites.masked_sequences + File metadata_merged = derived_cols.derived_metadata + File keep_list = fasta_to_ids.ids_txt + File mafft_alignment = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) + File ml_tree = draft_augur_tree.aligned_tree + File time_tree = refine_augur_tree.tree_refined + Array[File] node_data_jsons = select_all([ + refine_augur_tree.branch_lengths, + ancestral_traits.node_data_json, + ancestral_tree.nt_muts_json, + translate_augur_tree.aa_muts_json, + assign_clades_to_nodes.node_clade_data_json]) + File tip_frequencies_json = tip_frequencies.node_data_json + File root_sequence_json = export_auspice_json.root_sequence_json + File auspice_input_json = export_auspice_json.virus_json + } } diff --git a/workflows/wf_sc2_pubRepo_submission_local.wdl b/workflows/wf_sc2_pubRepo_submission_local.wdl deleted file mode 100644 index db42b258..00000000 --- a/workflows/wf_sc2_pubRepo_submission_local.wdl +++ /dev/null @@ -1,33 +0,0 @@ -version 1.0 - -import "wf_sc2_pubRepo_submission.wdl" as submission - -workflow SC2_submission_files_local { - input { - Array[String] inputdata - Array[File] inputfiles - } - - call submission.SC2_submission_files { - input: - samplename = inputdata[0], - submission_id = inputdata[1], - collection_date = inputdata[2], - 
coverage = inputdata[3], - number_N = inputdata[4], - number_ATCG = inputdata[5], - number_Total = inputdata[6], - sequence = inputfiles[0], - read1 = inputfiles[1], - read2 = inputfiles[2] - } - output { - File deID_assembly = SC2_submission_files.deID_assembly - File read1_submission = SC2_submission_files.read1_submission - File read2_submission = SC2_submission_files.read2_submission - File? genbank_assembly = SC2_submission_files.genbank_assembly - File? gisaid_assembly = SC2_submission_files.gisaid_assembly - } -} - - diff --git a/workflows/wf_theiacov_augur_distance_tree.wdl b/workflows/wf_theiacov_augur_distance_tree.wdl index dcee4f93..d81af83b 100644 --- a/workflows/wf_theiacov_augur_distance_tree.wdl +++ b/workflows/wf_theiacov_augur_distance_tree.wdl @@ -5,7 +5,7 @@ import "../tasks/tasks_utils.wdl" as utils import "../tasks/task_phylo.wdl" as phylo import "../tasks/task_versioning.wdl" as versioning -workflow titan_distance_tree { +workflow theiacov_distance_tree { meta { description: "Workflow for SC2 cluster investigations. TheiaCoV_Augur_DistanceTree is will generate a ML distance tree using select tasks incorporated in the ThieaCoV_Augur_Run workflow; output from the modified sarscov2_nextstrain workflow will also be used to infer SNP distances. The ML distance tree output can be visualized using the Auspice web application https://auspice.us/" author: "Kevin G Libuit" @@ -17,7 +17,7 @@ workflow titan_distance_tree { String build_name File? builds_yaml File? ref_fasta - Int min_unambig_genome = 27000 + Int min_unambig_genome = 27000 } parameter_meta { assembly_fastas: { @@ -53,7 +53,7 @@ workflow titan_distance_tree { call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs { input: sequences_fasta = zcat.combined - } + } call utils.filter_sequences_by_length { input: sequences_fasta = dedup_seqs.sequences_deduplicated_fasta, @@ -72,12 +72,12 @@ workflow titan_distance_tree { input_tsvs = sample_metadata_tsvs, id_col = 'strain', out_basename = "metadata-merged" - } + } } call nextstrain.derived_cols { input: metadata_tsv = select_first(flatten([[tsv_join.out_tsv], sample_metadata_tsvs])) - } + } ## Subsample if builds.yaml file provided if(defined(builds_yaml)) { call nextstrain.nextstrain_build_subsample as subsample { @@ -86,12 +86,12 @@ workflow titan_distance_tree { sample_metadata_tsv = derived_cols.derived_metadata, build_name = build_name, builds_yaml = builds_yaml - } } + } call utils.fasta_to_ids { input: sequences_fasta = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) - } + } call nextstrain.snp_sites { input: msa_fasta = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) @@ -114,19 +114,19 @@ workflow titan_distance_tree { input: } output { - #version capture + # Version Capture String TheiaCoV_Augur_DistanceTree_version = version_capture.phvg_version - String TheiaCoV_Augur_DistanceTree_analysis_date = version_capture.date - #tree, intermediates, and metadata - File combined_assemblies = filter_sequences_by_length.filtered_fasta - File multiple_alignment = mafft.aligned_sequences - File unmasked_snps = snp_sites.snps_vcf - File masked_alignment = augur_mask_sites.masked_sequences - File metadata_merged = derived_cols.derived_metadata - File keep_list = fasta_to_ids.ids_txt - File mafft_alignment = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) - File distance_tree = draft_augur_tree.aligned_tree - #SNP matrix + String TheiaCoV_Augur_DistanceTree_analysis_date = version_capture.date + # Tree, 
Intermediates, and Metadata + File combined_assemblies = filter_sequences_by_length.filtered_fasta + File multiple_alignment = mafft.aligned_sequences + File unmasked_snps = snp_sites.snps_vcf + File masked_alignment = augur_mask_sites.masked_sequences + File metadata_merged = derived_cols.derived_metadata + File keep_list = fasta_to_ids.ids_txt + File mafft_alignment = select_first([subsample.subsampled_msa, mafft.aligned_sequences]) + File distance_tree = draft_augur_tree.aligned_tree + # SNP Matrix File snp_matrix = snp_dists.snp_matrix } } diff --git a/workflows/wf_theiacov_augur_prep.wdl b/workflows/wf_theiacov_augur_prep.wdl new file mode 100644 index 00000000..48f6eecc --- /dev/null +++ b/workflows/wf_theiacov_augur_prep.wdl @@ -0,0 +1,34 @@ +version 1.0 + +import "../tasks/tasks_nextstrain.wdl" as nextstrain +import "../tasks/task_versioning.wdl" as versioning + +workflow theiacov_augur_prep { + input { + String assembly + String collection_date + String iso_country + String iso_state + String iso_continent + String? iso_county + String pango_lineage + } + call nextstrain.prep_augur_metadata { + input: + assembly = assembly, + collection_date = collection_date, + iso_country = iso_country, + iso_state = iso_state, + iso_continent = iso_continent, + iso_county = iso_county, + pango_lineage = pango_lineage + } + call versioning.version_capture { + input: + } + output { + String theiacov_augur_run_version = version_capture.phvg_version + String theiacov_augur_run_analysis_date = version_capture.date + File augur_metadata = prep_augur_metadata.augur_metadata + } +} diff --git a/workflows/wf_theiacov_augur_run.wdl b/workflows/wf_theiacov_augur_run.wdl new file mode 100644 index 00000000..bd20afa6 --- /dev/null +++ b/workflows/wf_theiacov_augur_run.wdl @@ -0,0 +1,58 @@ +version 1.0 + +import "wf_sarscov2_nextstrain_modified.wdl" as augur +import "../tasks/task_phylo.wdl" as phylo +import "../tasks/task_versioning.wdl" as versioning + +workflow theiacov_augur_run { + meta { + description: "Workflow for SC2 cluster investigations. TheiaCoV_Augur_Run will run Augur without a subsampling module using a modified version of The Broad Institute's sarscov2_nextstrain WDL workflow to create an Auspice JSON file; output from the modified sarscov2_nextstrain workflow will also be used to infer SNP distances" + author: "Kevin G Libuit" + email: "kevin.libuit@theiagen.com" + } + input { + Array[File]+ assembly_fastas + Array[File]+ sample_metadata_tsvs + String build_name + } + parameter_meta { + assembly_fastas: { + description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.", + patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"] + } + sample_metadata_tsvs: { + description: "Tab-separated metadata file that contain binning variables and values. 
Must contain all samples: output will be filtered to the IDs present in this file.", + patterns: ["*.txt", "*.tsv"] + } + } + call augur.sarscov2_nextstrain { + input: + assembly_fastas = assembly_fastas, + sample_metadata_tsvs = sample_metadata_tsvs, + build_name = build_name + } + call phylo.snp_dists { + input: + cluster_name = build_name, + alignment = sarscov2_nextstrain.mafft_alignment + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String theiacov_augur_run_version = version_capture.phvg_version + String theiacov_augur_run_analysis_date = version_capture.date + # Augur outputs + File combined_assemblies = sarscov2_nextstrain.combined_assemblies + File MAFFT_alignment = sarscov2_nextstrain.mafft_alignment + File unmasked_snps = sarscov2_nextstrain.unmasked_snps + File metadata_merged = sarscov2_nextstrain.metadata_merged + File keep_list = sarscov2_nextstrain.keep_list + File distance_tree = sarscov2_nextstrain.ml_tree + File time_tree = sarscov2_nextstrain.time_tree + File auspice_input_json = sarscov2_nextstrain.auspice_input_json + # SNP Matrix + File snp_matrix = snp_dists.snp_matrix + } +} diff --git a/workflows/wf_theiacov_clearlabs.wdl b/workflows/wf_theiacov_clearlabs.wdl new file mode 100644 index 00000000..8d2fe20f --- /dev/null +++ b/workflows/wf_theiacov_clearlabs.wdl @@ -0,0 +1,159 @@ +version 1.0 + +import "../tasks/task_ont_medaka.wdl" as medaka +import "../tasks/task_assembly_metrics.wdl" as assembly_metrics +import "../tasks/task_taxonID.wdl" as taxon_ID +import "../tasks/task_ncbi.wdl" as ncbi +import "../tasks/task_read_clean.wdl" as read_clean +import "../tasks/task_qc_utils.wdl" as qc_utils +import "../tasks/task_versioning.wdl" as versioning + +workflow theiacov_clearlabs { + meta { + description: "Reference-based consensus calling for viral amplicon ont sequencing data generated on the Clear Labs platform." + } + input { + String samplename + File clear_lab_fastq + String seq_method = "OXFORD_NANOPORE" + File primer_bed + Int? 
normalise = 20000 + String nextclade_dataset_name = "sars-cov-2" + String nextclade_dataset_reference = "MN908947" + String nextclade_dataset_tag = "2022-02-07T12:00:00Z" + String medaka_docker = "quay.io/staphb/artic-ncov2019:1.3.0-medaka-1.4.3" + } + call qc_utils.fastq_scan_se as fastq_scan_raw_reads { + input: + read1 = clear_lab_fastq + } + call read_clean.ncbi_scrub_se { + input: + samplename = samplename, + read1 = clear_lab_fastq + } + call qc_utils.fastq_scan_se as fastq_scan_clean_reads { + input: + read1 = ncbi_scrub_se.read1_dehosted + } + call taxon_ID.kraken2 as kraken2_dehosted { + input: + samplename = samplename, + read1 = ncbi_scrub_se.read1_dehosted + } + call medaka.consensus { + input: + samplename = samplename, + filtered_reads = ncbi_scrub_se.read1_dehosted, + primer_bed = primer_bed, + normalise = normalise, + docker = medaka_docker + } + call assembly_metrics.stats_n_coverage { + input: + samplename = samplename, + bamfile = consensus.sorted_bam + } + call qc_utils.consensus_qc { + input: + assembly_fasta = consensus.consensus_seq + } + call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { + input: + samplename = samplename, + bamfile = consensus.trim_sorted_bam + } + call taxon_ID.pangolin3 { + input: + samplename = samplename, + fasta = consensus.consensus_seq + } + call taxon_ID.kraken2 as kraken2_raw { + input: + samplename = samplename, + read1 = clear_lab_fastq + } + call taxon_ID.nextclade_one_sample { + input: + genome_fasta = consensus.consensus_seq, + dataset_name = nextclade_dataset_name, + dataset_reference = nextclade_dataset_reference, + dataset_tag = nextclade_dataset_tag + } + call taxon_ID.nextclade_output_parser_one_sample { + input: + nextclade_tsv = nextclade_one_sample.nextclade_tsv + } + call ncbi.vadr { + input: + genome_fasta = consensus.consensus_seq, + assembly_length_unambiguous = consensus_qc.number_ATCG + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String theiacov_clearlabs_version = version_capture.phvg_version + String theiacov_clearlabs_analysis_date = version_capture.date + # Read Metadata + String seq_platform = seq_method + # Read QC + File reads_dehosted = ncbi_scrub_se.read1_dehosted + Int num_reads_raw = fastq_scan_raw_reads.read1_seq + Int num_reads_clean = fastq_scan_clean_reads.read1_seq + String fastq_scan_version = fastq_scan_raw_reads.version + String kraken_version = kraken2_raw.version + Float kraken_human = kraken2_raw.percent_human + Float kraken_sc2 = kraken2_raw.percent_sc2 + String kraken_report = kraken2_raw.kraken_report + Float kraken_human_dehosted = kraken2_dehosted.percent_human + Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 + String kraken_report_dehosted = kraken2_dehosted.kraken_report + # Read Alignment + File aligned_bam = consensus.trim_sorted_bam + File aligned_bai = consensus.trim_sorted_bai + File variants_from_ref_vcf = consensus.medaka_pass_vcf + String artic_version = consensus.artic_pipeline_version + String artic_docker = consensus.artic_pipeline_docker + String medaka_reference = consensus.medaka_reference + String primer_bed_name = consensus.primer_bed_name + File assembly_fasta = consensus.consensus_seq + String assembly_method = consensus.artic_pipeline_version + # Assembly QC + Int number_N = consensus_qc.number_N + Int assembly_length_unambiguous = consensus_qc.number_ATCG + Int number_Degenerate = consensus_qc.number_Degenerate + Int number_Total = consensus_qc.number_Total + Float percent_reference_coverage = 
consensus_qc.percent_reference_coverage
+ # Lineage Assignment
+ String pango_lineage = pangolin3.pangolin_lineage
+ String pangolin_conflicts = pangolin3.pangolin_conflicts
+ String pangolin_notes = pangolin3.pangolin_notes
+ String pangolin_assignment_version = pangolin3.pangolin_assignment_version
+ File pango_lineage_report = pangolin3.pango_lineage_report
+ String pangolin_docker = pangolin3.pangolin_docker
+ String pangolin_versions = pangolin3.pangolin_versions
+ # Alignment QC
+ File consensus_stats = stats_n_coverage.stats
+ File consensus_flagstat = stats_n_coverage.flagstat
+ Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq
+ Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq
+ Float assembly_mean_coverage = stats_n_coverage_primtrim.depth
+ Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth
+ String samtools_version = stats_n_coverage.samtools_version
+ # Clade Assignment
+ File nextclade_json = nextclade_one_sample.nextclade_json
+ File auspice_json = nextclade_one_sample.auspice_json
+ File nextclade_tsv = nextclade_one_sample.nextclade_tsv
+ String nextclade_version = nextclade_one_sample.nextclade_version
+ String nextclade_docker = nextclade_one_sample.nextclade_docker
+ String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs
+ String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels
+ String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade
+ # VADR Annotation QC
+ File? vadr_alerts_list = vadr.alerts_list
+ String vadr_num_alerts = vadr.num_alerts
+ String vadr_docker = vadr.vadr_docker
+ }
+}
diff --git a/workflows/wf_theiacov_fasta.wdl b/workflows/wf_theiacov_fasta.wdl
new file mode 100644
index 00000000..dbfbb4ce
--- /dev/null
+++ b/workflows/wf_theiacov_fasta.wdl
@@ -0,0 +1,87 @@
+version 1.0
+
+import "../tasks/task_ont_medaka.wdl" as medaka
+import "../tasks/task_assembly_metrics.wdl" as assembly_metrics
+import "../tasks/task_taxonID.wdl" as taxon_ID
+import "../tasks/task_ncbi.wdl" as ncbi
+import "../tasks/task_read_clean.wdl" as read_clean
+import "../tasks/task_qc_utils.wdl" as qc_utils
+import "../tasks/task_versioning.wdl" as versioning
+
+workflow theiacov_fasta {
+ meta {
+ description: "Consensus QC, lineage assignment, clade assignment, and VADR annotation for viral consensus assemblies provided in FASTA format."
+ } + input { + String samplename + File assembly_fasta + String seq_method + String input_assembly_method + String nextclade_dataset_name = "sars-cov-2" + String nextclade_dataset_reference = "MN908947" + String nextclade_dataset_tag = "2022-02-07T12:00:00Z" + } + call qc_utils.consensus_qc { + input: + assembly_fasta = assembly_fasta + } + call taxon_ID.pangolin3 { + input: + samplename = samplename, + fasta = assembly_fasta + } + call taxon_ID.nextclade_one_sample { + input: + genome_fasta = assembly_fasta, + dataset_name = nextclade_dataset_name, + dataset_reference = nextclade_dataset_reference, + dataset_tag = nextclade_dataset_tag + } + call taxon_ID.nextclade_output_parser_one_sample { + input: + nextclade_tsv = nextclade_one_sample.nextclade_tsv + } + call ncbi.vadr { + input: + genome_fasta = assembly_fasta, + assembly_length_unambiguous = consensus_qc.number_ATCG + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String theiacov_fasta_version = version_capture.phvg_version + String theiacov_fasta_analysis_date = version_capture.date + # Read & Assembly Metadata + String seq_platform = seq_method + String assembly_method = input_assembly_method + # Assembly QC + Int number_N = consensus_qc.number_N + Int assembly_length_unambiguous = consensus_qc.number_ATCG + Int number_Degenerate = consensus_qc.number_Degenerate + Int number_Total = consensus_qc.number_Total + Float percent_reference_coverage = consensus_qc.percent_reference_coverage + # Lineage Assignment + String pango_lineage = pangolin3.pangolin_lineage + String pangolin_conflicts = pangolin3.pangolin_conflicts + String pangolin_notes = pangolin3.pangolin_notes + String pangolin_assignment_version = pangolin3.pangolin_assignment_version + File pango_lineage_report = pangolin3.pango_lineage_report + String pangolin_docker = pangolin3.pangolin_docker + String pangolin_versions = pangolin3.pangolin_versions + # Clade Assigment + File nextclade_json = nextclade_one_sample.nextclade_json + File auspice_json = nextclade_one_sample.auspice_json + File nextclade_tsv = nextclade_one_sample.nextclade_tsv + String nextclade_version = nextclade_one_sample.nextclade_version + String nextclade_docker = nextclade_one_sample.nextclade_docker + String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade + String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs + String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels + # VADR Annotation QC + File? vadr_alerts_list = vadr.alerts_list + String vadr_num_alerts = vadr.num_alerts + String vadr_docker = vadr.vadr_docker + } +} diff --git a/workflows/wf_theiacov_gc.wdl b/workflows/wf_theiacov_gc.wdl new file mode 100644 index 00000000..c663fcbe --- /dev/null +++ b/workflows/wf_theiacov_gc.wdl @@ -0,0 +1,328 @@ +version 1.0 + +# Workflows from Theiagen's public_health_viral_genomics +# Source: https://github.com/theiagen/public_health_viral_genomics +import "wf_theiacov_clearlabs.wdl" as clearlabs +import "wf_theiacov_illumina_pe.wdl" as illumina_pe +import "wf_theiacov_illumina_se.wdl" as illumina_se +import "wf_theiacov_ont.wdl" as ont +import "../tasks/task_theiacov_summary.wdl" as summary + +struct parseJSON { + String sample + String theiacov_wf + File r1 + File r2 + File primers +} +workflow theiacov_gc { + meta { + description: "Incorporates each of the TheiaCoV workflows (clearlabs, illumina_pe, illumina_se, ont) into a single run." + author: "Robert A. 
Petit III" + email: "robert.petit@theiagen.com" + } + input { + Array[parseJSON] samples + } + + scatter (sample in samples) { + if (sample.theiacov_wf == "clearlabs") { + call clearlabs.theiacov_clearlabs as theiacov_clearlabs { + input: + samplename = sample.sample, + clear_lab_fastq = sample.r1, + primer_bed = sample.primers + } + call summary.theiacov_summary as clearlabs_summary { + input: + samplename = sample.sample, + theiacov_workflow = 'theiacov_clearlabs', + theiacov_version = theiacov_clearlabs.theiacov_clearlabs_version, + theiacov_analysis_date = theiacov_clearlabs.theiacov_clearlabs_analysis_date, + seq_platform = theiacov_clearlabs.seq_platform, + primer_bed_name = theiacov_clearlabs.primer_bed_name, + percent_reference_coverage = theiacov_clearlabs.percent_reference_coverage, + number_N = theiacov_clearlabs.number_N, + pango_lineage = theiacov_clearlabs.pango_lineage, + pangolin_conflicts = theiacov_clearlabs.pangolin_conflicts, + pangolin_notes = theiacov_clearlabs.pangolin_notes, + pangolin_assignment_version = theiacov_clearlabs.pangolin_assignment_version, + pangolin_docker = theiacov_clearlabs.pangolin_docker, + pangolin_versions = theiacov_clearlabs.pangolin_versions, + nextclade_clade = theiacov_clearlabs.nextclade_clade, + nextclade_aa_subs = theiacov_clearlabs.nextclade_aa_subs, + nextclade_aa_dels = theiacov_clearlabs.nextclade_aa_dels, + vadr_num_alerts = theiacov_clearlabs.vadr_num_alerts, + assembly_length_unambiguous = theiacov_clearlabs.assembly_length_unambiguous, + assembly_mean_coverage = theiacov_clearlabs.assembly_mean_coverage, + s_gene_mean_coverage = theiacov_clearlabs.s_gene_mean_coverage, + assembly_method = theiacov_clearlabs.assembly_method, + number_Degenerate = theiacov_clearlabs.number_Degenerate, + number_Total = theiacov_clearlabs.number_Total, + meanbaseq_trim = theiacov_clearlabs.meanbaseq_trim, + meanmapq_trim = theiacov_clearlabs.meanmapq_trim, + num_reads_clean1 = theiacov_clearlabs.num_reads_clean, + num_reads_raw1 = theiacov_clearlabs.num_reads_raw, + fastq_scan_version = theiacov_clearlabs.fastq_scan_version, + kraken_human = theiacov_clearlabs.kraken_human, + kraken_human_dehosted = theiacov_clearlabs.kraken_human_dehosted, + kraken_sc2 = theiacov_clearlabs.kraken_sc2, + kraken_sc2_dehosted = theiacov_clearlabs.kraken_sc2_dehosted, + artic_version = theiacov_clearlabs.artic_version, + artic_docker = theiacov_clearlabs.artic_docker, + medaka_reference = theiacov_clearlabs.medaka_reference, + kraken_version = theiacov_clearlabs.kraken_version, + nextclade_version = theiacov_clearlabs.nextclade_version, + nextclade_docker = theiacov_clearlabs.nextclade_docker, + samtools_version = theiacov_clearlabs.samtools_version, + vadr_docker = theiacov_clearlabs.vadr_docker + } + } + if (sample.theiacov_wf == "illumina_pe") { + call illumina_pe.theiacov_illumina_pe as theiacov_illumina_pe { + input: + samplename = sample.sample, + read1_raw = sample.r1, + read2_raw = sample.r2, + primer_bed = sample.primers + } + call summary.theiacov_summary as illumina_pe_summary { + input: + samplename = sample.sample, + theiacov_workflow = 'theiacov_illumina_pe', + theiacov_version = theiacov_illumina_pe.theiacov_illumina_pe_version, + theiacov_analysis_date = theiacov_illumina_pe.theiacov_illumina_pe_analysis_date, + seq_platform = theiacov_illumina_pe.seq_platform, + primer_bed_name = theiacov_illumina_pe.primer_bed_name, + percent_reference_coverage = theiacov_illumina_pe.percent_reference_coverage, + number_N = theiacov_illumina_pe.number_N, + 
pango_lineage = theiacov_illumina_pe.pango_lineage, + pangolin_conflicts = theiacov_illumina_pe.pangolin_conflicts, + pangolin_notes = theiacov_illumina_pe.pangolin_notes, + pangolin_assignment_version = theiacov_illumina_pe.pangolin_assignment_version, + pangolin_versions = theiacov_illumina_pe.pangolin_versions, + pangolin_docker = theiacov_illumina_pe.pangolin_docker, + nextclade_clade = theiacov_illumina_pe.nextclade_clade, + nextclade_aa_subs = theiacov_illumina_pe.nextclade_aa_subs, + nextclade_aa_dels = theiacov_illumina_pe.nextclade_aa_dels, + vadr_num_alerts = theiacov_illumina_pe.vadr_num_alerts, + assembly_length_unambiguous = theiacov_illumina_pe.assembly_length_unambiguous, + assembly_mean_coverage = theiacov_illumina_pe.assembly_mean_coverage, + s_gene_mean_coverage = theiacov_illumina_pe.s_gene_mean_coverage, + assembly_method = theiacov_illumina_pe.assembly_method, + number_Degenerate = theiacov_illumina_pe.number_Degenerate, + number_Total = theiacov_illumina_pe.number_Total, + meanbaseq_trim = theiacov_illumina_pe.meanbaseq_trim, + meanmapq_trim = theiacov_illumina_pe.meanmapq_trim, + num_reads_clean1 = theiacov_illumina_pe.num_reads_clean1, + num_reads_clean2 = theiacov_illumina_pe.num_reads_clean2, + num_reads_clean_pairs = theiacov_illumina_pe.num_reads_clean_pairs, + num_reads_raw1 = theiacov_illumina_pe.num_reads_raw1, + num_reads_raw2 = theiacov_illumina_pe.num_reads_raw2, + num_reads_raw_pairs = theiacov_illumina_pe.num_reads_raw_pairs, + kraken_human = theiacov_illumina_pe.kraken_human, + kraken_human_dehosted = theiacov_illumina_pe.kraken_human_dehosted, + kraken_sc2 = theiacov_illumina_pe.kraken_sc2, + kraken_sc2_dehosted = theiacov_illumina_pe.kraken_sc2_dehosted, + primer_trimmed_read_percent = theiacov_illumina_pe.primer_trimmed_read_percent, + bbduk_docker = theiacov_illumina_pe.bbduk_docker, + bwa_version = theiacov_illumina_pe.bwa_version, + fastq_scan_version = theiacov_illumina_pe.fastq_scan_version, + ivar_variant_version = theiacov_illumina_pe.ivar_variant_version, + ivar_version_consensus = theiacov_illumina_pe.ivar_version_consensus, + ivar_version_primtrim = theiacov_illumina_pe.ivar_version_primtrim, + kraken_version = theiacov_illumina_pe.kraken_version, + nextclade_version = theiacov_illumina_pe.nextclade_version, + nextclade_docker = theiacov_illumina_pe.nextclade_docker, + samtools_version = theiacov_illumina_pe.samtools_version, + samtools_version_consensus = theiacov_illumina_pe.samtools_version_consensus, + samtools_version_primtrim = theiacov_illumina_pe.samtools_version_primtrim, + samtools_version_stats = theiacov_illumina_pe.samtools_version_stats, + trimmomatic_version = theiacov_illumina_pe.trimmomatic_version, + vadr_docker = theiacov_illumina_pe.vadr_docker + } + } + if (sample.theiacov_wf == "illumina_se") { + call illumina_se.theiacov_illumina_se as theiacov_illumina_se { + input: + samplename = sample.sample, + read1_raw = sample.r1, + primer_bed = sample.primers + } + call summary.theiacov_summary as illumina_se_summary { + input: + samplename = sample.sample, + theiacov_workflow = 'theiacov_illumina_se', + theiacov_version = theiacov_illumina_se.theiacov_illumina_se_version, + theiacov_analysis_date = theiacov_illumina_se.theiacov_illumina_se_analysis_date, + seq_platform = theiacov_illumina_se.seq_platform, + primer_bed_name = theiacov_illumina_se.primer_bed_name, + percent_reference_coverage = theiacov_illumina_se.percent_reference_coverage, + number_N = theiacov_illumina_se.number_N, + pango_lineage = 
theiacov_illumina_se.pango_lineage,
+ pangolin_conflicts = theiacov_illumina_se.pangolin_conflicts,
+ pangolin_notes = theiacov_illumina_se.pangolin_notes,
+ pangolin_versions = theiacov_illumina_se.pangolin_versions,
+ pangolin_assignment_version = theiacov_illumina_se.pangolin_assignment_version,
+ pangolin_docker = theiacov_illumina_se.pangolin_docker,
+ nextclade_clade = theiacov_illumina_se.nextclade_clade,
+ nextclade_aa_subs = theiacov_illumina_se.nextclade_aa_subs,
+ nextclade_aa_dels = theiacov_illumina_se.nextclade_aa_dels,
+ vadr_num_alerts = theiacov_illumina_se.vadr_num_alerts,
+ assembly_length_unambiguous = theiacov_illumina_se.assembly_length_unambiguous,
+ assembly_mean_coverage = theiacov_illumina_se.assembly_mean_coverage,
+ s_gene_mean_coverage = theiacov_illumina_se.s_gene_mean_coverage,
+ assembly_method = theiacov_illumina_se.assembly_method,
+ number_Degenerate = theiacov_illumina_se.number_Degenerate,
+ number_Total = theiacov_illumina_se.number_Total,
+ meanbaseq_trim = theiacov_illumina_se.meanbaseq_trim,
+ meanmapq_trim = theiacov_illumina_se.meanmapq_trim,
+ num_reads_clean1 = theiacov_illumina_se.num_reads_clean,
+ num_reads_raw1 = theiacov_illumina_se.num_reads_raw,
+ fastq_scan_version = theiacov_illumina_se.fastq_scan_version,
+ kraken_human = theiacov_illumina_se.kraken_human,
+ kraken_sc2 = theiacov_illumina_se.kraken_sc2,
+ primer_trimmed_read_percent = theiacov_illumina_se.primer_trimmed_read_percent,
+ bbduk_docker = theiacov_illumina_se.bbduk_docker,
+ bwa_version = theiacov_illumina_se.bwa_version,
+ ivar_variant_version = theiacov_illumina_se.ivar_variant_version,
+ ivar_version_consensus = theiacov_illumina_se.ivar_version_consensus,
+ ivar_version_primtrim = theiacov_illumina_se.ivar_version_primtrim,
+ kraken_version = theiacov_illumina_se.kraken_version,
+ nextclade_version = theiacov_illumina_se.nextclade_version,
+ nextclade_docker = theiacov_illumina_se.nextclade_docker,
+ samtools_version = theiacov_illumina_se.samtools_version,
+ samtools_version_consensus = theiacov_illumina_se.samtools_version_consensus,
+ samtools_version_primtrim = theiacov_illumina_se.samtools_version_primtrim,
+ samtools_version_stats = theiacov_illumina_se.samtools_version_stats,
+ trimmomatic_version = theiacov_illumina_se.trimmomatic_version,
+ vadr_docker = theiacov_illumina_se.vadr_docker
+ }
+ }
+ if (sample.theiacov_wf == "ont") {
+ call ont.theiacov_ont as theiacov_ont {
+ input:
+ samplename = sample.sample,
+ demultiplexed_reads = sample.r1,
+ primer_bed = sample.primers
+ }
+ call summary.theiacov_summary as ont_summary {
+ input:
+ samplename = sample.sample,
+ theiacov_workflow = 'theiacov_ont',
+ theiacov_version = theiacov_ont.theiacov_ont_version,
+ theiacov_analysis_date = theiacov_ont.theiacov_ont_analysis_date,
+ seq_platform = theiacov_ont.seq_platform,
+ primer_bed_name = theiacov_ont.primer_bed_name,
+ percent_reference_coverage = theiacov_ont.percent_reference_coverage,
+ number_N = theiacov_ont.number_N,
+ pango_lineage = theiacov_ont.pango_lineage,
+ pangolin_conflicts = theiacov_ont.pangolin_conflicts,
+ pangolin_notes = theiacov_ont.pangolin_notes,
+ pangolin_versions = theiacov_ont.pangolin_versions,
+ pangolin_assignment_version = theiacov_ont.pangolin_assignment_version,
+ pangolin_docker = theiacov_ont.pangolin_docker,
+ nextclade_clade = theiacov_ont.nextclade_clade,
+ nextclade_aa_subs = theiacov_ont.nextclade_aa_subs,
+ nextclade_aa_dels = theiacov_ont.nextclade_aa_dels,
+ vadr_num_alerts = theiacov_ont.vadr_num_alerts, + assembly_length_unambiguous = theiacov_ont.assembly_length_unambiguous, + assembly_mean_coverage = theiacov_ont.assembly_mean_coverage, + s_gene_mean_coverage = theiacov_ont.s_gene_mean_coverage, + assembly_method = theiacov_ont.assembly_method, + number_Degenerate = theiacov_ont.number_Degenerate, + number_Total = theiacov_ont.number_Total, + meanbaseq_trim = theiacov_ont.meanbaseq_trim, + meanmapq_trim = theiacov_ont.meanmapq_trim, + num_reads_clean1 = theiacov_ont.num_reads_clean, + num_reads_raw1 = theiacov_ont.num_reads_raw, + fastq_scan_version = theiacov_ont.fastq_scan_version, + kraken_human = theiacov_ont.kraken_human, + kraken_human_dehosted = theiacov_ont.kraken_human_dehosted, + kraken_sc2 = theiacov_ont.kraken_sc2, + kraken_sc2_dehosted = theiacov_ont.kraken_sc2_dehosted, + artic_version = theiacov_ont.artic_version, + artic_docker = theiacov_ont.artic_docker, + medaka_reference = theiacov_ont.medaka_reference, + kraken_version = theiacov_ont.kraken_version, + nextclade_version = theiacov_ont.nextclade_version, + nextclade_docker = theiacov_ont.nextclade_docker, + samtools_version = theiacov_ont.samtools_version, + vadr_docker = theiacov_ont.vadr_docker + } + } + } + call summary.merge_theiacov_summary { + input: + clearlabs_summaries = clearlabs_summary.summary, + illumina_pe_summaries = illumina_pe_summary.summary, + illumina_se_summaries = illumina_se_summary.summary, + ont_summaries = ont_summary.summary + } + output { + # TheiaCoV outputs + File summaries_tsv = merge_theiacov_summary.summaries_tsv + File summaries_json = merge_theiacov_summary.summaries_json + Array[File] reads_dehosted = flatten([ + select_all(theiacov_clearlabs.reads_dehosted), select_all(theiacov_illumina_pe.read1_dehosted), + select_all(theiacov_illumina_pe.read2_dehosted), select_all(theiacov_ont.reads_dehosted) + ]) + Array[File] aligned_bam = flatten([ + select_all(theiacov_clearlabs.aligned_bam), select_all(theiacov_illumina_pe.aligned_bam), + select_all(theiacov_illumina_se.aligned_bam), select_all(theiacov_ont.aligned_bam) + ]) + Array[File] aligned_bai = flatten([ + select_all(theiacov_clearlabs.aligned_bai), select_all(theiacov_illumina_pe.aligned_bai), + select_all(theiacov_illumina_se.aligned_bai), select_all(theiacov_ont.aligned_bai) + ]) + Array[File] assembly_fasta = flatten([ + select_all(theiacov_clearlabs.assembly_fasta), select_all(theiacov_illumina_pe.assembly_fasta), + select_all(theiacov_illumina_se.assembly_fasta), select_all(theiacov_ont.assembly_fasta) + ]) + Array[File] consensus_stats = flatten([ + select_all(theiacov_clearlabs.consensus_stats), select_all(theiacov_illumina_pe.consensus_stats), + select_all(theiacov_illumina_se.consensus_stats), select_all(theiacov_ont.consensus_stats) + ]) + Array[File] consensus_flagstat = flatten([ + select_all(theiacov_clearlabs.consensus_flagstat), select_all(theiacov_illumina_pe.consensus_flagstat), + select_all(theiacov_illumina_se.consensus_flagstat), select_all(theiacov_ont.consensus_flagstat) + ]) + Array[File] consensus_variants = flatten([ + select_all(theiacov_clearlabs.variants_from_ref_vcf), select_all(theiacov_illumina_pe.ivar_vcf), + select_all(theiacov_illumina_se.ivar_vcf), select_all(theiacov_ont.variants_from_ref_vcf) + ]) + Array[File] pango_lineage_report = flatten([ + select_all(theiacov_clearlabs.pango_lineage_report), select_all(theiacov_illumina_pe.pango_lineage_report), + select_all(theiacov_illumina_se.pango_lineage_report), 
select_all(theiacov_ont.pango_lineage_report) + ]) + Array[File] nextclade_json = flatten([ + select_all(theiacov_clearlabs.nextclade_json), select_all(theiacov_illumina_pe.nextclade_json), + select_all(theiacov_illumina_se.nextclade_json), select_all(theiacov_ont.nextclade_json) + ]) + Array[File] auspice_json = flatten([ + select_all(theiacov_clearlabs.auspice_json), select_all(theiacov_illumina_pe.auspice_json), + select_all(theiacov_illumina_se.auspice_json), select_all(theiacov_ont.auspice_json) + ]) + Array[File] nextclade_tsv = flatten([ + select_all(theiacov_clearlabs.nextclade_tsv), select_all(theiacov_illumina_pe.nextclade_tsv), + select_all(theiacov_illumina_se.nextclade_tsv), select_all(theiacov_ont.nextclade_tsv) + ]) + Array[File] vadr_alerts_list = flatten([ + select_all(theiacov_clearlabs.vadr_alerts_list), select_all(theiacov_illumina_pe.vadr_alerts_list), + select_all(theiacov_illumina_se.vadr_alerts_list), select_all(theiacov_ont.vadr_alerts_list) + ]) + Array[File] kraken_report = flatten([ + select_all(theiacov_clearlabs.kraken_report), select_all(theiacov_illumina_pe.kraken_report), + select_all(theiacov_illumina_se.kraken_report), select_all(theiacov_ont.kraken_report) + ]) + Array[File] kraken_report_dehosted = flatten([ + select_all(theiacov_clearlabs.kraken_report_dehosted), select_all(theiacov_illumina_pe.kraken_report_dehosted), + select_all(theiacov_ont.kraken_report_dehosted) + ]) + Array[File] json_summary = flatten([ + select_all(clearlabs_summary.summary), select_all(illumina_pe_summary.summary), + select_all(illumina_se_summary.summary), select_all(ont_summary.summary) + ]) + } +} diff --git a/workflows/wf_theiacov_illumina_pe.wdl b/workflows/wf_theiacov_illumina_pe.wdl new file mode 100644 index 00000000..9ddb7005 --- /dev/null +++ b/workflows/wf_theiacov_illumina_pe.wdl @@ -0,0 +1,175 @@ +version 1.0 + +import "wf_read_QC_trim.wdl" as read_qc +import "../tasks/task_alignment.wdl" as align +import "../tasks/task_consensus_call.wdl" as consensus_call +import "../tasks/task_assembly_metrics.wdl" as assembly_metrics +import "../tasks/task_taxonID.wdl" as taxon_ID +import "../tasks/task_ncbi.wdl" as ncbi +import "../tasks/task_versioning.wdl" as versioning +import "../tasks/task_qc_utils.wdl" as qc_utils + +workflow theiacov_illumina_pe { + meta { + description: "Reference-based consensus calling for viral amplicon sequencing data" + } + input { + String samplename + String seq_method = "ILLUMINA" + File read1_raw + File read2_raw + File primer_bed + String nextclade_dataset_name = "sars-cov-2" + String nextclade_dataset_reference = "MN908947" + String nextclade_dataset_tag = "2022-02-07T12:00:00Z" + File? 
reference_genome + } + call read_qc.read_QC_trim { + input: + samplename = samplename, + read1_raw = read1_raw, + read2_raw = read2_raw + } + call align.bwa { + input: + samplename = samplename, + read1 = read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean, + reference_genome = reference_genome + } + call consensus_call.primer_trim { + input: + samplename = samplename, + primer_bed = primer_bed, + bamfile = bwa.sorted_bam + } + call consensus_call.variant_call { + input: + samplename = samplename, + bamfile = primer_trim.trim_sorted_bam, + reference_genome = reference_genome + } + call consensus_call.consensus { + input: + samplename = samplename, + bamfile = primer_trim.trim_sorted_bam, + reference_genome = reference_genome + } + call qc_utils.consensus_qc { + input: + assembly_fasta = consensus.consensus_seq + } + call assembly_metrics.stats_n_coverage { + input: + samplename = samplename, + bamfile = bwa.sorted_bam + } + call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { + input: + samplename = samplename, + bamfile = primer_trim.trim_sorted_bam + } + call taxon_ID.pangolin3 { + input: + samplename = samplename, + fasta = consensus.consensus_seq + } + call taxon_ID.nextclade_one_sample { + input: + genome_fasta = consensus.consensus_seq, + dataset_name = nextclade_dataset_name, + dataset_reference = nextclade_dataset_reference, + dataset_tag = nextclade_dataset_tag + } + call taxon_ID.nextclade_output_parser_one_sample { + input: + nextclade_tsv = nextclade_one_sample.nextclade_tsv + } + call ncbi.vadr { + input: + genome_fasta = consensus.consensus_seq, + assembly_length_unambiguous = consensus_qc.number_ATCG + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String theiacov_illumina_pe_version = version_capture.phvg_version + String theiacov_illumina_pe_analysis_date = version_capture.date + # Read Metadata + String seq_platform = seq_method + # Read QC + File read1_dehosted = read_QC_trim.read1_dehosted + File read2_dehosted = read_QC_trim.read2_dehosted + File read1_clean = read_QC_trim.read1_clean + File read2_clean = read_QC_trim.read2_clean + Int num_reads_raw1 = read_QC_trim.fastq_scan_raw1 + Int? num_reads_raw2 = read_QC_trim.fastq_scan_raw2 + String? 
num_reads_raw_pairs = read_QC_trim.fastq_scan_raw_pairs + String fastq_scan_version = read_QC_trim.fastq_scan_version + Int num_reads_clean1 = read_QC_trim.fastq_scan_clean1 + Int num_reads_clean2 = read_QC_trim.fastq_scan_clean2 + String num_reads_clean_pairs = read_QC_trim.fastq_scan_clean_pairs + String trimmomatic_version = read_QC_trim.trimmomatic_version + String bbduk_docker = read_QC_trim.bbduk_docker + String kraken_version = read_QC_trim.kraken_version + Float kraken_human = read_QC_trim.kraken_human + Float kraken_sc2 = read_QC_trim.kraken_sc2 + String kraken_report = read_QC_trim.kraken_report + Float kraken_human_dehosted = read_QC_trim.kraken_human_dehosted + Float kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted + String kraken_report_dehosted = read_QC_trim.kraken_report_dehosted + # Read Alignment + String bwa_version = bwa.bwa_version + String samtools_version = bwa.sam_version + String assembly_method = "~{bwa.bwa_version}; ~{primer_trim.ivar_version}" + File aligned_bam = primer_trim.trim_sorted_bam + File aligned_bai = primer_trim.trim_sorted_bai + Float primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent + String ivar_version_primtrim = primer_trim.ivar_version + String samtools_version_primtrim = primer_trim.samtools_version + String primer_bed_name = primer_trim.primer_bed_name + File ivar_tsv = variant_call.sample_variants_tsv + File ivar_vcf = variant_call.sample_variants_vcf + String ivar_variant_version = variant_call.ivar_version + # Assembly QC + File assembly_fasta = consensus.consensus_seq + String ivar_version_consensus = consensus.ivar_version + String samtools_version_consensus = consensus.samtools_version + Int number_N = consensus_qc.number_N + Int assembly_length_unambiguous = consensus_qc.number_ATCG + Int number_Degenerate = consensus_qc.number_Degenerate + Int number_Total = consensus_qc.number_Total + Float percent_reference_coverage = consensus_qc.percent_reference_coverage + # Alignment QC + File consensus_stats = stats_n_coverage.stats + File consensus_flagstat = stats_n_coverage.flagstat + Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq + Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq + Float assembly_mean_coverage = stats_n_coverage_primtrim.depth + Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth + String samtools_version_stats = stats_n_coverage.samtools_version + # Lineage Assignment + String pango_lineage = pangolin3.pangolin_lineage + String pangolin_conflicts = pangolin3.pangolin_conflicts + String pangolin_notes = pangolin3.pangolin_notes + String pangolin_assignment_version = pangolin3.pangolin_assignment_version + File pango_lineage_report = pangolin3.pango_lineage_report + String pangolin_docker = pangolin3.pangolin_docker + String pangolin_versions = pangolin3.pangolin_versions + # Clade Assigment + File nextclade_json = nextclade_one_sample.nextclade_json + File auspice_json = nextclade_one_sample.auspice_json + File nextclade_tsv = nextclade_one_sample.nextclade_tsv + String nextclade_version = nextclade_one_sample.nextclade_version + String nextclade_docker = nextclade_one_sample.nextclade_docker + String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs + String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels + String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade + # VADR Annotation QC + File? 
vadr_alerts_list = vadr.alerts_list + String vadr_num_alerts = vadr.num_alerts + String vadr_docker = vadr.vadr_docker + } +} diff --git a/workflows/wf_theiacov_illumina_se.wdl b/workflows/wf_theiacov_illumina_se.wdl new file mode 100644 index 00000000..b4e63904 --- /dev/null +++ b/workflows/wf_theiacov_illumina_se.wdl @@ -0,0 +1,165 @@ +version 1.0 + +import "wf_read_QC_trim_se.wdl" as read_qc +import "../tasks/task_alignment.wdl" as align +import "../tasks/task_consensus_call.wdl" as consensus_call +import "../tasks/task_assembly_metrics.wdl" as assembly_metrics +import "../tasks/task_taxonID.wdl" as taxon_ID +import "../tasks/task_ncbi.wdl" as ncbi +import "../tasks/task_versioning.wdl" as versioning +import "../tasks/task_qc_utils.wdl" as qc_utils + +workflow theiacov_illumina_se { + meta { + description: "Reference-based consensus calling for viral amplicon sequencing data" + } + input { + String samplename + String seq_method = "ILLUMINA" + File read1_raw + File primer_bed + String nextclade_dataset_name = "sars-cov-2" + String nextclade_dataset_reference = "MN908947" + String nextclade_dataset_tag = "2022-02-07T12:00:00Z" + File? reference_genome + } + call read_qc.read_QC_trim { + input: + samplename = samplename, + read1_raw = read1_raw + } + call align.bwa { + input: + samplename = samplename, + read1 = read_QC_trim.read1_clean, + reference_genome = reference_genome + } + call consensus_call.primer_trim { + input: + samplename = samplename, + primer_bed = primer_bed, + bamfile = bwa.sorted_bam + } + call consensus_call.variant_call { + input: + samplename = samplename, + bamfile = primer_trim.trim_sorted_bam, + reference_genome = reference_genome + } + call consensus_call.consensus { + input: + samplename = samplename, + bamfile = primer_trim.trim_sorted_bam, + reference_genome = reference_genome + } + call qc_utils.consensus_qc { + input: + assembly_fasta = consensus.consensus_seq + } + call assembly_metrics.stats_n_coverage { + input: + samplename = samplename, + bamfile = bwa.sorted_bam + } + call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { + input: + samplename = samplename, + bamfile = primer_trim.trim_sorted_bam + } + call taxon_ID.pangolin3 { + input: + samplename = samplename, + fasta = consensus.consensus_seq + } + call taxon_ID.nextclade_one_sample { + input: + genome_fasta = consensus.consensus_seq, + dataset_name = nextclade_dataset_name, + dataset_reference = nextclade_dataset_reference, + dataset_tag = nextclade_dataset_tag + } + call taxon_ID.nextclade_output_parser_one_sample { + input: + nextclade_tsv = nextclade_one_sample.nextclade_tsv + } + call ncbi.vadr { + input: + genome_fasta = consensus.consensus_seq, + assembly_length_unambiguous = consensus_qc.number_ATCG + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String theiacov_illumina_se_version = version_capture.phvg_version + String theiacov_illumina_se_analysis_date = version_capture.date + # Read Metadata + String seq_platform = seq_method + # Read QC + File read1_clean = read_QC_trim.read1_clean + Int num_reads_raw = read_QC_trim.fastq_scan_number_reads + String fastq_scan_version = read_QC_trim.fastq_scan_version + Int num_reads_clean = read_QC_trim.fastq_scan_clean_number_reads + String trimmomatic_version = read_QC_trim.trimmomatic_version + String bbduk_docker = read_QC_trim.bbduk_docker + Float kraken_human = read_QC_trim.kraken_human + Float kraken_sc2 = read_QC_trim.kraken_sc2 + String kraken_version = read_QC_trim.kraken_version + String 
kraken_report = read_QC_trim.kraken_report +# Float kraken_human_dehosted = read_QC_trim.kraken_human_dehosted +# Float kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted +# String kraken_report_dehosted = read_QC_trim.kraken_report_dehosted + # Read Alignment + String bwa_version = bwa.bwa_version + String samtools_version = bwa.sam_version + String assembly_method = "~{bwa.bwa_version}; ~{primer_trim.ivar_version}" + File aligned_bam = primer_trim.trim_sorted_bam + File aligned_bai = primer_trim.trim_sorted_bai + Float primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent + String ivar_version_primtrim = primer_trim.ivar_version + String samtools_version_primtrim = primer_trim.samtools_version + String primer_bed_name = primer_trim.primer_bed_name + File ivar_tsv = variant_call.sample_variants_tsv + File ivar_vcf = variant_call.sample_variants_vcf + String ivar_variant_version = variant_call.ivar_version + # Assembly QC + File assembly_fasta = consensus.consensus_seq + String ivar_version_consensus = consensus.ivar_version + String samtools_version_consensus = consensus.samtools_version + Int number_N = consensus_qc.number_N + Int assembly_length_unambiguous = consensus_qc.number_ATCG + Int number_Degenerate = consensus_qc.number_Degenerate + Int number_Total = consensus_qc.number_Total + Float percent_reference_coverage = consensus_qc.percent_reference_coverage + # Alignment QC + File consensus_stats = stats_n_coverage.stats + File consensus_flagstat = stats_n_coverage.flagstat + Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq + Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq + Float assembly_mean_coverage = stats_n_coverage_primtrim.depth + Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth + String samtools_version_stats = stats_n_coverage.samtools_version + # Lineage Assignment + String pango_lineage = pangolin3.pangolin_lineage + String pangolin_conflicts = pangolin3.pangolin_conflicts + String pangolin_notes = pangolin3.pangolin_notes + String pangolin_assignment_version = pangolin3.pangolin_assignment_version + File pango_lineage_report = pangolin3.pango_lineage_report + String pangolin_docker = pangolin3.pangolin_docker + String pangolin_versions = pangolin3.pangolin_versions + # Clade Assigment + File nextclade_json = nextclade_one_sample.nextclade_json + File auspice_json = nextclade_one_sample.auspice_json + File nextclade_tsv = nextclade_one_sample.nextclade_tsv + String nextclade_version = nextclade_one_sample.nextclade_version + String nextclade_docker = nextclade_one_sample.nextclade_docker + String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs + String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels + String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade + # VADR Annotation QC + File? 
vadr_alerts_list = vadr.alerts_list + String vadr_num_alerts = vadr.num_alerts + String vadr_docker = vadr.vadr_docker + } +} diff --git a/workflows/wf_theiacov_ont.wdl b/workflows/wf_theiacov_ont.wdl new file mode 100644 index 00000000..fd6b1a53 --- /dev/null +++ b/workflows/wf_theiacov_ont.wdl @@ -0,0 +1,166 @@ +version 1.0 + +import "../tasks/task_ont_medaka.wdl" as medaka +import "../tasks/task_assembly_metrics.wdl" as assembly_metrics +import "../tasks/task_taxonID.wdl" as taxon_ID +import "../tasks/task_ncbi.wdl" as ncbi +import "../tasks/task_read_clean.wdl" as read_clean +import "../tasks/task_qc_utils.wdl" as qc_utils +import "../tasks/task_versioning.wdl" as versioning + +workflow theiacov_ont { + meta { + description: "Reference-based consensus calling for viral amplicon ont sequencing data generated on ONT NGS platforms." + } + input { + String samplename + String seq_method = "OXFORD_NANOPORE" + File primer_bed + File demultiplexed_reads + Int? normalise = 200 + String nextclade_dataset_name = "sars-cov-2" + String nextclade_dataset_reference = "MN908947" + String nextclade_dataset_tag = "2022-02-07T12:00:00Z" + Int? max_length = 700 + Int? min_length = 400 + } + call qc_utils.fastq_scan_se as fastq_scan_raw_reads { + input: + read1 = demultiplexed_reads + } + call read_clean.ncbi_scrub_se { + input: + samplename = samplename, + read1 = demultiplexed_reads + } + call medaka.read_filtering { + input: + demultiplexed_reads = ncbi_scrub_se.read1_dehosted, + samplename = samplename, + min_length = min_length, + max_length = max_length + } + call qc_utils.fastq_scan_se as fastq_scan_clean_reads { + input: + read1 = read_filtering.filtered_reads + } + call taxon_ID.kraken2 as kraken2_dehosted { + input: + samplename = samplename, + read1 = ncbi_scrub_se.read1_dehosted + } + call medaka.consensus { + input: + samplename = samplename, + filtered_reads = read_filtering.filtered_reads, + primer_bed = primer_bed, + normalise = normalise + } + call qc_utils.consensus_qc { + input: + assembly_fasta = consensus.consensus_seq + } + call assembly_metrics.stats_n_coverage { + input: + samplename = samplename, + bamfile = consensus.sorted_bam + } + call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { + input: + samplename = samplename, + bamfile = consensus.trim_sorted_bam + } + call taxon_ID.pangolin3 { + input: + samplename = samplename, + fasta = consensus.consensus_seq + } + call taxon_ID.kraken2 as kraken2_raw { + input: + samplename = samplename, + read1 = demultiplexed_reads + } + call taxon_ID.nextclade_one_sample { + input: + genome_fasta = consensus.consensus_seq, + dataset_name = nextclade_dataset_name, + dataset_reference = nextclade_dataset_reference, + dataset_tag = nextclade_dataset_tag + } + call taxon_ID.nextclade_output_parser_one_sample { + input: + nextclade_tsv = nextclade_one_sample.nextclade_tsv + } + call ncbi.vadr { + input: + genome_fasta = consensus.consensus_seq, + assembly_length_unambiguous = consensus_qc.number_ATCG + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String theiacov_ont_version = version_capture.phvg_version + String theiacov_ont_analysis_date = version_capture.date + # Read Metadata + String seq_platform = seq_method + # Read QC + File reads_dehosted = ncbi_scrub_se.read1_dehosted + Int num_reads_raw = fastq_scan_raw_reads.read1_seq + Int num_reads_clean = fastq_scan_clean_reads.read1_seq + String fastq_scan_version = fastq_scan_clean_reads.version + String kraken_version = kraken2_raw.version + 
Float kraken_human = kraken2_raw.percent_human + Float kraken_sc2 = kraken2_raw.percent_sc2 + String kraken_report = kraken2_raw.kraken_report + Float kraken_human_dehosted = kraken2_dehosted.percent_human + Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 + String kraken_report_dehosted = kraken2_dehosted.kraken_report + # Read Alignment + File aligned_bam = consensus.trim_sorted_bam + File aligned_bai = consensus.trim_sorted_bai + File variants_from_ref_vcf = consensus.medaka_pass_vcf + String artic_version = consensus.artic_pipeline_version + String artic_docker = consensus.artic_pipeline_docker + String medaka_reference = consensus.medaka_reference + String primer_bed_name = consensus.primer_bed_name + File assembly_fasta = consensus.consensus_seq + String assembly_method = consensus.artic_pipeline_version + # Assembly QC + Int number_N = consensus_qc.number_N + Int assembly_length_unambiguous = consensus_qc.number_ATCG + Int number_Degenerate = consensus_qc.number_Degenerate + Int number_Total = consensus_qc.number_Total + Float percent_reference_coverage = consensus_qc.percent_reference_coverage + # Alignment QC + File consensus_stats = stats_n_coverage.stats + File consensus_flagstat = stats_n_coverage.flagstat + Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq + Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq + Float assembly_mean_coverage = stats_n_coverage_primtrim.depth + Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth + String samtools_version = stats_n_coverage.samtools_version + # Lineage Assignment + String pango_lineage = pangolin3.pangolin_lineage + String pangolin_conflicts = pangolin3.pangolin_conflicts + String pangolin_notes = pangolin3.pangolin_notes + String pangolin_assignment_version = pangolin3.pangolin_assignment_version + File pango_lineage_report = pangolin3.pango_lineage_report + String pangolin_docker = pangolin3.pangolin_docker + String pangolin_versions = pangolin3.pangolin_versions + # Clade Assigment + File nextclade_json = nextclade_one_sample.nextclade_json + File auspice_json = nextclade_one_sample.auspice_json + File nextclade_tsv = nextclade_one_sample.nextclade_tsv + String nextclade_version = nextclade_one_sample.nextclade_version + String nextclade_docker = nextclade_one_sample.nextclade_docker + String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs + String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels + String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade + # VADR Annotation QC + File? 
vadr_alerts_list = vadr.alerts_list + String vadr_num_alerts = vadr.num_alerts + String vadr_docker = vadr.vadr_docker + } +} diff --git a/workflows/wf_theiacov_validate.wdl b/workflows/wf_theiacov_validate.wdl index 1e5eb9de..d80549ae 100644 --- a/workflows/wf_theiacov_validate.wdl +++ b/workflows/wf_theiacov_validate.wdl @@ -4,35 +4,35 @@ import "../tasks/task_validate.wdl" as validation import "../tasks/task_versioning.wdl" as versioning workflow theiacov_validate { - input { - String terra_project - String terra_workspace - String datatable1 - String datatable2 - String out_dir = "./" - String out_prefix = "VALIDATION" - } - call validation.export_two_tsvs { + input { + String terra_project + String terra_workspace + String datatable1 + String datatable2 + String out_dir = "./" + String out_prefix = "VALIDATION" + } + call validation.export_two_tsvs { input: terra_project = terra_project, terra_workspace = terra_workspace, datatable1 = datatable1, datatable2 = datatable2 - } - call validation.compare_two_tsvs { + } + call validation.compare_two_tsvs { input: datatable1_tsv = export_two_tsvs.datatable1_tsv, datatable2_tsv = export_two_tsvs.datatable2_tsv, out_dir = out_dir, out_prefix = out_prefix - } - call versioning.version_capture{ - input: - } - output { - String theiacov_validation_version = version_capture.phvg_version - String theiacov_validation_date = version_capture.date - File theiacov_validation_report_pdf = compare_two_tsvs.pdf_report - File theiacov_validation_report_xl = compare_two_tsvs.xl_report - } + } + call versioning.version_capture { + input: + } + output { + String theiacov_validation_version = version_capture.phvg_version + String theiacov_validation_date = version_capture.date + File theiacov_validation_report_pdf = compare_two_tsvs.pdf_report + File theiacov_validation_report_xl = compare_two_tsvs.xl_report + } } diff --git a/workflows/wf_theiacov_wwvc.wdl b/workflows/wf_theiacov_wwvc.wdl new file mode 100644 index 00000000..7e77bd3e --- /dev/null +++ b/workflows/wf_theiacov_wwvc.wdl @@ -0,0 +1,77 @@ +version 1.0 + +import "wf_read_QC_trim.wdl" as read_qc +import "../tasks/task_alignment.wdl" as align +import "../tasks/task_consensus_call.wdl" as consensus_call +import "../tasks/task_versioning.wdl" as versioning +import "../workflows/wf_WasteWaterVariantCalling_modified.wdl" as wastewater + +workflow theiacov_illumina_wwvc { + meta { + description: "Reference-based consensus calling for viral amplicon sequencing data" + } + input { + Array[String] samplename + Array[File] read1_raw + Array[File] read2_raw + File primer_bed + File reference_genome + File spike_bed + File spike_annotations + Int trimmomatic_minlen = 25 + } + scatter (r1_r2 in zip(read1_raw, read2_raw)) { + call read_qc.read_QC_trim { + input: + samplename = "wastewater_sample", + read1_raw = r1_r2.left, + read2_raw = r1_r2.right, + trimmomatic_minlen = trimmomatic_minlen + } + call align.bwa { + input: + samplename = "wastewater_sample", + read1 = read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean + } + call consensus_call.primer_trim { + input: + samplename = "wastewater_sample", + primer_bed = primer_bed, + bamfile = bwa.sorted_bam + } + } + call wastewater.WasteWaterVariantCalling{ + input: + sorted_bam = primer_trim.trim_sorted_bam, + covid_genome = reference_genome, + spike_bed = spike_bed, + spike_annotations = spike_annotations, + sample_id = samplename + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String theiacov_wwvc_version = 
version_capture.phvg_version + String theiacov_wwcv_date = version_capture.date + # Waste Water Variant Calling + Array[File] addrg_bam = WasteWaterVariantCalling.addrg_bam + Array[File] variants = WasteWaterVariantCalling.variants + Array[File] sorted_vcf = WasteWaterVariantCalling.sorted_vcf + Array[File] sample_spike_vcf = WasteWaterVariantCalling.sample_spike_vcf + Array[File] sample_spike_tsv = WasteWaterVariantCalling.sample_spike_tsv + Array[File] sample_spike_tsv_summary = WasteWaterVariantCalling.sample_spike_tsv_summary + Array[File] sample_spike_tsv_dash = WasteWaterVariantCalling.sample_spike_tsv_dash + Array[File] fill_NA_tsv = WasteWaterVariantCalling.fill_NA_tsv + Array[File] allele_freq_tsv = WasteWaterVariantCalling.allele_freq_tsv + Array[File] reformat_tsv_tsv = WasteWaterVariantCalling.reformat_tsv_tsv + Array[File] sample_spike_tsv_counts = WasteWaterVariantCalling.sample_spike_tsv_counts + Array[File] alignment_files = primer_trim.trim_sorted_bam + File spike_summary_temp = WasteWaterVariantCalling.spike_summary_temp + File spike_summary = WasteWaterVariantCalling.spike_summary + File spike_dashboard = WasteWaterVariantCalling.spike_dashboard + File spike_counts = WasteWaterVariantCalling.spike_counts + } +} diff --git a/workflows/wf_titan_augur_prep.wdl b/workflows/wf_titan_augur_prep.wdl deleted file mode 100644 index 57ca5d3c..00000000 --- a/workflows/wf_titan_augur_prep.wdl +++ /dev/null @@ -1,37 +0,0 @@ -version 1.0 - -import "../tasks/tasks_nextstrain.wdl" as nextstrain -import "../tasks/task_versioning.wdl" as versioning - -workflow titan_augur_prep { - input { - String assembly - String collection_date - String iso_country - String iso_state - String iso_continent - String? iso_county - String pango_lineage - - } - - call nextstrain.prep_augur_metadata { - input: - assembly=assembly, - collection_date = collection_date, - iso_country = iso_country, - iso_state = iso_state, - iso_continent = iso_continent, - iso_county = iso_county, - pango_lineage = pango_lineage - } - call versioning.version_capture{ - input: - } - output { - String titan_augur_run_version = version_capture.phvg_version - String titan_augur_run_analysis_date = version_capture.date - File augur_metadata = prep_augur_metadata.augur_metadata - - } -} diff --git a/workflows/wf_titan_augur_run.wdl b/workflows/wf_titan_augur_run.wdl deleted file mode 100644 index 28327560..00000000 --- a/workflows/wf_titan_augur_run.wdl +++ /dev/null @@ -1,62 +0,0 @@ -version 1.0 - -import "wf_sarscov2_nextstrain_modified.wdl" as augur -import "../tasks/task_phylo.wdl" as phylo -import "../tasks/task_versioning.wdl" as versioning - -workflow titan_augur_run { - meta { - description: "Workflow for SC2 cluster investigations. Titan_Augur_Run will run Augur without a subsampling module using a modified version of The Broad Institute's sarscov2_nextstrain WDL workflow to create an Auspice JSON file; output from the modified sarscov2_nextstrain workflow will also be used to infer SNP distances" - author: "Kevin G Libuit" - email: "kevin.libuit@theiagen.com" - } - - input { - Array[File]+ assembly_fastas - Array[File]+ sample_metadata_tsvs - String build_name - - } - - parameter_meta { - assembly_fastas: { - description: "Set of assembled genomes to align and build trees. These must represent a single chromosome/segment of a genome only. Fastas may be one-sequence-per-individual or a concatenated multi-fasta (unaligned) or a mixture of the two. 
They may be compressed (gz, bz2, zst, lz4), uncompressed, or a mixture.", - patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fasta.zst"] - } - sample_metadata_tsvs: { - description: "Tab-separated metadata file that contain binning variables and values. Must contain all samples: output will be filtered to the IDs present in this file.", - patterns: ["*.txt", "*.tsv"] - } - } - - call augur.sarscov2_nextstrain { - input: - assembly_fastas=assembly_fastas, - sample_metadata_tsvs=sample_metadata_tsvs, - build_name=build_name - } - - call phylo.snp_dists { - input: - cluster_name = build_name, - alignment = sarscov2_nextstrain.mafft_alignment - } - call versioning.version_capture{ - input: - } - output { - String titan_augur_run_version = version_capture.phvg_version - String titan_augur_run_analysis_date = version_capture.date - - File combined_assemblies = sarscov2_nextstrain.combined_assemblies - File MAFFT_alignment = sarscov2_nextstrain.mafft_alignment - File unmasked_snps = sarscov2_nextstrain.unmasked_snps - File metadata_merged = sarscov2_nextstrain.metadata_merged - File keep_list = sarscov2_nextstrain.keep_list - File distance_tree = sarscov2_nextstrain.ml_tree - File time_tree = sarscov2_nextstrain.time_tree - File auspice_input_json = sarscov2_nextstrain.auspice_input_json - - File snp_matrix = snp_dists.snp_matrix - } -} diff --git a/workflows/wf_titan_clearlabs.wdl b/workflows/wf_titan_clearlabs.wdl deleted file mode 100644 index 23662210..00000000 --- a/workflows/wf_titan_clearlabs.wdl +++ /dev/null @@ -1,158 +0,0 @@ -version 1.0 - -import "../tasks/task_ont_medaka.wdl" as medaka -import "../tasks/task_assembly_metrics.wdl" as assembly_metrics -import "../tasks/task_taxonID.wdl" as taxon_ID -import "../tasks/task_ncbi.wdl" as ncbi -import "../tasks/task_read_clean.wdl" as read_clean -import "../tasks/task_qc_utils.wdl" as qc_utils -import "../tasks/task_versioning.wdl" as versioning - -workflow titan_clearlabs { - meta { - description: "Reference-based consensus calling for viral amplicon ont sequencing data generated on the Clear Labs platform." - } - - input { - String samplename - File clear_lab_fastq - String seq_method = "OXFORD_NANOPORE" - File primer_bed - Int? 
normalise = 20000 - String nextclade_dataset_name = "sars-cov-2" - String nextclade_dataset_reference = "MN908947" - String nextclade_dataset_tag = "2022-01-18T12:00:00Z" - } - call qc_utils.fastq_scan_se as fastq_scan_raw_reads { - input: - read1 = clear_lab_fastq - } - call read_clean.ncbi_scrub_se { - input: - samplename = samplename, - read1 = clear_lab_fastq - } - call qc_utils.fastq_scan_se as fastq_scan_clean_reads { - input: - read1 = ncbi_scrub_se.read1_dehosted - } - call taxon_ID.kraken2 as kraken2_dehosted { - input: - samplename = samplename, - read1 = ncbi_scrub_se.read1_dehosted - } - call medaka.consensus { - input: - samplename = samplename, - filtered_reads = ncbi_scrub_se.read1_dehosted, - primer_bed = primer_bed, - normalise = normalise - } - call assembly_metrics.stats_n_coverage { - input: - samplename = samplename, - bamfile = consensus.sorted_bam - } - call qc_utils.consensus_qc { - input: - assembly_fasta = consensus.consensus_seq - } - call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { - input: - samplename = samplename, - bamfile = consensus.trim_sorted_bam - } - call taxon_ID.pangolin3 { - input: - samplename = samplename, - fasta = consensus.consensus_seq - } - call taxon_ID.kraken2 as kraken2_raw { - input: - samplename = samplename, - read1 = clear_lab_fastq - } - call taxon_ID.nextclade_one_sample { - input: - genome_fasta = consensus.consensus_seq, - dataset_name = nextclade_dataset_name, - dataset_reference = nextclade_dataset_reference, - dataset_tag = nextclade_dataset_tag - } - call taxon_ID.nextclade_output_parser_one_sample { - input: - nextclade_tsv = nextclade_one_sample.nextclade_tsv - } - call ncbi.vadr { - input: - genome_fasta = consensus.consensus_seq, - assembly_length_unambiguous = consensus_qc.number_ATCG - } - call versioning.version_capture{ - input: - } - output { - String titan_clearlabs_version = version_capture.phvg_version - String titan_clearlabs_analysis_date = version_capture.date - String seq_platform = seq_method - - File reads_dehosted = ncbi_scrub_se.read1_dehosted - - Int fastq_scan_raw = fastq_scan_raw_reads.read1_seq - Int fastq_scan_clean = fastq_scan_clean_reads.read1_seq - String fastq_scan_version = fastq_scan_raw_reads.version - - String kraken_version = kraken2_raw.version - Float kraken_human = kraken2_raw.percent_human - Float kraken_sc2 = kraken2_raw.percent_sc2 - String kraken_report = kraken2_raw.kraken_report - Float kraken_human_dehosted = kraken2_dehosted.percent_human - Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 - String kraken_report_dehosted = kraken2_dehosted.kraken_report - - File aligned_bam = consensus.trim_sorted_bam - File aligned_bai = consensus.trim_sorted_bai - File variants_from_ref_vcf = consensus.medaka_pass_vcf - String artic_version = consensus.artic_pipeline_version - String artic_docker = consensus.artic_pipeline_docker - String medaka_reference = consensus.medaka_reference - String primer_bed_name = consensus.primer_bed_name - File assembly_fasta = consensus.consensus_seq - String assembly_method = consensus.artic_pipeline_version - - Int number_N = consensus_qc.number_N - Int assembly_length_unambiguous = consensus_qc.number_ATCG - Int number_Degenerate = consensus_qc.number_Degenerate - Int number_Total = consensus_qc.number_Total - Float percent_reference_coverage = consensus_qc.percent_reference_coverage - - String pango_lineage = pangolin3.pangolin_lineage - String pangolin_conflicts = pangolin3.pangolin_conflicts - String pangolin_notes = 
pangolin3.pangolin_notes - String pangolin_assignment_version = pangolin3.pangolin_assignment_version - File pango_lineage_report = pangolin3.pango_lineage_report - String pangolin_docker = pangolin3.pangolin_docker - String pangolin_versions = pangolin3.pangolin_versions - - File consensus_stats = stats_n_coverage.stats - File consensus_flagstat = stats_n_coverage.flagstat - Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq - Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq - Float assembly_mean_coverage = stats_n_coverage_primtrim.depth - Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth - String samtools_version = stats_n_coverage.samtools_version - - File nextclade_json = nextclade_one_sample.nextclade_json - File auspice_json = nextclade_one_sample.auspice_json - File nextclade_tsv = nextclade_one_sample.nextclade_tsv - String nextclade_version = nextclade_one_sample.nextclade_version - String nextclade_docker = nextclade_one_sample.nextclade_docker - String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs - String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels - String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade - - File? vadr_alerts_list = vadr.alerts_list - String vadr_num_alerts = vadr.num_alerts - String vadr_docker = vadr.vadr_docker - } -} diff --git a/workflows/wf_titan_fasta.wdl b/workflows/wf_titan_fasta.wdl deleted file mode 100644 index 1a68e18f..00000000 --- a/workflows/wf_titan_fasta.wdl +++ /dev/null @@ -1,86 +0,0 @@ -version 1.0 - -import "../tasks/task_ont_medaka.wdl" as medaka -import "../tasks/task_assembly_metrics.wdl" as assembly_metrics -import "../tasks/task_taxonID.wdl" as taxon_ID -import "../tasks/task_ncbi.wdl" as ncbi -import "../tasks/task_read_clean.wdl" as read_clean -import "../tasks/task_qc_utils.wdl" as qc_utils -import "../tasks/task_versioning.wdl" as versioning - -workflow titan_fasta { - meta { - description: "Reference-based consensus calling for viral amplicon ont sequencing data generated on the Clear Labs platform." 
- } - - input { - String samplename - File assembly_fasta - String seq_method - String input_assembly_method - String nextclade_dataset_name = "sars-cov-2" - String nextclade_dataset_reference = "MN908947" - String nextclade_dataset_tag = "2022-01-18T12:00:00Z" - } - call qc_utils.consensus_qc { - input: - assembly_fasta = assembly_fasta - } - call taxon_ID.pangolin3 { - input: - samplename = samplename, - fasta = assembly_fasta - } - call taxon_ID.nextclade_one_sample { - input: - genome_fasta = assembly_fasta, - dataset_name = nextclade_dataset_name, - dataset_reference = nextclade_dataset_reference, - dataset_tag = nextclade_dataset_tag - } - call taxon_ID.nextclade_output_parser_one_sample { - input: - nextclade_tsv = nextclade_one_sample.nextclade_tsv - } - call ncbi.vadr { - input: - genome_fasta = assembly_fasta, - assembly_length_unambiguous = consensus_qc.number_ATCG - } - call versioning.version_capture{ - input: - } - output { - String titan_fasta_version = version_capture.phvg_version - String titan_fasta_analysis_date = version_capture.date - String seq_platform = seq_method - String assembly_method = input_assembly_method - - Int number_N = consensus_qc.number_N - Int assembly_length_unambiguous = consensus_qc.number_ATCG - Int number_Degenerate = consensus_qc.number_Degenerate - Int number_Total = consensus_qc.number_Total - Float percent_reference_coverage = consensus_qc.percent_reference_coverage - - String pango_lineage = pangolin3.pangolin_lineage - String pangolin_conflicts = pangolin3.pangolin_conflicts - String pangolin_notes = pangolin3.pangolin_notes - String pangolin_assignment_version = pangolin3.pangolin_assignment_version - File pango_lineage_report = pangolin3.pango_lineage_report - String pangolin_docker = pangolin3.pangolin_docker - String pangolin_versions = pangolin3.pangolin_versions - - File nextclade_json = nextclade_one_sample.nextclade_json - File auspice_json = nextclade_one_sample.auspice_json - File nextclade_tsv = nextclade_one_sample.nextclade_tsv - String nextclade_version = nextclade_one_sample.nextclade_version - String nextclade_docker = nextclade_one_sample.nextclade_docker - String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade - String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs - String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels - - File? vadr_alerts_list = vadr.alerts_list - String vadr_num_alerts = vadr.num_alerts - String vadr_docker = vadr.vadr_docker - } -} diff --git a/workflows/wf_titan_gc.wdl b/workflows/wf_titan_gc.wdl deleted file mode 100644 index 8b6cd87b..00000000 --- a/workflows/wf_titan_gc.wdl +++ /dev/null @@ -1,339 +0,0 @@ -version 1.0 - -# Workflows from Theiagen's public_health_viral_genomics -# Source: https://github.com/theiagen/public_health_viral_genomics -import "wf_titan_clearlabs.wdl" as clearlabs -import "wf_titan_illumina_pe.wdl" as illumina_pe -import "wf_titan_illumina_se.wdl" as illumina_se -import "wf_titan_ont.wdl" as ont -import "../tasks/task_titan_summary.wdl" as summary - -struct parseJSON { - String sample - String titan_wf - File r1 - File r2 - File primers -} - -workflow titan_gc { - meta { - description: "Incorporates each of the Titan workflows (clearlabs, illumina_pe, illumina_se, ont) into a single run." - author: "Robert A. 
Petit III" - email: "robert.petit@theiagen.com" - } - - input { - Array[parseJSON] samples - } - - scatter (sample in samples) { - if (sample.titan_wf == "clearlabs") { - call clearlabs.titan_clearlabs as titan_clearlabs { - input: - samplename = sample.sample, - clear_lab_fastq = sample.r1, - primer_bed = sample.primers - } - - call summary.titan_summary as clearlabs_summary { - input: - samplename = sample.sample, - titan_workflow = 'titan_clearlabs', - titan_version = titan_clearlabs.titan_clearlabs_version, - titan_analysis_date = titan_clearlabs.titan_clearlabs_analysis_date, - seq_platform = titan_clearlabs.seq_platform, - primer_bed_name = titan_clearlabs.primer_bed_name, - percent_reference_coverage = titan_clearlabs.percent_reference_coverage, - number_N = titan_clearlabs.number_N, - pango_lineage = titan_clearlabs.pango_lineage, - pangolin_conflicts = titan_clearlabs.pangolin_conflicts, - pangolin_notes = titan_clearlabs.pangolin_notes, - pangolin_assignment_version = titan_clearlabs.pangolin_assignment_version, - pangolin_docker = titan_clearlabs.pangolin_docker, - pangolin_versions = titan_clearlabs.pangolin_versions, - nextclade_clade = titan_clearlabs.nextclade_clade, - nextclade_aa_subs = titan_clearlabs.nextclade_aa_subs, - nextclade_aa_dels = titan_clearlabs.nextclade_aa_dels, - vadr_num_alerts = titan_clearlabs.vadr_num_alerts, - assembly_length_unambiguous = titan_clearlabs.assembly_length_unambiguous, - assembly_mean_coverage = titan_clearlabs.assembly_mean_coverage, - s_gene_mean_coverage = titan_clearlabs.s_gene_mean_coverage, - assembly_method = titan_clearlabs.assembly_method, - number_Degenerate = titan_clearlabs.number_Degenerate, - number_Total = titan_clearlabs.number_Total, - meanbaseq_trim = titan_clearlabs.meanbaseq_trim, - meanmapq_trim = titan_clearlabs.meanmapq_trim, - fastq_scan_clean1 = titan_clearlabs.fastq_scan_clean, - fastq_scan_raw1 = titan_clearlabs.fastq_scan_raw, - fastq_scan_version = titan_clearlabs.fastq_scan_version, - kraken_human = titan_clearlabs.kraken_human, - kraken_human_dehosted = titan_clearlabs.kraken_human_dehosted, - kraken_sc2 = titan_clearlabs.kraken_sc2, - kraken_sc2_dehosted = titan_clearlabs.kraken_sc2_dehosted, - artic_version = titan_clearlabs.artic_version, - artic_docker = titan_clearlabs.artic_docker, - medaka_reference = titan_clearlabs.medaka_reference, - kraken_version = titan_clearlabs.kraken_version, - nextclade_version = titan_clearlabs.nextclade_version, - nextclade_docker = titan_clearlabs.nextclade_docker, - samtools_version = titan_clearlabs.samtools_version, - vadr_docker = titan_clearlabs.vadr_docker - } - } - - if (sample.titan_wf == "illumina_pe") { - call illumina_pe.titan_illumina_pe as titan_illumina_pe { - input: - samplename = sample.sample, - read1_raw = sample.r1, - read2_raw = sample.r2, - primer_bed = sample.primers - } - - call summary.titan_summary as illumina_pe_summary { - input: - samplename = sample.sample, - titan_workflow = 'titan_illumina_pe', - titan_version = titan_illumina_pe.titan_illumina_pe_version, - titan_analysis_date = titan_illumina_pe.titan_illumina_pe_analysis_date, - seq_platform = titan_illumina_pe.seq_platform, - primer_bed_name = titan_illumina_pe.primer_bed_name, - percent_reference_coverage = titan_illumina_pe.percent_reference_coverage, - number_N = titan_illumina_pe.number_N, - pango_lineage = titan_illumina_pe.pango_lineage, - pangolin_conflicts = titan_illumina_pe.pangolin_conflicts, - pangolin_notes = titan_illumina_pe.pangolin_notes, - pangolin_assignment_version = 
titan_illumina_pe.pangolin_assignment_version, - pangolin_versions = titan_illumina_pe.pangolin_versions, - pangolin_docker = titan_illumina_pe.pangolin_docker, - nextclade_clade = titan_illumina_pe.nextclade_clade, - nextclade_aa_subs = titan_illumina_pe.nextclade_aa_subs, - nextclade_aa_dels = titan_illumina_pe.nextclade_aa_dels, - vadr_num_alerts = titan_illumina_pe.vadr_num_alerts, - assembly_length_unambiguous = titan_illumina_pe.assembly_length_unambiguous, - assembly_mean_coverage = titan_illumina_pe.assembly_mean_coverage, - s_gene_mean_coverage = titan_illumina_pe.s_gene_mean_coverage, - assembly_method = titan_illumina_pe.assembly_method, - number_Degenerate = titan_illumina_pe.number_Degenerate, - number_Total = titan_illumina_pe.number_Total, - meanbaseq_trim = titan_illumina_pe.meanbaseq_trim, - meanmapq_trim = titan_illumina_pe.meanmapq_trim, - fastq_scan_clean1 = titan_illumina_pe.fastq_scan_clean1, - fastq_scan_clean2 = titan_illumina_pe.fastq_scan_clean2, - fastq_scan_clean_pairs = titan_illumina_pe.fastq_scan_clean_pairs, - fastq_scan_raw1 = titan_illumina_pe.fastq_scan_raw1, - fastq_scan_raw2 = titan_illumina_pe.fastq_scan_raw2, - fastq_scan_raw_pairs = titan_illumina_pe.fastq_scan_raw_pairs, - kraken_human = titan_illumina_pe.kraken_human, - kraken_human_dehosted = titan_illumina_pe.kraken_human_dehosted, - kraken_sc2 = titan_illumina_pe.kraken_sc2, - kraken_sc2_dehosted = titan_illumina_pe.kraken_sc2_dehosted, - primer_trimmed_read_percent = titan_illumina_pe.primer_trimmed_read_percent, - bbduk_docker = titan_illumina_pe.bbduk_docker, - bwa_version = titan_illumina_pe.bwa_version, - fastq_scan_version = titan_illumina_pe.fastq_scan_version, - ivar_variant_version = titan_illumina_pe.ivar_variant_version, - ivar_version_consensus = titan_illumina_pe.ivar_version_consensus, - ivar_version_primtrim = titan_illumina_pe.ivar_version_primtrim, - kraken_version = titan_illumina_pe.kraken_version, - nextclade_version = titan_illumina_pe.nextclade_version, - nextclade_docker = titan_illumina_pe.nextclade_docker, - samtools_version = titan_illumina_pe.samtools_version, - samtools_version_consensus = titan_illumina_pe.samtools_version_consensus, - samtools_version_primtrim = titan_illumina_pe.samtools_version_primtrim, - samtools_version_stats = titan_illumina_pe.samtools_version_stats, - trimmomatic_version = titan_illumina_pe.trimmomatic_version, - vadr_docker = titan_illumina_pe.vadr_docker - } - } - - if (sample.titan_wf == "illumina_se") { - call illumina_se.titan_illumina_se as titan_illumina_se { - input: - samplename = sample.sample, - read1_raw = sample.r1, - primer_bed = sample.primers - } - - call summary.titan_summary as illumina_se_summary { - input: - samplename = sample.sample, - titan_workflow = 'titan_illumina_se', - titan_version = titan_illumina_se.titan_illumina_se_version, - titan_analysis_date = titan_illumina_se.titan_illumina_se_analysis_date, - seq_platform = titan_illumina_se.seq_platform, - primer_bed_name = titan_illumina_se.primer_bed_name, - percent_reference_coverage = titan_illumina_se.percent_reference_coverage, - number_N = titan_illumina_se.number_N, - pango_lineage = titan_illumina_se.pango_lineage, - pangolin_conflicts = titan_illumina_se.pangolin_conflicts, - pangolin_notes = titan_illumina_se.pangolin_notes, - pangolin_versions = titan_illumina_se.pangolin_versions, - pangolin_assignment_version = titan_illumina_se.pangolin_assignment_version, - pangolin_docker = titan_illumina_se.pangolin_docker, - nextclade_clade = 
titan_illumina_se.nextclade_clade, - nextclade_aa_subs = titan_illumina_se.nextclade_aa_subs, - nextclade_aa_dels = titan_illumina_se.nextclade_aa_dels, - vadr_num_alerts = titan_illumina_se.vadr_num_alerts, - assembly_length_unambiguous = titan_illumina_se.assembly_length_unambiguous, - assembly_mean_coverage = titan_illumina_se.assembly_mean_coverage, - s_gene_mean_coverage = titan_illumina_se.s_gene_mean_coverage, - assembly_method = titan_illumina_se.assembly_method, - number_Degenerate = titan_illumina_se.number_Degenerate, - number_Total = titan_illumina_se.number_Total, - meanbaseq_trim = titan_illumina_se.meanbaseq_trim, - meanmapq_trim = titan_illumina_se.meanmapq_trim, - fastq_scan_clean1 = titan_illumina_se.fastq_scan_clean, - fastq_scan_raw1 = titan_illumina_se.fastq_scan_raw, - fastq_scan_version = titan_illumina_se.fastq_scan_version, - kraken_human = titan_illumina_se.kraken_human, - kraken_sc2 = titan_illumina_se.kraken_sc2, - primer_trimmed_read_percent = titan_illumina_se.primer_trimmed_read_percent, - bbduk_docker = titan_illumina_se.bbduk_docker, - bwa_version = titan_illumina_se.bwa_version, - fastq_scan_version = titan_illumina_se.fastq_scan_version, - ivar_variant_version = titan_illumina_se.ivar_variant_version, - ivar_version_consensus = titan_illumina_se.ivar_version_consensus, - ivar_version_primtrim = titan_illumina_se.ivar_version_primtrim, - kraken_version = titan_illumina_se.kraken_version, - nextclade_version = titan_illumina_se.nextclade_version, - nextclade_docker = titan_illumina_se.nextclade_docker, - samtools_version = titan_illumina_se.samtools_version, - samtools_version_consensus = titan_illumina_se.samtools_version_consensus, - samtools_version_primtrim = titan_illumina_se.samtools_version_primtrim, - samtools_version_stats = titan_illumina_se.samtools_version_stats, - trimmomatic_version = titan_illumina_se.trimmomatic_version, - vadr_docker = titan_illumina_se.vadr_docker - } - } - - if (sample.titan_wf == "ont") { - call ont.titan_ont as titan_ont { - input: - samplename = sample.sample, - demultiplexed_reads = sample.r1, - primer_bed = sample.primers - } - - call summary.titan_summary as ont_summary { - input: - samplename = sample.sample, - titan_workflow = 'titan_ont', - titan_version = titan_ont.titan_ont_version, - titan_analysis_date = titan_ont.titan_ont_analysis_date, - seq_platform = titan_ont.seq_platform, - primer_bed_name = titan_ont.primer_bed_name, - percent_reference_coverage = titan_ont.percent_reference_coverage, - number_N = titan_ont.number_N, - pango_lineage = titan_ont.pango_lineage, - pangolin_conflicts = titan_ont.pangolin_conflicts, - pangolin_notes = titan_ont.pangolin_notes, - pangolin_versions = titan_ont.pangolin_versions, - pangolin_assignment_version = titan_ont.pangolin_assignment_version, - pangolin_docker = titan_ont.pangolin_docker, - nextclade_clade = titan_ont.nextclade_clade, - nextclade_aa_subs = titan_ont.nextclade_aa_subs, - nextclade_aa_dels = titan_ont.nextclade_aa_dels, - vadr_num_alerts = titan_ont.vadr_num_alerts, - assembly_length_unambiguous = titan_ont.assembly_length_unambiguous, - assembly_mean_coverage = titan_ont.assembly_mean_coverage, - s_gene_mean_coverage = titan_ont.s_gene_mean_coverage, - assembly_method = titan_ont.assembly_method, - number_Degenerate = titan_ont.number_Degenerate, - number_Total = titan_ont.number_Total, - meanbaseq_trim = titan_ont.meanbaseq_trim, - meanmapq_trim = titan_ont.meanmapq_trim, - fastq_scan_clean1 = titan_ont.fastq_scan_clean, - fastq_scan_raw1 = 
titan_ont.fastq_scan_raw, - fastq_scan_version = titan_ont.fastq_scan_version, - kraken_human = titan_ont.kraken_human, - kraken_human_dehosted = titan_ont.kraken_human_dehosted, - kraken_sc2 = titan_ont.kraken_sc2, - kraken_sc2_dehosted = titan_ont.kraken_sc2_dehosted, - artic_version = titan_ont.artic_version, - artic_docker = titan_ont.artic_docker, - medaka_reference = titan_ont.medaka_reference, - kraken_version = titan_ont.kraken_version, - nextclade_version = titan_ont.nextclade_version, - nextclade_docker = titan_ont.nextclade_docker, - samtools_version = titan_ont.samtools_version, - vadr_docker = titan_ont.vadr_docker - } - } - } - - call summary.merge_titan_summary { - input: - clearlabs_summaries = clearlabs_summary.summary, - illumina_pe_summaries = illumina_pe_summary.summary, - illumina_se_summaries = illumina_se_summary.summary, - ont_summaries = ont_summary.summary - } - - output { - # Titan outputs - File summaries_tsv = merge_titan_summary.summaries_tsv - File summaries_json = merge_titan_summary.summaries_json - Array[File] reads_dehosted = flatten([ - select_all(titan_clearlabs.reads_dehosted), select_all(titan_illumina_pe.read1_dehosted), - select_all(titan_illumina_pe.read2_dehosted), select_all(titan_ont.reads_dehosted) - ]) - Array[File] aligned_bam = flatten([ - select_all(titan_clearlabs.aligned_bam), select_all(titan_illumina_pe.aligned_bam), - select_all(titan_illumina_se.aligned_bam), select_all(titan_ont.aligned_bam) - ]) - Array[File] aligned_bai = flatten([ - select_all(titan_clearlabs.aligned_bai), select_all(titan_illumina_pe.aligned_bai), - select_all(titan_illumina_se.aligned_bai), select_all(titan_ont.aligned_bai) - ]) - Array[File] assembly_fasta = flatten([ - select_all(titan_clearlabs.assembly_fasta), select_all(titan_illumina_pe.assembly_fasta), - select_all(titan_illumina_se.assembly_fasta), select_all(titan_ont.assembly_fasta) - ]) - Array[File] consensus_stats = flatten([ - select_all(titan_clearlabs.consensus_stats), select_all(titan_illumina_pe.consensus_stats), - select_all(titan_illumina_se.consensus_stats), select_all(titan_ont.consensus_stats) - ]) - Array[File] consensus_flagstat = flatten([ - select_all(titan_clearlabs.consensus_flagstat), select_all(titan_illumina_pe.consensus_flagstat), - select_all(titan_illumina_se.consensus_flagstat), select_all(titan_ont.consensus_flagstat) - ]) - Array[File] consensus_variants = flatten([ - select_all(titan_clearlabs.variants_from_ref_vcf), select_all(titan_illumina_pe.ivar_vcf), - select_all(titan_illumina_se.ivar_vcf), select_all(titan_ont.variants_from_ref_vcf) - ]) - Array[File] pango_lineage_report = flatten([ - select_all(titan_clearlabs.pango_lineage_report), select_all(titan_illumina_pe.pango_lineage_report), - select_all(titan_illumina_se.pango_lineage_report), select_all(titan_ont.pango_lineage_report) - ]) - Array[File] nextclade_json = flatten([ - select_all(titan_clearlabs.nextclade_json), select_all(titan_illumina_pe.nextclade_json), - select_all(titan_illumina_se.nextclade_json), select_all(titan_ont.nextclade_json) - ]) - Array[File] auspice_json = flatten([ - select_all(titan_clearlabs.auspice_json), select_all(titan_illumina_pe.auspice_json), - select_all(titan_illumina_se.auspice_json), select_all(titan_ont.auspice_json) - ]) - Array[File] nextclade_tsv = flatten([ - select_all(titan_clearlabs.nextclade_tsv), select_all(titan_illumina_pe.nextclade_tsv), - select_all(titan_illumina_se.nextclade_tsv), select_all(titan_ont.nextclade_tsv) - ]) - Array[File] vadr_alerts_list = 
flatten([ - select_all(titan_clearlabs.vadr_alerts_list), select_all(titan_illumina_pe.vadr_alerts_list), - select_all(titan_illumina_se.vadr_alerts_list), select_all(titan_ont.vadr_alerts_list) - ]) - Array[File] kraken_report = flatten([ - select_all(titan_clearlabs.kraken_report), select_all(titan_illumina_pe.kraken_report), - select_all(titan_illumina_se.kraken_report), select_all(titan_ont.kraken_report) - ]) - Array[File] kraken_report_dehosted = flatten([ - select_all(titan_clearlabs.kraken_report_dehosted), select_all(titan_illumina_pe.kraken_report_dehosted), - select_all(titan_ont.kraken_report_dehosted) - ]) - Array[File] json_summary = flatten([ - select_all(clearlabs_summary.summary), select_all(illumina_pe_summary.summary), - select_all(illumina_se_summary.summary), select_all(ont_summary.summary) - ]) - } -} diff --git a/workflows/wf_titan_illumina_pe.wdl b/workflows/wf_titan_illumina_pe.wdl deleted file mode 100644 index 29de7cab..00000000 --- a/workflows/wf_titan_illumina_pe.wdl +++ /dev/null @@ -1,178 +0,0 @@ -version 1.0 - -import "wf_read_QC_trim.wdl" as read_qc -import "../tasks/task_alignment.wdl" as align -import "../tasks/task_consensus_call.wdl" as consensus_call -import "../tasks/task_assembly_metrics.wdl" as assembly_metrics -import "../tasks/task_taxonID.wdl" as taxon_ID -import "../tasks/task_ncbi.wdl" as ncbi -import "../tasks/task_versioning.wdl" as versioning -import "../tasks/task_qc_utils.wdl" as qc_utils - -workflow titan_illumina_pe { - meta { - description: "Reference-based consensus calling for viral amplicon sequencing data" - } - - input { - String samplename - String seq_method="ILLUMINA" - File read1_raw - File read2_raw - File primer_bed - String nextclade_dataset_name = "sars-cov-2" - String nextclade_dataset_reference = "MN908947" - String nextclade_dataset_tag = "2022-01-18T12:00:00Z" - - } - - call read_qc.read_QC_trim { - input: - samplename = samplename, - read1_raw = read1_raw, - read2_raw = read2_raw - } - call align.bwa { - input: - samplename = samplename, - read1 = read_QC_trim.read1_clean, - read2 = read_QC_trim.read2_clean - } - call consensus_call.primer_trim { - input: - samplename = samplename, - primer_bed = primer_bed, - bamfile = bwa.sorted_bam - } - call consensus_call.variant_call { - input: - samplename = samplename, - bamfile = primer_trim.trim_sorted_bam - } - call consensus_call.consensus { - input: - samplename = samplename, - bamfile = primer_trim.trim_sorted_bam - } - call qc_utils.consensus_qc { - input: - assembly_fasta = consensus.consensus_seq - } - call assembly_metrics.stats_n_coverage { - input: - samplename = samplename, - bamfile = bwa.sorted_bam - } - call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { - input: - samplename = samplename, - bamfile = primer_trim.trim_sorted_bam - } - call taxon_ID.pangolin3 { - input: - samplename = samplename, - fasta = consensus.consensus_seq - } - call taxon_ID.nextclade_one_sample { - input: - genome_fasta = consensus.consensus_seq, - dataset_name = nextclade_dataset_name, - dataset_reference = nextclade_dataset_reference, - dataset_tag = nextclade_dataset_tag - } - call taxon_ID.nextclade_output_parser_one_sample { - input: - nextclade_tsv = nextclade_one_sample.nextclade_tsv - } - call ncbi.vadr { - input: - genome_fasta = consensus.consensus_seq, - assembly_length_unambiguous = consensus_qc.number_ATCG - } - call versioning.version_capture{ - input: - } - output { - String titan_illumina_pe_version = version_capture.phvg_version - String 
titan_illumina_pe_analysis_date = version_capture.date - String seq_platform = seq_method - - File read1_dehosted = read_QC_trim.read1_dehosted - File read2_dehosted = read_QC_trim.read2_dehosted - File read1_clean = read_QC_trim.read1_clean - File read2_clean = read_QC_trim.read2_clean - Int fastq_scan_raw1 = read_QC_trim.fastq_scan_raw1 - Int? fastq_scan_raw2 = read_QC_trim.fastq_scan_raw2 - String? fastq_scan_raw_pairs = read_QC_trim.fastq_scan_raw_pairs - String fastq_scan_version = read_QC_trim.fastq_scan_version - - Int fastq_scan_clean1 = read_QC_trim.fastq_scan_clean1 - Int fastq_scan_clean2 = read_QC_trim.fastq_scan_clean2 - String fastq_scan_clean_pairs = read_QC_trim.fastq_scan_clean_pairs - String trimmomatic_version = read_QC_trim.trimmomatic_version - String bbduk_docker = read_QC_trim.bbduk_docker - - String kraken_version = read_QC_trim.kraken_version - Float kraken_human = read_QC_trim.kraken_human - Float kraken_sc2 = read_QC_trim.kraken_sc2 - String kraken_report = read_QC_trim.kraken_report - Float kraken_human_dehosted = read_QC_trim.kraken_human_dehosted - Float kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted - String kraken_report_dehosted = read_QC_trim.kraken_report_dehosted - - String bwa_version = bwa.bwa_version - String samtools_version = bwa.sam_version - String assembly_method = "~{bwa.bwa_version}; ~{primer_trim.ivar_version}" - - File aligned_bam = primer_trim.trim_sorted_bam - File aligned_bai = primer_trim.trim_sorted_bai - Float primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent - String ivar_version_primtrim = primer_trim.ivar_version - String samtools_version_primtrim = primer_trim.samtools_version - String primer_bed_name = primer_trim.primer_bed_name - - File assembly_fasta = consensus.consensus_seq - String ivar_version_consensus = consensus.ivar_version - String samtools_version_consensus = consensus.samtools_version - - Int number_N = consensus_qc.number_N - Int assembly_length_unambiguous = consensus_qc.number_ATCG - Int number_Degenerate = consensus_qc.number_Degenerate - Int number_Total = consensus_qc.number_Total - Float percent_reference_coverage = consensus_qc.percent_reference_coverage - - - File consensus_stats = stats_n_coverage.stats - File consensus_flagstat = stats_n_coverage.flagstat - Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq - Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq - Float assembly_mean_coverage = stats_n_coverage_primtrim.depth - Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth - String samtools_version_stats = stats_n_coverage.samtools_version - - String pango_lineage = pangolin3.pangolin_lineage - String pangolin_conflicts = pangolin3.pangolin_conflicts - String pangolin_notes = pangolin3.pangolin_notes - String pangolin_assignment_version = pangolin3.pangolin_assignment_version - File pango_lineage_report = pangolin3.pango_lineage_report - String pangolin_docker = pangolin3.pangolin_docker - String pangolin_versions = pangolin3.pangolin_versions - - File nextclade_json = nextclade_one_sample.nextclade_json - File auspice_json = nextclade_one_sample.auspice_json - File nextclade_tsv = nextclade_one_sample.nextclade_tsv - String nextclade_version = nextclade_one_sample.nextclade_version - String nextclade_docker = nextclade_one_sample.nextclade_docker - String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs - String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels - String nextclade_clade = 
nextclade_output_parser_one_sample.nextclade_clade - - File ivar_tsv = variant_call.sample_variants_tsv - File ivar_vcf = variant_call.sample_variants_vcf - String ivar_variant_version = variant_call.ivar_version - - File? vadr_alerts_list = vadr.alerts_list - String vadr_num_alerts = vadr.num_alerts - String vadr_docker = vadr.vadr_docker - } -} diff --git a/workflows/wf_titan_illumina_se.wdl b/workflows/wf_titan_illumina_se.wdl deleted file mode 100644 index db6ae304..00000000 --- a/workflows/wf_titan_illumina_se.wdl +++ /dev/null @@ -1,168 +0,0 @@ -version 1.0 - -import "wf_read_QC_trim_se.wdl" as read_qc -import "../tasks/task_alignment.wdl" as align -import "../tasks/task_consensus_call.wdl" as consensus_call -import "../tasks/task_assembly_metrics.wdl" as assembly_metrics -import "../tasks/task_taxonID.wdl" as taxon_ID -import "../tasks/task_ncbi.wdl" as ncbi -import "../tasks/task_versioning.wdl" as versioning -import "../tasks/task_qc_utils.wdl" as qc_utils - -workflow titan_illumina_se { - meta { - description: "Reference-based consensus calling for viral amplicon sequencing data" - } - - input { - String samplename - String seq_method="ILLUMINA" - File read1_raw - File primer_bed - String nextclade_dataset_name = "sars-cov-2" - String nextclade_dataset_reference = "MN908947" - String nextclade_dataset_tag = "2022-01-18T12:00:00Z" - - } - - call read_qc.read_QC_trim { - input: - samplename = samplename, - read1_raw = read1_raw - } - call align.bwa { - input: - samplename = samplename, - read1 = read_QC_trim.read1_clean - } - call consensus_call.primer_trim { - input: - samplename = samplename, - primer_bed = primer_bed, - bamfile = bwa.sorted_bam - } - call consensus_call.variant_call { - input: - samplename = samplename, - bamfile = primer_trim.trim_sorted_bam - } - call consensus_call.consensus { - input: - samplename = samplename, - bamfile = primer_trim.trim_sorted_bam - } - call qc_utils.consensus_qc { - input: - assembly_fasta = consensus.consensus_seq - } - call assembly_metrics.stats_n_coverage { - input: - samplename = samplename, - bamfile = bwa.sorted_bam - } - call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { - input: - samplename = samplename, - bamfile = primer_trim.trim_sorted_bam - } - call taxon_ID.pangolin3 { - input: - samplename = samplename, - fasta = consensus.consensus_seq - } - call taxon_ID.nextclade_one_sample { - input: - genome_fasta = consensus.consensus_seq, - dataset_name = nextclade_dataset_name, - dataset_reference = nextclade_dataset_reference, - dataset_tag = nextclade_dataset_tag - } - call taxon_ID.nextclade_output_parser_one_sample { - input: - nextclade_tsv = nextclade_one_sample.nextclade_tsv - } - call ncbi.vadr { - input: - genome_fasta = consensus.consensus_seq, - assembly_length_unambiguous = consensus_qc.number_ATCG - } - call versioning.version_capture{ - input: - } - output { - String titan_illumina_se_version = version_capture.phvg_version - String titan_illumina_se_analysis_date = version_capture.date - String seq_platform = seq_method - - File read1_clean = read_QC_trim.read1_clean - Int fastq_scan_raw = read_QC_trim.fastq_scan_number_reads - String fastq_scan_version = read_QC_trim.fastq_scan_version - - Int fastq_scan_clean = read_QC_trim.fastq_scan_clean_number_reads - String trimmomatic_version = read_QC_trim.trimmomatic_version - String bbduk_docker = read_QC_trim.bbduk_docker - - Float kraken_human = read_QC_trim.kraken_human - Float kraken_sc2 = read_QC_trim.kraken_sc2 - String kraken_version = 
read_QC_trim.kraken_version - String kraken_report = read_QC_trim.kraken_report -# Float kraken_human_dehosted = read_QC_trim.kraken_human_dehosted -# Float kraken_sc2_dehosted = read_QC_trim.kraken_sc2_dehosted -# String kraken_report_dehosted = read_QC_trim.kraken_report_dehosted - - String bwa_version = bwa.bwa_version - String samtools_version = bwa.sam_version - String assembly_method = "~{bwa.bwa_version}; ~{primer_trim.ivar_version}" - - File aligned_bam = primer_trim.trim_sorted_bam - File aligned_bai = primer_trim.trim_sorted_bai - Float primer_trimmed_read_percent = primer_trim.primer_trimmed_read_percent - String ivar_version_primtrim = primer_trim.ivar_version - String samtools_version_primtrim = primer_trim.samtools_version - String primer_bed_name = primer_trim.primer_bed_name - - File assembly_fasta = consensus.consensus_seq - String ivar_version_consensus = consensus.ivar_version - String samtools_version_consensus = consensus.samtools_version - - Int number_N = consensus_qc.number_N - Int assembly_length_unambiguous = consensus_qc.number_ATCG - Int number_Degenerate = consensus_qc.number_Degenerate - Int number_Total = consensus_qc.number_Total - Float percent_reference_coverage = consensus_qc.percent_reference_coverage - - - File consensus_stats = stats_n_coverage.stats - File consensus_flagstat = stats_n_coverage.flagstat - Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq - Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq - Float assembly_mean_coverage = stats_n_coverage_primtrim.depth - Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth - String samtools_version_stats = stats_n_coverage.samtools_version - - String pango_lineage = pangolin3.pangolin_lineage - String pangolin_conflicts = pangolin3.pangolin_conflicts - String pangolin_notes = pangolin3.pangolin_notes - String pangolin_assignment_version = pangolin3.pangolin_assignment_version - File pango_lineage_report = pangolin3.pango_lineage_report - String pangolin_docker = pangolin3.pangolin_docker - String pangolin_versions = pangolin3.pangolin_versions - - File nextclade_json = nextclade_one_sample.nextclade_json - File auspice_json = nextclade_one_sample.auspice_json - File nextclade_tsv = nextclade_one_sample.nextclade_tsv - String nextclade_version = nextclade_one_sample.nextclade_version - String nextclade_docker = nextclade_one_sample.nextclade_docker - String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs - String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels - String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade - - File ivar_tsv = variant_call.sample_variants_tsv - File ivar_vcf = variant_call.sample_variants_vcf - String ivar_variant_version = variant_call.ivar_version - - File? 
vadr_alerts_list = vadr.alerts_list - String vadr_num_alerts = vadr.num_alerts - String vadr_docker = vadr.vadr_docker - } -} diff --git a/workflows/wf_titan_ont.wdl b/workflows/wf_titan_ont.wdl deleted file mode 100644 index 80370175..00000000 --- a/workflows/wf_titan_ont.wdl +++ /dev/null @@ -1,168 +0,0 @@ -version 1.0 - -import "../tasks/task_ont_medaka.wdl" as medaka -import "../tasks/task_assembly_metrics.wdl" as assembly_metrics -import "../tasks/task_taxonID.wdl" as taxon_ID -import "../tasks/task_ncbi.wdl" as ncbi -import "../tasks/task_read_clean.wdl" as read_clean -import "../tasks/task_qc_utils.wdl" as qc_utils -import "../tasks/task_versioning.wdl" as versioning - -workflow titan_ont { - meta { - description: "Reference-based consensus calling for viral amplicon ont sequencing data generated on ONT NGS platforms." - } - - input { - String samplename - String seq_method = "OXFORD_NANOPORE" - File primer_bed - File demultiplexed_reads - Int? normalise = 200 - String nextclade_dataset_name = "sars-cov-2" - String nextclade_dataset_reference = "MN908947" - String nextclade_dataset_tag = "2022-01-18T12:00:00Z" - Int? max_length = 700 - Int? min_length = 400 - - } - call qc_utils.fastq_scan_se as fastq_scan_raw_reads { - input: - read1 = demultiplexed_reads - } - call read_clean.ncbi_scrub_se { - input: - samplename = samplename, - read1 = demultiplexed_reads - } - call medaka.read_filtering { - input: - demultiplexed_reads = ncbi_scrub_se.read1_dehosted, - samplename = samplename, - min_length = min_length, - max_length = max_length - } - call qc_utils.fastq_scan_se as fastq_scan_clean_reads { - input: - read1 = read_filtering.filtered_reads - } - call taxon_ID.kraken2 as kraken2_dehosted { - input: - samplename = samplename, - read1 = ncbi_scrub_se.read1_dehosted - } - call medaka.consensus { - input: - samplename = samplename, - filtered_reads = read_filtering.filtered_reads, - primer_bed = primer_bed, - normalise = normalise - } - call qc_utils.consensus_qc { - input: - assembly_fasta = consensus.consensus_seq - } - call assembly_metrics.stats_n_coverage { - input: - samplename = samplename, - bamfile = consensus.sorted_bam - } - call assembly_metrics.stats_n_coverage as stats_n_coverage_primtrim { - input: - samplename = samplename, - bamfile = consensus.trim_sorted_bam - } - call taxon_ID.pangolin3 { - input: - samplename = samplename, - fasta = consensus.consensus_seq - } - call taxon_ID.kraken2 as kraken2_raw { - input: - samplename = samplename, - read1 = demultiplexed_reads - } - call taxon_ID.nextclade_one_sample { - input: - genome_fasta = consensus.consensus_seq, - dataset_name = nextclade_dataset_name, - dataset_reference = nextclade_dataset_reference, - dataset_tag = nextclade_dataset_tag - } - call taxon_ID.nextclade_output_parser_one_sample { - input: - nextclade_tsv = nextclade_one_sample.nextclade_tsv - } - call ncbi.vadr { - input: - genome_fasta = consensus.consensus_seq, - assembly_length_unambiguous = consensus_qc.number_ATCG - } - call versioning.version_capture{ - input: - } - output { - String titan_ont_version = version_capture.phvg_version - String titan_ont_analysis_date = version_capture.date - String seq_platform = seq_method - - File reads_dehosted = ncbi_scrub_se.read1_dehosted - - Int fastq_scan_raw = fastq_scan_raw_reads.read1_seq - Int fastq_scan_clean = fastq_scan_clean_reads.read1_seq - String fastq_scan_version = fastq_scan_clean_reads.version - - String kraken_version = kraken2_raw.version - Float kraken_human = kraken2_raw.percent_human - 
Float kraken_sc2 = kraken2_raw.percent_sc2 - String kraken_report = kraken2_raw.kraken_report - Float kraken_human_dehosted = kraken2_dehosted.percent_human - Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 - String kraken_report_dehosted = kraken2_dehosted.kraken_report - - File aligned_bam = consensus.trim_sorted_bam - File aligned_bai = consensus.trim_sorted_bai - File variants_from_ref_vcf = consensus.medaka_pass_vcf - String artic_version = consensus.artic_pipeline_version - String artic_docker = consensus.artic_pipeline_docker - String medaka_reference = consensus.medaka_reference - String primer_bed_name = consensus.primer_bed_name - File assembly_fasta = consensus.consensus_seq - String assembly_method = consensus.artic_pipeline_version - - Int number_N = consensus_qc.number_N - Int assembly_length_unambiguous = consensus_qc.number_ATCG - Int number_Degenerate = consensus_qc.number_Degenerate - Int number_Total = consensus_qc.number_Total - Float percent_reference_coverage = consensus_qc.percent_reference_coverage - - File consensus_stats = stats_n_coverage.stats - File consensus_flagstat = stats_n_coverage.flagstat - Float meanbaseq_trim = stats_n_coverage_primtrim.meanbaseq - Float meanmapq_trim = stats_n_coverage_primtrim.meanmapq - Float assembly_mean_coverage = stats_n_coverage_primtrim.depth - Float s_gene_mean_coverage = stats_n_coverage_primtrim.s_gene_depth - String samtools_version = stats_n_coverage.samtools_version - - String pango_lineage = pangolin3.pangolin_lineage - String pangolin_conflicts = pangolin3.pangolin_conflicts - String pangolin_notes = pangolin3.pangolin_notes - String pangolin_assignment_version = pangolin3.pangolin_assignment_version - File pango_lineage_report = pangolin3.pango_lineage_report - String pangolin_docker = pangolin3.pangolin_docker - String pangolin_versions = pangolin3.pangolin_versions - - File nextclade_json = nextclade_one_sample.nextclade_json - File auspice_json = nextclade_one_sample.auspice_json - File nextclade_tsv = nextclade_one_sample.nextclade_tsv - String nextclade_version = nextclade_one_sample.nextclade_version - String nextclade_docker = nextclade_one_sample.nextclade_docker - String nextclade_aa_subs = nextclade_output_parser_one_sample.nextclade_aa_subs - String nextclade_aa_dels = nextclade_output_parser_one_sample.nextclade_aa_dels - String nextclade_clade = nextclade_output_parser_one_sample.nextclade_clade - - File? 
vadr_alerts_list = vadr.alerts_list - String vadr_num_alerts = vadr.num_alerts - String vadr_docker = vadr.vadr_docker - } -} diff --git a/workflows/wf_titan_wwvc.wdl b/workflows/wf_titan_wwvc.wdl deleted file mode 100644 index 4f93ed64..00000000 --- a/workflows/wf_titan_wwvc.wdl +++ /dev/null @@ -1,81 +0,0 @@ -version 1.0 - -import "wf_read_QC_trim.wdl" as read_qc -import "../tasks/task_alignment.wdl" as align -import "../tasks/task_consensus_call.wdl" as consensus_call -import "../tasks/task_versioning.wdl" as versioning -import "../workflows/WasteWaterVariantCalling_modified.wdl" as wastewater - -workflow titan_illumina_wwvc { - meta { - description: "Reference-based consensus calling for viral amplicon sequencing data" - } - - input { - Array[String] samplename - Array[File] read1_raw - Array[File] read2_raw - File primer_bed - File reference_genome - File spike_bed - File spike_annotations - Int trimmomatic_minlen = 25 - - } - scatter (r1_r2 in zip(read1_raw, read2_raw)) { - call read_qc.read_QC_trim { - input: - samplename = "wastewater_sample", - read1_raw = r1_r2.left, - read2_raw = r1_r2.right, - trimmomatic_minlen = trimmomatic_minlen - } - call align.bwa { - input: - samplename = "wastewater_sample", - read1 = read_QC_trim.read1_clean, - read2 = read_QC_trim.read2_clean - } - call consensus_call.primer_trim { - input: - samplename = "wastewater_sample", - primer_bed = primer_bed, - bamfile = bwa.sorted_bam - } - } - call wastewater.WasteWaterVariantCalling{ - input: - sorted_bam = primer_trim.trim_sorted_bam, - covid_genome = reference_genome, - spike_bed = spike_bed, - spike_annotations = spike_annotations, - sample_id = samplename - } - call versioning.version_capture{ - input: - } - output { - String titan_wwvc_version = version_capture.phvg_version - String titan_wwcv_date = version_capture.date - - Array[File] addrg_bam = WasteWaterVariantCalling.addrg_bam - Array[File] variants = WasteWaterVariantCalling.variants - Array[File] sorted_vcf = WasteWaterVariantCalling.sorted_vcf - Array[File] sample_spike_vcf = WasteWaterVariantCalling.sample_spike_vcf - Array[File] sample_spike_tsv = WasteWaterVariantCalling.sample_spike_tsv - Array[File] sample_spike_tsv_summary = WasteWaterVariantCalling.sample_spike_tsv_summary - Array[File] sample_spike_tsv_dash = WasteWaterVariantCalling.sample_spike_tsv_dash - Array[File] fill_NA_tsv = WasteWaterVariantCalling.fill_NA_tsv - Array[File] allele_freq_tsv = WasteWaterVariantCalling.allele_freq_tsv - Array[File] reformat_tsv_tsv = WasteWaterVariantCalling.reformat_tsv_tsv - Array[File] sample_spike_tsv_counts = WasteWaterVariantCalling.sample_spike_tsv_counts - Array[File] alignment_files = primer_trim.trim_sorted_bam - File spike_summary_temp = WasteWaterVariantCalling.spike_summary_temp - File spike_summary = WasteWaterVariantCalling.spike_summary - File spike_dashboard = WasteWaterVariantCalling.spike_dashboard - File spike_counts = WasteWaterVariantCalling.spike_counts - - } -} - - \ No newline at end of file diff --git a/workflows/wf_vadr_update.wdl b/workflows/wf_vadr_update.wdl index 399208c9..29a1c93a 100644 --- a/workflows/wf_vadr_update.wdl +++ b/workflows/wf_vadr_update.wdl @@ -4,23 +4,25 @@ import "../tasks/task_ncbi.wdl" as ncbi import "../tasks/task_versioning.wdl" as versioning workflow vadr_update { - input { - File genome_fasta - String docker - } - call ncbi.vadr { + input { + File genome_fasta + String docker + } + call ncbi.vadr { input: genome_fasta = genome_fasta, - docker = docker - } - call versioning.version_capture{ 
- input: - } - output { - String vadr_update_version = version_capture.phvg_version - String vadr_update_analysis_date = version_capture.date - File? vadr_alerts_list = vadr.alerts_list - String vadr_num_alerts = vadr.num_alerts - String vadr_docker = vadr.vadr_docker - } + docker = docker + } + call versioning.version_capture{ + input: + } + output { + # Version Capture + String vadr_update_version = version_capture.phvg_version + String vadr_update_analysis_date = version_capture.date + # VADR Annotation QC + File? vadr_alerts_list = vadr.alerts_list + String vadr_num_alerts = vadr.num_alerts + String vadr_docker = vadr.vadr_docker + } }