From 39d8822e8eb4871f02b76b53fe816be46600c09c Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 3 Aug 2022 17:46:53 -0700 Subject: [PATCH] Rename processes (#4) * Many changes, streamlining and renaming * Remove unused scripts * Update help, README * Update README, help * Remove unnecessary change * Remove unused config, adjust alignment --- .gitignore | 2 + README.md | 55 ++-- bin/downsample_amplicons.py | 309 ------------------ bin/ivar_variants_add_codon_position.py | 145 -------- bin/qc.py | 170 +++++++--- conf/base.config | 43 --- conf/conda.config | 10 - conf/illumina.config | 82 ----- environments/{illumina => }/environment.yml | 3 +- environments/illumina/Dockerfile | 10 - environments/illumina/Singularity | 17 - environments/nanopore/Dockerfile | 11 - environments/nanopore/Singularity | 23 -- environments/nanopore/environment.yml | 7 - main.nf | 103 +----- modules/artic.nf | 155 --------- modules/help.nf | 88 ++--- modules/illumina.nf | 306 +++++++---------- modules/out.nf | 24 -- modules/qc.nf | 12 +- modules/upload.nf | 19 +- modules/utils.nf | 76 ++--- nextflow.config | 127 ++++--- workflows/articNcovNanopore.nf | 130 -------- .../{illuminaNcov.nf => illuminaMpxv.nf} | 91 ++---- 25 files changed, 461 insertions(+), 1557 deletions(-) delete mode 100755 bin/downsample_amplicons.py delete mode 100755 bin/ivar_variants_add_codon_position.py delete mode 100644 conf/base.config delete mode 100644 conf/conda.config delete mode 100644 conf/illumina.config rename environments/{illumina => }/environment.yml (94%) delete mode 100644 environments/illumina/Dockerfile delete mode 100644 environments/illumina/Singularity delete mode 100644 environments/nanopore/Dockerfile delete mode 100644 environments/nanopore/Singularity delete mode 100644 environments/nanopore/environment.yml delete mode 100644 modules/artic.nf delete mode 100644 modules/out.nf delete mode 100644 workflows/articNcovNanopore.nf rename workflows/{illuminaNcov.nf => illuminaMpxv.nf} (55%) diff --git a/.gitignore b/.gitignore index 1d89db3..12a1b5c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ nextflow results *.sif work +test_output +test_input diff --git a/README.md b/README.md index 1dc1e32..a97a2cc 100644 --- a/README.md +++ b/README.md @@ -5,16 +5,39 @@ A Nextflow pipeline for running the ARTIC network's fieldbioinformatics tools (h #### Introduction ------------- This pipeline is based on the [BCCDC-PHL/ncov2019-artic-nf](https://github.com/BCCDC-PHL/ncov2019-artic-nf) pipeline, which is a fork of the [connor-lab/ncov2019-artic-nf](https://github.com/connor-lab/ncov2019-artic-nf) pipeline. It has been modified to support analysis of monkeypox virus. 
- + +```mermaid +flowchart TD + ref[ref.fa] + composite_ref[composite_ref.fa] + primers[primer.bed] + fastq[fastq_dir] + fastq --> performHostFilter(performHostFilter) + composite_ref --> performHostFilter + performHostFilter(performHostFilter) --> normalizeDepth(normalizeDepth) + readTrimming(readTrimming) --> filterResidualAdapters(filterResidualAdapters) + normalizeDepth(normalizeDepth) --> readTrimming(readTrimming) + filterResidualAdapters --> readMapping(readMapping) + ref --> readMapping(readMapping) + readMapping(readMapping) --> trimPrimerSequences(trimPrimerSequences) + primers --> trimPrimerSequences(trimPrimerSequences) + trimPrimerSequences(trimPrimerSequences) --> callConsensusFreebayes(callConsensusFreebayes) + callConsensusFreebayes(callConsensusFreebayes) --> alignConsensusToReference(alignConsensusToReference) + ref --> alignConsensusToReference + trimPrimerSequences --> makeQCCSV(makeQCCSV) + callConsensusFreebayes --> makeQCCSV + callConsensusFreebayes --> consensus[consensus.fa] + callConsensusFreebayes --> variants[variants.vcf] + ref --> makeQCCSV + makeQCCSV --> qcCSV(qc.csv) +``` #### Quick-start -##### Illumina ``` nextflow run BCCDC-PHL/mpxv-artic-nf -profile conda \ - --illumina --prefix "output_file_prefix" \ + --prefix "output_file_prefix" \ --bed /path/to/primers.bed \ --ref /path/to/ref.fa \ --primer_pairs_tsv /path/to/primer_pairs_tsv \ @@ -33,33 +56,23 @@ The repo contains a environment.yml files which automatically build the correct --cache /some/dir can be specified to have a fixed, shared location to store the conda build for use by multiple runs of the workflow. -#### Executors -By default, the pipeline just runs on the local machine. You can specify `-profile slurm` to use a SLURM cluster, or `-profile lsf` to use an LSF cluster. In either case you may need to also use one of the COG-UK institutional config profiles (phw or sanger), or provide queue names to use in your own config file. - -#### Profiles -You can use multiple profiles at once, separating them with a comma. This is described in the Nextflow [documentation](https://www.nextflow.io/docs/latest/config.html#config-profiles) - #### Config -Common configuration options are set in `conf/base.config`. Workflow specific configuration options are set in `conf/illumina.config` They are described and set to sensible defaults (as suggested in the [nCoV-2019 novel coronavirus bioinformatics protocol](https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html "nCoV-2019 novel coronavirus bioinformatics protocol")) Important config options are: | Option | Default | Description | |:---------------------------------|---------:|--------------------------------------------------------------------------------------------------------------------:| -| `allowNoprimer` | `true` | Allow reads that don't have primer sequence? 
Ligation prep = `false`, nextera = `true` | -| `illuminaKeepLen` | `50` | Length of illumina reads to keep after primer trimming | -| `illuminaQualThreshold` | `20` | Sliding window quality threshold for keeping reads after primer trimming (illumina) | -| `mpileupDepth` | `100000` | Mpileup depth for ivar | -| `varFreqThreshold` | `0.75` | ivar/freebayes frequency threshold for consensus variant | -| `varMinFreqThreshold | `0.25` | ivar/freebayes frequency threshold for ambiguous variant | +| `normalizationTargetDepth` | `200` | Target depth of coverage to normalize to prior to alignment | +| `normalizationMinDepth` | `5` | Minimum depth of coverage to normalize to prior to alignment | +| `keepLen` | `50` | Length of reads to keep after primer trimming | +| `qualThreshold` | `20` | Sliding window quality threshold for keeping reads after primer trimming | +| `varMinFreqThreshold` | `0.25` | Allele frequency threshold for ambiguous variant | +| `varFreqThreshold` | `0.75` | Allele frequency threshold for unambiguous variant | | `varMinDepth` | `10` | Minimum coverage depth to call variant | -| `ivarMinVariantQuality` | `20` | ivear minimum mapping quality to call variant | -| `downsampleMappingQuality` | `20` | Exclude reads below this mapping quality while downsampling | -| `downsampleAmpliconSubdivisions` | `3` | Number of times amplicons are subdivided to determine locations of checkpoints to test for depth while downsampling | #### QC A script to do some basic QC is provided in `bin/qc.py`. This currently tests if >50% of reference bases are covered by >10 reads (Illumina) or >20 reads (Nanopore), OR if there is a stretch of more than 10 Kb of sequence without N - setting qc_pass in `/.qc.csv` to TRUE. `bin/qc.py` can be extended to incorporate any QC test, as long as the script outputs a csv file a "qc_pass" last column, with samples TRUE or FALSE. #### Output -A subdirectory for each process in the workflow is created in `--outdir`. A `nml_upload` subdirectory containing files important for [CanCOGeN](https://www.genomecanada.ca/en/cancogen) is created. +A subdirectory for each process in the workflow is created in `--outdir`. A `nml_upload` subdirectory containing dehosted fastq files and consensus sequences is included. diff --git a/bin/downsample_amplicons.py b/bin/downsample_amplicons.py deleted file mode 100755 index 5ba68ff..0000000 --- a/bin/downsample_amplicons.py +++ /dev/null @@ -1,309 +0,0 @@ -#!/usr/bin/env python - -import argparse -import csv -import json -import re -import sys - -import pysam - -from collections import defaultdict -from functools import reduce - - -def get_idxstats(bam_path): - idxstats = { - "ref_name": "", - "seq_length": 0, - "num_mapped_segments": 0, - "num_unmapped_segments": 0, - } - - idxstats_out = pysam.idxstats(bam_path).strip().split('\t') - idxstats_out[3] = idxstats_out[3].split('\n')[0] - - idxstats['ref_name'] = idxstats_out[0] - idxstats['seq_length'] = int(idxstats_out[1]) - idxstats['num_mapped_segments'] = int(idxstats_out[2]) - idxstats['num_unmapped_segments'] = int(idxstats_out[3]) - - return idxstats - - - -def merge_primers(p1, p2): - try: - assert p1['direction'] == p2['direction'] - except AssertionError as e: - err_message = "Error parsing bed file. 
primers " + p1['primer_id'] + " and " + p2['primer_id'] + " not in same direction, cannot be merged" - print(err_message, file=sys.stderr) - sys.exit(1) - if p1['direction'] == '+': - if p1['start'] < p2['start']: - merged_primer = p1 - else: - merged_primer = p2 - elif p1['direction'] == '-': - if p1['end'] > p2['end']: - merged_primer = p1 - else: - merged_primer = p2 - - return merged_primer - - -def read_bed_file(bed_file_path): - """ - { "nCoV-2019_1_LEFT": { - "chrom": "MN908947.3", - "start": 30, - "end": 54, - "primer_id": "nCoV-2019_1_LEFT", - "pool_name": "1", - "direction": "+" - }, - "nCoV-2019_1_RIGHT": { - "chrom": "MN908947.3", - "start": 1183, - "end": 1205, - "primer_id": "nCoV-2019_1_RIGHT", - "pool_name": "1", - "direction": "-" - }, - ... - } - """ - fieldnames = [ - 'chrom', - 'start', - 'end', - 'primer_id', - 'pool_name', - 'direction' - ] - int_fields = [ - 'start', - 'end', - ] - primers = {} - with open(bed_file_path, 'r') as f: - reader = csv.DictReader(f, fieldnames=fieldnames, dialect='excel-tab') - for row in reader: - for field in int_fields: - row[field] = int(row[field]) - if re.search('_alt', row['primer_id']): - primer_id = re.match('(.+)_alt', row['primer_id']).group(1) - else: - primer_id = row['primer_id'] - if all([not primer_id == primer_key for primer_key in primers.keys()]): - primers[primer_id] = row - else: - existing_primer = primers[primer_id] - merged_primer = merge_primers(existing_primer, row) - merged_primer['primer_id'] = primer_id - primers[primer_id] = merged_primer - - return primers - - -def midpoint(start, end): - """ - Find the midpoint between two loci - returns: int - """ - mid = round(start + ((end - start) / 2)) - return mid - - -def get_initial_amplicon_checkpoints(parsed_bed): - """ - input: { "nCoV-2019_1_LEFT": { - "chrom": "MN908947.3", - "start": 30, - "end": 54, - "primer_id": "nCoV-2019_1_LEFT", - "pool_name": "1", - "direction": "+" - }, - "nCoV-2019_1_RIGHT": { - "chrom": "MN908947.3", - "start": 1183, - "end": 1205, - "primer_id": "nCoV-2019_1_RIGHT", - "pool_name": "1", - "direction": "-" - }, - ... - } - returns: { - "nCoV-2019_1": [64, 618, 1173], - "nCoV-2019_2": [1138, 1683, 2234], - ... 
- } - """ - amplicon_checkpoints = {} - for primer_id, primer in parsed_bed.items(): - amplicon_id = re.match("(.+)_LEFT", primer_id) - if amplicon_id: - amplicon_checkpoints[amplicon_id.group(1)] = {} - - for amplicon_id in amplicon_checkpoints: - left_primer_id = amplicon_id + "_LEFT" - right_primer_id = amplicon_id + "_RIGHT" - left_primer = parsed_bed[left_primer_id] - right_primer = parsed_bed[right_primer_id] - # consider ends of amplicon to be inner ends of primers - amplicon_start = left_primer['end'] - amplicon_end = right_primer['start'] - amplicon_midpoint = midpoint(amplicon_start, amplicon_end) - # add a 10-base buffer inside the ends of the primers to define our first and last checkpoints - amplicon_checkpoints[amplicon_id] = [(amplicon_start + 10), amplicon_midpoint, (amplicon_end - 10)] - - return amplicon_checkpoints - - -def subdivide_amplicon_checkpoints(amplicon_checkpoints): - """ - input: [64, 618, 1173] - output: [64, 341, 618, 896, 1173] - """ - new_amplicon_checkpoints = [] - for idx in range(len(amplicon_checkpoints)): - try: - new_checkpoint = midpoint(amplicon_checkpoints[idx], amplicon_checkpoints[idx + 1]) - new_amplicon_checkpoints.append(new_checkpoint) - except IndexError as e: - pass - - all_checkpoints = amplicon_checkpoints + new_amplicon_checkpoints - all_checkpoints.sort() - - return all_checkpoints - - -def read_pair_generator(bam, region_string=None): - """ - from: https://www.biostars.org/p/306041/#332022 - Generate read pairs in a BAM file or within a region string. - Reads are added to read_dict until a pair is found. - """ - read_dict = defaultdict(lambda: [None, None]) - for read in bam.fetch(region=region_string): - if not read.is_proper_pair or read.is_secondary or read.is_supplementary: - continue - qname = read.query_name - if qname not in read_dict: - if read.is_read1: - read_dict[qname][0] = read - else: - read_dict[qname][1] = read - else: - if read.is_read1: - yield read, read_dict[qname][1] - else: - yield read_dict[qname][0], read - del read_dict[qname] - - -def main(args): - """ - """ - - idxstats = get_idxstats(args.bam) - total_reads = idxstats['num_mapped_segments'] + idxstats['num_unmapped_segments'] - - # open the primer scheme and get the pools - bed = read_bed_file(args.bed) - - amplicon_checkpoints = get_initial_amplicon_checkpoints(bed) - - for _ in range(args.amplicon_subdivisions - 1): - for amplicon_id in amplicon_checkpoints: - amplicon_checkpoints[amplicon_id] = subdivide_amplicon_checkpoints(amplicon_checkpoints[amplicon_id]) - - genome_checkpoints = [] - for checkpoints in amplicon_checkpoints.values(): - genome_checkpoints.extend(checkpoints) - genome_checkpoints.sort() - - required_depth_achieved = [False] * len(genome_checkpoints) - - depths = [0] * (args.genome_size + 100) - - infile = pysam.AlignmentFile(args.bam, "rb") - bam_header = infile.header.copy().to_dict() - - outfile = pysam.AlignmentFile("-", "wh", header=bam_header) - - reads_processed = 0 - reads_written = 0 - reads_discarded = 0 - - for segment, mate_segment in read_pair_generator(infile): - if segment.mapping_quality < args.mapping_quality: - reads_discarded += 2 - reads_processed += 2 - continue - if not segment.is_proper_pair: - reads_discarded += 2 - reads_processed += 2 - continue - if segment.is_unmapped or segment.is_supplementary: - reads_discarded += 2 - reads_processed += 2 - continue - if not segment.is_paired or segment.mate_is_unmapped: - reads_discarded += 2 - reads_processed += 2 - continue - - checkpoints_under_segment = 
list(filter(lambda cp: segment.reference_start <= cp and segment.reference_end >= cp, genome_checkpoints)) - checkpoints_under_mate_segment = list(filter(lambda cp: mate_segment.reference_start <= cp and mate_segment.reference_end >= cp, genome_checkpoints)) - checkpoints_under_both_segments = checkpoints_under_segment + checkpoints_under_mate_segment - checkpoint_depths = [depths[checkpoint] for checkpoint in checkpoints_under_both_segments] - checkpoints_achieved_required_depth = [True if depths[checkpoint] >= args.min_depth else False for checkpoint in checkpoints_under_both_segments] - - if not all(checkpoints_achieved_required_depth): - outfile.write(segment) - reads_written += 1 - outfile.write(mate_segment) - reads_written += 1 - for checkpoint in checkpoints_under_both_segments: - depths[checkpoint] += 1 - else: - reads_discarded += 2 - - reads_processed += 2 - - if reads_processed % 1000 == 0: - genome_checkpoint_depths = [depths[checkpoint] for checkpoint in genome_checkpoints] - genome_checkpoints_achieved_required_depth = [True if depth >= args.min_depth else False for depth in genome_checkpoint_depths] - if all(genome_checkpoints_achieved_required_depth): - break - - if total_reads > 0: - downsampling_factor = reads_written / total_reads - else: - downsampling_factor = 0.0 - - print(','.join(["total_input_reads","reads_processed","reads_written", "reads_discarded", "downsampling_factor"]), file=sys.stderr) - print(','.join([str(total_reads), str(reads_processed), str(reads_written), str(reads_discarded), str(round(downsampling_factor, 4))]), file=sys.stderr) - - # close up the file handles - infile.close() - outfile.close() - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Downsample alignments from an amplicon scheme.') - parser.add_argument('bam', help='bam file containing the alignment') - parser.add_argument('--bed', help='BED file containing the amplicon scheme') - parser.add_argument('--min-depth', type=int, default=200, help='Subsample to n coverage') - parser.add_argument('--mapping-quality', type=int, default=20, help='Minimum mapping quality to include read in output') - parser.add_argument('--amplicon-subdivisions', type=int, default=3, help='How many times to divide amplicons to detect coverage') - parser.add_argument('--genome-size', type=int, default=29903, help='') - parser.add_argument('--verbose', action='store_true', help='Debug mode') - args = parser.parse_args() - main(args) diff --git a/bin/ivar_variants_add_codon_position.py b/bin/ivar_variants_add_codon_position.py deleted file mode 100755 index c4742c3..0000000 --- a/bin/ivar_variants_add_codon_position.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python - -import argparse -import csv -import json -import math -import re -import sys - - -def parse_gff_attribute(attribute_str): - attribute_lst = attribute_str.split(';') - attribute_dict = {} - for attr in attribute_lst: - [key, val] = attr.split('=') - attribute_dict[key] = val - - return attribute_dict - - -def parse_gff(gff_path): - parsed_gff = [] - - gff_fieldnames = [ - 'seqname', - 'source', - 'feature', - 'start', - 'end', - 'score', - 'strand', - 'frame', - 'attribute', - - ] - with open(gff_path, 'r') as f: - for row in f: - if row.startswith('#'): - pass - else: - break - reader = csv.DictReader(f, fieldnames=gff_fieldnames, dialect='excel-tab') - for row in reader: - if row['seqname'].startswith('#'): - pass - else: - row['attribute'] = parse_gff_attribute(row['attribute']) - parsed_gff.append(row) - - 
return parsed_gff - - -def parse_variants(variants_path): - parsed_variants = [] - with open(variants_path, 'r') as f: - reader = csv.DictReader(f, dialect='excel-tab') - for row in reader: - parsed_variants.append(row) - - return parsed_variants - - - -def add_ref_codon_pos(variant, cds_features_by_id): - if variant['GFF_FEATURE'] == 'NA': - variant['CODON_POS'] = 'NA' - else: - gff_feature = cds_features_by_id[variant['GFF_FEATURE']] - variant_pos = int(variant['POS']) - cds_start = int(gff_feature['start']) - ref_codon_pos = int(math.floor((variant_pos - (cds_start - 3)) / 3)) - variant['CODON_POS'] = ref_codon_pos - return variant - -def add_aa_mutation_name(variant): - feature = variant['GFF_FEATURE'] - ref_codon_pos = variant['CODON_POS'] - ref_aa = variant['REF_AA'] - alt_aa = variant['ALT_AA'] - if feature == 'NA' or ref_aa == 'NA' or ref_codon_pos == 'NA' or alt_aa == 'NA': - variant['MUT_NAME'] = 'NA' - else: - variant['MUT_NAME'] = feature + ':' + ref_aa + str(ref_codon_pos) + alt_aa - - return variant - -def main(args): - - gff = [] - cds_by_name = {} - - if args.gff: - gff = parse_gff(args.gff) - for gff_feature in gff: - if gff_feature['feature'] == 'CDS': - cds_by_name[gff_feature['attribute']['ID']] = gff_feature - - variants = parse_variants(args.variants) - - for variant in variants: - if args.gff: - variant = add_ref_codon_pos(variant, cds_by_name) - variant = add_aa_mutation_name(variant) - else: - variant['CODON_POS'] = 'NA' - variant['MUT_NAME'] = 'NA' - - output_fieldnames = [ - 'REGION', - 'POS', - 'REF', - 'ALT', - 'REF_DP', - 'REF_RV', - 'REF_QUAL', - 'ALT_DP', - 'ALT_RV', - 'ALT_QUAL', - 'ALT_FREQ', - 'TOTAL_DP', - 'PVAL', - 'PASS', - 'GFF_FEATURE', - 'REF_CODON', - 'REF_AA', - 'ALT_CODON', - 'ALT_AA', - 'CODON_POS', - 'MUT_NAME', - ] - - csv.register_dialect('unix-tab', delimiter='\t', doublequote=False, lineterminator='\n', quoting=csv.QUOTE_MINIMAL) - writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='unix-tab') - writer.writeheader() - for variant in variants: - writer.writerow(variant) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Add codon position column to ivar variants output') - parser.add_argument('variants', help='bam file containing the alignment') - parser.add_argument('-g', '--gff', help='GFF file used to label GFF_FEATURE field of ivar variants outptut') - args = parser.parse_args() - main(args) diff --git a/bin/qc.py b/bin/qc.py index ad12f3b..84ca2a4 100755 --- a/bin/qc.py +++ b/bin/qc.py @@ -1,11 +1,16 @@ #!/usr/bin/env python3 -from Bio import SeqIO +import argparse import csv +import json +import shlex import subprocess -import pandas as pd + import matplotlib.pyplot as plt -import shlex +import pandas as pd + +from Bio import SeqIO +from matplotlib.patches import Rectangle """ This script can incorporate as many QC checks as required @@ -14,28 +19,63 @@ 'TRUE' if the overall QC check has passed or 'FALSE' if not. """ -def make_qc_plot(depth_pos, n_density, samplename, window=200): - depth_df = pd.DataFrame( { 'position' : [pos[1] for pos in depth_pos], 'depth' : [dep[2] for dep in depth_pos] } ) +def make_qc_plot(depth_pos, n_density, ref_length, samplename, min_depth, amplicons, window=200, ylim_top=10**5, width_inches=20.0, height_inches=4.0): + # Initialize position/depth to zero for every position. + # Ensures that dataframe is correct dimensions. Otherwise script will fail for samples with zero depth across genome. 
+ depth_df = pd.DataFrame({ + 'position': [x + 1 for x in range(ref_length)], + 'depth': [0 for x in range(ref_length)] + }) + + for pos in depth_pos: + idx = int(pos[1]) - 1 + depth = int(pos[2]) + depth_df.loc[idx, 'depth'] = depth + depth_df['depth_moving_average'] = depth_df.iloc[:,1].rolling(window=window).mean() - + n_df = pd.DataFrame( { 'position' : [pos[0] for pos in n_density], 'n_density' : [dens[1] for dens in n_density] } ) - fig, ax1 = plt.subplots() - ax2 = ax1.twinx() + fig, (ax_depth, ax_amplicons) = plt.subplots(2, gridspec_kw={'height_ratios': [5, 1]}) + fig.set_size_inches(width_inches, height_inches) - ax1.set_xlabel('Position') + ax_n_density = ax_depth.twinx() - ax1.set_ylabel('Depth', color = 'g') - ax1.set_ylim(top=10**5, bottom=1) - ax1.set_yscale('log') - ax1.plot(depth_df['depth_moving_average'], color = 'g') + ax_depth.set_xlabel('Position') - ax2.set_ylabel('N density', color = 'r') - ax2.plot(n_df['n_density'], color = 'r') - ax2.set_ylim(top=1) + ax_depth.set_ylabel('Depth', color = 'g') + ax_depth.set_ylim(top=ylim_top, bottom=1) + ax_depth.set_yscale('log') + ax_depth.axhline(y=min_depth, c="blue", linestyle='dotted', linewidth=0.5) + ax_depth.plot(depth_df['depth_moving_average'], color = 'g', linewidth=0.5) + ax_n_density.set_ylabel('N density', color = 'r') + ax_n_density.plot(n_df['n_density'], color = 'r', linewidth=0.5) + ax_n_density.set_ylim(top=1, bottom=0) + + ax_amplicons.get_shared_x_axes().join(ax_amplicons, ax_depth) + ax_amplicons.plot() + + amplicon_pool_colors = { + "1": "tab:red", + "2": "tab:blue", + } + + for amplicon_num, amplicon in amplicons.items(): + amplicon_rectangle = Rectangle((amplicon['start'], int(amplicon['pool'])), amplicon['length'], 1, color=amplicon_pool_colors[amplicon['pool']]) + rx, ry = amplicon_rectangle.get_xy() + cx = rx + amplicon_rectangle.get_width() / 2.0 + cy = ry + amplicon_rectangle.get_height() / 2.0 + ax_amplicons.add_patch(amplicon_rectangle) + ax_amplicons.annotate(amplicon_num, (cx, cy), color='black', weight='bold', fontsize=6, ha='center', va='center') + + ax_amplicons.set_ylim(top=4, bottom=0) + ax_amplicons.yaxis.set_visible(False) + + plt.xlim(left=0) plt.title(samplename) - plt.savefig(samplename + '.depth.png') + plt.savefig(samplename + '.depth.png', bbox_inches='tight') + def read_depth_file(bamfile): p = subprocess.Popen(['samtools', 'depth', '-a', '-d', '0', bamfile], @@ -51,9 +91,63 @@ def read_depth_file(bamfile): return pos_depth +def read_primers(primer_bed_path, primer_pairs_path, primer_name_delimiter='_', amplicon_number_offset=2): + primer_pairs = {} + with open(primer_pairs_path, 'r') as f: + for line in f: + left, right = line.strip().split('\t') + primer_pairs[left] = right + primer_pairs[right] = left + + primers_by_name = {} + with open(primer_bed_path, 'r') as f: + for line in f: + fields = line.strip().split('\t') + start = int(fields[1]) + end = int(fields[2]) + name = fields[3] + pool = fields[4] + orientation = fields[5] + pair_name = primer_pairs[name] + amplicon_number = name.split(primer_name_delimiter)[amplicon_number_offset] + primers_by_name[name] = { + 'name': name, + 'pair_name': pair_name, + 'start': start, + 'end': end, + 'amplicon_number': amplicon_number, + 'pool': pool, + 'orientation': orientation + } + if orientation == '+': + pass + + return primers_by_name + + +def primers_to_amplicons(primers): + amplicons = {} + positive_orientation_primers = [primers[p] for p in primers if primers[p]['orientation'] == '+'] + for primer in positive_orientation_primers: + 
amplicon_number = primer['amplicon_number'] + amplicon_pool = primer['pool'] + amplicon_start = primer['end'] + amplicon_end = primers[primer['pair_name']]['start'] + amplicon_length = amplicon_end - amplicon_start + amplicons[amplicon_number] = { + 'number': amplicon_number, + 'start': amplicon_start, + 'end': amplicon_end, + 'length': amplicon_length, + 'pool': amplicon_pool, + } + + return amplicons + + def get_covered_pos(pos_depth, min_depth): counter = 0 - for contig, pos,depth in pos_depth: + for contig, pos, depth in pos_depth: if int(depth) >= min_depth: counter = counter + 1 @@ -107,17 +201,15 @@ def get_num_reads(bamfile): return subprocess.check_output(what).decode().strip() -def go(args): - if args.illumina: - depth = 10 - elif args.nanopore: - depth = 20 - +def main(args): + primers = read_primers(args.primer_bed, args.primer_pairs) + amplicons = primers_to_amplicons(primers) + ## Depth calcs ref_length = get_ref_length(args.ref) depth_pos = read_depth_file(args.bam) - depth_covered_bases = get_covered_pos(depth_pos, depth) + depth_covered_bases = get_covered_pos(depth_pos, args.min_depth) pct_covered_bases = depth_covered_bases / ref_length * 100 @@ -136,10 +228,6 @@ def go(args): pct_N_bases = get_pct_N_bases(fasta) largest_N_gap = get_largest_N_gap(fasta) - # QC PASS / FAIL - if largest_N_gap >= 10000 or pct_N_bases < 50.0: - qc_pass = "TRUE" - qc_line = { 'sample_name' : args.sample, 'pct_N_bases' : "{:.2f}".format(pct_N_bases), @@ -147,8 +235,7 @@ def go(args): 'longest_no_N_run' : largest_N_gap, 'num_aligned_reads' : num_reads, 'fasta': args.fasta, - 'bam' : args.bam, - 'qc_pass' : qc_pass} + 'bam' : args.bam} with open(args.outfile, 'w') as csvfile: @@ -158,23 +245,22 @@ def go(args): writer.writerow(qc_line) N_density = sliding_window_N_density(fasta) - make_qc_plot(depth_pos, N_density, args.sample) + make_qc_plot(depth_pos, N_density, ref_length, args.sample, args.min_depth, amplicons, window=args.window, ylim_top=10**args.max_y_axis_exponent, width_inches=args.width, height_inches=args.height) -def main(): - import argparse +if __name__ == "__main__": parser = argparse.ArgumentParser() - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('--nanopore', action='store_true') - group.add_argument('--illumina', action='store_true') parser.add_argument('--outfile', required=True) parser.add_argument('--sample', required=True) parser.add_argument('--ref', required=True) parser.add_argument('--bam', required=True) parser.add_argument('--fasta', required=True) - + parser.add_argument('--primer-bed') + parser.add_argument('--primer-pairs') + parser.add_argument('--min-depth', type=int, default=10) + parser.add_argument('--max-y-axis-exponent', type=int, default=4) + parser.add_argument('--window', type=int, default=200) + parser.add_argument('--width', type=float, default=36.0) + parser.add_argument('--height', type=float, default=6.0) args = parser.parse_args() - go(args) - -if __name__ == "__main__": - main() + main(args) diff --git a/conf/base.config b/conf/base.config deleted file mode 100644 index e6fdfda..0000000 --- a/conf/base.config +++ /dev/null @@ -1,43 +0,0 @@ -// base specific params - -params{ - // Boilerplate to keep nextflow happy about undefined vars - directory = false - prefix = false - basecalled_fastq = false - fast5_pass = false - sequencing_summary = false - ref = false - bed = false - profile = false - - // host filtering - composite_ref = false - viral_contig_name = 'MN908947.3' - - // Repo to download your primer scheme 
from - schemeRepoURL = 'https://github.com/artic-network/artic-ncov2019.git' - - // Directory within schemeRepoURL that contains primer schemes - schemeDir = 'primer_schemes' - - // Scheme name - scheme = 'nCoV-2019' - - // Scheme version - schemeVersion = 'V3' - - // For ivar's amplicon filter - primer_pairs_tsv = false - - // Run experimental medaka pipeline? Specify in the command using "--medaka" - medaka = false - - // Run Illumina pipeline - illumina = false - - // Run nanopolish pipeline - nanopolish = false -} - - diff --git a/conf/conda.config b/conf/conda.config deleted file mode 100644 index d232d64..0000000 --- a/conf/conda.config +++ /dev/null @@ -1,10 +0,0 @@ - -process { - withName: makeQCCSV { - conda = "$baseDir/environments/extras.yml" - } - withName: bamToCram { - conda = "$baseDir/environments/extras.yml" - } -} - diff --git a/conf/illumina.config b/conf/illumina.config deleted file mode 100644 index f7826af..0000000 --- a/conf/illumina.config +++ /dev/null @@ -1,82 +0,0 @@ -// Illumina specific params - -params { - - // Instead of using the ivar-compatible bed file in the scheme repo, the - // full path to a previously-created ivar bed file. Must also supply - // ref. - bed = false - - // Instead of indexing the reference file in the scheme repo, the prefix - // of previously-created reference index files. Must also supply bed. - // (With these defined, none of the scheme* variables will be used.) - ref = false - - // Provide a gff file to annotate variants.tsv with codon info - gff = 'NO_FILE' - - // illumina fastq search path - illuminaSuffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] - fastq_exts = ['.fastq.gz', '.fq.gz'] - - fastqSearchPath = makeFastqSearchPath( params.illuminaSuffixes, params.fastq_exts ) - - // Use cram input instead of fastq files - cram = false - - // Output cram instead of bam files - outCram = false - - // Allow reads that don't have primer sequence? Ligation prep = false, nextera = true - allowNoprimer = true - - // Length of illumina reads to keep after primer trimming - illuminaKeepLen = 50 - - // Sliding window quality threshold for keeping reads after primer trimming (illumina) - illuminaQualThreshold = 20 - - // Mpileup depth for ivar (although undocumented in mpileup, setting to zero removes limit) - mpileupDepth = 100000 - - // Reads will be sampled until at least this depth is met for all amplicons - downsampleMinDepth = 250 - - // Exclude reads below this mapping quality while downsampling - downsampleMappingQuality = 20 - - // Number of times that amplicons are subdivided to determine number of checkpoints - // to test for depth while downsampling. - downsampleAmpliconSubdivisions = 3 - - // frequency threshold for consensus variant (ivar/freebayes) - varFreqThreshold = 0.75 - - // Minimum coverage depth to call variant (ivar/freebayes) - varMinDepth = 10 - - // iVar frequency threshold to call variant (ivar/freebayes) - varMinFreqThreshold = 0.25 - - // iVar minimum mapQ to call variant (ivar variants: -q) - ivarMinVariantQuality = 20 - -} - -def makeFastqSearchPath ( illuminaSuffixes, fastq_exts ) { - //if (! 
params.directory ) { - // println("Please supply a directory containing fastqs with --directory") - // System.exit(1) - //} - - if ( params.directory ) { - def fastq_searchpath = [] - for (item in illuminaSuffixes){ - for(thing in fastq_exts){ - fastq_searchpath.add(params.directory.toString() + '/**' + item.toString() + thing.toString()) - } - } - return fastq_searchpath - } -} - diff --git a/environments/illumina/environment.yml b/environments/environment.yml similarity index 94% rename from environments/illumina/environment.yml rename to environments/environment.yml index 67c8c59..51a056f 100644 --- a/environments/illumina/environment.yml +++ b/environments/environment.yml @@ -12,6 +12,7 @@ dependencies: - trim-galore=0.6.5 - ivar=1.3 - pysam=0.16.0.1 - - mafft=7.471 - freebayes=1.3.2 - bcftools=1.10 + - mafft=7.471 + - bbmap=38.97 diff --git a/environments/illumina/Dockerfile b/environments/illumina/Dockerfile deleted file mode 100644 index be23e1b..0000000 --- a/environments/illumina/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM continuumio/miniconda3:latest -LABEL authors="Matt Bull" \ - description="Docker image containing all requirements for an Illumina mpxv pipeline" - -COPY environments/extras.yml /extras.yml -COPY environments/illumina/environment.yml /environment.yml -RUN apt-get update && apt-get install -y curl g++ git make procps && apt-get clean -y -RUN /opt/conda/bin/conda env create -f /environment.yml -RUN /opt/conda/bin/conda env update -f /extras.yml -n artic-mpxv-illumina && /opt/conda/bin/conda clean -a -ENV PATH=/opt/conda/envs/artic-mpxv-illumina/bin:$PATH diff --git a/environments/illumina/Singularity b/environments/illumina/Singularity deleted file mode 100644 index 063479f..0000000 --- a/environments/illumina/Singularity +++ /dev/null @@ -1,17 +0,0 @@ -Bootstrap: docker -From: continuumio/miniconda3:latest -%files -environments/illumina/environment.yml /environment.yml -environments/extras.yml /extras.yml -%labels -authors="Matt Bull" -description="Docker image containing all requirements for the ARTIC project's mpxv pipeline" -%post - -apt-get update && apt-get install -y g++ git make procps rsync && apt-get clean -y -/opt/conda/bin/conda env create -f /environment.yml -/opt/conda/bin/conda env update -f /extras.yml -n artic-mpxv-illumina -PATH=/opt/conda/envs/artic-mpxv-illumina/bin:$PATH - -%environment -export PATH=/opt/conda/envs/artic-mpxv-illumina/bin:$PATH diff --git a/environments/nanopore/Dockerfile b/environments/nanopore/Dockerfile deleted file mode 100644 index 6028f94..0000000 --- a/environments/nanopore/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM continuumio/miniconda3:latest -LABEL authors="Matt Bull" \ - description="Docker image containing all requirements for the ARTIC project's ncov2019 pipeline" - -COPY environments/extras.yml /extras.yml -RUN apt-get update && apt-get install -y curl g++ git make procps && apt-get clean -y -RUN curl -o /environment.yml -fsSL 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/environment.yml' -RUN /opt/conda/bin/conda env create -f /environment.yml -RUN /opt/conda/bin/conda env update -f /extras.yml -n artic-ncov2019 -ENV PATH /opt/conda/envs/artic-ncov2019/bin:$PATH -ENTRYPOINT ["/opt/conda/envs/artic-ncov2019/bin/artic"] diff --git a/environments/nanopore/Singularity b/environments/nanopore/Singularity deleted file mode 100644 index 9b6e2e3..0000000 --- a/environments/nanopore/Singularity +++ /dev/null @@ -1,23 +0,0 @@ -Bootstrap: docker -From: continuumio/miniconda3:latest 
-%labels -authors="Matt Bull" -description="Docker image containing all requirements for the ARTIC project's ncov2019 pipeline" -%files -environments/extras.yml /extras.yml -%post - -apt-get update && apt-get install -y curl g++ git make procps rsync && apt-get clean -y -curl -o /environment.yml -fsSL 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/environment.yml' -/opt/conda/bin/conda env create -f /environment.yml -/opt/conda/bin/conda env update -f /extras.yml -n artic-ncov2019 -/opt/conda/bin/conda clean -a -PATH=/opt/conda/envs/artic-ncov2019/bin:$PATH - -%environment -export PATH=/opt/conda/envs/artic-ncov2019/bin:$PATH - -%runscript - -exec "/opt/conda/envs/artic-ncov2019/bin/artic" "$@" - diff --git a/environments/nanopore/environment.yml b/environments/nanopore/environment.yml deleted file mode 100644 index 8e2ae75..0000000 --- a/environments/nanopore/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: artic -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - artic diff --git a/main.nf b/main.nf index c6d6b33..165d618 100644 --- a/main.nf +++ b/main.nf @@ -1,15 +1,12 @@ #!/usr/bin/env nextflow -// enable dsl2 -nextflow.preview.dsl = 2 +nextflow.enable.dsl = 2 // include modules include {printHelp} from './modules/help.nf' // import subworkflows -include {articNcovNanopore} from './workflows/articNcovNanopore.nf' -include {ncovIllumina} from './workflows/illuminaNcov.nf' -include {ncovIlluminaCram} from './workflows/illuminaNcov.nf' +include {mpxvIllumina} from './workflows/illuminaMpxv.nf' if (params.help){ printHelp() @@ -21,38 +18,15 @@ if (params.profile){ System.exit(1) } -if ( params.illumina ) { - if ( !params.directory ) { - println("Please supply a directory containing fastqs or CRAMs with --directory. Specify --cram if supplying a CRAMs directory") - println("Use --help to print help") - System.exit(1) - } - if ( (params.bed && ! params.ref) || (!params.bed && params.ref) ) { - println("--bed and --ref must be supplied together") - System.exit(1) - } -} else if ( params.nanopolish ) { - if (! params.basecalled_fastq ) { - println("Please supply a directory containing basecalled fastqs with --basecalled_fastq. This is the output directory from guppy_barcoder or guppy_basecaller - usually fastq_pass. This can optionally contain barcodeXX directories, which are auto-detected.") - } - if (! params.fast5_pass ) { - println("Please supply a directory containing fast5 files with --fast5_pass (this is the fast5_pass directory)") - } - if (! params.sequencing_summary ) { - println("Please supply the path to the sequencing_summary.txt file from your run with --sequencing_summary") - System.exit(1) - } - if ( params.bed || params.ref ) { - println("ivarBed and alignerRefPrefix only work in illumina mode") - System.exit(1) - } -} else if ( params.medaka ) { - if (! params.basecalled_fastq ) { - println("Please supply a directory containing basecalled fastqs with --basecalled_fastq. This is the output directory from guppy_barcoder or guppy_basecaller - usually fastq_pass. This can optionally contain barcodeXX directories, which are auto-detected.") - } -} else { - println("Please select a workflow with --nanopolish, --illumina or --medaka, or use --help to print help") - System.exit(1) +if ( !params.directory ) { + println("Please supply a directory containing fastqs or CRAMs with --directory.") + println("Use --help to print help") + System.exit(1) +} + +if ( (params.bed && ! 
params.ref) || (!params.bed && params.ref) ) { + println("--bed and --ref must be supplied together") + System.exit(1) } @@ -71,59 +45,12 @@ if ( ! params.prefix ) { // main workflow workflow { - if ( params.illumina ) { - if (params.cram) { - Channel.fromPath( "${params.directory}/**.cram" ) - .map { file -> tuple(file.baseName, file) } - .set{ ch_cramFiles } - } - else { - Channel.fromFilePairs( params.fastqSearchPath, flat: true) + + Channel.fromFilePairs( params.fastqSearchPath, flat: true) .filter{ !( it[0] =~ /Undetermined/ ) } .set{ ch_filePairs } - } - } - else { - // Check to see if we have barcodes - nanoporeBarcodeDirs = file("${params.basecalled_fastq}/barcode*", type: 'dir', maxdepth: 1 ) - nanoporeNoBarcode = file("${params.basecalled_fastq}/*.fastq", type: 'file', maxdepth: 1) - if( nanoporeBarcodeDirs ) { - // Yes, barcodes! - Channel.fromPath( nanoporeBarcodeDirs ) - .filter( ~/.*barcode[0-9]{1,4}$/ ) - .filter{ d -> - def count = 0 - for (x in d.listFiles()) { - if (x.isFile()) { - count += x.countFastq() - } - } - count > params.minReadsPerBarcode - }.set{ ch_fastqDirs } - } else if ( nanoporeNoBarcode ){ - // No, no barcodes - Channel.fromPath( "${params.basecalled_fastq}", type: 'dir', maxDepth: 1 ) - .set{ ch_fastqDirs } - } else { - println("Couldn't detect whether your Nanopore run was barcoded or not. Use --basecalled_fastq to point to the unmodified guppy output directory.") - System.exit(1) - } - } - - main: - if ( params.nanopolish || params.medaka ) { - articNcovNanopore(ch_fastqDirs) - } else if ( params.illumina ) { - if ( params.cram ) { - ncovIlluminaCram(ch_cramFiles) - } - else { - ncovIllumina(ch_filePairs) - } - } else { - println("Please select a workflow with --nanopolish, --illumina or --medaka") - } + main: + mpxvIllumina(ch_filePairs) } - diff --git a/modules/artic.nf b/modules/artic.nf deleted file mode 100644 index bd5b821..0000000 --- a/modules/artic.nf +++ /dev/null @@ -1,155 +0,0 @@ -// ARTIC processes - -process articDownloadScheme{ - tag params.schemeRepoURL - - label 'internet' - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "scheme", mode: "copy" - - output: - path "scheme/**/${params.schemeVersion}/*.reference.fasta" , emit: reffasta - path "scheme/**/${params.schemeVersion}/${params.scheme}.bed" , emit: bed - path "scheme" , emit: scheme - - script: - """ - git clone ${params.schemeRepoURL} scheme - """ -} - -process articGuppyPlex { - tag { samplePrefix + "-" + fastqDir } - - label 'largemem' - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${params.prefix}*.fastq", mode: "copy" - - input: - path(fastqDir) - - output: - path "${params.prefix}*.fastq", emit: fastq - - script: - """ - artic guppyplex \ - --min-length ${params.min_length} \ - --max-length ${params.max_length} \ - --prefix ${params.prefix} \ - --directory ${fastqDir} - """ -} - -process articMinIONMedaka { - tag { sampleName } - - label 'largecpu' - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.*", mode: "copy" - - input: - tuple file(fastq), file(schemeRepo) - - output: - file("${sampleName}.*") - - tuple sampleName, file("${sampleName}.primertrimmed.rg.sorted.bam"), emit: ptrim - tuple sampleName, file("${sampleName}.sorted.bam"), emit: mapped - tuple sampleName, file("${sampleName}.consensus.fasta"), emit: consensus_fasta - - script: - // Make an identifier from the fastq filename - sampleName = fastq.getBaseName().replaceAll(~/\.fastq.*$/, '') - - // Configure 
artic minion pipeline - minionRunConfigBuilder = [] - - if ( params.normalise ) { - minionRunConfigBuilder.add("--normalise ${params.normalise}") - } - - if ( params.bwa ) { - minionRunConfigBuilder.add("--bwa") - } else { - minionRunConfigBuilder.add("--minimap2") - } - - minionFinalConfig = minionRunConfigBuilder.join(" ") - - """ - artic minion --medaka \ - ${minionFinalConfig} \ - --threads ${task.cpus} \ - --scheme-directory ${schemeRepo}/${params.schemeDir} \ - --read-file ${fastq} \ - ${params.scheme}/${params.schemeVersion} \ - ${sampleName} - """ -} - -process articMinIONNanopolish { - tag { sampleName } - - label 'largecpu' - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.*", mode: "copy" - - input: - tuple file(fastq), file(schemeRepo), file(fast5Pass), file(seqSummary) - - output: - file("${sampleName}.*") - - tuple sampleName, file("${sampleName}.primertrimmed.rg.sorted.bam"), emit: ptrim - tuple sampleName, file("${sampleName}.sorted.bam"), emit: mapped - tuple sampleName, file("${sampleName}.consensus.fasta"), emit: consensus_fasta - - script: - // Make an identifier from the fastq filename - sampleName = fastq.getBaseName().replaceAll(~/\.fastq.*$/, '') - - // Configure artic minion pipeline - minionRunConfigBuilder = [] - - if ( params.normalise ) { - minionRunConfigBuilder.add("--normalise ${params.normalise}") - } - - if ( params.bwa ) { - minionRunConfigBuilder.add("--bwa") - } else { - minionRunConfigBuilder.add("--minimap2") - } - - minionFinalConfig = minionRunConfigBuilder.join(" ") - - """ - artic minion ${minionFinalConfig} \ - --threads ${task.cpus} \ - --scheme-directory ${schemeRepo}/${params.schemeDir} \ - --read-file ${fastq} \ - --fast5-directory ${fast5Pass} \ - --sequencing-summary ${seqSummary} \ - ${params.scheme}/${params.schemeVersion} \ - ${sampleName} - """ -} - -process articRemoveUnmappedReads { - tag { sampleName } - - cpus 1 - - input: - tuple(sampleName, path(bamfile)) - - output: - tuple( sampleName, file("${sampleName}.mapped.sorted.bam")) - - script: - """ - samtools view -F4 -o ${sampleName}.mapped.sorted.bam ${bamfile} - """ -} - diff --git a/modules/help.nf b/modules/help.nf index b242995..afbedd3 100644 --- a/modules/help.nf +++ b/modules/help.nf @@ -2,79 +2,43 @@ def printHelp() { log.info""" Usage: - nextflow run connor-lab/ncov2019-artic-nf -profile (singularity,docker,conda) ( --illumina | --nanpolish | --medaka ) --prefix [prefix] [workflow-options] + nextflow run BCCDC-PHL/mpxv-artic-nf -profile conda --prefix [prefix] [workflow-options] Description: - Turn Nanopore or Illumina SARS-CoV2 sequencing reads generated by ARTIC tiling amplification protocols into consensus sequences. - - Nanopore: ARTIC (https://github.com/artic-network/fieldbioinformatics) - - Illumina: iVar (https://github.com/andersen-lab/ivar) + Turn Illumina monkeypox virus (mpxv) sequencing reads generated by ARTIC tiling amplification protocols into consensus sequences. All options set via CLI can be set in conf directory Nextflow arguments (single DASH): - -profile Allowed values: conda, singularity, docker, [COG-UK institutional profile] - - Mandatory workflow arguments (mutually exclusive): - --illumina Run the Illumina workflow - --nanopolish Run the Nanopore/Nanopolish workflow (https://github.com/jts/nanopolish) - --medaka Run the Nanopore/Medaka workflow (https://github.com/nanoporetech/medaka) - - Nanopore workflow options: - Mandatory: - --prefix A (unique) string prefix for output files. 
- Sequencing run name is a good choice e.g DDMMYY_MACHINEID_RUN_FLOWCELLID. - --basecalled_fastq The output directory from guppy_barcoder or guppy_basecaller - usually fastq_pass. - This can optionally contain barcodeXXX directories, which are - auto-detected and analysed in parallel. - --fast5_pass Directory containing fast5 files - usually fast5_pass. NOT REQUIRED FOR MEDAKA WORKFLOW. - --sequencing_summary Path to sequencing_summary.txt. NOT REQUIRED FOR MEDAKA WORKFLOW. - - Optional: - --outdir Output directory (Default: ./results) - - --schemeVersion ARTIC scheme version (Default: 'V2') - --schemeRepoURL Repo to download your primer scheme from (Default: 'https://github.com/artic-network/artic-ncov2019') - --schemeDir Directory within schemeRepoURL that contains primer schemes (Default: 'primer_schemes') - --scheme Scheme name (Default: 'nCoV-2019') + -profile Allowed values: conda - --min_length Minimum read length for artic guppyplex (Default: 400) - --max_length Maximum read length for artic guppyplex (Default: 700) - --bwa Use BWA for mapping Nanopore reads (Default: false, use Minimap2) - --outCram Output cram instead of bam files (Default: false) - --minReadsPerBarcode Minimum number of reads accepted for a single barcode when supplying deplexed Fastq - files as input. Barcodes having fewer reads are ignored. (Default: 100) - - Illumina workflow options: + Workflow options: Mandatory: - --prefix A (unique) string prefix for output files. - Sequencing run name is a good choice e.g DDMMYY_MACHINEID_RUN_FLOWCELLID. - --directory Path to a directory containing paired-end Illumina reads. - Reads will be found and paired RECURSIVELY beneath this directory. + --prefix A (unique) string prefix for output files. + Sequencing run name is a good choice e.g DDMMYY_MACHINEID_RUN_FLOWCELLID. + --directory Path to a directory containing paired-end Illumina reads. + Reads will be found and paired RECURSIVELY beneath this directory. Optional: - --outdir Output directory (Default: ./results) + --outdir Output directory (Default: ./results) - --schemeVersion ARTIC scheme version (Default: 'V2') - --schemeRepoURL Repo to download your primer scheme from (Default: 'https://github.com/artic-network/artic-ncov2019') - --schemeDir Directory within schemeRepoURL that contains primer schemes (Default: 'primer_schemes') - --scheme Scheme name (Default: 'nCoV-2019') + --schemeVersion ARTIC scheme version (Default: 'V2.3') + --schemeRepoURL Repo to download your primer scheme from (Default: 'https://github.com/BCCDC-PHL/artic-mpxv2022') + --schemeDir Directory within schemeRepoURL that contains primer schemes (Default: 'primer_schemes') + --scheme Scheme name (Default: 'mpxv-2022') - --bed Path to iVar-compatible bed file, also requires --ref - Overrides --scheme* options. (Default: unset, download scheme from git) - --ref Path to iVar-compatible reference fasta file, also requires --bed - Overrides --scheme* options. (Default: unset, download scheme from git) - - --allowNoprimer Allow reads that don't have primer sequence? 
- Depends on your library prep method: ligation == false, tagmentation == true (Default: true) - --illuminaKeepLen Length (bp) of reads to keep after primer trimming (Default: 20) - --illuminaQualThreshold Sliding window quality threshold for keeping - reads after primer trimming (Default: 20) - --mpileupDepth Mpileup depth (Default: 100000) - --ivarFreqThreshold iVar frequency threshold for consensus variant (ivar consensus -t, Default: 0.75) - --ivarMinDepth Minimum coverage depth to call variant (ivar consensus -m; ivar variants -m, Default: 10) - --ivarMinFreqThreshold iVar frequency threshold to call variant (ivar variants -t, Default: 0.25) - --ivarMinVariantQuality iVar minimum mapQ to call variant (ivar variants -q, Default: 20) + --bed Path to primer bed file, also requires --ref + Overrides --scheme* options. (Default: unset, download scheme from git) + --ref Path to iVar-compatible reference fasta file, also requires --bed + Overrides --scheme* options. (Default: unset, download scheme from git) + --primer_pairs_tsv File showing which primers are paired. + --keepLen Length (bp) of reads to keep after primer trimming (Default: 50) + --qualThreshold Sliding window quality threshold for keeping + reads after primer trimming (Default: 20) + --normalizationTargetDepth Target depth of coverage to normalize to. + --normalizationMinDepth Minimum depth of coverage to normalize to. + --varMinFreqThreshold Frequency threshold to call ambiguous variant (Default: 0.25) + --varFreqThreshold Frequency threshold for unambiguous variant (Default: 0.75) + --varMinDepth Minimum coverage depth to call variant (Default: 10) - --cram Input is CRAM files not fastq (Default: false) - --outCram Output cram instead of bam files (Default: false) """.stripIndent() } diff --git a/modules/illumina.nf b/modules/illumina.nf index 544c3ef..f042d14 100644 --- a/modules/illumina.nf +++ b/modules/illumina.nf @@ -1,3 +1,48 @@ +process performHostFilter { + + tag { sampleName } + + publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}_hostfiltered_R*.fastq.gz", mode: 'copy' + + input: + tuple val(sampleName), path(forward), path(reverse) + + output: + tuple val(sampleName), path("${sampleName}_hostfiltered_R1.fastq.gz"), path("${sampleName}_hostfiltered_R2.fastq.gz"), emit: fastqPairs + + script: + """ + bwa mem -t ${task.cpus} ${params.composite_ref} ${forward} ${reverse} | \ + filter_non_human_reads.py -c ${params.viral_contig_name} > ${sampleName}.viral_and_nonmapping_reads.bam + samtools sort -@ ${task.cpus} -n ${sampleName}.viral_and_nonmapping_reads.bam | \ + samtools fastq -1 ${sampleName}_hostfiltered_R1.fastq.gz -2 ${sampleName}_hostfiltered_R2.fastq.gz -s ${sampleName}_singletons.fastq.gz - + """ +} + +process normalizeDepth { + + tag { sampleName } + + publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: '*_norm_R{1,2}.fq.gz', mode: 'copy' + + input: + tuple val(sampleName), path(forward), path(reverse) + + output: + tuple val(sampleName), path("${sampleName}_norm_R1.fq.gz"), path("${sampleName}_norm_R2.fq.gz") + + script: + """ + bbnorm.sh \ + target=${params.normalizationTargetDepth} \ + mindepth=${params.normalizationMinDepth} \ + in=${forward} \ + in2=${reverse} \ + out=${sampleName}_norm_R1.fq.gz \ + out2=${sampleName}_norm_R2.fq.gz + """ +} + process readTrimming { /** * Trims paired fastq using trim_galore (https://github.com/FelixKrueger/TrimGalore) @@ -12,10 +57,10 @@ process readTrimming { cpus 1 input: - tuple(sampleName, 
path(forward), path(reverse)) + tuple val(sampleName), path(forward), path(reverse) output: - tuple(sampleName, path("*_val_1.fq.gz"), path("*_val_2.fq.gz")) optional true + tuple val(sampleName), path("*_val_1.fq.gz"), path("*_val_2.fq.gz"), optional: true script: """ @@ -42,10 +87,10 @@ process filterResidualAdapters { cpus 1 input: - tuple(sampleName, path(forward), path(reverse)) + tuple val(sampleName), path(forward), path(reverse) output: - tuple(sampleName, path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz")) + tuple val(sampleName), path("*1_posttrim_filter.fq.gz"), path("*2_posttrim_filter.fq.gz") script: """ @@ -61,16 +106,16 @@ process indexReference { tag { ref } input: - path(ref) + path(ref) output: - tuple path('ref.fa'), path('ref.fa.*') + tuple path('ref.fa'), path('ref.fa.*') script: - """ - ln -s ${ref} ref.fa - bwa index ref.fa - """ + """ + ln -s ${ref} ref.fa + bwa index ref.fa + """ } process readMapping { @@ -88,17 +133,17 @@ process readMapping { publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.sorted{.bam,.bam.bai}", mode: 'copy' input: - tuple sampleName, path(forward), path(reverse), path(ref), path("*") + tuple val(sampleName), path(forward), path(reverse), path(ref), path("*") output: - tuple(sampleName, path("${sampleName}.sorted.bam"), path("${sampleName}.sorted.bam.bai")) + tuple val(sampleName), path("${sampleName}.sorted.bam"), path("${sampleName}.sorted.bam.bai") script: - """ - bwa mem -t ${task.cpus} ${ref} ${forward} ${reverse} | \ - samtools sort -o ${sampleName}.sorted.bam - samtools index ${sampleName}.sorted.bam - """ + """ + bwa mem -t ${task.cpus} ${ref} ${forward} ${reverse} | \ + samtools sort -o ${sampleName}.sorted.bam + samtools index ${sampleName}.sorted.bam + """ } process trimPrimerSequences { @@ -109,72 +154,20 @@ process trimPrimerSequences { publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.mapped.primertrimmed.sorted{.bam,.bam.bai}", mode: 'copy' input: - tuple sampleName, path(bam), path(bam_index), path(bedfile) - - output: - tuple sampleName, path("${sampleName}.mapped.bam"), path("${sampleName}.mapped.bam.bai"), emit: mapped - tuple sampleName, path("${sampleName}.mapped.primertrimmed.sorted.bam"), path("${sampleName}.mapped.primertrimmed.sorted.bam.bai" ), emit: ptrim - - script: - if (params.allowNoprimer){ - ivarCmd = "ivar trim -e" - } else { - ivarCmd = "ivar trim" - } - """ - samtools view -F4 -o ${sampleName}.mapped.bam ${bam} - samtools index ${sampleName}.mapped.bam - ${ivarCmd} -i ${sampleName}.mapped.bam -b ${bedfile} -m ${params.illuminaKeepLen} -q ${params.illuminaQualThreshold} -f ${params.primer_pairs_tsv} -p ivar.out - samtools sort -o ${sampleName}.mapped.primertrimmed.sorted.bam ivar.out.bam - samtools index ${sampleName}.mapped.primertrimmed.sorted.bam - """ -} - -process callVariants { - - tag { sampleName } - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.variants.tsv", mode: 'copy' - - input: - tuple val(sampleName), path(bam), path(bam_index), path(ref), path(gff) - - output: - tuple val(sampleName), path("${sampleName}.variants.tsv") - - script: - def gff_arg = gff.name == 'NO_FILE' ? 
"" : "-g ${gff}" - """ - samtools faidx ${ref} - samtools mpileup -A -d 0 --reference ${ref} -B -Q 0 ${bam} |\ - ivar variants -r ${ref} -m ${params.varMinDepth} -p ${sampleName}.variants -q ${params.ivarMinVariantQuality} -t ${params.varMinFreqThreshold} ${gff_arg} - """ -} - -process makeConsensus { - - tag { sampleName } - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.primertrimmed.consensus.fa", mode: 'copy' - - input: - tuple(sampleName, path(bam), path(bam_index)) + tuple val(sampleName), path(bam), path(bam_index), path(bedfile) output: - tuple(sampleName, path("${sampleName}.primertrimmed.consensus.fa")) + tuple val(sampleName), path("${sampleName}.mapped.bam"), path("${sampleName}.mapped.bam.bai"), emit: mapped + tuple val(sampleName), path("${sampleName}.mapped.primertrimmed.sorted.bam"), path("${sampleName}.mapped.primertrimmed.sorted.bam.bai" ), emit: ptrim script: - """ - samtools mpileup -aa -A -B -d ${params.mpileupDepth} -Q0 ${bam} | \ - ivar consensus -t ${params.varFreqThreshold} -m ${params.varMinDepth} \ - -n N -p ${sampleName}.primertrimmed.consensus - # If the consensus sequence is empty, fill it with 29903 'N' characters. - if [[ \$(tail -n 1 ${sampleName}.primertrimmed.consensus.fa | tr -d '\n' | wc -c) == 0 ]] - then - mv ${sampleName}.primertrimmed.consensus.fa ${sampleName}.primertrimmed.consensus.no_seq.fa - cat <(head -n 1 ${sampleName}.primertrimmed.consensus.no_seq.fa) <(head -c 29903 < /dev/zero | tr '\\0' 'N') <(echo) > ${sampleName}.primertrimmed.consensus.fa - fi - """ + """ + samtools view -F4 -o ${sampleName}.mapped.bam ${bam} + samtools index ${sampleName}.mapped.bam + ivar trim -e -i ${sampleName}.mapped.bam -b ${bedfile} -m ${params.keepLen} -q ${params.qualThreshold} -f ${params.primer_pairs_tsv} -p ivar.out + samtools sort -o ${sampleName}.mapped.primertrimmed.sorted.bam ivar.out.bam + samtools index ${sampleName}.mapped.primertrimmed.sorted.bam + """ } process callConsensusFreebayes { @@ -192,66 +185,44 @@ process callConsensusFreebayes { tuple val(sampleName), path("${sampleName}.variants.norm.vcf"), emit: variants script: - """ - # the sed is to fix the header until a release is made with this fix - # https://github.com/freebayes/freebayes/pull/549 - freebayes -p 1 \ - -f ${ref} \ - -F 0.2 \ - -C 1 \ - --pooled-continuous \ - --min-coverage ${params.varMinDepth} \ - --gvcf --gvcf-dont-use-chunk true ${bam} | sed s/QR,Number=1,Type=Integer/QR,Number=1,Type=Float/ > ${sampleName}.gvcf - - # make depth mask, split variants into ambiguous/consensus - # NB: this has to happen before bcftools norm or else the depth mask misses any bases exposed during normalization - process_gvcf.py -d ${params.varMinDepth} \ - -l ${params.varMinFreqThreshold} \ - -u ${params.varFreqThreshold} \ - -m ${sampleName}.mask.txt \ - -v ${sampleName}.variants.vcf \ - -c ${sampleName}.consensus.vcf ${sampleName}.gvcf - - # normalize variant records into canonical VCF representation - for v in "variants" "consensus"; do - bcftools norm -f ${ref} ${sampleName}.\$v.vcf > ${sampleName}.\$v.norm.vcf - done - - # split the consensus sites file into a set that should be IUPAC codes and all other bases, using the ConsensusTag in the VCF - for vt in "ambiguous" "fixed"; do - cat ${sampleName}.consensus.norm.vcf | awk -v vartag=ConsensusTag=\$vt '\$0 ~ /^#/ || \$0 ~ vartag' > ${sampleName}.\$vt.norm.vcf - bgzip -f ${sampleName}.\$vt.norm.vcf - tabix -f -p vcf ${sampleName}.\$vt.norm.vcf.gz - done - - # apply ambiguous variants first using 
IUPAC codes. this variant set cannot contain indels or the subsequent step will break - bcftools consensus -f ${ref} -I ${sampleName}.ambiguous.norm.vcf.gz > ${sampleName}.ambiguous.fa - - # apply remaninng variants, including indels - bcftools consensus -f ${sampleName}.ambiguous.fa -m ${sampleName}.mask.txt ${sampleName}.fixed.norm.vcf.gz | sed s/${params.viral_contig_name}/${sampleName}/ > ${sampleName}.consensus.fa - """ -} - -process cramToFastq { - /** - * Converts CRAM to fastq (http://bio-bwa.sourceforge.net/) - * Uses samtools to convert to CRAM, to FastQ (http://www.htslib.org/doc/samtools.html) - * @input - * @output - */ - - input: - tuple sampleName, file(cram) - - output: - tuple sampleName, path("${sampleName}_1.fastq.gz"), path("${sampleName}_2.fastq.gz") - - script: - """ - samtools collate -u ${cram} -o tmp.bam - samtools fastq -1 ${sampleName}_1.fastq.gz -2 ${sampleName}_2.fastq.gz tmp.bam - rm tmp.bam - """ + """ + # the sed is to fix the header until a release is made with this fix + # https://github.com/freebayes/freebayes/pull/549 + freebayes -p 1 \ + -f ${ref} \ + -F 0.2 \ + -C 1 \ + --pooled-continuous \ + --min-coverage ${params.varMinDepth} \ + --gvcf --gvcf-dont-use-chunk true ${bam} | sed s/QR,Number=1,Type=Integer/QR,Number=1,Type=Float/ > ${sampleName}.gvcf + + # make depth mask, split variants into ambiguous/consensus + # NB: this has to happen before bcftools norm or else the depth mask misses any bases exposed during normalization + process_gvcf.py -d ${params.varMinDepth} \ + -l ${params.varMinFreqThreshold} \ + -u ${params.varFreqThreshold} \ + -m ${sampleName}.mask.txt \ + -v ${sampleName}.variants.vcf \ + -c ${sampleName}.consensus.vcf ${sampleName}.gvcf + + # normalize variant records into canonical VCF representation + for v in "variants" "consensus"; do + bcftools norm -f ${ref} ${sampleName}.\$v.vcf > ${sampleName}.\$v.norm.vcf + done + + # split the consensus sites file into a set that should be IUPAC codes and all other bases, using the ConsensusTag in the VCF + for vt in "ambiguous" "fixed"; do + cat ${sampleName}.consensus.norm.vcf | awk -v vartag=ConsensusTag=\$vt '\$0 ~ /^#/ || \$0 ~ vartag' > ${sampleName}.\$vt.norm.vcf + bgzip -f ${sampleName}.\$vt.norm.vcf + tabix -f -p vcf ${sampleName}.\$vt.norm.vcf.gz + done + + # apply ambiguous variants first using IUPAC codes. 
this variant set cannot contain indels or the subsequent step will break + bcftools consensus -f ${ref} -I ${sampleName}.ambiguous.norm.vcf.gz > ${sampleName}.ambiguous.fa + + # apply remaninng variants, including indels + bcftools consensus -f ${sampleName}.ambiguous.fa -m ${sampleName}.mask.txt ${sampleName}.fixed.norm.vcf.gz | sed s/${params.viral_contig_name}/${sampleName}/ > ${sampleName}.consensus.fa + """ } process alignConsensusToReference { @@ -265,50 +236,25 @@ process alignConsensusToReference { publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.consensus.aln.fa", mode: 'copy' input: - tuple(sampleName, path(consensus), path(reference)) + tuple val(sampleName), path(consensus), path(reference) output: - tuple(sampleName, path("${sampleName}.consensus.aln.fa")) + tuple val(sampleName), path("${sampleName}.consensus.aln.fa") script: - // Convert multi-line fasta to single line - awk_string = '/^>/ {printf("\\n%s\\n", $0); next; } { printf("%s", $0); } END { printf("\\n"); }' - """ - mafft \ - --preservecase \ - --keeplength \ - --add \ - ${consensus} \ - ${reference} \ - > ${sampleName}.with_ref.multi_line.alignment.fa - awk '${awk_string}' ${sampleName}.with_ref.multi_line.alignment.fa > ${sampleName}.with_ref.single_line.alignment.fa - tail -n 2 ${sampleName}.with_ref.single_line.alignment.fa > ${sampleName}.consensus.aln.fa - """ -} - -process trimUTRFromAlignment { - /** - * Trim the aligned consensus to remove 3' and 5' UTR sequences. - */ - - tag { sampleName } - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.consensus.aln.utr_trimmed.fa", mode: 'copy' - - input: - tuple(sampleName, path(alignment)) - - output: - tuple(sampleName, path("${sampleName}.consensus.aln.utr_trimmed.fa")) - - script: - awk_string = '/^>/ { printf("%s\\n", $0); next; } { printf("%s", $0); } END { printf("\\n"); }' - """ - echo -e "\$(head -n 1 ${alignment} | cut -c 2-):266-29674" > non_utr.txt - samtools faidx ${alignment} - samtools faidx -r non_utr.txt ${alignment} > ${sampleName}.consensus.aln.utr_trimmed.multi_line.fa - awk '${awk_string}' ${sampleName}.consensus.aln.utr_trimmed.multi_line.fa > ${sampleName}.consensus.aln.utr_trimmed.fa - """ + // Convert multi-line fasta to single line + awk_string = '/^>/ {printf("\\n%s\\n", $0); next; } { printf("%s", $0); } END { printf("\\n"); }' + """ + mafft \ + --preservecase \ + --keeplength \ + --add \ + ${consensus} \ + ${reference} \ + > ${sampleName}.with_ref.multi_line.alignment.fa + awk '${awk_string}' ${sampleName}.with_ref.multi_line.alignment.fa > ${sampleName}.with_ref.single_line.alignment.fa + tail -n 2 ${sampleName}.with_ref.single_line.alignment.fa > ${sampleName}.consensus.aln.fa + """ } process annotateVariantsVCF { @@ -333,4 +279,4 @@ process annotateVariantsVCF { bcftools csq -f ${ref} -g ${gff} ${vcf} -Ov -o ${sampleName}.variants.norm.consequence.vcf bcftools csq -f ${ref} -g ${gff} ${vcf} -Ot -o ${sampleName}.variants.norm.consequence.tsv """ -} \ No newline at end of file +} diff --git a/modules/out.nf b/modules/out.nf deleted file mode 100644 index 29426f3..0000000 --- a/modules/out.nf +++ /dev/null @@ -1,24 +0,0 @@ -process bamToCram { - /** - * Convert bam to cram using samtools with embeded reference, indexes the CRAM (http://www.htslib.org/doc/samtools.html) - * @input - * @output - */ - - tag { sampleName } - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${bam.baseName}.cram", mode: 'copy' - publishDir 
"${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${bam.baseName}.cram.crai", mode: 'copy' - - input: - tuple(sampleName, path(bam), path(ref)) - - output: - tuple sampleName, path("${bam.baseName}.cram"), emit: cramed - tuple sampleName, path("${bam.baseName}.cram.crai"), emit: cramedidx - - script: - """ - samtools view --write-index -C -O cram,embed_ref -T ${ref} -o ${bam.baseName}.cram ${bam} - """ -} diff --git a/modules/qc.nf b/modules/qc.nf index 6dd4f85..0fdcb74 100644 --- a/modules/qc.nf +++ b/modules/qc.nf @@ -1,26 +1,18 @@ process makeQCCSV { tag { sampleName } - //conda 'environments/extras.txt' - publishDir "${params.outdir}/qc_plots", pattern: "${sampleName}.depth.png", mode: 'copy' input: - tuple sampleName, path(bam), path(bam_index), path(fasta), path(ref) + tuple val(sampleName), path(bam), path(bam_index), path(fasta), path(ref), path(primer_bed), path(primer_pairs) output: path "${params.prefix}.${sampleName}.qc.csv", emit: csv path "${sampleName}.depth.png" script: - if ( params.illumina ) { - qcSetting = "--illumina" - } else { - qcSetting = "--nanopore" - } - """ - qc.py ${qcSetting} --outfile ${params.prefix}.${sampleName}.qc.csv --sample ${sampleName} --ref ${ref} --bam ${bam} --fasta ${fasta} + qc.py --outfile ${params.prefix}.${sampleName}.qc.csv --sample ${sampleName} --ref ${ref} --bam ${bam} --fasta ${fasta} --primer-bed ${primer_bed} --primer-pairs ${primer_pairs} """ } diff --git a/modules/upload.nf b/modules/upload.nf index da5f26c..2ea300b 100644 --- a/modules/upload.nf +++ b/modules/upload.nf @@ -1,10 +1,11 @@ process collateSamples { + tag { sampleName } publishDir "${params.outdir}/nml_upload/", pattern: "${params.prefix}/${sampleName}*", mode: 'copy' input: - tuple(sampleName, path(fasta), path(fastq_r1), path(fastq_r2)) + tuple val(sampleName), path(fasta), path(fastq_r1), path(fastq_r2) output: path("${params.prefix}/${sampleName}*") @@ -14,19 +15,3 @@ process collateSamples { mkdir ${params.prefix} && cp ${sampleName}* ${params.prefix} """ } - -process prepareUploadDirectory { - tag { params.prefix } - - input: - path("${params.prefix}/*") - - output: - path("${params.prefix}") - - script: - """ - echo "dummy" > dummyfile - """ -} - diff --git a/modules/utils.nf b/modules/utils.nf index c0655a8..c6cbc68 100644 --- a/modules/utils.nf +++ b/modules/utils.nf @@ -1,69 +1,39 @@ -process performHostFilter { - cpus 1 +process articDownloadScheme{ + tag params.schemeRepoURL - tag { sampleId } + label 'internet' - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleId}_hostfiltered_R*.fastq.gz", mode: 'copy' + publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "scheme", mode: "copy" - input: - tuple val(sampleId), path(forward), path(reverse) output: - tuple val(sampleId), path("${sampleId}_hostfiltered_R1.fastq.gz"), path("${sampleId}_hostfiltered_R2.fastq.gz"), emit: fastqPairs + path "scheme/**/${params.schemeVersion}/*.reference.fasta" , emit: reffasta + path "scheme/**/${params.schemeVersion}/${params.scheme}.bed" , emit: bed + path "scheme" , emit: scheme script: - """ - bwa mem -t ${task.cpus} ${params.composite_ref} ${forward} ${reverse} | \ - filter_non_human_reads.py -c ${params.viral_contig_name} > ${sampleId}.viral_and_nonmapping_reads.bam - samtools sort -n ${sampleId}.viral_and_nonmapping_reads.bam | \ - samtools fastq -1 ${sampleId}_hostfiltered_R1.fastq.gz -2 ${sampleId}_hostfiltered_R2.fastq.gz -s ${sampleId}_singletons.fastq.gz - - """ + """ + git clone 
${params.schemeRepoURL} scheme + """ } -process downsampleAmplicons { - cpus 1 - - tag { sampleId } - - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleId}.mapped.primertrimmed.downsampled.sorted{.bam,.bam.bai}", mode: 'copy' - - input: - tuple val(sampleId), path(trimmed_bam), path(trimmed_bam_index), path(bedfile), path(ref) - output: - tuple val(sampleId), path("${sampleId}.mapped.primertrimmed.downsampled.sorted.bam"), path("${sampleId}.mapped.primertrimmed.downsampled.sorted.bam.bai"), emit: alignment - path("downsampling_summary.csv"), emit: summary - - script: - """ - samtools faidx ${ref} - downsample_amplicons.py \ - --bed ${bedfile} \ - --min-depth ${params.downsampleMinDepth} \ - --mapping-quality ${params.downsampleMappingQuality} \ - --amplicon-subdivisions ${params.downsampleAmpliconSubdivisions} \ - --genome-size \$(cut -d \$'\\t' -f 2 ${ref}.fai) \ - ${trimmed_bam} 2> downsampling_summary_no_sample_id.csv | \ - samtools sort - -o ${sampleId}.mapped.primertrimmed.downsampled.sorted.bam - samtools index ${sampleId}.mapped.primertrimmed.downsampled.sorted.bam - paste -d ',' <(echo "sample_id" && echo "${sampleId}") downsampling_summary_no_sample_id.csv > downsampling_summary.csv - """ -} - -process downsampledBamToFastq { +process performHostFilter { cpus 1 - tag { sampleId } + tag { sampleName } - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleId}_hostfiltered_downsampled_R{1,2}.fastq.gz", mode: 'copy' + publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}_hostfiltered_R*.fastq.gz", mode: 'copy' input: - tuple val(sampleId), path(downsampled_bam), path(downsampled_bam_index) + tuple val(sampleName), path(forward), path(reverse) output: - tuple val(sampleId), path("${sampleId}_hostfiltered_downsampled_R1.fastq.gz"), path("${sampleId}_hostfiltered_downsampled_R2.fastq.gz"), emit: fastqPairs + tuple val(sampleName), path("${sampleName}_hostfiltered_R1.fastq.gz"), path("${sampleName}_hostfiltered_R2.fastq.gz"), emit: fastqPairs script: """ - samtools collate -u ${downsampled_bam} -o ${sampleId}.collate.bam ${sampleId} - samtools fastq -n -0 /dev/null -1 ${sampleId}_hostfiltered_downsampled_R1.fastq.gz -2 ${sampleId}_hostfiltered_downsampled_R2.fastq.gz -s ${sampleId}_hostfiltered_downsampled_S.fastq.gz ${sampleId}.collate.bam + bwa mem -t ${task.cpus} ${params.composite_ref} ${forward} ${reverse} | \ + filter_non_human_reads.py -c ${params.viral_contig_name} > ${sampleName}.viral_and_nonmapping_reads.bam + samtools sort -n ${sampleName}.viral_and_nonmapping_reads.bam | \ + samtools fastq -1 ${sampleName}_hostfiltered_R1.fastq.gz -2 ${sampleName}_hostfiltered_R2.fastq.gz -s ${sampleName}_singletons.fastq.gz - """ } @@ -74,16 +44,16 @@ process addCodonPositionToVariants { tag { sampleId } - publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleId}.variants.with_codon_pos.tsv", mode: 'copy' + publishDir "${params.outdir}/${task.process.replaceAll(":","_")}", pattern: "${sampleName}.variants.with_codon_pos.tsv", mode: 'copy' input: - tuple val(sampleId), path(variants), path(gff) + tuple val(sampleName), path(variants), path(gff) output: - tuple val(sampleId), path("${sampleId}.variants.with_codon_pos.tsv") + tuple val(sampleName), path("${sampleName}.variants.with_codon_pos.tsv") script: def gff_arg = gff.name == 'NO_FILE' ? 
"" : "-g ${gff}" """ - ivar_variants_add_codon_position.py ${gff_arg} ${variants} > ${sampleId}.variants.with_codon_pos.tsv + ivar_variants_add_codon_position.py ${gff_arg} ${variants} > ${sampleName}.variants.with_codon_pos.tsv """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 0586bce..006bce3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,69 +1,101 @@ -// Global default params, used in configs +manifest { + author = 'connor-lab: Matt Bull, Sendu Bala. BCCDC-PHL: Dan Fornika. oicr-gsi: Jared Simpson, Michael Laszloffy' + description = 'Nextflow for running the Artic mpxv pipeline' + mainScript = 'main.nf' + nextflowVersion = '>=20.01.0' + version = '0.1.0' +} + +def makeFastqSearchPath ( illuminaSuffixes, fastq_exts ) { + if ( params.directory ) { + def fastq_searchpath = [] + for (item in illuminaSuffixes){ + for(thing in fastq_exts){ + fastq_searchpath.add(params.directory.toString() + '/**' + item.toString() + thing.toString()) + } + } + return fastq_searchpath + } +} + params { - // Workflow flags - outdir = './results' + illuminaSuffixes = ['*_R{1,2}_001', '*_R{1,2}', '*_{1,2}' ] + fastq_exts = ['.fastq.gz', '.fq.gz'] + fastqSearchPath = makeFastqSearchPath( params.illuminaSuffixes, params.fastq_exts ) // Boilerplate options + directory = false + prefix = false + ref = false + bed = false + gff = 'NO_FILE' + primer_pairs_tsv = 'NO_FILE' + profile = false help = false + outdir = './results' tracedir = "${params.outdir}/pipeline_info" + + // host filtering + composite_ref = false + viral_contig_name = 'BCCDCmpx2' - // cache option makes it a bit easier to set conda or singularity cacheDir - cache = '' -} + // Repo to download your primer scheme from + schemeRepoURL = 'https://github.com/BCCDC-PHL/artic-mpxv2022.git' -// Load base.config by default for all pipelines -includeConfig 'conf/base.config' + // Directory within schemeRepoURL that contains primer schemes + schemeDir = 'primer_schemes' + // Scheme name + scheme = 'mpxv-2022' -if ( params.medaka || params.nanopolish ){ - includeConfig 'conf/nanopore.config' -} + // Scheme version + schemeVersion = 'V2.4' -if ( params.illumina ){ - includeConfig 'conf/illumina.config' -} + // Target depth for bbnorm kmer-based depth normalization + normalizationTargetDepth = 200 + + // Minimum depth for bbnorm kmer-based depth normalization + normalizationMinDepth = 5 + // Length of reads to keep after primer trimming + keepLen = 50 + + // Sliding window quality threshold for keeping reads after primer trimming (illumina) + qualThreshold = 20 + + // frequency threshold to call ambiguous variant + varMinFreqThreshold = 0.25 + + // frequency threshold for unambiguous variant + varFreqThreshold = 0.75 + + // Minimum coverage depth to call variant + varMinDepth = 10 + +} profiles { conda { - if ( params.medaka || params.nanopolish ) { - process.conda = "$baseDir/environments/nanopore/environment.yml" - - } else if (params.illumina) { - process.conda = "$baseDir/environments/illumina/environment.yml" - } - if (params.cache){ + process.conda = "$baseDir/environments/environment.yml" + if (params.cache) { conda.cacheDir = params.cache } - includeConfig 'conf/conda.config' - } - docker { - docker.enabled = true - fixOwnership = true - runOptions = "-u \$(id -u):\$(id -g)" - } - singularity { - singularity.enabled = true - singularity.autoMounts = true - - if ( params.medaka || params.nanopolish ){ - process.container = "file:///${baseDir}/artic-ncov2019-nanopore.sif" - } else if (params.illumina) { - 
process.container = "file:///${baseDir}/artic-ncov2019-illumina.sif" - } - if (params.cache){ - singularity.cacheDir = params.cache - } - } - slurm { - process.executor = 'slurm' } } // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] +process { + withName: makeQCCSV { + conda = "$baseDir/environments/extras.yml" + } + withName: performHostFilter { + cpus = 4 + } +} + timeline { enabled = false file = "${params.tracedir}/execution_timeline.html" @@ -80,12 +112,3 @@ dag { enabled = false file = "${params.tracedir}/pipeline_dag.svg" } - -manifest { - author = 'connor-lab: Matt Bull, Sendu Bala. BCCDC-PHL: Dan Fornika. oicr-gsi: Jared Simpson, Michael Laszloffy' - description = 'Nextflow for running the Artic ncov2019 pipeline' - mainScript = 'main.nf' - nextflowVersion = '>=20.01.0' - version = '1.3.2' -} - diff --git a/workflows/articNcovNanopore.nf b/workflows/articNcovNanopore.nf deleted file mode 100644 index 2eebf23..0000000 --- a/workflows/articNcovNanopore.nf +++ /dev/null @@ -1,130 +0,0 @@ -// ARTIC ncov workflow - -// enable dsl2 -nextflow.preview.dsl = 2 - -// import modules -include {articDownloadScheme} from '../modules/artic.nf' -include {articGuppyPlex} from '../modules/artic.nf' -include {articMinIONNanopolish} from '../modules/artic.nf' -include {articMinIONMedaka} from '../modules/artic.nf' -include {articRemoveUnmappedReads} from '../modules/artic.nf' - -include {makeQCCSV} from '../modules/qc.nf' -include {writeQCSummaryCSV} from '../modules/qc.nf' - -include {bamToCram} from '../modules/out.nf' - -include {collateSamples} from '../modules/upload.nf' - -// workflow component for artic pipeline -workflow sequenceAnalysisNanopolish { - take: - ch_runFastqDirs - ch_fast5Pass - ch_seqSummary - - main: - articDownloadScheme() - - articGuppyPlex(ch_runFastqDirs.flatten()) - - articMinIONNanopolish(articGuppyPlex.out.fastq - .combine(articDownloadScheme.out.scheme) - .combine(ch_fast5Pass) - .combine(ch_seqSummary)) - - articRemoveUnmappedReads(articMinIONNanopolish.out.mapped) - - makeQCCSV(articMinIONNanopolish.out.ptrim - .join(articMinIONNanopolish.out.consensus_fasta, by: 0) - .combine(articDownloadScheme.out.reffasta)) - - makeQCCSV.out.csv.splitCsv() - .unique() - .branch { - header: it[-1] == 'qc_pass' - fail: it[-1] == 'FALSE' - pass: it[-1] == 'TRUE' - } - .set { qc } - - writeQCSummaryCSV(qc.header.concat(qc.pass).concat(qc.fail).toList()) - - collateSamples(qc.pass.map{ it[0] } - .join(articMinIONNanopolish.out.consensus_fasta, by: 0) - .join(articRemoveUnmappedReads.out)) - - if (params.outCram) { - bamToCram(articMinIONNanopolish.out.ptrim.map{ it[0] } - .join (articDownloadScheme.out.reffasta.combine(ch_preparedRef.map{ it[0] })) ) - - } - - - emit: - qc_pass = collateSamples.out - -} - -workflow sequenceAnalysisMedaka { - take: - ch_runFastqDirs - - main: - articDownloadScheme() - - articGuppyPlex(ch_runFastqDirs.flatten()) - - articMinIONMedaka(articGuppyPlex.out.fastq - .combine(articDownloadScheme.out.scheme)) - - articRemoveUnmappedReads(articMinIONMedaka.out.mapped) - - makeQCCSV(articMinIONMedaka.out.ptrim.join(articMinIONMedaka.out.consensus_fasta, by: 0) - .combine(articDownloadScheme.out.reffasta)) - - makeQCCSV.out.csv.splitCsv() - .unique() - .branch { - header: it[-1] == 'qc_pass' - fail: it[-1] == 'FALSE' - pass: it[-1] == 'TRUE' - } - .set { qc } - - writeQCSummaryCSV(qc.header.concat(qc.pass).concat(qc.fail).toList()) - - collateSamples(qc.pass.map{ it[0] } - 
.join(articMinIONMedaka.out.consensus_fasta, by: 0) - .join(articRemoveUnmappedReads.out)) - - if (params.outCram) { - bamToCram(articMinIONMedaka.out.ptrim.map{ it[0] } - .join (articDownloadScheme.out.reffasta.combine(ch_preparedRef.map{ it[0] })) ) - - } - emit: - qc_pass = collateSamples.out - -} - - -workflow articNcovNanopore { - take: - ch_fastqDirs - - main: - if ( params.nanopolish ) { - Channel.fromPath( "${params.fast5_pass}" ) - .set{ ch_fast5Pass } - - Channel.fromPath( "${params.sequencing_summary}" ) - .set{ ch_seqSummary } - - sequenceAnalysisNanopolish(ch_fastqDirs, ch_fast5Pass, ch_seqSummary) - - } else if ( params.medaka ) { - sequenceAnalysisMedaka(ch_fastqDirs) - } -} diff --git a/workflows/illuminaNcov.nf b/workflows/illuminaMpxv.nf similarity index 55% rename from workflows/illuminaNcov.nf rename to workflows/illuminaMpxv.nf index 0687e49..256a1a3 100644 --- a/workflows/illuminaNcov.nf +++ b/workflows/illuminaMpxv.nf @@ -1,32 +1,22 @@ #!/usr/bin/env nextflow -// enable dsl2 -nextflow.preview.dsl = 2 +nextflow.enable.dsl = 2 -// import modules -include {articDownloadScheme } from '../modules/artic.nf' -include {readTrimming} from '../modules/illumina.nf' -include {filterResidualAdapters} from '../modules/illumina.nf' +include {articDownloadScheme } from '../modules/utils.nf' +include {performHostFilter} from '../modules/illumina.nf' +include {normalizeDepth} from '../modules/illumina.nf' +include {readTrimming} from '../modules/illumina.nf' +include {filterResidualAdapters} from '../modules/illumina.nf' include {indexReference} from '../modules/illumina.nf' -include {readMapping} from '../modules/illumina.nf' +include {readMapping} from '../modules/illumina.nf' include {trimPrimerSequences} from '../modules/illumina.nf' include {callConsensusFreebayes} from '../modules/illumina.nf' include {annotateVariantsVCF} from '../modules/illumina.nf' -include {callVariants} from '../modules/illumina.nf' -include {makeConsensus} from '../modules/illumina.nf' -include {cramToFastq} from '../modules/illumina.nf' include {alignConsensusToReference} from '../modules/illumina.nf' -include {trimUTRFromAlignment} from '../modules/illumina.nf' -include {performHostFilter} from '../modules/utils.nf' -include {downsampleAmplicons} from '../modules/utils.nf' -include {downsampledBamToFastq} from '../modules/utils.nf' -include {addCodonPositionToVariants} from '../modules/utils.nf' include {makeQCCSV} from '../modules/qc.nf' include {writeQCSummaryCSV} from '../modules/qc.nf' -include {bamToCram} from '../modules/out.nf' - include {collateSamples} from '../modules/upload.nf' workflow prepareReferenceFiles { @@ -70,21 +60,20 @@ workflow prepareReferenceFiles { /* If bedfile is supplied, use that, if not, get it from ARTIC github repo */ - if (params.bed ) { - Channel.fromPath(params.bed) - .set{ ch_bedFile } - + if (params.bed) { + ch_bedFile = Channel.fromPath(params.bed) } else { - articDownloadScheme.out.bed - .set{ ch_bedFile } + ch_bedFile = articDownloadScheme.out.bed } - Channel.fromPath(params.gff) - .set{ ch_gff } + ch_gff = Channel.fromPath(params.gff) + + ch_primerPairs = Channel.fromPath(params.primer_pairs_tsv) emit: bwaindex = ch_preparedRef bedfile = ch_bedFile + primer_pairs = ch_primerPairs gff = ch_gff } @@ -94,13 +83,16 @@ workflow sequenceAnalysis { ch_filePairs ch_preparedRef ch_bedFile + ch_primerPairs ch_gff main: performHostFilter(ch_filePairs) - readTrimming(performHostFilter.out) + normalizeDepth(performHostFilter.out) + + readTrimming(normalizeDepth.out) 
filterResidualAdapters(readTrimming.out) @@ -108,30 +100,18 @@ workflow sequenceAnalysis { trimPrimerSequences(readMapping.out.combine(ch_bedFile)) - downsampleAmplicons(trimPrimerSequences.out.ptrim.combine(ch_bedFile).combine(Channel.fromPath(params.ref))) - - downsampledBamToFastq(downsampleAmplicons.out.alignment) - - callVariants(downsampleAmplicons.out.alignment.combine(ch_preparedRef.map{ it[0] }).combine(ch_gff)) - - addCodonPositionToVariants(callVariants.out.combine(ch_gff)) - - callConsensusFreebayes(downsampleAmplicons.out.alignment.combine(ch_preparedRef.map{ it[0] })) + callConsensusFreebayes(trimPrimerSequences.out.ptrim.combine(ch_preparedRef.map{ it[0] })) if (params.gff != 'NO_FILE') { annotateVariantsVCF(callConsensusFreebayes.out.variants.combine(ch_preparedRef.map{ it[0] }).combine(ch_gff)) } - - makeConsensus(downsampleAmplicons.out.alignment) alignConsensusToReference(callConsensusFreebayes.out.consensus.combine(ch_preparedRef.map{ it[0] })) - trimUTRFromAlignment(alignConsensusToReference.out) - - makeQCCSV(downsampleAmplicons.out.alignment.join(callConsensusFreebayes.out.consensus, by: 0) - .combine(ch_preparedRef.map{ it[0] })) - - downsampleAmplicons.out.summary.collectFile(keepHeader: true, sort: { it.text }, name: "${params.prefix}.downsampling.csv", storeDir: "${params.outdir}") + makeQCCSV(trimPrimerSequences.out.ptrim.join(callConsensusFreebayes.out.consensus, by: 0) + .combine(ch_preparedRef.map{ it[0] }) + .combine(ch_bedFile) + .combine(ch_primerPairs)) makeQCCSV.out.csv.splitCsv() .unique() @@ -144,38 +124,19 @@ workflow sequenceAnalysis { writeQCSummaryCSV(qc.header.concat(qc.pass).concat(qc.fail).toList()) - collateSamples(callConsensusFreebayes.out.consensus.join(downsampledBamToFastq.out.fastqPairs)) - - if (params.outCram) { - bamToCram(trimPrimerSequences.out.mapped.map{ it[0] } - .join (trimPrimerSequences.out.ptrim.combine(ch_preparedRef.map{ it[0] })) ) - - } + collateSamples(callConsensusFreebayes.out.consensus.join(trimPrimerSequences.out.mapped)) emit: qc_pass = collateSamples.out } -workflow ncovIllumina { +workflow mpxvIllumina { take: ch_filePairs main: - // Build or download fasta, index and bedfile as required prepareReferenceFiles() - - // Actually do analysis - sequenceAnalysis(ch_filePairs, prepareReferenceFiles.out.bwaindex, prepareReferenceFiles.out.bedfile, prepareReferenceFiles.out.gff) + sequenceAnalysis(ch_filePairs, prepareReferenceFiles.out.bwaindex, prepareReferenceFiles.out.bedfile, prepareReferenceFiles.out.primer_pairs, prepareReferenceFiles.out.gff) } -workflow ncovIlluminaCram { - take: - ch_cramFiles - main: - // Convert CRAM to fastq - cramToFastq(ch_cramFiles) - - // Run standard pipeline - ncovIllumina(cramToFastq.out) -}
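
---

Reviewer note on the consensus-calling thresholds: the defaults added to `nextflow.config` (`varMinDepth = 10`, `varMinFreqThreshold = 0.25`, `varFreqThreshold = 0.75`) are the values `callConsensusFreebayes` passes to `process_gvcf.py` as `-d`, `-l` and `-u`. The sketch below is a minimal, hypothetical illustration of how those defaults partition a site, assuming the config comments ("Minimum coverage depth to call variant", "frequency threshold to call ambiguous variant", "frequency threshold for unambiguous variant") describe the behaviour accurately. It is not code from this repository, and the exact boundary conditions used by `process_gvcf.py` may differ.

```python
# Hypothetical illustration only -- not part of this repository.
# Mirrors the defaults added in nextflow.config:
#   varMinDepth = 10, varMinFreqThreshold = 0.25, varFreqThreshold = 0.75
# which callConsensusFreebayes passes to process_gvcf.py as -d / -l / -u.

def classify_site(depth, alt_fraction,
                  min_depth=10, ambiguous=0.25, fixed=0.75):
    """Rough sketch of how a consensus base could be decided at one site."""
    if depth < min_depth:
        return "N (depth mask)"          # too little coverage to call
    if alt_fraction >= fixed:
        return "fixed variant"           # written into the consensus as-is
    if alt_fraction >= ambiguous:
        return "ambiguous (IUPAC code)"  # applied with `bcftools consensus -I`
    return "reference base"


if __name__ == "__main__":
    for depth, frac in [(4, 0.90), (50, 0.10), (50, 0.40), (50, 0.85)]:
        print(f"depth={depth:<3} alt_fraction={frac:.2f} -> {classify_site(depth, frac)}")
```

Under this reading, sites in the ambiguous band are the ones applied first with `bcftools consensus -I` (IUPAC codes), and the fixed set is applied in the second `bcftools consensus` pass together with the depth mask, as in the `callConsensusFreebayes` script above.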