def get_csqs(record, csq_columns):
    '''
    Get new INFO field results.

    Splits the CSQ annotation of ``record`` into one dict per ALT allele:
    the i-th CSQ entry is taken for the i-th ALT, and its '|'-separated
    values are paired with the names in ``csq_columns``.

    Returns a dict keyed by ALT index. It is empty when the record has no
    ALT alleles.
    '''
    alts = record.alts or ()  # no ALT alleles -> nothing to annotate
    csq_dicts = {}
    for i in range(len(alts)):
        try:
            csq_line = record.info['CSQ'][i]
        except UnicodeDecodeError:  # for names with accents and other unexpected characters (rare)
            # Fall back to the raw record text: INFO is the 8th tab-separated
            # column; isolate the CSQ value, then its i-th comma-separated entry.
            info_field = str(record).split('\t')[7]
            csq_value = info_field.split('CSQ=')[1].split(';')[0]
            csq_line = csq_value.split(',')[i]
        csq_dicts[i] = dict(zip(csq_columns, csq_line.split('|')))
    return csq_dicts


def read_vcf(vcf_file):
    '''
    Read in annotated VCF file.
    '''
    bcf_in = pysam.VariantFile(vcf_file)  # auto-detect input format
    return bcf_in


def read_cosmic(cosmic_resistance_file):
    '''
    Read in the tab-separated COSMIC resistance mutations table.
    '''
    cosmic_resistance = pd.read_csv(cosmic_resistance_file, sep='\t')
    return cosmic_resistance


def get_record(match_Mutation, key, region_cosmic_resistance, match_key):
    '''
    Get value for field from the COSMIC resistance records.

    Returns the ``key`` value of the first row whose ``match_key`` column
    equals ``match_Mutation``, or '' when no row matches or either column
    is missing.
    '''
    try:
        matching_rows = region_cosmic_resistance[region_cosmic_resistance[match_key] == match_Mutation]
        value = matching_rows[key].values[0]
    except (KeyError, IndexError):
        value = ''
    return value
+ ''' + # ======================= + # Get VEP annotation + # ======================= + csq_dicts = get_csqs(record, csq_columns) + cosmic_resistance_annotation = {} + for i, alt in enumerate(record.alts): + cosmic_resistance_annotation[i] = {} + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = '' + # ======================= + # Check for id match + # ======================= + kind = 'CosmicCoding' + id = csq_dicts[i][kind] + legacy_id = csq_dicts[i][kind +'_LEGACY_ID'] + match = cosmic_resistance[cosmic_resistance.GENOMIC_MUTATION_ID == id].copy() + legacy_match = cosmic_resistance[cosmic_resistance.LEGACY_MUTATION_ID == legacy_id].copy() + # ID Mutation + if not match.empty: + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = match[key].tolist()[0] + + elif not legacy_match.empty: + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = legacy_match[key].tolist()[0] + else: + # All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.

# Version: 0.1 (2018-12-06)
# Author: Kanika Arora (karora@nygenome.org) All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.

# Version: 0.1 (2018-12-06)
def _coerce_count(value, label):
    '''
    Return ``value`` as an int.

    Only ints and canonical integer strings (e.g. "12") are accepted;
    anything else raises ValueError naming ``label``.
    '''
    if isinstance(value, int):
        return value
    try:
        # The round trip rejects floats, padded strings, "." placeholders, etc.
        if str(int(value)) == value:
            return int(value)
    except (TypeError, ValueError):
        pass
    raise ValueError("{0} should be an integer. {1} provided. Cannot run".format(label, value))


def compute_vaf(alt_count,dp):
    '''
    Compute VAF from dp and alt_count.

    Both arguments may be ints or integer strings. Returns 0 when dp is 0,
    otherwise alt_count/dp rounded to 4 decimals.

    Raises ValueError when either value is not an integer or when
    alt_count exceeds dp.
    '''
    alt_count = _coerce_count(alt_count, "alt_count")
    dp = _coerce_count(dp, "dp")
    if alt_count > dp:
        raise ValueError("alt_count {0} is greater than depth {1}.".format(alt_count,dp))
    return (0 if dp==0 else round(float(alt_count)/dp,4))


def parse_format_return_allele_counts(ref,alt,format_dict,caller):
    '''
    Return allele counts, depth and allele fraction as reported by the given caller.
    If these fields are not present as is, they are computed from other fields from that caller.

    ``format_dict`` maps FORMAT keys (prefixed with the caller name) to the
    sample's string values. Returns the tuple (AD, DP, AF) of strings; a
    field the caller does not provide is returned as ".".
    '''
    AD = format_dict.get(caller+"_AD", ".")
    DP = format_dict.get(caller+"_DP", ".")
    AF = format_dict.get(caller+"_AF", ".")
    if caller == "lancet":
        # lancet reports AD and DP; AF is derived from the alt count
        AF=str(compute_vaf(AD.split(",")[1],DP))
    if caller == "strelka2":
        ## compute as suggested in https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic
        if len(ref)==1 and len(alt)==1:
            ## SNV: first value of the per-base <base>U fields
            AD_ref=format_dict["strelka2_"+ref+"U"].split(",")[0]
            AD_alt=format_dict["strelka2_"+alt+"U"].split(",")[0]
            DP=str(sum(int(format_dict["strelka2_"+base+"U"].split(",")[0]) for base in "ACGT"))
        else:
            ## INDEL: first value of TAR (ref) and TIR (alt)
            AD_ref=format_dict["strelka2_TAR"].split(",")[0]
            AD_alt=format_dict["strelka2_TIR"].split(",")[0]
            DP=str(int(AD_ref)+int(AD_alt))
        AD=AD_ref+","+AD_alt
        AF=str(compute_vaf(AD_alt,DP))
    return (AD,DP,AF)



def __main__():
    '''
    Pick final AD/DP/AF values for each variant from the highest-priority
    caller that called it and append them to the normal (column 10) and
    tumor (column 11) sample columns of the VCF.
    '''
    parser = ArgumentParser(prog='add_final_allele_counts',
                            description='Picks final values for AD and DP based on set caller priority.', epilog='',
                            formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=100, width=150))
    parser.add_argument('-v', '--vcf', help = 'SNV VCF file.', required=True)
    parser.add_argument('-o', '--output', help = 'Output VCF file.', required=True)
    parser.add_argument('-p', '--priority', help = 'Comma-separated prioritized list of sources (callers) for picking final allele counts for the variants.', default='nygc,strelka2,mutect2,lancet')
    args=parser.parse_args()
    VCF=args.vcf
    OUT=args.output
    check_if_exists(VCF)
    CALLER_PRIORITY=args.priority.split(",")
    info_pattern=re.compile(r"(\w+)=([^;]+);*")
    seen=0
    # Context managers close both files even if a record fails to parse
    with open(VCF) as f, open(OUT,"w") as o:
        for line in f:
            if line.startswith("#"):
                o.write(line)
                if line.startswith("##FORMAT") and seen==0:
                    # New header definitions go right after the first ##FORMAT line.
                    # NOTE(review): these literals carry no <ID=...> payload and look
                    # truncated -- confirm against the upstream script.
                    o.write('##INFO=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    seen=1
                continue
            toks=line.strip().split("\t")
            info_dict = dict(info_pattern.findall(toks[7]))
            format_keys=toks[8].split(":")
            normal_format=dict(zip(format_keys,toks[9].split(":")))
            tumor_format=dict(zip(format_keys,toks[10].split(":")))
            if 'called_by' in info_dict:
                called_by=info_dict["called_by"].split(",")
            else:
                called_by = []
            if "nygc_AD" in normal_format:
                # pileup-based counts are present, so "nygc" is a valid source
                called_by.append("nygc")
            chosen_caller=""
            for caller in CALLER_PRIORITY:
                if caller in called_by:
                    chosen_caller=caller
                    break
            if chosen_caller=="":
                # no prioritized source called this variant: pass it through unchanged
                o.write(line)
                continue
            toks[7]=toks[7]+";AlleleCountSource="+chosen_caller
            (normal_AD,normal_DP,normal_AF)=parse_format_return_allele_counts(toks[3],toks[4],normal_format,chosen_caller)
            (tumor_AD,tumor_DP,tumor_AF)=parse_format_return_allele_counts(toks[3],toks[4],tumor_format,chosen_caller)
            toks[8]=toks[8]+":AD:DP:AF"
            toks[9]=toks[9]+':{0}:{1}:{2}'.format(normal_AD,normal_DP,normal_AF)
            toks[10]=toks[10]+':{0}:{1}:{2}'.format(tumor_AD,tumor_DP,tumor_AF)
            o.write("\t".join(toks)+"\n")

if __name__ == "__main__":
    __main__()
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.

# Version: 0.5 (2018-09-05)
# Author: Kanika Arora (karora@nygenome.org)
##################### COPYRIGHT ################################################
################################################################################

#### PLEASE NOTE THAT THIS SCRIPT EXPECTS THE FORMAT COLUMN FOR THE NORMAL SAMPLE
#### TO BE COLUMN 10 AND TUMOR SAMPLE TO BE COLUMN 11
#### IT WILL ADD INCORRECT ALLELE COUNTS IF THAT ORDER OF SAMPLES IS NOT TRUE

import sys
import argparse
import os
import re

header=dict()

def add_new_header_field(KEY, VALUE):
    '''
    Append a "##KEY=VALUE" line to the module-level ``header`` entry for KEY.
    '''
    header[KEY]=header.get(KEY,'')+"##{0}={1}\n".format(KEY,VALUE)

class ArgumentParser(argparse.ArgumentParser):
    '''
    Argument parser that prints the full help text before the error message.
    '''
    def error(self, message):
        self.print_help(sys.stderr)
        self.exit(2, '\nERROR: %s\n\n' % (message))

def check_if_exists(file_or_dir_path, type="file"):
    '''
    Exit with status 1 (after printing an error) if the path is missing.

    ``type`` selects the check: "file", "directory", or anything else for
    a plain existence test.
    '''
    if type=="file":
        if not os.path.isfile(file_or_dir_path):
            print("ERROR: Required file {0} does not exist. Cannot run.".format(file_or_dir_path))
            sys.exit(1)
    elif type=="directory":
        if not os.path.isdir(file_or_dir_path):
            # message fixed: this branch checks a directory, not a file
            print("ERROR: Required directory {0} does not exist. Cannot run.".format(file_or_dir_path))
            sys.exit(1)
    else:
        if not os.path.exists(file_or_dir_path):
            print("ERROR: {0} does not exist. Cannot run.".format(file_or_dir_path))
            sys.exit(1)

def _coerce_count(value, label):
    '''
    Return ``value`` as an int.

    Only ints and canonical integer strings (e.g. "12") are accepted;
    anything else raises ValueError naming ``label``.
    '''
    if isinstance(value,int):
        return value
    try:
        # The round trip rejects floats, padded strings, "." placeholders, etc.
        if str(int(value)) == value:
            return int(value)
    except (TypeError, ValueError):
        pass
    raise ValueError("{0} should be an integer. {1} provided. Cannot run".format(label, value))

def compute_vaf(alt_count,dp):
    '''
    Compute VAF from dp and alt_count.

    Returns 0 when dp is 0, otherwise alt_count/dp rounded to 4 decimals.
    Raises ValueError when either value is not an integer or when
    alt_count exceeds dp.
    '''
    alt_count = _coerce_count(alt_count, "alt_count")
    dp = _coerce_count(dp, "dp")
    if alt_count > dp:
        raise ValueError("alt_count {0} is greater than depth {1}.".format(alt_count,dp))
    return (0 if dp==0 else round(float(alt_count)/dp,4))


def infer_variant_type(ref, alt):
    '''
    Infers whether the variant is a SNV,MNV,INDEL or COMPLEX (delin).
    '''
    if len(ref) == len(alt):
        return "SNV" if len(ref)==1 else "MNV"
    if (len(ref) == 1 or len(alt) == 1) and ref[0] == alt[0]:
        ## VCF file has anchor bases for indels
        return "INDEL"
    return "COMPLEX"

def is_too_long(ref, alt, variant_type, MAX_INDEL_LEN):
    '''
    Test if an INDEL or COMPLEX event is too long for computing allele counts using NYGC's pileup method given the length cut off.
    '''
    exceeds_cutoff = len(ref) > MAX_INDEL_LEN or len(alt) > MAX_INDEL_LEN
    return exceeds_cutoff and variant_type in ("INDEL", "COMPLEX")


def read_pileup_return_count(samfile, chr, pos, ref,
                             alt, variant_type, MIN_MQ=10, MIN_BQ=10,
                             testing=False):
    '''
    Pile up reads at a variant position and count ref/alt support.

    Args:
        samfile: alignment file object providing ``pileup(chr, start, end)``.
        chr, pos: contig name and 1-based variant position.
        ref, alt: alleles as written in the VCF.
        variant_type: "SNV", "MNV", "INDEL" or "COMPLEX".
        MIN_MQ, MIN_BQ: reads below either threshold are ignored.
        testing: if True, exit as soon as a duplicate or QC-fail read is seen.

    Returns:
        A 10-tuple: "ref,alt" counts, depth, VAF, then "ref,alt" counts for
        F1R2, F2R1, forward, reverse, properly paired and not properly
        paired reads (all strings), and the PossiblyComplex boolean.
    '''
    ref_reads = []
    alt_reads = []
    other_reads = []
    f1r2_reads = []
    f2r1_reads = []
    properly_paired_reads = []
    fwd_reads = []
    rev_reads = []
    possible_complex=False
    anchor_mismatch=0
    for pileupcolumn in samfile.pileup(chr, pos - 1, pos):
        if pileupcolumn.pos != pos - 1:
            continue
        for pileupread in pileupcolumn.pileups:
            pos_in_read = pileupread.query_position
            # The read has no base at this position (e.g. a deletion): skip it.
            # Compare against None explicitly: query position 0 (variant at the
            # first base of the read) is valid and must not be dropped.
            if pos_in_read is None:
                continue
            aln = pileupread.alignment
            seq = aln.query_sequence
            # ==========================
            # pysam filters secondary, dup, and qcfail by default unless nofilter is used
            # ==========================
            BQ = aln.query_qualities[pos_in_read]
            MQ = aln.mapping_quality
            if testing:
                if aln.is_duplicate:
                    print('is_duplicate')
                    sys.exit(0)
                if aln.is_qcfail:
                    print('is_qcfail')
                    sys.exit(0)
            if BQ < MIN_BQ or MQ < MIN_MQ or aln.is_supplementary:
                continue
            read_name = aln.query_name
            # Check strand of reads
            if aln.is_reverse is False:
                fwd_reads.append(read_name)
            else:
                rev_reads.append(read_name)
            # Check if properly paired
            if aln.is_proper_pair is True:
                properly_paired_reads.append(read_name)
                # If properly paired, check if read-pair in F1R2 or F2R1 orientation
                # NOTE(review): nesting under the proper-pair check is inferred from
                # the comment above -- confirm against the upstream script.
                if aln.is_reverse is False and aln.is_read1:
                    f1r2_reads.append(read_name)
                else:
                    f2r1_reads.append(read_name)
            # filter reads that don't span the indel
            if pos_in_read + len(ref) > aln.query_alignment_end \
                    or pos_in_read + len(alt) > aln.query_alignment_end:
                continue

            ## Check if read has ref allele or alt allele
            if variant_type == "SNV" or variant_type == "MNV":
                if seq[pos_in_read:pos_in_read + len(ref)] == ref:
                    ref_reads.append(read_name)
                elif seq[pos_in_read:pos_in_read + len(alt)] == alt:
                    alt_reads.append(read_name)
                    if pileupread.indel != 0:
                        anchor_mismatch+=1
                else:
                    other_reads.append(read_name)
            elif variant_type == "COMPLEX":
                ### Exact length and sequence of allele match required for complex events.
                ### Example: if the variant is AC>T, and if a read has deletion of C but the
                ### nt at anchor position is A, it will go into other_reads
                if pileupread.indel == 0 and seq[pos_in_read:pos_in_read + len(ref)] == ref:
                    ref_reads.append(read_name)
                elif pileupread.indel == len(alt) - len(ref) and seq[pos_in_read:pos_in_read + len(alt)] == alt:
                    alt_reads.append(read_name)
                else:
                    other_reads.append(read_name)
            else:
                #### Variant type is "INDEL" ####
                ############################# PLEASE NOTE #######################################
                ### Variant calling for indels: For insertion, check whether the length of the
                ### insertion and sequence matches alt allele. If there is no indel at the anchor
                ### position (even if the base at the anchor position doesn't match), we consider
                ### the read as adding support to the reference. For deletions, if the length of
                ### deletion matches alt allele, we consider that read supporting the alt allele,
                ### and if there is no deletion at that position (even if there are mismatches in
                ### the bases spanning the deletion), it's considered to support reference allele.
                ### Examples for insertions:
                ### Let's say that the variant is chr1:12345 A > AT
                ### Scenario1: The read has a C at chr1:12345 along with insertion of T
                ###            This read will be used to add support to the alternate allele
                ### Scenario2: Read has a mismatch (let's say 'C') at chr1:12345, but no indel
                ###            This read will be used to add support to the reference allele
                ### Scenario3: Read has a different insertion, let's say 'G' instead of 'T'
                ###            This read will go into the other_reads category.
                ### Example for deletions:
                ### Let's say the variant is chr1:12345 AT > A
                ### Scenario1: The read has a C at chr1:12345 along with deletion of T
                ###            It will be used to add support to the alt allele.
                ### Scenario2: The read has a A at chr1:12345 followed by a 2nt deletion
                ###            This read will go into the other_reads_category
                ###################################################################################
                if len(ref)==1:
                    # Variant is an insertion
                    if pileupread.indel == 0:
                        ref_reads.append(read_name)
                    elif pileupread.indel == len(alt) - len(ref) and seq[pos_in_read+1:pos_in_read + len(alt)] == alt[1:]:
                        alt_reads.append(read_name)
                        if seq[pos_in_read] != alt[0]:
                            anchor_mismatch+=1
                    else:
                        other_reads.append(read_name)
                else:
                    # Variant is a deletion (len(ref)>1 and len(alt)==1)
                    if pileupread.indel == 0:
                        ref_reads.append(read_name)
                    elif pileupread.indel == len(alt) - len(ref):
                        alt_reads.append(read_name)
                        if seq[pos_in_read] != alt[0]:
                            anchor_mismatch+=1
                    else:
                        other_reads.append(read_name)

    ## If there are more than 2 reads that support the alternate allele of an indel variant, but the anchor base does not match, we report that as a PossiblyComplex event ##
    ## Similarly, if there are more than 2 reads that support alt allele of an SNV, but have an indel immediately following the SNV variant, we report that as a PossiblyComplex event ##
    if anchor_mismatch > 2:
        possible_complex=True
    # check sets to make sure reads don't show up in multiple sets
    # supporting multiple calls
    set_ref_raw = set(ref_reads)
    set_alt_raw = set(alt_reads)
    set_other_raw = set(other_reads)
    ref_reads_set = set_ref_raw - set_alt_raw - set_other_raw
    alt_reads_set = set_alt_raw - set_ref_raw - set_other_raw
    other_reads_set = set_other_raw - set_ref_raw - set_alt_raw
    all_reads_set = alt_reads_set|ref_reads_set|other_reads_set
    # make read-type sets
    f1r2_reads_set = set(f1r2_reads)
    f2r1_reads_set = set(f2r1_reads)
    fwd_reads_set = set(fwd_reads)
    rev_reads_set = set(rev_reads)
    properly_paired_reads_set = set(properly_paired_reads)
    # tally set in ref and alt, all reads
    ref_count = len(ref_reads_set)
    alt_count = len(alt_reads_set)
    dp = len(all_reads_set)
    # get VAF
    vaf = compute_vaf(alt_count, dp)

    def ref_alt_pair(read_set):
        # "ref,alt" counts of the reads in read_set
        return '{0},{1}'.format(len(ref_reads_set & read_set), len(alt_reads_set & read_set))

    # tally properly paired / not properly paired sets ref/alt
    proper_paired_ref = len(ref_reads_set & properly_paired_reads_set)
    proper_paired_alt = len(alt_reads_set & properly_paired_reads_set)
    not_proper_paired_ref = ref_count - proper_paired_ref
    not_proper_paired_alt = alt_count - proper_paired_alt
    return ('{0},{1}'.format(ref_count, alt_count),
            str(dp), str(vaf),
            ref_alt_pair(f1r2_reads_set),
            ref_alt_pair(f2r1_reads_set),
            ref_alt_pair(fwd_reads_set),
            ref_alt_pair(rev_reads_set),
            '{0},{1}'.format(proper_paired_ref, proper_paired_alt),
            '{0},{1}'.format(not_proper_paired_ref, not_proper_paired_alt),possible_complex)


def __main__():
    '''
    Add pileup-based allele counts for the normal (column 10) and tumor
    (column 11) samples to every variant of the input VCF.
    '''
    # pysam is only needed here, to open the BAM files
    import pysam

    parser = ArgumentParser(prog='add_nygc_allele_counts',
                            description='Runs pileup on tumor and normal bam files to compute allele counts for bi-allelic SNV and Indel variants in VCF file and adds pileup format columns to the VCF file.', epilog='',
                            formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=100, width=150))
    parser.add_argument('-t', '--tumor_bam', help = 'Tumor BAM file.', required=True)
    parser.add_argument('-n', '--normal_bam', help = 'Normal BAM file.', required=True)
    parser.add_argument('-v', '--vcf', help = 'SNV VCF file.', required=True)
    parser.add_argument('-o', '--output', help = 'Output VCF file.', required=True)
    parser.add_argument('-b', '--min_base_quality', help='Minimum base quality', default=10, type=int)
    parser.add_argument('-m', '--min_mapping_quality', help='Minimum mapping quality',
                        default=10, type=int)
    parser.add_argument('-i', '--max_indel_len_for_count',
                        help='Maximum indel or delin (complex event) length for generating counts',
                        default=10, type=int)
    args=parser.parse_args()
    # name variables
    TBAM=args.tumor_bam
    NBAM=args.normal_bam
    VCF=args.vcf
    OUT=args.output
    MIN_BQ=args.min_base_quality
    MIN_MQ=args.min_mapping_quality
    MAX_INDEL_LEN=args.max_indel_len_for_count
    # test files
    check_if_exists(TBAM)
    check_if_exists(NBAM)
    check_if_exists(VCF)

    new_ids=["nygc_AD", "nygc_DP", "nygc_AF", "nygc_F1R2", "nygc_F2R1", "nygc_FWD", "nygc_REV",
             "nygc_PROPER_PAIRED", "nygc_NOT_PROPER_PAIRED"]
    seen=0
    # Context managers close every handle even when a record fails
    with pysam.AlignmentFile(TBAM, "rb") as TumorSamFile, \
            pysam.AlignmentFile(NBAM, "rb") as NormalSamFile, \
            open(VCF) as f, open(OUT,"w") as o:
        for line in f:
            if line.startswith("#"):
                if line.startswith("##FORMAT") and seen==0:
                    # New header definitions go right before the first ##FORMAT line.
                    # NOTE(review): these literals carry no <ID=...> payload and look
                    # truncated -- confirm against the upstream script.
                    o.write('##INFO=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    o.write('##FORMAT=\n')
                    seen=1
                o.write(line)
                continue
            toks=line.strip().split("\t")
            chrom = toks[0]
            pos = int(toks[1])
            ref = toks[3]
            alt = toks[4]
            variant_type=infer_variant_type(ref,alt)

            if is_too_long(ref, alt, variant_type, MAX_INDEL_LEN):
                # too long for the pileup method: pass the record through unchanged
                o.write("\t".join(toks)+"\n")
                continue
            toks[8] = toks[8]+":"+":".join(new_ids)
            # Use the thresholds given on the command line (previously the
            # literal 10 was passed and -b/-m were ignored)
            normal_counts = read_pileup_return_count(NormalSamFile, chrom, pos,
                                                     ref, alt, variant_type,
                                                     MIN_MQ=MIN_MQ, MIN_BQ=MIN_BQ)
            toks[9]=toks[9]+":"+":".join(normal_counts[:9])
            tumor_counts = read_pileup_return_count(TumorSamFile, chrom, pos,
                                                    ref, alt, variant_type,
                                                    MIN_MQ=MIN_MQ, MIN_BQ=MIN_BQ)
            toks[10]=toks[10]+":"+":".join(tumor_counts[:9])
            # As before, only the tumor pileup decides the PossiblyComplex flag
            possibly_complex = tumor_counts[9]
            if possibly_complex is True:
                toks[7]=toks[7]+";PossiblyComplex"
            o.write("\t".join(toks)+"\n")

if __name__ == "__main__":
    __main__()
## Read a tab-delimited CNV file into a GRanges object
## (lines starting with '#' are treated as comments and skipped)
readCNV = function(f) {

  cnv.df = read.csv(f, header=FALSE, stringsAsFactors=FALSE, sep='\t', comment.char='#')
  colnames(cnv.df)[1:3] = c('chr','start','end')

  return(makeGRangesFromDataFrame(cnv.df))

}



## Label each breakend with its nearest CNV interval ("chr:start-end"),
## or with a blank when it has no nearest interval
annotateWithClosestChangepoint = function(sv, cnv) {

  ## One label per CNV interval
  cnv.labels = paste0(as.character(seqnames(cnv)), ':', start(cnv), '-', end(cnv))

  ## Index of the nearest CNV interval per breakend (NA = no nearest neighbor)
  hit = GenomicRanges::nearest(sv, cnv)
  has.hit = !is.na(hit)

  ## Start from blanks so breakends without a neighbor stay empty
  sv$cnv = ''
  sv$cnv[has.hit] = cnv.labels[hit[has.hit]]

  return(sv)

}
vcf$cnv[partner.idx]) ## changepoint1 , changpoint2 + ## Add to result, keep track of processed breakends + res = rbind(res, res.i) + processed = c(processed, bnd, partner) + } + + + ## Add colnames and fill in simple event classifications + colnames(res) = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type', + 'score', 'strand1', 'strand2', 'evidence', 'tools', 'tumor--normal', + 'info', 'cnv_changepoint_1', 'cnv_changepoint_2') + res = as.data.frame(res, stringsAsFactors=F) + + + ## Fix coordinates (have to subtract when starting from a bedpe) + res$start1 = as.numeric(res$start1) - 1 + res$start2 = as.numeric(res$start2) - 1 + + + colnames(res)[1] = paste0('#', colnames(res)[1]) + + return(res) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-b", "--bedpe"), type='character', help="Input BEDPE"), + make_option(c("-c", "--cnv"), type='character', help="BED file containing CNV intervals"), + make_option(c("-o", "--out_file"), type='character', help="Output BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + + + +## Read bedpe +sv = readBEDPE(opt$bedpe) + +## Read CNV +cnv = readCNV(opt$cnv) + +## Annotate breakpoints with closest changepoint +sv = annotateWithClosestChangepoint(sv=sv, cnv=cnv) + +## Convert to bedpe +res = vcfToBedpe(sv) + +## Write result +write.table(res, opt$out_file, row.names=F, col.names=T, sep='\t', quote=F) diff --git a/bin/pta/annotate-bedpe-with-databases.r b/bin/pta/annotate-bedpe-with-databases.r new file mode 100644 index 00000000..385d21e1 --- /dev/null +++ b/bin/pta/annotate-bedpe-with-databases.r @@ -0,0 +1,254 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Annotate a merged bedpe with arbitrary databases +libs = c('optparse', 'StructuralVariantAnnotation', 'VariantAnnotation', 
invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T)))
options(width=200, scipen=999)



## Handle non-standard bedpe columns better
readBEDPE = function(f) {

  ## Read file as Pairs object
  x = rtracklayer::import(f, format='bedpe')

  ## Update metadata column names: metadata columns 3..n take the names of
  ## file columns 11..n from the header line
  x.mcol.names = colnames(read.csv(f, h=T, stringsAsFactors=F, sep='\t', check.names=F))
  colnames(mcols(x))[3:ncol(mcols(x))] = x.mcol.names[11:length(x.mcol.names)]
  mcols(x)$type = mcols(x)$name


  ## Convert to breakpoint ranges
  x = StructuralVariantAnnotation::pairs2breakpointgr(x)

  return(x)

}



## Read a database file, chosen by extension: BED and BEDPE (headerless) or VCF,
## each optionally gzipped. Stops with a clear message for any other extension.
readDB = function(f) {

  is.bed = grepl('\\.bed\\.gz$|\\.bed$', f)
  is.bedpe = grepl('\\.bedpe\\.gz$|\\.bedpe$', f)
  is.vcf = grepl('\\.vcf\\.gz$|\\.vcf$', f)

  ## Fail early instead of hitting "object 'x' not found" at return(x)
  if (!(is.bed || is.bedpe || is.vcf)) {
    stop('Unsupported database file type (expected .bed, .bedpe or .vcf, optionally .gz): ', f)
  }

  if (is.bed || is.bedpe) {
    x = rtracklayer::import(f)

    ## If this is a BEDPE, convert to a breakpointRanges object
    ## The package seems to misname SV type so update that
    if (is.bedpe) {
      x = StructuralVariantAnnotation::pairs2breakpointgr(x)
      x$type = x$sourceId
    }

  }

  if (is.vcf) {

    x = VariantAnnotation::readVcf(f)
    x = StructuralVariantAnnotation::breakpointRanges(x, nominalPosition=T)

    ## Prefer the svtype column when present
    if ('svtype' %in% colnames(mcols(x))) {
      x$type = x$svtype
    } else {
      x$type = x$sourceId
    }

  }

  return(x)

}
## Annotate breakpointRanges object with breakpointRanges or GRanges
##   x:             breakpointRanges to annotate; hits are appended to x$db
##   db:            database as breakpointRanges (has a 'partner' column) or GRanges
##   name:          database name appended (followed by ',') to x$db for every hit
##   slop:          maxgap used when comparing breakpoints
##   ignore.strand: passed through to findBreakpointOverlaps
annotateDB = function(x, db, name, slop, ignore.strand=F) {


  ## Use different overlap method depending on whether DB is BED or BEDPE
  if ('partner' %in% colnames(mcols(db))) {
    overlaps = StructuralVariantAnnotation::findBreakpointOverlaps(query=x,
                                                                   subject=db,
                                                                   maxgap=slop,
                                                                   sizemargin=0.8,
                                                                   restrictMarginToSizeMultiple=0.8,
                                                                   ignore.strand=ignore.strand)
    overlaps = queryHits(overlaps)

  } else {
    overlaps = pairInBed(query=x, subject=db)
  }

  ## Annotate with hits if there are any.
  ## Read the existing annotation from the argument x, not the global `dta`,
  ## so the function does not depend on the caller's variable name.
  x$db[overlaps] = paste0(x$db[overlaps],name,',')

  return(x)

}
strand[partner.idx], ## type, score, strand1, strand2 + vcf$evidence[i], vcf$tools[i], vcf$`tumor--normal`[i], dbstr) ## evidence, tools, TN, info + + ## Add to result, keep track of processed breakends + res = rbind(res, res.i) + processed = c(processed, bnd, partner) + } + + + ## Add colnames and fill in simple event classifications + colnames(res) = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type', 'score', 'strand1', 'strand2', 'evidence', 'tools', 'tumor--normal','info') + res = as.data.frame(res, stringsAsFactors=F) + + + ## Fix coordinates (have to subtract when starting from a bedpe) + res$start1 = as.numeric(res$start1) - 1 + res$start2 = as.numeric(res$start2) - 1 + + + colnames(res)[1] = paste0('#', colnames(res)[1]) + + return(res) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-b", "--bedpe"), type='character', help="Input BEDPE"), + make_option(c("-n", "--db_names"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"), + make_option(c("-f", "--db_files"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"), + make_option(c("-i", "--db_ignore_strand"), type='character', help="Comma-delimited list of database names to ignore strand orientation for when overlapping? 
Should be present in --db_names"), + make_option(c("-s", "--slop"), type='numeric', help="Padding to use when comparing breakpoints"), + make_option(c("-o", "--out_file"), type='character', help="Output BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + + +## Unpack arguments +opt$db_names = unlist(strsplit(opt$db_names, ',', fixed=T)) +opt$db_files = unlist(strsplit(opt$db_files, ',', fixed=T)) + +if (!is.null(opt$db_ignore_strand)) { + opt$db_ignore_strand = unlist(strsplit(opt$db_ignore_strand, ',', fixed=T)) +} + + + +## Sanity-check ignore-strand option +if (!is.null(opt$db_ignore_strand) && !all(opt$db_ignore_strand %in% opt$db_names)) { + missing = paste(setdiff(opt$db_ignore_strand, opt$db_names), collapse=',') + stop('Databases present in --db_ignore_strand not present in --db_names: ', missing) +} + + +## Read bedpe +dta = readBEDPE(opt$bedpe) +dta$db = '' + + +## Annotate with databases +for (i in 1:length(opt$db_names)) { + + db.name = opt$db_names[i] + db.file = opt$db_files[i] + is = !is.null(opt$db_ignore_strand) && db.name %in% opt$db_ignore_strand + + db = readDB(db.file) + dta = annotateDB(x=dta, + db=db, + name=db.name, + slop=opt$slop, + ignore.strand=is) + +} + + +## Convert to bedpe +res = vcfToBedpe(dta) + +## Write result +write.table(res, opt$out_file, row.names=F, col.names=T, sep='\t', quote=F) diff --git a/bin/pta/annotate-bedpe-with-genes.r b/bin/pta/annotate-bedpe-with-genes.r new file mode 100644 index 00000000..652ed67d --- /dev/null +++ b/bin/pta/annotate-bedpe-with-genes.r @@ -0,0 +1,344 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Annotate a merged & annotated BEDPE with gene information +libs = c('optparse', 'StructuralVariantAnnotation', 'VariantAnnotation', 'rtracklayer', 'gUtils') 
invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=TRUE, quietly=TRUE)))
options(width=200, scipen=999)


## Breakends farther than this (in bp) from any gene get no "closest" annotation
CLOSEST_MAX_DISTANCE = 2e4



## Read a BEDPE file into a breakpoint GRanges object.
##
## f: path to a tab-delimited BEDPE file with a header line.
##
## rtracklayer::import() does not keep the header names of non-standard
## columns, so they are restored from the header line before conversion.
## NOTE(review): assumes the file has at least one column beyond the ten
## standard BEDPE fields -- confirm against the upstream merge step.
readBEDPE = function(f) {

  ## Read file as Pairs object
  x = rtracklayer::import(f, format='bedpe')

  ## Update metadata column names: metadata columns from the third onwards
  ## are renamed to header fields 11 onwards
  x.mcol.names = colnames(read.csv(f, header=TRUE, stringsAsFactors=FALSE, sep='\t', check.names=FALSE))
  colnames(mcols(x))[3:ncol(mcols(x))] = x.mcol.names[11:length(x.mcol.names)]
  mcols(x)$type = mcols(x)$name

  ## Convert to breakpoint ranges
  x = StructuralVariantAnnotation::pairs2breakpointgr(x)

  return(x)

}



## Read Ensembl gene info into a GRanges object.
##
## f: path to a headerless, tab-delimited file with 13 columns (see the
##    column names assigned below).
##
## Returns a GRanges of genes carrying strand, name, exon and intron columns;
## the 'na*' placeholder columns are dropped.
readEnsembl = function(f) {

  ## Read and get rid of unnecessary columns
  x = read.csv(f, header=FALSE, stringsAsFactors=FALSE, sep='\t')
  colnames(x) = c('gene_chr', 'gene_start', 'gene_end', 'strand', 'name', 'na1', 'na2', 'exons', 'exon_starts', 'exon_ends', 'na3', 'intron_starts', 'intron_ends')
  x = x[, !grepl('na[0-9]$', colnames(x))]

  x = GenomicRanges::makeGRangesFromDataFrame(x,
                                              keep.extra.columns=TRUE,
                                              seqnames.field='gene_chr',
                                              start.field='gene_start',
                                              end.field='gene_end')

  ## Strip the trailing comma from the comma-delimited intron coordinate lists
  x$intron_starts = gsub(',$', '', x$intron_starts)
  x$intron_ends = gsub(',$', '', x$intron_ends)

  return(x)

}



## Check if breakpoints fall within genes.
##
## Adds a 'disrupt' column to sv holding the names of the genes each breakend
## overlaps, as aggregated by gUtils::gr.val(); spaces are removed from the
## aggregated string.
annotateWithDisruptions = function(sv, genes) {

  genes$disrupt = genes$name
  sv = gr.val(query=sv, target=genes, val='disrupt')
  sv$disrupt = gsub(' ', '', sv$disrupt, fixed=TRUE)

  return(sv)

}



## Read the cancer gene census list into a GRanges object.
##
## f: path to a tab-delimited file with a header line and five columns, which
##    are renamed to chrom, start, end, name, locus. The 'locus' column is
##    dropped.
readCancerCensus = function(f) {

  x = read.csv(f, header=TRUE, stringsAsFactors=FALSE, sep='\t')
  colnames(x) = c('chrom', 'start', 'end', 'name', 'locus')

  x = x[, !colnames(x) %in% 'locus']

  x = GenomicRanges::makeGRangesFromDataFrame(x,
                                              keep.extra.columns=TRUE,
                                              seqnames.field='chrom',
                                              start.field='start',
                                              end.field='end')

  return(x)

}



## Check if both breakends of an SV fall within the same intron.
##
## sv:    breakpoint GRanges that already carries the 'disrupt' column.
## genes: GRanges from readEnsembl().
##
## Adds an 'intronic' column: comma-delimited names of genes with an intron
## that is hit by both breakends of the pair, '' otherwise.
##
## The original signature had self-referential defaults (sv=sv, genes=genes),
## which can never be evaluated ("recursive default argument reference"), so
## both arguments are simply required.
annotateWithIntronic = function(sv, genes) {

  sv$intronic = ''

  ## Subset gene list to those that have introns and are already known to overlap with breakpoints
  genes = genes[genes$intron_starts != '-' & genes$name %in% unique(unlist(strsplit(sv$disrupt, ',')))]

  ## Nothing to test against: without this guard Reduce() below yields NULL
  ## and findOverlaps() fails
  if (length(genes) == 0) {
    return(sv)
  }

  ## Expand introns to GRanges object (keeping track of what genes they belong to)
  introns = base::mapply(function(x, y, z, n) GRanges(x, IRanges(as.numeric(y), as.numeric(z), names=rep(n, length(y)))),
                         x=as.character(seqnames(genes)),
                         y=strsplit(genes$intron_starts, ',', fixed=TRUE),
                         z=strsplit(genes$intron_ends, ',', fixed=TRUE),
                         n=genes$name,
                         SIMPLIFY=FALSE)

  introns = Reduce(c, introns)


  ## Do SV breakends overlap any introns?
  processed = c()
  for (i in seq_along(sv)) {

    ## Check if we've already processed the full breakpoint
    if (i %in% processed) {
      next
    }

    partner.idx = which(names(sv) == sv$partner[i])

    ## Check which introns these breakpoints overlap
    intron.hits = findOverlaps(query=c(sv[c(i, partner.idx)]), subject=introns)

    ## Do we have any duplicated intron hits? If so, these breakends fall within the same intron
    hit.idx = subjectHits(intron.hits)
    gene.names = names(introns)[hit.idx[duplicated(hit.idx)]]
    sv$intronic[c(i, partner.idx)] = paste(gene.names, collapse=',')

    processed = c(processed, i, partner.idx)

  }

  return(sv)

}



## Find the nearest gene to each breakend.
##
## sv:                   breakpoint GRanges carrying 'disrupt' and 'intronic'.
## genes:                GRanges from readEnsembl().
## closest.max.distance: hits farther away than this are ignored.
##
## Adds a 'closest' column holding the name of the nearest gene that is not
## already listed in the breakend's disrupt/intronic annotation ('' if none
## is within the distance cutoff).
annotateWithClosest = function(sv, genes, closest.max.distance=CLOSEST_MAX_DISTANCE) {

  sv$closest = ''

  ## For each breakend
  for (i in seq_along(sv)) {

    ## Find the nearest non-overlapping gene, i.e.
    ## exclude any disrupted/intronic gene(s) from the comparisons
    disrupt = unlist(strsplit(sv$disrupt[i], ','))
    intronic = unlist(strsplit(sv$intronic[i], ','))
    exclude = c(disrupt, intronic)

    ## Logical negation instead of genes[-which(...)]: a negative index built
    ## from an empty which() result would silently drop every gene
    genes.i = genes[!(genes$name %in% exclude)]


    ## Add closest gene, subject to distance cutoff (default 20kb)
    closest = GenomicRanges::distanceToNearest(x=sv[i], subject=genes.i, ignore.strand=TRUE)
    closest = closest[mcols(closest)$distance <= closest.max.distance]

    if (length(closest) > 0) {
      sv$closest[i] = genes.i$name[subjectHits(closest)]
    }

  }

  return(sv)

}



## Annotate each SV with the genes lying between (or overlapping) its breakends.
##
## sv:                    breakpoint GRanges.
## genes:                 GRanges with a 'name' column.
## sv.colname:            metadata column of sv that receives the result.
## allow.partial.overlap: if FALSE, only genes fully inside the interval
##                        spanned by an intrachromosomal pair are reported;
##                        if TRUE, genes overlapping that interval (or, for
##                        translocations, either breakend) are reported.
##
## The comma-delimited gene list is written to the first breakend of each
## pair only; the partner keeps ''. Breakends without exactly one partner are
## skipped and keep '' (previously they aborted the whole run).
annotateWithContained = function(sv, genes, sv.colname='contains', allow.partial.overlap=F) {

  mcols(sv)[, sv.colname] = ''

  ## For each breakend
  processed = c()
  for (i in seq_along(sv)) {

    ## Check if we've already processed the full breakpoint
    if (i %in% processed) {
      next
    }

    partner.idx = which(names(sv) == sv$partner[i])

    ## Without exactly one partner there is no interval to test
    if (length(partner.idx) != 1) {
      warning('Missing partner for breakend ', names(sv)[i])
      next
    }

    is.translocation = as.character(seqnames(sv[i])) != as.character(seqnames(sv[partner.idx]))
    contains = ''

    ## Only check contained genes if this is intrachromosomal
    if (!is.translocation) {

      ## If allow.partial.overlap=T, allow partially overlapping intervals, otherwise, just a
      ## simple coordinate check to see if intervals are fully contained
      if (allow.partial.overlap) {

        bkpt = GRanges(as.character(seqnames(sv))[i], IRanges(start(sv)[i], end(sv)[partner.idx]))
        contains = genes$name[genes %^% bkpt]

      } else {

        contains = genes$name[start(sv)[i] <= start(genes) &
                              end(genes) <= end(sv)[partner.idx] &
                              as.character(seqnames(genes)) == as.character(seqnames(sv[i]))]

      }

    } else if (allow.partial.overlap) {

      ## Translocation: there is no spanning interval, so report genes hit by either breakend
      contains = genes$name[genes %^% sv[c(i, partner.idx)]]

    }

    mcols(sv)[i, sv.colname] = paste(contains, collapse=',')
    processed = c(processed, i, partner.idx)

  }

  return(sv)

}



## Convert breakpointRanges to BEDPE.
##
## vcf:          annotated breakpoint GRanges (needs partner, type, evidence,
##               tools, tumor--normal, info, disrupt, closest, intronic,
##               contains and cgc columns).
## supplemental: if TRUE, the 'Contained=' field is added to the info string.
##
## Returns a data frame with one row per breakend pair; start1/start2 are
## shifted by -1. If no pair could be emitted, a zero-row data frame with the
## same columns is returned.
vcfToBedpe = function(vcf, supplemental) {

  sqn = as.character(seqnames(vcf))
  strand = as.character(strand(vcf))
  rows = list()
  processed = c()

  for (i in seq_along(vcf)) {
    bnd = names(vcf)[i]
    partner = vcf$partner[i]
    partner.idx = which(names(vcf) == partner)

    ## If we don't have exactly one partner, exclude this variant
    if (length(partner.idx) != 1) {
      warning('Missing partner for breakend ', bnd)
      next
    }

    ## Check to see if we've already processed this or its partner
    if (any(c(bnd, partner) %in% processed)) {
      next
    }


    ## Add disrupt/closest/intronic to info field
    disrupt.l = paste0('DisruptL=', vcf$disrupt[i])
    disrupt.r = paste0('DisruptR=', vcf$disrupt[partner.idx])
    closest.l = paste0('ClosestL=', vcf$closest[i])
    closest.r = paste0('ClosestR=', vcf$closest[partner.idx])
    intronic = paste0('Intronic=', vcf$intronic[i])
    contained = paste0('Contained=', vcf$contains[i])

    if (supplemental) {
      gene.str = paste(disrupt.l, disrupt.r, closest.l, closest.r, intronic, contained, sep=';')
    } else {
      gene.str = paste(disrupt.l, disrupt.r, closest.l, closest.r, intronic, sep=';')
    }


    ## Add cancer census genes only if they exist
    if (vcf$cgc[i] != '' || vcf$cgc[partner.idx] !='') {
      cgc.str = paste0('Cancer_census=', paste(setdiff(vcf$cgc[c(i,partner.idx)],''),collapse=','))
      gene.str = paste(gene.str, cgc.str, sep=';')
    }

    vcf$info[i] = paste0(vcf$info[i], gene.str)


    ## Combine breakends in single line
    res.i = c(sqn[i], start(vcf)[i], end(vcf)[i],                                   ## chr1, start1, end1
              sqn[partner.idx], start(vcf)[partner.idx], end(vcf)[partner.idx],     ## chr2, start2, end2
              vcf$type[i], '.', strand[i], strand[partner.idx],                     ## type, score, strand1, strand2
              vcf$evidence[i], vcf$tools[i], vcf$`tumor--normal`[i], vcf$info[i])   ## evidence, tools, TN, info

    ## Add to result, keep track of processed breakends
    rows[[length(rows) + 1]] = res.i
    processed = c(processed, bnd, partner)
  }


  ## Bind all rows at once and add colnames; fall back to an empty matrix so
  ## the column assignment below still works when nothing was emitted
  col.names = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type',
                'score', 'strand1', 'strand2', 'evidence', 'tools', 'tumor--normal',
                'info')

  if (length(rows) > 0) {
    res = do.call(rbind, rows)
  } else {
    res = matrix(character(0), nrow=0, ncol=length(col.names))
  }

  colnames(res) = col.names
  res = as.data.frame(res, stringsAsFactors=FALSE)

  ## Fix coordinates (have to subtract when starting from a bedpe)
  res$start1 = as.numeric(res$start1) - 1
  res$start2 = as.numeric(res$start2) - 1


  colnames(res)[1] = paste0('#', colnames(res)[1])

  return(res)

}



## Collect arguments
option_list = list(
  make_option(c("-b", "--bedpe"),         type='character', help="Input BEDPE"),
  make_option(c("-e", "--ensembl"),       type='character', help="Ensembl gene list"),
  make_option(c("-c", "--cancer_census"), type='character', help="Cancer census gene list"),
  make_option(c("-s", "--supplemental"),  action='store_true', default=F, help="Add supplementary gene annotations?"),
  make_option(c("-o", "--out_file"),      type='character', help="Output BEDPE"))
opt = parse_args(OptionParser(option_list=option_list))


## Read bedpe
sv = readBEDPE(opt$bedpe)

## Read gene lists
genes = readEnsembl(opt$ensembl)
cgc = readCancerCensus(opt$cancer_census)

## Add contained ensembl and cgc genes
sv = annotateWithContained(sv=sv, genes=genes, allow.partial.overlap=F)
sv = annotateWithContained(sv=sv, genes=cgc, sv.colname='cgc', allow.partial.overlap=T)

## Add disrupted genes
sv = annotateWithDisruptions(sv=sv, genes=genes)

## Which breakpoints fall within introns?
+sv = annotateWithIntronic(sv=sv, genes=genes) + +## Add closest (non-contained) genes +sv = annotateWithClosest(sv=sv, genes=genes) + +## Convert to bedpe +res = vcfToBedpe(sv, supplemental=opt$supplemental) + +## Write result +write.table(res, opt$out_file, row.names=F, col.names=T, sep='\t', quote=F) diff --git a/bin/pta/annotate-cnv.r b/bin/pta/annotate-cnv.r new file mode 100644 index 00000000..814847a4 --- /dev/null +++ b/bin/pta/annotate-cnv.r @@ -0,0 +1,331 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Annotate a merged bedpe with arbitrary databases +libs = c('optparse', 'gUtils', 'GenomicRanges', 'rtracklayer') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + +## TODO: Move to config? +CLOSEST_MAX_DISTANCE = 2e4 ## For intergenic CNVs, ignore nearest() hits farther than this +LARGESCALE_MIN = 3e6 ## Any events smaller than this are considered focal +DUP_LOG2 = 0.2 ## log2 ratio cutoff for considering an event a duplication +DEL_LOG2 = -0.235 ## log2 ratio cutoff for considering an event a deletion + +## Read BIC-Seq2 output into a GRanges object +## Optionally subset to CNVs in chr +readCNV = function(f, chr=NULL) { + + x = read.csv(f, h=T, stringsAsFactors=F, sep='\t') + colnames(x)[colnames(x) == 'log2.copyRatio'] = 'log2' + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='chrom', + start.field='start', + end.field='end') + + + if (!is.null(chr)) { + x = x[as.character(seqnames(x)) %in% chr] + } + + return(x) + +} + +## Read cytoband into a GRanges object +readCytoband = function(f) { + + x = read.csv(f, h=F, stringsAsFactors=F, sep='\t') + colnames(x) = c('chrom', 'start', 'end', 'cytoband', 'stain') + + x = x[, !colnames(x) 
%in% 'stain'] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='chrom', + start.field='start', + end.field='end') + + return(x) + +} + +readDB = function(f) { + + x <- import(f, format = 'BED') + + return(x) + +} + +readCancerCensus = function(f) { + + x = read.csv(f, h=T, stringsAsFactors=F, sep='\t') + colnames(x) = c('chrom', 'start', 'end', 'cgc', 'locus') + + # x$cgc = gsub('\\|.*$', '', x$cgc) + x = x[, !colnames(x) %in% 'locus'] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='chrom', + start.field='start', + end.field='end') + + return(x) + +} + +## Read Ensembl +readEnsembl = function(f) { + + ## Read and get rid of unnecessary columns + x = read.csv(f, h=F, stringsAsFactors=F, sep='\t') + colnames(x) = c('gene_chr', 'gene_start', 'gene_end', 'strand', 'name', 'na1', 'na2', 'exons', 'exon_starts', 'exon_ends', 'na3', 'intron_starts', 'intron_ends') + x = x[, !grepl('na[0-9]$', colnames(x))] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='gene_chr', + start.field='gene_start', + end.field='gene_end') + + x$intron_starts = gsub(',$','',x$intron_starts) + x$intron_ends = gsub(',$','',x$intron_ends) + + return(x) + +} + +## Simplify comma-delimited cytoband list to only the first and last cytobands +.simplifyCytoband = function(x, delim=', ', collapse='-') { + + x = unlist(strsplit(x, delim, fixed=T)) + + if (length(x) > 1) { + x = paste0(x[1], collapse, x[length(x)]) + } + + return(x) + +} + +## Annotate with cytoband +annotateCytoband = function(cnv, cytoband) { + + ## Pull in cytoband info + cnv = cnv %$% cytoband + + ## Simplify comma-delimited representation to hyphenated if necessary + cnv$cytoband = sapply(cnv$cytoband, .simplifyCytoband) + + ## Add chromosome information + cnv$cytoband = paste0(as.character(seqnames(cnv)), cnv$cytoband) + + return(cnv) + +} + +## Annotate with databases, subject to reciprocal overlap 
criteria +annotateDB = function(x, db, name, overlap) { + + ## Find hits + hits = GenomicRanges::findOverlaps(query=x, subject=db) + + ## Compute overlap + mcols(hits)$intersection = width(pintersect(x[queryHits(hits)], db[subjectHits(hits)])) + mcols(hits)$overlap_query = mcols(hits)$intersection / width(x[queryHits(hits)]) + mcols(hits)$overlap_subject = mcols(hits)$intersection / width(db[subjectHits(hits)]) + + ## Hits should meet the minimum reciprocal overlap cutoff + ## To match bedtools::intersect's implementation of reciprocal oerlap, + ## The fraction overlap should be at least the same in each direction + hits = hits[mcols(hits)$overlap_query >= overlap & mcols(hits)$overlap_subject >= overlap] + + ## Annotate any hits we get + x$db[queryHits(hits)] = paste0(x$db[queryHits(hits)], ',', name) + x$db = gsub('^,', '', x$db) + + return(x) + +} + +## Compare GRanges x to GRanges gene mcols intron_start, intron_end +.isIntronic = function(x, gene) { + + if (gene$intron_starts == '-') { + + is.intronic = F + + } else { + + introns = GRanges(as.character(seqnames(gene)), + IRanges(as.numeric(unlist(strsplit(gene$intron_starts, ',', fixed=T))), + as.numeric(unlist(strsplit(gene$intron_ends, ',', fixed=T))))) + + is.intronic = any(start(introns) <= start(x) && end(x) <= end(introns)) + + } + + return(is.intronic) + +} + +## Annotate with ensembl genes +annotateEnsembl = function(x, ens, closest.max.distance=CLOSEST_MAX_DISTANCE) { + + ## Init empty columns + mcols(x)[, c('disrupt.l', 'disrupt.r', 'contains', 'intronic', 'intergenic', 'closest')] = '' + + ## Find hits + hits = GenomicRanges::findOverlaps(query=x, subject=ens) + + ## Check contains, disruption on each side, intronic + x.start = start(x[queryHits(hits)]) + x.end = end(x[queryHits(hits)]) + gene.start = start(ens[subjectHits(hits)]) + gene.end = end(ens[subjectHits(hits)]) + + contains = x.start <= gene.start & gene.end <= x.end + disrupt.r = gene.start <= x.end & x.end <= gene.end + disrupt.l = 
gene.start <= x.start & x.start <= gene.end + intronic = F + + + ## We only need to check CNVs in introns if they don't contain their hit + ## and intersect the gene on both CNV ends + for (i in which(!contains & disrupt.r & disrupt.l)) { + + message('Checking potential intronic variant...') + intronic[i] = .isIntronic(x=x[queryHits(hits[i])], gene=ens[subjectHits(hits[i])]) + + } + + ## Concatenate genes and store + ## The tapply() aggregates genes by query index (i.e. x index), which we use to map back to x + contains = tapply(ens[subjectHits(hits)]$name[contains], queryHits(hits)[contains], paste, collapse=',') + x$contains[as.numeric(names(contains))] = contains + + disrupt.r = tapply(ens[subjectHits(hits)]$name[disrupt.r], queryHits(hits)[disrupt.r], paste, collapse=',') + x$disrupt.r[as.numeric(names(disrupt.r))] = disrupt.r + + disrupt.l = tapply(ens[subjectHits(hits)]$name[disrupt.l], queryHits(hits)[disrupt.l], paste, collapse=',') + x$disrupt.l[as.numeric(names(disrupt.l))] = disrupt.l + + intronic = tapply(ens[subjectHits(hits)]$name[intronic], queryHits(hits)[intronic], paste, collapse=',') + x$intronic[as.numeric(names(intronic))] = intronic + + + ## Add intergenic and closest gene + x$intergenic[setdiff(1:length(x), queryHits(hits))] = 'yes' + + + ## Add closest gene, subject to distance cutoff + dist.to.nearest = GenomicRanges::distanceToNearest(x=x, subject=ens) + dist.to.nearest = dist.to.nearest[mcols(dist.to.nearest)$distance <= closest.max.distance] + x$closest[queryHits(dist.to.nearest)] = ens$name[subjectHits(dist.to.nearest)] + + return(x) + +} + +## Collect arguments +option_list = list( + make_option(c("-c", "--cnv"), type='character', help="Input CNV calles"), + make_option(c("-a", "--caller"), type='character', help="Name of tool used to call CNVs in --cnv (only bicseq2 is currently supported)"), + make_option(c("-t", "--tumor"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"), + 
make_option(c("-n", "--normal"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"), + make_option(c("-b", "--cytoband"), type='character', help="Cytoband file: headerless tab-delimited files with chr, start, end, cytoband, stain"), + make_option(c("-d", "--db_names"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"), + make_option(c("-s", "--db_files"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"), + make_option(c("-e", "--ensembl"), type='character', help="Ensembl gene list"), + make_option(c("-l", "--allowed_chr"), type='character', help="Comma-delimited list of chromosomes to keep"), + make_option(c("-g", "--cancer_census"), type='character', help="Cancer census gene list"), + make_option(c("-f", "--overlap_fraction"), type='numeric', help="Fraction that database hits must overlap query interval"), + make_option(c("-o", "--out_file_main"), type='character', help="Main output BED"), + make_option(c("-p", "--out_file_supplemental"), type='character', help="Supplemental output BED")) +opt = parse_args(OptionParser(option_list=option_list)) + + +## Unpack arguments +opt$db_names = unlist(strsplit(opt$db_names, ',', fixed=T)) +opt$db_files = unlist(strsplit(opt$db_files, ',', fixed=T)) +opt$allowed_chr = unlist(strsplit(opt$allowed_chr, ',', fixed=T)) + + +## Read files +cnv = readCNV(opt$cnv, chr=opt$allowed_chr) +cyto = readCytoband(opt$cytoband) +cgc = readCancerCensus(opt$cancer_census) +ensembl = readEnsembl(opt$ensembl) + +## Add cytoband annotation +cnv = annotateCytoband(cnv=cnv, cytoband=cyto) + +## Add tumor-normal id, caller info +cnv$`tumor--normal` = paste0(opt$tumor,'--',opt$normal) +cnv$tool = opt$caller + + +## Annotate focal/large-scale +cnv$focal = ifelse(width(cnv) < LARGESCALE_MIN, 'yes', 'no') + + +## Annotate dup/del/neu +cnv$type = 'NEU' +cnv$type[cnv$log2 > DUP_LOG2] = 
'DUP' +cnv$type[cnv$log2 < DEL_LOG2] = 'DEL' + + +## Annotate with databases +cnv$db = '' +for (i in 1:length(opt$db_names)) { + + db.name = opt$db_names[i] + db.file = opt$db_files[i] + + print(db.name) + + db = readDB(db.file) + cnv = annotateDB(x=cnv, db=db, name=db.name, overlap=opt$overlap_fraction) + +} + +## Annotate with CGC genes +cnv = cnv %$% cgc +cnv$cgc = gsub(' ', '', cnv$cgc) + +## Annotate with Ensembl genes +cnv = annotateEnsembl(x=cnv, ens=ensembl) + +## Subtract 1 from the output start to adhere to BED standard +start(cnv) = start(cnv) - 1 + +## Rename chr, convert to data frame +cnv = as.data.frame(cnv) +cnv$`#chr` = cnv$seqnames + +## Build info field +cnv$info = paste0('known=',cnv$db, ';Cancer_census=',cnv$cgc, ';DisruptL=',cnv$disrupt.l, ';DisruptR=', cnv$disrupt.r) +cnv$info[cnv$intergenic == 'yes'] = paste0(cnv$info[cnv$intergenic == 'yes'], ';Intergenic') +cnv$info[cnv$intergenic == 'yes'] = paste0(cnv$info[cnv$intergenic == 'yes'], ';Closest=', cnv$closest[cnv$intergenic == 'yes']) + +## Fields included in main/supplemental are slightly different +for (i in c('main', 'supplemental')) { + + cnv.i = cnv[, c('#chr', 'start', 'end', 'type', 'log2', 'tool', 'tumor..normal', 'info', 'focal', 'cytoband')] + colnames(cnv.i) = gsub('..', '--', colnames(cnv.i), fixed=T) + outfile = ifelse(i == 'main', opt$out_file_main, opt$out_file_supplemental) + + if (i=='supplemental') { + + cnv.i$info = paste0(cnv$info,';Contained=',cnv$contains) + + } + + write.table(cnv.i, outfile, row.names=F, col.names=T, sep='\t', quote=F) + +} diff --git a/bin/pta/annotate_id.py b/bin/pta/annotate_id.py new file mode 100644 index 00000000..3eac8237 --- /dev/null +++ b/bin/pta/annotate_id.py @@ -0,0 +1,144 @@ All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.
# Version: 0.1
# Author: Jennifer M Shelton All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + + +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def get_csq_columns(bcf_in): + ''' + get column names from the bar + separated CSQ VEP annotation + results. CSQ are Consequence + annotations from Ensembl VEP. + ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. + ''' + alt_count = len(record.alts) + csq_dicts = {} + for i in range(alt_count): + try: + csq_line = record.info['CSQ'][i] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[i] + csq_values = csq_line.split('|') + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def get_ID(record, csq_columns): + ''' + Get new ID results is a Cosmic Coding or non-coding result is available. 
+ ''' + csq_dicts = get_csqs(record, csq_columns) + alt_count = len(record.alts) + coding_ids = [] + noncoding_ids = [] + for i in range(alt_count): + if 'CosmicCoding' in csq_dicts[i]: + coding_ids += [csq_dicts[i]['CosmicCoding'].replace('&', ';')] + if 'CosmicNonCoding' in csq_dicts[i]: + noncoding_ids += [csq_dicts[i]['CosmicNonCoding'].replace('&', ';')] + ids = ';'.join([id for id in coding_ids + noncoding_ids if not id == '']) + return ids + + +def fix_gt(gt): + ''' + change GT 0/0/0/1/0, 0/0/0/1, 0/0/1, etc to 0/1 + ''' + if len(gt.split('/')) > 2: + gt = '0/1' + return gt + + +def modify_record(record, csq_columns): + ''' + Add new ID field to records as needed + ''' + # 'CosmicCoding', 'CosmicNonCoding' + gts = [key for key in record.samples[0].keys() if key.endswith('_GT')] + for key in gts: + gt = record.samples[0][key] + record.samples[0][key] = fix_gt(gt) + gt = record.samples[1][key] + record.samples[1][key] = fix_gt(gt) + ids = get_ID(record, csq_columns) + if ids: + if record.id: + record.id = record.id + ';' + ids + else: + record.id = ids + return record + +def write_vcf(bcf_in, vcf_out_file, csq_columns): + ''' + Write out the VCF + ''' + bcf_out = pysam.VariantFile(vcf_out_file, 'w', header=bcf_in.header) + for record in bcf_in.fetch(): + record = modify_record(record, csq_columns) + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + + +def main(): + ''' + Annotates files by adding information about the + Cosmic coding and non-coding entries to the ID column. + Also changes GT 0/0/0/1/0, 0/0/0/1, 0/0/1, etc to 0/1. 
+ ''' + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + assert os.path.isfile(vcf_file), 'Failed to find caller VCF call file :' + vcf_file + bcf_in = read_vcf(vcf_file) + csq_columns = get_csq_columns(bcf_in) + write_vcf(bcf_in, vcf_out_file, csq_columns) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/bicseq2_config_writer.py b/bin/pta/bicseq2_config_writer.py new file mode 100644 index 00000000..56d7fd25 --- /dev/null +++ b/bin/pta/bicseq2_config_writer.py @@ -0,0 +1,109 @@ +import pandas as pd +import os +import argparse +from os import listdir +from os.path import isfile, join +import glob + +class Bicseq2Prep(): + def __init__(self, sample_id, + fa_files, + out_file, + mappability_directory, + norm_bicseq2_config, + temp_seqs): + self.out_file = out_file + self.sample_id = sample_id + self.norm_bicseq2_config = norm_bicseq2_config + self.mappability_directory = mappability_directory + self.fa_files = fa_files + self.temp_seqs = temp_seqs + self.write_sample_configs() + + def match_fa_file(self, row): + for fa_file in self.fa_files: + if fa_file.endswith('_' + str(row.chrom_name) + '.fasta'): + return fa_file + + def match_mappability_file(self, row): + mappability_files = (glob.glob(self.mappability_directory+"/*")) + # from the directory provided, find all files. 
+ for mappability_file in mappability_files: + if os.path.basename(mappability_file) == str(row.chrom_name) + '.uniq.txt': + return mappability_file + + def match_seq_file(self, row): + for temp_seq in self.temp_seqs: + if os.path.basename(temp_seq) == str(self.sample_id) + '_' + str(row.chrom_name) + '.seq': + return temp_seq + + def match_norm_file(self, row): + for temp_seq in self.temp_seqs: + if os.path.splitext(os.path.basename(temp_seq))[0] + '.norm.bin.txt' == str(self.sample_id) + '_' + str(row.chrom_name) + '.norm.bin.txt': + return os.path.splitext(os.path.basename(temp_seq))[0] + '.norm.bin.txt' + # Modified to programtically set this output. + + def prep(self): + '''initial file should start with one column named chrom_name ''' + data = pd.read_csv(self.norm_bicseq2_config, sep='\t') + assert ''.join(data.columns.tolist()) == 'chrom_name', 'Error: initial config file should start with one column named chrom_name' + data['fa_file'] = data.apply(lambda row: self.match_fa_file(row), axis=1) + data['mappability'] = data.apply(lambda row: self.match_mappability_file(row), axis=1) + data['readPosFile'] = data.apply(lambda row: self.match_seq_file(row), axis=1) + data['bin_file_normalized'] = data.apply(lambda row: self.match_norm_file(row), axis=1) + return data + + def write_sample_configs(self): + '''write configs for the normalization step''' + # tumor + data = self.prep() + config = data.to_csv(self.out_file, sep='\t', index=False) + + +def get_args(): + '''Parse input flags + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--fa-files', + help='List of chrom fasta files. ', + required=True, + nargs='*' + ) + parser.add_argument('--mappability-directory', + help='Directory containing mappability files. 
class Bicseq2Prep():
    '''
    Write the paired BICseq2 segmentation config
    (tumor--normal.bicseq2.seg.config).

    Constructing the object does all the work: the template config is read,
    a tumor ("case") and a normal ("control") file is looked up for every
    chromosome and the resulting table is written to out_file.
    '''

    def __init__(self, pair_id,
                 out_file,
                 seg_bicseq2_config,
                 tumor_norms,
                 normal_norms):
        self.out_file = out_file
        self.pair_id = pair_id
        self.seg_bicseq2_config = seg_bicseq2_config
        self.tumor_norms = tumor_norms
        self.normal_norms = normal_norms
        self.write_sample_configs()

    def match_file(self, row, files):
        '''
        Return the first path in files whose basename ends with
        _<chrom_name>.norm.bin.txt for the chromosome in row.
        Returns None when no file matches.
        '''
        suffix = '_' + str(row.chrom_name) + '.norm.bin.txt'
        for temp_norm in files:
            if os.path.basename(temp_norm).endswith(suffix):
                return temp_norm
        return None

    def prep_pair(self):
        ''' file: tumor--normal.bicseq2.seg.config
        Read the fasta-specific but sample independent portion of the
        config file and add the case and control columns.'''
        data = pd.read_csv(self.seg_bicseq2_config, sep='\t')
        # Compare the column list itself; joining the names would also
        # accept e.g. two columns called "chrom" and "_name".
        assert data.columns.tolist() == ['chrom_name'], 'Error: initial config file should start with one column named chrom_name'
        # Plain lists instead of DataFrame.apply(axis=1) so that a
        # header-only template still yields a (header-only) config.
        rows = list(data.itertuples(index=False))
        data['case'] = [self.match_file(row, files=self.tumor_norms)
                        for row in rows]
        data['control'] = [self.match_file(row, files=self.normal_norms)
                           for row in rows]
        return data

    def write_sample_configs(self):
        '''Build the table with prep_pair() and save it, tab separated,
        to out_file.'''
        self.prep_pair().to_csv(self.out_file, sep='\t', index=False)


def get_args():
    '''Parse input flags and return them as a dict keyed by dest name.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--tumor-norms',
                        help='List of file paths ${tumor}_${chrom_name}.norm.bin.txt '
                        ' (Output from Bicseq2Norm ).',
                        required=True,
                        nargs='*'
                        )
    parser.add_argument('--normal-norms',
                        help='List of file paths ${normal}_${chrom_name}.norm.bin.txt '
                        ' (Output from Bicseq2Norm ).',
                        required=True,
                        nargs='*'
                        )
    parser.add_argument('--seg-bicseq2-config',
                        help='Pre filled file for ${pair_id}.bicseq2.seg.config. '
                        'Fasta-specific but sample-independent portion of config file.',
                        required=True
                        )
    parser.add_argument('--out-file',
                        help='Output config filename',
                        required=True
                        )
    parser.add_argument('--pair-id',
                        help='pair id',
                        required=True
                        )
    return vars(parser.parse_args())


def main():
    '''Command-line entry point: the config is written by the constructor.'''
    args = get_args()
    Bicseq2Prep(pair_id=args['pair_id'],
                out_file=args['out_file'],
                seg_bicseq2_config=args['seg_bicseq2_config'],
                tumor_norms=args['tumor_norms'],
                normal_norms=args['normal_norms'])


if __name__ == "__main__":
    main()
class Bicseq2Prep():
    '''
    Write the tumor-only BICseq2 segmentation config
    (tumor.bicseq2.seg.config).

    Constructing the object does all the work: the template config is read,
    a tumor ("case") file is looked up for every chromosome and the
    resulting table is written to out_file.
    '''

    def __init__(self, pair_id,
                 out_file,
                 seg_bicseq2_config,
                 tumor_norms):
        self.out_file = out_file
        self.pair_id = pair_id
        self.seg_bicseq2_config = seg_bicseq2_config
        self.tumor_norms = tumor_norms
        self.write_sample_configs()

    def match_file(self, row, files):
        '''
        Return the first path in files whose basename ends with
        _<chrom_name>.norm.bin.txt for the chromosome in row.
        Returns None when no file matches.
        '''
        suffix = '_' + str(row.chrom_name) + '.norm.bin.txt'
        for temp_norm in files:
            if os.path.basename(temp_norm).endswith(suffix):
                return temp_norm
        return None

    def prep_pair(self):
        ''' file: tumor.bicseq2.seg.config
        Read the fasta-specific but sample independent portion of the
        config file and add the case column.'''
        data = pd.read_csv(self.seg_bicseq2_config, sep='\t')
        # Compare the column list itself; joining the names would also
        # accept e.g. two columns called "chrom" and "_name".
        assert data.columns.tolist() == ['chrom_name'], 'Error: initial config file should start with one column named chrom_name'
        # A plain list instead of DataFrame.apply(axis=1) so that a
        # header-only template still yields a (header-only) config.
        data['case'] = [self.match_file(row, files=self.tumor_norms)
                        for row in data.itertuples(index=False)]
        return data

    def write_sample_configs(self):
        '''Build the table with prep_pair() and save it, tab separated,
        to out_file.'''
        self.prep_pair().to_csv(self.out_file, sep='\t', index=False)


def get_args():
    '''Parse input flags and return them as a dict keyed by dest name.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--tumor-norms',
                        help='List of file paths ${tumor}_${chrom_name}.norm.bin.txt '
                        ' (Output from Bicseq2Norm ).',
                        required=True,
                        nargs='*'
                        )
    parser.add_argument('--seg-bicseq2-config',
                        help='Pre filled file for ${pair_id}.bicseq2.seg.config. '
                        'Fasta-specific but sample-independent portion of config file.',
                        required=True
                        )
    parser.add_argument('--out-file',
                        help='Output config filename',
                        required=True
                        )
    parser.add_argument('--pair-id',
                        help='pair id',
                        required=True
                        )
    return vars(parser.parse_args())


def main():
    '''Command-line entry point: the config is written by the constructor.'''
    args = get_args()
    Bicseq2Prep(pair_id=args['pair_id'],
                out_file=args['out_file'],
                seg_bicseq2_config=args['seg_bicseq2_config'],
                tumor_norms=args['tumor_norms'])


if __name__ == "__main__":
    main()
## BJS Note: this script was located in the root path of the Docker container
## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274
## It is reproduced below as it exists there without modification
## NOTE(review): only comments have been added since; the code is unchanged.

## Filter a bedpe for somatic variants (i.e., not in specified germline databases), and
## high-confidence variants (2+ callers or 1 caller with a nearby copy number changepoint)
libs = c('optparse', 'GenomicRanges')
invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T)))
options(width=200, scipen=999)


## Check if databases db are in info string x
## x:  one info string; entries are separated by ';' and the entry of
##     interest has the form 'known=<name>[,<name>...]'
## db: character vector of database names
## Returns TRUE when at least one name listed under 'known=' is in db.
inDatabase = function(x, db) {

  ## Split info field, look for database entry
  x = unlist(strsplit(x, ';', fixed=T))
  x = grep('known=', x, fixed=T, value=T)
  x = gsub('known=', '', x, fixed=T)
  x = unlist(strsplit(x, ',', fixed=T))

  return(any(x %in% db))

}



## Turn a changepoint string into a GRanges object.
## x is split on ':' and '-'; the first piece is used as the sequence name
## and the second and third pieces each become a range whose start equals
## its end (i.e. the two boundary coordinates, not the interval between them).
makeGRangesFromChangepoint = function(x) {

  x = unlist(strsplit(x, ':|-'))
  GRanges(seqnames=x[1], ranges=IRanges(as.numeric(x[2:3]), as.numeric(x[2:3])))

}



## Is variant x a high-confidence variant?
## Meant to be used with apply(,2,)
## NOTE(review): the call at the bottom of this script uses apply(x, 1, ...),
## i.e. x is one BEDPE row indexed by column name.
## cpmax: maximum allowed distance between a breakpoint and a changepoint.
## Returns TRUE when the 'tools' field contains a comma (more than one
## caller) or either breakpoint is within cpmax of its changepoint.
isHighConfidence = function(x, cpmax) {

  ## Is there support from multiple callers?
  multi.caller = grepl(',', x['tools'])

  ## Is either breakpoint close enough to its nearest changepoint?
  x1.gr = GRanges(seqnames=x['#chr1'], ranges=IRanges(as.numeric(x['start1']), as.numeric(x['end1'])))
  ch1.gr = makeGRangesFromChangepoint(x['cnv_changepoint_1'])
  near.ch1 = any(GenomicRanges::distance(x1.gr, ch1.gr) <= cpmax)

  x2.gr = GRanges(seqnames=x['chr2'], ranges=IRanges(as.numeric(x['start2']), as.numeric(x['end2'])))
  ch2.gr = makeGRangesFromChangepoint(x['cnv_changepoint_2'])
  near.ch2 = any(GenomicRanges::distance(x2.gr, ch2.gr) <= cpmax)

  return(multi.caller || near.ch1 || near.ch2)

}



## Collect arguments
option_list = list(
  make_option(c("-b", "--bedpe"), type='character', help="Input BEDPE"),
  make_option(c("-m", "--max_changepoint_distance"), type='numeric', help="Maximum distance a changepoint can be from a breakpoint to 'rescue' it into the high-confidence set"),
  make_option(c("-f", "--filter_databases"), type='character', help="Comma-separated list of databases to filter, looking in the info field"),
  make_option(c("-s", "--out_file_somatic"), type='character', help="Output somatic BEDPE"),
  make_option(c("-o", "--out_file_highconf"), type='character', help="Output high-confidence BEDPE"))
opt = parse_args(OptionParser(option_list=option_list))

## Unpack arguments
opt$filter_databases = unlist(strsplit(opt$filter_databases, ',', fixed=T))


## Read bedpe, filter for known germline variants
## (rows whose info field names one of the filter databases are dropped)
x = read.csv(opt$bedpe, h=T, stringsAsFactors=F, sep='\t', check.names=F)
x = x[!sapply(x$info, inDatabase, opt$filter_databases), ]

## Write out somatic variants
write.table(x, opt$out_file_somatic, row.names=F, col.names=T, sep='\t', quote=F)

## Filter for high confidence
## (row-wise: isHighConfidence is evaluated once per remaining variant)
x = x[apply(x, 1, isHighConfidence, opt$max_changepoint_distance), ]

## Write result
write.table(x, opt$out_file_highconf, row.names=F, col.names=T, sep='\t', quote=F)
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.
# Version: 0.1
def compare_num(observed, rule, value):
    '''
    Compare two numbers. Available rules are lt, gt and eq.

    Returns False when "observed <rule> value" holds and True otherwise
    (including for an unknown rule), i.e. the result is a "fail" flag.
    observed is truncated with int(); value is converted with float() so
    that thresholds arriving as command-line strings compare numerically
    instead of raising TypeError or silently never matching.
    '''
    observed = int(observed)
    threshold = float(value)
    if rule == 'lt':
        satisfied = observed < threshold
    elif rule == 'gt':
        satisfied = observed > threshold
    elif rule == 'eq':
        satisfied = observed == threshold
    else:
        satisfied = False
    return not satisfied


def compare_str(observed, rule, value):
    '''
    Compare two strings to see that they are the same 'eq'
    or are not the same 'ne'.

    Returns False when the rule holds and True otherwise (including for
    an unknown rule), i.e. the result is a "fail" flag.
    '''
    if rule == 'eq':
        satisfied = value == observed
    elif rule == 'ne':
        satisfied = value != observed
    else:
        satisfied = False
    return not satisfied


def test_rules(filter_func, rule):
    '''
    Test that rule matches combination.

    Raises AssertionError when rule is not one of the rules supported by
    filter_func (compare_num or compare_str).
    '''
    combos = {compare_num: ['lt', 'gt', 'eq'],
              compare_str: ['eq', 'ne']}
    assert rule in combos[filter_func], 'rule not is possible rules for given function. Possible rules: ' + str(combos[filter_func]) + ' Rule : ' + rule


# Validation helper, not a unit test: keep pytest from collecting it when
# this module's names are imported into a test module.
test_rules.__test__ = False


def custom(row, filter_func, key, value, rule):
    '''
    Run custom filter for bed file row: apply filter_func to row[key].
    '''
    return filter_func(row[key], rule, value)
def filter_bed(bed_in,
               filter_func, key,
               value, rule):
    '''
    Filter based on column and rule.

    filter_func(observed, rule, value) is evaluated for every value of
    column key; rows for which it returns False are kept. The returned
    frame is a copy carrying the extra boolean column "fail"; bed_in
    itself is left untouched. Works for an empty frame as well.
    '''
    fail = [filter_func(observed, rule, value) for observed in bed_in[key]]
    flagged = bed_in.assign(fail=fail)
    return flagged[flagged['fail'].eq(False)].copy()


def get_bad_pos(bed_in_filtered):
    '''
    Grab filtered position info.

    Returns the set of "<#CHROM>{<END>" keys of the given rows (an empty
    set for an empty frame). The frame is not modified.
    '''
    return {chrom + '{' + str(end)
            for chrom, end in zip(bed_in_filtered['#CHROM'],
                                  bed_in_filtered['END'])}


def add_filter_header(bcf_out,
                      id,
                      description):
    '''
    Add new FILTER field to the header of bcf_out and return bcf_out.
    '''
    bcf_out.header.filters.add(id=id,
                               number=None,
                               type=None,
                               description=description)
    return bcf_out


def compose(bcf_in, bad_pos):
    '''
    Filter based on the PON bad positions.

    Yields every record of bcf_in. Records whose "<chrom>{<pos>" key is in
    bad_pos get the PON filter added; when their only existing filter is
    PASS or SUPPORT that filter is cleared first.
    '''
    for record in bcf_in.fetch():
        key = record.chrom + '{' + str(record.pos)
        if key in bad_pos:
            filters = list(record.filter.keys())
            if len(filters) == 1 and filters[0] in ['PASS', 'SUPPORT']:
                record.filter.clear()
            record.filter.add('PON')
        yield record


def write_file(bcf_out, record):
    '''
    Write to a VCF. A non-zero status returned by write() is printed.
    '''
    exit_status = bcf_out.write(record)
    if exit_status != 0:
        print(exit_status)
    return bcf_out
def main():
    '''
    Filter VCF for start positions which match a panel-of-normals BED file.

    The BED rows are first reduced with the default filter
    (TotalUniqueSamples greater than --default) and then with any custom
    filters given as parallel --filter/--key/--rule/--value lists. Every
    VCF record is written to the output; records at a remaining BED
    position are flagged with the PON filter.
    '''
    # ==========================
    # Input variables
    # ==========================
    parser = argparse.ArgumentParser(
        description='DESCRIPTION: Filters a VCF file '
                    'if a start position of a VCF matches the '
                    'start + 1 position of the bed file. Command-line '
                    'options that may be omitted (i.e. are NOT '
                    'required) are shown in square brackets.')
    # Documentation parameters
    parser.add_argument('-v', '--vcf',
                        dest='vcf_file',
                        help='VCF file',
                        required=True)
    parser.add_argument('-o', '--out',
                        dest='out',
                        help='Output VCF file',
                        required=True)
    parser.add_argument('-b', '--bed',
                        dest='bed_file',
                        help='Input BED file to use to filter',
                        required=True)
    parser.add_argument('-d', '--default',
                        dest='default',
                        help='Default filter for TotalUniqueSamples. Result '
                             'must be greater than this value [1]',
                        default='1')
    parser.add_argument('-f', '--filter',
                        dest='filter_func',
                        choices=['str',
                                 'num'],
                        nargs='+',
                        help='Filter type(s)',
                        default=[False])
    # The custom-filter code below has always read args.key, but the
    # option was never declared, so any use of --filter failed.
    parser.add_argument('-k', '--key',
                        dest='key',
                        nargs='+',
                        help='BED column name(s) the custom filter(s) are applied to',
                        default=[False])
    parser.add_argument('-r', '--rule',
                        dest='rule',
                        nargs='+',
                        help='Rule(s) to filter based on value. Acceptable options '
                             'are "lt", "gt", "eq" for "num" and '
                             '"eq" and "ne" for "str"',
                        default=[False])
    parser.add_argument('-val', '--value',
                        dest='value',
                        nargs='+',
                        help='Value(s) used to compare to custom filter',
                        default=[False])
    parser.add_argument('-c', '--chrom',
                        dest='chrom',
                        help='Chrom used for filtering',
                        default=False)
    args = parser.parse_args()
    assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file
    assert os.path.isfile(args.bed_file), 'Failed to find BED file ' + args.bed_file
    use_custom = bool(args.filter_func[0])
    if use_custom:
        assert args.key[0], 'key is required for custom filter'
        assert args.value[0], 'value is required for custom filter'
        assert args.rule[0], 'rule is required for custom filter'
        assert len(args.filter_func) == len(args.key) == len(args.rule) == len(args.value), \
            'filter, key, rule and value need one entry per custom filter'
    # ==========================
    # Filter PON
    # ==========================
    functions = {'num': compare_num,
                 'str': compare_str}
    # default filter
    filter_func = functions['num']
    key = 'TotalUniqueSamples'
    rule = 'gt'
    value = int(args.default)

    bed_in = read_bed(args.bed_file, args.chrom)
    test_rules(filter_func, rule)
    bed_in_filtered = filter_bed(bed_in, filter_func, key, value, rule)
    # optional_filters
    if use_custom:
        for kind, key, rule, value in zip(args.filter_func, args.key,
                                          args.rule, args.value):
            filter_func = functions[kind]
            if kind == 'num':
                # command-line values are strings; compare numerically
                value = float(value)
            test_rules(filter_func, rule)
            bed_in_filtered = filter_bed(bed_in_filtered, filter_func, key, value, rule)
    bad_pos = get_bad_pos(bed_in_filtered)
    # ==========================
    # Filter with PON
    # ==========================
    bcf_in = read_vcf(args.vcf_file)
    bcf_in = add_filter_header(bcf_out=bcf_in,
                               id='PON',
                               description='Variant in panel of normal database')
    bcf_out = pysam.VariantFile(args.out, 'w', header=bcf_in.header)
    try:
        for record in compose(bcf_in, bad_pos):
            bcf_out = write_file(bcf_out, record)
    finally:
        # close explicitly so the output is flushed even on error
        bcf_out.close()
        bcf_in.close()


##########################################################################
#####       Execute main unless script is simply imported     ############
#####                for individual functions                 ############
##########################################################################


if __name__ == '__main__':
    main()
# Captures the mate contig of a breakend ALT such as N[chr2:321[ or ]chr2:321]N
# (raw string: "\[" is not a valid escape in a normal string literal).
chrom_pattern = re.compile(r'[\[\]](.*):')


class FilterNonChroms():
    def __init__(self, vcf_file,
                 out_file,
                 chroms):
        '''
        Copy vcf_file to out_file, dropping every call that touches a
        contig outside chroms.

        The filtering runs as part of construction. chroms is a list of
        contig names; when it is False or None (the command-line default)
        no call is dropped.
        '''
        self.chroms = chroms
        self.bcf_in = self.read_vcf(vcf_file)
        self.bcf_out = self.start_vcf(out_file)
        self.filter_vcf()

    def read_vcf(self, vcf_file):
        '''Open the input VCF.'''
        bcf_in = pysam.VariantFile(vcf_file)
        return bcf_in

    def start_vcf(self, out_file):
        '''Open the output VCF for writing with the input header.'''
        bcf_out = pysam.VariantFile(out_file, 'w', header=self.bcf_in.header)
        return bcf_out

    def filter_non_chroms(self, record):
        '''
        Filter calls to leave calls that are only on chroms.

        Returns True (drop the call) when the record contig, or the mate
        contig named in a breakend ALT, is not in self.chroms; otherwise
        False.
        '''
        if self.chroms is None or self.chroms is False:
            # no list given: `contig in False` used to raise TypeError
            return False
        if record.contig not in self.chroms:
            return True
        for alt in record.alts or ():
            result = chrom_pattern.search(alt)
            if result and result[1] not in self.chroms:
                return True
        return False

    def filter_vcf(self):
        '''
        Only print calls that have:
            1) ref in chroms
            2) alt in chroms
        '''
        for record in self.bcf_in.fetch():
            if not self.filter_non_chroms(record):
                exit_status = self.bcf_out.write(record)
                if exit_status != 0:
                    print(exit_status)


def get_args():
    '''Parse input flags and return them as a dict keyed by dest name.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('--vcf-file',
                        help='SV VCF file',
                        required=True
                        )
    parser.add_argument('--output',
                        help='Output VCF file',
                        required=True
                        )
    parser.add_argument('--chroms',
                        help='A space separated list of chroms to plot.',
                        required=False,
                        nargs='*',
                        default=False
                        )
    return vars(parser.parse_args())


def main():
    '''Command-line entry point: the filtering is done by the constructor.'''
    args = get_args()
    FilterNonChroms(args['vcf_file'],
                    args['output'],
                    chroms=args['chroms'])


if __name__ == "__main__":
    main()
def read_vcf(vcf_file):
    '''
    Read in annotated VCF file.
    '''
    bcf_in = pysam.VariantFile(vcf_file)  # auto-detect input format
    return bcf_in


def filter_call(record):
    '''
    Filter calls to leave calls that may be
    supported by a second Lancet run.

    Returns False (keep as candidate) only when the record has a
    called_by field that does not contain lancet, num_callers is 1 and
    supported_by is missing or empty. Returns True (filter out) otherwise.
    '''
    info = record.info
    single_non_lancet = 'called_by' in info.keys() and \
        'lancet' not in info['called_by'] and \
        info['num_callers'] == 1
    if not single_non_lancet:
        return True
    return bool('supported_by' in info.keys() and info['supported_by'])


def filter_vcf(bcf_in, bcf_out):
    '''
    Only print calls that are:
        1) Not already called by Lancet
        2) Only supported by one caller
        3) Not Supported by a support caller
    '''
    for record in bcf_in.fetch():
        if not filter_call(record):
            exit_status = bcf_out.write(record)
            if exit_status != 0:
                print(exit_status)


def main():
    '''
    Only print calls that are:
        1) Not already called by Lancet
        2) Only supported by one caller
        3) Not Supported by a support caller

    Usage: python get_candidates.py VCF_FILE OUT_FILE
    '''
    if len(sys.argv) < 3:
        sys.exit('USAGE: python get_candidates.py VCF_FILE OUT_FILE')
    vcf_file = sys.argv[1]
    out_file = sys.argv[2]
    assert os.path.isfile(vcf_file), 'Failed to find somatic VCF call file :' + vcf_file
    bcf_in = read_vcf(vcf_file)
    bcf_out = pysam.VariantFile(out_file, 'w', header=bcf_in.header)
    try:
        filter_vcf(bcf_in, bcf_out)
    finally:
        # close explicitly so the output is flushed even on error
        bcf_out.close()
        bcf_in.close()


##########################################################################
#####       Execute main unless script is simply imported     ############
#####                for individual functions                 ############
##########################################################################
if __name__ == '__main__':
    main()
# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.2 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + +import sys +import os +import re +import logging as log +import pysam +import mygene +import argparse +import pandas as pd +########################################################################## +############## Custom functions ############ +########################################################################## + + +def ensembl_gene_id_entrez_id(ensembl_gene_id, mg): + ''' + Returns entrez id from ensemble. + False is used for regions that do not + correspond to a gene region or Ensembl ID + ''' + entrez_id = 0 + if ensembl_gene_id != '': + results = mg.query(ensembl_gene_id) + try: + entrez_id = str(results['hits'][0]['entrezgene']) + except (KeyError, IndexError): + pass +# sys.stderr.write('WARNING: entrezgene not found for ' + str(ensembl_gene_id) + ' in ' + str(results) + '\n') + return entrez_id + + +def ensembl_gene_entrez_local(ensembl_gene_id, data): + ''' + Returns entrez id from ensemble. + False is used for regions that do not + correspond to a gene region or Ensembl ID + ''' + try: + entrez_id = data[(data['Gene stable ID'] == ensembl_gene_id)]['NCBI gene (formerly Entrezgene) ID'].values[0] + except IndexError: + entrez_id = 0 + if str(entrez_id) == 'nan': + entrez_id = 0 + return int(entrez_id) + +def get_csq_columns(bcf_in): + ''' + get column names from the bar + separated CSQ VEP annotation + results. CSQ are Consequence + annotations from Ensembl VEP. + ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. 
def get_csqs(record, csq_columns):
    '''
    Get new INFO field results.

    Returns a dict mapping the index of each ALT allele to a dict of
    CSQ column name -> value, built from the CSQ entry at the same index.
    '''
    csq_dicts = {}
    for i in range(len(record.alts)):
        try:
            csq_line = record.info['CSQ'][i]
        except UnicodeDecodeError:  # for names with accents and other unexpected characters (rare)
            # fall back to parsing the INFO column of the raw VCF line
            info_field = str(record).split('\t')[7]
            csq_line = info_field.split('CSQ=')[1]
            csq_line = csq_line.split(';')[0]
            csq_line = csq_line.split(',')[i]
        csq_dicts[i] = dict(zip(csq_columns, csq_line.split('|')))
    return csq_dicts


def group_mnv(record):
    '''
    Convert inhouse MNV and SNV type to GDC-like calls.
    This does not work for multiple alts.
    Only the first alt will be considered because
    type only takes one value.

    record.info['TYPE'] is rewritten in place (SNV -> SNP; MNV -> DNP,
    TNP or ONP when REF and the first ALT both have length 2, both have
    length 3, or share a length above 3) and the record is returned.
    '''
    variant_type = record.info['TYPE']
    if variant_type == 'SNV':
        record.info['TYPE'] = 'SNP'
    elif variant_type == 'MNV':
        ref_len = len(record.ref)
        alt_len = len(record.alts[0])
        if ref_len == 2 and alt_len == 2:
            record.info['TYPE'] = 'DNP'
        elif ref_len == 3 and alt_len == 3:
            record.info['TYPE'] = 'TNP'
        elif ref_len > 3 and alt_len > 3 and ref_len == alt_len:
            record.info['TYPE'] = 'ONP'
    return record



def read_vcf(vcf_file):
    '''
    Read in annotated VCF file.
    '''
    bcf_in = pysam.VariantFile(vcf_file)  # auto-detect input format
    return bcf_in


def get_dbsnp_rs(Existing_variation):
    '''
    Keep only the dbSNP ids (those starting with "rs") of an
    "&"-separated id list and return them joined by commas.
    '''
    return ','.join(variant_id
                    for variant_id in Existing_variation.split('&')
                    if variant_id.startswith('rs'))
def set_frame_shift(type):
    '''
    Set the variant classification based on whether the variant is
    an insertion or a deletion.
    Returns False for any other variant type.
    '''
    return {'DEL': 'Frame_Shift_Del',
            'INS': 'Frame_Shift_Ins'}.get(type, False)


def set_protein_altering(type, ref, alt):
    '''
    Set the variant classification of a protein altering insertion or
    deletion: in frame when the length difference between ref and alt
    is a multiple of three, frame shift otherwise.
    Returns False for any other variant type.
    '''
    if type not in ('DEL', 'INS'):
        return False
    inframe = abs(len(ref) - len(alt)) % 3 == 0
    prefix = 'In_Frame_' if inframe else 'Frame_Shift_'
    return prefix + ('Del' if type == 'DEL' else 'Ins')


def shorten_AA(AAMutation):
    '''
    Shorten three letter amino acid codes to the one letter code
    (Ter becomes *); all other characters are copied unchanged.
    '''
    AA_dict = {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K',
               'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N',
               'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W',
               'Ala': 'A', 'Val': 'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M',
               'Ter': '*'}
    short_mutation = []
    i = 0
    while i < len(AAMutation):
        code = AAMutation[i:i + 3]
        if code in AA_dict:
            short_mutation.append(AA_dict[code])
            i += 3
        else:
            short_mutation.append(AAMutation[i])
            i += 1
    return ''.join(short_mutation)


# VEP consequence term -> MAF Variant_Classification for the terms whose
# class does not depend on the allele. False means "no class".
# https://useast.ensembl.org/info/genome/variation/prediction/predicted_data.html
_STATIC_VARIANT_CLASS = {
    'intergenic_variant': 'Silent',
    'upstream_gene_variant': 'Silent',
    '5_prime_UTR_variant': 'Silent',
    'splice_acceptor_variant': 'Splice_Site',
    'splice_donor_variant': 'Splice_Site',
    'splice_region_variant': 'Splice_Site',
    'missense_variant': 'Missense_Mutation',
    'synonymous_variant': 'Silent',
    'inframe_insertion': 'In_Frame_Ins',
    'inframe_deletion': 'In_Frame_Del',
    'stop_gained': 'Nonsense_Mutation',
    'stop_retained_variant': 'Silent',
    'stop_lost': 'Nonstop_Mutation',
    'intron_variant': 'Silent',
    '3_prime_UTR_variant': 'Silent',
    'downstream_gene_variant': 'Silent',
    'initiator_codon_variant': 'Translation_Start_Site',
    'regulatory_region_variant': 'Silent',
    'TF_binding_site_variant': 'Silent',
    'mature_miRNA_variant': 'RNA',
    'regulatory_region_ablation': 'Silent',
    'regulatory_region_amplification': 'Silent',
    'TFBS_ablation': 'Silent',
    'TFBS_amplification': 'Silent',
    'non_coding_transcript_variant': 'Silent',
    'NMD_transcript_variant': 'Silent',
    'incomplete_terminal_codon_variant': 'Silent',
    'non_coding_transcript_exon_variant': 'RNA',
    'transcript_ablation': 'Splice_Site',
    'transcript_amplification': 'Silent',
    'feature_elongation': False,
    'feature_truncation': False,
    'start_lost': 'Translation_Start_Site',
    'start_retained_variant': 'Silent',
    'coding_sequence_variant': 'Missense_Mutation',
    'splice_polypyrimidine_tract_variant': 'Splice_Site',
    'splice_donor_5th_base_variant': 'Splice_Site',
    'splice_donor_region_variant': 'Splice_Site',
}


def get_variant_classification(consequences, type, ref, alt):
    '''
    Convert VEP consequences to MAF variant_classification.

    consequences is a single VEP consequence term. type, ref and alt are
    only used for frameshift_variant and protein_altering_variant, whose
    class depends on the kind and length of the indel.
    Raises KeyError for a term that is not in the table.
    '''
    if consequences == 'frameshift_variant':
        return set_frame_shift(type)
    if consequences == 'protein_altering_variant':
        return set_protein_altering(type, ref, alt)
    return _STATIC_VARIANT_CLASS[consequences]
def get_HGVSp_Short(HGVSp_string, HGVSc_string, csq_term):
    '''
    Convert HGVSp to HGVSp_Short:
    derive HGVSp_Short from HGVSp. if Consequence is splice
    acceptor/donor variants, generate HGVSp_Short from HGVSc instead.

    For splice acceptor/donor terms the result is p.X<n>_splice, where n
    is derived from the coding position at the start of the c. notation
    (positions below 1 count as 1), or '' when HGVSc does not start with
    a numeric coding position. For every other term the part of HGVSp
    after the "<id>:" prefix is returned with three letter amino acid
    codes replaced by one letter codes, or '' when HGVSp is empty.
    '''
    aa_to_short = {'Ala': 'A',
                   'Arg': 'R',
                   'Asn': 'N',
                   'Asp': 'D',
                   'Asx': 'B',
                   'Cys': 'C',
                   'Glu': 'E',
                   'Gln': 'Q',
                   'Glx': 'Z',
                   'Gly': 'G',
                   'His': 'H',
                   'Ile': 'I',
                   'Leu': 'L',
                   'Lys': 'K',
                   'Met': 'M',
                   'Phe': 'F',
                   'Pro': 'P',
                   'Ser': 'S',
                   'Thr': 'T',
                   'Trp': 'W',
                   'Tyr': 'Y',
                   'Val': 'V',
                   'Xxx': 'X',
                   'Xaa': 'X',
                   'Ter': '*'
                   }
    HGVSp_Short = ''
    if csq_term in ('splice_acceptor_variant', 'splice_donor_variant'):
        hgvsc_parts = HGVSc_string.split(':')
        if len(hgvsc_parts) > 1:
            HGVSc_string = hgvsc_parts[1]
        # raw string with an escaped dot: match a literal "c." prefix
        HGVSc_coding = re.findall(r'^c\.(\d+)', HGVSc_string)
        if HGVSc_coding:
            input_pos = max(int(HGVSc_coding[0]), 1)
            corrected_pos = (input_pos + input_pos % 3) // 3
            HGVSp_Short = 'p.X' + str(corrected_pos) + '_splice'
        return HGVSp_Short
    if len(HGVSp_string) > 0:
        hgvsp_parts = HGVSp_string.split(':')
        # tolerate a protein change given without the "<id>:" prefix
        HGVSp_Short = hgvsp_parts[1] if len(hgvsp_parts) > 1 else hgvsp_parts[0]
        for long_code, short_code in aa_to_short.items():
            HGVSp_Short = HGVSp_Short.replace(long_code, short_code)
    return HGVSp_Short
def make_row(record, csq_columns, bcf_in, library,
             normal, tumor, VEP_version='GRCh38',
             mg=False,
             ensembl_entrez=False):
    '''
    Fill in MAF row.

    Generator: for each ALT allele of record it yields one string holding
    the newline-terminated MAF line(s) for that allele, one line per
    distinct result of the allele's "&"-separated VEP consequence terms,
    in the order of the terms.

    normal / tumor are matched against the second sample of the VCF
    header to decide which sample column is which; ValueError is raised
    when neither matches. The Entrez id is looked up with mg when it is
    set, otherwise in the ensembl_entrez table.
    record.info['TYPE'] is rewritten by group_mnv() as a side effect.
    '''
    # =======================
    # Get VEP annotation
    # =======================
    csq_dicts = get_csqs(record, csq_columns)
    samples = bcf_in.header.samples
    if normal == samples[1]:
        normal_index = 1
        tumor_index = 0
    elif tumor == samples[1]:
        normal_index = 0
        tumor_index = 1
    else:
        # previously surfaced later as an unbound local variable
        raise ValueError('Neither normal (' + str(normal) + ') nor tumor ('
                         + str(tumor) + ') matches the second VCF sample '
                         + str(samples[1]))
    for i, alt in enumerate(record.alts):
        csq = csq_dicts[i]
        if csq['SYMBOL_SOURCE'] == 'HGNC':
            hugo = csq['SYMBOL']
        else:
            hugo = 'Unknown'
        ensembl_gene_id = csq['Gene']
        if mg:
            entrez_id = ensembl_gene_id_entrez_id(ensembl_gene_id, mg)
        else:
            entrez_id = ensembl_gene_entrez_local(ensembl_gene_id, ensembl_entrez)
        center = 'NYGenome'
        ncbi_build = VEP_version
        chrom = record.chrom
        raw_type = record.info['TYPE']
        if raw_type == 'DEL':
            start = record.pos + 1  # skip anchor base
        else:
            start = record.pos
        # get end position for 1-based inclusive coordinates
        if raw_type == 'INS':
            end = record.pos + 1
        elif raw_type == 'DEL':
            end = (record.pos + 1) + len(record.ref) - len(alt) - 1  # add one to skip anchor
        else:
            # single-base REF gives end == pos, so SNVs need no own branch
            end = record.pos + len(record.ref) - 1
        strand = '+'
        record = group_mnv(record)
        variant_type = record.info['TYPE']
        if variant_type == 'INS':
            # drop the anchor base from the inserted sequence
            reference_allele = '-'
            Tumor_Seq_Allele1 = '-'
            Tumor_Seq_Allele2 = alt[1:]
        elif variant_type == 'DEL':
            # drop the anchor base from the deleted sequence
            reference_allele = record.ref[1:]
            Tumor_Seq_Allele1 = record.ref[1:]
            Tumor_Seq_Allele2 = '-'
        else:
            reference_allele = record.ref
            Tumor_Seq_Allele1 = record.ref
            Tumor_Seq_Allele2 = alt
        dbSNP_RS = get_dbsnp_rs(csq['Existing_variation'])
        dbSNP_Val_Status = 'bySubmitter'
        Tumor_Sample_Barcode = samples[tumor_index]
        Matched_Norm_Sample_Barcode = samples[normal_index]
        Match_Norm_Seq_Allele1 = ''
        Match_Norm_Seq_Allele2 = ''
        Tumor_Validation_Allele1 = ''
        Tumor_Validation_Allele2 = ''
        Match_Norm_Validation_Allele1 = ''
        Match_Norm_Validation_Allele2 = ''
        Verification_Status = 'Unknown'
        Validation_Status = 'Untested'
        Mutation_Status = 'Somatic'
        Sequencing_Phase = 'Phase_I'
        Sequence_Source = library
        Validation_Method = 'none'
        Score = ''
        BAM_file = ''
        Sequencer = 'Illumina'
        if 'AD' in record.samples[samples[1]].keys() \
                and 'AD' in record.samples[samples[0]].keys():
            # NOTE(review): AD[1] is read for every ALT, i.e. the depth of
            # the first ALT allele - confirm records are single-ALT here.
            tumor_ad = record.samples[samples[tumor_index]]['AD']
            normal_ad = record.samples[samples[normal_index]]['AD']
            t_alt_count = tumor_ad[1]
            t_ref_count = tumor_ad[0]
            n_alt_count = normal_ad[1]
            n_ref_count = normal_ad[0]
        else:
            t_alt_count = ''
            t_ref_count = ''
            n_alt_count = ''
            n_ref_count = ''
        HGVSc = csq['HGVSc']
        HGVSp = csq['HGVSp']
        SYMBOL_SOURCE = csq['SYMBOL_SOURCE']
        SYMBOL = csq['SYMBOL']
        IMPACT = csq['IMPACT']
        return_line = []
        for csq_term in csq['Consequence'].split('&'):
            variant_classification = get_variant_classification(csq_term,
                                                                record.info['TYPE'],
                                                                record.ref,
                                                                alt)
            HGVSp_Short = get_HGVSp_Short(HGVSp, HGVSc, csq_term)
            line = [hugo, entrez_id, center, ncbi_build, chrom, start, end, strand,
                    variant_classification, variant_type, reference_allele,
                    Tumor_Seq_Allele1, Tumor_Seq_Allele2, dbSNP_RS,
                    dbSNP_Val_Status, Tumor_Sample_Barcode,
                    Matched_Norm_Sample_Barcode, Match_Norm_Seq_Allele1,
                    Match_Norm_Seq_Allele2, Tumor_Validation_Allele1,
                    Tumor_Validation_Allele2,
                    Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2,
                    Verification_Status, Validation_Status, Mutation_Status,
                    Sequencing_Phase,
                    Sequence_Source, Validation_Method, Score, BAM_file,
                    Sequencer, t_alt_count, t_ref_count,
                    n_alt_count, n_ref_count, HGVSc, HGVSp, HGVSp_Short, SYMBOL, SYMBOL_SOURCE, IMPACT]
            joined_line = '\t'.join([str(x) for x in line]) + '\n'
            joined_line = joined_line.replace('&', ',')
            return_line.append(joined_line)
        # Drop duplicate lines but keep the order of the consequence terms;
        # a set made the output order differ between runs.
        yield ''.join(dict.fromkeys(return_line))
'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1', 'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2', 'Verification_Status', 'Validation_Status', 'Mutation_Status', 'Sequencing_Phase', 'Sequence_Source', 'Validation_Method', 'Score', 'BAM_file', 'Sequencer', 't_alt_count', 't_ref_count', 'n_alt_count', 'n_ref_count', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'SYMBOL', 'SYMBOL_SOURCE','IMPACT'] + o.write('\t'.join(header) + '\n') + for record in bcf_in.fetch(): + if record.info['HighConfidence']: + for alt_line in make_row(record, csq_columns, bcf_in, library, + normal, tumor, VEP_version=VEP_version, + mg=mg, + ensembl_entrez=data): + o.write(alt_line) + + + +def main(): + ''' + Script to make MAF file from VEP annotated VCF files + ''' + ###################################################################### + ############ Get commandline arguments ############ + ###################################################################### + parser = argparse.ArgumentParser( + description='DESCRIPTION: Takes in a VCF \ + file and returns a MAF. Command-line \ + options that may be omitted (i.e. 
are NOT \ + required) are shown in square brackets.') + # Documentation parameters + # Parameter options + parser.add_argument('-v', '--vcf', + dest='vcf_file', + help='Annotated VCF file') + parser.add_argument('-m', '--maf', + dest='maf', + help='MAF output file') + parser.add_argument('-l', '--library', + dest='library', + help='Sequence library type', + choices=['WGS', 'Exome']) + parser.add_argument('-vep', '--vep-version', + dest='VEP_version', + help='VEP genome version', + choices=['GRCh37', 'GRCh38']) + parser.add_argument('-t', '--tumor', + dest='tumor', + help='Tumor sample name') + parser.add_argument('-n', '--normal', + dest='normal', + help='Normal sample name') + parser.add_argument('-e', '--ensembl-entrez', + dest='ensembl_entrez', + default=False, + help='Map of ensembl ids to entrez ids') + args = parser.parse_args() + assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file + bcf_in = read_vcf(args.vcf_file) + csq_columns = get_csq_columns(bcf_in) + write_file(bcf_in, args.maf, csq_columns, args.library, + args.normal, args.tumor, args.VEP_version, + args.ensembl_entrez) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/make_main_vcf.py b/bin/pta/make_main_vcf.py new file mode 100644 index 00000000..d61d809f --- /dev/null +++ b/bin/pta/make_main_vcf.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +# USAGE: python cancer_gene_census.py cancer_gene_census.csv VCF VCF_OUT +# DESCRIPTION: Annotates files by adding information about the +# Cosmic Genome Census entry for the nearest gene. 
+################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.
# Version: 0.2
# Author: Jennifer M Shelton
##################### /COPYRIGHT ###############################################
################################################################################

import sys
import os
import logging as log
import pysam
import pprint
from collections import OrderedDict
##########################################################################
##############                  Custom functions              ############
##########################################################################

# INFO ids that are always kept in the header of the reduced VCF.
_KEEP_INFO = ('HighConfidence', 'TYPE', 'called_by', 'num_callers',
              'supported_by', 'CSQ', 'CancerGeneCensus')
# FORMAT keys kept in the reduced VCF, in output order.
_KEEP_FORMAT = ('AD', 'DP', 'AF')


def remove_info(bcf_in, csq_columns):
    '''
    Remove every INFO definition from the VCF header unless it is
    whitelisted.

    bcf_in: open pysam.VariantFile; its header is edited in place.
    csq_columns: list of CSQ sub-field names; INFO ids with one of these
        names are kept in addition to the fixed whitelist.
    Returns the same bcf_in object.
    '''
    # Build the whitelist once instead of on every iteration.
    keep = set(_KEEP_INFO).union(csq_columns)
    # Snapshot the ids so the header is not edited while it is iterated.
    for info_id in list(bcf_in.header.info.keys()):
        if info_id not in keep:
            bcf_in.header.info.remove_header(info_id)
    return bcf_in


def remove_format(bcf_in):
    '''
    Remove every FORMAT definition from the VCF header except AD, DP
    and AF.

    bcf_in: open pysam.VariantFile; its header is edited in place.
    Returns the same bcf_in object.
    '''
    for format_id in list(bcf_in.header.formats.keys()):
        if format_id not in _KEEP_FORMAT:
            bcf_in.header.formats.remove_header(format_id)
    return bcf_in


def get_csqs(record, csq_columns):
    '''
    Parse the CSQ annotation of a record into one dict per ALT allele.

    record: pysam record with a CSQ INFO field; entry i of CSQ is read
        for ALT i.
    csq_columns: CSQ sub-field names, in header order.
    Returns {alt_index: {csq_column: value}}.
    Raises AssertionError if an entry does not have exactly one value
    per column.
    '''
    csq_dicts = {}
    for i in range(len(record.alts)):
        try:
            csq_line = record.info['CSQ'][i]
        except UnicodeDecodeError:
            # Names with accents and other unexpected characters (rare):
            # fall back to slicing the CSQ entry out of the raw VCF line.
            info_column = str(record).split('\t')[7]
            csq_line = info_column.split('CSQ=')[1].split(';')[0].split(',')[i]
        csq_values = csq_line.split('|')
        assert len(csq_columns) == len(csq_values), 'failed because lengths do not match'
        csq_dicts[i] = dict(zip(csq_columns, csq_values))
    return csq_dicts


def modify_record(record, csq_columns, good_fields):
    '''
    Build a shortened CSQ value that holds only the requested sub-fields.

    good_fields: CSQ sub-field names to keep, in output order.
    Returns the values joined with "|".
    NOTE: only the annotation of the first ALT allele (index 0) is used.
    '''
    first_alt_csq = get_csqs(record, csq_columns)[0]
    return '|'.join([first_alt_csq[key] for key in good_fields])


def check_build(bcf_in):
    '''
    Decide whether the VCF was annotated against a human genome.

    Reads the first "VEP" header record, splits it into key=value pairs
    and looks at the "assembly" entry. Returns False when the assembly is
    one of the supported non-human genomes ("GRCm38.p6"), True otherwise.
    Raises IndexError if there is no VEP header record and KeyError if it
    has no assembly entry.
    '''
    VEP_line = [metadata.value for metadata in bcf_in.header.records
                if metadata.key == 'VEP'][0]
    vep_info = {entry.split('=')[0]: entry.split('=')[-1]
                for entry in VEP_line.split(' ')}
    # The quotes are part of the value as written in the header.
    return vep_info['assembly'] not in ['"GRCm38.p6"']


class Variant(object):
    '''
    Text-level view of one VCF record, reduced to the whitelisted INFO
    and FORMAT fields and with CSQ replaced by a shortened value.
    '''

    def __init__(self, record, csq_out, human=True):
        '''
        record: object whose str() is a VCF data line with FORMAT and
            sample columns.
        csq_out: replacement value for the CSQ INFO field.
        human: when False, CancerGeneCensus is dropped from INFO.
        '''
        self.record = record
        self.csq_out = csq_out
        self.human = human
        self.line = str(self.record).rstrip()
        self.parts = self.line.split('\t')
        # VCF columns
        self.chrom = self.parts[0]
        self.pos = self.parts[1]
        self.id = self.parts[2]
        self.ref = self.parts[3]
        self.alts = self.parts[4].split(',')
        self.qual = self.parts[5]
        self.filters = self.parts[6].split(';')
        self.info = self.parts[7].split(';')
        self.format = self.parts[8].split(':')
        self.samples = self.parts[9:]
        # modify
        self.good_format = list(_KEEP_FORMAT)
        if self.human:
            self.good_info = ['HighConfidence', 'TYPE', 'called_by', 'num_callers',
                              'supported_by', 'CSQ', 'CancerGeneCensus']
        else:
            self.good_info = ['HighConfidence', 'TYPE', 'called_by', 'num_callers',
                              'supported_by', 'CSQ']
        self.info_dict = self.get_info()
        # Trim every sample column (not just the first two) so all of them
        # agree with the reduced FORMAT. This must run before self.format
        # is reduced because fix_format pairs values with the original keys.
        self.samples = [self.fix_format(sample.split(':'))
                        for sample in self.samples]
        self.format = [key for key in self.good_format if key in self.format]

    def get_info(self):
        '''
        Return an OrderedDict of the whitelisted INFO entries.

        CSQ is replaced by self.csq_out, key=value entries keep their
        value and flag entries (no "=") map to None.
        '''
        info_dict = OrderedDict()
        for item in self.info:
            # Split on the first "=" only so values that themselves
            # contain "=" are not truncated.
            key, has_value, value = item.partition('=')
            if key not in self.good_info:
                continue
            if key == 'CSQ':
                info_dict[key] = self.csq_out
            elif has_value:
                info_dict[key] = value
            else:
                info_dict[key] = None
        return info_dict

    def fix_format(self, sample):
        '''
        Reduce one sample column to the whitelisted FORMAT keys.

        sample: list of the sample's values, in the order of self.format.
        Returns the kept values joined with ":".
        '''
        format_dict = dict(zip(self.format, sample))
        new_format = [format_dict[key] for key in self.good_format
                      if key in format_dict]
        return ':'.join(new_format)

    def write(self):
        '''
        Return the reduced VCF line, or False if FILTER is not PASS.

        When no whitelisted FORMAT key is present, FORMAT and every sample
        column are written as ".". The line is also stored in
        self.new_line.
        '''
        if ':'.join(self.format) == '':
            self.format = ['.']
            self.samples = ['.'] * len(self.samples)
        if ';'.join(self.filters) != 'PASS':
            return False
        info = ';'.join([key if value is None else '='.join([key, value])
                         for key, value in self.info_dict.items()])
        line = [self.chrom,
                self.pos,
                self.id,
                self.ref,
                ','.join(self.alts),
                str(self.qual),
                ';'.join(self.filters),
                info or '.',  # "." is the VCF missing value for an empty INFO
                ':'.join(self.format)]
        line += self.samples
        self.new_line = '\t'.join(line)
        return self.new_line


def read_vcf(vcf_file):
    '''
    Open an annotated VCF/BCF file with pysam.
    '''
    bcf_in = pysam.VariantFile(vcf_file)  # auto-detect input format
    return bcf_in


def write_vcf(bcf_in, vcf_out_file, csq_columns, human=True):
    '''
    Write the reduced VCF: the header with a rewritten CSQ definition,
    followed by every PASS record with trimmed INFO, FORMAT and CSQ.

    bcf_in: open pysam.VariantFile (header already reduced by the caller).
    vcf_out_file: path of the text VCF to create.
    csq_columns: CSQ sub-field names of the input file.
    human: selects the list of CSQ sub-fields to keep.
    '''
    # CSQ to keep
    if human:
        good_fields = ['Gene', 'BIOTYPE', 'CLIN_SIG', 'Consequence',
                       'CosmicCoding', 'CosmicCoding_AA', 'CosmicNonCoding',
                       'Existing_variation', 'GnomadExomes_AF', 'GnomadGenomes_AF',
                       'HGVSc', 'HGVSp', 'IMPACT', 'Polyphen2_HVAR_pred',
                       'FATHMM_pred', 'fathmm-MKL_coding_pred',
                       'SIFT4G_pred', 'SIFT_pred', 'SYMBOL', 'SYMBOL_SOURCE', 'AF_1000G']
    else:
        good_fields = ['Gene', 'BIOTYPE', 'Consequence',
                       'Existing_variation', 'HGVSc', 'HGVSp', 'IMPACT',
                       'SIFT4G_pred', 'SIFT_pred', 'SYMBOL', 'SYMBOL_SOURCE']
    # Import the header after removal of extra metadata
    header = str(bcf_in.header).rstrip()
    csq_format = '|'.join(good_fields)
    # The CSQ definition must list exactly the sub-fields that are written,
    # with the format as the last whitespace-separated token of the
    # description (that is how csq_columns is parsed from the input header).
    csq_header = ('##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence '
                  'annotations from Ensembl VEP. Format: ' + csq_format + '">')
    # Write new header with corrected CSQ and fewer metadata keys overall
    with open(vcf_out_file, 'w') as vcf_out:
        for line in header.split('\n'):
            if 'ID=CSQ' in line:
                line = csq_header
            vcf_out.write(line + '\n')
        for record in bcf_in:
            csq_out = modify_record(record, csq_columns, good_fields)
            line = Variant(record, csq_out, human).write()
            if line:  # write() returns False for non-PASS records
                vcf_out.write(line + '\n')


def main():
    '''
    Reduce metadata in VCF for main VCF output.

    Usage: make_main_vcf.py VCF VCF_OUT
    '''
    vcf_file = sys.argv[1]
    vcf_out_file = sys.argv[2]
    bcf_in = read_vcf(vcf_file)
    human = check_build(bcf_in)
    csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|')  # grab the definitions
    bcf_in = remove_format(bcf_in)
    bcf_in = remove_info(bcf_in, csq_columns)
    write_vcf(bcf_in, vcf_out_file, csq_columns, human=human)


##########################################################################
#####       Execute main unless script is simply imported     ############
#####                for individual functions                 ############
##########################################################################
if __name__ == '__main__':
    main()
a/bin/pta/make_txt.py b/bin/pta/make_txt.py new file mode 100644 index 00000000..8d5b9200 --- /dev/null +++ b/bin/pta/make_txt.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# USAGE: python make_maf.py --vcf VCF --txt TXT -n NORMAL -t TUMOR +# DESCRIPTION: Makes MAF file from VCF file. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.
# Version: 0.2
# Author: Jennifer M Shelton CSQ are Consequence + annotations from Ensembl VEP. + ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. + ''' + alt_count = len(record.alts) + csq_values = [] + csq_dicts = {} + for i in range(alt_count): + try: + csq_line = record.info['CSQ'][i] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[i] + csq_values = csq_line.split('|') + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def check_build(bcf_in): + ''' + Check if genome is in a list of supprted non-human genomes. + ''' + VEP_line = [metadata.value for metadata in bcf_in.header.records if metadata.key == 'VEP'][0] + vep_info = {entry.split('=')[0] : entry.split('=')[-1] for entry in VEP_line.split(' ')} + if vep_info['assembly'] in ['"GRCm38.p6"']: + return False + else: + return True + + +def make_row(record, csq_columns, bcf_in, + normal, tumor, human=True): + ''' + Fill in MAF row + ''' + # ======================= + # Get VEP annotation + # ======================= + csq_dicts = get_csqs(record, csq_columns) + if normal == bcf_in.header.samples[1]: + normal_index = 1 + tumor_index = 0 + elif tumor == bcf_in.header.samples[1]: + normal_index = 0 + tumor_index = 1 + else: + log.error('VCF sample names do not match listed tumor or normal name') + sys.exit(1) + id = record.id + if record.id == None: + id = '.' 
+ for i, alt in enumerate(record.alts): + consequence = csq_dicts[i]['Consequence'] + impact = csq_dicts[i]['IMPACT'] + GENE_SYMBOL = csq_dicts[i]['SYMBOL'] + HGVSc = csq_dicts[i]['HGVSc'] + HGVSp = csq_dicts[i]['HGVSp'] + type = record.info['TYPE'] + if human: + PolyPhen = csq_dicts[i]['Polyphen2_HVAR_pred'] + AF_1000G = csq_dicts[i]['AF_1000G'] + GnomadExomes_AF = csq_dicts[i]['GnomadExomes_AF'] + GnomadGenomes_AF = csq_dicts[i]['GnomadGenomes_AF'] + CosmicCoding = csq_dicts[i]['CosmicCoding'] + CosmicCoding_AA = csq_dicts[i]['CosmicCoding_AA'] + CosmicNonCoding = csq_dicts[i]['CosmicNonCoding'] + fathmm = csq_dicts[i]['FATHMM_pred'] + fathmm_MKL_coding = csq_dicts[i]['fathmm-MKL_coding_pred'] + sift = csq_dicts[i]['SIFT_pred'] + sift_4g = csq_dicts[i]['SIFT4G_pred'] + HighConfidence = record.info['HighConfidence'] + if 'called_by' in record.info: + called_by = ','.join(record.info['called_by']) + else: + called_by = '' + if 'supported_by' in record.info: + supported_by = ','.join(record.info['supported_by']) + else: + supported_by = '' + if 'AD' in record.samples[bcf_in.header.samples[1]].keys() \ + and 'AD' in record.samples[bcf_in.header.samples[0]].keys(): + t_alt_count = record.samples[bcf_in.header.samples[tumor_index]]['AD'][1] + t_ref_count = record.samples[bcf_in.header.samples[tumor_index]]['AD'][0] + n_alt_count = record.samples[bcf_in.header.samples[normal_index]]['AD'][1] + n_ref_count = record.samples[bcf_in.header.samples[normal_index]]['AD'][0] + else: + t_alt_count = '' + t_ref_count = '' + n_alt_count = '' + n_ref_count = '' + if 'AF' in record.samples[bcf_in.header.samples[1]].keys(): + t_VAF = record.samples[bcf_in.header.samples[tumor_index]]['AF'][i] + else: + t_VAF = '' + if human: + line = [record.chrom, record.pos, id, record.ref, alt, + consequence, impact, GENE_SYMBOL, HGVSc, HGVSp, type, + PolyPhen, AF_1000G, GnomadExomes_AF, GnomadGenomes_AF, + CosmicCoding, CosmicCoding_AA, CosmicNonCoding, + n_ref_count, n_alt_count, t_ref_count, 
t_alt_count, + t_VAF, fathmm, fathmm_MKL_coding, sift, sift_4g, + HighConfidence, called_by, supported_by] + else: + line = [record.chrom, record.pos, id, record.ref, alt, + consequence, impact, GENE_SYMBOL, HGVSc, HGVSp, type, + n_ref_count, n_alt_count, t_ref_count, t_alt_count, + t_VAF, sift, sift_4g, HighConfidence, called_by, supported_by] + line = [str(part).replace('&', ',') for part in line] + line = [str(part).replace(';', ',') for part in line] + yield line + + +def write_file(bcf_in, out, csq_columns, + normal, tumor, human=True): + ''' + Write out the header + ''' + with open(out, 'w') as o: + if human: + header = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'Consequence', 'IMPACT', + 'GENE_SYMBOL', 'HGVSc', 'HGVSp', 'TYPE', 'PolyPhen', 'AF_1000G', + 'GnomadExomes_AF', 'GnomadGenomes_AF', 'CosmicCoding', + 'CosmicCoding_AA', 'CosmicNonCoding', + 'n_ref_count', 'n_alt_count', 't_ref_count', 't_alt_count', + 't_VAF', 'FATHMM', 'fathmm_MKL_coding', 'SIFT', 'SIFT4G', 'HighConfidence', + 'called_by', 'supported_by'] + else: + header = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'Consequence', 'IMPACT', + 'GENE_SYMBOL', 'HGVSc', 'HGVSp', 'TYPE', + 'n_ref_count', 'n_alt_count', 't_ref_count', 't_alt_count', + 't_VAF', 'SIFT', 'SIFT4G', 'HighConfidence', + 'called_by', 'supported_by'] + o.write('\t'.join(header) + '\n') + header_len = len(header) + for record in bcf_in.fetch(): + for alt_line in make_row(record, csq_columns, bcf_in, + normal, tumor, human): + assert len(alt_line) == header_len, "columns don't equal header names" + joined_line = '\t'.join([x for x in alt_line]) + '\n' + o.write(joined_line) + + + +def main(): + ''' + Script to make TEXT file from VEP annotated VCF files + ''' + ###################################################################### + ############ Get commandline arguments ############ + ###################################################################### + parser = argparse.ArgumentParser( + description='DESCRIPTION: Takes in a VCF \ + file 
and returns a TEXT file. Command-line \ + options that may be omitted (i.e. are NOT \ + required) are shown in square brackets.') + # Documentation parameters + # Parameter options + parser.add_argument('-v', '--vcf', + dest='vcf_file', + help='Annotated VCF file') + parser.add_argument('--txt', + dest='txt', + help='TEXT output file') + parser.add_argument('-t', '--tumor', + dest='tumor', + help='Tumor sample name') + parser.add_argument('-n', '--normal', + dest='normal', + help='Normal sample name') + args = parser.parse_args() + assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file + bcf_in = read_vcf(args.vcf_file) + human = check_build(bcf_in) + csq_columns = get_csq_columns(bcf_in) + write_file(bcf_in, args.txt, csq_columns, + args.normal, args.tumor, human=human) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/merge-caller-vcfs.r b/bin/pta/merge-caller-vcfs.r new file mode 100644 index 00000000..2232334c --- /dev/null +++ b/bin/pta/merge-caller-vcfs.r @@ -0,0 +1,425 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Merge arbitrary number of VCFs, annotate with simple event type +libs = c('optparse', 'StructuralVariantAnnotation', 'VariantAnnotation', 'rtracklayer', 'stringr') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + + +SUPPORTED_CALLERS = c('manta', 'lumpy', 'svaba', 'gridss') ## Update this flag when adding support for new callers +SVABA_MIN_LENGTH = 1001 
## Svaba-unique calls shorter than this appear to be artifactual + + +## Callers have different names for the same pieces of evidence, +## For now handle each case separately +## TODO: Add support for GRIDSS +getReadSupport = function(vcf, caller, sample_id, supplementary=FALSE, supported_callers=SUPPORTED_CALLERS) { + + ## Don't try to process genotype info if we don't know how + if (!caller %in% supported_callers) { + stop('Caller ', caller, ' is not currently supported. Supported callers: ', paste(supported_callers, collapse=',')) + } + + ## It's a possibility that the sample names in the VCF will be + ## the full path to the BAM used instead of just the sample ID + ## Just grab the index of the correct column + if (!sample_id %in% colnames(geno(vcf)[[1]])) { + sample_id = which(gsub('\\.final\\.bam$','',basename(colnames(geno(vcf)[[1]]))) %in% sample_id) + } + + + if (caller == 'manta') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + sr = sapply(sr, `[`, 2) + pe = geno(vcf)$PR[, sample_id] + pe = sapply(pe, `[`, 2) + + ## Supplementary info + supp_string = paste0(caller,'_SOMATICSCORE=', info(vcf)$SOMATICSCORE) + + } else if (caller == 'svaba') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + pe = geno(vcf)$DR[, sample_id] + + ## Supplementary info + ad = paste0(caller,'_AD=', geno(vcf)$AD[, sample_id]) + dp = paste0(caller,'_DP=', geno(vcf)$DP[, sample_id]) + lo = paste0(caller,'_LO=', geno(vcf)$LO[, sample_id]) + gt = paste0(caller,'_GT=', geno(vcf)$GT[, sample_id]) + supp_string = paste(ad, dp, lo, gt, sep=',') + + } else if (caller == 'lumpy') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + sr = unlist(sr) + pe = geno(vcf)$PE[, sample_id] + pe = unlist(pe) + + ## Supplementary info + ro = paste0(caller,'_RO=', geno(vcf)$RO[, sample_id]) + ao = paste0(caller,'_AO=', geno(vcf)$AO[, sample_id]) + dp = paste0(caller,'_DP=', geno(vcf)$DP[, sample_id]) + gt = paste0(caller,'_GT=', geno(vcf)$GT[, sample_id]) + supp_string = paste(ro, ao, dp, 
gt, sep=',') + + + } else if (caller == 'gridss') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + pe = geno(vcf)$RP[, sample_id] + + ## Supplementary info + vf = paste0(caller,'_VF=', geno(vcf)$VF[, sample_id]) + asq = paste0(caller,'_ASQ=', geno(vcf)$ASQ[, sample_id]) + qual = paste0(caller,'_QUAL=', geno(vcf)$QUAL[, sample_id]) + supp_string = paste(vf, asq, qual, sep=',') + + } + + ## Set NA to 0 + ## TODO: Keep this? + sr[is.na(sr)] = 0 + pe[is.na(pe)] = 0 + + ## Build output string + if (supplementary) { + res = paste0('[',caller,'_SR=',sr,',', caller,'_PE=', pe,',', supp_string,']') + } else { + res = paste0('[',caller,'_SR=',sr,',', caller,'_PE=', pe,']') + } + + return(res) + +} + + + +sumSupport = function(x) { + sapply(str_extract_all(x, '(?<=\\=)[0-9]+(?=,|\\])'), function(y) sum(as.numeric(y))) +} + + + +removeRedundantBreakpoints = function(x) { + + ## Find duplicates + key = unlist(strsplit(x$breakendPosID,',')) + key.count = table(key) + key.dup = key.count[key.count > 1] + + + ## If there aren't duplicates we don't have anything to do + if (length(key.dup) == 0) { + return(x) + } + + + ## For each set of duplicate breakends, select the one with the higher score + x.idx.rm = c() + for (i in names(key.dup)) { + + ## Subset to breakends of interest + x.idx = grep(i, x$breakendPosID, fixed=T) + xi = x[x.idx] + + ## Collect support + xi$read.support = sumSupport(xi$support) + xi$multicaller.support = grepl('],[',xi$support,fixed=T) + + + + ## Automatically keep breakends with multi-caller support + idx.multi = which(xi$multicaller.support) + if (length(idx.multi) > 0) { + x.idx.rm = c(x.idx.rm, x.idx[-idx.multi]) + next + } + + + ## Automatically discard breakends with the lowest support + idx.max = which(xi$read.support %in% max(xi$read.support)) + if (length(idx.max) > 0) { + x.idx.rm = c(x.idx.rm, x.idx[-idx.max]) + x.idx = x.idx[idx.max] + xi = xi[idx.max] + } + + + ## If there are multiple breakends tied for highest read support + if 
(length(xi) > 1) { + + if (all(!is.na(xi$svLen))) { + + ## If all are non-TRA take longest SV + x.idx.rm = c(x.idx.rm, x.idx[-which.max(abs(xi$svLen))]) + + } else if (all(is.na(xi$svLen)) && length(unique(as.character(seqnames(xi)))) == 1) { + + ## If all TRA to the same chr select rightmost coordinate + partners = x[names(x) %in% xi$partner] + partner.keep = names(partners)[which.max(start(partners))] + x.idx.rm = c(x.idx.rm, x.idx[!xi$partner %in% partner.keep]) + + } + + ## Otherwise, just keep tied SVs + + } + + } + + + ## Remove breakends and their partners if we have any to remove + if (length(x.idx.rm) > 0) { + x = x[-x.idx.rm] + x = x[names(x) %in% x$partner] + } + + return(x) + +} + + + +## Compute error between query and subject for a hits object +computeError = function(query, subject, hits) { + + ## Init result dataframe + error = data.frame(local=rep(NA, length(queryHits(hits))), + remote=rep(NA, length(queryHits(hits)))) + + + ## For each hit + for (i in 1:length(queryHits(hits))) { + + ## Compute local error (error between breakends at hit i) and remote error (error between the partners of + ## the breakends at hit i) + error$local[i] = StructuralVariantAnnotation:::.distance(query[queryHits(hits)[i]], subject[subjectHits(hits)[i]])$min + error$remote[i] = StructuralVariantAnnotation:::.distance(query[names(query) == query[queryHits(hits)[i]]$partner], + subject[names(subject) == subject[subjectHits(hits)[i]]$partner] + )$min + } + + return(error) + +} + + + +## Take the union of callsets a and b, both breakpointRanges objects +## If multiple hits found in b for a, choose the closest match, measured +## as the mean distance between breakends +mergeCallsets = function(a, b, slop) { + + ## Find overlaps + overlaps = StructuralVariantAnnotation::findBreakpointOverlaps(query=a, + subject=b, + maxgap=slop, + sizemargin=0.8, + restrictMarginToSizeMultiple=0.8) + + + + ## If we have any duplicate query hits, choose hit based on match quality + 
if(anyDuplicated(queryHits(overlaps))) { + + ## Compute local and remote breakend basepair error on matches + error = computeError(query=a, subject=b, hits=overlaps) + + ## Get duplicate hits + dup.query.hits = table(queryHits(overlaps)) + dup.query.hits = names(dup.query.hits[dup.query.hits > 1]) + + ## Determine which hits we're removing + idx.hits.rm = c() + for (d in dup.query.hits) { + + idx.dup.query.hits = which(queryHits(overlaps) %in% d) + + local.error = error$local[idx.dup.query.hits] + remote.error = error$remote[idx.dup.query.hits] + mean.error = rowMeans(cbind(local.error, remote.error)) + + ## Keep the query hit with the smallest mean error + idx.hits.rm = c(idx.hits.rm, idx.dup.query.hits[which.max(mean.error)]) + + } + + overlaps = overlaps[-idx.hits.rm] + + } + + ## For matching SVs, merge caller support + a$support[queryHits(overlaps)] = paste0(a$support[queryHits(overlaps)],',',b$support[subjectHits(overlaps)]) + a$breakendPosID[queryHits(overlaps)] = paste0(a$breakendPosID[queryHits(overlaps)],',',b$breakendPosID[subjectHits(overlaps)]) + + ## Pull in non-matching SVs from b + res = c(a, b[-subjectHits(overlaps)]) + + return(res) + +} + + + +## Convert breakpointRanges to BEDPE +vcfToBedpe = function(vcf, supplemental=F) { + + sqn = as.character(seqnames(vcf)) + strand = as.character(strand(vcf)) + res = c() + processed = c() + + for (i in 1:length(vcf)) { + bnd = names(vcf)[i] + partner = vcf$partner[i] + partner.idx = which(names(vcf) == partner) + + ## If we don't have exactly one partner, exclude this variant + if (length(partner.idx) != 1) { + warning('Missing partner for breakend ', bnd) + next + } + + ## Check to see if we've alrady processed this or it's partner + if (any(c(bnd, partner) %in% processed)) { + next + } + + ## Which support column should we use? 
+ if (supplemental) { + support = vcf$supplemental[i] + } else { + support = vcf$support[i] + } + + + ## Combine breakends in single line + res.i = c(sqn[i], start(vcf)[i], end(vcf)[i], ## chr1, start1, end1 + sqn[partner.idx], start(vcf)[partner.idx], end(vcf)[partner.idx], ## chr2, start2, end 2 + 'BND', '.', strand[i], strand[partner.idx], support) ## type, score, strand1, strand2, support + + ## Add to result, keep track of processed breakends + res = rbind(res, res.i) + processed = c(processed, bnd, partner) + } + + + ## Add colnames and fill in simple event classifications + colnames(res) = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type', 'score', 'strand1', 'strand2', 'evidence') + res = as.data.frame(res, stringsAsFactors=F) + + res$type[res$strand1 == '+' & res$strand2 == '-'] = 'DEL' + res$type[res$strand1 == '-' & res$strand2 == '+'] = 'DUP' + res$type[res$strand1 == '-' & res$strand2 == '-'] = 'INV' + res$type[res$strand1 == '+' & res$strand2 == '+'] = 'INV' + res$type[res$chr1 != res$chr2] = 'TRA' + + ## Sort by chromosome + res = res[order(factor(res$chr1, levels=levels(seqnames(vcf))), res$start1, res$end1, decreasing=F), ] + + ## Simplify coordinates + res$end1 = as.numeric(res$start1) + 1 + res$end2 = as.numeric(res$start2) + 1 + + + ## Extract tool info from read support column + res$tools = sapply(res$evidence, function(x) paste(unlist(stringr::str_extract_all(x, '(?<=\\[)[a-z]+(?=_)')), collapse=',')) + + colnames(res)[1] = paste0('#', colnames(res)[1]) + + return(res) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-v", "--vcf"), type='character', help="Comma-delimited list of breakend notation VCFs"), + make_option(c("-c", "--callers"), type='character', help="Comma-delimited list of SV caller names corresponding to the order of VCFs given in --vcf"), + make_option(c("-t", "--tumor"), type='character', help="Tumor sample ID"), + make_option(c("-n", "--normal"), type='character', help="Normal sample ID"), + 
make_option(c("-b", "--build"), type='character', help="Genome build"), + make_option(c("-s", "--slop"), type='numeric', help="Padding to use when comparing breakpoints"), + make_option(c("-l", "--min_sv_length"), type='numeric', help="Filter SVs shorter than this length"), + make_option(c("-a", "--allowed_chr"), type='character', help="Comma-delimited list of chromosomes to keep"), + make_option(c("-o", "--out_file"), type='character', help="Output BEDPE"), + make_option(c("-p", "--out_file_supplemental"), type='character', help="Output supplemental BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + + + +## Unpack arguments +opt$vcf = unlist(strsplit(opt$vcf, ',', fixed=T)) +opt$callers = unlist(strsplit(opt$callers, ',', fixed=T)) +opt$allowed_chr = unlist(strsplit(opt$allowed_chr, ',', fixed=T)) + + + +## Iteratively merge VCFs +res = NULL +for (i in 1:length(opt$vcf)) { + + ## Read VCF + caller = opt$caller[i] + vcf = VariantAnnotation::readVcf(opt$vcf[i], genome=opt$build) + + ## Get read support + rowRanges(vcf)$support = getReadSupport(vcf=vcf, caller=caller, sample_id=opt$tumor) + rowRanges(vcf)$supplemental = getReadSupport(vcf=vcf, caller=caller, sample_id=opt$tumor, supplementary=T ) + + ## Convert to breakpointRanges object, don't adjust for CIPOS uncertainty (i.e. 
keep nominalPosition) + vcf = StructuralVariantAnnotation::breakpointRanges(vcf, nominalPosition=T) + + ## Add breakendPosID for later redundancy checks + vcf$breakendPosID = paste0('[',caller,'=',as.character(seqnames(vcf)),':',start(vcf),':',strand(vcf),']') + + ## Overlap if this isn't the first callset + if (i == 1) { + res = vcf + } else { + res = mergeCallsets(a=res, b=vcf, slop=opt$slop) + } + +} + + + +## Handle breakpoints with duplicate start or end positions +res = removeRedundantBreakpoints(res) + + + +## Convert to bedpe, apply some filters +for (i in c('main','supplemental')) { + + outfile = ifelse(i=='main', opt$out_file, opt$out_file_supplemental) + + ## Convert to BEDPE format + res.i = vcfToBedpe(res, supplemental=i=='supplemental') + res.i$`tumor--normal` = paste0(opt$tumor,'--',opt$normal) + + ## Filter non-TRA variants for minimum length opt$min_sv_length + sv.lengths = abs(as.numeric(res.i$start2) - as.numeric(res.i$start1)) + res.i = res.i[res.i$type == 'TRA' | sv.lengths >= opt$min_sv_length, ] + + ## Filter non-TRA svaba-unique variants less than SVABA_MIN_LENGTH + sv.lengths = abs(as.numeric(res.i$start2) - as.numeric(res.i$start1)) + res.i = res.i[(res.i$tools != 'svaba' | res.i$type == 'TRA') | (res.i$tools == 'svaba' & sv.lengths >= SVABA_MIN_LENGTH), ] + + ## Filter SVs not occurring in allowed chromosomes (i.e. 
autosomes and sex chromosomes) + res.i = res.i[res.i$`#chr1` %in% opt$allowed_chr & res.i$chr2 %in% opt$allowed_chr, ] + + ## Write result + write.table(res.i, outfile, row.names=F, col.names=T, sep='\t', quote=F) + +} diff --git a/bin/pta/merge_columns.py b/bin/pta/merge_columns.py new file mode 100644 index 00000000..76b77b25 --- /dev/null +++ b/bin/pta/merge_columns.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python +# USAGE: python merge_columns.py +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.
# Version: 0.1
# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton Assumes id is + _. + ''' + + def __init__(self, samples, tumor, normal): + self.samples = samples + self.normal = normal + self.tumor = tumor + self.get_lists() + + + def get_lists(self): + ''' + Make list of tumor-only and normal-only sample names + NOTE: N/T order is reversed by bcftools merge. + + MWL NOTE: This does not appear to be true! Sanity check is needed with real data. + + ''' + print(self.get_originals(self.samples[0])) + if self.get_originals(self.samples[0]) == self.normal: + self.tumor_samples = self.samples[1::2] + self.normal_samples = self.samples[::2] + elif self.get_originals(self.samples[0]) == self.tumor: + self.tumor_samples = self.samples[::2] + self.normal_samples = self.samples[1::2] + self.pairs = [sample_name for sample_name in zip(self.normal_samples, self.tumor_samples)] + + + def get_originals(self, sample): + ''' + get original sample names. NOTE: N/T order is + reversed by bcftools merge if alpha order is reversed. + ''' + return '_'.join(sample.split('_')[1:]).replace('indel_', '').replace('support_','').replace('sv_','') + # // MWL NOTE: replace statements added to remove additions to sample names that were made to clarify what tools calls originated from. + # i.e., lancet_support_, strelka2_sv_, strelka2_indel_. + +class Variant(Naming): + ''' + Import a pysam record. and write out from record. + The class allows editing of fixed elements like + the min number of samples in the VCF. 
+ ''' + + def __init__(self, record, samples, tumor, normal): + self.record = record + self.line = str(record) + self.samples = samples + self.tumor = tumor + self.normal = normal + Naming.__init__(self, self.samples, self.tumor, self.normal) + self.parts = self.line.split('\t') + # GT fields found + self.gt_tools = set() + # VCF columns + self.chrom = self.parts[0] + self.pos = self.parts[1] + self.id = self.parts[2] + self.ref = self.parts[3] + self.alts = self.parts[4] + self.qual = self.parts[5] + self.filters = self.parts[6] + self.info = self.parts[7] + self.format = ':'.join([key for key in self.get_uniq_keys()]) + self.find_tools() + + + def replace_empty(self, value, key, sep=','): + ''' + Replace an empty field with ".". In pysam this will be + a tuple with None as the only value. + ''' + if isinstance(value, tuple): + if len(value) == 1 and value[0] == None: + return '.' + if len(value) > 1 and all(map(lambda x: x != None, value)) and key == 'GT': + return '/'.join(['.' if x == None else str(x) for x in value]) + elif value == None: + return '.' + if key == 'GT': + return '/'.join(['.' if x == None else str(x) for x in value]) + if (isinstance(value, collections.Iterable) and not isinstance(value, str)) or \ + isinstance(value, tuple): + joined = [] + for i in value: + if i == None: + joined.append('') + else: + joined.append(str(i)) + return sep.join(joined) + return str(value) + + + def not_empty(self, value, key): + ''' + Test if a format field is not empty. In pysam this will be + a tuple with None as the only value. + ''' + if isinstance(value, tuple): + if len(value) == 1 and value[0] == None: + return False + if len(value) > 1 and all(map(lambda x: x == None, value)) and key == 'GT': + return False + if value == None: + return False + if value == '.': + return False + return True + + + def deuniqify_gt(self, key, sample_name): + ''' + Remove tool prefix from key (only if it was added for merge). 
+ ''' + format_keys = self.record.samples[sample_name].keys() + if key not in format_keys and \ + key.endswith('GT') and \ + 'GT' in format_keys: + return 'GT' + return key + + + def find_tools(self): + ''' + Return list of samples with keys. + ''' + self.final_normal_samples = [] + self.final_tumor_samples = [] + for pair in self.pairs: + found_keys = [] + for sample_name in pair: + found_keys += [key for key in self.uniq_keys if self.not_empty(self.record.samples[sample_name][self.deuniqify_gt(key, sample_name)], self.deuniqify_gt(key, sample_name))] + if len(found_keys) > 0: + self.final_normal_samples.append(pair[0]) + self.final_tumor_samples.append(pair[1]) + + + def uniqify_gt(self, key, sample_name): + ''' + Add tool prefix to key. + ''' + if key == 'GT': + tool = sample_name.split('_')[0] + tool = tool.split(':')[-1] + self.gt_tools.update(set([tool])) + return tool + '_' + key + return key + + + def find_keys(self, record, sample_name): + ''' + Return list of keys with values for FORMAT + ''' + format_keys = record.samples[sample_name].keys() + found_keys = [self.uniqify_gt(key, sample_name) for key in format_keys if self.not_empty(record.samples[sample_name][key], key)] + return found_keys + + + + def get_uniq_keys(self): + ''' + Return a key/value pairs for any key with a value for on tool's results. + ''' + seen = set() + seen_add = seen.add + found_keys = reduce(list.__add__, + [self.find_keys(self.record, sample_name) for sample_name in self.samples], + []) + self.uniq_keys = [key for key in found_keys if not (key in seen or seen_add(key))] + return self.uniq_keys + + + def reduce_samples(self, samples): + ''' + Find all keys with a value for any sample. Return '.' for missing values. 
+ ''' + uniq_keys = self.get_uniq_keys() + sample_result = [] + for sample_name in samples: + tool = sample_name.split('_')[0] + tool = tool.split(':')[-1] + sample_result.append(':'.join([self.replace_empty(self.record.samples[sample_name][self.deuniqify_gt(key, sample_name)], self.deuniqify_gt(key, sample_name)) for key in uniq_keys if tool == key.split('_')[0]])) + return ':'.join(sample_result) + + + def write(self): + ''' + Return a reformatted string from a pysam object. + ''' + line = [self.chrom, + self.pos, + self.id, + self.ref, + self.alts, + self.qual, + self.filters, + self.info, + self.format] + line += [self.reduce_samples(self.final_normal_samples)] + line += [self.reduce_samples(self.final_tumor_samples)] + self.new_line = '\t'.join(line) + '\n' + return self.new_line + + +def modify_header(bcf_in, tool): + ''' + Add new FORMAT field + ''' + bcf_in.header.formats.add(id=tool + '_GT', number='1', + type='String', + description='Genotype from ' + tool) + return bcf_in + + +def load_header(bcf_in): + ''' + Load a VCF file header as a list of lines. + ''' + header = '\n'.join(str(bcf_in.header).split('\n')[:-2]) + '\n' + return header + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. 
+ ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def vcf_writer(bcf_in, vcf_out_file, tumor, normal): + ''' + Write out the VCF file with corrected information + ''' + with open(vcf_out_file, 'w') as vcf_out: + samples = [sample_name for sample_name in bcf_in.header.samples] + names = Naming(samples, tumor, normal) + lines = [] + gt_tools = set() + for record in bcf_in.fetch(): + out = Variant(record, samples, tumor, normal) + gt_tools.update(out.gt_tools) + lines.append(out.write()) + # ========================== + # Add GT + # ========================== + for tool in gt_tools: + bcf_in = modify_header(bcf_in, tool) + header = load_header(bcf_in) + # ========================== + # Write header + # ========================== + for line in header: + vcf_out.write(line) + vcf_out.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', + 'QUAL', 'FILTER', 'INFO', 'FORMAT', + names.normal, names.tumor]) + '\n') + # ========================== + # Write variants + # ========================== + for line in lines: + vcf_out.write(line) + + +def main(): + ''' + Merge the VCF columns by: + 1) Getting all INFO fields with values + 2) Getting all FORMAT fields from any sample with non-empty values + 3) Uniqify GT now that bcftools merge is done + ''' + # ========================== + # Input variables + # ========================== + vcf_in = sys.argv[1] + vcf_out_file = sys.argv[2] + normal = sys.argv[3] + tumor = sys.argv[4] + + # NOTE: Order was changed to be consistent with prior scripts. 
+ + assert os.path.isfile(vcf_in), 'Failed to find caller VCF call file :' + vcf_in + # ========================== + # Run prep + # ========================== + bcf_in = read_vcf(vcf_in) + vcf_writer(bcf_in, vcf_out_file, tumor, normal) + + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/merge_prep.py b/bin/pta/merge_prep.py new file mode 100644 index 00000000..5f2ebd07 --- /dev/null +++ b/bin/pta/merge_prep.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python +# USAGE: python merge_prep.py +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +import pandas as pd +import re +import argparse +########################################################################## +############## Custom functions ############ +########################################################################## +base_pattern = re.compile(r'^[ACGTN]+$') + + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def add_info_header(bcf_out, + id, + number, + type, + description): + ''' + Add new INFO field + ''' + bcf_out.header.info.add(id=id, + number=number, + type=type, + description=description) + return bcf_out + + +def add_filter_header(bcf_out, + id, + description): + ''' + Add new FILTER field + ''' + bcf_out.header.filters.add(id=id, + number=None, + type=None, + description=description) + return bcf_out + + +def remove_filters_header(bcf_out): + ''' + Remove all filters except PASS + ''' + for id in bcf_out.header.filters.keys(): + if not id in ['PASS','SUPPORT']: + bcf_out.header.filters.remove_header(id) + return bcf_out + + +def pass_alleles(record, base_pattern=base_pattern): + ''' + Pass lines that have no special characters in REF/ALT + ''' + passed = True + alleles = list(record.alts) + [record.ref] + for allele in alleles: + if not re.match(base_pattern, allele): + passed = False + return passed + + +def prep_record(record, tool, passing, support): + ''' + Pass lines that have no special characters in ALT/REF + ''' + record.id = None + record.qual = None + if passing: + if 
support: + if tool == 'manta': + tool_supported_by = tool + 'SV' + else: + tool_supported_by = tool + record.info['supported_by'] = (tool_supported_by,) + else: + record.info['called_by'] = (tool,) + record.info['num_callers'] = 1 + return record + + +def write_file(bcf_in, bcf_out, tool, filter=True, support=False): + ''' + Filter based on FILTER column, + also filter lines with special characters in ALT/REF + ''' + passing = False + for record in bcf_in.fetch(): + filters = record.filter.keys() + # ==================== + # Passing variants + # ==================== + if len(filters) == 1 and \ + filters[0] == 'PASS': + passing = True + write = True + if support: + record.filter.clear() + record.filter.add('SUPPORT') + # ==================== + # Failing variants + # ==================== + else: + if filter: + write = False + else: + write = True + if write and not pass_alleles(record): + write = False + if write: + record = prep_record(record, tool, passing, support) + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + return True + + +def main(): + ''' + Prepare the VCF file for merging by: + 1) 'TYPE' is added to header + 2) 'called_by' is added to header + 3) 'num_callers' is added to header + 4) filter lines with special characters in REF/ALT (e.g. in manta) + 5) fill in 'called_by', 'num_callers' + 6) 'SUPPORT' FILTER line is added + 7) non 'PASS'/'SUPPORT' FILTER lines are removed (if not skip-filter) + ''' + # ========================== + # Input variables + # ========================== + parser = argparse.ArgumentParser( + description='DESCRIPTION: Takes in a VCF \ + file and preps the file by: \ + 1) "TYPE" is added to header \ + 2) "called_by" is added to header \ + 3) "num_callers" is added to header \ + 4) filter lines with special characters in REF/ALT (e.g. 
in manta) \ + 5) fill in "called_by", "num_callers" \ + 6) "SUPPORT" FILTER line is added \ + 7) non "PASS"/"SUPPORT" FILTER lines are removed (if not skip-filter) \ + . Command-line \ + options that may be omitted (i.e. are NOT \ + required) are shown in square brackets.') + # Documentation parameters + parser.add_argument('-v', '--vcf', + dest='vcf_file', + help='VCF file', + required=True) + parser.add_argument('-o', '--out', + dest='out', + help='Output VCF file', + required=True) + parser.add_argument('-t', '--tool', + dest='tool', + choices=['strelka2_sv', + 'strelka2_indel', + 'mutect2', + 'svaba', + 'lancet', + 'manta'], + help='Tool name', + required=True) + parser.add_argument('-s', '--support', + dest='support', + help='Use if calls are only support calls', + action='store_true') + parser.add_argument('-f', '--skip-filter', + dest='skip_filter', + help='Remove calls that are not PASS or SUPPORT', + action='store_true') + args = parser.parse_args() + filter = True + if args.skip_filter: + filter = False + assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file + # ========================== + # Run prep + # ========================== + bcf_in = read_vcf(args.vcf_file) + bcf_in = add_info_header(bcf_out=bcf_in, + id='called_by', + number='.', + type='String', + description='Name of the variant caller(s) that the variant was called by') + bcf_in = add_info_header(bcf_out=bcf_in, + id='num_callers', + number='1', + type='Integer', + description='Number of callers') + bcf_in = add_info_header(bcf_out=bcf_in, + id='supported_by', + number='.', + type='String', + description='Name of the tool(s) apart from the main variant callers in the pipeline that support the variant') + bcf_in = add_filter_header(bcf_out=bcf_in, + id='SUPPORT', + description='Variant from Validation caller') + bcf_out = pysam.VariantFile(args.out, 'w', header=bcf_in.header) + if filter: + bcf_out = remove_filters_header(bcf_out) + write_file(bcf_in, + 
bcf_out, + tool=args.tool, + filter=filter, + support=args.support) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/remove_contig.py b/bin/pta/remove_contig.py new file mode 100644 index 00000000..e87f12f6 --- /dev/null +++ b/bin/pta/remove_contig.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# USAGE: remove_contig.py VCF_IN VCF_OUT +# DESCRIPTION: Print a VCF file skipping contig descriptions (for use on bad contig descriptions) +# Version 1.0 +import sys +import shutil + + +def remove_contig(vcf): + ''' + Skips line if starts with ##contig= to remove Lumpy + VCF line with contig name but witout required length value + ''' + for line in vcf: + if not line.startswith('##contig='): + yield line + + +def vcf_writer(vcf_file, vcf_out_file): + ''' + Write out the VCF file with corrected information + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # write non-contig lines + # ===================== + with open(vcf_out_file, 'w') as vcf_out: + with open(vcf_file) as vcf: + for line in remove_contig(vcf): + vcf_out.write(line) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + + + +# ===================== +# Main +# ===================== +if __name__ == "__main__": + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + vcf_writer(vcf_file, vcf_out_file) \ No newline at end of file diff --git a/bin/pta/rename_csq_vcf.py b/bin/pta/rename_csq_vcf.py new file mode 100644 index 00000000..e458ab5a --- /dev/null +++ 
b/bin/pta/rename_csq_vcf.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python +# USAGE: python cancer_gene_census.py cancer_gene_census.csv VCF VCF_OUT +# DESCRIPTION: Annotates files by adding information about the +# Cosmic Genome Census entry for the nearest gene. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + + +import sys +import os +import logging as log +import pysam +import pprint +from collections import OrderedDict +########################################################################## +############## Custom functions ############ +########################################################################## + + +def add_info_header(bcf_out, + id, + number, + type, + description): + ''' + Add new INFO field + ''' + bcf_out.header.info.add(id=id, + number=number, + type=type, + description=description) + return bcf_out + + +def af_1000g(bcf_out): + ''' + Add description for VEP annotation. + ''' + for af in ['AF_1000G', 'AFR_AF_1000G', 'AMR_AF_1000G', + 'EAS_AF_1000G', 'EUR_AF_1000G', 'SAS_AF_1000G']: + description = af + ' field from phase3 1000genomes' + bcf_out = add_info_header(bcf_out, + id=af, + number='.', + type='String', + description=description) + return bcf_out + + +def get_good_fields(csq_columns, suffix='_1000G'): + ''' + rename CSQ fields as needed. 
+ ''' + change = ['AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF'] + good = [] + for key in csq_columns: + if key in change: + key += suffix + good.append(key) + return good + +class Variant(object): + + + def __init__(self, record): + self.record = record + self.line = str(self.record).rstrip() + self.parts = self.line.split('\t') + # VCF columns + self.chrom = self.parts[0] + self.pos = self.parts[1] + self.id = self.parts[2] + self.ref = self.parts[3] + self.alts = self.parts[4].split(',') + self.qual = self.parts[5] + self.filters = self.parts[6].split(';') + self.info = self.parts[7].split(';') + self.format = self.parts[8].split(':') + self.samples = self.parts[9:] + # modify + self.info_dict = self.get_info() + + + def get_info(self): + ''' + Get current info line and add prefix if needed + ''' + info_dict = OrderedDict() + for item in self.info: + if '=' in item: + if '=' in item: + info_dict.update({item.split('=')[0] : '='.join(item.split('=')[1:])}) + else: + info_dict.update({item : None }) + return info_dict + + + + def write(self): + line = [self.chrom, + self.pos, + self.id, + self.ref, + ','.join(self.alts), + str(self.qual), + ';'.join(self.filters), + ';'.join(['='.join([x for x in [key, self.info_dict[key]] if x != None]) for key in self.info_dict]), + ':'.join(self.format)] + line += self.samples + self.new_line = '\t'.join(line) + return self.new_line + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. 
+ ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def write_vcf(bcf_in, vcf_out_file, csq_columns): + ''' + Write out the download + ''' + # CSQ to keep + good_fields = get_good_fields(csq_columns) + # Import the header after removal of extra metadata + header = str(bcf_in.header).rstrip() + csq_format = '|'.join(good_fields) + # Write new header with corrected CSQ and fewer metadata keys overall + with open(vcf_out_file, 'w') as vcf_out: + for line in header.split('\n'): + if 'ID=CSQ' in line: + line = '##INFO=' + vcf_out.write(line + '\n') + for record in bcf_in: + line = Variant(record).write() + if line: + vcf_out.write(line + '\n') + + +def main(): + ''' + Reduce metadata in VCF for main VCF output + ''' + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + bcf_in = read_vcf(vcf_file) + bcf_in = af_1000g(bcf_out=bcf_in) + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + write_vcf(bcf_in, vcf_out_file, csq_columns) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/rename_metadata.py b/bin/pta/rename_metadata.py new file mode 100644 index 00000000..d4cf28f1 --- /dev/null +++ b/bin/pta/rename_metadata.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +# USAGE: rename_metadata.py VCF_IN VCF_OUT PREFIX +# DESCRIPTION: Takes in a VCF and a tool name +# and preps the file by: +# 1) add tool + "_" to all INFO and FORMAT def lines (unless the key is "GT") + +import sys +import shutil +import logging as log +from collections import OrderedDict +import re +import os + + +class Variant(object): + + + def __init__(self, line, prefix=''): + self.line = line + self.prefix = prefix + self.parts = line.split('\t') + # 
VCF columns + self.chrom = self.parts[0] + self.pos = self.parts[1] + self.id = self.parts[2] + self.ref = self.parts[3] + self.alts = self.parts[4].split(',') + self.qual = self.parts[5] + self.filters = self.parts[6].split(';') + self.info = self.parts[7].split(';') + self.format = self.parts[8].split(':') + self.samples = self.parts[9:] + # modify + self.info_dict = self.get_info() + self.fix_format() + + + def get_info(self): + ''' + Get current info line and add prefix if needed + ''' + info_dict = OrderedDict() + for item in self.info: + if '=' in item: + info_dict.update({self.prefix + item.split('=')[0] : item.split('=')[1]}) + else: + info_dict.update({self.prefix + item : None}) + return info_dict + + + def add_prefix(self, key): + ''' + Add prefix unless variant is GT. + ''' + if key != 'GT': + return self.prefix + key + return key + + + def fix_format(self): + ''' + Add any prefix to format entries. Skips GT becuase + in at least one arrangement (the GT being first without the + full name GT bcftools combines other GT feilds (e.g. Mutect2's + PGT into the first position and writes bad VCF files. + This: + "0/0:37,1:0.098:10,0:27,1:12:224,349:25:2:0|1:16759500_G_C:.:." + Becomes (with non-Non-ASCII characters changed to *): + "0/00/1!%I9**=m**=! + :37,1:0.098:10,0:27,1:12:224,349:25:2:0|1:16759500_G_C:.:." + ''' + new_format = [self.add_prefix(key) for key in self.format] + self.format = new_format + + + def write(self): + line = [self.chrom, + self.pos, + self.id, + self.ref, + ','.join(self.alts), + str(self.qual), + ';'.join(self.filters), + ';'.join(['='.join([x for x in [key, self.info_dict[key]] if x != None]) for key in self.info_dict]), + ':'.join(self.format)] + line += self.samples + self.new_line = '\t'.join(line) + return self.new_line + + +def fix_header(line, prefix): + ''' + Add prefix as needed. Replace AD with standard AD line because + GATK also makes this replacement and it conflicts with Lancet similar but unique + wording. 
+ ''' + if '##FORMAT=' in line or \ + '##INFO=' in line: + id = re.search('ID=(?P[^>,]+)', line) + if id == None: + log.error('FORMAT or INFO line missing ID field: ' + line) + sys.exit() + if id.group(1) == 'AD' and '##FORMAT=' in line: + line = '##FORMAT=\n' + if id.group(1) != 'GT': + line = line.replace('ID=' + id.group(1), + 'ID=' + prefix + id.group(1)) + return line + + +def load_header(vcf_in, prefix): + ''' + Load a VCF file header as a list of lines. + ''' + with open(vcf_in) as vcf: + header = [fix_header(line, prefix) for line in vcf if line.startswith('#')] + return header + + +def load_vcf(vcf_in, prefix): + ''' + Load a VCF file and fixes lines. + ''' + with open(vcf_in) as vcf: + for line in vcf: + if not line.startswith('#'): + yield Variant(line, prefix).write() + + + + +def rename_metadata(vcf_file, vcf_out_file, prefix): + ''' + Add prefix to FORMAT and INFO keys + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # rename + # ===================== + header = load_header(vcf_file, prefix) + vcf_reader = load_vcf(vcf_file, prefix) + # ===================== + # rewrite + # ===================== + vcf_writer(header, vcf_reader, vcf_out_file) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + return True + + +def correct_ad_line(): + ''' + ''' + +def vcf_writer(header, vcf_reader, vcf_out_file): + ''' + Write out a VCF file with the a prefix added to INFO and FORMAT keys + ''' + with open(vcf_out_file, 'w') as vcf_out: + for line in header: + vcf_out.write(line) + for line in vcf_reader: + vcf_out.write(line) + return True + + +def main(): + ''' + DESCRIPTION: Takes in a VCF and a tool name + and preps the file by: + 1) add tool + "_" to all INFO and FORMAT def lines (unless the key is "GT") + ''' + 
vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + prefix = sys.argv[3] + '_' + assert os.path.isfile(vcf_file), 'Failed to find prep caller VCF call file :' + vcf_file + rename_metadata(vcf_file, vcf_out_file, prefix) + + +# ===================== +# Main +# ===================== + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/pta/rename_vcf.py b/bin/pta/rename_vcf.py new file mode 100644 index 00000000..c1c43e31 --- /dev/null +++ b/bin/pta/rename_vcf.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# USAGE: rename_vcf.py VCF_IN VCF_OUT NORMAL TUMOR PREFIX +# DESCRIPTION: Print a VCF file with the sample order indicated in the 3rd +# and 4th arguments and a prefix added to the sample names + +import sys +import pandas as pd +import shutil +import logging as log +import os + + +def load_header(vcf_in): + ''' + Load a VCF file header as a list of lines. + ''' + with open(vcf_in) as vcf: + header = [line for line in vcf if line.startswith('#')] + return header + + +def load_vcf(vcf_in, header, reorder, paired, normal, tumor, prefix): + ''' + Load a VCF file as an pandas dataframe. 
+ ''' + names = header[-1].rstrip().replace('^#', '').split('\t') + if paired: + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', + prefix + '_' + normal, prefix + '_' + tumor] + vcf_reader = pd.read_csv(vcf_in, comment='#', + names=names, sep='\t') + return vcf_reader + + +def order_wrong(last_header, normal, tumor): + ''' + Check if the order is perfect + ''' + header_parts = last_header.rstrip().split('\t') + if header_parts[9] == tumor and \ + header_parts[10] == normal: + return True + else: + return False + + +def check_paired(last_header): + ''' + Return False if VCF has only a single sample + ''' + header_parts = last_header.rstrip().split('\t') + if len(header_parts) == 10: + return False + return True + + +def rename(vcf_file, vcf_out_file, normal, tumor, prefix): + ''' + Add prefix to sample name + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # reorder + # ===================== + header = load_header(vcf_file) + last_header = header[-1] + paired = check_paired(last_header) + if not paired: + reorder = False + else: + reorder = order_wrong(last_header, normal, tumor) + if reorder: + log.error('VCF must start with expected sample names in the order normal, tumor.') + sys.exit(1) + vcf_reader = load_vcf(vcf_file, header, reorder, + paired, normal, tumor, prefix) + # ===================== + # rewrite + # ===================== + vcf_writer(header, vcf_reader, vcf_out_file) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + return True + + +def vcf_writer(header, vcf_reader, vcf_out_file): + ''' + Write out the VCF file with corrected sample names + ''' + with open(vcf_out_file, 'w') as vcf_out: + for line in header[:-1]: + vcf_out.write(line) + vcf_reader.to_csv(vcf_out_file, 
sep='\t', + mode='a', index=False) + + +def main(): + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + normal = sys.argv[3] + tumor = sys.argv[4] + prefix = sys.argv[5] + assert os.path.isfile(vcf_file), 'Failed to find prep caller VCF call file :' + vcf_file + rename(vcf_file, vcf_out_file, normal, tumor, prefix) + + +# ===================== +# Main +# ===================== + + +if __name__ == "__main__": + main() diff --git a/bin/pta/reorder_vcf.py b/bin/pta/reorder_vcf.py new file mode 100644 index 00000000..2f70b30b --- /dev/null +++ b/bin/pta/reorder_vcf.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# USAGE: reorder_vcf.py VCF_IN VCF_OUT NORMAL TUMOR +# DESCRIPTION: Print a VCF file with the sample order indicated in the 3rd +# and 4th arguments + + +# ## MWL NOTE: +# This script requires the header and input 'tumor/normal' names in the 3rd and 4th arg to match. +# If you pass names NOT present in the header, it will simply emit the file AS IS. +# The script DOES NOT inform the user of if a change has been made in the sample order. +# NOTE ALSO: if the header already contains the strings 'TUMOR' and 'NORMAL, +# 'TUMOR and NORMAL are RENAMED to string provided in 3rd and 4th args. + +import sys +import pandas as pd +import shutil + +def load_header(vcf_in): + ''' + Load a VCF file header as a list of lines. + ''' + with open(vcf_in) as vcf: + header = [line for line in vcf if line.startswith('#')] + return header + + +def load_vcf(vcf_in, header, reorder, paired, normal, tumor): + ''' + Load a VCF file as an pandas dataframe. 
+ ''' + names = header[-1].rstrip().replace('^#', '').split('\t') + if paired and reorder: + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', tumor, normal] + elif paired and not reorder: + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', normal, tumor] + vcf_reader = pd.read_csv(vcf_in, comment='#', + names=names, sep='\t', + dtype={'#CHROM' : str}) + return vcf_reader + + +def reorder_vcf(vcf_reader, reorder): + ''' + reorder corrected names. + ''' + if reorder: + vcf_reader = vcf_reader[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', normal, tumor]] + return vcf_reader + + +def order_wrong(last_header, normal, tumor): + ''' + Check if the order is good + ''' + header_parts = last_header.rstrip().split('\t') + if header_parts[9] in [tumor, 'TUMOR'] and \ + header_parts[10] in [normal, 'NORMAL']: + return True + elif tumor in header_parts[9] and \ + normal in header_parts[10]: + return True + else: + return False + + +def check_paired(last_header): + ''' + Return False if VCF has only a single sample + ''' + header_parts = last_header.rstrip().split('\t') + if len(header_parts) == 10: + return False + return True + + +def reorder_column(vcf_file, vcf_out_file, normal, tumor): + ''' + Order columns Normal and then Tumor + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # reorder + # ===================== + header = load_header(vcf_file) + last_header = header[-1] + paired = check_paired(last_header) + if not paired: + reorder = False + else: + reorder = order_wrong(last_header, normal, tumor) + vcf_reader = load_vcf(vcf_file, header, reorder, paired, normal, tumor) + if reorder: + vcf_reader = reorder_vcf(vcf_reader, reorder) + # ===================== + # rewrite + # ===================== + 
vcf_writer(header, vcf_reader, vcf_out_file) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + return True + + +def vcf_writer(header, vcf_reader, vcf_out_file): + ''' + Write out the VCF file with corrected information + ''' + with open(vcf_out_file, 'w') as vcf_out: + for line in header[:-1]: + vcf_out.write(line) + vcf_reader.to_csv(vcf_out_file, sep='\t', + mode='a', index=False) + + +# ===================== +# Main +# ===================== +if __name__ == "__main__": + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + normal = sys.argv[3] + tumor = sys.argv[4] + reorder_column(vcf_file, vcf_out_file, normal, tumor) + diff --git a/bin/pta/split_annotations.py b/bin/pta/split_annotations.py new file mode 100644 index 00000000..2384330d --- /dev/null +++ b/bin/pta/split_annotations.py @@ -0,0 +1,11 @@ +import pandas as pd +import sys + +df = pd.read_csv(sys.argv[1], sep='\t') + +df[['Allele','Consequence','IMPACT','SYMBOL','Gene','Feature_type','Feature','BIOTYPE','EXON','INTRON','HGVSc','HGVSp','cDNA_position','CDS_position','Protein_position','Amino_acids','Codons','Existing_variation','DISTANCE','STRAND','FLAGS','SYMBOL_SOURCE','HGNC_ID','REFSEQ_MATCH','REFSEQ_OFFSET','SOURCE','HGVS_OFFSET','AF_1000G','AFR_AF_1000G','AMR_AF_1000G','EAS_AF_1000G','EUR_AF_1000G','SAS_AF_1000G','gnomADe_AF','gnomADe_AFR_AF','gnomADe_AMR_AF','gnomADe_ASJ_AF','gnomADe_EAS_AF','gnomADe_FIN_AF','gnomADe_NFE_AF','gnomADe_OTH_AF','gnomADe_SAS_AF','MAX_AF','MAX_AF_POPS','CLIN_SIG','SOMATIC','PHENO','ada_score','rf_score','MaxEntScan_alt','MaxEntScan_diff','MaxEntScan_ref','CADD_phred','FATHMM_pred','GERP++_RS','LRT_pred','MetaSVM_pred','MutationAssessor_pred','MutationTaster_pred','PROVEAN_pred','Polyphen2_HVAR_pred','PrimateAI_pred','REVEL_score','SIFT4G_pred','SIFT_pred','fathmm-MKL_coding_pred','phyloP100way_vertebrate','CosmicCoding','CosmicCoding_GENOMIC_ID','CosmicCoding_LEGACY_ID','CosmicCoding_CNT','
CosmicCoding_CDS','CosmicCoding_AA','CosmicNonCoding','CosmicNonCoding_GENOMIC_ID','CosmicNonCoding_LEGACY_ID','CosmicNonCoding_CNT','CosmicNonCoding_CDS','CosmicNonCoding_AA','NYGC','NYGC_AF','NYGC_Samples','NYGC_AC_Het','NYGC_AC_Hom','CLN_Overlap','CLN_Overlap_CLIN_ID','CLN_Overlap_CLNSIG','CLN_Overlap_CLNREVSTAT','CLN_Overlap_CLNDN','CLN_Exact','CLN_Exact_CLIN_ID','CLN_Exact_CLNSIG','CLN_Exact_CLNREVSTAT','CLN_Exact_CLNDN','GnomadExomes','GnomadExomes_AF','GnomadExomes_nhomalt','GnomadGenomes','GnomadGenomes_AF','GnomadGenomes_nhomalt','CHD_GENES','CHD_GENES_GENE','CHD_EVOLVING','CHD_EVOLVING_GENE','chd_whitelist','chd_whitelist_END','INTRONIC','INTRONIC_INTRONIC','CLINVAR_INTRONIC','CLINVAR_INTRONIC_INTRONIC','mm','mm_GENE','mm_HGVSG','mm_MMCNT1','mm_MMCNT2','mm_MMCNT3','mm_MMID3','mm_MMURI3','SPLICEAI','SPLICEAI_DS_AG','SPLICEAI_DS_AL','SPLICEAI_DS_DG','SPLICEAI_DS_DL','PLI','PLI_pLI','PLI_mis_z','Domino','Domino_Domino_Score','AR','AR_AR_GENE','ACMG59','ACMG59_GENE','ACMG59_DISEASE','DIALS','DIALS_DIALS_GENE','PGx','PGx_pgx_rsid','IMMUNO','IMMUNO_IMMUNO_Gene','NEURO','NEURO_NEURO_Gene','CARDIO','CARDIO_CARDIO_Gene','N19','N19_NYGC_CUR','R19','R19_NYGC_REPORTED_SAMPLE','R19_NYGC_CLASS','R19_NYGC_DISEASE']] = df['CSQ'].str.split('|',expand=True) +df[['CancerGeneCensus_Tier','Hallmark','Somatic','Germline','Tumour_Types_Somatic','Tumour_Types_Germline','Cancer_Syndrome','Tissue_Type','Molecular_Genetics','Role_in_Cancer','Mutation_Types']] = df['CancerGeneCensus'].str.split('|',expand=True) +df[['MUTATION_ID','GENOMIC_MUTATION_ID','Drug_Name','CosmicResistanceMutation_Tier']] = df['CosmicResistanceMutation'].str.split('|',expand=True) +df.drop(['CSQ', 'CancerGeneCensus', 'CosmicResistanceMutation'], axis=1, inplace = True) + +df.to_csv(sys.argv[2], sep='\t', index = False) \ No newline at end of file diff --git a/bin/pta/split_mnv.py b/bin/pta/split_mnv.py new file mode 100644 index 00000000..c5d326a6 --- /dev/null +++ b/bin/pta/split_mnv.py @@ -0,0 +1,198 @@ 
+#!/usr/bin/env python +# USAGE: python split_mnv.py +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center

# SOFTWARE COPYRIGHT NOTICE AGREEMENT
# This software and its documentation are copyright (2018) by the New York
# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center
# cannot be responsible for its use, misuse, or functionality.
# Version: 0.1
# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton without anchor base + start_pos = 0 + # with anchor base + for alt in record.alts: + if alt[0] == record.ref[0]: + start_pos = 1 + return start_pos + + +def write_file(bcf_in, bcf_out, tool): + ''' + Split MNV records + ''' + splits = [] + for record in bcf_in.fetch(): + if len(record.alts) > 1: + log.error('VCF file must have only one ALT per line') + sys.exit(1) + if len(record.ref) > 1 \ + and len(record.ref) == len(record.alts[0]): + start_pos = determine_anchor_base(record) + mnv_id = '_'.join([tool, record.contig, str(record.pos), + record.ref, record.alts[0]]) + refs = list(record.ref) + alts = list(record.alts[0]) + orig_record = record.copy() + # print full record + record.pos = record.pos + start_pos + record.ref = record.ref[start_pos:] + record.alts = [record.alts[0][start_pos:]] + record.info['TYPE'] = 'MNV' + record.info['MNV_ID'] = [mnv_id] # change if never multi-allelic + print_record(record, bcf_out) + # print split records + for new_record in split_records(start_pos, refs, + orig_record, alts, + mnv_id): + print_record(new_record, bcf_out) + splits.append('|'.join([new_record.chrom, str(new_record.pos), new_record.ref, new_record.alts[0]])) + return set(splits) + + +def write_file_non_mnv(bcf_in, splits, bcf_out, tool): + ''' + Split MNV records + ''' + for record in bcf_in.fetch(): + if not (len(record.ref) > 1 \ + and len(record.ref) == len(record.alts[0])): + # print non-MNVs + id = '|'.join([record.chrom, str(record.pos), record.ref, record.alts[0]]) + if not id in splits: + record.info['TYPE'] = get_type(record) # change if multi-allelic + print_record(record, bcf_out) + else: + print('duplicate...', id) + return True + +def split_records(start_pos, refs, orig_record, alts, mnv_id): + ''' + print split records (skipping the first anchor base) + (if start_pos is 1) + ''' + for i in range(start_pos, len(refs)): + if refs[i] != alts[i]: + new_record = orig_record.copy() + 
new_record.ref = refs[i] + new_record.alts = [alts[i]] + new_record.pos = orig_record.pos + i + new_record.info['TYPE'] = 'SNV' # change if multi-allelic + new_record.info['MNV_ID'] = [mnv_id] # change if never multi-allelic + yield new_record + + +def main(): + ''' + Prepare the VCF file for merging by: + 1) Split MNV records to one line per nucleotide + 2) Skip any line that has an SNV called by an MNV for the same tool + ''' + # ========================== + # Input variables + # ========================== + vcf_file = sys.argv[1] + out = sys.argv[2] + tool = sys.argv[3] + + assert os.path.isfile(vcf_file), 'Failed to find caller VCF call file :' + vcf_file + # ========================== + # Run prep + # ========================== + bcf_in = read_vcf(vcf_file) + bcf_in = add_info_header(bcf_out=bcf_in, + id='MNV_ID', + number='.', + type='String', + description='ID of multi-nucleotide variant (MNV) that the SNV is part of') + bcf_in = add_info_header(bcf_out=bcf_in, + id='TYPE', + number='1', + type='String', + description='Variant type (SNV,INS,DEL,MNV,COMPLEX,MULTI)') + bcf_out = pysam.VariantFile(out, 'w', header=bcf_in.header) + splits = write_file(bcf_in, + bcf_out, + tool) + write_file_non_mnv(bcf_in, + splits, + bcf_out, + tool) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/vcf_filter.py b/bin/pta/vcf_filter.py new file mode 100644 index 00000000..f0fcd9a1 --- /dev/null +++ b/bin/pta/vcf_filter.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# USAGE: python vcf_filter.py GRM_FILE VCF_FILE OUT_FILE +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT 
################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.2 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def add_filter_header(bcf_out, + id, + description): + ''' + Add new FILTER field + ''' + bcf_out.header.filters.add(id=id, + number=None, + type=None, + description=description) + return bcf_out + + +def test_af(record, alt_index, af=0.01): + ''' + Test if AF > 0.01 in one germline database. PASS + variants that don't have AF listed (for example + in records from mouse databases) + Pass variants that don't have sample columns (e.g. mouse variants from 00-All.normalized.vcf.gz) + ''' + samples = record.samples.keys() + none_count = 0 + if len(samples) > 0: + for sample_name in samples: + if not 'AF' in record.format.keys(): + return True + elif record.samples[sample_name]['AF'][alt_index]: + if record.samples[sample_name]['AF'][alt_index] > af: + return True + else: + none_count += 1 + if none_count < len(samples): + return False + return True + + +def is_germline(germ_in, record, alt, af=0.01): + ''' + Check if matching variant is in the GRM VCF. 
+ ''' + for germ_record in germ_in.fetch(record.contig, record.pos - 1, record.pos): + if germ_record.ref == record.ref: + for alt_index, germ_alt in enumerate(germ_record.alts): + if test_af(germ_record, alt_index, af=af): + if germ_alt == alt: + return True + return False + + +def filter_vcf(bcf_in, bcf_out, germ_in): + ''' + Change filter column from PASS to GRM or + add GRM to filters. + ''' + for record in bcf_in.fetch(): + filters = record.filter.keys() + filter = False + for alt in record.alts: + if is_germline(germ_in, record, alt): + filter = True + if filter: + if len(filters) == 1 and filters[0] == 'PASS': + record.filter.clear() + record.filter.add('GRM') + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + + +def main(): + ''' + Change filter column from PASS to GRM or + add GRM to filters. + ''' + germ_file = sys.argv[1] + vcf_file = sys.argv[2] + out_file = sys.argv[3] + assert os.path.isfile(germ_file), 'Failed to find germline VCF call file :' + germ_file + assert os.path.isfile(vcf_file), 'Failed to find somatic VCF call file :' + vcf_file + germ_in = read_vcf(germ_file) + bcf_in = read_vcf(vcf_file) + bcf_in = add_filter_header(bcf_in, + id='GRM', + description='Known germline variant') + bcf_out = pysam.VariantFile(out_file, 'w', header=bcf_in.header) + filter_vcf(bcf_in, bcf_out, germ_in) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/vcf_to_bed.py b/bin/pta/vcf_to_bed.py new file mode 100644 index 00000000..5068ea79 --- /dev/null +++ b/bin/pta/vcf_to_bed.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# USAGE: vcf_to_bed.py +# DESCRIPTION: Make bed file from VCF. 
+################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without
# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. + +# Version: 1.0 +# Author: Jennifer M Shelton, Andre Corvelo +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def feed_vcf(bcf_in): + ''' + Generate relevant columns for BED. + Converts to 0-based intervals + ''' + for record in bcf_in.fetch(): + yield record.chrom, record.pos - 1, record.pos + len(record.alts[0]) - 1 + + +def make_bed(vcf_file): + ''' + Make BED file from the interval list + ''' + log.info('#######################################') + log.info('# Making bed...') + log.info('#######################################') + bcf_in = read_vcf(vcf_file) + for chrom, start, end in feed_vcf(bcf_in): + sys.stdout.write('\t'.join([chrom, str(start), str(end), '.']) + '\n') + log.info('#######################################') + log.info('# Done making bed.') + log.info('#######################################') + + +def main(): + ''' + Makes BED file from a VCF. 
+ ''' + assert os.path.isfile(sys.argv[1]); 'Failed to open reference VCF file' + vcf_file = sys.argv[1] + make_bed(vcf_file) + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/rna_fusion/compute_insert_size.py b/bin/rna_fusion/compute_insert_size.py new file mode 100644 index 00000000..a2dae74c --- /dev/null +++ b/bin/rna_fusion/compute_insert_size.py @@ -0,0 +1,22 @@ +import h5py +import numpy as np +import sys + +fn = sys.argv[1] +f = h5py.File(fn) +x = np.asarray(f['aux']['fld'], dtype='float64') +y = np.cumsum(x)/np.sum(x) +cutoff = np.argmax(y > .95) +# 95% CI insert size. See: https://github.com/pmelsted/pizzly/issues/45 +print(cutoff) + +# cutoff = np.argmax(y) +# max insert size. If needed. + +## Mean insert size. If needed. 
+# fn = sys.argv[1] +# f = h5py.File(fn) +# x = np.asarray(f['aux']['fld'], dtype='float64') +# t = np.arange(0,len(x),1) +# mean = np.sum(x*t)/np.sum(x) +# print(mean) diff --git a/bin/shared/bamtools/bamtools_filter_pe.json b/bin/shared/bamtools/bamtools_filter_pe.json new file mode 100755 index 00000000..323c186c --- /dev/null +++ b/bin/shared/bamtools/bamtools_filter_pe.json @@ -0,0 +1,18 @@ +{ + "filters" : [ + { "id" : "insert_min", + "insertSize" : ">=-2000" + }, + + { "id" : "insert_max", + "insertSize" : "<=2000" + }, + + { "id" : "mismatch", + "tag" : "NM:<=4" + } + ], + + "rule" : " insert_min & insert_max & mismatch " + +} diff --git a/bin/shared/bamtools/bamtools_filter_se.json b/bin/shared/bamtools/bamtools_filter_se.json new file mode 100755 index 00000000..0b21d3e9 --- /dev/null +++ b/bin/shared/bamtools/bamtools_filter_se.json @@ -0,0 +1,10 @@ +{ + "filters" : [ + { "id" : "mismatch", + "tag" : "NM:<=4" + } + ], + + "rule" : " mismatch " + +} diff --git a/bin/shared/extract_csv.nf b/bin/shared/extract_csv.nf new file mode 100644 index 00000000..0cb09b9f --- /dev/null +++ b/bin/shared/extract_csv.nf @@ -0,0 +1,77 @@ +// Function to extract information (meta data + file(s)) from csv file(s) +// https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L1084 +def extract_csv(csv_file) { + + // check that the sample sheet is not 1 line or less, because it'll skip all subsequent checks if so. + file(csv_file).withReader('UTF-8') { reader -> + def line, numberOfLinesInSampleSheet = 0; + while ((line = reader.readLine()) != null) {numberOfLinesInSampleSheet++} + if (numberOfLinesInSampleSheet < 2) { + log.error "Samplesheet had less than two lines. The sample sheet must be a csv file with a header, so at least two lines." + System.exit(1) + } + } + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!(row.sampleID)){ + log.error "Missing field in csv file header. The csv file must have a field named 'sampleID'." 
+ System.exit(1) + } + [row.sampleID.toString(), row] + }.groupTuple() + .map{ meta, rows -> + size = rows.size() + [rows, size] + }.transpose() + .map{ row, numLanes -> //from here do the usual thing for csv parsing + + def meta = [:] + + // Meta data to identify samplesheet + if (row.sampleID) meta.sampleID = row.sampleID.toString() + + // If no lane specified, lane is not considered + if (row.lane) meta.lane = row.lane.toString() + else meta.lane = 'NA' + + /* + NOTE: Additional metadata parsing could be added here. This function is a minimal implimentation of a csv parser. + */ + + meta.id = row.sampleID.toString() + /* + NOTE: Additional ID parsing could be added here. For example a concatenation of patient and sample, if those fields were added to the csv sheet. + */ + meta.size = size + // defines the number of lanes for each sample. + + // join meta to fastq + + if (row.fastq_2) { + + return [meta.id, meta, row.fastq_1, row.fastq_2] + + } else { + return [meta.id, meta, row.fastq_1] + + } + } +} + +/* + // Additional check of sample sheet: + // 1. Each row should specify a lane and the same combination of patient, sample and lane shouldn't be present in different rows. + // 2. The same sample shouldn't be listed for different patients. 
+ def sample2patient = [:] + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!sample2patient.containsKey(row.sample.toString())) { + sample2patient[row.sample.toString()] = row.patient.toString() + } else if (sample2patient[row.sample.toString()] != row.patient.toString()) { + log.error('The sample "' + row.sample.toString() + '" is registered for both patient "' + row.patient.toString() + '" and "' + sample2patient[row.sample.toString()] + '" in the sample sheet.') + System.exit(1) + } + } +*/ \ No newline at end of file diff --git a/bin/shared/multiqc/JAX_logo_rgb_transparentback.png b/bin/shared/multiqc/JAX_logo_rgb_transparentback.png new file mode 100644 index 00000000..9ea8a5cf Binary files /dev/null and b/bin/shared/multiqc/JAX_logo_rgb_transparentback.png differ diff --git a/bin/shared/multiqc/amplicon_multiqc.yaml b/bin/shared/multiqc/amplicon_multiqc.yaml new file mode 100644 index 00000000..a38f1076 --- /dev/null +++ b/bin/shared/multiqc/amplicon_multiqc.yaml @@ -0,0 +1,30 @@ +title: "Amplicon QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
+ +export_plots: true + +module_order: + - cutadapt + - fastqc + - primerclip + - gatk + - coverage_metrics + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_paired" + - "_001" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/atac_multiqc.yaml b/bin/shared/multiqc/atac_multiqc.yaml new file mode 100644 index 00000000..a427165d --- /dev/null +++ b/bin/shared/multiqc/atac_multiqc.yaml @@ -0,0 +1,94 @@ +title: "ATAC-Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - fastqc + - cutadapt + - bowtie2 + - picard + - custom_content: + - fraglen_plot + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +table_columns_placement: + PCR_statistics: + NRF: 1010 + PBC1: 1020 + PBC2: 1030 + MT_content: + 'Perc mtDNA': 1040 + FRiP: + FRiP: 1050 + +extra_fn_clean_exts: + - "_bowtie2" + - "_R2" + +custom_data: + PCR_statistics: + file_format: 'tsv' + plot_type: 'generalstats' + pconfig: + - NRF: + description: 'Non Redundant Fraction' + scale: False + - PBC1: + description: 'PCR Bottlenecking Coefficient 1' + scale: False + - PBC2: + description: 'PCR Bottlenecking Coefficient 2' + scale: False + format: '{:,.2f}' + MT_content: + file_format: 'tsv' + plot_type: 'generalstats' + pconfig: + - 'Perc mtDNA': + description: 'Percent mtDNA' + scale: False + suffix: "%" + FRiP: + file_format: 'tsv' + plot_type: 'generalstats' + pconfig: + - FRiP: + description: 'Fraction of Reads in 
Peak' + scale: False + - 'Filtered Reads': + description: 'Total Filtered Reads' + scale: False + format: '{:,.0f}' + fraglen_plot: + file_format: "tsv" + section_name: "Fragment Length" + description: "This plot comes from files acommpanied by a mutliqc_config.yaml file for configuration" + plot_type: "linegraph" + pconfig: + id: "example_coverage_lineplot" + title: "Fragment Length Plot" + ylab: "Read Count" + xlab: "Insert Size (bp)" + + +sp: + PCR_statistics: + fn: '*.pbc.qc' + MT_content: + fn: '*mtDNA_Content.txt' + FRiP: + fn: '*Fraction_reads_in_peak.txt' + fraglen_plot: + fn: '*spline_table.txt' + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/chipseq.yaml b/bin/shared/multiqc/chipseq.yaml new file mode 100644 index 00000000..0fd756a6 --- /dev/null +++ b/bin/shared/multiqc/chipseq.yaml @@ -0,0 +1,173 @@ +report_comment: > + This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +run_modules: + - custom_content + - fastqc + - cutadapt + - samtools + - picard + - preseq + - featureCounts + - deeptools + - phantompeakqualtools + +exclude_modules: + - "general_stats" + +module_order: + - fastqc: + name: "LIB: FastQC (raw)" + info: "This section of the report shows FastQC results before adapter trimming for individual libraries." + path_filters: + - "./fastqc/*.zip" + - cutadapt: + name: "LIB: cutadapt (trimmed)" + info: "This section of the report shows the length of trimmed reads by cutadapt for individual libraries." + - fastqc: + name: "LIB: FastQC (trimmed)" + info: "This section of the report shows FastQC results after adapter trimming for individual libraries." 
+ path_filters: + - "./trimgalore/fastqc/*.zip" + - samtools: + name: "LIB: SAMTools" + info: "This section of the report shows SAMTools results for individual libraries." + path_filters: + - "./alignment/library/*" + - samtools: + name: "MERGED LIB: SAMTools (unfiltered)" + info: "This section of the report shows SAMTools results after merging libraries and before filtering." + path_filters: + - "./alignment/mergedLibrary/unfiltered/*.mLb.mkD.sorted.bam*" + - picard: + name: "MERGED LIB: Picard (unfiltered)" + info: "This section of the report shows picard results after merging libraries and before filtering." + path_filters: + - "./alignment/mergedLibrary/unfiltered/picard_metrics/*" + - preseq: + name: "MERGED LIB: Preseq (unfiltered)" + info: "This section of the report shows Preseq results after merging libraries and before filtering." + - samtools: + name: "MERGED LIB: SAMTools (filtered)" + info: "This section of the report shows SAMTools results after merging libraries and after filtering." + path_filters: + - "./alignment/mergedLibrary/filtered/*.mLb.clN.sorted.bam*" + - picard: + name: "MERGED LIB: Picard (filtered)" + info: "This section of the report shows picard results after merging libraries and after filtering." + path_filters: + - "./alignment/mergedLibrary/filtered/picard_metrics/*" + - deeptools: + name: "MERGED LIB: deepTools" + anchor: "mlib_deeptools" + info: "This section of the report shows ChIP-seq QC plots generated by deepTools." + - featureCounts: + name: "MERGED LIB: featureCounts" + anchor: "mlib_featurecounts" + info: "This section of the report shows featureCounts results for the number of reads assigned to merged library consensus peaks." 
+ path_filters: + - "./macs2/featurecounts/*.summary" + +report_section_order: + peak_count: + before: mlib_deeptools + frip_score: + before: peak_count + peak_annotation: + before: frip_score + strand_shift_correlation: + before: peak_annotation + nsc_coefficient: + before: strand_shift_correlation + rsc_coefficient: + before: nsc_coefficient + mlib_featurecounts: + before: rsc_coefficient + deseq2_pca_1: + order: -1600 + deseq2_pca_2: + order: -1700 + deseq2_pca_3: + order: -1800 + deseq2_pca_4: + order: -1900 + deseq2_pca_5: + order: -2000 + deseq2_pca_6: + order: -2100 + deseq2_pca_7: + order: -2200 + deseq2_pca_8: + order: -2300 + deseq2_pca_9: + order: -2400 + deseq2_pca_10: + order: -2500 + deseq2_clustering_1: + order: -2600 + deseq2_clustering_2: + order: -2700 + deseq2_clustering_3: + order: -2800 + deseq2_clustering_4: + order: -2900 + deseq2_clustering_5: + order: -3000 + deseq2_clustering_6: + order: -3100 + deseq2_clustering_7: + order: -3200 + deseq2_clustering_8: + order: -3300 + deseq2_clustering_9: + order: -3400 + deseq2_clustering_10: + order: -3500 + software_versions: + order: -3600 + nf-core-chipseq-summary: + order: -3700 + +custom_plot_config: + picard_insert_size: + cpswitch_c_active: False + smooth_points: 1000 + featurecounts: + cpswitch_c_active: False + +extra_fn_clean_exts: + - "fastq.gz" + - "_trimmed" + - "_val" + - "sorted.bam" + - ".Lb" + - "mkD" + - "clN" + - "mLb" + - "_peaks" + - ".FRiP" + - ".peak" + - "_spp" + - ".spp" + - "lc_extrap" + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + cutadapt: + fn: "*trimming_report.txt" + preseq: + fn: "*.lc_extrap.txt" + deeptools/plotFingerprintOutRawCounts: + fn: "*plotFingerprint*" + 
deeptools/plotProfile: + fn: "*plotProfile*" + phantompeakqualtools/out: + fn: "*.spp.out" diff --git a/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt b/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt new file mode 100644 index 00000000..f7bb33d8 --- /dev/null +++ b/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt @@ -0,0 +1,12 @@ +#id: 'deseq2_clustering' +#section_name: 'MERGED LIB: DESeq2 sample similarity' +#description: "Matrix is generated from clustering with Euclidean distances between +# DESeq2 +# rlog values for each sample +# in the deseq2_qc.r script." +#plot_type: 'heatmap' +#anchor: 'deseq2_clustering' +#pconfig: +# title: 'DESeq2: Heatmap of the sample-to-sample distances' +# xlab: True +# reverseColors: True diff --git a/bin/shared/multiqc/chipseq/deseq2_pca_header.txt b/bin/shared/multiqc/chipseq/deseq2_pca_header.txt new file mode 100644 index 00000000..250c1cb7 --- /dev/null +++ b/bin/shared/multiqc/chipseq/deseq2_pca_header.txt @@ -0,0 +1,11 @@ +#id: 'deseq2_pca' +#section_name: 'MERGED LIB: DESeq2 PCA plot' +#description: "PCA plot of the samples in the experiment. +# These values are calculated using DESeq2 +# in the deseq2_qc.r script." +#plot_type: 'scatter' +#anchor: 'deseq2_pca' +#pconfig: +# title: 'DESeq2: Principal component plot' +# xlab: PC1 +# ylab: PC2 diff --git a/bin/shared/multiqc/chipseq/frip_score_header.txt b/bin/shared/multiqc/chipseq/frip_score_header.txt new file mode 100644 index 00000000..82902115 --- /dev/null +++ b/bin/shared/multiqc/chipseq/frip_score_header.txt @@ -0,0 +1,13 @@ +#id: 'frip_score' +#section_name: 'MERGED LIB: MACS2 FRiP score' +#description: "is generated by calculating the fraction of all mapped reads that fall +# into the MACS2 called peak regions. A read must overlap a peak by at least 20% to be counted. +# See FRiP score." 
+#plot_type: 'bargraph' +#anchor: 'frip_score' +#pconfig: +# title: 'FRiP score' +# ylab: 'FRiP score' +# ymax: 1 +# ymin: 0 +# tt_decimals: 2 diff --git a/bin/shared/multiqc/chipseq/peak_annotation_header.txt b/bin/shared/multiqc/chipseq/peak_annotation_header.txt new file mode 100644 index 00000000..2b3ee938 --- /dev/null +++ b/bin/shared/multiqc/chipseq/peak_annotation_header.txt @@ -0,0 +1,9 @@ +#id: 'peak_annotation' +#section_name: 'MERGED LIB: HOMER peak annotation' +#description: "is generated by calculating the proportion of peaks assigned to genomic features by +# HOMER annotatePeaks.pl." +#plot_type: 'bargraph' +#anchor: 'peak_annotation' +#pconfig: +# title: 'Peak to feature proportion' +# ylab: 'Peak count' diff --git a/bin/shared/multiqc/chipseq/peak_count_header.txt b/bin/shared/multiqc/chipseq/peak_count_header.txt new file mode 100644 index 00000000..aa4dd346 --- /dev/null +++ b/bin/shared/multiqc/chipseq/peak_count_header.txt @@ -0,0 +1,9 @@ +#id: 'peak_count' +#section_name: 'MERGED LIB: MACS2 peak count' +#description: "is calculated from total number of peaks called by +# MACS2" +#plot_type: 'bargraph' +#anchor: 'peak_count' +#pconfig: +# title: 'Total peak count' +# ylab: 'Peak count' diff --git a/bin/shared/multiqc/chipseq/spp_correlation_header.txt b/bin/shared/multiqc/chipseq/spp_correlation_header.txt new file mode 100644 index 00000000..ad571563 --- /dev/null +++ b/bin/shared/multiqc/chipseq/spp_correlation_header.txt @@ -0,0 +1,12 @@ +#id: 'strand_shift_correlation' +#section_name: 'MERGED LIB: spp strand-shift correlation' +#description: "generated using run_spp.R script from +# phantompeakqualtools." 
+#plot_type: 'linegraph' +#anchor: 'strand_shift_correlation' +#pconfig: +# title: 'Strand-shift correlation plot' +# ylab: 'Cross-correlation' +# xlab: 'Strand-shift (bp)' +# xDecimals: False +# tt_label: 'Strand-shift (bp) {point.x}: {point.y:.2f} Cross-correlation' diff --git a/bin/shared/multiqc/chipseq/spp_nsc_header.txt b/bin/shared/multiqc/chipseq/spp_nsc_header.txt new file mode 100644 index 00000000..43370f32 --- /dev/null +++ b/bin/shared/multiqc/chipseq/spp_nsc_header.txt @@ -0,0 +1,11 @@ +#id: 'nsc_coefficient' +#section_name: 'MERGED LIB: spp NSC coefficient' +#description: "generated using run_spp.R script from +# phantompeakqualtools." +#plot_type: 'bargraph' +#anchor: 'nsc_coefficient' +#pconfig: +# title: 'Normalized strand cross-correlation coefficient' +# ylab: 'NSC coefficient' +# ymin: 1 +# tt_decimals: 1 diff --git a/bin/shared/multiqc/chipseq/spp_rsc_header.txt b/bin/shared/multiqc/chipseq/spp_rsc_header.txt new file mode 100644 index 00000000..bab5e09b --- /dev/null +++ b/bin/shared/multiqc/chipseq/spp_rsc_header.txt @@ -0,0 +1,11 @@ +#id: 'rsc_coefficient' +#section_name: 'MERGED LIB: spp RSC coefficient' +#description: "generated using run_spp.R script from +# phantompeakqualtools." +#plot_type: 'bargraph' +#anchor: 'rsc_coefficient' +#pconfig: +# title: 'Relative strand cross-correlation coefficient' +# ylab: 'RSC coefficient' +# ymin: 0 +# tt_decimals: 1 diff --git a/bin/shared/multiqc/pdx_wes_multiqc.yaml b/bin/shared/multiqc/pdx_wes_multiqc.yaml new file mode 100644 index 00000000..7a9750bd --- /dev/null +++ b/bin/shared/multiqc/pdx_wes_multiqc.yaml @@ -0,0 +1,30 @@ +title: "PDX Whole Exome Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
+ +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - xenome + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/pta_multiqc.yaml b/bin/shared/multiqc/pta_multiqc.yaml new file mode 100644 index 00000000..076ead18 --- /dev/null +++ b/bin/shared/multiqc/pta_multiqc.yaml @@ -0,0 +1,35 @@ +title: "Paired Tumor Analysis QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
+ +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - xenome + - conpair + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - "_concordance" + - "_contamination" + - ".final_sorted" + - ".R1" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/rna_fusion_multiqc.yaml b/bin/shared/multiqc/rna_fusion_multiqc.yaml new file mode 100644 index 00000000..3ba15cca --- /dev/null +++ b/bin/shared/multiqc/rna_fusion_multiqc.yaml @@ -0,0 +1,24 @@ +title: "RNA Fusion QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - fastqc + - xenome + - custom_content + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/rnaseq_multiqc.yaml b/bin/shared/multiqc/rnaseq_multiqc.yaml new file mode 100644 index 00000000..dc8f8c08 --- /dev/null +++ b/bin/shared/multiqc/rnaseq_multiqc.yaml @@ -0,0 +1,25 @@ +title: "RNA-Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
+ +export_plots: true + +module_order: + - fastqc + - xenome + - rsem + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/rrbs_multiqc.yaml b/bin/shared/multiqc/rrbs_multiqc.yaml new file mode 100644 index 00000000..5f2328e3 --- /dev/null +++ b/bin/shared/multiqc/rrbs_multiqc.yaml @@ -0,0 +1,15 @@ +title: "RRBS QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - fastqc + - cutadapt + - bismark + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/wes_multiqc.yaml b/bin/shared/multiqc/wes_multiqc.yaml new file mode 100644 index 00000000..c916444d --- /dev/null +++ b/bin/shared/multiqc/wes_multiqc.yaml @@ -0,0 +1,30 @@ +title: "Whole Exome Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
+ +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - "_dedup" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/wgs_multiqc.yaml b/bin/shared/multiqc/wgs_multiqc.yaml new file mode 100644 index 00000000..2efc95cf --- /dev/null +++ b/bin/shared/multiqc/wgs_multiqc.yaml @@ -0,0 +1,30 @@ +title: "Whole Genome Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - "_dedup" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/read_group_from_fastq.py b/bin/shared/read_group_from_fastq.py index 702779d3..5d198620 100644 --- a/bin/shared/read_group_from_fastq.py +++ b/bin/shared/read_group_from_fastq.py @@ -39,6 +39,8 @@ def parse_args(): help="Sample is tumor in a tumor/normal pair") parser.add_argument('-n', '--normal', action='store_true', help="Sample is normal in a tumor/normal pair") + parser.add_argument('-s', '--sample_id', dest="sample_id", + 
help="SampleID of file") parser.add_argument('-o', '--output', dest="output_file", help="Output file name [STDOUT]") parser.add_argument('fastq', nargs="+", @@ -115,15 +117,17 @@ def main(): pos = n break if pos == -1: - # Didn't find the GES marker. Use the filename up to the end name. - match = re.search('(.*)[._]R[12]_.*',fn) - if match is not None: - fn = match.group(1) - else: - # something is seriously odd here, but we'll just use the - # whole filename - pass - + if args.sample_id: + fn = args.sample_id + else: + # Didn't find the GES marker. Use the filename up to the end name. + match = re.search('(.*)[._]R[12]_.*',fn) + if match is not None: + fn = match.group(1) + else: + # something is seriously odd here, but we'll just use the + # whole filename + pass cust_id = ges_id = fn else: cust_id = '_'.join(fn_parts[:pos]) diff --git a/config/amplicon.config b/config/amplicon.config new file mode 100644 index 00000000..249272ac --- /dev/null +++ b/config/amplicon.config @@ -0,0 +1,50 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "amplicon" + description = 'Pipeline for Processing xGEN amplicon panel data' + author = 'Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson Copyright Jackson Laboratory 2021' +} + +params { + // Shared params + gen_org = 'human' + extension='.fastq.gz' + pattern="*_R{1,2}*" + read_type = 'PE' // SE + sample_folder = null + concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/amplicon_multiqc.yaml" + + cutadaptMinLength = 20 + cutadaptQualCutoff = 20 + cutadaptAdapterR1 = 'CTGTCTCTTATACACATCTCCGAGCCCACGAGAC' + cutadaptAdapterR2 = 'CTGTCTCTTATACACATCTGACGCTGCCGACGA' + + + ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + mismatch_penalty = "-B 8" 
+ + masterfile = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_masterfile.txt' + + amplicon_primer_intervals = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_primers.interval_list' + amplicon_target_intervals = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_merged_targets.interval_list' + + gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' + phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' + + ploidy_val = '-ploidy 2' // variable in haplotypecaller. not required for amplicon, but present in module. 
+ target_gatk = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_merged_targets.bed' + params.call_val = "50.0" + + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' + dbSNP_index = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' + + tmpdir = "/fastscratch/${USER}" + bwa_min_score = null +} \ No newline at end of file diff --git a/config/atac.config b/config/atac.config index 6d9c49e2..3937214b 100644 --- a/config/atac.config +++ b/config/atac.config @@ -3,7 +3,7 @@ manifest { name = "atac" description = 'Pipeline for ATAC Seq Samples' - author = 'Sai Lek, Copyright Jackson Laboratory 2022' + author = 'Sai Lek, Michael Lloyd, Anuj Srivastava, Copyright Jackson Laboratory 2022' version = "0.1.0" } @@ -13,10 +13,16 @@ manifest { params { // Shared params gen_org = 'mouse' + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // 'SE' concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/atac_multiqc.yaml" // Reference bowtie2Index = '/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bowtie2/Mus_musculus.GRCm38.dna.primary_assembly.fa' @@ -39,7 +45,15 @@ params { if (params.gen_org=='human'){ - // Reference + // Reference + params.genome_build = 'GRCh38' params.bowtie2Index = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bowtie2/hg38_noalt' } +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + // Reference + params.bowtie2Index = '/projects/compsci/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bowtie2/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.effective_genome_size = '2654621783' + +} diff --git a/config/chipseq.config b/config/chipseq.config new file mode 100644 index 00000000..2476d8e0 --- /dev/null +++ 
b/config/chipseq.config @@ -0,0 +1,118 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "chipseq" + description = 'Pipeline for ChIP-Seq Samples. Adapted from: https://nf-co.re/chipseq, which is available under MIT License' + author = 'Sai Lek, Copyright Jackson Laboratory 2022' + version = "0.1.0" +} + + +// Default to Mouse, If gen_org == 'human' parameters are overwritten with values +// in the "Defaults for Human" section below + +params { + // Shared params + gen_org = 'mouse' // human + read_type = 'PE' // 'SE' + genome_build = 'GRCm38' // 'GRCm38' or 'GRCm39' + + // Reference fasta + ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.fa' + ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.primary_assembly.fa' + + // GTF & BED annotation + gtf = '/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.gtf' + gene_bed = '/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.bed' + + // Global default params, used in configs + + // Options: Generic + input = '' + fragment_size = 200 + fingerprint_bins = 500000 + + // MACS2 Effective Genome Size - based on GRCm38 ensembl primary_assembly chroms and MT only + macs_gsize = 2725537669 + + // Blacklist regions: + blacklist = "" + + // Trim-Galore settings. 
+ trimLength = '30' + qualThreshold = '30' + adapOverlap = '1' + adaptorSeq = 'AGATCGGAAGAGC' + + // bwa parameters + mismatch_penalty = "" + bwa_min_score = false + + // samtools merge bam filter parameters + keep_dups = false + keep_multi_map = false + + // bamtools filter + bamtools_filter_pe_config = "$projectDir/bin/shared/bamtools/bamtools_filter_pe.json" + bamtools_filter_se_config = "$projectDir/bin/shared/bamtools/bamtools_filter_se.json" + + // preseq parameters + skip_preseq = false + + // Options: Peaks + narrow_peak = false + broad_cutoff = 0.05 + macs_fdr = false + macs_pvalue = false + min_reps_consensus = 1 + save_macs_pileup = false + skip_peak_qc = false + skip_peak_annotation = false + skip_consensus_peaks = false + + // Options: Differential analysis + deseq2_vst = false + skip_diff_analysis = false + + // MultiQC + multiqc_config = "${projectDir}/bin/shared/multiqc/chipseq.yaml" + + tmpdir = "/fastscratch/${USER}" + extension = null // not used in this workflow + pattern = null // not used in this workflow + concat_lanes = false // not used in this workflow + non_directional = '' // not used in this workflow + +} + +if (params.gen_org=='human'){ + + params.genome_build = 'GRCh38' + + // MACS2 Effective Genome Size - based on GRCh38 GATK assembly chroms and MT only + params.macs_gsize = 3088286401 + + // Reference fasta + params.ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + + // GTF & BED annotation + params.gtf = '/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.GATKchrom.gtf' + params.gene_bed = '/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.GATKchrom.bed' +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + // MACS2 Effective Genome Size - based on GRCm39 
ensembl primary_assembly chroms and MT only + params.macs_gsize = 2723431143 + + // Reference fasta + params.ref_fa = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.ref_fa_indices = '/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bwa/Mus_musculus.GRCm39.dna.primary_assembly.fa' + + // GTF & BED annotation + gtf = '/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.gtf' + gene_bed = '/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.bed' + +} \ No newline at end of file diff --git a/config/pdx_wes.config b/config/pdx_wes.config new file mode 100644 index 00000000..83ae6f88 --- /dev/null +++ b/config/pdx_wes.config @@ -0,0 +1,63 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "pdx_wes" + description = 'Pipeline for Processing PDX Whole Exome Samples' + author = 'Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson Copyright Jackson Laboratory 2021' +} + +// Default to Mouse, If gen_org == 'human' parameters are overwritten with values +// in the "Defaults for Human" section below + +params { + // Shared params + gen_org = 'human' // human + extension='.fastq.gz' + pattern="*_R{1,2}*" + read_type = 'PE' // SE + sample_folder = null + concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/pdx_wes_multiqc.yaml" + + // Reference fasta + ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + + // Quality Stats params + min_pct_hq_reads = '0.0' + hq_pct = '70' + + // Xenome index + xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // WES capture 
array BED and GATK intervals lists + target_gatk = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed' + target_picard = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.picard.interval_list' + bait_picard = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.picard.interval_list' + + // Variant calling parameters + mismatch_penalty = "-B 8" + call_val = "50.0" + + gnomad_ref='/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/af-only-gnomad.hg38.vcf.gz' + pon_ref='/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/1000g_pon.hg38.vcf.gz' + + msisensor_model='/projects/compsci/omics_share/human/GRCh38/supporting_files/msisensor2/models_hg38' + + // VCF annotation + gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' + phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' + dbSNP_index = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' + dbNSFP='/projects/omics_share/human/GRCh38/genome/annotation/function/dbNSFP4.2a.gatk_formatted.txt.gz' + cosmic = '/projects/omics_share/human/GRCh38/genome/annotation/function/COSMICv95_Coding_Noncoding.gatk_formatted.vcf.gz' + cosmic_index = '/projects/omics_share/human/GRCh38/genome/annotation/function/COSMICv95_Coding_Noncoding.gatk_formatted.vcf.gz.tbi' + gen_ver = "hg38" + snpEff_config = '/projects/omics_share/human/GRCh38/genome/indices/snpEff_5_1/snpEff.config' + + tmpdir = "/fastscratch/${USER}" + bwa_min_score = null +} \ No newline at end of file diff --git a/config/profiles/elion.config 
b/config/profiles/elion.config index 3eae133d..dc1c7c05 100644 --- a/config/profiles/elion.config +++ b/config/profiles/elion.config @@ -14,18 +14,16 @@ process { } executor { - $slurm { - queueSize = 250 - // The number of tasks the executor will handle in a parallel manner - submitRateLimit = '1 / 2 s' - // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. - } + name = 'slurm' + // The number of tasks the executor will handle in a parallel manner + queueSize = 100 + submitRateLimit = '1 / 2 s' + // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. } env { NXF_ANSI_SUMMARY = true NXF_ANSI_LOG = true - NXF_DEBUG = 2 } trace { diff --git a/config/profiles/sumner.config b/config/profiles/sumner.config index bce233e0..6135db21 100644 --- a/config/profiles/sumner.config +++ b/config/profiles/sumner.config @@ -14,18 +14,16 @@ process { } executor { - $slurm { - queueSize = 250 - // The number of tasks the executor will handle in a parallel manner - submitRateLimit = '1 / 2 s' - // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. - } + name = 'slurm' + // The number of tasks the executor will handle in a parallel manner + queueSize = 150 + submitRateLimit = '1 / 2 s' + // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. 
} env { NXF_ANSI_SUMMARY = true NXF_ANSI_LOG = true - NXF_DEBUG = 2 } trace { diff --git a/config/pta.config b/config/pta.config new file mode 100644 index 00000000..6ff86dee --- /dev/null +++ b/config/pta.config @@ -0,0 +1,115 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "Patient Tumor Analysis" + description = 'Pipeline for processing of germline and somatic SNP/InDEL and somatic structural variants and copy number alterations.' + author = 'Anuj Srivastava, Michael Lloyd, Brian Sanderson, Harshpreet Chandok, Peter Fields, Copyright Jackson Laboratory 2023' +} + +params { + // PDX sample: + pdx = false + + multiqc_config = "${projectDir}/bin/shared/multiqc/pta_multiqc.yaml" + + // Quality Stats params + + min_pct_hq_reads = '0.0' + hq_pct = '70' + + // Xenome index + xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // Reference fasta + ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + ref_fa_indices = '/projects/compsci/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + ref_fa_dict = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.dict' + combined_reference_set = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/combined_ref_set/Homo_sapiens_assembly38.fasta' // Several tools (GRIDSS, SVABA) requires reference and bwa index files in same directory. Links used within this directory to avoid duplication. See note in directory. + + // BWA params + mismatch_penalty = "-B 8" + + // Known Sites for BQSR + gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' // used in variant recal, and variant tranche recal. GATK resource bundle. 
+ phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' // used in variant recal, and variant tranche recal. GATK resource bundle. + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' // used in annotation, variant recal, variant tranche recal, and by SVABA. + dbSNP_index = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' + + // Chromosome contig lists, used in scatter / gather operations. + chrom_contigs = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.primaryChr.contig_list' // Contig list used for scatter / gather in calling and annotation. + chrom_intervals = '/projects/omics_share/human/GRCh38/genome/annotation/intervals/hg38_calling_intervals/' // Chromosome intervals used for scatter gather in calling. + + + // Germline Haplotypecaller and downstream filtering. + call_val = 50.0 + ploidy_val = "-ploidy 2" + excludeIntervalList = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/hg38_haplotypeCaller_skip.interval_list' // Germline caller exclusion list. + hapmap = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/hapmap_3.3.hg38.vcf.gz' // variant tranche requirement. GATK resource bundle. + omni = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_omni2.5.hg38.vcf.gz' // variant tranche requirement. GATK resource bundle. + + // Somatic SNP/InDEL filtering + pon_bed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/WGS_1000g_GRCh38.pon.bed' // used in snp/indel filtering. + intervalListBed='/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/SureSelect_V6plusCOSMIC.target.GRCh38_full_analysis_set_plus_decoy_hla.interval_list.bed' // used to extract non-exonic regions, to attempt recovery with Lancet calls. 
+ + // Lancet: + lancet_beds_directory = '/projects/omics_share/human/GRCh38/genome/annotation/intervals/lancet_chr_beds/' // Lancet requirement + + // Bicseq2 + mappability_directory = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/mappability' // Bicseq2 requirement. + bicseq2_chromList = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/configs/sampleId.bicseq2.config' // bicseq2 requirement + bicseq2_no_scaling = false + + // Gridss and Gripss (filtering) + germline_filtering_vcf = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/gnomad-and-ALL_GRCh38_sites.20170504.normalized.modified.PASS.vcf.gz' // used in gridss call filtering. + gripss_pon = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/gripss_pon' // gripss requirement + + // Manta + callRegions = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/GRCh38.callregions.bed.gz' // manta requirement. + + // Strelka + strelka_config = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/configs/configureStrelkaSomaticWorkflow.py.ini' // strelka requirement. + + // MSIsensor2 + msisensor_model='/projects/compsci/omics_share/human/GRCh38/supporting_files/msisensor2/models_hg38' // model files for MSIsensor2 + + // Annotations: + // VEP + vep_cache_directory = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/vep_data' // VEP annotation cache. Note this directory contains additional annotation cache files. + vep_fasta = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/ensembl/GRCh38.p13/Homo_sapiens.GRCh38.dna.primary_assembly.fa' // VEP is ensembl based, and requires a separate reference file. + + // Cosmic. 
+ cosmic_cgc = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/function/cancer_gene_census_v97.csv' + cosmic_cancer_resistance_muts = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/function/CosmicResistanceMutations.tsv.gz' + + // Additional somatic annotations + ensembl_entrez='/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh39.p13_ensemblv109_entrez_id_map.csv' // used in somatic vcf finalization. + + // CNV and SV annotations and filtering files. + cytoband = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh38.cytoBand.UCSC.chr.sorted.txt' // used in bicseq2 annotations + dgv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/DGV.GRCh38_hg38_variants_2020-02-25.bed' // used in bicseq2 annotations + thousandG = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1KGP.CNV.GRCh38.canvas.merged.bed' // used in bicseq2 annotations + cosmicUniqueBed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/CosmicCompleteCNA_uniqIntervals.bed' // used in bicseq2 annotations + cancerCensusBed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/cancer_gene_census.GRCh38-v92.bed' // used in bicseq2 annotations and SV annotation. + ensemblUniqueBed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/ensembl_genes_unique_sorted.final.v93.chr.sorted.bed' // used in bicseq2 annotations and SV annotation. + gap = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh38.gap.UCSC.annotated.chr.sorted.bed' // used in SV annotation. + dgvBedpe = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/DGV.GRCh38_hg38_variants_2020-02-25.bedpe' // used in SV annotation. 
+ thousandGVcf = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1KGP.pruned_wAFs.PASS_and_MULTIALLELIC_Mosaic.GRCh38.vcf' // used in SV annotation. + svPon = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1000G-SV-PON.survivor-merged.GRCh38.filtered.bedpe' // used in SV annotation. + cosmicBedPe = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/cosmic-sv-GRCh38-v92.bedpe' // used in SV annotation. + + // NA12878 BAM file. For use in tumor-only processing. + na12878_bam = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/NA12878/NA12878_realigned_BQSR.bam' + na12878_bai = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/NA12878/NA12878_realigned_BQSR.bai' + na12878_sampleName = 'NA12878' + + // General ngs-ops arguments unsed in this workflow. + read_type = 'PE' // Only PE accepted. + gen_org='human' // Only human accepted. + tmpdir = '' + sample_folder = null // not used, csv input required. + extension='' // not used, csv input required. + pattern="" // not used, csv input required. + concat_lanes = false // not used, csv input required. + csv_input = null + bwa_min_score = null +} \ No newline at end of file diff --git a/config/rna_fusion.config b/config/rna_fusion.config new file mode 100644 index 00000000..a5167672 --- /dev/null +++ b/config/rna_fusion.config @@ -0,0 +1,89 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "rna_fusion" + description = 'Pipeline for processing of PDX RNASeq samples to call RNA Fusions, contains xenome step for processing PDX samples' + author = 'Michael Lloyd, Sai Lek, Brian Sanderson Copyright Jackson Laboratory 2022' + version = "0.1.0" +} + +params { + + //Shared params + extension='.fastq.gz' + pattern="*{.,_,-}R{1,2}*" + read_type = 'PE' // PE only supported. 
+ concat_lanes = false + sample_folder = null + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/rna_fusion_multiqc.yaml" + + // Xenome index + xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // READ LENGTH ADJUSTMENTS: + read_length = 150 // change relative to sample being processed. 75, 100, 125, and 150 are supported. + star_index = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/star/star-index-150bp' // change relative to read length. 75, 100, 125, and 150 are supported. + + // GTF Annotation File. + gencode_gtf = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/gencode/gencode.v37.annotation.gtf.revised.custom.gtf' + + // FASTA + fasta = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.all.fa' + + // Arriba Options + arriba_star_args = '--outSAMtype BAM Unsorted \ + --outSAMunmapped Within \ + --outBAMcompression 0 \ + --outFilterMultimapNmax 50 \ + --peOverlapNbasesMin 10 \ + --alignSplicedMateMapLminOverLmate 0.5 \ + --alignSJstitchMismatchNmax 5 -1 5 5 \ + --chimSegmentMin 10 \ + --chimOutType WithinBAM HardClip \ + --chimJunctionOverhangMin 10 \ + --chimScoreDropMax 30 \ + --chimScoreJunctionNonGTAG 0 \ + --chimScoreSeparation 1 \ + --chimSegmentReadGapMax 3 \ + --chimMultimapNmax 50' + + arriba_blacklist = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/blacklist_hg38_GRCh38_v2.4.0.tsv.gz' + arriba_known_fusions = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/known_fusions_hg38_GRCh38_v2.4.0.tsv.gz' + arriba_protein_domains = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/protein_domains_hg38_GRCh38_v2.4.0.gff3' + + // Fusioncatcher Options + fusioncatcher_ref = 
'/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/fusioncatcher/human_v102' + fusioncatcher_limitSjdbInsertNsj = 2000000 + + // Jaffa Options + jaffa_ref_dir = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/jaffa/' + + // Pizzly Options + kallisto_index = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/pizzly/Homo_sapiens.GRCh38.102.cdna.all.kallisto-0.48.0.index' + transcript_fasta = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.cdna.all.fa.gz' + ensembl_gtf = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.gtf' + + // Squid Options + squid_star_args = '--twopassMode Basic \ + --chimOutType SeparateSAMold \ + --chimSegmentMin 20 \ + --chimJunctionOverhangMin 12 \ + --alignSJDBoverhangMin 10 \ + --outReadsUnmapped Fastx \ + --outSAMstrandField intronMotif \ + --outSAMtype BAM SortedByCoordinate' + + //Star-Fusion Options + star_fusion_ref = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/starfusion/ctat_genome_lib_build_dir' + star_fusion_opt = '' + + // Fusion Report Options: + fusion_report_opt = false + + //Fusion-report databases + databases = '/projects/compsci/omics_share/human/GRCh38/supporting_files/rna_fusion_dbs' + +} diff --git a/config/rnaseq.config b/config/rnaseq.config index f406fbb1..f61496e0 100644 --- a/config/rnaseq.config +++ b/config/rnaseq.config @@ -13,52 +13,128 @@ params { //Shared params gen_org='mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // SE concat_lanes = false - read_prep = 'reverse_stranded' // 'reverse_stranded, forward_stranded, or non_stranded' - ref_fa='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa' + download_data = false + csv_input = null + + + pdx = false // if PDX, gen_org == human and 
xenome is run to remove mouse reads from the sample(s). + + multiqc_config = "${projectDir}/bin/shared/multiqc/rnaseq_multiqc.yaml" //Quality Stats params min_pct_hq_reads='0.0' + hq_pct = '70' + + // strand check - used only for QC check, not for mapping. + strandedness_ref = '/projects/compsci/omics_share/mouse/GRCm38/transcriptome/indices/ensembl/v102/kallisto/kallisto_index' + strandedness_gtf = '/projects/compsci/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.gtf' //RSEM params seed_length = '25' rsem_aligner = 'bowtie2' // 'star' - rsem_ref_prefix = 'Mus_musculus.GRCm38.dna.toplevel' + rsem_ref_prefix = 'Mus_musculus.GRCm38.dna.primary_assembly' rsem_ref_files = '/projects/omics_share/mouse/GRCm38/transcriptome/indices/ensembl/v102' - rsem_star_prefix = 'toplevel/GRCm38_100' // 'toplevel/GRCm38_75' or 'toplevel/GRCm38_150' + rsem_star_prefix = 'primary' // 'primary' or 'top_level' - //Picard params - picard_dict='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.dict' + picard_dict='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.dict' - ref_flat='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr_patch_hapl_scaff.refFlat.txt' - ribo_intervals='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr_patch_hapl_scaff.rRNA.interval_list' + ref_flat='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.refFlat.txt' + ribo_intervals='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.rRNA.interval_list' tmpdir = "/fastscratch/${USER}" + } // Defaults for Human if (params.gen_org=='human'){ - params.ref_fa='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.fa' - 
params.ref_fai='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.fa.fai' + params.genome_build = 'GRCh38' //Quality Stats params params.min_pct_hq_reads='0.0' + params.hq_pct = '70' + + // strand check + params.strandedness_ref = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104/kallisto/kallisto_index' + params.strandedness_gtf = '/projects/compsci/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.gtf' // RSEM params.seed_length = '25' params.rsem_aligner = 'bowtie2' // 'star' - params.rsem_ref_prefix = 'Homo_sapiens.GRCh38.dna.toplevel' + params.rsem_ref_prefix = 'Homo_sapiens.GRCh38.dna.primary_assembly' params.rsem_ref_files = '/projects/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104' - params.rsem_star_prefix = 'toplevel/GRCh38_100' // 'toplevel/GRCh38_75' or 'toplevel/GRCh38_150' + params.rsem_star_prefix = 'primary' // 'primary' or 'top_level' // Picard - params.picard_dict='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.dict' - params.ref_flat='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr_patch_hapl_scaff.refFlat.txt' - params.ribo_intervals='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr_patch_hapl_scaff.rRNA.interval_list' + params.picard_dict='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.primary_assembly.dict' + params.ref_flat='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.refFlat.txt' + params.ribo_intervals='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.rRNA.interval_list' + +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + //RSEM params + params.rsem_ref_prefix = 
'Mus_musculus.GRCm39.dna.primary_assembly' + params.rsem_ref_files = '/projects/omics_share/mouse/GRCm39/transcriptome/indices/ensembl/v105' + params.rsem_star_prefix = 'primary' // 'primary' or 'top_level' + + //Picard params + params.picard_dict='/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.dict' + + params.ref_flat='/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.refFlat.txt' + params.ribo_intervals='/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.rRNA.interval_list' + +} + +// Defaults for PDX +if (params.gen_org=='human' && params.pdx){ + + params.rsem_ref_prefix = null // zero out params to avoid accidental collision + params.rsem_ref_files = null // zero out params to avoid accidental collision + params.rsem_star_prefix = null // zero out params to avoid accidental collision + // strand check + params.strandedness_ref = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104/kallisto/kallisto_index' + params.strandedness_gtf = '/projects/compsci/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.gtf' + + //Quality Stats params + params.min_pct_hq_reads='0.0' + + // Xenome + params.xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // General RSEM + params.seed_length = '25' + params.rsem_aligner = 'bowtie2' // 'star' + + // Human RSEM + params.rsem_ref_prefix_human = 'Homo_sapiens.GRCh38.dna.primary_assembly' + params.rsem_ref_files_human = '/projects/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104' + params.rsem_star_prefix_human = 'primary' // 'primary' or 'top_level' + + // Human Picard + params.picard_dict_human='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.primary_assembly.dict' + 
params.ref_flat_human='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.refFlat.txt' + params.ribo_intervals_human='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.rRNA.interval_list' + + // Mouse RSEM + params.rsem_ref_prefix_mouse = 'Mus_musculus.GRCm38.dna.primary_assembly' + params.rsem_ref_files_mouse = '/projects/omics_share/mouse/GRCm38/transcriptome/indices/ensembl/v102' + params.rsem_star_prefix_mouse = 'primary' // 'primary' or 'top_level' + + // Mouse Picard + params.picard_dict_mouse='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.dict' + + params.ref_flat_mouse='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.refFlat.txt' + params.ribo_intervals_mouse='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.rRNA.interval_list' + } \ No newline at end of file diff --git a/config/rrbs.config b/config/rrbs.config index a8322e5a..dd910b90 100644 --- a/config/rrbs.config +++ b/config/rrbs.config @@ -11,10 +11,16 @@ manifest { params { //Shared params gen_org='mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // SE concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/rrbs_multiqc.yaml" // Trimming & Bismark Setting non_directional = true @@ -46,6 +52,8 @@ params { if (params.gen_org=='human'){ + params.genome_build = 'GRCh38' + // Trimming & Bismark Setting params.non_directional = true @@ -69,4 +77,11 @@ if (params.gen_org=='human'){ // Bismark Methylation extraction settings. params.cytosine_report = false params.comprehensive = true +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + // Bismark Mapping settings. 
+ params.ref_fa_index = '/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bismark/bowtie2' + } \ No newline at end of file diff --git a/config/wes.config b/config/wes.config index b955a3aa..dc83c45c 100644 --- a/config/wes.config +++ b/config/wes.config @@ -12,23 +12,32 @@ manifest { params { // Shared params gen_org = 'mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // SE concat_lanes = false - + download_data = false + csv_input = null - // Reference fasta - ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa' - ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.toplevel.fa' + multiqc_config = "${projectDir}/bin/shared/multiqc/wes_multiqc.yaml" + // Reference fasta + ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.fa' + ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.primary_assembly.fa' + // Quality Stats params min_pct_hq_reads = '0.0' + hq_pct = '70' + + // GVCF + run_gvcf = false // WES capture array BED and GATK intervals lists target_gatk = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.bed' - target_picard = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.interval_list' - bait_picard = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.interval_list' + target_picard = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.primary_assembly.interval_list' + bait_picard = 
'/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.primary_assembly.interval_list' // Variant calling parameters mismatch_penalty = "-B 8" @@ -37,21 +46,27 @@ params { // VCF annotation dbSNP = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz' + dbSNP_index = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz.tbi' + gen_ver = "GRCm38.99" snpEff_config = "/projects/omics_share/mouse/GRCm38/genome/indices/snpEff_5_1/snpEff.config" tmpdir = "/fastscratch/${USER}" + bwa_min_score = null } -// Defaults for Human (Default HG38 PE) Should we have a switch for other versions? +// Defaults for Human (Default HG38 PE) if (params.gen_org=='human'){ + params.genome_build = 'GRCh38' + // Reference fasta params.ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' - params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta.64' + params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' // Quality Stats params params.min_pct_hq_reads = '0.0' + params.hq_pct = '70' // WES capture array BED and GATK intervals lists params.target_gatk = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed' @@ -64,7 +79,6 @@ if (params.gen_org=='human'){ params.ploidy_val = "-ploidy 2" // VCF annotation - // These gold standard snp & indel files are bgzipped and tabixed, but old versions were not. Is this an issue? 
params.gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' params.phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' params.dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' @@ -75,4 +89,26 @@ if (params.gen_org=='human'){ params.gen_ver = "hg38" params.snpEff_config = '/projects/omics_share/human/GRCh38/genome/indices/snpEff_5_1/snpEff.config' +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + // Reference fasta + params.ref_fa = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.ref_fa_indices = '/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bwa/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.chrom_contigs = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.primaryChr.contig_list' + + // WES capture array BED and GATK intervals lists + params.target_gatk = '/projects/omics_share/mouse/GRCm39/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.mm39.bare.bed' + params.target_picard = '/projects/omics_share/mouse/GRCm39/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.mm39.picard.interval_list' + params.bait_picard = '/projects/omics_share/mouse/GRCm39/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.mm39.picard.interval_list' + + // VCF annotation + params.dbSNP = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz' + params.dbSNP_index = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz.tbi' + params.gen_ver = 'GRCm39.105' + params.snpEff_config = '/projects/omics_share/mouse/GRCm39/genome/indices/snpEff_5_1d/snpEff.config' + params.comment 
= 'This script will run whole genome sequencing on mouse samples using default GRCm39' + } \ No newline at end of file diff --git a/config/wgs.config b/config/wgs.config index 070e32b4..aa5b4cfc 100644 --- a/config/wgs.config +++ b/config/wgs.config @@ -12,22 +12,33 @@ manifest { params { // Shared params gen_org = 'mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" read_type = 'PE' // SE + sample_folder = null concat_lanes = false - + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/wgs_multiqc.yaml" + // Reference fasta - ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa' - ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.toplevel.fa' - chrom_contigs = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.primaryChr.contig_list' + ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.fa' + ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.primary_assembly.fa' + chrom_contigs = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.primaryChr.contig_list' // Quality Stats params min_pct_hq_reads = '0.0' + hq_pct = '70' + // GVCF + run_gvcf = false + // VCF annotation gen_ver = "GRCm38.99" dbSNP = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz' + dbSNP_index = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz.tbi' snpEff_config = '/projects/omics_share/mouse/GRCm38/genome/indices/snpEff_5_1/snpEff.config' // Variant calling parameters @@ -35,14 +46,17 @@ params { ploidy_val = "-ploidy 2" call_val = "50.0" - tmpdir = "/fastscratch/${USER}" + tmpdir = 
"/fastscratch/${USER}" + bwa_min_score = null } if (params.gen_org=='human'){ + params.genome_build = 'GRCh38' + // Reference fasta params.ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' - params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta.64' + params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' params.chrom_contigs = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.primaryChr.contig_list' // Variant calling parameters @@ -60,4 +74,21 @@ if (params.gen_org=='human'){ params.cosmic_index = '/projects/omics_share/human/GRCh38/genome/annotation/function/COSMICv95_Coding_Noncoding.gatk_formatted.vcf.gz.tbi' params.gen_ver = "hg38" params.snpEff_config = '/projects/omics_share/human/GRCh38/genome/indices/snpEff_5_1/snpEff.config' +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + // Reference fasta + params.ref_fa = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.ref_fa_indices = '/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bwa/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.chrom_contigs = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.primaryChr.contig_list' + + // VCF annotation + params.dbSNP = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz' + params.dbSNP_index = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz.tbi' + params.gen_ver = 'GRCm39.105' + params.snpEff_config = '/projects/omics_share/mouse/GRCm39/genome/indices/snpEff_5_1d/snpEff.config' + params.comment = 'This script will run whole genome sequencing on mouse samples using default GRCm39' + } \ No newline at end of 
file diff --git a/lib/Logos.groovy b/lib/Logos.groovy new file mode 100644 index 00000000..448063f0 --- /dev/null +++ b/lib/Logos.groovy @@ -0,0 +1,88 @@ +class Colors { + final non = "\033[0m" + final dim = "\033[2m" + final blk = "\033[0;30m" + final grn = "\033[0;32m" + final ylw = "\033[0;33m" + final blu = "\033[0;34m" + final pur = "\033[0;35m" + final cyn = "\033[0;36m" + final wht = "\033[0;37m" +} + +class Logo { + def c = new Colors() + + def logColors() { + // Mod Log colors if not ANSI + final mono = !env.NXF_ANSI_LOG + //final mono = true + if ( mono ) { + for (k in c.keySet()) { + c[k] = '' + } + } + } + + private def frameLogo(String logo, String color=c.blu) { + // return framed lines of passed logo text + def logos = logo.split('[\n\r]') + int maxLen = ( logos.collect{ it.length() } ).max() + 4 + def frameit = { it * maxLen } + // def bar = "${c.dim}.${c.non}" + def bar = "" + def logoLines = '' + for ( line in logos ) { + (line =~ /\S/) && \ + (logoLines += "${bar}${color} ${line} ${c.non}${bar}\n") + } + logos = "${c.dim}.${frameit('.')}.${c.non}\n" \ + + logoLines \ + + "${bar}${c.dim}${frameit('.')}${bar}" + logos = logoLines + return logos.stripIndent() + } + + public def show(String logo=this.logo, String clr=c.non) { + // return framed lines of chosen org logo in chosen color + frameLogo(logo, clr) + } + +/* +ASCII Name Art Options: http://patorjk.com/software/taag/ +*/ + +static def logo = this.logo_jaxngsops_cyber + +static def logo_jaxgm_ansi_regular = $/ + ██ █████ ██ ██ ██████ ███████ ███ ██ ██████ ███████ ██████ ██████ ███████ + ██ ██ ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██ ██ ██ + ██ ███████ ███ █████ ██ ███████ █████ ██ ██ ██ ██ ███ ███████ █████ ██ ██ ██████ ███████ +██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ + █████ ██ ██ ██ ██ ██████ ███████ ██ ████ ██████ ███████ ██████ ██ ███████ +/$ + +static def logo_jaxcsngsops_big = $/ + _ _ __ __ ____ ____ _ _ ____ ____ ___ ____ ____ + | | / \ \ \/ / / ___/ ___| | \ | |/ ___/ ___| 
/ _ \| _ \/ ___| + _ | |/ _ \ \ /_____| | \___ \ _____| \| | | _\___ \ _____| | | | |_) \___ \ +| |_| / ___ \ / |_____| |___ ___) |_____| |\ | |_| |___) |_____| |_| | __/ ___) | + \___/_/ \_/_/\_\ \____|____/ |_| \_|\____|____/ \___/|_| |____/ +/$ + +static def logo_jaxngsops_mini = $/ + _ __ __ __ _ _ __ + | /\ \/ __ / (_ __ |\ | /__ (_ __ / \ |_) (_ + \_| /--\ /\ \_ __) | \| \_| __) \_/ | __) +/$ + + +static def logo_jaxngsops_cyber = $/ +_____ _______ _ _ _______ _______ __ _ ______ _______ _____ _____ _______ + | |_____| \___/ ___ | |______ ___ | \ | | ____ |______ ___ | | |_____| |______ +__| | | _/ \_ |_____ ______| | \_| |_____| ______| |_____| | ______| +/$ + + + +} diff --git a/main.nf b/main.nf index b802a15c..b4a86776 100644 --- a/main.nf +++ b/main.nf @@ -6,19 +6,36 @@ nextflow.enable.dsl=2 if (params.workflow == "rnaseq"){ include {RNASEQ} from './workflows/rnaseq' } -if (params.workflow == "wes"){ +else if (params.workflow == "wes"){ include {WES} from './workflows/wes' } -if (params.workflow == "wgs"){ +else if (params.workflow == "pdx_wes"){ + include {PDX_WES} from './workflows/pdx_wes' +} +else if (params.workflow == "wgs"){ include {WGS} from './workflows/wgs' } -if (params.workflow == "rrbs"){ +else if (params.workflow == "rrbs"){ include {RRBS} from './workflows/rrbs' } -if (params.workflow == "atac"){ +else if (params.workflow == "atac"){ include {ATAC} from './workflows/atac' } -// conditional to kick off appropriate workflow +else if (params.workflow == "chipseq"){ + include {CHIPSEQ} from './workflows/chipseq' +} +else if (params.workflow == "pta"){ + include {PTA} from './workflows/pta' +} +else if (params.workflow == "rna_fusion"){ + include {RNA_FUSION} from './workflows/rna_fusion' +} +else { + // if workflow name is not supported: + exit 1, "ERROR: No valid pipeline called. '--workflow ${params.workflow}' is not a valid workflow name." 
+} + +// conditional to launch appropriate workflow workflow{ if (params.workflow == "rnaseq"){ RNASEQ() @@ -26,6 +43,9 @@ workflow{ if (params.workflow == "wes"){ WES() } + if (params.workflow == "pdx_wes"){ + PDX_WES() + } if (params.workflow == "wgs"){ WGS() } @@ -35,4 +55,13 @@ workflow{ if (params.workflow == "atac"){ ATAC() } + if (params.workflow == "chipseq"){ + CHIPSEQ() + } + if (params.workflow == "pta"){ + PTA() + } + if (params.workflow == "rna_fusion"){ + RNA_FUSION() + } } diff --git a/modules/arriba/arriba.nf b/modules/arriba/arriba.nf new file mode 100644 index 00000000..0a93548d --- /dev/null +++ b/modules/arriba/arriba.nf @@ -0,0 +1,43 @@ +process ARRIBA { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/arriba:2.4.0--ha04fe3b_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions' : 'arriba' }", pattern: "*.{tsv,txt}", mode:'copy' + + input: + tuple val(sampleID), path(bam), path(bai) + path(gtf) + + output: + tuple val(sampleID), path("*_arriba_fusions.tsv"), emit: arriba_fusions + tuple val(sampleID), path("*_arriba_fusions_discarded.tsv"), emit: arriba_fusions_fail + + script: + + """ + arriba \\ + -x ${bam} \\ + -a ${params.fasta} \\ + -g ${gtf} \\ + -o ${sampleID}_arriba_fusions.tsv \\ + -O ${sampleID}_arriba_fusions_discarded.tsv \\ + -b ${params.arriba_blacklist} \\ + -k ${params.arriba_known_fusions} \\ + -t ${params.arriba_known_fusions} \\ + -p ${params.arriba_protein_domains} + """ +} + +/* +From the documentation: + Note: In this execution, the same file is passed to the parameters -k and -t, because it is used for two purposes: + applying sensitive filtering parameters to known fusions (-k) and tagging known fusions in the tags column (-t). + However, it is possible to use different files for these two parameters if a user wants to separate the two tasks. +*/ \ No newline at end of file diff --git a/modules/bamtools/bamtools_filter.nf b/modules/bamtools/bamtools_filter.nf new file mode 100644 index 00000000..a0a34ccf --- /dev/null +++ b/modules/bamtools/bamtools_filter.nf @@ -0,0 +1,25 @@ +process BAMTOOLS_FILTER { + + tag "$sampleID" + + cpus 1 + memory 8.GB + time '12:00:00' + + container 'quay.io/biocontainers/bamtools:2.5.1--h9a82719_9' + + + input: + tuple val(sampleID), file(bam) + file(bamtools_filter_config) + + output: + tuple val(sampleID), file("*.sorted.bam"), emit: bam + + script: + prefix = params.read_type == 'SE' ? 
"${sampleID}.mLb.clN" : "${sampleID}.mLb.flT" + """ + bamtools filter -in ${bam} -script ${bamtools_filter_config} -out ${prefix}.sorted.bam + """ + +} diff --git a/modules/bamtools/bamtools_stats.nf b/modules/bamtools/bamtools_stats.nf index 088827ff..d2ef0376 100644 --- a/modules/bamtools/bamtools_stats.nf +++ b/modules/bamtools/bamtools_stats.nf @@ -5,7 +5,8 @@ process BAMTOOLS_STATS { cpus 1 memory 8.GB time '12:00:00' - + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + container 'quay.io/biocontainers/bamtools:2.5.1--h9a82719_9' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'quality_stats' }", pattern:"*.txt", mode:'copy' @@ -17,7 +18,6 @@ process BAMTOOLS_STATS { tuple val(sampleID), file("*metrics.txt"), emit: picard_metrics script: - log.info "----- Bamtools Stats Running on: ${sampleID} -----" if (params.read_type == "PE") """ diff --git a/modules/bcftools/bcftools_germline_filter.nf b/modules/bcftools/bcftools_germline_filter.nf new file mode 100644 index 00000000..e888ec2e --- /dev/null +++ b/modules/bcftools/bcftools_germline_filter.nf @@ -0,0 +1,137 @@ +process BCFTOOLS_GERMLINE_FILTER { + // This modules is a port of the NYGC germline filtering scheme found at this site: + // https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/germline/germline.wdl?at=7.4.0 + + tag "$sampleID" + + cpus = 1 + memory = 2.GB + time = '00:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'bedtools' }", pattern: "*haplotypecaller.gatk.filtered.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*haplotypecaller.gatk.filtered.vcf.gz"), file("*haplotypecaller.gatk.filtered.vcf.gz.tbi"), emit: vcf_idx + + + // NOTE: These are hard coded to resources provided at: https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/config/fasta_references.json + // Many of the files used in the filtering here are used again by VEP. Therefore, the reosource sets were combined to reduce the number of params. + + script: + """ + bgzip ${vcf} + tabix -p vcf ${vcf}.gz + + ## Remove existing AF annotations from merged VCF + bcftools annotate \ + -x INFO/AF \ + -Oz \ + ${vcf}.gz \ + > noaf.vcf.gz + + tabix -p vcf noaf.vcf.gz + + ## Annotate with NYGC AF for filtering + bcftools annotate \ + --annotations ${params.vep_cache_directory}/annotations/04142020_NYGC_samples.vcf.gz \ + --columns 'INFO/AF,INFO/AC_Hom' \ + -Oz \ + noaf.vcf.gz \ + > ${sampleID}.final.annotated.vcf.gz + + tabix -p vcf ${sampleID}.final.annotated.vcf.gz + + ## filter variants >3% AF and >10 Homozygotes in NYGC vars + bcftools filter \ + --exclude 'INFO/AF[*] > 0.03 || INFO/AC_Hom[*] > 10' \ + ${sampleID}.final.annotated.vcf.gz \ + > ${sampleID}.pop.filtered.vcf + + bgzip ${sampleID}.pop.filtered.vcf + tabix -p vcf ${sampleID}.pop.filtered.vcf.gz + + ## select whitelist variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/vep_whitelist_38.20201118.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.whitelist.filtered.vcf.gz + + tabix -p vcf 
${sampleID}.whitelist.filtered.vcf.gz + + ## select pgx variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/pgx_vep_hg38.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.pgx.filtered.vcf.gz + + tabix -p vcf ${sampleID}.pgx.filtered.vcf.gz + + ## select chd whitelist variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/chd_whitelist.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.chdwhitelist.filtered.vcf.gz + + tabix -p vcf ${sampleID}.chdwhitelist.filtered.vcf.gz + + ## select rwgs pgx variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/rWGS_PGx.bed.gz \ + ${vcf}.gz \ + > ${sampleID}.rwgspgx.filtered.vcf.gz + + tabix -p vcf ${sampleID}.rwgspgx.filtered.vcf.gz + + ## Select deep intronics + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/deep_intronic_whitelist_08132020.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.deep_intronics.filtered.vcf.gz + + tabix -p vcf ${sampleID}.deep_intronics.filtered.vcf.gz + + ## Select clinvar intronics + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/clinvar_deep_intronics_09012020.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.clinvar_intronics.filtered.vcf.gz + + tabix -p vcf ${sampleID}.clinvar_intronics.filtered.vcf.gz + + bcftools query -l ${vcf}.gz > samples.txt + + ## merge all filtered files for further processing + bcftools concat \ + -a \ + -d all \ + ${sampleID}.pop.filtered.vcf.gz \ + ${sampleID}.whitelist.filtered.vcf.gz \ + ${sampleID}.pgx.filtered.vcf.gz \ + ${sampleID}.chdwhitelist.filtered.vcf.gz \ + ${sampleID}.rwgspgx.filtered.vcf.gz \ + ${sampleID}.deep_intronics.filtered.vcf.gz \ + ${sampleID}.clinvar_intronics.filtered.vcf.gz \ + | \ + bcftools view \ + -i 'GT[@samples.txt]="alt"' \ + | \ + bcftools sort \ + -Oz \ + > ${sampleID}_haplotypecaller.gatk.filtered.vcf.gz + + tabix -p vcf ${sampleID}_haplotypecaller.gatk.filtered.vcf.gz + + """ +} \ No newline at end of file diff --git 
a/modules/bcftools/bcftools_intersect_lancet_candidates.nf b/modules/bcftools/bcftools_intersect_lancet_candidates.nf new file mode 100644 index 00000000..d6fdd1c2 --- /dev/null +++ b/modules/bcftools/bcftools_intersect_lancet_candidates.nf @@ -0,0 +1,32 @@ +process BCFTOOLS_INTERSECTVCFS { + tag "$sampleID" + + cpus = 8 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(candidate_vcf), file(candidate_tbi), file(lancet_confirm_vcf), file(lancet_confirm_tbi), val(meta), val(normal_name), val(tumor_name), val(chrom) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.tbi"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + """ + bcftools \ + isec \ + -w 1 \ + -c none \ + -n =2 \ + --threads ${task.cpus} \ + ${lancet_confirm_vcf} \ + ${candidate_vcf} \ + > ${sampleID}_confirmed_lancet_merged_${chrom}.vcf + + bgzip ${sampleID}_confirmed_lancet_merged_${chrom}.vcf + tabix ${sampleID}_confirmed_lancet_merged_${chrom}.vcf.gz + """ +} diff --git a/modules/bcftools/bcftools_merge_callers.nf b/modules/bcftools/bcftools_merge_callers.nf new file mode 100644 index 00000000..d2fd237c --- /dev/null +++ b/modules/bcftools/bcftools_merge_callers.nf @@ -0,0 +1,61 @@ +process BCFTOOLS_MERGECALLERS { + tag "$sampleID" + + cpus = 8 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf), file(idx), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + + """ + bcftools \ + merge \ + -r ${chrom} \ + --force-samples \ + --no-version \ + --threads ${task.cpus} \ + -f PASS,SUPPORT \ + -F x \ + -m none \ + -o ${sampleID}_mergedCallers_${chrom}.vcf \ + -i called_by:join,num_callers:sum,MNV_ID:join,supported_by:join \ + ${vcf} +""" +} + + +/* + +About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file. + Note that only records from different files can be merged, never from the same file. For + "vertical" merge take a look at "bcftools norm" instead. +Usage: bcftools merge [options] <A.vcf.gz> <B.vcf.gz> [...] + +Options: + --force-samples resolve duplicate sample names + --print-header print only the merged header and exit + --use-header <file> use the provided header + -0 --missing-to-ref assume genotypes at missing sites are 0/0 + -f, --apply-filters <list> require at least one of the listed FILTER strings (e.g. "PASS,.") + -F, --filter-logic <x|+> remove filters if some input is PASS ("x"), or apply all filters ("+") [+] + -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. 
Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max + -i, --info-rules <tag:method,..> rules for merging INFO fields (method is one of sum,avg,min,max,join) or "-" to turn off the default [DP:sum,DP4:sum] + -l, --file-list <file> read file names from the file + -m, --merge <string> allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both] + --no-version do not append version and command line to the header + -o, --output <file> write output to a file [standard output] + -O, --output-type <b|u|z|v> 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v] + -r, --regions <region> restrict to comma-separated list of regions + -R, --regions-file <file> restrict to regions listed in a file + --threads <int> number of extra output compression threads [0] + +*/ \ No newline at end of file diff --git a/modules/bcftools/bcftools_remove_spanning.nf b/modules/bcftools/bcftools_remove_spanning.nf new file mode 100644 index 00000000..5ab088d0 --- /dev/null +++ b/modules/bcftools/bcftools_remove_spanning.nf @@ -0,0 +1,28 @@ +process BCFTOOLS_REMOVESPANNING { + tag "$sampleID" + + cpus = 4 + memory = 2.GB + time = '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + + script: + + """ + bcftools \ + view \ + --exclude 'ALT="*"' \ + --threads ${task.cpus} \ + -o ${sampleID}_nospanning_calls.vcf \ + ${vcf} + """ + +} \ No newline at end of file diff --git a/modules/bcftools/bcftools_sort.nf b/modules/bcftools/bcftools_sort.nf index 371d9de7..2912d493 100644 --- a/modules/bcftools/bcftools_sort.nf +++ b/modules/bcftools/bcftools_sort.nf @@ -1,9 +1,10 @@ -process BCF_SORT { +process BCFTOOLS_SORT { tag "$sampleID" cpus = 1 memory = 6.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' @@ -14,7 +15,6 @@ process BCF_SORT { tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- BCFTools Sort Running on: ${sampleID} -----" """ bcftools sort -o ${sampleID}_only_${indel_snp}.vcf ${vcf} diff --git a/modules/bcftools/bcftools_split_multiallelic.nf b/modules/bcftools/bcftools_split_multiallelic.nf new file mode 100644 index 00000000..b5872c3f --- /dev/null +++ b/modules/bcftools/bcftools_split_multiallelic.nf @@ -0,0 +1,30 @@ +process BCFTOOLS_SPLITMULTIALLELIC { + tag "$sampleID" + + cpus = 8 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf), file(tbi), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: vcf + + script: + output_name = vcf.getBaseName().replace('.vcf', '') + """ + bcftools \ + norm \ + -m \ + -any \ + --threads ${task.cpus} \ + --no-version \ + -f ${params.ref_fa} \ + -o ${output_name}_multiAllelicSplit.vcf \ + ${vcf} +""" +} \ No newline at end of file diff --git a/modules/bcftools/bcftools_split_multiallelic_regions.nf b/modules/bcftools/bcftools_split_multiallelic_regions.nf new file mode 100644 index 00000000..54c6e323 --- /dev/null +++ b/modules/bcftools/bcftools_split_multiallelic_regions.nf @@ -0,0 +1,42 @@ +process BCFTOOLS_SPLITMULTIALLELIC_REGIONS { + tag "$sampleID" + + cpus = 4 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf), file(index) + val(chrom_list) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), emit: vcf_idx + + script: + + listOfChroms = chrom_list.collect { "$it" }.join(',') + + """ + bcftools \ + norm \ + -m \ + -any \ + --threads ${task.cpus} \ + --regions ${listOfChroms} \ + --no-version \ + -f ${params.ref_fa} \ + -o ${sampleID}_split.vcf \ + ${vcf} + + bgzip \ + -c \ + ${sampleID}_split.vcf > ${sampleID}_split.vcf.gz + + tabix ${sampleID}_split.vcf.gz + + """ + +} \ No newline at end of file diff --git a/modules/bedtools/bedtools_amplicon_metrics.nf b/modules/bedtools/bedtools_amplicon_metrics.nf new file mode 100644 index 00000000..60a2bd6e --- /dev/null +++ b/modules/bedtools/bedtools_amplicon_metrics.nf @@ -0,0 +1,45 @@ +process TARGET_COVERAGE_METRICS { + tag "$sampleID" + + cpus 4 + memory 20.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bedtools' }", pattern: "*coverage_metrics.txt", mode: 'copy' + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' // note: version difference over other bedtools modules. The 2.23.0 container was failing to parse bed target file. 
+ + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*coverage_metrics.txt"), emit: qc_metrics + + shell: + ''' + ### total bases (B) that map/align to the on-target (OT) region + bases_on_target=$(coverageBed -a !{params.target_gatk} -b !{bam} | awk '{if($7>0) total+=$7}END{print total}') + + ### Total length covered by BAM alignment. + total_bases_covered=$(genomeCoverageBed -ibam !{bam} -bg | awk '{if($4>0) total += ($3-$2)}END{print total}') + + ## on-target percent = Bot / Btot. A BEGIN-only awk program reads no input, so stdout must be redirected to create the metrics file that the later append (>>) extends. + awk -v a="$bases_on_target" -v b="$total_bases_covered" 'BEGIN { printf "on_target_percent\\t%s\\n", (a/b)*100 }' > !{sampleID}_amplicon_coverage_metrics.txt + + ## Get the depth of 20% of the Average coverage + perc_mean=$(coverageBed -d -a !{params.target_gatk} -b !{bam} | awk '{if($7>0) total+=1;s+=$7}END{print (s/total)*.2}') + + ### total capture array bases + total_target_bases=$(awk -F'\\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}' !{params.target_gatk}) + + ### compute total bases that exceed 20% coverage in capture target region, and calculated coverage uniformity + coverageBed -d -a !{params.target_gatk} -b !{bam} | awk -v percmean=$perc_mean -v totalbases=$total_target_bases '{if($7>percmean) total+=1;s+=$7}END{print "coverage_uniformity\\t"(total/totalbases)*100}' >> !{sampleID}_amplicon_coverage_metrics.txt + + ''' +} + + +/* +Calculations Per: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/application-note/primerclip-a-tool-for-trimming-primer-sequences-application-note.pdf?sfvrsn=cf83e107_14 +*/ \ No newline at end of file diff --git a/modules/bedtools/bedtools_calc_pbc_metrics.nf b/modules/bedtools/bedtools_calc_pbc_metrics.nf index f16a81d5..6d046b99 100644 --- a/modules/bedtools/bedtools_calc_pbc_metrics.nf +++ b/modules/bedtools/bedtools_calc_pbc_metrics.nf @@ -4,6 +4,7 @@ process CALC_PBC_METRICS { cpus 4 memory 20.GB time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bedtools' }", pattern: "*.pbc.qc", mode: 'copy' container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' @@ -15,7 +16,6 @@ process CALC_PBC_METRICS { tuple val(sampleID), file("*.pbc.qc") shell: - log.info "----- Calculate PBC Metrics on ${sampleID} -----" ''' { # try @@ -24,9 +24,9 @@ process CALC_PBC_METRICS { -i !{tmp_bams[0]} \ | awk 'BEGIN{OFS="\\t"}{print $1,$2,$4,$6,$9,$10}' \ | grep -v 'MT' | sort | uniq -c \ - | awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} \ + | awk 'BEGIN{mt=0;m0=0;m1=0;m2=0;sample=!{sampleID}}($1==1){m1=m1+1} \ ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} \ - END{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n", mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}' \ + END{printf "SAMPLEID\\tMT\\tM0\\tM1\\tM2\\tNRF\\tPBC1\\tPBC2\\n!{sampleID}\\t%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}' \ > !{sampleID}.pbc.qc } || { # catch diff --git a/modules/bedtools/bedtools_feature_count2bed.nf b/modules/bedtools/bedtools_feature_count2bed.nf index 6ae39a51..9baeef91 100644 --- a/modules/bedtools/bedtools_feature_count2bed.nf +++ b/modules/bedtools/bedtools_feature_count2bed.nf @@ -4,7 +4,7 @@ process FEATURE_COUNT2BED { cpus 1 memory 4.GB time = '04:00:00' - + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'bash-utils' }", pattern: "*_peaks_countMatrix.mm10.bed", mode: 'copy' container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' @@ -16,7 +16,6 @@ process FEATURE_COUNT2BED { tuple val(sampleID), file("*_peaks_countMatrix.mm10.bed") shell: - log.info "----- Feature Count to Bed on ${sampleID} -----" ''' tail -n +3 !{peak_cnt_matrx} \ | awk -F $'\\t' 'BEGIN {OFS = FS} { print $2, $3, $4, $7, $6 }' \ diff --git a/modules/bedtools/bedtools_frip_reads_in_peaks.nf b/modules/bedtools/bedtools_frip_reads_in_peaks.nf index 6759bec2..79cb27b1 100644 --- a/modules/bedtools/bedtools_frip_reads_in_peaks.nf +++ b/modules/bedtools/bedtools_frip_reads_in_peaks.nf @@ -4,6 +4,7 @@ process FRIP_READS_IN_PEAKS { cpus 2 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' @@ -14,7 +15,6 @@ process FRIP_READS_IN_PEAKS { tuple val(sampleID), file("reads_in_peaks.tmp.ba*") script: - log.info "----- Fraction of reads in peaks (FRiP) on ${sampleID} -----" """ bedtools sort \ -i ${narrow_peaks} \ diff --git a/modules/bedtools/bedtools_genomecov.nf b/modules/bedtools/bedtools_genomecov.nf new file mode 100644 index 00000000..fbf99ee4 --- /dev/null +++ b/modules/bedtools/bedtools_genomecov.nf @@ -0,0 +1,35 @@ +process BEDTOOLS_GENOMECOV { + tag "$sampleID" + + cpus 2 + memory 4.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples' : 'immuno_precip_samples') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+'/'+sampleID+'/bigwig' : 'bedtools'}" + }, pattern: "*.txt", mode: 'copy' + + + container 'quay.io/jaxcompsci/bedtools-sv_refs:2.30.0--hc088bd4_0' + + + input: + tuple val(sampleID), path(bam), path(flagstat) + + output: + tuple val(sampleID), path("*.bedGraph"), emit: bedgraph + tuple val(sampleID), path("*.txt"), emit: scale_factor + + + script: + pe_fragment = params.read_type == 'SE' ? '' : '-pc' + extend = (params.read_type == 'SE' && params.fragment_size > 0) ? "-fs ${params.fragment_size}" : '' + """ + SCALE_FACTOR=\$(grep '[0-9] mapped (' $flagstat | awk '{print 1000000/\$1}') + echo \$SCALE_FACTOR > ${sampleID}.scale_factor.txt + + bedtools genomecov -ibam ${bam[0]} -bg -scale \$SCALE_FACTOR $pe_fragment $extend | sort -T '.' -k1,1 -k2,2n > ${sampleID}.bedGraph + """ + +} diff --git a/modules/bedtools/bedtools_start_candidates.nf b/modules/bedtools/bedtools_start_candidates.nf new file mode 100644 index 00000000..82a90606 --- /dev/null +++ b/modules/bedtools/bedtools_start_candidates.nf @@ -0,0 +1,28 @@ +process BEDTOOLS_STARTCANDIDATES { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + + """ + bedtools \ + intersect \ + -header \ + -a ${vcf} \ + -b ${params.intervalListBed} \ + -v \ + > ${sampleID}_startCand_merged_${chrom}.vcf +""" +} \ No newline at end of file diff --git a/modules/biqseq2/bicseq2_normalize.nf b/modules/biqseq2/bicseq2_normalize.nf new file mode 100644 index 00000000..718e4b90 --- /dev/null +++ b/modules/biqseq2/bicseq2_normalize.nf @@ -0,0 +1,102 @@ +process BICSEQ2_NORMALIZE { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v3' + // publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'biqseq2' }", pattern:".txt", mode:'copy' + + input: + tuple val(sampleID), path(individual_chr_seq_files), val(meta), val(read_ID), val(read_length), val(insert_size) + val(fasta_file_list) + + output: + tuple val(sampleID), path("*.norm.bin.txt"), val(meta), val(read_ID), emit: normalized_output + + script: + + // fasta and mappability are set file lists. mappability is set by by read length of sample. + // tempSeqs are the .seq files from the prior step. + // tempnormpaths are the output bins. + // bicseq2config file is a file with just the list of chroms. 
+ // sampleID is sampleID + // out-file is the configuration file used in the next step. + + // `bicseq2_config_writer` will sort lists by chromosome name, and omit invalid chr names. + // Chromosome names in file names must have `chr` in the name. OR the bicseq2config file must be changed to exclude it. + + fasta_files = fasta_file_list.collect { "$it" }.join(' ') + + if( read_length == '100' || read_length == '101') { + mappability_path = params.mappability_directory + '/100' + } else if( read_length == '125') { + mappability_path = params.mappability_directory + '/125' + } else if( read_length == '150' || read_length == '151' ) { + mappability_path = params.mappability_directory + '/151' + } else if( read_length == '250') { + mappability_path = params.mappability_directory + '/250' + } else { + log.info("\nUnsupported read length " + read_length + " in BicSeq2 normalization. This step is about to fail gracefully.\n\n") + mappability_path = 'error' + } + + seq_file_list = individual_chr_seq_files.collect { "$it" }.join(' ') + + + """ + if [ "${mappability_path}" = "error" ]; then exit 1; fi + + mappability_file_list=`echo ${mappability_path}` + + python3 \ + ${projectDir}/bin/pta/bicseq2_config_writer.py \ + --fa-files ${fasta_files} \ + --mappability-directory ${mappability_path} \ + --temp-seqs ${seq_file_list} \ + --norm-bicseq2-config ${params.bicseq2_chromList} \ + --sample-id ${read_ID} \ + --out-file configuration_file.txt + + rounded_length=`echo ${insert_size} | awk '{print int(\$1+0.5)}'` + + /NBICseq-norm_v0.2.4/NBICseq-norm.pl \ + -l=${read_length} \ + -s=\${rounded_length} \ + -fig=${sampleID}.GCvsRD.pdf \ + -tmp=${sampleID}.tmp \ + configuration_file.txt \ + ${sampleID}.params.out + """ + + stub: + """ + touch ${read_ID}_chr1.norm.bin.txt + touch ${read_ID}_chr2.norm.bin.txt + touch ${read_ID}_chr3.norm.bin.txt + touch ${read_ID}_chr4.norm.bin.txt + touch ${read_ID}_chr5.norm.bin.txt + touch ${read_ID}_chr6.norm.bin.txt + touch 
${read_ID}_chr7.norm.bin.txt + touch ${read_ID}_chr8.norm.bin.txt + touch ${read_ID}_chr9.norm.bin.txt + touch ${read_ID}_chr10.norm.bin.txt + touch ${read_ID}_chr11.norm.bin.txt + touch ${read_ID}_chr12.norm.bin.txt + touch ${read_ID}_chr13.norm.bin.txt + touch ${read_ID}_chr14.norm.bin.txt + touch ${read_ID}_chr15.norm.bin.txt + touch ${read_ID}_chr16.norm.bin.txt + touch ${read_ID}_chr17.norm.bin.txt + touch ${read_ID}_chr18.norm.bin.txt + touch ${read_ID}_chr19.norm.bin.txt + touch ${read_ID}_chr20.norm.bin.txt + touch ${read_ID}_chr21.norm.bin.txt + touch ${read_ID}_chr22.norm.bin.txt + touch ${read_ID}_chrX.norm.bin.txt + touch ${read_ID}_chrY.norm.bin.txt + """ + +} \ No newline at end of file diff --git a/modules/biqseq2/bicseq2_seg.nf b/modules/biqseq2/bicseq2_seg.nf new file mode 100644 index 00000000..770e248d --- /dev/null +++ b/modules/biqseq2/bicseq2_seg.nf @@ -0,0 +1,53 @@ +process BICSEQ2_SEG { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v3' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/callers' : 'biqseq2' }", pattern:"{*.txt,*.png}", mode:'copy' + + input: + tuple val(sampleID), file(individual_normal_norm_bin_files), file(individual_tumor_norm_bin_files), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*.bicseq2.png"), val('no_idx'), val(meta), val(normal_name), val(tumor_name), val('bicseq2'), emit: bicseq2_png + tuple val(sampleID), file("*.bicseq2.txt"), val('no_idx'), val(meta), val(normal_name), val(tumor_name), val('bicseq2'), emit: bicseq2_sv_calls + + script: + + normal_norm_list = individual_normal_norm_bin_files.collect { "$it" }.join(' ') + tumor_norm_list = individual_tumor_norm_bin_files.collect { "$it" }.join(' ') + + scale = params.bicseq2_no_scaling ? "--noscale" : "" + + """ + + python3 \ + ${projectDir}/bin/pta/bicseq2_seg_config_writer.py \ + --normal-norms ${normal_norm_list} \ + --tumor-norms ${tumor_norm_list} \ + --seg-bicseq2-config ${params.bicseq2_chromList} \ + --out-file configuration_file.txt \ + --pair-id ${sampleID} + + perl /NBICseq-seg_v0.7.2/NBICseq-seg.pl \ + --control \ + --tmp ${sampleID} \ + --fig ${sampleID}.bicseq2.png \ + --title ${sampleID} \ + --lambda 4 \ + ${scale} \ + configuration_file.txt \ + ${sampleID}.bicseq2.txt + + """ + + stub: + """ + touch ${sampleID}.bicseq2.png + touch ${sampleID}.bicseq2.txt + """ +} diff --git a/modules/biqseq2/bicseq2_seg_unpaired.nf b/modules/biqseq2/bicseq2_seg_unpaired.nf new file mode 100644 index 00000000..032e13ef --- /dev/null +++ b/modules/biqseq2/bicseq2_seg_unpaired.nf @@ -0,0 +1,55 @@ +process BICSEQ2_SEG_UNPAIRED { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v3' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'biqseq2' }", pattern:"{*.txt,*.png}", mode:'copy' + + input: + tuple val(sampleID), file(individual_tumor_norm_bin_files), val(meta), val(tumor_name) + + output: + tuple val(sampleID), file("*.bicseq2.png"), val('no_idx'), val(meta), val(params.na12878_sampleName), val(tumor_name), val('bicseq2'), emit: bicseq2_png + tuple val(sampleID), file("*.bicseq2.txt"), val('no_idx'), val(meta), val(params.na12878_sampleName), val(tumor_name), val('bicseq2'), emit: bicseq2_sv_calls + + script: + + tumor_norm_list = individual_tumor_norm_bin_files.collect { "$it" }.join(' ') + + scale = params.bicseq2_no_scaling ? "--noscale" : "" + + """ + + python3 \ + ${projectDir}/bin/pta/bicseq2_seg_config_writer_unpaired.py \ + --tumor-norms ${tumor_norm_list} \ + --seg-bicseq2-config ${params.bicseq2_chromList} \ + --out-file configuration_file.txt \ + --pair-id ${sampleID} + + perl /NBICseq-seg_v0.7.2/NBICseq-seg.pl \ + --tmp ${sampleID} \ + --fig ${sampleID}.bicseq2.png \ + --title ${sampleID} \ + --lambda 4 \ + ${scale} \ + configuration_file.txt \ + ${sampleID}.bicseq2.txt + + """ + + stub: + """ + touch ${sampleID}.bicseq2.png + touch ${sampleID}.bicseq2.txt + """ +} + + + + + diff --git a/modules/bismark/bismark_alignment.nf b/modules/bismark/bismark_alignment.nf index 4dc75b71..e365bad4 100644 --- a/modules/bismark/bismark_alignment.nf +++ b/modules/bismark/bismark_alignment.nf @@ -2,14 +2,13 @@ process BISMARK_ALIGNMENT { tag "$sampleID" cpus 20 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bismark:0.23.1--hdfd78af_0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/alignment' : 'bismark_align' }", pattern: "*.bam", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/alignment' : 'bismark_align' }", pattern: "*.bam", mode:'copy', enabled: params.keep_intermediate publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bismark_align' }", pattern: "*report.txt", mode:'copy' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/alignment' : 'bismark_align' }", pattern: "*unmapped*", mode:'copy', enabled: params.keep_intermediate publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/alignment' : 'bismark_align' }", pattern: "*ambiguous*", mode:'copy', enabled: params.keep_intermediate @@ -24,7 +23,6 @@ process BISMARK_ALIGNMENT { tuple val(sampleID), file("*unmapped*"), emit: unmapped_reads script: - log.info "----- Bismark Alignment Running on: ${sampleID} -----" inputfq = params.read_type == 'PE' ? "-1 ${fq_reads[0]} -2 ${fq_reads[1]}" : "-1 ${fq_reads[0]}" directionality = params.non_directional ? '--non_directional': '' diff --git a/modules/bismark/bismark_deduplication.nf b/modules/bismark/bismark_deduplication.nf index 86dc5527..e9e0a466 100644 --- a/modules/bismark/bismark_deduplication.nf +++ b/modules/bismark/bismark_deduplication.nf @@ -2,10 +2,9 @@ process BISMARK_DEDUPLICATION { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bismark:0.23.1--hdfd78af_0' @@ -20,7 +19,6 @@ process BISMARK_DEDUPLICATION { tuple val(sampleID), file("*report.txt"), emit: dedup_report script: - log.info "----- Bismark Deduplication Running on: ${sampleID} -----" fq_type = params.read_type == 'PE' ? '-p' : '-s' diff --git a/modules/bismark/bismark_methylation_extraction.nf b/modules/bismark/bismark_methylation_extraction.nf index df661e59..774ab6b0 100644 --- a/modules/bismark/bismark_methylation_extraction.nf +++ b/modules/bismark/bismark_methylation_extraction.nf @@ -2,10 +2,9 @@ process BISMARK_METHYLATION_EXTRACTION { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bismark:0.23.1--hdfd78af_0' @@ -21,7 +20,6 @@ process BISMARK_METHYLATION_EXTRACTION { tuple val(sampleID), file("*.{png,gz}"), emit: extractor_png_gz script: - log.info "----- Bismark Methylation Extractor Running on: ${sampleID} -----" comprehensive = params.comprehensive ? '--comprehensive --merge_non_CpG' : '' cytosine_report = params.cytosine_report ? 
"--cytosine_report --genome_folder ${params.ref_fa_index}" : '' diff --git a/modules/bowtie2/bowtie2_align_trimmed_fastq.nf b/modules/bowtie2/bowtie2_align_trimmed_fastq.nf index 28cd63f1..a9c9a621 100644 --- a/modules/bowtie2/bowtie2_align_trimmed_fastq.nf +++ b/modules/bowtie2/bowtie2_align_trimmed_fastq.nf @@ -4,6 +4,7 @@ process ALIGN_TRIMMED_FASTQ { cpus 16 memory 30.GB time '48:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bowtie2' }", pattern: "*.log", mode: 'copy' container 'biocontainers/bowtie2:v2.4.1_cv1' @@ -16,7 +17,6 @@ process ALIGN_TRIMMED_FASTQ { tuple val(sampleID), file("*_bowtie2.log"), emit: bowtie_log script: - log.info "----- Bowtie2 Running on: ${sampleID} -----" String options = params.bowtieVSensitive == 'true' ? '--very-sensitive' : '' """ bowtie2 \ diff --git a/modules/bwa/bwa_mem.nf b/modules/bwa/bwa_mem.nf index 14681487..dc679a54 100644 --- a/modules/bwa/bwa_mem.nf +++ b/modules/bwa/bwa_mem.nf @@ -2,14 +2,17 @@ process BWA_MEM { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bwakit:0.7.17.dev1--hdfd78af_1' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'bwa_mem' }", pattern: "*.sam", mode:'copy', enabled: params.keep_intermediate + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID : 'bwa_mem'}" + }, pattern: "*.sam", mode: 'copy', enabled: params.keep_intermediate + input: tuple val(sampleID), file(fq_reads), file(read_groups) @@ -18,7 +21,6 @@ process BWA_MEM { tuple val(sampleID), file("*.sam"), emit: sam script: - log.info "----- BWA-MEM Alignment Running on: ${sampleID} -----" if (params.read_type == "SE"){ inputfq="${fq_reads[0]}" @@ -27,9 +29,11 @@ process BWA_MEM { inputfq="${fq_reads[0]} ${fq_reads[1]}" } + score = params.bwa_min_score ? "-T ${params.bwa_min_score}" : '' + split_hits = params.workflow == "chipseq" ? "-M" : '' """ rg=\$(cat $read_groups) bwa mem -R \${rg} \ - -t $task.cpus ${params.mismatch_penalty} ${params.ref_fa_indices} $inputfq > ${sampleID}.sam + -t $task.cpus $split_hits ${params.mismatch_penalty} $score ${params.ref_fa_indices} $inputfq > ${sampleID}.sam """ -} \ No newline at end of file +} diff --git a/modules/bwa/bwa_mem_hla.nf b/modules/bwa/bwa_mem_hla.nf index 569772d4..4dfbdb88 100644 --- a/modules/bwa/bwa_mem_hla.nf +++ b/modules/bwa/bwa_mem_hla.nf @@ -2,10 +2,9 @@ process BWA_MEM_HLA { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bwakit:0.7.17.dev1--hdfd78af_1' @@ -18,7 +17,6 @@ process BWA_MEM_HLA { tuple val(sampleID), file("*.bam"), emit: bam script: - log.info "----- BWA-MEM Alignment Running on: ${sampleID} -----" if (params.read_type == "SE"){ inputfq="${fq_reads[0]}" diff --git a/modules/conpair/conpair.nf b/modules/conpair/conpair.nf new file mode 100644 index 00000000..93a1e92d --- /dev/null +++ b/modules/conpair/conpair.nf @@ -0,0 +1,31 @@ +process CONPAIR { + tag "$pairName" + + cpus 1 + memory 4.GB + time '10:00:00' + container 'quay.io/jaxcompsci/conpair:v0.2' + errorStrategy 'ignore' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$pairName" : 'conpair' }", pattern:"*.txt", mode:'copy' + + input: + tuple val(sampleID), val(pairName), file(tumor_pileup), file(normal_pileup) + + output: + tuple val(pairName), file("*_concordance.txt"), emit: concordance + tuple val(pairName), file("*_contamination.txt"), emit: contamination + + script: + """ + python2 /Conpair-0.2/scripts/verify_concordance.py -T ${tumor_pileup} -N ${normal_pileup} --outfile ${pairName}_concordance.txt -M /Conpair-0.2/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover.txt + + python2 /Conpair-0.2/scripts/estimate_tumor_normal_contamination.py -T ${tumor_pileup} -N ${normal_pileup} --outfile ${pairName}_contamination.txt -M /Conpair-0.2/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover.txt + """ + + stub: + """ + touch ${pairName}_concordance.txt + touch ${pairName}_contamination.txt + """ +} diff --git a/modules/conpair/conpair_pileup.nf b/modules/conpair/conpair_pileup.nf new file mode 100644 index 
00000000..c267776f --- /dev/null +++ b/modules/conpair/conpair_pileup.nf @@ -0,0 +1,24 @@ +process CONPAIR_PILEUP { + tag "$sampleName" + + cpus 1 + memory 4.GB + time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/conpair:v0.2' + + input: + tuple val(sampleID), val(sampleName), file(bam), file(bai) + val(type) + + output: + tuple val(sampleID), val(sampleName), file("*pileup.txt"), emit: pileup + + script: + """ + python2 /Conpair-0.2/scripts/run_gatk_pileup_for_sample.py -B ${bam} -O ${sampleName}_${type}_pileup.txt --reference ${params.ref_fa} --markers /Conpair-0.2/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover.bed + """ +} + +// Marker file inputs: `--markers /Conpair-0.2/data/markers/...` are located in the container. \ No newline at end of file diff --git a/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf new file mode 100644 index 00000000..ab00911f --- /dev/null +++ b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf @@ -0,0 +1,30 @@ +process COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE { + tag "$sampleID" + + cpus 1 + memory 5.GB + time 1.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_resistance_mutations.py \ + ${params.cosmic_cancer_resistance_muts} \ + ${vcf} \ + ${sampleID}_germline_snv_indel_annotated_supplemental.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672933745&Signature=nQ9AFGONT4rDKfM4UZ1cmN4J%2F%2BM%3D" --output CosmicResistanceMutations.tsv.gz \ No newline at end of file diff --git a/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic.nf b/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic.nf new file mode 100644 index 00000000..74f00643 --- /dev/null +++ b/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic.nf @@ -0,0 +1,30 @@ +process COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC { + tag "$sampleID" + + cpus 1 + memory 40.GB + time 20.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*_somatic_vep_cosmic_cancerResitMut_annotated.vcf"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_resistance_mutations.py \ + ${params.cosmic_cancer_resistance_muts} \ + ${vcf} \ + ${sampleID}_somatic_vep_cosmic_cancerResitMut_annotated.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672933745&Signature=nQ9AFGONT4rDKfM4UZ1cmN4J%2F%2BM%3D" --output CosmicResistanceMutations.tsv.gz \ No newline at end of file diff --git a/modules/cosmic/cosmic_annotation.nf b/modules/cosmic/cosmic_annotation.nf new file mode 100644 index 00000000..43e4066a --- /dev/null +++ b/modules/cosmic/cosmic_annotation.nf @@ -0,0 +1,30 @@ +process COSMIC_ANNOTATION { + tag "$sampleID" + + cpus 1 + memory 1.GB + time 5.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_gene_census.py \ + ${params.cosmic_cgc} \ + ${vcf} \ + ${sampleID}_germline_vep_cosmic_annotated.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/cancer_gene_census.csv +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/cancer_gene_census.csv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672931317&Signature=PK8YAGC%2Bh9veZqc7mIZzywkOSf0%3D" --output cancer_gene_census.csv diff --git a/modules/cosmic/cosmic_annotation_somatic.nf b/modules/cosmic/cosmic_annotation_somatic.nf new file mode 100644 index 00000000..84a70435 --- /dev/null +++ b/modules/cosmic/cosmic_annotation_somatic.nf @@ -0,0 +1,30 @@ +process COSMIC_ANNOTATION_SOMATIC { + tag "$sampleID" + + cpus 1 + memory 40.GB + time 20.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*_somatic_vep_cosmic_annotated.vcf"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_gene_census.py \ + ${params.cosmic_cgc} \ + ${vcf} \ + ${sampleID}_somatic_vep_cosmic_annotated.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/cancer_gene_census.csv +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/cancer_gene_census.csv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672931317&Signature=PK8YAGC%2Bh9veZqc7mIZzywkOSf0%3D" --output cancer_gene_census.csv diff --git a/modules/cutadapt/cutadapt_trim_fastq.nf b/modules/cutadapt/cutadapt_trim_fastq.nf index 9ecff88e..6077524a 100644 --- a/modules/cutadapt/cutadapt_trim_fastq.nf +++ b/modules/cutadapt/cutadapt_trim_fastq.nf @@ -2,6 +2,7 @@ process TRIM_FASTQ { cpus 8 memory 10.GB time '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'cutadapt' }", pattern: "*.log", mode: 'copy' container 'quay.io/biocontainers/cutadapt:2.3--py37h14c3975_0' @@ -14,7 +15,6 @@ process TRIM_FASTQ { tuple val(sampleID), file("*.log"), emit: cutadapt_log script: - log.info "----- Cutadapt Running on: ${sampleID} -----" paired_end = params.read_type == 'PE' ? "-p ${sampleID}_R2_paired_trimmed.fq" : '' diff --git a/modules/deeptools/deeptools_bam_coverage_bigwig.nf b/modules/deeptools/deeptools_bam_coverage_bigwig.nf index 3a10bfc4..fbc267c1 100644 --- a/modules/deeptools/deeptools_bam_coverage_bigwig.nf +++ b/modules/deeptools/deeptools_bam_coverage_bigwig.nf @@ -4,8 +4,9 @@ process BAM_COVERAGE_BIGWIG { cpus 8 memory 10.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'deeptools' }", pattern: "*.bigwig", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/deeptools' : 'deeptools' }", pattern: "*.bigwig", mode: 'copy' container 'quay.io/biocontainers/deeptools:3.3.2--py_1' input: @@ -15,7 +16,6 @@ process BAM_COVERAGE_BIGWIG { tuple val(sampleID), file("*.bigwig") script: - log.info "----- Running deeptools bamCoverage bigwig on ${sampleID} -----" """ bamCoverage \ --numberOfProcessors $task.cpus \ diff --git a/modules/deeptools/deeptools_computematrix.nf b/modules/deeptools/deeptools_computematrix.nf new file mode 100644 index 00000000..c4ee39af --- /dev/null +++ b/modules/deeptools/deeptools_computematrix.nf @@ -0,0 +1,32 @@ +process DEEPTOOLS_COMPUTEMATRIX { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + input: + tuple val(sampleID), file(bigwig) + file(bed) + + output: + tuple val(sampleID), file("*.mat.gz") , emit: matrix + tuple val(sampleID), file("*.mat.tab"), emit: table + + script: + """ + computeMatrix scale-regions \\ + --regionsFileName $bed \\ + --scoreFileName $bigwig \\ + --outFileName ${sampleID}.computeMatrix.mat.gz \\ + --outFileNameMatrix ${sampleID}.computeMatrix.vals.mat.tab \\ + --regionBodyLength 1000 \\ + --beforeRegionStartLength 3000 \\ + --afterRegionStartLength 3000 \\ + --skipZeros \\ + --smartLabels \\ + --numberOfProcessors $task.cpus + """ +} diff --git a/modules/deeptools/deeptools_filter_remove_multi_sieve.nf b/modules/deeptools/deeptools_filter_remove_multi_sieve.nf index 7797a17a..c8d2eb32 100644 --- a/modules/deeptools/deeptools_filter_remove_multi_sieve.nf +++ b/modules/deeptools/deeptools_filter_remove_multi_sieve.nf @@ -4,6 +4,7 @@ process FILTER_REMOVE_MULTI_SIEVE { cpus 8 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/deeptools:3.3.2--py_1' @@ -14,7 +15,6 @@ process FILTER_REMOVE_MULTI_SIEVE { tuple val(sampleID), file("*.shift.tmp.ba*") script: - log.info "----- Running deeptools alignmentSieve on ${sampleID} -----" """ alignmentSieve \ --numberOfProcessors $task.cpus \ diff --git a/modules/deeptools/deeptools_plotfingerprint.nf b/modules/deeptools/deeptools_plotfingerprint.nf new file mode 100644 index 00000000..f6ba9eb5 --- /dev/null +++ b/modules/deeptools/deeptools_plotfingerprint.nf @@ -0,0 +1,38 @@ +process DEEPTOOLS_PLOTFINGERPRINT { + tag "${ip} vs ${control}" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/'+ip+'_vs_'+control+'/deeptools' : 'deeptools' }", pattern: "*.pdf", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/'+ip+'_vs_'+control+'/deeptools' : 'deeptools' }", pattern: "*.raw.txt", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/'+ip+'_vs_'+control+'/deeptools' : 'deeptools' }", pattern: "*.qcmetrics.txt", mode: 'copy' + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), file(ipbam), val(control), file(controlbam), file(ipflagstat) + + output: + tuple val(ip), file("*.pdf"), emit : pdf + tuple val(ip), file("*.raw.txt"), emit : raw + tuple val(ip), file("*.qcmetrics.txt"), emit : qc + + + script: + extend = (params.read_type == 'SE' && params.fragment_size > 0) ? 
"--extendReads ${params.fragment_size}" : '' + """ + plotFingerprint \\ + --bamfiles ${ipbam[0]} ${controlbam[0]} \\ + --plotFile ${ip}.plotFingerprint.pdf \\ + $extend \\ + --labels $ip $control \\ + --outRawCounts ${ip}.plotFingerprint.raw.txt \\ + --outQualityMetrics ${ip}.plotFingerprint.qcmetrics.txt \\ + --skipZeros \\ + --JSDsample ${controlbam[0]} \\ + --numberOfProcessors $task.cpus \\ + --numberOfSamples $params.fingerprint_bins + """ +} diff --git a/modules/deeptools/deeptools_plotheatmap.nf b/modules/deeptools/deeptools_plotheatmap.nf new file mode 100644 index 00000000..aaab3770 --- /dev/null +++ b/modules/deeptools/deeptools_plotheatmap.nf @@ -0,0 +1,29 @@ +process DEEPTOOLS_PLOTHEATMAP { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/deeptools' : 'deeptools'}" + }, pattern: "*.pdf", mode: 'copy' + + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + input: + tuple val(sampleID), file(matrix) + + output: + tuple val(sampleID), path("*.pdf"), emit: pdf + tuple val(sampleID), path("*.tab"), emit: table + + script: + """ + plotHeatmap --matrixFile $matrix \\ + --outFileName ${sampleID}.plotHeatmap.pdf \\ + --outFileNameMatrix ${sampleID}.plotHeatmap.mat.tab + """ +} diff --git a/modules/deeptools/deeptools_plotprofile.nf b/modules/deeptools/deeptools_plotprofile.nf new file mode 100644 index 00000000..f179d517 --- /dev/null +++ b/modules/deeptools/deeptools_plotprofile.nf @@ -0,0 +1,30 @@ +process DEEPTOOLS_PLOTPROFILE { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/deeptools' : 'deeptools'}" + }, pattern: "*.pdf", mode: 'copy' + + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + + input: + tuple val(sampleID), file(matrix) + + output: + tuple val(sampleID), path("*.pdf"), emit: pdf + tuple val(sampleID), path("*.tab"), emit: table + + script: + """ + plotProfile --matrixFile ${sampleID}.computeMatrix.mat.gz \\ + --outFileName ${sampleID}.plotProfile.pdf \\ + --outFileNameData ${sampleID}.plotProfile.tab + """ +} diff --git a/modules/ensembl/varianteffectpredictor_germline.nf b/modules/ensembl/varianteffectpredictor_germline.nf new file mode 100644 index 00000000..0af31fa6 --- /dev/null +++ b/modules/ensembl/varianteffectpredictor_germline.nf @@ -0,0 +1,131 @@ +process VEP_GERMLINE { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'ensemblorg/ensembl-vep:release_109.3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'vep' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx) + + output: + tuple val(sampleID), file("*_vep_annotated.vcf"), emit: vcf + + script: + + """ + vep \ + --input_file ${vcf} \ + --output_file ${sampleID}_germline_vep_annotated.vcf \ + --fork ${task.cpus} \ + --buffer_size 50000 \ + --format vcf \ + --no_stats \ + --no_escape \ + --offline \ + --assembly GRCh38 \ + --cache \ + --dir_cache ${params.vep_cache_directory} \ + --refseq \ + --max_af \ + --af \ + --af_1kg \ + --af_gnomad \ + --exclude_predicted \ + --fasta ${params.vep_fasta} \ + --symbol \ + --hgvs \ + --check_existing \ + --vcf \ + --pick_allele_gene \ + --dir_plugins ${params.vep_cache_directory}/Plugins \ + --plugin dbscSNV,${params.vep_cache_directory}/Plugins/dbscSNV1.1/dbscSNV1.1_GRCh38.txt.gz \ + --plugin MaxEntScan,${params.vep_cache_directory}/Plugins/maxentscan \ + --plugin dbNSFP,${params.vep_cache_directory}/Plugins/dbNSFP/dbNSFP4.3a_grch38.gz,${params.vep_cache_directory}/Plugins/dbNSFP_replacement_logic,REVEL_score,SIFT_pred,SIFT4G_pred,LRT_pred,MutationTaster_pred,MutationAssessor_pred,FATHMM_pred,PROVEAN_pred,MetaSVM_pred,PrimateAI_pred,fathmm-MKL_coding_pred,GERP++_RS,phyloP100way_vertebrate,CADD_phred,Polyphen2_HVAR_pred \ + --custom ${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicCodingMuts.vcf.gz,CosmicCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom ${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicNonCodingVariants.normal.vcf.gz,CosmicNonCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom ${params.vep_cache_directory}/annotations/04142020_NYGC_samples.vcf.gz,NYGC,vcf,exact,0,AF,Samples,AC_Het,AC_Hom \ + --custom ${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Overlap,vcf,overlap,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom 
${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Exact,vcf,exact,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom ${params.vep_cache_directory}/annotations/gnomad_exomes_subset_final.vcf.gz,GnomadExomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/gnomad_genomes_subset_final.vcf.gz,GnomadGenomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/chd_genes.vcf.gz,CHD_GENES,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_evolving.vcf.gz,CHD_EVOLVING,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_whitelist.vcf.gz,chd_whitelist,vcf,overlap,0,END \ + --custom ${params.vep_cache_directory}/annotations/deep_intronic_whitelist_08132020.vcf.gz,INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/clinvar_deep_intronics_09012020.vcf.gz,CLINVAR_INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/mastermind_cited_variants_reference-2021.01.02-grch38_fixed-contigs.vcf.gz,mm,vcf,exact,0,GENE,HGVSG,MMCNT1,MMCNT2,MMCNT3,MMID3,MMURI3 \ + --custom ${params.vep_cache_directory}/annotations/spliceai_scores.hg38.sorted.vcf.gz,SPLICEAI,vcf,exact,0,DS_AG,DS_AL,DS_DG,DS_DL \ + --custom ${params.vep_cache_directory}/annotations/pli_hg38.vcf.gz,PLI,vcf,overlap,0,pLI,mis_z \ + --custom ${params.vep_cache_directory}/annotations/domino_genes_38.vcf.gz,Domino,vcf,overlap,0,Domino_Score \ + --custom ${params.vep_cache_directory}/annotations/ar_extended.vcf.gz,AR,vcf,overlap,0,AR_GENE \ + --custom ${params.vep_cache_directory}/annotations/ACMG59_2017-09-28.vcf.gz,ACMG59,vcf,overlap,0,GENE,DISEASE \ + --custom ${params.vep_cache_directory}/annotations/dials_genes_b38.vcf.gz,DIALS,vcf,overlap,0,DIALS_GENE \ + --custom ${params.vep_cache_directory}/annotations/pgx_vep.vcf.gz,PGx,vcf,exact,0,pgx_rsid \ + --custom 
${params.vep_cache_directory}/annotations/sema4_immuno_genes_b38.vcf.gz,IMMUNO,vcf,overlap,0,IMMUNO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_neuro_genes_b38.vcf.gz,NEURO,vcf,overlap,0,NEURO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_cardio_genes_b38.vcf.gz,CARDIO,vcf,overlap,0,CARDIO_Gene \ + --custom ${params.vep_cache_directory}/annotations/nygc_curation_b38.vcf.gz,N19,vcf,overlap,0,NYGC_CUR \ + --custom ${params.vep_cache_directory}/annotations/nygc_reported_variants_b38.vcf.gz,R19,vcf,overlap,0,NYGC_REPORTED_SAMPLE,NYGC_CLASS,NYGC_DISEASE + """ +} + +// NOTE: Many of the resources are hard coded based on those provided in: +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/annotate/variantEffectPredictor.wdl +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/config/fasta_references.json +// For VEP cache, dbNSFP and dbscSNV resources were rebuilt using VEPv108, as noted below. + +// VEP Cache setup: + +// singularity pull --name vep.sif docker://ensemblorg/ensembl-vep:release_108.2 +// singularity exec vep.sif INSTALL.pl -c /PATH_TO_VEP/vep -a cfp -s homo_sapiens_refseq -y GRCh38 -g dbNSFP,dbscSNV,MaxEntScan +// ln -sf homo_sapiens homo_sapiens_refseq + +// In the plugin directory: + +// dbNSFP: +// wget https://dbnsfp.s3.amazonaws.com/dbNSFP4.3a.zip +// unzip dbNSFP4.3a.zip +// zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h +// mkdir temp +// zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | sort -T temp -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP4.3a.gz +// tabix -s 1 -b 2 -e 2 dbNSFP4.3a.gz +// rm -rf temp dbNSFP4.3a_variant.chr* h dbNSFP4.3_gene.gz dbNSFP4.3_gene.complete.gz dbNSFP4.3a.zip search_dbNSFP43a.readme.pdf search_dbNSFP43a.class search_dbNSFP43a.jar + + +// dbscSNV: +// wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip +// unzip dbscSNV1.1.zip +// head -n1 dbscSNV1.1.chr1 > h2 +// mkdir temp2 +// cat dbscSNV1.1.chr* | grep -v ^chr | 
sort -T temp2 -k5,5 -k6,6n | cat h2 - | awk '$5 != "."' | bgzip -c > dbscSNV1.1_GRCh38.txt.gz +// tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz +// rm dbscSNV1.1.chr* dbscSNV1.1.zip h2 + +// wget http://hollywood.mit.edu/burgelab/maxent/download/fordownload.tar.gz +// gunzip fordownload.tar.gz +// tar -xvf fordownload.tar +// mkdir maxentscan +// mv fordownload ./maxentscan/ + + +// COSMIC: +// mkdir COSMIC_v97 +// echo "@jax.org:" | base64 +// +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672843877&Signature=TSsIiQodqoKS5skE1ziS49zEWSU%3D" --output CosmicCodingMuts.vcf.gz + +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672844121&Signature=4JkeRizNMg0pv%2FChw4QAl268dVw%3D" --output CosmicNonCodingVariants.normal.vcf.gz + + +// ALL REMAINING ANNOTATIONS: +// Note: remaining annotations come from: gs://nygc-resources-public/ensembl_vep/annotations.tar.gz \ No newline at end of file diff --git a/modules/ensembl/varianteffectpredictor_somatic.nf b/modules/ensembl/varianteffectpredictor_somatic.nf new file mode 100644 index 00000000..7c84aad0 --- /dev/null +++ b/modules/ensembl/varianteffectpredictor_somatic.nf @@ -0,0 +1,131 @@ +process VEP_SOMATIC { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'ensemblorg/ensembl-vep:release_109.3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vep' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*_vep_annotated.vcf"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + + """ + vep \ + --input_file ${vcf} \ + --output_file ${sampleID}_somatic_vep_annotated.vcf \ + --fork ${task.cpus} \ + --buffer_size 50000 \ + --format vcf \ + --no_stats \ + --no_escape \ + --offline \ + --assembly GRCh38 \ + --cache \ + --dir_cache ${params.vep_cache_directory} \ + --refseq \ + --max_af \ + --af \ + --af_1kg \ + --af_gnomad \ + --exclude_predicted \ + --fasta ${params.vep_fasta} \ + --symbol \ + --hgvs \ + --check_existing \ + --vcf \ + --pick_allele_gene \ + --dir_plugins ${params.vep_cache_directory}/Plugins \ + --plugin dbscSNV,${params.vep_cache_directory}/Plugins/dbscSNV1.1/dbscSNV1.1_GRCh38.txt.gz \ + --plugin MaxEntScan,${params.vep_cache_directory}/Plugins/maxentscan \ + --plugin dbNSFP,${params.vep_cache_directory}/Plugins/dbNSFP/dbNSFP4.3a_grch38.gz,${params.vep_cache_directory}/Plugins/dbNSFP_replacement_logic,REVEL_score,SIFT_pred,SIFT4G_pred,LRT_pred,MutationTaster_pred,MutationAssessor_pred,FATHMM_pred,PROVEAN_pred,MetaSVM_pred,PrimateAI_pred,fathmm-MKL_coding_pred,GERP++_RS,phyloP100way_vertebrate,CADD_phred,Polyphen2_HVAR_pred \ + --custom ${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicCodingMuts.vcf.gz,CosmicCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom 
${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicNonCodingVariants.normal.vcf.gz,CosmicNonCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom ${params.vep_cache_directory}/annotations/04142020_NYGC_samples.vcf.gz,NYGC,vcf,exact,0,AF,Samples,AC_Het,AC_Hom \ + --custom ${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Overlap,vcf,overlap,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom ${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Exact,vcf,exact,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom ${params.vep_cache_directory}/annotations/gnomad_exomes_subset_final.vcf.gz,GnomadExomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/gnomad_genomes_subset_final.vcf.gz,GnomadGenomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/chd_genes.vcf.gz,CHD_GENES,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_evolving.vcf.gz,CHD_EVOLVING,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_whitelist.vcf.gz,chd_whitelist,vcf,overlap,0,END \ + --custom ${params.vep_cache_directory}/annotations/deep_intronic_whitelist_08132020.vcf.gz,INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/clinvar_deep_intronics_09012020.vcf.gz,CLINVAR_INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/mastermind_cited_variants_reference-2021.01.02-grch38_fixed-contigs.vcf.gz,mm,vcf,exact,0,GENE,HGVSG,MMCNT1,MMCNT2,MMCNT3,MMID3,MMURI3 \ + --custom ${params.vep_cache_directory}/annotations/spliceai_scores.hg38.sorted.vcf.gz,SPLICEAI,vcf,exact,0,DS_AG,DS_AL,DS_DG,DS_DL \ + --custom ${params.vep_cache_directory}/annotations/pli_hg38.vcf.gz,PLI,vcf,overlap,0,pLI,mis_z \ + --custom ${params.vep_cache_directory}/annotations/domino_genes_38.vcf.gz,Domino,vcf,overlap,0,Domino_Score \ + --custom ${params.vep_cache_directory}/annotations/ar_extended.vcf.gz,AR,vcf,overlap,0,AR_GENE \ 
+ --custom ${params.vep_cache_directory}/annotations/ACMG59_2017-09-28.vcf.gz,ACMG59,vcf,overlap,0,GENE,DISEASE \ + --custom ${params.vep_cache_directory}/annotations/dials_genes_b38.vcf.gz,DIALS,vcf,overlap,0,DIALS_GENE \ + --custom ${params.vep_cache_directory}/annotations/pgx_vep.vcf.gz,PGx,vcf,exact,0,pgx_rsid \ + --custom ${params.vep_cache_directory}/annotations/sema4_immuno_genes_b38.vcf.gz,IMMUNO,vcf,overlap,0,IMMUNO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_neuro_genes_b38.vcf.gz,NEURO,vcf,overlap,0,NEURO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_cardio_genes_b38.vcf.gz,CARDIO,vcf,overlap,0,CARDIO_Gene \ + --custom ${params.vep_cache_directory}/annotations/nygc_curation_b38.vcf.gz,N19,vcf,overlap,0,NYGC_CUR \ + --custom ${params.vep_cache_directory}/annotations/nygc_reported_variants_b38.vcf.gz,R19,vcf,overlap,0,NYGC_REPORTED_SAMPLE,NYGC_CLASS,NYGC_DISEASE + """ +} + +// NOTE: Many of the resources are hard coded based on those provided in: +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/annotate/variantEffectPredictor.wdl +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/config/fasta_references.json +// For VEP cache, dbNSFP and dbscSNV resources were rebuilt using VEPv108, as noted below. 
+ +// VEP Cache setup: + +// singularity pull --name vep.sif docker://ensemblorg/ensembl-vep:release_108.2 +// singularity exec vep.sif INSTALL.pl -c /PATH_TO_VEP/vep -a cfp -s homo_sapiens_refseq -y GRCh38 -g dbNSFP,dbscSNV,MaxEntScan +// ln -sf homo_sapiens homo_sapiens_refseq + +// In the plugin directory: + +// dbNSFP: +// wget https://dbnsfp.s3.amazonaws.com/dbNSFP4.3a.zip +// unzip dbNSFP4.3a.zip +// zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h +// mkdir temp +// zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | sort -T temp -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP4.3a.gz +// tabix -s 1 -b 2 -e 2 dbNSFP4.3a.gz +// rm -rf temp dbNSFP4.3a_variant.chr* h dbNSFP4.3_gene.gz dbNSFP4.3_gene.complete.gz dbNSFP4.3a.zip search_dbNSFP43a.readme.pdf search_dbNSFP43a.class search_dbNSFP43a.jar + + +// dbscSNV: +// wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip +// unzip dbscSNV1.1.zip +// head -n1 dbscSNV1.1.chr1 > h2 +// mkdir temp2 +// cat dbscSNV1.1.chr* | grep -v ^chr | sort -T temp2 -k5,5 -k6,6n | cat h2 - | awk '$5 != "."' | bgzip -c > dbscSNV1.1_GRCh38.txt.gz +// tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz +// rm dbscSNV1.1.chr* dbscSNV1.1.zip h2 + +// wget http://hollywood.mit.edu/burgelab/maxent/download/fordownload.tar.gz +// gunzip fordownload.tar.gz +// tar -xvf fordownload.tar +// mkdir maxentscan +// mv fordownload ./maxentscan/ + + +// COSMIC: +// mkdir COSMIC_v97 +// echo "@jax.org:" | base64 +// +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672843877&Signature=TSsIiQodqoKS5skE1ziS49zEWSU%3D" --output CosmicCodingMuts.vcf.gz + +// curl -H "Authorization: Basic ADD AUTHORIZATION" 
https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672844121&Signature=4JkeRizNMg0pv%2FChw4QAl268dVw%3D" --output CosmicNonCodingVariants.normal.vcf.gz + + +// ALL REMAINING ANNOTATIONS: +// Note: remaining annotations come from: gs://nygc-resources-public/ensembl_vep/annotations.tar.gz \ No newline at end of file diff --git a/modules/fastq-tools/fastq-pair.nf b/modules/fastq-tools/fastq-pair.nf new file mode 100644 index 00000000..c09e4c9d --- /dev/null +++ b/modules/fastq-tools/fastq-pair.nf @@ -0,0 +1,23 @@ +process FASTQ_PAIR { + tag "$sampleID" + + cpus 1 + memory 50.GB + time { reads[0].size() < 35.GB ? 10.h : 18.h } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fastq-pair:1.0--h87f3376_3' + + input: + tuple val(sampleID), file(reads) + + output: + tuple val(sampleID), file("*.paired.fq"), emit: paired_fastq + tuple val(sampleID), file("*.single.fq"), emit: single_fastq + + script: + + """ + fastq_pair ${reads[0]} ${reads[1]} + """ +} diff --git a/modules/fastq-tools/fastq-sort.nf b/modules/fastq-tools/fastq-sort.nf new file mode 100644 index 00000000..e147835e --- /dev/null +++ b/modules/fastq-tools/fastq-sort.nf @@ -0,0 +1,28 @@ +process FASTQ_SORT { + + tag "$sampleID" + + cpus 1 + memory 50.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fastq-tools:0.8.3--hbd632db_2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/deconvoluted_reads': 'deconvoluted_reads' }", pattern: "*.fastq", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(reads) + val(suffix) + + output: + tuple val(sampleID), file("*sorted*{1,2}.fastq"), emit: sorted_fastq + + script: + command_two = params.read_type == 'PE' ? "fastq-sort --id ${reads[1]} > ${sampleID}_sorted_${suffix}_2.fastq" : '' + + """ + fastq-sort --id ${reads[0]} > ${sampleID}_sorted_${suffix}_1.fastq + ${command_two} + """ +} diff --git a/modules/fastqc/fastqc.nf b/modules/fastqc/fastqc.nf index 85ffa2db..af472cd4 100644 --- a/modules/fastqc/fastqc.nf +++ b/modules/fastqc/fastqc.nf @@ -5,9 +5,15 @@ process FASTQC { cpus 8 memory 4.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'fastqc' }", pattern: "*_fastqc.{zip,html}", mode:'copy' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? 'fastqc/' : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/stats' : 'fastqc'}" + }, pattern: "*_fastqc.{zip,html}", mode: 'copy' + input: tuple val(sampleID), file(fq_reads) @@ -17,9 +23,23 @@ process FASTQC { script: - log.info "----- FASTQC Running on: ${sampleID} -----" + if (params.workflow == "chipseq" && params.read_type == 'SE') + """ + [ ! -f ${sampleID}.fastq.gz ] && ln -s ${fq_reads} ${sampleID}.fastq.gz + + fastqc --quiet -t ${task.cpus} ${sampleID}.fastq.gz + """ + else if (params.workflow == "chipseq" && params.read_type == 'PE') + """ + [ ! -f ${sampleID}_1.fastq.gz ] && ln -s ${fq_reads[0]} ${sampleID}_1.fastq.gz + [ ! -f ${sampleID}_2.fastq.gz ] && ln -s ${fq_reads[1]} ${sampleID}_2.fastq.gz + fastqc --quiet -t ${task.cpus} ${sampleID}_1.fastq.gz + fastqc --quiet -t ${task.cpus} ${sampleID}_2.fastq.gz + """ + else """ fastqc --quiet -t ${task.cpus} ${fq_reads} """ + } diff --git a/modules/fusion_report/fusion_report.nf b/modules/fusion_report/fusion_report.nf new file mode 100644 index 00000000..db4f2776 --- /dev/null +++ b/modules/fusion_report/fusion_report.nf @@ -0,0 +1,36 @@ +process FUSION_REPORT { + tag "$sampleID" + + cpus 1 + memory 2.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fusion-report:2.1.5--pyhdfd78af_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/fusion-report/' : 'star-fusion' }", mode:'copy' + + input: + tuple val(sampleID), path(arriba), path(fusioncatcher), path(jaffa), path(pizzly), path(squid), path(starfusion) + + output: + tuple val(sampleID), file("${sampleID}_fusion_list.tsv"), emit: fusion_inspector_input_list + tuple val(sampleID), file("${sampleID}_fusion_genes_mqc.json"), emit: summary_fusions_mq + tuple val(sampleID), file("*"), emit: report + + script: + def extra_params = params.fusion_report_opt ? params.fusion_report_opt : '' + def tools = !arriba.empty() ? "--arriba ${arriba} " : '' + tools += !jaffa.empty() ? "--jaffa ${jaffa} " : '' + tools += !fusioncatcher.empty() ? "--fusioncatcher ${fusioncatcher} " : '' + tools += !pizzly.empty() ? "--pizzly ${pizzly} " : '' + tools += !squid.empty() ? "--squid ${squid} " : '' + tools += !starfusion.empty() ? "--starfusion ${starfusion} " : '' + + """ + fusion_report run ${sampleID} . ${params.databases} ${tools} ${extra_params} + mv fusion_list.tsv ${sampleID}_fusion_list.tsv + mv fusion_list_filtered.tsv ${sampleID}_fusion_list_filtered.tsv + mv fusion_genes_mqc.json ${sampleID}_fusion_genes_mqc.json + """ +} \ No newline at end of file diff --git a/modules/fusioncatcher/fusioncatcher.nf b/modules/fusioncatcher/fusioncatcher.nf new file mode 100644 index 00000000..26f03618 --- /dev/null +++ b/modules/fusioncatcher/fusioncatcher.nf @@ -0,0 +1,44 @@ +process FUSIONCATCHER { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fusioncatcher:1.33--hdfd78af_4' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'fusioncatcher' }", pattern: "*.{tsv,txt}", mode:'copy' + + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*_fusioncatcher_fusions.txt"), optional:true, emit: fusioncatcher_fusions + tuple val(sampleID), path("*_fusioncatcher_summary.txt"), optional:true, emit: fusioncatcher_summary + tuple val(sampleID), path("*_fusioncatcher.log"), emit: fusioncatcher_log + + script: + + def input_reads = reads.toString().replace(" ", ",") + + """ + fusioncatcher.py \\ + -d ${params.fusioncatcher_ref} \\ + -i ${input_reads} \\ + -p ${task.cpus} \\ + -o . \\ + --skip-blat \\ + --limitSjdbInsertNsj ${params.fusioncatcher_limitSjdbInsertNsj} + + mv final-list_candidate-fusion-genes.txt ${sampleID}_fusioncatcher_fusions.txt + mv summary_candidate_fusions.txt ${sampleID}_fusioncatcher_summary.txt + mv fusioncatcher.log ${sampleID}_fusioncatcher.log + + """ + + + +} diff --git a/modules/g2gtools/g2gtools_chain_convert_peak.nf b/modules/g2gtools/g2gtools_chain_convert_peak.nf index 7202eb24..748184d9 100644 --- a/modules/g2gtools/g2gtools_chain_convert_peak.nf +++ b/modules/g2gtools/g2gtools_chain_convert_peak.nf @@ -4,6 +4,7 @@ process CHAIN_CONVERT { cpus 1 memory 10.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/g2gtools:0.1.31' @@ -13,13 +14,12 @@ process CHAIN_CONVERT { tuple val(sampleID), file(bam_shifted) output: - tuple val(sampleID), file("*.tmp.mm10.ba*") - tuple val(sampleID), file("*g2gconvert.log") + tuple val(sampleID), file("*.tmp.mm10.bam"), emit: converted_bam + tuple val(sampleID), file("*g2gconvert.log"), emit: log when: params.chain != null script: - log.info "----- Converting Coordinates to Reference on ${sampleID} -----" """ g2gtools convert \ -r -f bam -c ${params.chain} \ diff --git a/modules/gatk/gatk3_applyrecalibration.nf b/modules/gatk/gatk3_applyrecalibration.nf new file mode 100644 index 00000000..4c27da26 --- /dev/null +++ b/modules/gatk/gatk3_applyrecalibration.nf @@ -0,0 +1,38 @@ +process GATKv3_5_ApplyRecalibration { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"$sampleID" : 'gatk' }", pattern: "*.vcf", mode:'copy' + + input: + tuple val(sampleID), file(normal_germline_vcf) + tuple val(sampleID), file(normal_germline_vcf_index) + tuple val(sampleID), file(normal_germline_recal) + tuple val(sampleID), file(normal_germline_tranches) + + output: + tuple val(sampleID), file("*.*recalibrated.filtered.vcf"), emit: normal_germline_recalibrated_vcf + tuple val(sampleID), file("*.*recalibrated.filtered.vcf.idx"), emit: normal_germline_recalibrated_vcf_index + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar GenomeAnalysisTK.jar \ + -T ApplyRecalibration \ + -R ${params.ref_fa} \ + -input ${sampleID}_variants_raw.vcf \ + --ts_filter_level 99.6 \ + -tranchesFile ${sampleID}.tranches.txt \ + -recalFile ${sampleID}.recal.txt \ + -mode SNP + -o ${sampleID}_variants_raw.recalibrated.filtered.vcf + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk3_genotypegvcf.nf b/modules/gatk/gatk3_genotypegvcf.nf new file mode 100644 index 00000000..3eef231b --- /dev/null +++ b/modules/gatk/gatk3_genotypegvcf.nf @@ -0,0 +1,33 @@ +process GATKv3_5_GENOTYPEGVCF { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"sampleID" : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(normal_germline_gvcf) + tuple val(sampleID), file(normal_germline_gvcf_index) + + output: + tuple val(sampleID), file("*.*vcf"), emit: normal_germline_vcf + tuple val(sampleID), file("*.vcf.idx"), emit: normal_germline_vcf_index + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar GenomeAnalysisTK.jar \ + -T GenotypeGVCFs \ + -R ${params.ref_fa} \ + --variant ${sampleID}_variants_raw.gvcf \ + -o ${sampleID}_variants_raw.vcf + """ +} + \ No newline at end of file diff --git a/modules/gatk/gatk3_haplotypecaller.nf b/modules/gatk/gatk3_haplotypecaller.nf new file mode 100644 index 00000000..2e6982e1 --- /dev/null +++ b/modules/gatk/gatk3_haplotypecaller.nf @@ -0,0 +1,36 @@ +process GATKv3_5_HAPLOTYPECALLER { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"$sampleID" : 'gatk' }", pattern: "*.gvcf", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai) + + output: + tuple val(sampleID), path("*.gvcf"), emit: normal_germline_gvcf + tuple val(sampleID), path("*.gvcf.idx"), emit: normal_germline_gvcf_index + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar /usr/GenomeAnalysisTK.jar \ + -T HaplotypeCaller \ + -R ${params.ref_fa} \ + -I ${normal_bam} \ + -o ${sampleID}_variants_raw.gvcf \ + -L ${params.target_gatk} \ + -stand_call_conf ${params.call_val} \ + -ERC GVCF \ + -variant_index_type LINEAR \ + -variant_index_parameter 128000 + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_indelrealigner.nf b/modules/gatk/gatk3_indelrealigner.nf similarity index 76% rename from modules/gatk/gatk_indelrealigner.nf rename to modules/gatk/gatk3_indelrealigner.nf index a4da1172..84eb22d8 100644 --- a/modules/gatk/gatk_indelrealigner.nf +++ b/modules/gatk/gatk3_indelrealigner.nf @@ -5,6 +5,7 @@ process GATK_INDELREALIGNER{ cpus = 1 memory = 35.GB time = '08:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} // Command Depricated in GATK 4 container 'broadinstitute/gatk3:3.6-0' @@ -21,7 +22,6 @@ process GATK_INDELREALIGNER{ tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- GATK IndelRealigner Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/gatk/gatk_realignertargetcreator.nf b/modules/gatk/gatk3_realignertargetcreator.nf similarity index 72% rename from modules/gatk/gatk_realignertargetcreator.nf rename to modules/gatk/gatk3_realignertargetcreator.nf index 6e9cf465..06c4cf79 100644 --- a/modules/gatk/gatk_realignertargetcreator.nf +++ b/modules/gatk/gatk3_realignertargetcreator.nf @@ -5,6 +5,7 @@ process GATK_REALIGNERTARGETCREATOR { cpus = 12 memory = 35.GB time = '12:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk3:3.6-0' @@ -17,7 +18,6 @@ process GATK_REALIGNERTARGETCREATOR { tuple val(sampleID), file("*.intervals"), emit: intervals script: - log.info "----- GATK RealignerTargetCreator Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/gatk/gatk_variantannotator.nf b/modules/gatk/gatk3_variantannotator.nf similarity index 63% rename from modules/gatk/gatk_variantannotator.nf rename to modules/gatk/gatk3_variantannotator.nf index 4abf05ba..f9e1c3fe 100644 --- a/modules/gatk/gatk_variantannotator.nf +++ b/modules/gatk/gatk3_variantannotator.nf @@ -4,12 +4,13 @@ process GATK_VARIANTANNOTATOR { cpus 1 memory 15.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} // Legacy Reasons Leave as GATK3 (public) // Flag --snpEffFile was removed in GATK4 container 'broadinstitute/gatk3:3.6-0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.gen_org=='mouse' ? 
true : params.keep_intermediate input: tuple val(sampleID), file(sample_vcf), file(snpeff_vcf) @@ -18,7 +19,6 @@ process GATK_VARIANTANNOTATOR { tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- GATK VariantAnnotator Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk3_variantrecalibrator.nf b/modules/gatk/gatk3_variantrecalibrator.nf new file mode 100644 index 00000000..76e547a6 --- /dev/null +++ b/modules/gatk/gatk3_variantrecalibrator.nf @@ -0,0 +1,42 @@ +process GATKv3_5_VARIANTRECALIBRATOR { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"$sampleID" : 'gatk' }", pattern: "*.txt", mode:'copy' + + input: + tuple val(sampleID), file(normal_germline_vcf) + tuple val(sampleID), file(normal_germline_vcf_index) + + output: + tuple val(sampleID), file("*.*recal.txt"), emit: normal_germline_recal + tuple val(sampleID), file("*.*tranches.txt"), emit: normal_germline_tranches + tuple val(sampleID), file("*.*plot.R.txt"), emit: normal_germline_plot_R + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar /usr/GenomeAnalysisTK.jar \ + -T VariantRecalibrator \ + -R ${params.ref_fa} \ + -input ${normal_germline_vcf} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \ + -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \ + -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf \ + -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \ + -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \ + -mode SNP \ + -tranche 99.6 \ + -recalFile ${sampleID}.recal.txt \ + -tranchesFile ${sampleID}.tranches.txt \ + -rscriptFile ${sampleID}.plots.R.txt + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_applybqsr.nf b/modules/gatk/gatk_applybqsr.nf index 3dc4face..dcefbdec 100644 --- a/modules/gatk/gatk_applybqsr.nf +++ b/modules/gatk/gatk_applybqsr.nf @@ -2,14 +2,13 @@ process GATK_APPLYBQSR { tag "$sampleID" cpus = 1 - memory = {40.GB * task.attempt} + memory = 40.GB time = '12:00:00' - errorStrategy 'retry' - maxRetries 1 + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'gatk' }", pattern: "*.bam", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'gatk' }", pattern: "*.ba*", mode:'copy' input: tuple val(sampleID), file(bam), file(table) @@ -19,7 +18,6 @@ process GATK_APPLYBQSR { tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- GATK ApplyBQSR Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_baserecalibrator.nf b/modules/gatk/gatk_baserecalibrator.nf index a52450e4..35e299ec 100644 --- a/modules/gatk/gatk_baserecalibrator.nf +++ b/modules/gatk/gatk_baserecalibrator.nf @@ -2,10 +2,9 @@ process GATK_BASERECALIBRATOR { tag "$sampleID" cpus = 1 - memory = {40.GB * task.attempt} - time = '12:00:00' - errorStrategy 'retry' - maxRetries 1 + memory { bam.size() < 60.GB ? 40.GB : 80.GB } + time { bam.size() < 60.GB ? '12:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' @@ -18,7 +17,6 @@ process GATK_BASERECALIBRATOR { tuple val(sampleID), file("*.table"), emit: table script: - log.info "----- GATK BaseRecalibrator Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_chain_extract_badreads.nf b/modules/gatk/gatk_chain_extract_badreads.nf index ac89330d..0e3c9d5f 100644 --- a/modules/gatk/gatk_chain_extract_badreads.nf +++ b/modules/gatk/gatk_chain_extract_badreads.nf @@ -4,11 +4,12 @@ process CHAIN_EXTRACT_BADREADS { cpus 2 memory 4.GB time = '04:00:00' + errorStrategy { [0,3,4].contains(task.exitStatus) ? 'ignore' : 'terminate' } publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'gatk' }", pattern: "*.log", mode: 'copy' + container 'broadinstitute/gatk:' - errorStrategy { [0,3,4].contains(task.exitStatus) ? 'ignore' : 'terminate' } input: tuple val(sampleID), file(bam_sort_mm10) @@ -20,7 +21,6 @@ process CHAIN_EXTRACT_BADREADS { when: params.chain != null script: - log.info "----- Extracting a list of 'bad reads' on ${sampleID} -----" """ gatk ValidateSamFile \ -I ${bam_sort_mm10[0]} \ diff --git a/modules/gatk/gatk_chain_filter_reads.nf b/modules/gatk/gatk_chain_filter_reads.nf index 09782b2c..e012b3c7 100644 --- a/modules/gatk/gatk_chain_filter_reads.nf +++ b/modules/gatk/gatk_chain_filter_reads.nf @@ -4,6 +4,7 @@ process CHAIN_FILTER_READS { cpus 2 memory 4.GB time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'gatk' }", pattern: "*.log", mode: 'copy' container 'broadinstitute/gatk:' @@ -13,13 +14,12 @@ process CHAIN_FILTER_READS { output: - tuple val(sampleID), file("*.tmp2.mm10.ba*") + tuple val(sampleID), path("*.tmp2.mm10.bam"), emit: bam tuple val(sampleID), file("*_FilterSamReads.log"), emit: filterReads_log when: params.chain != null script: - log.info "----- Filtering list to unique name on ${sampleID} -----" """ gatk FilterSamReads \ -I ${bam_sort_mm10[0]} \ diff --git a/modules/gatk/gatk_cnnscorevariants.nf b/modules/gatk/gatk_cnnscorevariants.nf new file mode 100644 index 00000000..9602e8b1 --- /dev/null +++ b/modules/gatk/gatk_cnnscorevariants.nf @@ -0,0 +1,31 @@ +process GATK_CNNSCORE_VARIANTS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(vcf_index), path(interval), val(index) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + tuple val(sampleID), file("*.idx"), emit: idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" CNNScoreVariants \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_${index}_haplotypecaller.annotated.vcf \ + -L ${interval} + """ +} diff --git a/modules/gatk/gatk_combinegvcfs.nf b/modules/gatk/gatk_combinegvcfs.nf new file mode 100644 index 00000000..2de1ae18 --- /dev/null +++ b/modules/gatk/gatk_combinegvcfs.nf @@ -0,0 +1,33 @@ +process GATK_COMBINEGVCFS { + tag "$sampleID" + + cpus 1 + memory 10.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.gvcf", mode:'copy' + + input: + tuple val(sampleID), path(gvcf) + + output: + tuple val(sampleID), file("*.gvcf"), emit: gvcf + tuple val(sampleID), file("*.idx"), emit: idx + + script: + // memory needs to be set explicitly + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = gvcf.collect { "--variant $it" }.join(' ') + + """ + gatk --java-options "-Xmx${my_mem}G" CombineGVCFs \ + -R ${params.ref_fa} \ + ${inputs} \ + -O ${sampleID}_GATKcombined_raw.gvcf + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_depthofcoverage.nf b/modules/gatk/gatk_depthofcoverage.nf index 6d7eec4c..e699d77d 100644 --- a/modules/gatk/gatk_depthofcoverage.nf +++ b/modules/gatk/gatk_depthofcoverage.nf @@ -5,9 +5,9 @@ process GATK_DEPTHOFCOVERAGE { cpus 1 memory 15.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' - file(params.ref_fai) input: tuple val(sampleID), file(bam), file(bai) @@ -17,7 +17,6 @@ process GATK_DEPTHOFCOVERAGE { tuple val(sampleID), file("*_gatk_temp.txt"), emit: txt script: - log.info "----- GATK Depth of Coverage Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_filtermutectcalls.nf b/modules/gatk/gatk_filtermutectcalls.nf new file mode 100644 index 00000000..deaa9f27 --- /dev/null +++ b/modules/gatk/gatk_filtermutectcalls.nf @@ -0,0 +1,32 @@ +process GATK_FILTERMUECTCALLS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gatk' }", pattern: "*_mutect2_somatic.filtered.vcf.gz", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/stats' : 'gatk' }", pattern: "*.filteringStats.tsv", mode:'copy' + + input: + tuple val(sampleID), path(vcf), path(tbi), val(meta), val(normal_name), val(tumor_name), val(tool), path(stats) + + output: + tuple val(sampleID), file("*_mutect2_somatic.filtered.vcf.gz"), file("*_mutect2_somatic.filtered.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('mutect2'), emit: mutect2_vcf_tbi + tuple val(sampleID), file("*.filteringStats.tsv"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + """ + gatk --java-options "-Xmx${my_mem}G" FilterMutectCalls \ + -R ${params.ref_fa} \ + -V ${vcf} \ + --stats ${stats} \ + -O ${sampleID}_mutect2_somatic.filtered.vcf.gz + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_filtermutectcalls_tumorOnly.nf b/modules/gatk/gatk_filtermutectcalls_tumorOnly.nf new file mode 100644 index 00000000..8de1c636 --- /dev/null +++ b/modules/gatk/gatk_filtermutectcalls_tumorOnly.nf @@ -0,0 +1,32 @@ +process GATK_FILTERMUECTCALLS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*_mutect2_somatic.filtered.vcf.gz", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'gatk' }", pattern: "*.filteringStats.tsv", mode:'copy' + + input: + tuple val(sampleID), path(vcf), path(tbi), path(stats) + + output: + tuple val(sampleID), file("*_mutect2_somatic.filtered.vcf.gz"), file("*_mutect2_somatic.filtered.vcf.gz.tbi"), emit: mutect2_vcf_tbi + tuple val(sampleID), file("*.filteringStats.tsv"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + """ + gatk --java-options "-Xmx${my_mem}G" FilterMutectCalls \ + -R ${params.ref_fa} \ + -V ${vcf} \ + --stats ${stats} \ + -O ${sampleID}_mutect2_somatic.filtered.vcf.gz + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_filtervarianttranches.nf b/modules/gatk/gatk_filtervarianttranches.nf new file mode 100644 index 00000000..4ecd68dd --- /dev/null +++ b/modules/gatk/gatk_filtervarianttranches.nf @@ -0,0 +1,39 @@ +process GATK_FILTER_VARIANT_TRANCHES { + // This modules is a port of the NYGC germline filtering scheme found at this site: + // https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/germline/germline.wdl?at=7.4.0 + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '01:30:00' + errorStrategy 'ignore' + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(vcf_index) + + output: + tuple val(sampleID), file("*.*vcf"), file("*.idx"), emit: vcf_idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" FilterVariantTranches \ + -V ${vcf} \ + -O ${sampleID}_haplotypecaller.gatk.filtered.genotypedGVCFs.vcf \ + --snp-tranche 99.9 --snp-tranche 99.95 \ + --indel-tranche 99.0 --indel-tranche 99.4 \ + --resource ${params.hapmap} \ + --resource ${params.omni} \ + --resource ${params.phase1_1000G} \ + --resource ${params.dbSNP} \ + --info-key CNN_1D \ + --create-output-variant-index true + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_genotype_gvcf.nf b/modules/gatk/gatk_genotype_gvcf.nf new file mode 100644 index 00000000..a5c9e238 --- /dev/null +++ b/modules/gatk/gatk_genotype_gvcf.nf @@ -0,0 +1,30 @@ +process GATK_GENOTYPE_GVCF { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '01:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(vcf_index), path(interval), val(index) + + output: + tuple val(sampleID), file("*.*vcf"), file("*.idx"), path(interval), val(index), emit: vcf_idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" GenotypeGVCFs \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_${index}_genotypedGVCFs.vcf \ + -L ${interval} + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_getsamplename.nf b/modules/gatk/gatk_getsamplename.nf new file mode 100644 index 00000000..8b41dbd5 --- /dev/null +++ b/modules/gatk/gatk_getsamplename.nf @@ -0,0 +1,25 @@ +process GATK_GETSAMPLENAME { + tag "$sampleID" + + cpus = 1 + memory = 1.GB + time = '00:05:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + input: + tuple val(sampleID), val(meta), file(bam), file(bai) + + output: + tuple val(sampleID), stdout, emit: sample_name + + script: + """ + gatk GetSampleName \ + -I ${bam} \ + -O sample_name.txt + + cat sample_name.txt + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_getsamplename_noMeta.nf b/modules/gatk/gatk_getsamplename_noMeta.nf new file mode 100644 index 00000000..28ced711 --- /dev/null +++ b/modules/gatk/gatk_getsamplename_noMeta.nf @@ -0,0 +1,25 @@ +process GATK_GETSAMPLENAME { + tag "$sampleID" + + cpus = 1 + memory = 1.GB + time = '00:05:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + input: + tuple val(sampleID), file(bam), file(bai) + + output: + tuple val(sampleID), stdout, emit: sample_name + + script: + """ + gatk GetSampleName \ + -I ${bam} \ + -O sample_name.txt + + cat sample_name.txt + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_haplotypecaller.nf b/modules/gatk/gatk_haplotypecaller.nf index 5506970c..a39b222f 100644 --- a/modules/gatk/gatk_haplotypecaller.nf +++ b/modules/gatk/gatk_haplotypecaller.nf @@ -4,6 +4,7 @@ process GATK_HAPLOTYPECALLER { cpus = 1 memory = 15.GB time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' @@ -18,7 +19,6 @@ process GATK_HAPLOTYPECALLER { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK Haplotype Caller Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -27,7 +27,7 @@ process GATK_HAPLOTYPECALLER { output_suffix='gvcf' } else{ - delta="--dbsnp ${params.dbSNP} " + delta="--dbsnp ${params.dbSNP} -stand-call-conf ${params.call_val}" output_suffix='vcf' } @@ -38,7 +38,6 @@ process GATK_HAPLOTYPECALLER { -I ${bam} \ -O ${sampleID}_variants_raw.${output_suffix} \ -L ${params.target_gatk} \ - -stand-call-conf ${params.call_val} \ ${params.ploidy_val} \ ${delta} \ """ diff --git a/modules/gatk/gatk_haplotypecaller_interval.nf b/modules/gatk/gatk_haplotypecaller_interval.nf index 83555298..ea410bf7 100644 --- 
a/modules/gatk/gatk_haplotypecaller_interval.nf +++ b/modules/gatk/gatk_haplotypecaller_interval.nf @@ -4,28 +4,42 @@ process GATK_HAPLOTYPECALLER_INTERVAL { cpus = 1 memory = 15.GB - time = '05:30:00' + time 12.hour + errorStrategy 'finish' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + input: tuple val(sampleID), file(bam), file(bai), val(chrom) + val(gvcf) output: - tuple val(sampleID), file("*.vcf"), emit: vcf + tuple val(sampleID), file("*.*vcf"), emit: vcf tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK Haplotype Caller Running on Chromosome ${chrom} for sample: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] + + if (gvcf=='gvcf'){ + delta="-ERC GVCF" + output_suffix='gvcf' + } + else{ + delta="-stand-call-conf ${params.call_val}" + output_suffix='vcf' + } + """ gatk --java-options "-Xmx${my_mem}G" HaplotypeCaller \ -R ${params.ref_fa} \ -I ${bam} \ - -O ${sampleID}_HaplotypeCaller_${chrom}.vcf \ + -O ${sampleID}_HaplotypeCaller_${chrom}.${output_suffix} \ -L ${chrom} \ - -stand-call-conf ${params.call_val} + ${delta} \ """ } \ No newline at end of file diff --git a/modules/gatk/gatk_haplotypecaller_sv_germline.nf b/modules/gatk/gatk_haplotypecaller_sv_germline.nf new file mode 100644 index 00000000..022fbb74 --- /dev/null +++ b/modules/gatk/gatk_haplotypecaller_sv_germline.nf @@ -0,0 +1,38 @@ +process GATK_HAPLOTYPECALLER_SV_GERMLINE { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy 
{(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(read_name), path(interval), val(index) + + output: + tuple val(sampleID), path("*.*vcf"), emit: vcf + tuple val(sampleID), path("*.idx"), emit: idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" HaplotypeCaller \ + -R ${params.ref_fa} \ + -I ${normal_bam} \ + -O ${sampleID}_${index}_variants_raw.gvcf \ + -L ${interval} \ + -XL ${params.excludeIntervalList} \ + -stand-call-conf ${params.call_val} \ + -G StandardAnnotation \ + -G StandardHCAnnotation \ + -G AS_StandardAnnotation \ + -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ + -ERC GVCF + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_indexfeaturefile.nf b/modules/gatk/gatk_indexfeaturefile.nf index 14c6f2eb..e5ff5a31 100644 --- a/modules/gatk/gatk_indexfeaturefile.nf +++ b/modules/gatk/gatk_indexfeaturefile.nf @@ -4,6 +4,7 @@ process GATK_INDEXFEATUREFILE { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' @@ -16,7 +17,6 @@ process GATK_INDEXFEATUREFILE { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK IndexFeatureFile Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_mergemutectstats.nf b/modules/gatk/gatk_mergemutectstats.nf new file mode 100644 index 00000000..4ad056cf --- /dev/null +++ b/modules/gatk/gatk_mergemutectstats.nf @@ -0,0 +1,29 @@ + process GATK_MERGEMUTECTSTATS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + input: + tuple val(sampleID), path(list) + + output: + tuple val(sampleID), file("*.stats"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + stats = list.collect { "-stats $it" }.join(' ') + + """ + gatk --java-options "-Xmx${my_mem}G" MergeMutectStats \ + ${stats} \ + -O ${sampleID}_merged.stats + """ + } \ No newline at end of file diff --git a/modules/gatk/gatk_mergevcf.nf b/modules/gatk/gatk_mergevcf.nf index ebd56d6e..4537674a 100644 --- a/modules/gatk/gatk_mergevcf.nf +++ b/modules/gatk/gatk_mergevcf.nf @@ -4,6 +4,7 @@ process GATK_MERGEVCF { cpus 1 memory 15.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' @@ -11,12 +12,12 @@ process GATK_MERGEVCF { input: tuple val(sampleID), file(snp_vcf), file(indel_vcf) + val(suffix) output: tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- GATK MergeVcfs Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ @@ -24,6 +25,6 @@ process GATK_MERGEVCF { -R ${params.ref_fa} \ -I ${snp_vcf} \ -I ${indel_vcf} \ - -O ${sampleID}_GATKcombined.vcf + -O ${sampleID}_${suffix}.vcf """ } \ No newline at end of file diff --git a/modules/gatk/gatk_mergevcf_list.nf b/modules/gatk/gatk_mergevcf_list.nf index a544c238..f3150197 100644 --- a/modules/gatk/gatk_mergevcf_list.nf +++ b/modules/gatk/gatk_mergevcf_list.nf @@ -4,6 +4,7 @@ process GATK_MERGEVCF_LIST { cpus 1 memory 10.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' @@ -17,7 +18,6 @@ process GATK_MERGEVCF_LIST { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK MergeVcfs Running on: ${sampleID} -----" // memory needs to be set explicitly String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/gatk/gatk_mutect2.nf b/modules/gatk/gatk_mutect2.nf new file mode 100644 index 00000000..b1076eb1 --- /dev/null +++ b/modules/gatk/gatk_mutect2.nf @@ -0,0 +1,37 @@ +process GATK_MUTECT2 { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time 15.hour + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gatk' }", pattern: "*_somatic.vcf.gz", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), path(interval), val(interval_index) + + output: + tuple val(sampleID), path("*_somatic.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('mutect2'), emit: vcf + tuple val(sampleID), path("*_somatic.vcf.gz.tbi"), emit: tbi + tuple val(sampleID), path("*.stats"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G -XX:ParallelGCThreads=${task.cpus}" Mutect2 \ + -R ${params.ref_fa} \ + -I ${tumor_bam} \ + -tumor ${tumor_name} \ + -I ${normal_bam} \ + -normal ${normal_name} \ + -L ${interval} \ + --native-pair-hmm-threads 4 \ + -O ${sampleID}_${interval_index}_somatic.vcf.gz + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_mutect2_tumorOnly.nf b/modules/gatk/gatk_mutect2_tumorOnly.nf new file mode 100644 index 00000000..b55e9c10 --- /dev/null +++ b/modules/gatk/gatk_mutect2_tumorOnly.nf @@ -0,0 +1,55 @@ +process GATK_MUTECT2 { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time 15.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*_somatic.vcf.gz", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(tumor_bam), file(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), file("*_somatic.vcf.gz"), file("*_somatic.vcf.gz.tbi"), file("*.stats"), emit: vcf_tbi_stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G -XX:ParallelGCThreads=${task.cpus}" Mutect2 \ + -R ${params.ref_fa} \ + -I ${tumor_bam} \ + --germline-resource ${params.gnomad_ref} \ + --panel-of-normals ${params.pon_ref} \ + --genotype-germline-sites true \ + --genotype-pon-sites true \ + --pileup-detection \ + --dont-use-soft-clipped-bases false \ + -L ${params.target_gatk} \ + --native-pair-hmm-threads 4 \ + --annotation QualByDepth \ + --annotation RMSMappingQuality \ + --annotation FisherStrand \ + --annotation MappingQualityRankSumTest \ + --annotation ReadPosRankSumTest \ + --min-base-quality-score 20 \ + -O ${sampleID}_mutect2_somatic.vcf.gz + """ +} + +/* +As of v4.1, there is no longer a need to specify the tumor sample name with -tumor. You need only specify the normal sample name with -normal, if you include a normal. + +Starting with v4.0.4.0, GATK recommends the default setting of --af-of-alleles-not-in-resource, which the tool dynamically adjusts for different modes. +tumor-only calling sets the default to 5e-8, tumor-normal calling sets it to 1e-6 and mitochondrial mode sets it to 4e-3. +For previous versions, the default was 0.001, the average heterozygosity of humans. 
+For other organisms, change --af-of-alleles-not-in-resource to 1/(ploidy*samples in resource). + +https://console.cloud.google.com/storage/browser/gatk-best-practices/somatic-hg38;tab=objects?prefix=&forceOnObjectsSortingFiltering=false +*/ \ No newline at end of file diff --git a/modules/gatk/gatk_selectvariants.nf b/modules/gatk/gatk_selectvariants.nf index 51fe520c..e6614882 100644 --- a/modules/gatk/gatk_selectvariants.nf +++ b/modules/gatk/gatk_selectvariants.nf @@ -4,21 +4,23 @@ process GATK_SELECTVARIANTS { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*filtered_dbsnpID.vcf", mode:'copy' input: tuple val(sampleID), file(vcf), file(idx) val(indel_snp) + val(suffix) output: tuple val(sampleID), file("*.vcf"), emit: vcf tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK Selectvariants Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ @@ -26,6 +28,6 @@ process GATK_SELECTVARIANTS { -R ${params.ref_fa} \ -V ${vcf} \ -select-type ${indel_snp} \ - -O ${sampleID}_selectedvariants_${indel_snp}.vcf + -O ${sampleID}_${suffix}.vcf """ } \ No newline at end of file diff --git a/modules/gatk/gatk_sortvcf_germline.nf b/modules/gatk/gatk_sortvcf_germline.nf new file mode 100644 index 00000000..b017944a --- /dev/null +++ b/modules/gatk/gatk_sortvcf_germline.nf @@ -0,0 +1,38 @@ +process GATK_SORTVCF_GERMLINE { + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '05:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + input: + tuple val(sampleID), path(list) + val(gvcf) + + output: + tuple val(sampleID), file("*.vcf"), file("*.idx"), emit: vcf_idx, optional: true + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = list.collect { "-I $it" }.join(' ') + + if (gvcf=='gvcf'){ + output_suffix='g.vcf' + } + else{ + output_suffix='vcf' + } + + """ + gatk --java-options "-Xmx${my_mem}G" SortVcf \ + -SD ${params.ref_fa_dict} \ + ${inputs} \ + -O ${sampleID}_merged.${output_suffix} + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_sortvcf_somatic_merge.nf b/modules/gatk/gatk_sortvcf_somatic_merge.nf new file mode 100644 index 00000000..0da71331 --- /dev/null +++ b/modules/gatk/gatk_sortvcf_somatic_merge.nf @@ -0,0 +1,30 @@ +process GATK_SORTVCF_SOMATIC { + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '05:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + input: + tuple val(sampleID), path(list), val(meta) + + output: + tuple val(sampleID), file("*.vcf"), file("*.idx"), val(meta), emit: vcf_idx, optional: true + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = list.collect { "-I $it" }.join(' ') + + """ + gatk --java-options "-Xmx${my_mem}G" SortVcf \ + -SD ${params.ref_fa_dict} \ + ${inputs} \ + -O ${sampleID}_mnv_final_filtered_merged.vcf + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_sortvcf_somatic_tools.nf b/modules/gatk/gatk_sortvcf_somatic_tools.nf new file mode 100644 index 00000000..a6ed490b --- /dev/null +++ b/modules/gatk/gatk_sortvcf_somatic_tools.nf @@ -0,0 +1,48 @@ +process GATK_SORTVCF { + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '05:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/callers' : 'lancet' }", pattern:"*_lancet_merged.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), path(list), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.tbi"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: vcf_tbi + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = list.collect { "-I $it" }.join(' ') + + if (tool == 'lancet_support') { + chrom_extract = (list =~ /\w+merged_(chr.+)_h.+/) + tool_name = "lancet_support_"+chrom_extract[0][1] + tool = chrom_extract[0][1] + // for final sort merge of lancet confirm, set 'tool_name' to include chrom. + // set tool to chrom. These steps are required for tuple build as input to final merge. + + } else { + tool_name = tool + } + + """ + gatk --java-options "-Xmx${my_mem}G" SortVcf \ + -SD ${params.ref_fa_dict} \ + ${inputs} \ + -O ${sampleID}_${tool_name}_merged.vcf + + bgzip -f -c ${sampleID}_${tool_name}_merged.vcf > ${sampleID}_${tool_name}_merged.vcf.gz + tabix ${sampleID}_${tool_name}_merged.vcf.gz + + """ +} + diff --git a/modules/gatk/gatk_variantfiltration.nf b/modules/gatk/gatk_variantfiltration.nf index 2b4ef3fc..4023d8e3 100644 --- a/modules/gatk/gatk_variantfiltration.nf +++ b/modules/gatk/gatk_variantfiltration.nf @@ -4,9 +4,12 @@ process GATK_VARIANTFILTRATION { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*SNP_INDEL_filtered_unannotated_final.vcf", mode:'copy' input: tuple val(sampleID), file(vcf), file(idx) @@ -17,7 +20,6 @@ process GATK_VARIANTFILTRATION { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK VariantFiltration Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] if (indel_snp == 'INDEL'){ @@ -30,14 +32,14 @@ process GATK_VARIANTFILTRATION { } if (indel_snp == 'BOTH'){ fs = '60.0' - output_suffix = 'snp_indel_filtered.vcf' + output_suffix = 'SNP_INDEL_filtered_unannotated_final.vcf' } """ gatk --java-options "-Xmx${my_mem}G" VariantFiltration \ -R ${params.ref_fa} \ -V ${vcf} \ - -O ${sampleID}_variantfiltration_${output_suffix} \ + -O ${sampleID}_${output_suffix} \ --cluster-window-size 10 \ --filter-name "LowCoverage" --filter-expression "DP < 25" \ --filter-name "VeryLowQual" --filter-expression "QUAL < 30.0" \ diff --git a/modules/gatk/gatk_variantfiltration_af.nf b/modules/gatk/gatk_variantfiltration_af.nf new file mode 100644 index 00000000..1ab1df69 --- /dev/null +++ b/modules/gatk/gatk_variantfiltration_af.nf @@ -0,0 +1,54 @@ + +process GATK_VARIANTFILTRATION_AF { + // This module is a port of the NYGC germline filtering scheme found at this site: + // https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/germline/germline.wdl?at=7.4.0 + + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx) + + output: + tuple val(sampleID), file("*haplotypecaller.gatk.af-gq-filtered.vcf"), emit: vcf + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + """ + ## Annotate FORMAT/AF + gatk --java-options "-Xmx${my_mem}G" VariantAnnotator \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_haplotypecaller.gatk.af.vcf.gz \ + -A AlleleFraction + + ## remove biallellic sites + zcat ${sampleID}_haplotypecaller.gatk.af.vcf.gz \ + | awk '(\$5 !~ ",")' \ + > ${sampleID}.biallellic.vcf + + ## Variant filtration + gatk --java-options "-Xmx${my_mem}G" VariantFiltration \ + -R ${params.ref_fa} \ + -V ${sampleID}.biallellic.vcf \ + -O ${sampleID}.haplotypecaller.af-gq-filtered.vcf.gz \ + --genotype-filter-name "AlleleFraction" \ + --genotype-filter-expression "(AF < 0.25 && AF > 0.0) || AF > 0.75" \ + --genotype-filter-name "GQ20" \ + --genotype-filter-expression "GQ < 20" + + ## filter with AF (deliver) + zcat ${sampleID}.haplotypecaller.af-gq-filtered.vcf.gz \ + | grep -v "AlleleFraction" \ + > ${sampleID}_haplotypecaller.gatk.af-gq-filtered.vcf + + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_variantfiltration_mutect2.nf b/modules/gatk/gatk_variantfiltration_mutect2.nf new file mode 100644 index 00000000..356b11ee --- /dev/null +++ b/modules/gatk/gatk_variantfiltration_mutect2.nf @@ -0,0 +1,46 @@ +process GATK_VARIANTFILTRATION { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx) + val(indel_snp) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + tuple val(sampleID), file("*.idx"), emit: idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + if (indel_snp == 'INDEL'){ + fs='200.0' + output_suffix = 'INDEL_filtered.vcf' + } + if (indel_snp =='SNP'){ + fs ='60.0' + output_suffix = 'SNP_filtered.vcf' + } + if (indel_snp == 'BOTH'){ + fs = '60.0' + output_suffix = 'snp_indel_filtered.vcf' + } + + """ + gatk --java-options "-Xmx${my_mem}G" VariantFiltration \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_${output_suffix} \ + --cluster-window-size 10 \ + --filter-name "LowCoverage" --filter-expression "DP < 25" \ + --filter-name "StrandBias" --filter-expression "FS > ${fs}" + """ +} \ No newline at end of file diff --git a/modules/gridss/gridss_assemble.nf b/modules/gridss/gridss_assemble.nf new file mode 100644 index 00000000..d3611597 --- /dev/null +++ b/modules/gridss/gridss_assemble.nf @@ -0,0 +1,44 @@ +process GRIDSS_ASSEMBLE { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gridss:2.13.2-2_ln' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), val(gridss_preprocessed) + + output: + tuple val(sampleID), path('gridss_assemble/'), emit: gridss_assembly + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4]+'g' + + output_dir = 'gridss_assemble/' + + """ + # https://github.com/umccr/gridss-purple-linx-nf + # Create shadow directory with file symlinks of GRIDSS 'workingdir' to prevent NF cache invalidation (resume related) + # NOTE: for reasons that elude me, NF doesn't always stage in the workingdir; remove if it is present + mkdir -p "${output_dir}/work/" + lndir \$(readlink -f "${gridss_preprocessed}/") "${output_dir}/work" + if [[ -L "${gridss_preprocessed.name}" ]]; then + rm "${gridss_preprocessed}" + fi + + gridss \ + --jvmheap "${my_mem}" \ + --steps assemble \ + --reference "${params.combined_reference_set}" \ + --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \ + --threads ${task.cpus} \ + --workingdir "${output_dir}/work/" \ + --assembly ${output_dir}/${sampleID}.gridssassembly.bam \ + --picardoptions VALIDATION_STRINGENCY=LENIENT \ + ${normal_bam} ${tumor_bam} + """ +} \ No newline at end of file diff --git a/modules/gridss/gridss_calling.nf b/modules/gridss/gridss_calling.nf new file mode 100644 index 00000000..4b191c00 --- /dev/null +++ b/modules/gridss/gridss_calling.nf @@ -0,0 +1,47 @@ +process GRIDSS_CALLING { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gridss:2.13.2-2_ln' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gridss' }", pattern: "*_gridss_sv.vcf.gz", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), val(gridss_assembled) + + output: + tuple val(sampleID), path('*_gridss_sv.vcf.gz'), val(meta), val(normal_name), val(tumor_name), emit: gridss_vcf + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4]+'g' + + output_dir = 'gridss_call/' + + """ + # https://github.com/umccr/gridss-purple-linx-nf + # Create shadow directory with file symlinks of GRIDSS 'workingdir' to prevent NF cache invalidation (resume related) + # NOTE: for reasons that elude me, NF doesn't always stage in the workingdir; remove if it is present + mkdir -p "${output_dir}" + lndir \$(readlink -f "${gridss_assembled}/") "${output_dir}/" + if [[ -L "${gridss_assembled.name}" ]]; then + rm "${gridss_assembled}" + fi + + gridss \ + --jvmheap "${my_mem}" \ + --steps call \ + --reference "${params.combined_reference_set}" \ + --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \ + --threads ${task.cpus} \ + --workingdir "${output_dir}/work/" \ + --assembly "${output_dir}/${sampleID}.gridssassembly.bam" \ + --output "${sampleID}_gridss_sv.vcf.gz" \ + --picardoptions VALIDATION_STRINGENCY=LENIENT \ + ${normal_bam} ${tumor_bam} + """ +} \ No newline at end of file diff --git a/modules/gridss/gridss_chrom_filter.nf b/modules/gridss/gridss_chrom_filter.nf new file mode 100644 index 00000000..88986086 --- /dev/null +++ b/modules/gridss/gridss_chrom_filter.nf @@ -0,0 +1,31 @@ 
+process GRIDSS_CHROM_FILTER { + tag "$sampleID" + + cpus = 1 + memory = 1.GB + time = '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/internal_tools:v1.0' + + stageInMode = 'copy' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gridss' }", pattern: "*_gridss_sv_unfiltered_chroms.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), path(vcf), val(meta), val(normal_name), val(tumor_name) + val(chroms) + + output: + tuple val(sampleID), path('*_gridss_sv_unfiltered_chroms.vcf'), val(meta), val(normal_name), val(tumor_name), emit: gridss_chrom_vcf + + script: + chrom_list = chroms.collect { "$it" }.join(' ') + + """ + python ${projectDir}/bin/pta/filter_vcf.py \ + --vcf-file ${vcf} \ + --output ${sampleID}_gridss_sv_unfiltered_chroms.vcf \ + --chroms ${chrom_list} + """ +} diff --git a/modules/gridss/gridss_preprocess.nf b/modules/gridss/gridss_preprocess.nf new file mode 100644 index 00000000..3d8449f3 --- /dev/null +++ b/modules/gridss/gridss_preprocess.nf @@ -0,0 +1,32 @@ +process GRIDSS_PREPROCESS { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gridss:2.13.2-2_ln' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path('gridss_preprocess/'), emit: gridss_preproc + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4]+'g' + """ + # https://github.com/umccr/gridss-purple-linx-nf + gridss \ + --jvmheap "${my_mem}" \ + --steps preprocess \ + --reference "${params.combined_reference_set}" \ + --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \ + --threads ${task.cpus} \ + --workingdir gridss_preprocess/ \ + --picardoptions VALIDATION_STRINGENCY=LENIENT \ + ${normal_bam} ${tumor_bam} + """ +} \ No newline at end of file diff --git a/modules/gridss/gripss_somatic_filter.nf b/modules/gridss/gripss_somatic_filter.nf new file mode 100644 index 00000000..bfb9d5a9 --- /dev/null +++ b/modules/gridss/gripss_somatic_filter.nf @@ -0,0 +1,55 @@ + +process GRIPSS_SOMATIC_FILTER { + tag "$sampleID" + + cpus = 1 + memory = 5.GB + time = '01:00:00' + errorStrategy 'ignore' + + container 'quay.io/biocontainers/hmftools-gripss:2.3.2--hdfd78af_0' + + stageInMode = 'copy' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gridss' }", pattern: "*gripss.filtered.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), path(vcf), val(meta), val(normal_name), val(tumor_name) + + + output: + tuple val(sampleID), path('*gripss.filtered.vcf.gz'),path('*gripss.filtered.vcf.gz.tbi'), val(meta), val(normal_name), val(tumor_name), val('gridss'), emit: gripss_filtered_bgz + //note: while this is "GRIPSS" filtering. 
+ // GRIDSS was the caller and downstream + // scripts expect "gridss" as the tool name. + tuple val(sampleID), path('*.gripss.vcf.gz'), path('*.gripss.vcf.gz.tbi'), val(meta), val(normal_name), val(tumor_name), val('gridss'), emit: gripss_all_bgz + + script: + """ + gripss -Xmx5g \ + -sample ${tumor_name} \ + -reference ${normal_name} \ + -ref_genome_version 38 \ + -ref_genome ${params.ref_fa} \ + -pon_sgl_file ${params.gripss_pon}/sgl_pon.38.bed \ + -pon_sv_file ${params.gripss_pon}/sv_pon.38.bedpe \ + -known_hotspot_file ${params.gripss_pon}/known_fusions.38.bedpe \ + -repeat_mask_file ${params.gripss_pon}/repeat_mask_data.38.fa.gz \ + -vcf ${vcf} \ + -output_dir . + + mv ${tumor_name}.gripss.filtered.vcf.gz ${sampleID}.gripss.filtered.vcf.gz + mv ${tumor_name}.gripss.filtered.vcf.gz.tbi ${sampleID}.gripss.filtered.vcf.gz.tbi + mv ${tumor_name}.gripss.vcf.gz ${sampleID}.gripss.vcf.gz + mv ${tumor_name}.gripss.vcf.gz.tbi ${sampleID}.gripss.vcf.gz.tbi + + """ + + stub: + """ + touch ${sampleID}.gripss.filtered.vcf.gz + touch ${sampleID}.gripss.filtered.vcf.gz.tbi + touch ${sampleID}.gripss.vcf.gz + touch ${sampleID}.gripss.vcf.gz.tbi + """ +} diff --git a/modules/homer/annotate_boolean_peaks.nf b/modules/homer/annotate_boolean_peaks.nf new file mode 100644 index 00000000..88badbaf --- /dev/null +++ b/modules/homer/annotate_boolean_peaks.nf @@ -0,0 +1,22 @@ +process ANNOTATE_BOOLEAN_PEAKS { + tag "${antibody}" + + cpus 1 + memory 5.GB + time '04:00:00' + + container 'ubuntu:20.04' + + input: + tuple val(antibody), path(boolean_txt), path(homer_peaks) + + output: + path '*.boolean.annotatePeaks.txt', emit: annotate_peaks_txt + + script: + prefix="\$(echo ${boolean_txt} | sed 's/.boolean.txt//g')" + """ + cut -f2- ${homer_peaks} | awk 'NR==1; NR > 1 {print \$0 | "sort -T '.' 
-k1,1 -k2,2n"}' | cut -f6- > tmp.txt + paste ${boolean_txt} tmp.txt > ${prefix}.boolean.annotatePeaks.txt + """ +} diff --git a/modules/homer/homer_annotatepeaks.nf b/modules/homer/homer_annotatepeaks.nf new file mode 100644 index 00000000..df2c7e68 --- /dev/null +++ b/modules/homer/homer_annotatepeaks.nf @@ -0,0 +1,40 @@ +process HOMER_ANNOTATEPEAKS { + tag "${run_tag}" + + cpus 2 + memory 10.GB + time '10:00:00' + + publishDir { + def type = "${ip}" ? "${'immuno_precip_samples/'+ip+'_vs_'+control+'/macs2'}" : "${'consensusCalling_'+antibody+'/macs2'}" + "${params.pubdir}/${ params.organize_by=='sample' ? type : 'macs2'}" + }, pattern: "*annotatePeaks.txt", mode: 'copy' + + container 'quay.io/biocontainers/homer:4.11--pl526hc9558a2_3' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), file(peak) + file(fasta) + file(gtf) + + when: + params.macs_gsize && !params.skip_peak_annotation + + output: + tuple val(tuple_tag), path("*annotatePeaks.txt"), emit: txt + + script: + prefix = peak =~ /bed/ ? "${antibody}.consensus_peaks" : "${ip}_peaks" + run_tag = ip ? "${ip} vs ${control}" : "${antibody}" + tuple_tag = ip ? ip : antibody + + """ + annotatePeaks.pl \\ + $peak \\ + $fasta \\ + -gid \\ + -gtf $gtf \\ + -cpu $task.cpus \\ + > ${prefix}.annotatePeaks.txt + """ +} diff --git a/modules/homer/plot_homer_annotatepeaks.nf b/modules/homer/plot_homer_annotatepeaks.nf new file mode 100644 index 00000000..eeeef450 --- /dev/null +++ b/modules/homer/plot_homer_annotatepeaks.nf @@ -0,0 +1,37 @@ +process PLOT_HOMER_ANNOTATEPEAKS { + + cpus 2 + memory 10.GB + time '10:00:00' + + + container 'quay.io/biocontainers/mulled-v2-ad9dd5f398966bf899ae05f8e7c54d0fb10cdfa7:05678da05b8e5a7a5130e90a9f9a6c585b965afa-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
'immuno_precip_samples/cross_sample_plots' : 'homer' }", pattern: "*.{pdf,txt}", mode: 'copy' + + input: + file(annos) + file(mqc_header) + val suffix //_peaks.annotatePeaks.txt + + when: + params.macs_gsize && !params.skip_peak_annotation && !params.skip_peak_qc + + output: + path '*.txt' , emit: txt + path '*.pdf' , emit: pdf + path '*.tsv' , emit: tsv + + script: // This script was bundled within the nf-core/chipseq/bin/ directory + def prefix = "macs_annotatepeaks" + """ + ${projectDir}/bin/chipseq/plot_homer_annotatepeaks.r \\ + -i ${annos.join(',')} \\ + -s ${annos.join(',').replaceAll("${suffix}","")} \\ + -p $prefix \\ + -o ./ + + find ./ -type f -name "*summary.txt" -exec cat {} \\; | cat $mqc_header - > ${prefix}.summary_mqc.tsv + + """ +} diff --git a/modules/illumina/manta.nf b/modules/illumina/manta.nf new file mode 100644 index 00000000..3d4550dd --- /dev/null +++ b/modules/illumina/manta.nf @@ -0,0 +1,50 @@ +process MANTA { + tag "$sampleID" + + cpus = 4 + memory { normal_bam.size() < 60.GB ? 12.GB : 24.GB } + time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/manta:v1.5.0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"$sampleID" + '/callers' : 'manta' }", pattern:"*.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path("*candidateSmallIndels.vcf.gz"), path("*candidateSmallIndels.vcf.gz.tbi"), emit: manta_smallindel_vcf_tbi + tuple val(sampleID), path("*diploidSV.vcf.gz"), path("*diploidSV.vcf.gz.tbi"), emit: manta_diploidsv_tbi + tuple val(sampleID), path("*somaticSV.vcf.gz"), path("*somaticSV.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('manta'), emit: manta_somaticsv_tbi + tuple val(sampleID), path("*candidateSV.vcf.gz"), path("*candidateSV.vcf.gz.tbi"), emit: manta_candidatesv_tbi + + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + # configure manta + python /usr/local/bin/configManta.py \ + --normalBam ${normal_bam} \ + --tumorBam ${tumor_bam} \ + --referenceFasta ${params.ref_fa} \ + --callRegions ${params.callRegions} \ + --runDir ${sampleID} + + # execute manta + python ${sampleID}/runWorkflow.py -j ${task.cpus} \ + --mode local \ + --memGb ${my_mem} + + mv ${sampleID}/results/variants/candidateSmallIndels.vcf.gz ${sampleID}_manta_candidateSmallIndels.vcf.gz + mv ${sampleID}/results/variants/candidateSmallIndels.vcf.gz.tbi ${sampleID}_manta_candidateSmallIndels.vcf.gz.tbi + mv ${sampleID}/results/variants/diploidSV.vcf.gz ${sampleID}_manta_diploidSV.vcf.gz + mv ${sampleID}/results/variants/diploidSV.vcf.gz.tbi ${sampleID}_manta_diploidSV.vcf.gz.tbi + mv ${sampleID}/results/variants/somaticSV.vcf.gz ${sampleID}_manta_somaticSV.vcf.gz + mv ${sampleID}/results/variants/somaticSV.vcf.gz.tbi ${sampleID}_manta_somaticSV.vcf.gz.tbi + mv ${sampleID}/results/variants/candidateSV.vcf.gz ${sampleID}_manta_candidateSV.vcf.gz + mv ${sampleID}/results/variants/candidateSV.vcf.gz.tbi ${sampleID}_manta_candidateSV.vcf.gz.tbi + """ +} \ No newline at end of file diff 
--git a/modules/illumina/strelka2.nf b/modules/illumina/strelka2.nf new file mode 100644 index 00000000..1a444e61 --- /dev/null +++ b/modules/illumina/strelka2.nf @@ -0,0 +1,48 @@ +process STRELKA2 { + tag "$sampleID" + + cpus = 4 + memory { normal_bam.size() < 60.GB ? 8.GB : 24.GB } + time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/strelka2:v2.9.3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$sampleID" + '/callers' : 'strelka' }", pattern:"*.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), path(candidateSmallIndels), path(candidateSmallIndels_tbi) + + output: + tuple val(sampleID), path("*indels.vcf.gz"), path("*indels.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('strelka2_indel'), emit: strelka_indel_vcf_tbi + tuple val(sampleID), path("*snvs.vcf.gz"), path("*snvs.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('strelka2_sv'), emit: strelka_snv_vcf_tbi + + script: + + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + # configure strelka + python /usr/local/bin/configureStrelkaSomaticWorkflow.py \ + --normalBam ${normal_bam} \ + --tumorBam ${tumor_bam} \ + --callRegions ${params.callRegions} \ + --referenceFasta ${params.ref_fa} \ + --indelCandidates ${candidateSmallIndels} \ + --config ${params.strelka_config} \ + --runDir ${sampleID} + + # execute strelka + python ${sampleID}/runWorkflow.py \ + --mode local \ + --job ${task.cpus} \ + --memGb ${my_mem} + + mv 
${sampleID}/results/variants/somatic.snvs.vcf.gz ${sampleID}_strelka_somatic.snvs.vcf.gz + mv ${sampleID}/results/variants/somatic.snvs.vcf.gz.tbi ${sampleID}_strelka_somatic.snvs.vcf.gz.tbi + mv ${sampleID}/results/variants/somatic.indels.vcf.gz ${sampleID}_strelka_somatic.indels.vcf.gz + mv ${sampleID}/results/variants/somatic.indels.vcf.gz.tbi ${sampleID}_strelka_somatic.indels.vcf.gz.tbi + + """ +} \ No newline at end of file diff --git a/modules/jaffa/jaffa.nf b/modules/jaffa/jaffa.nf new file mode 100644 index 00000000..68c81345 --- /dev/null +++ b/modules/jaffa/jaffa.nf @@ -0,0 +1,41 @@ +process JAFFA { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 10.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/jaffa:d1587c9' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/fusions': 'jaffa' }", pattern: "*_jaffa_fusions.csv", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'jaffa' }", pattern: "*_jaffa_fusions.fasta", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*_jaffa_fusions.csv"), emit: jaffa_fusions + tuple val(sampleID), path("*_jaffa_fusions.fasta"), emit: jaffa_fasta + + script: + ext = reads[0].getExtension() + + """ + + bpipe run -v \ + -n ${task.cpus} \ + -p fastqInputFormat='%_*.${ext}' \ + -p refBase=${params.jaffa_ref_dir} \ + -p genome=hg38 \ + -p annotation=genCode22 \ + /opt/JAFFA/JAFFA_direct.groovy \ + ${reads[0]} \ + ${reads[1]} + + mv jaffa_results.csv ${sampleID}_jaffa_fusions.csv + mv jaffa_results.fasta ${sampleID}_jaffa_fusions.fasta ; + + """ +} \ No newline at end of file diff --git a/modules/kallisto/kallisto_insert_size.nf b/modules/kallisto/kallisto_insert_size.nf new file mode 100644 index 00000000..a26a2af3 --- /dev/null +++ b/modules/kallisto/kallisto_insert_size.nf @@ -0,0 +1,23 @@ +process KALLISTO_INSERT_SIZE { + tag "$sampleID" + + cpus 1 + memory 1.GB + time '00:05:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + cache 'lenient' + + container 'quay.io/biocontainers/pizzly:0.37.3--h470a237_3' + + input: + tuple val(sampleID), val(kallisto_abundance) + + output: + tuple val(sampleID), path('insert_size.txt'), emit: kallisto_insert_size + + script: + """ + python ${projectDir}/bin/rna_fusion/compute_insert_size.py ${kallisto_abundance} > insert_size.txt + """ +} diff --git a/modules/kallisto/kallisto_quant.nf b/modules/kallisto/kallisto_quant.nf new file mode 100644 index 00000000..508f239f --- /dev/null +++ b/modules/kallisto/kallisto_quant.nf @@ -0,0 +1,33 @@ +process KALLISTO_QUANT { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/kallisto:0.48.0--h15996b6_2' + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*kallisto_quant.fusions.txt"), emit: kallisto_fusions + tuple val(sampleID), path("*abundance.h5"), emit: kallisto_abundance + + script: + """ + kallisto quant \ + -t $task.cpus \ + -i ${params.kallisto_index} \ + --fusion \ + -o . 
\ + ${reads} + mv fusion.txt ${sampleID}.kallisto_quant.fusions.txt + mv abundance.h5 ${sampleID}.abundance.h5 + """ +} +// NOTE: +// Index built with command: +// singularity run /projects/omics_share/meta/containers/quay.io-biocontainers-kallisto-0.48.0--h15996b6_2.img kallisto index -k 31 -i Homo_sapiens.GRCh38.102.cdna.all.kallisto-0.48.0.index Homo_sapiens.GRCh38.102.cdna.all.fa.gz diff --git a/modules/lumpy_sv/lumpy_sv.nf b/modules/lumpy_sv/lumpy_sv.nf new file mode 100644 index 00000000..ac03d439 --- /dev/null +++ b/modules/lumpy_sv/lumpy_sv.nf @@ -0,0 +1,25 @@ +process LUMPY_SV { + tag "$sampleID" + + cpus = 1 + memory { normal_bam.size() < 60.GB ? 8.GB : 24.GB } + time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/lumpy-sv:0.3.1--hdfd78af_3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/callers' : 'lumpy-sv' }", pattern:"*.vcf", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path("*_lumpy_sv.vcf"), val(meta), val(normal_name), val(tumor_name), val('lumpy'), emit: lumpy_sv_vcf + + script: + """ + lumpyexpress \ + -B ${tumor_bam},${normal_bam} \ + -o ${sampleID}_lumpy_sv.vcf + """ +} \ No newline at end of file diff --git a/modules/macs2/macs2_consensus.nf b/modules/macs2/macs2_consensus.nf new file mode 100644 index 00000000..d25c2cb5 --- /dev/null +++ b/modules/macs2/macs2_consensus.nf @@ -0,0 +1,67 @@ +/* + * Consensus peaks across samples, create boolean filtering file, SAF file for featureCounts + */ +process MACS2_CONSENSUS { + tag "${antibody}" + + cpus 8 + memory 10.GB + time '10:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'consensusCalling_'+antibody+'/macs2' : 'macs2' }", pattern: "*_peaks.*", mode: 'copy' + + container 'quay.io/biocontainers/mulled-v2-2f48cc59b03027e31ead6d383fe1b8057785dd24:5d182f583f4696f4c4d9f3be93052811b383341f-0' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), path(peaks) + + output: + tuple val(antibody), val(replicatesExist), val(multipleGroups), path('*.bed') , emit: ano + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(''), val(''), path('*.bed') , emit: bed + tuple val(antibody), path('*.saf') , emit: saf + tuple val(antibody), path("*.pdf") , emit: pdf + tuple val(antibody), path("*.antibody.txt") , emit: txt + tuple val(antibody), path("*.boolean.txt") , emit: boolean_txt + tuple val(antibody), path("*.intersect.txt"), emit: intersect_txt + + when: + params.macs_gsize && (replicatesExist || multipleGroups) && !params.skip_consensus_peaks + + script: + peak_type = params.narrow_peak ? 
'narrowPeak' : 'broadPeak' + prefix = "${antibody}.consensus_peaks" + mergecols = params.narrow_peak ? (2..10).join(',') : (2..9).join(',') + collapsecols = params.narrow_peak ? (['collapse']*9).join(',') : (['collapse']*8).join(',') + expandparam = params.narrow_peak ? '--is_narrow_peak' : '' + """ + sort -T '.' -k1,1 -k2,2n ${peaks.collect{it.toString()}.sort().join(' ')} \\ + | mergeBed -c $mergecols -o $collapsecols > ${prefix}.txt + + ${projectDir}/bin/chipseq/macs2_merged_expand.py \\ + ${prefix}.txt \\ + ${peaks.collect{it.toString()}.sort().join(',').replaceAll("_peaks.${peak_type}","")} \\ + ${prefix}.boolean.txt \\ + --min_replicates $params.min_reps_consensus \\ + $expandparam + + awk -v FS='\t' -v OFS='\t' 'FNR > 1 { print \$1, \$2, \$3, \$4, "0", "+" }' ${prefix}.boolean.txt > ${prefix}.bed + + echo -e "GeneID\tChr\tStart\tEnd\tStrand" > ${prefix}.saf + awk -v FS='\t' -v OFS='\t' 'FNR > 1 { print \$4, \$1, \$2, \$3, "+" }' ${prefix}.boolean.txt >> ${prefix}.saf + + ${projectDir}/bin/chipseq/plot_peak_intersect.r -i ${prefix}.boolean.intersect.txt -o ${prefix}.boolean.intersect.plot.pdf + + echo "${prefix}.bed\t${antibody}/${prefix}.bed" > ${prefix}.antibody.txt + + """ + +} + +/* +IGV steps removed, re-add if IGV is needed: + + OUTPUT: tuple val(antibody), path("*.bed.igv.txt"), emit: igv_txt + + + SCRIPT: find * -type f -name "${prefix}.bed" -exec echo -e "macs2/"{}"\\t0,0,0" \\; > ${prefix}.bed.igv.txt +*/ diff --git a/modules/macs2/macs2_peak_calling.nf b/modules/macs2/macs2_peak_calling.nf index 857e5615..6d1c84d2 100644 --- a/modules/macs2/macs2_peak_calling.nf +++ b/modules/macs2/macs2_peak_calling.nf @@ -4,13 +4,14 @@ process PEAK_CALLING { cpus 2 memory 10.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/macs2:' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'macs2' }", pattern: "*_peaks.narrowPeak", mode: 'copy' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'macs2' }", pattern: "*_summits.bed", mode: 'copy' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'macs2' }", pattern: "*.log", mode: 'copy' - container 'quay.io/biocontainers/macs2:' - - + input: tuple val(sampleID), file(processed_bams) @@ -21,7 +22,6 @@ process PEAK_CALLING { script: - log.info "----- Performing Peak Calling on on ${sampleID} -----" String genome = params.gen_org == 'human' ? 'hs' : 'mm' """ macs2 callpeak \ diff --git a/modules/macs2/macs2_peak_calling_chipseq.nf b/modules/macs2/macs2_peak_calling_chipseq.nf new file mode 100644 index 00000000..78025734 --- /dev/null +++ b/modules/macs2/macs2_peak_calling_chipseq.nf @@ -0,0 +1,47 @@ +process PEAK_CALLING_CHIPSEQ { + tag "${ip} vs ${control}" + + cpus 2 + memory 10.GB + time '10:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
'immuno_precip_samples/'+ip+'_vs_'+control+'/macs2' : 'macs2' }", pattern: "*_peaks.*", mode: 'copy' + + container 'quay.io/biocontainers/macs2:' + + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), file(ipbam), val(control), file(controlbam), file(ipflagstat) + file(peak_count_header) + file(frip_score_header) + + + output: + tuple val(antibody), val(replicatesExist), val(multipleGroups), file("*.{narrowPeak,broadPeak}"), emit: arm_peak + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), file("*.{narrowPeak,broadPeak}"), emit: ip_control_peak + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), emit: ip_control + tuple val(ip), file("*.{narrowPeak,broadPeak}"), emit: peak + tuple val(ip), file("*_peaks.gappedPeak"), emit: gapped, optional: true + tuple val(ip), file("*_peaks.xls"), emit: xls + + + script: + broad = params.narrow_peak ? '' : "--broad --broad-cutoff ${params.broad_cutoff}" + format = params.read_type == 'SE' ? 'BAM' : 'BAMPE' + pileup = params.save_macs_pileup ? '-B --SPMR' : '' + fdr = params.macs_fdr ? "--qvalue ${params.macs_fdr}" : '' + pvalue = params.macs_pvalue ? "--pvalue ${params.macs_pvalue}" : '' + """ + macs2 callpeak \\ + -t ${ipbam[0]} \\ + -c ${controlbam[0]} \\ + $broad \\ + -f $format \\ + -g $params.macs_gsize \\ + -n $ip \\ + $pileup \\ + $fdr \\ + $pvalue \\ + --keep-dup all + """ +} diff --git a/modules/macs2/macs2_peak_coverage.nf b/modules/macs2/macs2_peak_coverage.nf index 8d632fbd..8bb39381 100644 --- a/modules/macs2/macs2_peak_coverage.nf +++ b/modules/macs2/macs2_peak_coverage.nf @@ -4,6 +4,7 @@ process PEAK_COVERAGE { cpus = 1 memory 1.GB time '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/macs2:' @@ -14,7 +15,6 @@ process PEAK_COVERAGE { tuple val(sampleID), file("*_peaks.narrowPeak.saf") shell: - log.info "----- Get coverage in each peak on ${sampleID} -----" ''' awk 'OFS="\\t" {print $1"."$2"."$3, $1, $2, $3, "."}' !{narrow_peaks} \ > !{sampleID}_peaks.narrowPeak.saf diff --git a/modules/macs2/plot_macs2_qc.nf b/modules/macs2/plot_macs2_qc.nf new file mode 100644 index 00000000..35f952ef --- /dev/null +++ b/modules/macs2/plot_macs2_qc.nf @@ -0,0 +1,30 @@ +process PLOT_MACS2_QC { + + cpus 2 + memory 10.GB + time '10:00:00' + + container 'quay.io/biocontainers/mulled-v2-ad9dd5f398966bf899ae05f8e7c54d0fb10cdfa7:05678da05b8e5a7a5130e90a9f9a6c585b965afa-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/cross_sample_plots' : 'macs2' }", mode: 'copy' + + input: + file(peaks) + + when: + params.macs_gsize && !params.skip_peak_annotation && !params.skip_peak_qc + + output: + path '*.txt' , emit: txt + path '*.pdf' , emit: pdf + + script: // This script was bundled within the nf-core/chipseq/bin/ directory + def peak_type = params.narrow_peak ? 'narrowPeak' : 'broadPeak' + """ + ${projectDir}/bin/chipseq/plot_macs_qc.r \\ + -i ${peaks.join(',')} \\ + -s ${peaks.join(',').replaceAll("_peaks.${peak_type}","")} \\ + -o ./ \\ + -p macs_peak + """ +} diff --git a/modules/msisensor2/msisensor2.nf b/modules/msisensor2/msisensor2.nf new file mode 100644 index 00000000..f1925912 --- /dev/null +++ b/modules/msisensor2/msisensor2.nf @@ -0,0 +1,31 @@ +process MSISENSOR2_MSI { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/msisensor2:0.1--hd03093a_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/msi' : 'msisensor2' }", pattern:"*msisensor", mode:'copy' + + input: + tuple val(sampleID), val(meta), file(bam), file(bai), val(seqID) + + output: + tuple val(sampleID), file("*msisensor"), emit: msisensor + file("${sampleID}_msisensor_dis") + file("${sampleID}_msisensor_somatic") + + script: + + """ + mkdir models + + cp -r ${params.msisensor_model} models + + msisensor2 msi -M models/models_hg38 -t ${bam} -o ${sampleID}_msisensor + + """ +} diff --git a/modules/msisensor2/msisensor2_tumorOnly.nf b/modules/msisensor2/msisensor2_tumorOnly.nf new file mode 100644 index 00000000..ec00a7f2 --- /dev/null +++ b/modules/msisensor2/msisensor2_tumorOnly.nf @@ -0,0 +1,31 @@ +process MSISENSOR2_MSI { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/msisensor2:0.1--hd03093a_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/msi' : 'msisensor2' }", pattern:"*msisensor", mode:'copy' + + input: + tuple val(sampleID), file(bam), file(bai) + + output: + tuple val(sampleID), file("*msisensor"), emit: msisensor + file("${sampleID}_msisensor_dis") + file("${sampleID}_msisensor_somatic") + + script: + + """ + mkdir models + + cp -r ${params.msisensor_model} models + + msisensor2 msi -M models/models_hg38 -t ${bam} -o ${sampleID}_msisensor + + """ +} diff --git a/modules/multiqc/multiqc.nf b/modules/multiqc/multiqc.nf index afff31ee..dadee379 100644 --- a/modules/multiqc/multiqc.nf +++ b/modules/multiqc/multiqc.nf @@ -1,7 +1,9 @@ process MULTIQC { + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - container 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' - + container 'quay.io/jaxcompsci/multiqc:v1.15.dev0' + //quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0 + publishDir "${params.pubdir}/multiqc", pattern: "*multiqc_report.html", mode:'copy' publishDir "${params.pubdir}/multiqc", pattern: "*_data", mode:'copy' @@ -14,9 +16,9 @@ process MULTIQC { path "*_plots" , optional:true, emit: plots script: - + def custom_config = params.multiqc_config ? " --config $params.multiqc_config " : '' """ - multiqc . + multiqc . 
${custom_config} """ -} +} \ No newline at end of file diff --git a/modules/multiqc/multiqc_custom_phantompeakqualtools.nf b/modules/multiqc/multiqc_custom_phantompeakqualtools.nf new file mode 100644 index 00000000..8b1a72fa --- /dev/null +++ b/modules/multiqc/multiqc_custom_phantompeakqualtools.nf @@ -0,0 +1,25 @@ +process MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS { + tag "$sampleID" + + container 'quay.io/biocontainers/r-base:3.5.1' + + input: + tuple val(sampleID), file(spp), file(rdata) + file(nsc_header) + file(rsc_header) + file(correlation_header) + + output: + tuple val(sampleID), file("*.spp_nsc_mqc.tsv") , emit: nsc + tuple val(sampleID), file("*.spp_rsc_mqc.tsv") , emit: rsc + tuple val(sampleID), file("*.spp_correlation_mqc.tsv"), emit: correlation + + script: + """ + cp $correlation_header ${sampleID}.spp_correlation_mqc.tsv + Rscript --max-ppsize=500000 -e "load('$rdata'); write.table(crosscorr\\\$cross.correlation, file=\\"${sampleID}.spp_correlation_mqc.tsv\\", sep=",", quote=FALSE, row.names=FALSE, col.names=FALSE,append=TRUE)" + + awk -v OFS='\t' '{print "${sampleID}", \$9}' $spp | cat $nsc_header - > ${sampleID}.spp_nsc_mqc.tsv + awk -v OFS='\t' '{print "${sampleID}", \$10}' $spp | cat $rsc_header - > ${sampleID}.spp_rsc_mqc.tsv + """ +} diff --git a/modules/novocraft/novosort.nf b/modules/novocraft/novosort.nf new file mode 100644 index 00000000..981db78f --- /dev/null +++ b/modules/novocraft/novosort.nf @@ -0,0 +1,24 @@ +process NOVOSORT_markDuplicates { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/novosort:lastest' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'novosort' }", pattern:"*_fixed_mate_dup_marked.bam", mode:'copy' + + input: + tuple val(sampleID), file(fixed_mate_bam) + + output: + tuple val(sampleID), file("*_fixed_mate_dup_marked.bam"), emit: fixed_mate_dup_marked_bam + + script: + + """ + novosort -markduplicates -t . -m 8G \ + ${fixed_mate_bam} > ${sampleID}_fixed_mate_dup_marked.bam + """ +} diff --git a/modules/nygc-short-alignment-marking/short_alignment_marking.nf b/modules/nygc-short-alignment-marking/short_alignment_marking.nf new file mode 100644 index 00000000..3701e6e0 --- /dev/null +++ b/modules/nygc-short-alignment-marking/short_alignment_marking.nf @@ -0,0 +1,36 @@ +process SHORT_ALIGNMENT_MARKING { + tag "$sampleID" + + cpus 1 + memory 24.GB + time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'short_alignment_marking' }", pattern:"*.marked.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(aligned_bam) + + output: + tuple val(sampleID), file("*.marked.bam"), emit: marked_bam + + script: + // parses the bam file and marks as unmapped a read with alignment length below a user-defined threshold. Reads are not filtered from the bam file but kept as unmapped. 
+ """ + ${projectDir}/bin/pta/filter_bam -I ${aligned_bam} -A1 30 -A2 30 -o ${sampleID}.marked.bam | samtools view -b -o ${sampleID}.marked.bam + """ +} + +/* +-A1, --ALN_LEN_PRIM ALN_LEN_PRIM + Primary (loose) alignment length +-A2, --ALN_LEN_SECOND ALN_LEN_SECOND + Supplementary (strict) alignment length +-o, --OUT_PREFIX OUT_PREFIX + Output file prefix + +NOTE: -o does not actually produce an output file. +NOTE: The BAM file produced here, is corrupt. It requires sorting and cleaning (non mapped reads have non 0 MAPQ) and mate information to be fixed. +*/ \ No newline at end of file diff --git a/modules/nygenome/lancet.nf b/modules/nygenome/lancet.nf new file mode 100644 index 00000000..f59b63c2 --- /dev/null +++ b/modules/nygenome/lancet.nf @@ -0,0 +1,33 @@ +process LANCET { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/lancet:v1.1.0' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"$sampleID" : 'lancet' }", pattern:"*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), path(bed), val(index) + + output: + tuple val(sampleID), path("*_lancet.vcf"), val(meta), val(normal_name), val(tumor_name), val('lancet'), emit: vcf + + script: + """ + lancet \ + --tumor ${tumor_bam} \ + --normal ${normal_bam} \ + --ref ${params.ref_fa} \ + --bed ${bed} \ + --min-k 11 \ + --low-cov 1 \ + --min-phred-fisher 5 \ + --min-strand-bias 1 \ + --min-alt-count-tumor 3 \ + --min-vaf-tumor 0.04 \ + --num-threads ${task.cpus} > ${sampleID}_${index}_lancet.vcf + """ +} \ No newline at end of file diff --git a/modules/nygenome/lancet_confirm.nf b/modules/nygenome/lancet_confirm.nf new file mode 100644 index 00000000..e84f7378 --- /dev/null +++ b/modules/nygenome/lancet_confirm.nf @@ -0,0 +1,36 @@ +process LANCET_CONFIRM { + tag "$sampleID" + + cpus = 8 + memory = 15.GB + time = '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/lancet:v1.1.0' + // publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'lancet' }", pattern:".vcf", mode:'copy' + + input: + tuple val(sampleID), path(bed), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), val(chrom) + + output: + tuple val(sampleID), path("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(chrom), emit: vcf + + script: + """ + lancet \ + --tumor ${tumor_bam} \ + --normal ${normal_bam} \ + --bed ${bed} \ + --ref ${params.ref_fa} \ + --min-k 11 \ + --low-cov 1 \ + --min-phred-fisher 5 \ + --min-strand-bias 1 \ + --min-alt-count-tumor 3 \ + --min-vaf-tumor 0.04 \ + --padding 250 \ + --window-size 2000 \ + --num-threads ${task.cpus} \ + > ${sampleID}_lancet_merged_${chrom}.vcf + """ +} diff --git a/modules/phantompeakqualtools/phantompeakqualtools.nf b/modules/phantompeakqualtools/phantompeakqualtools.nf new file mode 100644 index 00000000..92fb4277 --- /dev/null +++ b/modules/phantompeakqualtools/phantompeakqualtools.nf @@ -0,0 +1,22 @@ +process PHANTOMPEAKQUALTOOLS { + tag "$sampleID" + cpus 8 + memory 10.GB + time '04:00:00' + + container 'quay.io/biocontainers/phantompeakqualtools:1.2.2--0' + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.out") , emit: spp + tuple val(sampleID), file("*.pdf") , emit: pdf + tuple val(sampleID), file("*.Rdata"), emit: rdata + + script: + """ + RUN_SPP=`which run_spp.R` + Rscript -e "library(caTools); source(\\"\$RUN_SPP\\")" -c="$bam" -savp="${sampleID}.spp.pdf" -savd="${sampleID}.spp.Rdata" -out="${sampleID}.spp.out" -p=$task.cpus + """ +} diff --git a/modules/picard/picard_addorreplacereadgroups.nf b/modules/picard/picard_addorreplacereadgroups.nf index 20429376..e7c2cfa8 100644 --- a/modules/picard/picard_addorreplacereadgroups.nf +++ b/modules/picard/picard_addorreplacereadgroups.nf @@ -4,6 +4,7 @@ process PICARD_ADDORREPLACEREADGROUPS { cpus 1 memory 8.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'picard' }", pattern: "*.bam", mode:'copy', enabled: params.keep_intermediate @@ -16,7 +17,6 @@ process PICARD_ADDORREPLACEREADGROUPS { tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- Picard Add or Replace Read Groups Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/picard/picard_cleansam.nf b/modules/picard/picard_cleansam.nf new file mode 100644 index 00000000..0ebd610a --- /dev/null +++ b/modules/picard/picard_cleansam.nf @@ -0,0 +1,30 @@ +process PICARD_CLEANSAM { + tag "$sampleID" + + cpus = 1 + memory { bam.size() < 60.GB ? 8.GB : 24.GB } + time { bam.size() < 60.GB ? '06:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'picard' }", pattern: "*_cleaned.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*_cleaned.bam"), emit: cleaned_bam + + script: + + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + picard -Xmx${my_mem}G CleanSam \ + I=${bam} \ + TMP_DIR=${workDir}/temp \ + O=${sampleID}_cleaned.bam + """ +} diff --git a/modules/picard/picard_collectalignmentsummarymetrics.nf b/modules/picard/picard_collectalignmentsummarymetrics.nf index 086474c0..f9be1a83 100644 --- a/modules/picard/picard_collectalignmentsummarymetrics.nf +++ b/modules/picard/picard_collectalignmentsummarymetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTALIGNMENTSUMMARYMETRICS{ cpus = 1 memory = 5.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' @@ -16,7 +17,6 @@ process PICARD_COLLECTALIGNMENTSUMMARYMETRICS{ tuple val(sampleID), file("*.txt"), emit: txt script: - log.info "----- Collect Alignment Sumary Metrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -28,4 +28,4 @@ process PICARD_COLLECTALIGNMENTSUMMARYMETRICS{ --METRIC_ACCUMULATION_LEVEL ALL_READS \ --VALIDATION_STRINGENCY LENIENT """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_collecthsmetrics.nf b/modules/picard/picard_collecthsmetrics.nf index 7582efdf..f40e8460 100644 --- a/modules/picard/picard_collecthsmetrics.nf +++ b/modules/picard/picard_collecthsmetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTHSMETRICS { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -16,7 +17,6 @@ process PICARD_COLLECTHSMETRICS { tuple val(sampleID), file("*Metrics.txt"), emit: hsmetrics script: - log.info "----- Picard CollectHsMetrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -29,4 +29,4 @@ process PICARD_COLLECTHSMETRICS { REFERENCE_SEQUENCE=${params.ref_fa} \ VALIDATION_STRINGENCY=SILENT """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_collectmultiplemetrics.nf b/modules/picard/picard_collectmultiplemetrics.nf new file mode 100644 index 00000000..692ad39f --- /dev/null +++ b/modules/picard/picard_collectmultiplemetrics.nf @@ -0,0 +1,37 @@ +process PICARD_COLLECTMULTIPLEMETRICS { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/stats' : 'picard'}" + }, pattern: "*.CollectMultipleMetrics.*", mode: 'copy' + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*_metrics"), emit : metrics + tuple val(sampleID), file("*.pdf"), emit : pdf + + + script: + prefix = "${sampleID}.mLb.clN" + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] // drop the last three characters (the unit suffix, e.g. ' GB') so only the number reaches -Xmx + // Every option below, including TMP_DIR, must stay on the single picard command line (each line ends with a continuation). + """ + picard -Xmx${my_mem}G CollectMultipleMetrics \ + INPUT=${bam[0]} \ + OUTPUT=${prefix}.CollectMultipleMetrics \ + REFERENCE_SEQUENCE=${params.ref_fa} \ + VALIDATION_STRINGENCY=LENIENT \ + TMP_DIR=${params.tmpdir} + """ +} diff --git a/modules/picard/picard_collectrnaseqmetrics.nf b/modules/picard/picard_collectrnaseqmetrics.nf index bd125891..5c71b5ee 100644 --- a/modules/picard/picard_collectrnaseqmetrics.nf +++ b/modules/picard/picard_collectrnaseqmetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTRNASEQMETRICS { cpus 1 memory 8.GB time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -11,23 +12,25 @@ process PICARD_COLLECTRNASEQMETRICS { publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'picard' }", pattern: "*.pdf", mode:'copy' input: - tuple val(sampleID), file(bam) + tuple val(sampleID), file(bam), val(strand_setting) + val(ref_flat) + val(ribo_intervals) + output: tuple val(sampleID), file("*metrics.txt"), emit: picard_metrics script: - log.info "----- Collect RNA Sequence Metrics on: ${sampleID} -----" - if (params.read_prep == "reverse_stranded") { + if (strand_setting == "reverse_stranded") { strand_setting = "SECOND_READ_TRANSCRIPTION_STRAND" } - if (params.read_prep == "forward_stranded") { + if (strand_setting == "forward_stranded") { strand_setting = "FIRST_READ_TRANSCRIPTION_STRAND" } - if (params.read_prep == "non_stranded") { + if (strand_setting == "non_stranded") { strand_setting = "NONE" } @@ -35,9 +38,9 @@ process PICARD_COLLECTRNASEQMETRICS { picard CollectRnaSeqMetrics \ I=${bam} \ O=${sampleID}_picard_aln_metrics.txt \ - REF_FLAT=${params.ref_flat} \ - RIBOSOMAL_INTERVALS=${params.ribo_intervals} \ + REF_FLAT=${ref_flat} \ + RIBOSOMAL_INTERVALS=${ribo_intervals} \ STRAND=${strand_setting} \ CHART_OUTPUT=${sampleID}_coverage_vs_transcript_plot.pdf """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_collecttargetpcrmetrics.nf b/modules/picard/picard_collecttargetpcrmetrics.nf new file mode 100644 index 00000000..7a150eef --- /dev/null +++ b/modules/picard/picard_collecttargetpcrmetrics.nf @@ -0,0 +1,35 @@ +process PICARD_COLLECTTARGETPCRMETRICS { + tag "$sampleID" + + cpus = 1 + memory = 5.GB + time = '08:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'picard' }", pattern: "*.txt", mode:'copy' + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.txt"), emit: txt + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" CollectTargetedPcrMetrics \ + --INPUT ${bam} \ + --OUTPUT ${sampleID}_CollectTargetedPcrMetrics.txt \ + --REFERENCE_SEQUENCE ${params.ref_fa} \ + --AMPLICON_INTERVALS ${params.amplicon_primer_intervals} \ + --TARGET_INTERVALS ${params.amplicon_target_intervals} \ + --COVERAGE_CAP 1500 \ + --NEAR_DISTANCE 50 \ + --VALIDATION_STRINGENCY LENIENT + """ + +} diff --git a/modules/picard/picard_collectwgsmetrics.nf b/modules/picard/picard_collectwgsmetrics.nf index cbd1dc75..b380a5a1 100644 --- a/modules/picard/picard_collectwgsmetrics.nf +++ b/modules/picard/picard_collectwgsmetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTWGSMETRICS { cpus = 1 memory = 5.GB time = '08:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:' @@ -16,7 +17,6 @@ process PICARD_COLLECTWGSMETRICS { tuple val(sampleID), file("*.txt"), emit: txt script: - log.info "----- Collect Alignment Sumary Metrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -27,5 +27,4 @@ process PICARD_COLLECTWGSMETRICS { --REFERENCE_SEQUENCE ${params.ref_fa} \ --VALIDATION_STRINGENCY LENIENT """ - -} \ No newline at end of file +} diff --git a/modules/picard/picard_fix_mate_information.nf b/modules/picard/picard_fix_mate_information.nf new file mode 100644 index 00000000..d3443c47 --- /dev/null +++ b/modules/picard/picard_fix_mate_information.nf @@ -0,0 +1,31 @@ +process PICARD_FIX_MATE_INFORMATION { + tag "$sampleID" + + cpus = 1 + memory { bam.size() < 30.GB ? 6.GB : 48.GB } + time { bam.size() < 30.GB ? '03:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'picard' }", pattern: "*fixed_mate.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*_fixed_mate.bam"), emit: fixed_mate_bam + + script: + + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + picard -Xmx${my_mem}G FixMateInformation \ + I=${bam} \ + O=${sampleID}_fixed_mate.bam \ + TMP_DIR=${workDir}/temp \ + ADD_MATE_CIGAR=true + """ +} diff --git a/modules/picard/picard_markduplicates.nf b/modules/picard/picard_markduplicates.nf index 23cf338b..df5ee7fa 100644 --- a/modules/picard/picard_markduplicates.nf +++ b/modules/picard/picard_markduplicates.nf @@ -2,14 +2,23 @@ process PICARD_MARKDUPLICATES { tag "$sampleID" cpus 1 - memory 16.GB - time '12:00:00' + memory { bam.size() < 60.GB ? 16.GB : 32.GB } + time { bam.size() < 60.GB ? '12:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' - // save if mouse and wes or save if keep intermediate - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'picard' }", pattern: "*.bam", mode:'copy', enabled: params.gen_org=='mouse' ? true : params.keep_intermediate - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'picard' }", pattern: "*.txt", mode:'copy' + // save if mouse or save if keep intermediate + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/bam' : 'picard'}" + }, pattern: "*.{bam,bai}", mode: 'copy', enabled: params.gen_org=='mouse' || params.workflow=='chipseq' ? true : params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/stats' : 'picard'}" + }, pattern: "*.txt", mode: 'copy' + input: tuple val(sampleID), file(bam) @@ -20,30 +29,17 @@ process PICARD_MARKDUPLICATES { tuple val(sampleID), file("*.txt"), emit: dedup_metrics script: - log.info "----- Picard SortSam Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] - if (params.workflow != "atac") """ picard -Xmx${my_mem}G MarkDuplicates \ - I=${bam} \ + I=${bam[0]} \ O=${sampleID}_dedup.bam \ M=${sampleID}_dup_metrics.txt \ - REMOVE_DUPLICATES=true \ - CREATE_INDEX=true \ - VALIDATION_STRINGENCY=SILENT - """ - else - """ - picard -Xmx${my_mem}G MarkDuplicates \ - I=${bam[0]} \ - O=${sampleID}.sorted.marked4_dedup.bam \ - M=${sampleID}.sorted.metrics.txt \ REMOVE_DUPLICATES=false \ CREATE_INDEX=true \ - VALIDATION_STRINGENCY=LENIENT \ - TMP_DIR=${params.tmpdir} \ - > ${sampleID}.picard.log 2>&1 + TMP_DIR=${workDir}/temp \ + VALIDATION_STRINGENCY=SILENT """ } diff --git a/modules/picard/picard_mergesamfiles.nf b/modules/picard/picard_mergesamfiles.nf new file mode 100644 index 00000000..785e5b3b --- /dev/null +++ b/modules/picard/picard_mergesamfiles.nf @@ -0,0 +1,43 @@ +process PICARD_MERGESAMFILES { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/bam' : 'picard'}" + }, pattern: "*.bam", mode: 'copy', enabled: params.keep_intermediate + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + prefix = "${sampleID}.mLb.mkD" + bam_files = bam.findAll { it.toString().endsWith('.bam') }.sort() + if (bam_files.size() > 1) { + """ + picard -Xmx${my_mem}G MergeSamFiles \ + ${'INPUT='+bam_files.join(' INPUT=')} \ + OUTPUT=${sampleID}.sorted.bam \ + SORT_ORDER=coordinate \ + VALIDATION_STRINGENCY=LENIENT \ + TMP_DIR=tmp + """ + }else { + """ + ln -s ${bam_files[0]} ${prefix}.bam + """ + } + +} diff --git a/modules/picard/picard_reordersam.nf b/modules/picard/picard_reordersam.nf index 0d1873b2..38334949 100644 --- a/modules/picard/picard_reordersam.nf +++ b/modules/picard/picard_reordersam.nf @@ -4,6 +4,7 @@ process PICARD_REORDERSAM { cpus 1 memory 8.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -11,13 +12,13 @@ process PICARD_REORDERSAM { input: tuple val(sampleID), file(bam) + val(picard_dict) output: tuple val(sampleID), file("*.bam"), emit: bam tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- Picard Alignment Metrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -25,7 +26,7 @@ process PICARD_REORDERSAM { picard -Xmx${my_mem}G ReorderSam \ INPUT=${bam} \ OUTPUT=${sampleID}_genome_bam_with_read_group_reorder.bam \ - SEQUENCE_DICTIONARY=${params.picard_dict} \ + SEQUENCE_DICTIONARY=${picard_dict} \ CREATE_INDEX=true """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_sortsam.nf b/modules/picard/picard_sortsam.nf index bc2ad583..07b9a894 100644 --- a/modules/picard/picard_sortsam.nf +++ b/modules/picard/picard_sortsam.nf @@ -2,8 +2,9 @@ process PICARD_SORTSAM { tag "$sampleID" cpus 1 - memory 8.GB - time '06:00:00' + memory { sam.size() < 60.GB ? 6.GB : 24.GB } + time { sam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -17,16 +18,16 @@ process PICARD_SORTSAM { tuple val(sampleID), file("*_sortsam.bai"), emit: bai script: - log.info "----- Picard SortSam Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ - picard -Xmx${my_mem}G SortSam \ + picard -Xmx${my_mem}G -Djava.io.tmpdir=`pwd`/tmp SortSam \ SO=coordinate \ INPUT=${sam} \ OUTPUT=${sampleID}_sortsam.bam \ + TMP_DIR=`pwd`/tmp \ VALIDATION_STRINGENCY=SILENT \ CREATE_INDEX=true """ -} \ No newline at end of file +} diff --git a/modules/pizzly/pizzly.nf b/modules/pizzly/pizzly.nf new file mode 100644 index 00000000..9cd40b53 --- /dev/null +++ b/modules/pizzly/pizzly.nf @@ -0,0 +1,38 @@ +process PIZZLY { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/pizzly:0.37.3--h470a237_3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'pizzly' }", pattern: "*_pizzly_fusions.txt", mode:'copy' + + input: + tuple val(sampleID), path(kallisto_fusions), path(kallisto_insert_size) + path(gtf) + + output: + tuple val(sampleID), path("*_pizzly_fusions.txt"), emit: pizzly_fusions + + script: + """ + + insert_size="\$(cat ${kallisto_insert_size})" + + pizzly \ + -k 31 \ + --align-score 2 \ + --insert-size "\${insert_size}" \ + --cache index.cache.txt \ + --gtf ${gtf} \ + --fasta ${params.transcript_fasta} \ + --output ${sampleID}.pizzly ${kallisto_fusions} + + pizzly_flatten_json.py ${sampleID}.pizzly.json ${sampleID}_pizzly_fusions.txt + + """ +} diff --git a/modules/preseq/preseq.nf b/modules/preseq/preseq.nf new file mode 100644 index 00000000..950f5b84 --- /dev/null +++ b/modules/preseq/preseq.nf @@ -0,0 +1,35 @@ +process PRESEQ { + tag "$sampleID" + + cpus 4 + memory 20.GB + time '20:00:00' + errorStrategy 'ignore' + + + container 'quay.io/biocontainers/preseq:3.1.2--h445547b_2' + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), path("*.ccurve.txt"), emit: txt + tuple val(sampleID), path("*.log") , emit: log + + when: + !params.skip_preseq + + script: + pe = params.read_type == 'SE' ? '' : '-pe' + """ + preseq lc_extrap \\ + -output ${sampleID}.ccurve.txt \\ + -verbose \\ + -bam \\ + $pe \\ + -seed 1 \\ + $bam + cp .command.err ${sampleID}.command.log + """ +} diff --git a/modules/primerclip/primerclip.nf b/modules/primerclip/primerclip.nf new file mode 100644 index 00000000..c21f2f50 --- /dev/null +++ b/modules/primerclip/primerclip.nf @@ -0,0 +1,26 @@ +process PRIMERCLIP { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/primerclip:0.3.8--h9ee0642_1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'primerclip' }", pattern:"*.sam", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/stats' : 'primerclip' }", pattern:"*primerclip_runstats.log", mode:'copy' + + input: + tuple val(sampleID), file(sam) + + output: + tuple val(sampleID), file("*.sam"), emit: sam + tuple val(sampleID), file("*primerclip_runstats.log"), emit: log + + script: + + """ + primerclip ${params.masterfile} ${sam} ${sam.baseName}_primerclip.sam + """ +} diff --git a/modules/python/python_add_final_allele_counts.nf b/modules/python/python_add_final_allele_counts.nf new file mode 100644 index 00000000..265f6799 --- /dev/null +++ b/modules/python/python_add_final_allele_counts.nf @@ -0,0 +1,24 @@ +process ADD_FINAL_ALLELE_COUNTS { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_final_allele_counts_to_vcf.py \ + -v ${vcf} \ + -o ${sampleID}_final_${chrom}.vcf \ + """ +} diff --git a/modules/python/python_add_nygc_allele_counts.nf b/modules/python/python_add_nygc_allele_counts.nf new file mode 100644 index 00000000..5786ef0e --- /dev/null +++ b/modules/python/python_add_nygc_allele_counts.nf @@ -0,0 +1,28 @@ +process ADD_NYGC_ALLELE_COUNTS { + tag "$sampleID" + + cpus 1 + memory 120.GB + time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), path(normal_bam), path(normal_bai), path(tumor_bam), path(tumor_bai), val(chrom) + + output: + tuple val(sampleID), path("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_nygc_allele_counts_to_vcf.py \ + -t ${tumor_bam} \ + -n ${normal_bam} \ + -v ${vcf} \ + -b 10 \ + -m 10 \ + -o ${sampleID}_pre_count_${chrom}.vcf + """ +} diff --git a/modules/python/python_check_strandedness.nf b/modules/python/python_check_strandedness.nf new file mode 100644 index 00000000..7215577a --- /dev/null +++ b/modules/python/python_check_strandedness.nf @@ -0,0 +1,41 @@ + +process CHECK_STRANDEDNESS { + tag "$sampleID" + + cpus 1 + memory 10.GB + time '1:00:00' + errorStrategy 'finish' + + container 'quay.io/jaxcompsci/how-are-we-stranded-here:v1.0.1-e6ce74d' + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), env(STRAND), emit: strand_setting + + script: + paired = params.read_type == 'PE' ? "-r2 ${reads[1]}" : '' + + """ + check_strandedness -g ${params.strandedness_gtf} -k ${params.strandedness_ref} -r1 ${reads[0]} ${paired} > ${sampleID}_strandedness.txt 2>&1 + + data_type=`grep "Data is likely" ${sampleID}_strandedness.txt` + + if [[ \$data_type == *RF* ]] ; then + STRAND='reverse_stranded' + elif [[ \$data_type == *FR* ]] ; then + STRAND='forward_stranded' + elif [[ \$data_type == *unstranded* ]] ; then + STRAND='non_stranded' + else + echo "RNA Seq data does not fall into a likely stranded (max percent explained > 0.9) or unstranded layout (max percent explained < 0.6). 
Please check your data for low quality and contaminating reads before proceeding."; exit 1; + fi + + """ +} + +// Data is likely RF/fr-firststrand +// Data is likely FR/fr-secondstrand +// Data is likely unstranded diff --git a/modules/python/python_filter_pon.nf b/modules/python/python_filter_pon.nf new file mode 100644 index 00000000..0d027af6 --- /dev/null +++ b/modules/python/python_filter_pon.nf @@ -0,0 +1,26 @@ +process FILTER_PON { + tag "$sampleID" + + cpus 1 + memory 15.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/filter_pon.py \ + --bed ${params.pon_bed} \ + --chrom ${chrom} \ + --vcf ${vcf} \ + --out ${sampleID}_pon_final_${chrom}.vcf + """ +} diff --git a/modules/python/python_filter_vcf.nf b/modules/python/python_filter_vcf.nf new file mode 100644 index 00000000..d9955997 --- /dev/null +++ b/modules/python/python_filter_vcf.nf @@ -0,0 +1,28 @@ + +process FILTER_VCF { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/vcf_filter.py \ + ${params.germline_filtering_vcf} \ + ${vcf} \ + ${sampleID}_final_filtered_${chrom}.vcf + """ +} +// NOTE: There are two similarly named scripts: vcf_filter.py and filter_vcf.py. +// The above script is used here. filter_vcf.py is used in gridss_chrom_filter.nf diff --git a/modules/python/python_germline_vcf_finalization.nf b/modules/python/python_germline_vcf_finalization.nf new file mode 100644 index 00000000..cbd9940d --- /dev/null +++ b/modules/python/python_germline_vcf_finalization.nf @@ -0,0 +1,36 @@ +process GERMLINE_VCF_FINALIZATION { + tag "$sampleID" + + cpus 1 + memory 5.GB + time 1.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vcf' }", pattern: "*final.vcf", mode:'copy' + + input: + tuple val(sampleID), file(vcf) + val(filtered) + + output: + tuple val(sampleID), file("*final.vcf"), emit: vcf + + script: + + output_suffix = filtered == 'filtered' ? 
'filtered' : 'unfiltered' + + """ + python \ + ${projectDir}/bin/pta/annotate_id.py \ + ${vcf} \ + ${sampleID}_germline_vep_cosmic_cancerResitMut_annotated_id.vcf + + python \ + ${projectDir}/bin/pta/rename_csq_vcf.py \ + ${sampleID}_germline_vep_cosmic_cancerResitMut_annotated_id.vcf \ + ${sampleID}_germline_snv_indel_annotated_${output_suffix}_final.vcf + + """ +} diff --git a/modules/python/python_get_candidates.nf b/modules/python/python_get_candidates.nf new file mode 100644 index 00000000..8f113392 --- /dev/null +++ b/modules/python/python_get_candidates.nf @@ -0,0 +1,24 @@ +process GET_CANDIDATES { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/get_candidates.py \ + ${vcf} \ + ${sampleID}_candidate_merged_${chrom}.vcf + """ +} diff --git a/modules/python/python_log_parser.nf b/modules/python/python_log_parser.nf index 637b9fde..f8564da1 100644 --- a/modules/python/python_log_parser.nf +++ b/modules/python/python_log_parser.nf @@ -4,6 +4,7 @@ process LOG_PARSER { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'logparser' }", pattern: "*.summary_QC_metrics.txt", mode: 'copy' @@ -16,7 +17,6 @@ process LOG_PARSER { tuple val(sampleID), file("*.summary_QC_metrics.txt") script: - log.info "----- LogParser on ${sampleID} -----" """ python ${projectDir}/bin/atac/LogParser.py > ${sampleID}.summary_QC_metrics.txt """ diff --git a/modules/python/python_merge_columns.nf b/modules/python/python_merge_columns.nf new file mode 100644 index 00000000..ca955b6d --- /dev/null +++ b/modules/python/python_merge_columns.nf @@ -0,0 +1,34 @@ +process MERGE_COLUMNS { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), file(tbi), val(meta), val('empty_name'), val('empty_name'), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: mergeColumn_vcf + + script: + + normal = meta.normal_id + tumor = meta.tumor_id + + """ + python \ + ${projectDir}/bin/pta/merge_columns.py \ + ${vcf} \ + ${sampleID}_single_column_${chrom}.vcf \ + ${normal} \ + ${tumor} + """ +} + /* + NOTE: This script will take 'tumor' and 'normal' names and match string based on a simple split on '_'. + Sample names are currently _ or _. This script merged based on the index[1] of split('_'). 
+ */ diff --git a/modules/python/python_merge_prep.nf b/modules/python/python_merge_prep.nf new file mode 100644 index 00000000..6e437422 --- /dev/null +++ b/modules/python/python_merge_prep.nf @@ -0,0 +1,43 @@ +process MERGE_PREP { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), path(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), path("*_mergePrep.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: merge_prep_vcf + + script: + String support_call = tool == 'manta' || tool == 'lancet_support' ? '--support' : '' + String tool_name = tool == 'lancet_support' ? 'lancet' : tool + + """ + python \ + ${projectDir}/bin/pta/reorder_vcf.py \ + ${vcf} \ + ${vcf.baseName}_ordered.vcf \ + ${normal_name} ${tumor_name} + + python \ + ${projectDir}/bin/pta/merge_prep.py \ + --vcf ${vcf.baseName}_ordered.vcf \ + --out ${vcf.baseName}_mergePrep.vcf \ + --tool ${tool_name} \ + ${support_call} + """ +} + +/* NOTE: PLEASE READ!!! + `reorder_vcf.py` requires the header and input 'tumor/normal' names in the 3rd and 4th args to match. + If you pass names NOT present in the header, it will simply emit the file AS IS. + The script DOES NOT inform the user whether a change has been made in the sample order. + NOTE ALSO: if the header already contains the strings 'TUMOR' and 'NORMAL', + 'TUMOR' and 'NORMAL' are RENAMED to the strings provided in the 3rd and 4th args. 
+*/ diff --git a/modules/python/python_remove_contig.nf b/modules/python/python_remove_contig.nf new file mode 100644 index 00000000..52ff21eb --- /dev/null +++ b/modules/python/python_remove_contig.nf @@ -0,0 +1,24 @@ +process REMOVE_CONTIG { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: remove_contig_vcf + + script: + """ + python \ + ${projectDir}/bin/pta/remove_contig.py \ + ${vcf} \ + ${vcf.baseName}_removeContig.vcf + """ +} diff --git a/modules/python/python_rename_metadata.nf b/modules/python/python_rename_metadata.nf new file mode 100644 index 00000000..44724e03 --- /dev/null +++ b/modules/python/python_rename_metadata.nf @@ -0,0 +1,27 @@ +process RENAME_METADATA { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), path(idx), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*headerAdjust.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: rename_metadata_vcf + + script: + output_name = vcf.getBaseName().replace('.vcf', '') + """ + gunzip -c ${vcf} > temp.vcf + python \ + ${projectDir}/bin/pta/rename_metadata.py \ + temp.vcf \ + ${output_name}_headerAdjust.vcf \ + ${tool} + """ +} diff --git a/modules/python/python_rename_vcf.nf b/modules/python/python_rename_vcf.nf new file mode 100644 index 00000000..75f8f8cb --- /dev/null +++ b/modules/python/python_rename_vcf.nf @@ -0,0 +1,33 @@ +process RENAME_VCF { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*_sampleNamed.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: rename_vcf + + script: + + normal = meta.normal_id + tumor = meta.tumor_id + + tool_name = tool == 'lancet_support' ? 
'lancet' : tool + + """ + python \ + ${projectDir}/bin/pta/rename_vcf.py \ + ${vcf} \ + ${vcf.baseName}_sampleNamed.vcf \ + ${normal} \ + ${tumor} \ + ${tool} + """ +} diff --git a/modules/python/python_reorder_vcf_columns.nf b/modules/python/python_reorder_vcf_columns.nf new file mode 100644 index 00000000..f4fd8f59 --- /dev/null +++ b/modules/python/python_reorder_vcf_columns.nf @@ -0,0 +1,29 @@ +process REORDER_VCF_COLUMNS { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), path(vcf), path(idx), val(meta) + + output: + tuple val(sampleID), path("*_mnv_final_filtered_merged_reordered.vcf"), val(meta), emit: vcf + + script: + + normal = meta.normal_id + tumor = meta.tumor_id + + """ + python \ + ${projectDir}/bin/pta/reorder_vcf.py \ + ${vcf} \ + ${vcf.baseName}_mnv_final_filtered_merged_reordered.vcf \ + ${normal} ${tumor} + """ +} diff --git a/modules/python/python_snv_to_mnv_final_filter.nf b/modules/python/python_snv_to_mnv_final_filter.nf new file mode 100644 index 00000000..00b9fbff --- /dev/null +++ b/modules/python/python_snv_to_mnv_final_filter.nf @@ -0,0 +1,24 @@ +process SNV_TO_MNV_FINAL_FILTER { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/SNVsToMNVs_CountsBasedFilter_AnnotateHighConf.py \ + -i ${vcf} \ + -o ${sampleID}_mnv_final_filtered_${chrom}.vcf + """ +} diff --git a/modules/python/python_somatic_vcf_finalization.nf b/modules/python/python_somatic_vcf_finalization.nf new file mode 100644 index 00000000..aa6b3b6a --- /dev/null +++ b/modules/python/python_somatic_vcf_finalization.nf @@ -0,0 +1,62 @@ +process SOMATIC_VCF_FINALIZATION { + tag "$sampleID" + + cpus 1 + memory 50.GB + time 1.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vcf' }", pattern: "*final.*", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vcf' }", pattern: "*supplemental.vcf", mode:'copy' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name) + val(filtered) + + output: + tuple val(sampleID), file("*final.vcf"), emit: vcf + tuple val(sampleID), file("*final.txt"), emit: txt + tuple val(sampleID), file("*final.maf"), emit: maf + tuple val(sampleID), file("*supplemental.vcf"), emit: supp_vcf + + script: + + output_suffix = filtered == 'filtered' ? 
'filtered' : 'unfiltered' + + """ + python \ + ${projectDir}/bin/pta/annotate_id.py \ + ${vcf} \ + ${sampleID}_somatic_vep_cosmic_cancerResitMut_annotated_id.vcf + + python \ + ${projectDir}/bin/pta/rename_csq_vcf.py \ + ${sampleID}_somatic_vep_cosmic_cancerResitMut_annotated_id.vcf \ + ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_supplemental.vcf + + python \ + ${projectDir}/bin/pta/make_main_vcf.py \ + ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_supplemental.vcf \ + ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.vcf + + python \ + ${projectDir}/bin/pta/make_txt.py \ + --vcf ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.vcf \ + --txt ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.txt \ + --tumor ${tumor_name} \ + --normal ${normal_name} + + python \ + ${projectDir}/bin/pta/make_maf.py \ + --vcf ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.vcf \ + --maf ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.maf \ + --library WGS \ + --vep-version GRCh38 \ + --tumor ${tumor_name} \ + --normal ${normal_name} \ + --ensembl-entrez ${params.ensembl_entrez} + + """ +} diff --git a/modules/python/python_split_mnv.nf b/modules/python/python_split_mnv.nf new file mode 100644 index 00000000..80286816 --- /dev/null +++ b/modules/python/python_split_mnv.nf @@ -0,0 +1,25 @@ +process SPLIT_MNV { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: split_mnv_vcf + + script: + """ + python \ + ${projectDir}/bin/pta/split_mnv.py \ + ${vcf} \ + ${vcf.baseName}_splitMNV.vcf \ + ${tool} + """ +} diff --git a/modules/python/python_vcf_to_bed.nf b/modules/python/python_vcf_to_bed.nf new file mode 100644 index 00000000..0c284320 --- /dev/null +++ b/modules/python/python_vcf_to_bed.nf @@ -0,0 +1,26 @@ +process VCF_TO_BED { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.bed"), val(meta), val(chrom), emit: bed + + script: + """ + python \ + ${projectDir}/bin/pta/vcf_to_bed.py \ + ${vcf} \ + | bedtools \ + merge \ + > ${sampleID}_candidate_merged_${chrom}.bed + """ +} diff --git a/modules/r/annotate_bicseq2_cnv.nf b/modules/r/annotate_bicseq2_cnv.nf new file mode 100644 index 00000000..6263c491 --- /dev/null +++ b/modules/r/annotate_bicseq2_cnv.nf @@ -0,0 +1,42 @@ +process ANNOTATE_BICSEQ2_CNV { + tag "$sampleID" + + cpus 1 + memory 10.GB + time '08:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'cnv'}", pattern: "*.bed", mode: 'copy' + + input: + //BICSEQ2_SEG.out.bicseq2_sv_calls + tuple val(sampleID), file(bicseq2_calls), val(no_idx), val(meta), val(normal_name), val(tumor_name), val(bicseq2) + val(chrom_list) + + output: + tuple val(sampleID), file("${sampleID}_cnv_annotated_final.bed"), val(normal_name), val(tumor_name), emit: bicseq_annot + tuple val(sampleID), file("${sampleID}_cnv_annotated_supplemental.bed"), val(normal_name), val(tumor_name), emit: bicseq_annot_suppl + + script: + listOfChroms = chrom_list.collect { "$it" }.join(',') + + """ + Rscript ${projectDir}/bin/pta/annotate-cnv.r \ + --cnv=${bicseq2_calls} \ + --caller="bicseq2" \ + --tumor=${tumor_name} \ + --normal=${normal_name} \ + --cytoband=${params.cytoband} \ + --db_names="DGV,1000G,COSMIC" \ + --db_files=${params.dgv},${params.thousandG},${params.cosmicUniqueBed} \ + --cancer_census=${params.cancerCensusBed} \ + --ensembl=${params.ensemblUniqueBed} \ + --allowed_chr=${listOfChroms} \ + --overlap_fraction=0.8 \ + --out_file_main=${sampleID}_cnv_annotated_final.bed \ + --out_file_supplemental=${sampleID}_cnv_annotated_supplemental.bed + + """ +} diff --git a/modules/r/annotate_genes_sv.nf b/modules/r/annotate_genes_sv.nf new file mode 100644 index 00000000..d7563fd2 --- /dev/null +++ b/modules/r/annotate_genes_sv.nf @@ -0,0 +1,38 @@ +process ANNOTATE_GENES_SV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + tuple val(sampleID), file(annot_sv_bedpe), val(normal_name), val(tumor_name) + val(suppl_switch) + + output: + tuple val(sampleID), file("*.manta_gridss_sv_annotated_genes*.bed"), val(normal_name), val(tumor_name), emit: annot_sv_genes_bedpe + + script: + + + if (suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-genes.r \ + --ensembl=${params.ensemblUniqueBed} \ + --cancer_census=${params.cancerCensusBed} \ + --bedpe=${annot_sv_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes.bed + """ + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-genes.r \ + --ensembl=${params.ensemblUniqueBed} \ + --cancer_census=${params.cancerCensusBed} \ + --bedpe=${annot_sv_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes_supplemental.bed \ + --supplemental + """ +} diff --git a/modules/r/annotate_sv.nf b/modules/r/annotate_sv.nf new file mode 100644 index 00000000..21bc27e2 --- /dev/null +++ b/modules/r/annotate_sv.nf @@ -0,0 +1,42 @@ +process ANNOTATE_SV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + // MERGE_SV.out.merged + tuple val(sampleID), file(merged_sv_bed), val(normal_name), val(tumor_name) + val(suppl_switch) + + output: + tuple val(sampleID), file("${sampleID}.manta_gridss_sv_annotated*.bed"), val(normal_name), val(tumor_name), emit: annot_sv_bedpe + + script: + + if (suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-databases.r \ + --db_names=gap,DGV,1000G,PON,COSMIC \ + --db_files=${params.gap},${params.dgvBedpe},${params.thousandGVcf},${params.svPon},${params.cosmicBedPe} \ + --slop=500 \ + --db_ignore_strand=COSMIC \ + --bedpe=${merged_sv_bed} \ + --out_file=${sampleID}.manta_gridss_sv_annotated.bed + + """ + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-databases.r \ + --db_names=gap,DGV,1000G,PON,COSMIC \ + --db_files=${params.gap},${params.dgvBedpe},${params.thousandGVcf},${params.svPon},${params.cosmicBedPe} \ + --slop=500 \ + --db_ignore_strand=COSMIC \ + --bedpe=${merged_sv_bed} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_supplemental.bed + """ +} diff --git a/modules/r/annotate_sv_with_cnv.nf b/modules/r/annotate_sv_with_cnv.nf new file mode 100644 index 00000000..c2fc0a71 --- /dev/null +++ b/modules/r/annotate_sv_with_cnv.nf @@ -0,0 +1,35 @@ +process ANNOTATE_SV_WITH_CNV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + tuple val(sampleID), val(normal_name), val(tumor_name), file(bicseq_annot), file(annot_sv_genes_bedpe) + val(suppl_switch) + + output: + tuple val(sampleID), file("${sampleID}.manta_gridss_sv_annotated_genes_cnv*.bed"), val(normal_name), val(tumor_name), emit: sv_genes_cnv_bedpe + + script: + + if (suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-cnv.r \ + --cnv=${bicseq_annot} \ + --bedpe=${annot_sv_genes_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes_cnv.bed + """ + + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-cnv.r \ + --cnv=${bicseq_annot} \ + --bedpe=${annot_sv_genes_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes_cnv_supplemental.bed + """ +} diff --git a/modules/r/filter_bedpe.nf b/modules/r/filter_bedpe.nf new file mode 100644 index 00000000..5da2a0e6 --- /dev/null +++ b/modules/r/filter_bedpe.nf @@ -0,0 +1,43 @@ +process FILTER_BEDPE { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'bedpe'}", pattern: "*.bedpe", mode: 'copy' + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + // ANNOTATE_SV_WITH_CNV.out.sv_genes_cnv_bedpe + tuple val(sampleID), file(sv_genes_cnv_bedpe), val(normal_name), val(tumor_name) + val(suppl_switch) + output: + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_final.bedpe"), val(normal_name), val(tumor_name), optional: true + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_supplemental.bedpe"), val(normal_name), val(tumor_name), optional: true + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_high_confidence_final.bedpe"), val(normal_name), val(tumor_name), optional: true + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_high_confidence_supplemental.bedpe"), val(normal_name), val(tumor_name), optional: true + + script: + if(suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/filter-bedpe.r \ + --max_changepoint_distance=1000 \ + --filter_databases=DGV,1000G,PON \ + --bedpe=${sv_genes_cnv_bedpe} \ + --out_file_somatic=${sampleID}_sv_annotated_somatic_final.bedpe \ + --out_file_highconf=${sampleID}_sv_annotated_somatic_high_confidence_final.bedpe + """ + + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/filter-bedpe.r \ + --max_changepoint_distance=1000 \ + --filter_databases=DGV,1000G,PON \ + --bedpe=${sv_genes_cnv_bedpe} \ + --out_file_somatic=${sampleID}_sv_annotated_somatic_supplemental.bedpe \ + --out_file_highconf=${sampleID}_sv_annotated_somatic_high_confidence_supplemental.bedpe + """ +} diff --git a/modules/rstudio/rstudio_frag_len_plot.nf b/modules/r/frag_len_plot.nf similarity index 54% rename from modules/rstudio/rstudio_frag_len_plot.nf rename to modules/r/frag_len_plot.nf index 67081f76..7a0ee302 100644 --- a/modules/rstudio/rstudio_frag_len_plot.nf +++ b/modules/r/frag_len_plot.nf @@ -4,6 +4,7 @@ process FRAG_LEN_PLOT { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 
140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'rstudio' }", pattern: "*fraglen_plot.pdf", mode: 'copy' container 'quay.io/jaxcompsci/rstudio:4.2.0' @@ -13,11 +14,11 @@ process FRAG_LEN_PLOT { output: tuple val(sampleID), file("*fraglen_plot.pdf") + tuple val(sampleID), file("*_spline_table.txt"), emit: spline_table script: - log.info "----- Fragment Length Plot on ${sampleID} -----" """ - Rscript ${projectDir}/bin/atac/fragment_length_plot.R ${frag_len_count} + Rscript ${projectDir}/bin/atac/fragment_length_plot.R ${frag_len_count} ${sampleID}_spline_table.txt mv fraglen_plot.pdf ${sampleID}_fraglen_plot.pdf """ } diff --git a/modules/r/merge_sv.nf b/modules/r/merge_sv.nf new file mode 100644 index 00000000..8f6ccf7c --- /dev/null +++ b/modules/r/merge_sv.nf @@ -0,0 +1,36 @@ +process MERGE_SV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + tuple val(sampleID), val(normal_name), val(tumor_name), file(manta_vcf), file(manta_vcf_tbi), val(meta_manta), val(manta), file(gripss_vcf), val(gripss_idx), val(meta_gripss), val(gripss) + val(chrom_list) + + output: + tuple val(sampleID), file("${sampleID}.manta_gridss_sv.bed"), val(normal_name), val(tumor_name), emit: merged + tuple val(sampleID), file("${sampleID}.manta_gridss_sv_supplemental.bed"), val(normal_name), val(tumor_name), emit: merged_suppl + + + script: + listOfChroms = chrom_list.collect { "$it" }.join(',') + + """ + Rscript ${projectDir}/bin/pta/merge-caller-vcfs.r \ + --vcf=${manta_vcf},${gripss_vcf} \ + --caller=manta,gridss \ + --tumor=${tumor_name} \ + --normal=${normal_name} \ + --build=GRCh38 \ + --slop=300 \ + --allowed_chr=${listOfChroms} \ + --min_sv_length=500 \ + --out_file=${sampleID}.manta_gridss_sv.bed \ + --out_file_supplemental=${sampleID}.manta_gridss_sv_supplemental.bed + """ +} diff --git a/modules/rsem/rsem_alignment_expression.nf b/modules/rsem/rsem_alignment_expression.nf index 31179a5e..c98b3514 100644 --- a/modules/rsem/rsem_alignment_expression.nf +++ b/modules/rsem/rsem_alignment_expression.nf @@ -2,43 +2,46 @@ process RSEM_ALIGNMENT_EXPRESSION { tag "$sampleID" cpus 12 - memory { 60.GB * task.attempt } - time { 24.h * task.attempt } - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - container 'quay.io/jaxcompsci/rsem_bowtie2_star:0.1.0' + container 'quay.io/jaxcompsci/rsem_bowtie2_star:0.1.0' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'rsem' }", pattern: "*stats", mode:'copy', enabled: params.rsem_aligner == "bowtie2" publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'rsem' }", pattern: "*results*", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'rsem' }", pattern: "*genome.bam", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'rsem' }", pattern: "*transcript.bam", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'rsem' }", pattern: "*genome.sorted.ba*", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/bam' : 'rsem' }", pattern: "*transcript.sorted.ba*", mode:'copy' input: - tuple val(sampleID), file(reads) - file(rsem_ref_files) + tuple val(sampleID), path(reads), val(strand_setting), val(read_length) + val(rsem_ref_path) + val(rsem_star_prefix) + val(rsem_ref_prefix) output: - file "*stats" - file "*results*" - tuple val(sampleID), file("rsem_aln_*.stats"), emit: rsem_stats - tuple val(sampleID), file("*genes.results"), emit: rsem_genes - tuple val(sampleID), file("*isoforms.results"), emit: rsem_isoforms - tuple val(sampleID), file("*.genome.bam"), emit: bam - tuple val(sampleID), file("*.transcript.bam"), emit: transcript_bam - + path "*stats" + path "*results*" + tuple val(sampleID), path("rsem_aln_*.stats"), emit: rsem_stats + tuple val(sampleID), path("*.stat/*.cnt"), emit: rsem_cnt + tuple val(sampleID), path("*genes.results"), emit: rsem_genes + tuple val(sampleID), path("*isoforms.results"), emit: rsem_isoforms + tuple val(sampleID), path("*.genome.bam"), emit: bam + tuple val(sampleID), path("*.transcript.bam"), emit: transcript_bam + tuple val(sampleID), path("*.genome.sorted.bam"), path("*.genome.sorted.bam.bai"), emit: sorted_genomic_bam + tuple val(sampleID), path("*.transcript.sorted.bam"), path("*.transcript.sorted.bam.bai"), emit: sorted_transcript_bam + script: - log.info "----- Genome Alignment Running on: ${sampleID} -----" - if (params.read_prep == "reverse_stranded") { + if (strand_setting == "reverse_stranded") { prob="--forward-prob 0" } - if (params.read_prep == "forward_stranded") { + if (strand_setting == "forward_stranded") { prob="--forward-prob 1" } - if (params.read_prep == "non_stranded") { + if (strand_setting == "non_stranded") { prob="--forward-prob 0.5" } @@ -53,15 +56,43 @@ process RSEM_ALIGNMENT_EXPRESSION { trimmedfq="${reads[0]}" } if (params.rsem_aligner == "bowtie2"){ - outbam="--output-genome-bam" + + rsem_ref_files = file("${rsem_ref_path}/bowtie2/*").collect { "$it" }.join(' ') + + 
outbam="--output-genome-bam --sort-bam-by-coordinate" seed_length="--seed-length ${params.seed_length}" + sort_command='' + index_command='' } if (params.rsem_aligner == "star") { - outbam="--star-output-genome-bam" + outbam="--star-output-genome-bam --sort-bam-by-coordinate" seed_length="" + samtools_mem = task.memory.giga / task.cpus + sort_command="samtools sort -@ ${task.cpus} -m ${samtools_mem}G -o ${sampleID}.STAR.genome.sorted.bam ${sampleID}.STAR.genome.bam" + index_command="samtools index ${sampleID}.STAR.genome.sorted.bam" + + read_length = read_length.toInteger() + + if( read_length >= 65 && read_length <= 85) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_75/*").collect { "$it" }.join(' ') + } else if( read_length >= 90 && read_length <= 110 ) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_100/*").collect { "$it" }.join(' ') + } else if( read_length >= 115 && read_length <= 135 ) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_125/*").collect { "$it" }.join(' ') + } else if( read_length >= 140 && read_length <= 160 ) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_150/*").collect { "$it" }.join(' ') + } else { + log.info("\nUnsupported read length " + read_length + " in RSEM with STAR. RSEM will now fail gracefully.\n\n") + rsem_ref_files = 'error' + } + } """ + if [ "${rsem_ref_files}" = "error" ]; then exit 1; fi + + ln -s -f ${rsem_ref_files} . 
+ rsem-calculate-expression -p $task.cpus \ ${prob} \ ${stype} \ @@ -71,8 +102,12 @@ process RSEM_ALIGNMENT_EXPRESSION { ${seed_length} \ ${outbam} \ ${trimmedfq} \ - ${params.rsem_ref_prefix} \ + ${rsem_ref_prefix} \ ${sampleID} \ 2> rsem_aln_${sampleID}.stats + + ${sort_command} + + ${index_command} """ -} \ No newline at end of file +} diff --git a/modules/samtools/samtools_calc_mtdna_filter_chrm.nf b/modules/samtools/samtools_calc_mtdna_filter_chrm.nf index f275d8f7..45336f68 100644 --- a/modules/samtools/samtools_calc_mtdna_filter_chrm.nf +++ b/modules/samtools/samtools_calc_mtdna_filter_chrm.nf @@ -4,6 +4,7 @@ process CALC_MTDNA_FILTER_CHRM { cpus 4 memory 4.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'samtools' }", pattern: "*_mtDNA_Content.txt", mode: 'copy' container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' @@ -17,7 +18,6 @@ process CALC_MTDNA_FILTER_CHRM { tuple val(sampleID), file("*_mtDNA_Content.txt"), emit: mtdna_log shell: - log.info "----- Calculate %mtDNA and Filter Mitochondrial Reads on ${sampleID} -----" // Get Mitochondrial and total read counts, calculate %mtDNA and filter Mitochondrial Reads from bam file mt_name = params.gen_org == 'mouse' ? 
'MT' : 'chrM' @@ -37,7 +37,7 @@ process CALC_MTDNA_FILTER_CHRM { fi # Calculate %mtDNA - echo 'mtDNA Content:' $(bc <<< "scale=2;100*$mtReads/$totalReads")'%' >> !{sampleID}_mtDNA_Content.txt + echo -e 'sampleID\\tPerc mtDNA\\n'!{sampleID}'\\t'$(bc <<< "scale=2;100*$mtReads/$totalReads") >> !{sampleID}_mtDNA_Content.txt # Filter Mitochondrial Reads from bam file samtools view -@ !{task.cpus} -h !{rmdup_bam_file} \ diff --git a/modules/samtools/samtools_chain_bad2uniq_reads.nf b/modules/samtools/samtools_chain_bad2uniq_reads.nf index 78d5f4c0..0011e211 100644 --- a/modules/samtools/samtools_chain_bad2uniq_reads.nf +++ b/modules/samtools/samtools_chain_bad2uniq_reads.nf @@ -4,6 +4,7 @@ process CHAIN_BAD2UNIQ_READS { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' @@ -16,7 +17,6 @@ process CHAIN_BAD2UNIQ_READS { when: params.chain != null shell: - log.info "----- Getting 'bad reads' from bam file on ${sampleID} -----" // Get unique 'bad read names' from bam file using gatk ValidateSamFile out results ''' cat !{bad_reads} \ diff --git a/modules/samtools/samtools_chain_sort_fixmate_bam.nf b/modules/samtools/samtools_chain_sort_fixmate_bam.nf index 24f4a6f3..78e1d0d9 100644 --- a/modules/samtools/samtools_chain_sort_fixmate_bam.nf +++ b/modules/samtools/samtools_chain_sort_fixmate_bam.nf @@ -4,20 +4,21 @@ process CHAIN_SORT_FIXMATE_BAM { cpus 8 memory 20.GB time '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern: "*.filtered.shifted.*", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: - tuple val(sampleID), file(bam_mm10) + tuple val(sampleID), file(bam) output: - tuple val(sampleID), file("*.filtered.shifted.*") + tuple val(sampleID), path("*.filtered.shifted.*") when: params.chain != null script: - log.info "----- Performing sort, fixmate, filter the bam on ${sampleID} -----" // This module is for Non-Reference Strain Samples. // To sort bam by read name, fix the mate information, re-sort by coordinates and filter Mitochondrial Reads from bam file. """ @@ -25,7 +26,7 @@ process CHAIN_SORT_FIXMATE_BAM { samtools sort \ -n \ -@ $task.cpus -O bam \ - -o ${sampleID}.tmp3.mm10.bam ${bam_mm10[0]} + -o ${sampleID}.tmp3.mm10.bam ${bam[0]} # fix the mate information. 
This is done to fix 'TLEN' which is required for MACS2 samtools fixmate \ diff --git a/modules/samtools/samtools_faidx.nf b/modules/samtools/samtools_faidx.nf new file mode 100644 index 00000000..639c2c83 --- /dev/null +++ b/modules/samtools/samtools_faidx.nf @@ -0,0 +1,23 @@ +process SAMTOOLS_FAIDX { + tag "${fasta}" + + cpus 1 + memory 8.GB + time '06:00:00' + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/genome_info", mode: 'copy' + + input: + file(fasta) + + output: + file("*.fai") + + script: + + """ + samtools faidx ${fasta} + """ +} diff --git a/modules/samtools/samtools_filter.nf b/modules/samtools/samtools_filter.nf new file mode 100644 index 00000000..77d44d80 --- /dev/null +++ b/modules/samtools/samtools_filter.nf @@ -0,0 +1,31 @@ +process SAMTOOLS_FILTER { + tag "$sampleID" + + cpus 2 + memory 4.GB + time '10:00:00' + + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' + + input: + tuple val(sampleID), file(in_file) + val(option) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + // Exclude reads based on input bit flag. + + prefix = "${sampleID}.Lb" + if (params.workflow == "chipseq"){ + output = "${prefix}.bam" + } + else{ + output = "${sampleID}.bam" + } + """ + samtools view -h -b ${option} ${in_file} > ${output} + """ + +} diff --git a/modules/samtools/samtools_filter_remove_multi_shift.nf b/modules/samtools/samtools_filter_remove_multi_shift.nf index 7ecefe36..2fff05e5 100644 --- a/modules/samtools/samtools_filter_remove_multi_shift.nf +++ b/modules/samtools/samtools_filter_remove_multi_shift.nf @@ -4,8 +4,10 @@ process FILTER_REMOVE_MULTI_SHIFT { cpus 4 memory 10.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern: "*.sorted.rmDup.rmChrM.rmMulti.filtered.ba*", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -16,7 +18,6 @@ process FILTER_REMOVE_MULTI_SHIFT { tuple val(sampleID), file("*.sorted.rmDup.rmChrM.rmMulti.filtered.ba*"), emit: srf_bam script: - log.info "----- Filter Non-Unique and Include Only 'properly mapped reads' Alignments on ${sampleID} -----" // Filter reads unmapped, mate unmapped, not primary alignment, reads failing platform, pcr duplicates (-F 1804) and reatin properly paired reads (-f 2) in bam file """ # filter low quality reads diff --git a/modules/samtools/samtools_filter_unique_reads.nf b/modules/samtools/samtools_filter_unique_reads.nf new file mode 100644 index 00000000..5c5510df --- /dev/null +++ b/modules/samtools/samtools_filter_unique_reads.nf @@ -0,0 +1,39 @@ +process SAMTOOLS_FILTER_UNIQUE { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v2' + + input: + tuple val(sampleID), val(meta), path(bam), path(bai), val(read_ID) + val(chroms) + + output: + tuple val(sampleID), path("seq_out/*.seq"), val(meta), val(read_ID), emit: uniq_seq + + script: + chrom_list = chroms.collect { "$it" }.join(' ') + """ + /samtools-0.1.7a_getUnique-0.1.3/samtools view -U "BWA,${read_ID}_,N,N" ${bam} + + mkdir seq_out + + for chrom in ${chrom_list}; do mv *\$chrom.seq seq_out/; done + + """ +} +// Modified samtools view: +// -U STR If specified, get the uique reads. STR should be or +// e.g. means that the aligner is BWA, the prefix of the output is output, and that the chromosome names and strands will not be reported +// StrandReport should be S (separate positive and negative reads), Y, or N +// minLen and maxLen specifies the length range of the reported reads + +// NOTE: The modified samtools view with -U does not work as normal samtools. It does not pass the region to the filter step. + +// NOTE: The modified samtools view reads the header and uses the chrom names in the header, not matter what is present in the mapped file. + +// NOTE: Therefore, to avoid `chrUn_*_decoy.seq` and `_HLA-*.seq` non-primary chroms, the primary list is moved to an output directory. diff --git a/modules/samtools/samtools_final_calc_frip.nf b/modules/samtools/samtools_final_calc_frip.nf index 8152a183..b7a6347d 100644 --- a/modules/samtools/samtools_final_calc_frip.nf +++ b/modules/samtools/samtools_final_calc_frip.nf @@ -4,8 +4,10 @@ process FINAL_CALC_FRIP { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'samtools' }", pattern: "*_Fraction_reads_in_peak.txt", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -15,13 +17,12 @@ process FINAL_CALC_FRIP { tuple val(sampleID), file("*_Fraction_reads_in_peak.txt") shell: - log.info "----- Final Calculate (FRiP) on ${sampleID} -----" // Calculate fraction of reads in peak ''' total_reads=$(samtools view -c !{processed_bams[0]}) reads_in_peaks=$(samtools view -c !{reads_peaks_bams[0]}) FRiP=$(awk "BEGIN {print "${reads_in_peaks}"/"${total_reads}"}") - echo -e ${FRiP}"\\t"${total_reads} \ + echo -e 'SAMPLEID\\tFRiP\\tFiltered Reads\\n'!{sampleID}"\\t"${FRiP}"\\t"${total_reads} \ > !{sampleID}_Fraction_reads_in_peak.txt ''' } diff --git a/modules/samtools/samtools_index.nf b/modules/samtools/samtools_index.nf index 248be1a8..a864b89d 100644 --- a/modules/samtools/samtools_index.nf +++ b/modules/samtools/samtools_index.nf @@ -4,19 +4,19 @@ process SAMTOOLS_INDEX { cpus 1 memory 8.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/samtools:1.14--hb421002_0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern:"*.ba*", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern:"*.ba*", mode:'copy', enabled: params.workflow == 'rrbs' ? 
true : false input: - tuple val("sampleID"), file(bam) + tuple val(sampleID), file(bam) output: - tuple val("sampleID"), file("*.bai"), emit: bai + tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- Samtools Index Running on: ${sampleID} -----" """ samtools index ${bam} diff --git a/modules/samtools/samtools_mergebam_filter.nf b/modules/samtools/samtools_mergebam_filter.nf new file mode 100644 index 00000000..23c2bb10 --- /dev/null +++ b/modules/samtools/samtools_mergebam_filter.nf @@ -0,0 +1,35 @@ +process SAMTOOLS_MERGEBAM_FILTER { + tag "$sampleID" + + cpus 2 + memory 4.GB + time '10:00:00' + + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' + + input: + tuple val(sampleID), file(in_file) + file(bed) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + // Setup for chipseq pipeline + + prefix = params.read_type == 'SE' ? "${sampleID}.mLb.clN" : "${sampleID}.mLb.flT" + filter_params = params.read_type == 'SE' ? '-F 0x004' : '-F 0x004 -F 0x0008 -f 0x001' + dup_params = params.keep_dups ? '' : '-F 0x0400' + multimap_params = params.keep_multi_map ? '' : '-q 1' + blacklist_params = params.blacklist ? "-L $bed" : '' + + """ + samtools view \\ + $filter_params \\ + $dup_params \\ + $multimap_params \\ + $blacklist_params \\ + -b ${in_file} > ${prefix}.bam + """ + +} diff --git a/modules/samtools/samtools_non_chain_reindex.nf b/modules/samtools/samtools_non_chain_reindex.nf index 058bc9f7..40418f78 100644 --- a/modules/samtools/samtools_non_chain_reindex.nf +++ b/modules/samtools/samtools_non_chain_reindex.nf @@ -4,8 +4,10 @@ process NON_CHAIN_REINDEX { cpus 1 memory 8.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern: "*.filtered.shifted.*", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -17,7 +19,6 @@ process NON_CHAIN_REINDEX { when: params.chain == null script: - log.info "----- Filtering Mitochondrial, Unplaced/Unlocalized Reads and reindex on ${sampleID} -----" // This module is for Reference Strain Samples. // To filter Mitochondrial, Unplaced/Unlocalized Reads from bam file. """ diff --git a/modules/samtools/samtools_quality_checks.nf b/modules/samtools/samtools_quality_checks.nf index 851d60f2..7bf02938 100644 --- a/modules/samtools/samtools_quality_checks.nf +++ b/modules/samtools/samtools_quality_checks.nf @@ -4,8 +4,10 @@ process QUALITY_CHECKS { cpus 2 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'samtools' }", pattern: "*.fragment_length_count.txt", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -15,13 +17,11 @@ process QUALITY_CHECKS { tuple val(sampleID), file("*.fragment_length_count.txt") script: - log.info "----- Quality checks on ${sampleID} -----" - log.info "----- Fragment/Insert size on ${sampleID} -----" // Get the fragment length count from bam file for Quality Checks. 
""" samtools view \ -@ $task.cpus ${sort_rm_filter_bam[0]} \ | awk '\$9>0' | cut -f 9 | sort | uniq -c | sort -b -k2,2n \ - | sed -e 's/^[ \\t]*//' > ${sampleID}.fragment_length_count.txt + | sed -e 's/^[ \\t]*//' | awk -v sample="${sampleID}" -F' ' '{print sample,\$1,\$2}' OFS="\\t" > ${sampleID}.fragment_length_count.txt """ } diff --git a/modules/samtools/samtools_remove_duplicate_reads.nf b/modules/samtools/samtools_remove_duplicate_reads.nf index a54bc3f3..b40267f6 100644 --- a/modules/samtools/samtools_remove_duplicate_reads.nf +++ b/modules/samtools/samtools_remove_duplicate_reads.nf @@ -4,6 +4,7 @@ process REMOVE_DUPLICATE_READS { cpus 2 memory 4.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' @@ -15,7 +16,6 @@ process REMOVE_DUPLICATE_READS { tuple val(sampleID), file("*.sorted.rmDup.bam.bai"), emit: rmDup_bai script: - log.info "----- Samtools Removing PCR Duplicates on: ${sampleID} -----" // Exclude reads flagged as pcr or optical duplicates (0x400), marked with bit flag 1024 in the BAM. """ samtools view -h -b -F 1024 ${marked_bam_file} > ${sampleID}.sorted.rmDup.bam diff --git a/modules/samtools/samtools_sort.nf b/modules/samtools/samtools_sort.nf index 96e38933..6f08983e 100644 --- a/modules/samtools/samtools_sort.nf +++ b/modules/samtools/samtools_sort.nf @@ -1,42 +1,29 @@ -process SORT { +process SAMTOOLS_SORT { tag "$sampleID" cpus 4 memory 20.GB time '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern: "*.bam", mode:'copy', enabled: params.workflow == 'rrbs' ? true : false + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: tuple val(sampleID), file(sam_file) val(options) + val(suffix) output: - tuple val(sampleID), file("*.sorted.bam*") + tuple val(sampleID), file("*.sorted.*"), emit: sorted_file script: - log.info "----- Samtools sort Running on: ${sampleID} -----" - - // check if not sorting by name - if(options != "-n ") - """ - samtools sort \ - ${options} \ - -@ $task.cpus \ - -O bam \ - -o ${sampleID}.sorted.bam \ - ${sam_file[0]} - - samtools index \ - ${sampleID}.sorted.bam - """ - else """ samtools sort \ ${options} \ - -@ $task.cpus \ - -O bam \ - -o ${sampleID}.sorted.bam \ - ${sam_file[0]} + -@ ${task.cpus} \ + -o ${sam_file.baseName}.sorted.${suffix} \ + ${sam_file} """ } diff --git a/modules/samtools/samtools_stats.nf b/modules/samtools/samtools_stats.nf new file mode 100644 index 00000000..42741699 --- /dev/null +++ b/modules/samtools/samtools_stats.nf @@ -0,0 +1,41 @@ +process SAMTOOLS_STATS { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/samtools' : 'samtools'}" + }, pattern: "*.flagstat", mode: 'copy', enabled: params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/samtools' : 'samtools'}" + }, pattern: "*.idxstats", mode: 'copy', enabled: params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/samtools' : 'samtools'}" + }, pattern: "*.stats", mode: 'copy', enabled: params.keep_intermediate + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.flagstat"), emit: flagstat + tuple val(sampleID), file("*.idxstats"), emit: idxstat + tuple val(sampleID), file("*.stats"), emit: stats + + script: + + """ + samtools flagstat ${bam[0]} > ${bam[0]}.flagstat + samtools idxstats ${bam[0]} > ${bam[0]}.idxstats + samtools stats ${bam[0]} > ${bam[0]}.stats + """ +} diff --git a/modules/samtools/samtools_stats_insertsize.nf b/modules/samtools/samtools_stats_insertsize.nf new file mode 100644 index 00000000..5b0b7299 --- /dev/null +++ b/modules/samtools/samtools_stats_insertsize.nf @@ -0,0 +1,26 @@ +process SAMTOOLS_STATS_INSERTSIZE { + tag "$sampleID" + + cpus 8 + memory 1.GB + time '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'samtools' }", pattern: "*insert_size.txt", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(bam), path(bai), val(read_ID) + + output: + tuple val(sampleID), env(read_length), env(insert_size), emit: read_length_insert_size + file("*insert_size.txt") + + script: + """ + samtools stats --insert-size 8000 ${bam} --threads ${task.cpus} | grep ^SN | cut -f 2- > ${sampleID}_insert_size.txt + read_length=`grep "maximum length" ${sampleID}_insert_size.txt | cut -d ':' -f2 | tr -d " \\t\\n\\r"` + insert_size=`grep "insert size average" ${sampleID}_insert_size.txt | cut -d ':' -f2 | tr -d " \\t\\n\\r"` + """ +} diff --git a/modules/samtools/samtools_view.nf b/modules/samtools/samtools_view.nf new file mode 100644 index 00000000..fbd94378 --- /dev/null +++ b/modules/samtools/samtools_view.nf @@ -0,0 +1,26 @@ +process SAMTOOLS_VIEW { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'samtools_view' }", pattern:"*.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(sam) + val(view_string) + val(filename) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + + """ + samtools view ${view_string} ${sam} > ${sampleID}_${filename}.bam + """ +} diff --git a/modules/snpeff_snpsift/snpeff_oneperline.nf b/modules/snpeff_snpsift/snpeff_oneperline.nf index 99cabfca..65920879 100644 --- a/modules/snpeff_snpsift/snpeff_oneperline.nf +++ b/modules/snpeff_snpsift/snpeff_oneperline.nf @@ -4,6 +4,7 @@ process SNPEFF_ONEPERLINE { cpus 1 memory 2.GB time '00:10:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} input: tuple val(sampleID), file(vcf) @@ -25,4 +26,4 @@ process SNPEFF_ONEPERLINE { """ cat ${vcf} | perl ${projectDir}/bin/shared/vcfEffOnePerLine.pl > ${sampleID}_oneperline_${output_suffix} """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpeff_snpeff.nf b/modules/snpeff_snpsift/snpeff_snpeff.nf index 8bfe3c64..52cc872a 100644 --- a/modules/snpeff_snpsift/snpeff_snpeff.nf +++ b/modules/snpeff_snpsift/snpeff_snpeff.nf @@ -4,11 +4,11 @@ process SNPEFF{ cpus = 1 memory = 8.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.*", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.*", mode:'copy', enabled: params.gen_org=='mouse' ? true : params.keep_intermediate input: tuple val(sampleID),file(vcf) @@ -19,11 +19,10 @@ process SNPEFF{ tuple val(sampleID),file("*.vcf"), emit:vcf //tuple val(sampleID),file("*.html") // If adding back in ^ this command should be added to the java block below - // -s ${sampleID}_snpeff.html \ + // -s ${sampleID}_snpeff.html \ // tuple val(sampleID),file("*") script: - log.info "----- snpEff Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -35,7 +34,7 @@ process SNPEFF{ output_suffix = 'SNP_snpeff.vcf' } if (indel_snp == 'BOTH'){ - output_suffix = 'snp_indel_snpeff.vcf' + output_suffix = 'SNP_INDEL_filtered_annotated_final.vcf' } """ @@ -46,4 +45,4 @@ process SNPEFF{ -noStats \ ${vcf} > ${sampleID}_${output_suffix} """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpsift_annotate.nf b/modules/snpeff_snpsift/snpsift_annotate.nf index 2f72e326..020b5a85 100644 --- a/modules/snpeff_snpsift/snpsift_annotate.nf +++ b/modules/snpeff_snpsift/snpsift_annotate.nf @@ -4,9 +4,12 @@ process SNPSIFT_ANNOTATE { cpus = 1 memory = 6.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpsift' }", pattern:"*dbsnpID.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.vcf", mode:'copy', enabled: params.workflow == 'amplicon' ? true : false input: tuple val(sampleID), file(vcf) @@ -26,4 +29,4 @@ process SNPSIFT_ANNOTATE { java -Xmx${my_mem}G -jar /opt/snpEff/SnpSift.jar \ annotate -noDownload -id ${annot_source} ${vcf} > ${vcf.baseName}_${output_suffix}.vcf """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpsift_dbnsfp.nf b/modules/snpeff_snpsift/snpsift_dbnsfp.nf index cabff374..510a61ed 100644 --- a/modules/snpeff_snpsift/snpsift_dbnsfp.nf +++ b/modules/snpeff_snpsift/snpsift_dbnsfp.nf @@ -4,11 +4,11 @@ process SNPSIFT_DBNSFP{ cpus = 1 memory = 6.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'snpeff' }", pattern:"*.vcf", mode:'copy', enabled: params.keep_intermediate input: tuple val(sampleID), file(vcf) @@ -18,7 +18,6 @@ process SNPSIFT_DBNSFP{ tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- snpSift DBNSFP Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -39,4 +38,4 @@ process SNPSIFT_DBNSFP{ -f SIFT_score,SIFT_pred,Polyphen2_HDIV_score,MutationAssessor_score,phyloP100way_vertebrate,1000Gp3_AF,1000Gp3_AFR_AF,1000Gp3_EUR_AF,1000Gp3_AMR_AF,1000Gp3_EAS_AF,ESP6500_AA_AF,ESP6500_EA_AF \ ${vcf} > ${sampleID}_${output_suffix} """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpsift_extractfields.nf b/modules/snpeff_snpsift/snpsift_extractfields.nf index 9ae9b2bb..e53449f1 100644 --- a/modules/snpeff_snpsift/snpsift_extractfields.nf +++ b/modules/snpeff_snpsift/snpsift_extractfields.nf @@ -4,9 +4,9 @@ process SNPSIFT_EXTRACTFIELDS { cpus = 1 memory = 6.GB time = '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'snpeff' }", pattern:"*.txt", mode:'copy' @@ -14,26 +14,29 @@ process SNPSIFT_EXTRACTFIELDS { tuple val(sampleID), file(vcf) output: - tuple val(sampleID), file("*.txt"), emit: txt - + tuple val(sampleID), file("*.txt"), emit: txt, optional: true + tuple val(sampleID), file("*.temp"), emit: temp, optional: true + script: - log.info "----- snpSift DBNSFP Running on: ${sampleID} -----" // add suffix for snp indel both for output name String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] if (params.gen_org=='human'){ - fields = 'CHROM POS ID REF ALT QUAL FILTER "ANN[*].ALLELE" "ANN[*].EFFECT" "ANN[*].IMPACT" "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" "ANN[*].AA_LEN" "ANN[*].DISTANCE" "LOF[*].GENE" "LOF[*].GENEID" "LOF[*].NUMTR" "LOF[*].PERC" "NMD[*].GENE" "NMD[*].GENEID" "NMD[*].NUMTR" "NMD[*].PERC" "dbNSFP_SIFT_score" "dbNSFP_SIFT_pred" "dbNSFP_Polyphen2_HDIV_score" "dbNSFP_MutationAssessor_score" "dbNSFP_phyloP100way_vertebrate" "dbNSFP_1000Gp3_AF" "dbNSFP_1000Gp3_AFR_AF" "dbNSFP_1000Gp3_EUR_AF" "dbNSFP_1000Gp3_AMR_AF" "dbNSFP_1000Gp3_EAS_AF" "dbNSFP_ESP6500_AA_AF" "dbNSFP_ESP6500_EA_AF"' + fields = 'CHROM POS ID REF ALT QUAL FILTER AF "ANN[*].ALLELE" "ANN[*].EFFECT" "ANN[*].IMPACT" "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" "ANN[*].AA_LEN" "ANN[*].DISTANCE" "LOF[*].GENE" "LOF[*].GENEID" "LOF[*].NUMTR" "LOF[*].PERC" "NMD[*].GENE" "NMD[*].GENEID" "NMD[*].NUMTR" "NMD[*].PERC" "dbNSFP_SIFT_score" "dbNSFP_SIFT_pred" "dbNSFP_Polyphen2_HDIV_score" "dbNSFP_MutationAssessor_score" "dbNSFP_phyloP100way_vertebrate" "dbNSFP_1000Gp3_AF" "dbNSFP_1000Gp3_AFR_AF" "dbNSFP_1000Gp3_EUR_AF" "dbNSFP_1000Gp3_AMR_AF" 
"dbNSFP_1000Gp3_EAS_AF" "dbNSFP_ESP6500_AA_AF" "dbNSFP_ESP6500_EA_AF"' + suffix = 'txt' } if (params.gen_org=='mouse'){ - fields = 'CHROM POS REF ALT ID FILTER QUAL FILTER AF SNPEFF_FUNCTIONAL_CLASS SNPEFF_GENE_NAME SNPEFF_AMINO_ACID_CHANGE SNPEFF_EFFECT SNPEFF_TRANSCRIPT_ID' + fields = 'CHROM POS ID REF ALT QUAL FILTER AF "ANN[*].ALLELE" "ANN[*].EFFECT" "ANN[*].IMPACT" "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" "ANN[*].AA_LEN" "ANN[*].DISTANCE"' + + suffix = 'txt' } """ java -Xmx${my_mem}G -jar /opt/snpEff/SnpSift.jar \ extractFields ${vcf} ${fields} \ - > ${sampleID}_snpsift_finalTable.txt + > ${sampleID}_snpsift_finalTable.${suffix} """ -} \ No newline at end of file +} diff --git a/modules/squid/squid_annotate.nf b/modules/squid/squid_annotate.nf new file mode 100644 index 00000000..ce8c0bc8 --- /dev/null +++ b/modules/squid/squid_annotate.nf @@ -0,0 +1,25 @@ +process SQUID_ANNOTATE { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 5.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'docker.io/nfcore/rnafusion:squid_1.5-star2.7.1a' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'squid' }", pattern: "*.{tsv,txt}", mode:'copy' + + input: + tuple val(sampleID), path(txt) + path(gtf) + + output: + tuple val(sampleID), path("*annotated.txt"), emit: squid_fusions_annotated + + script: + """ + AnnotateSQUIDOutput.py ${gtf} ${txt} ${sampleID}_squid_fusions_annotated.txt + """ +} diff --git a/modules/squid/squid_call.nf b/modules/squid/squid_call.nf new file mode 100644 index 00000000..818f18e6 --- /dev/null +++ b/modules/squid/squid_call.nf @@ -0,0 +1,23 @@ +process SQUID { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 5.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + + container 'docker.io/nfcore/rnafusion:squid_1.5-star2.7.1a' + + input: + tuple val(sampleID), path(bam), path(chimeric_bam) + + output: + tuple val(sampleID), path("*sv.txt"), emit: squid_fusions + + script: + """ + squid -b ${bam} -c ${chimeric_bam} -o ${sampleID}.squid.fusions + """ +} diff --git a/modules/star-fusion/star-fusion.nf b/modules/star-fusion/star-fusion.nf new file mode 100644 index 00000000..fee06e1e --- /dev/null +++ b/modules/star-fusion/star-fusion.nf @@ -0,0 +1,95 @@ +process STAR_FUSION { + + tag "$sampleID" + + cpus 12 + memory 42.GB + time 5.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'trinityctat/starfusion:1.12.0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'star-fusion' }", pattern: "*.{tsv,txt}", mode:'copy' + + input: + tuple val(sampleID), file(reads) + + output: + tuple val(sampleID), file("*_star-fusion_fusions.tsv"), emit: star_fusion_fusions + tuple val(sampleID), file("*_abridged.tsv"), emit: star_fusion_fusions_abridge + tuple val(sampleID), file("*_abridged.coding_effect.tsv"), optional: true, emit: star_fusion_abridge_coding + + script: + def avail_mem = task.memory ? "--limitBAMsortRAM ${task.memory.toBytes() - 100000000}" : '' + option = params.read_type == 'PE' ? "--left_fq ${reads[0]} --right_fq ${reads[1]}" : "--left_fq ${reads[0]}" + def extra_params = params.star_fusion_opt ? params.star_fusion_opt : '' + + """ + STAR \\ + --genomeDir ${params.star_index} \\ + --readFilesIn ${reads} \\ + --twopassMode Basic \\ + --outReadsUnmapped None \\ + --chimSegmentMin 12 \\ + --chimJunctionOverhangMin 12 \\ + --alignSJDBoverhangMin 10 \\ + --alignMatesGapMax 100000 \\ + --alignIntronMax 6000 \\ + --chimSegmentReadGapMax 3 \\ + --alignSJstitchMismatchNmax 5 -1 5 5 \\ + --runThreadN ${task.cpus} \\ + --outSAMstrandField intronMotif ${avail_mem} \\ + --outSAMunmapped Within \\ + --outSAMtype BAM Unsorted \\ + --outSAMattrRGline ID:GRPundef \\ + --chimMultimapScoreRange 10 \\ + --chimMultimapNmax 10 \\ + --chimNonchimScoreDropMin 10 \\ + --peOverlapNbasesMin 12 \\ + --peOverlapMMp 0.1 \\ + --sjdbOverhang ${params.read_length - 1} \\ + --chimOutJunctionFormat 1 + + STAR-Fusion \\ + --genome_lib_dir ${params.star_fusion_ref} \\ + -J Chimeric.out.junction \\ + ${option} \\ + --CPU ${task.cpus} \\ + --examine_coding_effect \\ + --output_dir . 
${extra_params} + + mv star-fusion.fusion_predictions.tsv ${sampleID}_star-fusion_fusions.tsv + mv star-fusion.fusion_predictions.abridged.tsv ${sampleID}_star-fusion_abridged.tsv + mv star-fusion.fusion_predictions.abridged.coding_effect.tsv ${sampleID}_star-fusion_abridged.coding_effect.tsv + """ +} + +//`--readFilesCommand zcat` this option is included in STAR if files are compressed. + +/* + +To build a new reference set: + + export TMPDIR=/fastscratch/lloydm/tmp + + wget http://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam34.0/Pfam-A.hmm.gz --no-check-certificate + wget https://github.com/FusionAnnotator/CTAT_HumanFusionLib/releases/download/v0.3.0/fusion_lib.Mar2021.dat.gz -O CTAT_HumanFusionLib_Mar2021.dat.gz --no-check-certificate + wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/AnnotFilterRule.pm -O AnnotFilterRule.pm --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3f --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3i --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3m --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3p --no-check-certificate + gunzip Pfam-A.hmm.gz && hmmpress Pfam-A.hmm + + singularity exec /projects/omics_share/meta/containers/trinityctat-starfusion-1.12.0.img \ + /usr/local/src/STAR-Fusion/ctat-genome-lib-builder/prep_genome_lib.pl \ + --genome_fa /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/Homo_sapiens.GRCh38.102.all.fa \ + --gtf /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/Homo_sapiens.GRCh38.102.chr.gtf \ + --annot_filter_rule AnnotFilterRule.pm \ + --fusion_annot_lib 
CTAT_HumanFusionLib_Mar2021.dat.gz \ + --pfam_db Pfam-A.hmm \ + --dfam_db homo_sapiens_dfam.hmm \ + --max_readlength 150 \ + --CPU 8 +*/ diff --git a/modules/star/star_align.nf b/modules/star/star_align.nf new file mode 100644 index 00000000..36fd4c80 --- /dev/null +++ b/modules/star/star_align.nf @@ -0,0 +1,39 @@ +process STAR_ALIGN { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/star:2.7.8a--h9ee0642_1' + + input: + tuple val(sampleID), path(reads) + val(args) + path(gtf) + + output: + tuple val(sampleID), path('*d.out.bam'), emit: bam + tuple val(sampleID), path('*Log.final.out'), emit: log_final + tuple val(sampleID), path('*Log.out'), emit: log_out + + tuple val(sampleID), path('*sortedByCoord.out.bam'), optional:true, emit: bam_sorted + tuple val(sampleID), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript + tuple val(sampleID), path('*Aligned.unsort.out.bam'), optional:true, emit: bam_unsorted + tuple val(sampleID), path('*.tab'), optional:true, emit: tab + tuple val(sampleID), path('*.out.junction'), optional:true, emit: junction + tuple val(sampleID), path('*.out.sam'), optional:true, emit: sam + + script: + """ + STAR \\ + --genomeDir ${params.star_index} \\ + --readFilesIn ${reads} \\ + --runThreadN ${task.cpus} \\ + --outFileNamePrefix ${sampleID}_ \\ + --sjdbGTFfile ${gtf} \\ + ${args} + """ +} diff --git a/modules/subread/subread_feature_counts.nf b/modules/subread/subread_feature_counts.nf index 1c253886..7f84b371 100644 --- a/modules/subread/subread_feature_counts.nf +++ b/modules/subread/subread_feature_counts.nf @@ -16,7 +16,6 @@ process FEATURE_COUNTS { 
tuple val(sampleID), file("*_peaks_countMatrix.txt") script: - log.info "----- Feature Counts on ${sampleID} -----" """ featureCounts \ -a ${peak_cvg_saf} \ diff --git a/modules/subread/subread_feature_counts_chipseq.nf b/modules/subread/subread_feature_counts_chipseq.nf new file mode 100644 index 00000000..29c4d722 --- /dev/null +++ b/modules/subread/subread_feature_counts_chipseq.nf @@ -0,0 +1,34 @@ +process SUBREAD_FEATURECOUNTS { + tag "${antibody}" + + cpus 4 + memory 4.GB + time '10:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'consensusCalling_'+antibody+'/subread' : 'subread' }", pattern: "*.txt*", mode: 'copy' + + container 'quay.io/biocontainers/subread:2.0.1--hed695b0_0' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), path(bams), path(saf) + + output: + tuple val(antibody), file("*featureCounts.txt") , emit: counts + tuple val(antibody), file("*featureCounts.txt.summary"), emit: summary + + script: + prefix = "${antibody}.consensus_peaks" + bam_files = bams.findAll { it.toString().endsWith('.bam') }.sort() + pe_params = params.read_type == 'SE' ? '' : '-p --donotsort' + """ + featureCounts \\ + -F SAF \\ + -O \\ + --fracOverlap 0.2 \\ + -T $task.cpus \\ + $pe_params \\ + -a $saf \\ + -o ${prefix}.featureCounts.txt \\ + ${bam_files.join(' ')} + """ +} diff --git a/modules/svaba/svaba.nf b/modules/svaba/svaba.nf new file mode 100644 index 00000000..693747a4 --- /dev/null +++ b/modules/svaba/svaba.nf @@ -0,0 +1,101 @@ +process SVABA { + tag "$sampleID" + + cpus = 8 + memory { normal_bam.size() < 60.GB ? 15.GB : 48.GB } + time { normal_bam.size() < 60.GB ? '10:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/svaba:v0.2.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$sampleID" + '/callers' : 'svaba' }", pattern: "*.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path("*svaba.germline.indel.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_germline_indel_vcf + tuple val(sampleID), path("*svaba.germline.sv.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_germline_sv_vcf + tuple val(sampleID), path("*svaba.somatic.indel.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_somatic_indel_vcf + tuple val(sampleID), path("*svaba.somatic.sv.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_somatic_sv_vcf + tuple val(sampleID), path("*svaba.bps.txt.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_unfiltered_variants + tuple val(sampleID), path("*svaba.contigs.bam"), emit: svaba_contigs_bam + tuple val(sampleID), path("*svaba.discordant.txt.gz"), emit: svaba_discordants + tuple val(sampleID), path("*svaba.log"), emit: svaba_log + tuple val(sampleID), path("*svaba.alignments.txt.gz"), emit: svaba_alignments + + script: + """ + svaba run \ + -t ${tumor_bam} \ + -n ${normal_bam} \ + -p ${task.cpus} \ + -a ${sampleID}_svaba \ + -G ${params.combined_reference_set} \ + --region ${params.callRegions} \ + -D ${params.dbSNP} \ + -z on + """ +} +// NOTE: VCF Output header has the BAM file names as 'sampleID' e.g.,: +// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test-test_realigned_BQSR.bam 
test-test2_realigned_BQSR.bam + +// Usage: svaba run -t -G -a myid [OPTIONS] + +// Description: SV and indel detection using rolling SGA assembly and BWA-MEM realignment + +// General options +// -v, --verbose Select verbosity level (0-4). Default: 0 +// -h, --help Display this help and exit +// -p, --threads Use NUM threads to run svaba. Default: 1 +// -a, --id-string String specifying the analysis ID to be used as part of ID common. +// Main input +// -G, --reference-genome Path to indexed reference genome to be used by BWA-MEM. +// -t, --case-bam Case BAM/CRAM/SAM file (eg tumor). Can input multiple. +// -n, --control-bam (optional) Control BAM/CRAM/SAM file (eg normal). Can input multiple. +// -k, --region Run on targeted intervals. Accepts BED file or Samtools-style string +// --germline Sets recommended settings for case-only analysis (eg germline). (-I, -L5, assembles NM >= 3 reads) +// Variant filtering and classification +// --lod LOD cutoff to classify indel as non-REF (tests AF=0 vs AF=MaxLikelihood(AF)) [8] +// --lod-dbsnp LOD cutoff to classify indel as non-REF (tests AF=0 vs AF=MaxLikelihood(AF)) at DBSnp indel site [5] +// --lod-somatic LOD cutoff to classify indel as somatic (tests AF=0 in normal vs AF=ML(0.5)) [2.5] +// --lod-somatic-dbsnp LOD cutoff to classify indel as somatic (tests AF=0 in normal vs AF=ML(0.5)) at DBSnp indel site [4] +// --scale-errors Scale the priors that a site is artifact at given repeat count. 0 means assume low (const) error rate [1] +// Additional options +// -L, --mate-lookup-min Minimum number of somatic reads required to attempt mate-region lookup [3] +// -s, --disc-sd-cutoff Number of standard deviations of calculated insert-size distribution to consider discordant. [3.92] +// -c, --chunk-size Size of a local assembly window (in bp). Set 0 for whole-BAM in one assembly. [25000] +// -x, --max-reads Max total read count to read in from assembly region. Set 0 to turn off. 
[50000] +// -C, --max-coverage Max read coverage to send to assembler (per BAM). Subsample reads if exceeded. [500] +// --no-interchrom-lookup Skip mate lookup for inter-chr candidate events. Reduces power for translocations but less I/O. +// --discordant-only Only run the discordant read clustering module, skip assembly. +// --num-assembly-rounds Run assembler multiple times. > 1 will bootstrap the assembly. [2] +// --num-to-sample When learning about inputs, number of reads to sample. [2,000,000] +// --hp Highly parallel. Don't write output until completely done. More memory, but avoids all thread-locks. +// Output options +// -z, --g-zip Gzip and tabix the output VCF files. [off] +// -A, --all-contigs Output all contigs that were assembled, regardless of mapping or length. [off] +// --read-tracking Track supporting reads by qname. Increases file sizes. [off] +// --write-extracted-reads For the case BAM, write reads sent to assembly to a BAM file. [off] +// Optional external database +// -D, --dbsnp-vcf DBsnp database (VCF) to compare indels against +// -B, --blacklist BED-file with blacklisted regions to not extract any reads from. +// -Y, --microbial-genome Path to indexed reference genome of microbial sequences to be used by BWA-MEM to filter reads. +// -V, --germline-sv-database BED file containing sites of known germline SVs. Used as additional filter for somatic SV detection +// -R, --simple-seq-database BED file containing sites of simple DNA that can confuse the contig re-alignment. +// Assembly and EC params +// -m, --min-overlap Minimum read overlap, an SGA parameter. Default: 0.4* readlength +// -e, --error-rate Fractional difference two reads can have to overlap. See SGA. 0 is fast, but requires error correcting. [0] +// -K, --ec-correct-type (f) Fermi-kit BFC correction, (s) Kmer-correction from SGA, (0) no correction (then suggest non-zero -e) [f] +// -E, --ec-subsample Learn from fraction of non-weird reads during error-correction. 
Lower number = faster compute [0.5] +// --write-asqg Output an ASQG graph file for each assembly window. +// BWA-MEM alignment params +// --bwa-match-score Set the BWA-MEM match score. BWA-MEM -A [2] +// --gap-open-penalty Set the BWA-MEM gap open penalty for contig to genome alignments. BWA-MEM -O [32] +// --gap-extension-penalty Set the BWA-MEM gap extension penalty for contig to genome alignments. BWA-MEM -E [1] +// --mismatch-penalty Set the BWA-MEM mismatch penalty for contig to genome alignments. BWA-MEM -b [18] +// --bandwidth Set the BWA-MEM SW alignment bandwidth for contig to genome alignments. BWA-MEM -w [1000] +// --z-dropoff Set the BWA-MEM SW alignment Z-dropoff for contig to genome alignments. BWA-MEM -d [100] +// --reseed-trigger Set the BWA-MEM reseed trigger for reseeding mems for contig to genome alignments. BWA-MEM -r [1.5] +// --penalty-clip-3 Set the BWA-MEM penalty for 3' clipping. [5] +// --penalty-clip-5 Set the BWA-MEM penalty for 5' clipping. [5] diff --git a/modules/tabix/compress_merged_vcf.nf b/modules/tabix/compress_merged_vcf.nf new file mode 100644 index 00000000..36159f9f --- /dev/null +++ b/modules/tabix/compress_merged_vcf.nf @@ -0,0 +1,29 @@ +process COMPRESS_INDEX_MERGED_VCF { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' + + input: + tuple val(sampleID), file(vcf), val(meta) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), emit: compressed_vcf_tbi + + script: + normal_name = meta.normal_id + tumor_name = meta.tumor_id + + """ + bgzip \ + -c \ + ${vcf} \ + > ${vcf}.gz + + tabix ${vcf}.gz + """ +} diff --git a/modules/tabix/compress_vcf.nf b/modules/tabix/compress_vcf.nf new file mode 100644 index 00000000..79a476b6 --- /dev/null +++ b/modules/tabix/compress_vcf.nf @@ -0,0 +1,25 @@ +process COMPRESS_INDEX_VCF { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: compressed_vcf_tbi + + """ + bgzip \ + -c \ + ${vcf} \ + > ${vcf}.gz + + tabix ${vcf}.gz + """ +} diff --git a/modules/tabix/compress_vcf_region.nf b/modules/tabix/compress_vcf_region.nf new file mode 100644 index 00000000..9b6e2366 --- /dev/null +++ b/modules/tabix/compress_vcf_region.nf @@ -0,0 +1,25 @@ +process COMPRESS_INDEX_VCF_REGION { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), val(meta), val('empty_name'), val('empty_name'), val(chrom), emit: compressed_vcf_tbi + + """ + bgzip \ + -c \ + ${vcf} \ + > ${vcf}.gz + + tabix ${vcf}.gz + """ +} diff --git a/modules/trim_galore/trim_galore.nf b/modules/trim_galore/trim_galore.nf index f8c4b8bb..b9968dfd 100644 --- a/modules/trim_galore/trim_galore.nf +++ b/modules/trim_galore/trim_galore.nf @@ -4,11 +4,25 @@ process TRIM_GALORE { cpus 8 memory 16.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/trim-galore:0.6.7--hdfd78af_0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/trimmed_fastq' : 'trim_galore' }", pattern: "*.fq.gz", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'fastqc' }", pattern: "*_fastqc.{zip,html}", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'trim_report' }", pattern: "*trimming_report.txt", mode:'copy' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/trimmed_fastq' : 'trim_galore'}" + }, pattern: "*.fq.gz", mode: 'copy', enabled: params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? 'fastqc/' : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/stats' : 'fastqc'}" + }, pattern: "*_fastqc.{zip,html}", mode: 'copy' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? 'fastqc/' : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/trimmed_fastq' : 'trim_galore'}" + }, pattern: "*trimming_report.txt", mode: 'copy' + input: tuple val(sampleID), file(fq_reads) @@ -19,7 +33,6 @@ process TRIM_GALORE { tuple val(sampleID), file("*trimming_report.txt"), emit: trim_stats script: - log.info "----- Trim Galore Running on: ${sampleID} -----" paired_end = params.read_type == 'PE' ? '--paired' : '' rrbs_flag = params.workflow == "rrbs" ? '--rrbs' : '' @@ -39,8 +52,22 @@ process TRIM_GALORE { refer to the RRBS guide for the meaning of CTOT and CTOB strands). */ + if (params.workflow == "chipseq" && params.read_type == 'SE') + """ + [ ! -f ${sampleID}.fastq.gz ] && ln -s ${fq_reads} ${sampleID}.fastq.gz + + trim_galore --cores ${task.cpus} ${paired_end} ${rrbs_flag} ${directionality} --gzip --length ${params.trimLength} -q ${params.qualThreshold} --stringency ${params.adapOverlap} -a ${params.adaptorSeq} --fastqc ${sampleID}.fastq.gz """ + else if (params.workflow == "chipseq" && params.read_type == 'PE') + """ + [ ! -f ${sampleID}_1.fastq.gz ] && ln -s ${fq_reads[0]} ${sampleID}_1.fastq.gz + [ ! 
-f ${sampleID}_2.fastq.gz ] && ln -s ${fq_reads[1]} ${sampleID}_2.fastq.gz + + trim_galore --cores ${task.cpus} ${paired_end} ${rrbs_flag} ${directionality} --gzip --length ${params.trimLength} -q ${params.qualThreshold} --stringency ${params.adapOverlap} -a ${params.adaptorSeq} --fastqc ${sampleID}_1.fastq.gz ${sampleID}_2.fastq.gz + """ + else + """ trim_galore --basename ${sampleID} --cores ${task.cpus} ${paired_end} ${rrbs_flag} ${directionality} --gzip --length ${params.trimLength} -q ${params.qualThreshold} --stringency ${params.adapOverlap} -a ${params.adaptorSeq} --fastqc ${fq_reads} """ -} +} diff --git a/modules/ucsc/ucsc_bedgraphtobigwig.nf b/modules/ucsc/ucsc_bedgraphtobigwig.nf new file mode 100644 index 00000000..439e69d6 --- /dev/null +++ b/modules/ucsc/ucsc_bedgraphtobigwig.nf @@ -0,0 +1,40 @@ +process UCSC_BEDGRAPHTOBIGWIG { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/bigwig' : 'ucsc'}" + }, pattern: "*.bigWig", mode: 'copy' + + container 'quay.io/biocontainers/ucsc-bedgraphtobigwig:377--h446ed27_1' + + input: + tuple val(sampleID), file(bedgraph) + file(sizes) + + output: + tuple val(sampleID), file("*.bigWig"), emit: bigwig + + script: + """ + bedGraphToBigWig \\ + $bedgraph \\ + $sizes \\ + ${sampleID}.bigWig + + + """ +} + +/* +IGV steps removed, re-add if IGV is needed: + + OUTPUT: tuple val(sampleID), file("*.igv.txt"), emit: igv_txt + + SCRIPT: find * -type f -name "*.bigWig" -exec echo -e "bigwig/"{}"\\t0,0,178" \\; > ${sampleID}.bigWig.igv.txt + +*/ \ No newline at end of file diff --git a/modules/utility_modules/aggregate_stats_rna.nf b/modules/utility_modules/aggregate_stats_rna.nf index 707705bd..7f8f0675 100644 --- a/modules/utility_modules/aggregate_stats_rna.nf +++ b/modules/utility_modules/aggregate_stats_rna.nf @@ -3,6 +3,7 @@ process RNA_SUMMARY_STATS { cpus = 1 time = '00:15:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/perl:0.1.0' @@ -15,7 +16,6 @@ process RNA_SUMMARY_STATS { tuple val(sampleID), file("*.txt") script: - log.info "----- Summary Metrics running on ${sampleID} -----" if (params.read_type == "PE") diff --git a/modules/utility_modules/aggregate_stats_wes.nf b/modules/utility_modules/aggregate_stats_wes.nf index 7c6304a3..79052719 100644 --- a/modules/utility_modules/aggregate_stats_wes.nf +++ b/modules/utility_modules/aggregate_stats_wes.nf @@ -3,6 +3,7 @@ process AGGREGATE_STATS { cpus = 1 time = '00:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' @@ -15,7 +16,6 @@ process AGGREGATE_STATS { tuple val(sampleID), file("*summary_stats.txt"), emit: txt script: - log.info "----- Generating Summary Stats for: ${sampleID} -----" """ python ${projectDir}/bin/wes/aggregate_stats_wes.py ${sampleID}_summary_stats.txt ${filter_stats} ${picard_met} ${algn_met} diff --git a/modules/utility_modules/aggregate_stats_wgs.nf b/modules/utility_modules/aggregate_stats_wgs.nf index 23c08334..03d4b0c5 100644 --- a/modules/utility_modules/aggregate_stats_wgs.nf +++ b/modules/utility_modules/aggregate_stats_wgs.nf @@ -3,6 +3,7 @@ process AGGREGATE_STATS { cpus = 1 time = '00:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' @@ -15,7 +16,6 @@ process AGGREGATE_STATS { tuple val(sampleID), file("*summary_stats.txt"), emit: txt script: - log.info "----- Generating Summary Stats for: ${sampleID} -----" """ python ${projectDir}/bin/wgs/aggregate_stats_wgs.py ${sampleID}_summary_stats.txt ${filter_stats} ${picard_met} ${algn_met} ${cov_met} diff --git a/modules/utility_modules/aria_download.nf b/modules/utility_modules/aria_download.nf new file mode 100644 index 00000000..7bb02477 --- /dev/null +++ b/modules/utility_modules/aria_download.nf @@ -0,0 +1,23 @@ +process ARIA_DOWNLOAD { + + tag "$sampleID" + + cpus 1 + memory 15.GB + time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/aria2:1.36.0' + + input: + tuple val(sampleID), val(meta), val(read_num), val(link) + + output: + tuple val(sampleID), val(meta), val(read_num), path("*"), emit: file + + script: + + """ + aria2c --connect-timeout=180 --retry-wait=60 --timeout=180 ${link} + """ +} diff --git a/modules/utility_modules/chipseq_bampe_rm_orphan.nf b/modules/utility_modules/chipseq_bampe_rm_orphan.nf new file mode 100644 index 00000000..6b4d3c84 --- /dev/null +++ b/modules/utility_modules/chipseq_bampe_rm_orphan.nf @@ -0,0 +1,17 @@ +process BAMPE_RM_ORPHAN { + tag "$sampleID" + + container 'quay.io/biocontainers/mulled-v2-57736af1eb98c01010848572c9fec9fff6ffaafd:402e865b8f6af2f3e58c6fc8d57127ff0144b2c7-0' + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: // This script was bundled within the nf-core/chipseq/bin/ directory + prefix = "${sampleID}.mLb.clN" + """ + python ${projectDir}/bin/chipseq/bampe_rm_orphan.py ${bam[0]} ${prefix}.bam --only_fr_pairs + """ +} diff --git a/modules/utility_modules/chipseq_check_design.nf b/modules/utility_modules/chipseq_check_design.nf new file mode 100644 index 00000000..f98d40e6 --- /dev/null +++ b/modules/utility_modules/chipseq_check_design.nf @@ -0,0 +1,17 @@ +process CHECK_DESIGN { + tag "$design" + publishDir "${params.pubdir}/parsed_samplesheets", mode: 'copy' + + input: + path(design) + + output: + path('design_reads.csv'), emit: sample_reads + path('design_controls.csv'), emit: study_design + + script: + """ + python ${projectDir}/bin/chipseq/check_design.py $design design_reads.csv design_controls.csv + """ +} + +diff --git
a/modules/utility_modules/chipseq_make_genome_filter.nf b/modules/utility_modules/chipseq_make_genome_filter.nf new file mode 100644 index 00000000..87f0a063 --- /dev/null +++ b/modules/utility_modules/chipseq_make_genome_filter.nf @@ -0,0 +1,20 @@ +process MAKE_GENOME_FILTER { + tag "$fai" + publishDir "${params.pubdir}/genome_info", mode: 'copy' + + input: + file(fai) + file(blacklist) + + output: + path('*.bed'), emit: bed + path('*.sizes'), emit: sizes + + script: + fasta="\$(echo ${fai} | sed 's/.fai//g')" + blacklist_filter = params.blacklist ? "sortBed -i $blacklist -g ${fasta}.sizes | complementBed -i stdin -g ${fasta}.sizes" : "awk '{print \$1, '0' , \$2}' OFS='\t' ${fasta}.sizes" + """ + cut -f 1,2 ${fai} > ${fasta}.sizes + $blacklist_filter > ${fasta}.include_regions.bed + """ +} diff --git a/modules/utility_modules/concatenate_reads_PE.nf b/modules/utility_modules/concatenate_reads_PE.nf index d7e705fe..cc5786a0 100644 --- a/modules/utility_modules/concatenate_reads_PE.nf +++ b/modules/utility_modules/concatenate_reads_PE.nf @@ -5,20 +5,22 @@ process CONCATENATE_READS_PE { cpus 1 memory 15.GB time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*fastq.gz", mode:'copy' + container 'ubuntu:20.04' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*", mode:'copy' input: tuple val(sampleID), file(R1), file(R2) output: - tuple val(sampleID), file("*fastq.gz"), emit: concat_fastq + tuple val(sampleID), file("*"), emit: concat_fastq script: - log.info "----- Concatenate Reads Running on: ${sampleID} -----" """ - cat $R1 > ${sampleID}_R1.fastq.gz - cat $R2 > ${sampleID}_R2.fastq.gz + cat $R1 > ${sampleID}_R1${params.extension} + cat $R2 > ${sampleID}_R2${params.extension} """ } diff --git a/modules/utility_modules/concatenate_reads_SE.nf b/modules/utility_modules/concatenate_reads_SE.nf index f110e575..c9eb5b12 100644 --- a/modules/utility_modules/concatenate_reads_SE.nf +++ b/modules/utility_modules/concatenate_reads_SE.nf @@ -5,19 +5,21 @@ process CONCATENATE_READS_SE { cpus 1 memory 15.GB time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*fastq.gz", mode:'copy' + container 'ubuntu:20.04' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*", mode:'copy' input: tuple val(sampleID), file(R1) output: - tuple val(sampleID), file("*fastq.gz"), emit: concat_fastq + tuple val(sampleID), file("*"), emit: concat_fastq script: - log.info "----- Concatenate Reads Running on: ${sampleID} -----" """ - cat $R1 > ${sampleID}_R1.fastq.gz + cat $R1 > ${sampleID}_R1${params.extension} """ } diff --git a/modules/utility_modules/concatenate_reads_sampleSheet.nf b/modules/utility_modules/concatenate_reads_sampleSheet.nf new file mode 100644 index 00000000..d22695e9 --- /dev/null +++ b/modules/utility_modules/concatenate_reads_sampleSheet.nf @@ -0,0 +1,26 @@ +process CONCATENATE_READS_SAMPLESHEET { + + tag "$sampleID" + + cpus 1 + memory 15.GB + time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*fastq.gz", mode:'copy' + + input: + tuple val(sampleID), val(num_lanes), val(meta), val(read_num), path(reads) + + output: + tuple val(sampleID), val(num_lanes), val(meta), val(read_num), path("*fastq.gz"), emit: concat_fastq + + when: + num_lanes > 1 + + script: + + """ + cat $reads > ${sampleID}_${read_num}.fastq.gz + """ +} diff --git a/modules/utility_modules/deseq2_qc.nf b/modules/utility_modules/deseq2_qc.nf new file mode 100644 index 00000000..e9b75b2f --- /dev/null +++ b/modules/utility_modules/deseq2_qc.nf @@ -0,0 +1,52 @@ +process DESEQ2_QC { + tag "${antibody}" + + cpus 1 + memory 15.GB + time '10:00:00' + + container 'quay.io/biocontainers/mulled-v2-8849acf39a43cdd6c839a369a74c0adc823e2f91:ab110436faf952a33575c64dd74615a84011450b-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'consensusCalling_'+antibody+'/deseq2' : 'deseq2' }", mode: 'copy' + + input: + tuple val(antibody), path(counts) + file(deseq2_pca_header) + file(deseq2_clustering_header) + + output: + path "*.pdf" , optional:true, emit: pdf + path "*.RData" , optional:true, emit: rdata + path "*.rds" , optional:true, emit: rds + path "*pca.vals.txt" , optional:true, emit: pca_txt + path "*pca.vals_mqc.tsv" , optional:true, emit: pca_multiqc + path "*sample.dists.txt" , optional:true, emit: dists_txt + path "*sample.dists_mqc.tsv", optional:true, emit: dists_multiqc + path "*.log" , optional:true, emit: log + path "size_factors" , optional:true, emit: size_factors + + + script: + prefix = "${antibody}.consensus_peaks" + bam_ext = params.read_type == 'SE' ? '.mLb.clN.sorted.bam' : '.mLb.clN.bam' + vst = params.deseq2_vst ? '--vst TRUE' : '' + peak_type = params.narrow_peak ? 
'narrowPeak' : 'broadPeak' + """ + ${projectDir}/bin/chipseq/deseq2_qc.r \\ + --count_file $counts \\ + --sample_suffix '$bam_ext' \\ + --outdir ./ \\ + --outprefix $prefix \\ + --cores $task.cpus \\ + --id_col 1 --count_col 7 --vst TRUE + + sed 's/deseq2_pca/deseq2_pca_${task.index}/g' <$deseq2_pca_header >tmp.txt + sed -i -e 's/DESeq2 /${antibody} DESeq2 /g' tmp.txt + cat tmp.txt ${prefix}.pca.vals.txt > ${prefix}.pca.vals_mqc.tsv + + sed 's/deseq2_clustering/deseq2_clustering_${task.index}/g' <$deseq2_clustering_header >tmp.txt + sed -i -e 's/DESeq2 /${antibody} DESeq2 /g' tmp.txt + cat tmp.txt ${prefix}.sample.dists.txt > ${prefix}.sample.dists_mqc.tsv + + """ +} diff --git a/modules/utility_modules/frip_score.nf b/modules/utility_modules/frip_score.nf new file mode 100644 index 00000000..375e1274 --- /dev/null +++ b/modules/utility_modules/frip_score.nf @@ -0,0 +1,40 @@ +process FRIP_SCORE { + tag "${ip} vs ${control}" + + cpus 1 + memory 10.GB + time '10:00:00' + + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/'+ip+'_vs_'+control+'/macs2' : 'macs2' }", pattern: "*.tsv", mode: 'copy' + + container 'quay.io/biocontainers/mulled-v2-8186960447c5cb2faa697666dc1e6d919ad23f3e:3127fcae6b6bdaf8181e21a26ae61231030a9fcb-0' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), path(ipbam), val(control), path(controlbam), path(ipflagstat), path(peak) + path(peak_count_header) + path(frip_score_header) + + output: + tuple val(ip), path("*.tsv"), emit : tsv + + script: + def PEAK_TYPE = params.narrow_peak ? 
'narrowPeak' : 'broadPeak' + """ + cat $peak | wc -l | awk -v OFS='\t' '{ print "${ip}", \$1 }' | cat $peak_count_header - > ${ip}_peaks.count_mqc.tsv + READS_IN_PEAKS=\$(intersectBed -a ${ipbam[0]} -b $peak -bed -c -f 0.20 | awk -F '\t' '{sum += \$NF} END {print sum}') + grep 'mapped (' $ipflagstat | awk -v a="\$READS_IN_PEAKS" -v OFS='\t' '{print "${ip}", a/\$1}' | cat $frip_score_header - > ${ip}_peaks.FRiP_mqc.tsv + + + """ +} + +/* +IGV steps removed, re-add if IGV is needed: + + PUBDIR: publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'comparison/'+ip+'_vs_'+control+'/macs2' : 'macs2' }", pattern: "*.txt", mode: 'copy' + + OUTPUT: tuple val(ip), path("*.txt"), emit : txt + + SCRIPT: find * -type l -name "*.${PEAK_TYPE}" -exec echo -e "macs2/"{}"\\t0,0,178" \\; > ${ip}_peaks.igv.txt +*/ \ No newline at end of file diff --git a/modules/utility_modules/get_read_length.nf b/modules/utility_modules/get_read_length.nf new file mode 100644 index 00000000..e32c4ddd --- /dev/null +++ b/modules/utility_modules/get_read_length.nf @@ -0,0 +1,19 @@ +process GET_READ_LENGTH { + tag "$sampleID" + + cpus = 1 + time = '00:05:00' + + container 'ubuntu:20.04' + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), env(READ_LENGTH), emit: read_length + + script: + """ + READ_LENGTH=`zcat ${reads[0]} | head -n 400 | awk 'NR%4==2{m=length(\$0)}{print m}' | sort -n | tail -1` + """ +} \ No newline at end of file diff --git a/modules/utility_modules/gunzip.nf b/modules/utility_modules/gunzip.nf new file mode 100644 index 00000000..4e6bb93a --- /dev/null +++ b/modules/utility_modules/gunzip.nf @@ -0,0 +1,34 @@ +process GUNZIP { + + tag "$sampleID" + + cpus 1 + memory 5.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + + container "quay.io/jaxcompsci/py3_perl_pylibs:v2" + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*.{fastq,fq}"), emit: gunzip_fastq + shell: + + ''' + if [[ !{reads[0]} =~ ".gz" ]]; + then + gunzip -c !{reads[0]} > !{reads[0].baseName} + else + mv !{reads[0]} input_!{reads[0]} + fi + if [[ !{reads[1]} =~ ".gz" ]]; + then + gunzip -c !{reads[1]} > !{reads[1].baseName} + else + mv !{reads[1]} input_!{reads[1]} + fi + ''' +} diff --git a/modules/utility_modules/jax_trimmer.nf b/modules/utility_modules/jax_trimmer.nf new file mode 100644 index 00000000..ba6edcfd --- /dev/null +++ b/modules/utility_modules/jax_trimmer.nf @@ -0,0 +1,35 @@ +process JAX_TRIMMER { + + tag "$sampleID" + + cpus 1 + memory 30.GB + time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'quality_stats' }", pattern: "*_stat", mode:'copy' + + input: + tuple val(sampleID), path(fq_reads) + + output: + tuple val(sampleID), file("*_stat"), emit: quality_stats + tuple val(sampleID), file("*filtered_trimmed"), emit: trimmed_fastq + + script: + + if (params.read_type == "SE"){ + mode_HQ="-S -M" + inputfq="${fq_reads[0]}" + } + if (params.read_type == "PE"){ + mode_HQ="-M" + inputfq="${fq_reads[0]} ${fq_reads[1]}" + } + + """ + python ${projectDir}/bin/shared/filter_trim.py $mode_HQ ${params.min_pct_hq_reads} -p ${params.hq_pct} $inputfq + """ +} diff --git a/modules/utility_modules/make_vcf_list.nf b/modules/utility_modules/make_vcf_list.nf index a92cff18..0f08dc51 100644 --- a/modules/utility_modules/make_vcf_list.nf +++ b/modules/utility_modules/make_vcf_list.nf @@ -1,5 +1,6 @@ process MAKE_VCF_LIST { tag "$sampleID" + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} input: tuple val(sampleID), val(chroms) @@ -9,7 +10,6 @@ process MAKE_VCF_LIST { tuple val(sampleID), file("*.list"), emit: list script: - log.info "----- Make VCF List from Chromosomes: ${sampleID} -------" // Puts Individual Chromosome Files In Order and Then Into List for MergeVCFs // convert paths to strings diff --git a/modules/utility_modules/parse_extracted_sv_table.nf b/modules/utility_modules/parse_extracted_sv_table.nf new file mode 100644 index 00000000..b8c0e24c --- /dev/null +++ b/modules/utility_modules/parse_extracted_sv_table.nf @@ -0,0 +1,28 @@ +process SNPSIFT_EXTRACT_AND_PARSE { + + // NOTE: This script is for the parsing of the 'SV' pipeline germline annotationed table from snpeff extractfields. + // It is hard coded to the annotations used. 
+ + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.txt", mode:'copy' + + input: + tuple val(sampleID), file(table) + + output: + tuple val(sampleID), file("*.txt"), emit: txt + + script: + + """ + python ${projectDir}/bin/pta/split_annotations.py ${table} ${sampleID}_annotated_filtered_final_table.txt + """ +} diff --git a/modules/utility_modules/quality_stats.nf b/modules/utility_modules/quality_stats.nf deleted file mode 100644 index 8ac6848e..00000000 --- a/modules/utility_modules/quality_stats.nf +++ /dev/null @@ -1,35 +0,0 @@ -process QUALITY_STATISTICS { - - tag "$sampleID" - - cpus 1 - memory 30.GB - time '24:00:00' - - container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' - - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'quality_stats' }", pattern: "*fastq.gz_stat", mode:'copy' - - input: - tuple val(sampleID), file(fq_reads) - - output: - tuple val(sampleID), file("*.fastq.gz_stat"), emit: quality_stats - tuple val(sampleID), file("*filtered_trimmed"), emit: trimmed_fastq - - script: - log.info "----- Quality Stats Running on: ${sampleID} -----" - - if (params.read_type == "SE"){ - mode_HQ="-S -M" - inputfq="${fq_reads[0]}" - } - if (params.read_type == "PE"){ - mode_HQ="-M" - inputfq="${fq_reads[0]} ${fq_reads[1]}" - } - - """ - python ${projectDir}/bin/shared/filter_trim.py $mode_HQ ${params.min_pct_hq_reads} $inputfq - """ -} diff --git a/modules/utility_modules/read_groups.nf b/modules/utility_modules/read_groups.nf index 9d41d75b..a27d951d 100644 --- a/modules/utility_modules/read_groups.nf +++ b/modules/utility_modules/read_groups.nf @@ -4,10 +4,11 @@ process READ_GROUPS { cpus 1 memory 5.GB time '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'read_groups' }", pattern: "*read_group.txt", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'read_groups' }", pattern: "*read_group.txt", mode:'copy', enabled: params.workflow == 'rnaseq' || params.keep_intermediate input: tuple val(sampleID), file(fq_reads) @@ -17,7 +18,6 @@ process READ_GROUPS { tuple val(sampleID), file("*.txt"), emit: read_groups script: - log.info "----- Read Group Information Determination Running on: ${sampleID} -----" if (picard=="picard"){ p='-p' } @@ -25,6 +25,6 @@ process READ_GROUPS { p='' } """ - python ${projectDir}/bin/shared/read_group_from_fastq.py $p -o ${sampleID}_read_group.txt ${fq_reads[0]} + python ${projectDir}/bin/shared/read_group_from_fastq.py $p -s ${sampleID} -o ${sampleID}_read_group.txt ${fq_reads[0]} """ } diff --git a/modules/utility_modules/rna_covcalc_gatk.nf b/modules/utility_modules/rna_covcalc_gatk.nf index 7abbf82e..33b4078f 100644 --- a/modules/utility_modules/rna_covcalc_gatk.nf +++ b/modules/utility_modules/rna_covcalc_gatk.nf @@ -4,10 +4,10 @@ process COVCALC_GATK { cpus 1 memory 15.GB time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' - // store in /stats publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.bed", mode:'copy' input: @@ -18,7 +18,6 @@ process COVCALC_GATK { tuple val(sampleID), file("*.bed"), emit: bed script: - log.info "----- GATK COVCALC Running on: ${sampleID} -----" """ python ${projectDir}/bin/rnaseq/coveragecalculator.py ${txt} ${sampleID}_${filename}_avg_median_coverage.bed diff --git a/modules/utility_modules/rna_format_gatk.nf b/modules/utility_modules/rna_format_gatk.nf index 0b1e3cc2..529bec33 100644 --- a/modules/utility_modules/rna_format_gatk.nf +++ b/modules/utility_modules/rna_format_gatk.nf @@ -4,10 +4,10 @@ process FORMAT_GATK { cpus 1 memory 15.GB time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' - file(params.ref_fai) - + input: tuple val(sampleID), file(txt) val(L) @@ -16,10 +16,9 @@ process FORMAT_GATK { tuple val(sampleID), file("*_gatk_formatter.txt"), emit: txt script: - log.info "----- GATK Formatter Running on: ${sampleID} -----" """ chmod +x ${projectDir}/bin/rnaseq/gatk_formatter.sh ${projectDir}/bin/rnaseq/gatk_formatter.sh ${txt} ${sampleID}_gatk_temp2.txt ${sampleID}_gatk_formatter.txt ${L} """ // This is a script to format gatk coverage file for subsequent use in log aggregation -} \ No newline at end of file +} diff --git a/modules/vcftools/vcf_annotate.nf b/modules/vcftools/vcf_annotate.nf index a403bf05..135aa3a4 100644 --- a/modules/vcftools/vcf_annotate.nf +++ b/modules/vcftools/vcf_annotate.nf @@ -4,6 +4,7 @@ process VCF_ANNOTATE { cpus = 1 memory = 10.GB time = '23:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} input: tuple val(sampleID), file(snp_vcf) @@ -12,14 +13,11 @@ process VCF_ANNOTATE { output: tuple val(sampleID), file("*.vcf"), emit: vcf - // vcftools container needed container 'quay.io/biocontainers/perl-vcftools-vcf:0.1.16--pl5321hdfd78af_4' script: - log.info "----- CAT VCF-ANNOTATE Running on: ${sampleID} -----" if (params.gen_org=='mouse'){ - // make sure it does not break delta="CHROM,POS,ID,REF,ALT" } else if (params.gen_org=='human'){ diff --git a/modules/xenome/xenome.nf b/modules/xenome/xenome.nf new file mode 100644 index 00000000..7e2b1211 --- /dev/null +++ b/modules/xenome/xenome.nf @@ -0,0 +1,29 @@ +process XENOME_CLASSIFY { + tag "$sampleID" + + cpus 8 + memory 50.GB + time 8.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/xenome:1.0.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/stats': 'xenome' }", pattern: "*.txt", mode:'copy' + + input: + tuple val(sampleID), path(trimmed) + + output: + tuple val(sampleID), path("human*.fastq"), emit: xenome_fastq + tuple val(sampleID), path("mouse*.fastq"), emit: xenome_mouse_fastq + tuple val(sampleID), path("*.txt"), emit: xenome_stats + + script: + + read_input = params.read_type == 'PE' ? "-i ${trimmed[0]} -i ${trimmed[1]}" : "-i ${trimmed[0]}" + pairs = params.read_type == 'PE' ? 
"--pairs" : "" + + """ + /xenome-1.0.1-r/xenome classify -T 8 -P ${params.xenome_prefix} ${pairs} --host-name mouse --graft-name human ${read_input} > ${sampleID}_xenome_stats.txt + """ +} diff --git a/nextflow.config b/nextflow.config index b1018417..f6b06c5a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,7 +2,7 @@ Nextflow DSL2 Main Config - Author(s): Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson, Sai Lek + Authors: Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson, Sai Lek, Harshpreet Chandok, Peter Fields Copyright of Jackson Laboratories 2022 _____________________________________________________*/ @@ -32,17 +32,25 @@ params { } // specific config for the pipeline -includeConfig params.config + + +try { + includeConfig params.config +} catch (Exception e) { + System.err.println("ERROR: Could not load ${params.config} check that you are using a valid pipeline name") +} + + // work directory is important as it will be large, plan accordingly -workDir = "/fastscratch/nextflow/${params.workflow}" +workDir = "/fastscratch/${USER}/${params.workflow}" manifest { name = "The Jackson Laboratory Computational Sciences Nextflow based analysis pipelines" homePage = "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" mainScript = "main.nf" nextflowVersion = "!>=20.10.0" - version = "0.2.0" + version = "0.3.0" } profiles { diff --git a/run.sh b/run.sh index ff81e06f..f05fcff4 100644 --- a/run.sh +++ b/run.sh @@ -19,5 +19,5 @@ nextflow main.nf \ --workflow rnaseq \ --gen_org mouse \ --sample_folder 'test/rna/mouse' \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ No newline at end of file +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" diff --git a/run_scripts/README.md b/run_scripts/README.md index 3da98f9d..08a34c6d 100644 --- a/run_scripts/README.md +++ b/run_scripts/README.md @@ -44,4 +44,6 @@ There are several things a user 
must change before running these scripts: **NOTE:** -These scripts assume they are being run from within `cs-nf-pipelines/run_scripts`. If they are moved to other locations, specify the absolute path to `main.nf` (e.g., `/home/USERNAME/cs-nf-pipelines/main.nf`) \ No newline at end of file +1. These scripts assume they are being run from within `cs-nf-pipelines/run_scripts`. If they are moved to other locations, specify the absolute path to `main.nf` (e.g., `/home/USERNAME/cs-nf-pipelines/main.nf`) + +2. Sample data for each workflow and species are provided in cs-nf-pipelines/test/ \ No newline at end of file diff --git a/run_scripts/atac_human.sh b/run_scripts/atac_human.sh index fbe06b4b..12e7edc4 100644 --- a/run_scripts/atac_human.sh +++ b/run_scripts/atac_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run atac sequencing on human samples using default hg38" diff --git a/run_scripts/atac_mouse.sh b/run_scripts/atac_mouse.sh index 23e85d38..4df5fa56 100644 --- a/run_scripts/atac_mouse.sh +++ b/run_scripts/atac_mouse.sh @@ -23,6 +23,6 @@ nextflow ../main.nf \ --effective_genome_size 2652783500 \ --bowtie2Index '/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bowtie2/Mus_musculus.GRCm38.dna.primary_assembly.fa' \ --chain '' \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run atac sequencing on mouse samples using default mm10" \ No newline at end of file diff --git a/run_scripts/pta_human.sh b/run_scripts/pta_human.sh new file mode 100644 index 00000000..533aefa6 --- /dev/null +++ b/run_scripts/pta_human.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH 
--mail-user=first.last@jax.org +#SBATCH --job-name=pta_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow pta \ +-profile sumner \ +--csv_input ../test/csv_samplesheets/pta_test.csv \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run paired tumor analysis on test data" diff --git a/run_scripts/rnafusion_human.sh b/run_scripts/rnafusion_human.sh new file mode 100644 index 00000000..bce1d0fc --- /dev/null +++ b/run_scripts/rnafusion_human.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --mail-user=first.last@jax.org +#SBATCH --job-name=rna_fusion_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow rna_fusion \ +-profile sumner \ +--sample_folder \ +--gen_org human \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run rna_fusion on human samples using default hg38" diff --git a/run_scripts/rnaseq_human.sh b/run_scripts/rnaseq_human.sh index 577b308c..a0f97345 100644 --- a/run_scripts/rnaseq_human.sh +++ b/run_scripts/rnaseq_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run rnaseq on human samples using default hg38" diff --git a/run_scripts/rnaseq_mouse.sh b/run_scripts/rnaseq_mouse.sh index 
e72da816..a21562d3 100644 --- a/run_scripts/rnaseq_mouse.sh +++ b/run_scripts/rnaseq_mouse.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run rnaseq on mouse samples using default mm10" diff --git a/run_scripts/rnaseq_pdx.sh b/run_scripts/rnaseq_pdx.sh new file mode 100644 index 00000000..b186eeae --- /dev/null +++ b/run_scripts/rnaseq_pdx.sh @@ -0,0 +1,26 @@ +#!/bin/bash +#SBATCH --mail-user=first.last@jax.org +#SBATCH --job-name=rnaseq_pdx_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow rnaseq \ +--pdx \ +-profile sumner \ +--sample_folder \ +--gen_org human \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run rnaseq on pdx samples using default hg38 and mm10" diff --git a/run_scripts/rrbs_human.sh b/run_scripts/rrbs_human.sh index b1434a1f..968f5705 100644 --- a/run_scripts/rrbs_human.sh +++ b/run_scripts/rrbs_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run the reduced-representation bisulfite sequencing analysis pipeline on human samples using default hg38" \ No newline at end of file diff --git a/run_scripts/rrbs_mouse.sh b/run_scripts/rrbs_mouse.sh index 6fb963fb..a6bd872b 100644 --- a/run_scripts/rrbs_mouse.sh +++ b/run_scripts/rrbs_mouse.sh @@ -20,6 
+20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run the reduced-representation bisulfite sequencing analysis pipeline on mouse samples using default mm10" \ No newline at end of file diff --git a/run_scripts/wes_human.sh b/run_scripts/wes_human.sh index 9c68260c..01e57be8 100644 --- a/run_scripts/wes_human.sh +++ b/run_scripts/wes_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole exome sequencing on human samples using default hg38" diff --git a/run_scripts/wes_mouse.sh b/run_scripts/wes_mouse.sh index 7a4bd973..1e30bba7 100644 --- a/run_scripts/wes_mouse.sh +++ b/run_scripts/wes_mouse.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole exome sequencing on mouse samples using default mm10" diff --git a/run_scripts/wes_pdx.sh b/run_scripts/wes_pdx.sh new file mode 100644 index 00000000..18e3c1d0 --- /dev/null +++ b/run_scripts/wes_pdx.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --mail-user=first.last@jax.org +#SBATCH --job-name=wes_pdx_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow pdx_wes \ +-profile 
sumner \ +--sample_folder \ +--gen_org human \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run whole exome sequencing on pdx samples using default hg38" diff --git a/run_scripts/wgs_human.sh b/run_scripts/wgs_human.sh index d598cd79..7741b70a 100644 --- a/run_scripts/wgs_human.sh +++ b/run_scripts/wgs_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole genome sequencing on human samples using default hg38" diff --git a/run_scripts/wgs_mouse.sh b/run_scripts/wgs_mouse.sh index 25f433c6..5b4f95b2 100644 --- a/run_scripts/wgs_mouse.sh +++ b/run_scripts/wgs_mouse.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole genome sequencing on mouse samples using default mm10" diff --git a/subworkflows/aria_download_parse.nf b/subworkflows/aria_download_parse.nf new file mode 100644 index 00000000..d756c6af --- /dev/null +++ b/subworkflows/aria_download_parse.nf @@ -0,0 +1,105 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {ARIA_DOWNLOAD} from "${projectDir}/modules/utility_modules/aria_download" +include {CONCATENATE_READS_SAMPLESHEET} from "${projectDir}/modules/utility_modules/concatenate_reads_sampleSheet" + +workflow FILE_DOWNLOAD { + + take: + ch_input_sample + + main: + + /* + General note: + + Input tuple expected from the CSV sheet: + it[0] is sample ID. + it[1] is metadata information + it[2] and it[3] are R1 and R2 if PE. it[3] is empty if SE. 
+ + All steps expect that sampleID is in position [0] of tuples. + + */ + + + if (params.read_type == 'PE') { + aria_download_input = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1], 'R1', it[2]) + R2: tuple(it[0], it[1], 'R2', it[3]) + } + .mix() + group_size = 2 + } else { + aria_download_input = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1], 'R1', it[2]) + } + .mix() + group_size = 1 + } + /* + remap the data to individual R1 / R2 tuples. + These individual tuples are then mixed to pass individual files to the downloader. + R1 vs. R2 is maintained in the mix. Order is irrelevant here as data are grouped + by sampleID downstream. + */ + + // Download files. + ARIA_DOWNLOAD(aria_download_input) + + concat_input = ARIA_DOWNLOAD.out.file + .map { it -> + def meta = [:] + meta.sampleID = it[1].sampleID + [it[0], it[1].lane, meta, it[2], it[3], it[1].size] // sampleID, laneID, meta, read_ID:[R1|R2], file, number_of_lanes + } + .map { sampleID, laneID, meta, readID, file, size -> tuple( groupKey([sampleID, meta, readID], size), laneID, file ) } + .groupTuple() // controlled by group key: [sampleID, meta, read_ID] + .map{ it -> tuple(it[0][0], it[1].size(), it[0][1], it[0][2], it[2])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + .branch{ + concat: it[1] > 1 + pass: it[1] == 1 + } + /* + remap the downloaded files to exclude lane from meta, and group on sampleID, meta, and read_ID: R1|R2. + The number of lanes in the grouped data is used to determine if concatenation is needed. + The branch statement makes a 'concat' set for concatenation and a 'pass' set that isn't concatenated. + The branch is using it[1].size() from the preceding step, i.e., the list size of lanes for the sample. + + Metadata inclusion here is for future expansion. As implemented above, metadata is redundant to sampleID in `it[0]`. + However, if additional metadata are added to sample sheets, those metadata can be added and tracked above. 
+ + groupTuple size is dynamically defined by metadata field 'size' i.e., the number of lanes per sample. + + See: https://www.nextflow.io/docs/latest/operator.html#grouptuple and the note about dynamic group size. + + */ + + no_concat_samples = concat_input.pass + .map{it -> tuple(it[0], it[1], it[2], it[3], it[4][0])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + /* + this delists the file in `it[4]` as it is a single fastq sample (i.e., non-concat samples). + */ + + // Concatenate samples as needed. + CONCATENATE_READS_SAMPLESHEET(concat_input.concat) + + read_meta_ch = CONCATENATE_READS_SAMPLESHEET.out.concat_fastq + .mix(no_concat_samples) + .groupTuple(by: [0,2], size: group_size) // sampleID, meta + .map{it -> tuple(it[0], it[2], it[4].toSorted( { a, b -> a.getName() <=> b.getName() } ) ) } + + /* + Mix concatenation files, with non-concat files. 'mix' allows for all, some, or no files to have + gone through concatenation. + + Reads are remapped to read_ch and meta is placed in meta_ch. Input tuples for existing modules + do not expect 'meta' in the tuple. 
Example expected input tuple: [sampleID, [reads]] + */ + emit: + read_meta_ch +} \ No newline at end of file diff --git a/subworkflows/concatenate_local_files.nf b/subworkflows/concatenate_local_files.nf new file mode 100644 index 00000000..f567b8f0 --- /dev/null +++ b/subworkflows/concatenate_local_files.nf @@ -0,0 +1,78 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {CONCATENATE_READS_SAMPLESHEET} from "${projectDir}/modules/utility_modules/concatenate_reads_sampleSheet" + +workflow CONCATENATE_LOCAL_FILES { + + take: + ch_input_sample + + main: + + if (params.read_type == 'PE') { + temp_map = ch_input_sample + .multiMap { it -> + def meta = [:] + meta.sampleID = it[1].sampleID + R1: tuple(it[0], it[1].lane, meta, 'R1', it[2]) + R2: tuple(it[0], it[1].lane, meta, 'R2', it[3]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + group_size = 2 + } else { + + temp_map = ch_input_sample + .multiMap { it -> + def meta = [:] + meta.sampleID = it[1].sampleID + R1: tuple(it[0], it[1].lane, meta, 'R1', it[2]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + group_size = 1 + } + + no_concat_samples = concat_input.pass + .map{it -> tuple(it[0], it[1], it[2], it[3], it[4][0])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + /* + this delists the the file in `it[4]` as it is a single fastq sample (i.e., non-concat samples). 
+ + */ + + CONCATENATE_READS_SAMPLESHEET(concat_input.concat) + + read_meta_ch = CONCATENATE_READS_SAMPLESHEET.out.concat_fastq + .mix(no_concat_samples) + .groupTuple(by: [0,2], size: group_size) // sampleID, meta + .map{it -> tuple(it[0], it[2], it[4].toSorted( { a, b -> file(a).getName() <=> file(b).getName() } ) ) } + + /* + Mix concatenation files, with non-concat files. 'mix' allows for, all, some, or no files to have + gone through concatenation. + + Reads are remapped to read_ch and meta is placed in meta_ch. Input tuples for existing modules + do not expect 'meta' in the tuple. Example expected input tuple: [sampleID, [reads]] + */ + + emit: + read_meta_ch + +} \ No newline at end of file diff --git a/subworkflows/concatenate_pta_fastq.nf b/subworkflows/concatenate_pta_fastq.nf new file mode 100644 index 00000000..14899d71 --- /dev/null +++ b/subworkflows/concatenate_pta_fastq.nf @@ -0,0 +1,72 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {CONCATENATE_READS_SAMPLESHEET} from "${projectDir}/modules/utility_modules/concatenate_reads_sampleSheet" + +workflow CONCATENATE_PTA_FASTQ { + + take: + ch_input_sample + + main: + + if (params.read_type == 'PE') { + temp_map = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1].lane, it[1], 'R1', it[2][0]) + R2: tuple(it[0], it[1].lane, it[1], 'R2', it[2][1]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + } else { + + temp_map = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1].lane, it[1], 'R1', it[2][0]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + } + + no_concat_samples = 
concat_input.pass + .map{it -> tuple(it[0], it[1], it[2], it[3], it[4][0])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + /* + this delists the file in `it[4]` as it is a single fastq sample (i.e., non-concat samples). + + */ + + CONCATENATE_READS_SAMPLESHEET(concat_input.concat) + + read_meta_ch = CONCATENATE_READS_SAMPLESHEET.out.concat_fastq + .mix(no_concat_samples) + .groupTuple(by: [0,2]) // sampleID, meta + .map{it -> tuple(it[0], it[2], it[4].toSorted( { a, b -> file(a).getName() <=> file(b).getName() } ) ) } + + /* + Mix concatenated files with non-concat files. 'mix' allows for all, some, or no files to have + gone through concatenation. + + Reads are remapped to read_ch and meta is placed in meta_ch. Input tuples for existing modules + do not expect 'meta' in the tuple. Example expected input tuple: [sampleID, [reads]] + */ + + emit: + read_meta_ch + +} \ No newline at end of file diff --git a/subworkflows/pdx_rnaseq.nf b/subworkflows/pdx_rnaseq.nf new file mode 100644 index 00000000..6f96df4c --- /dev/null +++ b/subworkflows/pdx_rnaseq.nf @@ -0,0 +1,120 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {READ_GROUPS as READ_GROUPS_HUMAN; + READ_GROUPS as READ_GROUPS_MOUSE} from "${projectDir}/modules/utility_modules/read_groups" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {GET_READ_LENGTH} from "${projectDir}/modules/utility_modules/get_read_length" +include {CHECK_STRANDEDNESS} from "${projectDir}/modules/python/python_check_strandedness" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT as FASTQ_SORT_HUMAN; + FASTQ_SORT as FASTQ_SORT_MOUSE} from "${projectDir}/modules/fastq-tools/fastq-sort" +include {RSEM_ALIGNMENT_EXPRESSION as RSEM_ALIGNMENT_EXPRESSION_HUMAN; + RSEM_ALIGNMENT_EXPRESSION as RSEM_ALIGNMENT_EXPRESSION_MOUSE} from 
"${projectDir}/modules/rsem/rsem_alignment_expression" +include {PICARD_ADDORREPLACEREADGROUPS as PICARD_ADDORREPLACEREADGROUPS_HUMAN; + PICARD_ADDORREPLACEREADGROUPS as PICARD_ADDORREPLACEREADGROUPS_MOUSE} from "${projectDir}/modules/picard/picard_addorreplacereadgroups" +include {PICARD_REORDERSAM as PICARD_REORDERSAM_HUMAN; + PICARD_REORDERSAM as PICARD_REORDERSAM_MOUSE} from "${projectDir}/modules/picard/picard_reordersam" +include {PICARD_SORTSAM as PICARD_SORTSAM_HUMAN; + PICARD_SORTSAM as PICARD_SORTSAM_MOUSE} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_COLLECTRNASEQMETRICS as PICARD_COLLECTRNASEQMETRICS_HUMAN; + PICARD_COLLECTRNASEQMETRICS as PICARD_COLLECTRNASEQMETRICS_MOUSE} from "${projectDir}/modules/picard/picard_collectrnaseqmetrics" + +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +workflow PDX_RNASEQ { + + take: + read_ch + + main: + // Step 1: Qual_Stat, Get read group information, Run Xenome + JAX_TRIMMER(read_ch) + + GET_READ_LENGTH(read_ch) + + if (params.read_type == 'PE') { + xenome_input = JAX_TRIMMER.out.trimmed_fastq + } else { + xenome_input = JAX_TRIMMER.out.trimmed_fastq + } + + // QC is assess on all reads. Mouse/human is irrelevant here. 
+ FASTQC(JAX_TRIMMER.out.trimmed_fastq) + + CHECK_STRANDEDNESS(JAX_TRIMMER.out.trimmed_fastq) + + // Xenome Classification + XENOME_CLASSIFY(xenome_input) + + // Xenome Read Sort + FASTQ_SORT_HUMAN(XENOME_CLASSIFY.out.xenome_fastq, 'human') + FASTQ_SORT_MOUSE(XENOME_CLASSIFY.out.xenome_mouse_fastq, 'mouse') + + human_reads = FASTQ_SORT_HUMAN.out.sorted_fastq + .join(CHECK_STRANDEDNESS.out.strand_setting) + .join(GET_READ_LENGTH.out.read_length) + .map{it -> tuple(it[0]+'_human', it[1], it[2], it[3])} + + mouse_reads = FASTQ_SORT_MOUSE.out.sorted_fastq + .join(CHECK_STRANDEDNESS.out.strand_setting) + .join(GET_READ_LENGTH.out.read_length) + .map{it -> tuple(it[0]+'_mouse', it[1], it[2], it[3])} + + // Step 2: RSEM Human and Stats: + + RSEM_ALIGNMENT_EXPRESSION_HUMAN(human_reads, params.rsem_ref_files_human, params.rsem_star_prefix_human, params.rsem_ref_prefix_human) + + // Picard Alignment Metrics + READ_GROUPS_HUMAN(human_reads.map{it -> tuple(it[0], it[1])}, "picard") + + add_replace_groups_human = READ_GROUPS_HUMAN.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.bam) + PICARD_ADDORREPLACEREADGROUPS_HUMAN(add_replace_groups_human) + + PICARD_REORDERSAM_HUMAN(PICARD_ADDORREPLACEREADGROUPS_HUMAN.out.bam, params.picard_dict_human) + + // Picard Alignment Metrics + PICARD_SORTSAM_HUMAN(PICARD_REORDERSAM_HUMAN.out.bam) + + human_qc_input = PICARD_SORTSAM_HUMAN.out.bam.join(human_reads) + .map{it -> [it[0], it[1], it[3]]} + + PICARD_COLLECTRNASEQMETRICS_HUMAN(human_qc_input, params.ref_flat_human, params.ribo_intervals_human) + + // Step 3 RSEM Mouse and Stats: + + RSEM_ALIGNMENT_EXPRESSION_MOUSE(mouse_reads, params.rsem_ref_files_mouse, params.rsem_star_prefix_mouse, params.rsem_ref_prefix_mouse) + + // Step 4: Picard Alignment Metrics + READ_GROUPS_MOUSE(mouse_reads.map{it -> tuple(it[0], it[1])}, "picard") + + add_replace_groups_mouse = READ_GROUPS_MOUSE.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION_MOUSE.out.bam) + 
PICARD_ADDORREPLACEREADGROUPS_MOUSE(add_replace_groups_mouse) + + PICARD_REORDERSAM_MOUSE(PICARD_ADDORREPLACEREADGROUPS_MOUSE.out.bam, params.picard_dict_mouse) + + // Step 5: Picard Alignment Metrics + PICARD_SORTSAM_MOUSE(PICARD_REORDERSAM_MOUSE.out.bam) + + mouse_qc_input = PICARD_SORTSAM_MOUSE.out.bam.join(mouse_reads) + .map{it -> [it[0], it[1], it[3]]} + + PICARD_COLLECTRNASEQMETRICS_MOUSE(mouse_qc_input, params.ref_flat_mouse, params.ribo_intervals_mouse) + + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(XENOME_CLASSIFY.out.xenome_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.rsem_cnt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTRNASEQMETRICS_HUMAN.out.picard_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION_MOUSE.out.rsem_cnt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTRNASEQMETRICS_MOUSE.out.picard_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + +} \ No newline at end of file diff --git a/test/README.md b/test/README.md index e68c2ea1..b9ea606e 100644 --- a/test/README.md +++ b/test/README.md @@ -1,5 +1,11 @@ # Test Data -This directory contains 10,000 simulated RNA, whole exome, and whole genome paired end reads based on GRCm38 (mm10) and GRCh38 (hg38). +This directory contains 10,000 simulated RNA, whole exome, whole genome, and ATAC-seq paired end reads based on GRCm38 (mm10) and GRCh38 (hg38). -These files can be used in testing pipeline functionality. \ No newline at end of file +These files can be used in testing pipeline functionality. + +Sample chip-seq data from NF-core are provided. 
These data can be staged by the workflow from the provided URLs. + +A sample CSV datasheet is provided for PTA. + +A sample WES datasheet for both remote and local files is provided for testing download and/or local samplesheet input to workflows. \ No newline at end of file diff --git a/test/csv_samplesheets/pdx_wes_test.csv b/test/csv_samplesheets/pdx_wes_test.csv new file mode 100644 index 00000000..5ee3b514 --- /dev/null +++ b/test/csv_samplesheets/pdx_wes_test.csv @@ -0,0 +1,9 @@ +sampleID,lane,fastq_1,fastq_2 +112475_105-R_G2UN84PK7,baz_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_HJJLGCCXX_S1_L001_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_HJJLGCCXX_S1_L001_R2_001.fastq.gz +112475_105-R_G2U,foo_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L001_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L001_R2_001.fastq.gz +112475_105-R_G2U,foo_L2,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L002_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L002_R2_001.fastq.gz +112475_105-R_G2U,foo_L3,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L003_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L003_R2_001.fastq.gz +2475_105-R_G2U,bar_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L001_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L001_R2_001.fastq.gz 
+2475_105-R_G2U,bar_L2,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L002_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L002_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L003_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L003_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L2,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L004_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L004_R2_001.fastq.gz diff --git a/test/csv_samplesheets/pdx_wes_test_local.csv b/test/csv_samplesheets/pdx_wes_test_local.csv new file mode 100644 index 00000000..850cc3b1 --- /dev/null +++ b/test/csv_samplesheets/pdx_wes_test_local.csv @@ -0,0 +1,9 @@ +sampleID,lane,fastq_1,fastq_2 +112475_105-R_G2U,foo_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L001_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L001_R2_001.fastq.gz +112475_105-R_G2U,foo_L2,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L002_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L002_R2_001.fastq.gz 
+112475_105-R_G2U,foo_L3,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L003_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L003_R2_001.fastq.gz +2475_105-R_G2U,bar_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L001_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L001_R2_001.fastq.gz +2475_105-R_G2U,bar_L2,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L002_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L002_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L003_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L003_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L2,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L004_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L004_R2_001.fastq.gz +112475_105-R_G2UN84PK7,baz_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_HJJLGCCXX_S1_L001_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_HJJLGCCXX_S1_L001_R2_001.fastq.gz \ No newline at end of file diff --git a/test/csv_samplesheets/pta_test.csv b/test/csv_samplesheets/pta_test.csv new file mode 
100644 index 00000000..528f02f5 --- /dev/null +++ b/test/csv_samplesheets/pta_test.csv @@ -0,0 +1,4 @@ +patient,sex,status,sampleID,lane,fastq_1,fastq_2 +fizzbang,XX,0,n_fizz,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R2.fastq.gz +fizzbang,XX,1,t_bang,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R2.fastq.gz +foobar,XX,1,t_bar,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R2.fastq.gz diff --git a/test/fusion/human/hg38_FUSION_sample_R1.fastq.gz b/test/fusion/human/hg38_FUSION_sample_R1.fastq.gz new file mode 100644 index 00000000..c4950c24 Binary files /dev/null and b/test/fusion/human/hg38_FUSION_sample_R1.fastq.gz differ diff --git a/test/fusion/human/hg38_FUSION_sample_R2.fastq.gz b/test/fusion/human/hg38_FUSION_sample_R2.fastq.gz new file mode 100644 index 00000000..5efb0fff Binary files /dev/null and b/test/fusion/human/hg38_FUSION_sample_R2.fastq.gz differ diff --git a/test/pta/README.md b/test/pta/README.md new file mode 100644 index 00000000..1faa890c --- /dev/null +++ b/test/pta/README.md @@ -0,0 +1,3 @@ +# NOTE: + +Due to size restrictions within the repository, this CSV file points to files on Sumner. 
When running this test dataset, use the option `--bicseq2_no_scaling` \ No newline at end of file diff --git a/test/pta/test_input.csv b/test/pta/test_input.csv new file mode 100644 index 00000000..528f02f5 --- /dev/null +++ b/test/pta/test_input.csv @@ -0,0 +1,4 @@ +patient,sex,status,sampleID,lane,fastq_1,fastq_2 +fizzbang,XX,0,n_fizz,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R2.fastq.gz +fizzbang,XX,1,t_bang,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R2.fastq.gz +foobar,XX,1,t_bar,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R2.fastq.gz diff --git a/test/rna/human/pdx/pdx_RNA_sample_R1.fastq.gz b/test/rna/human/pdx/pdx_RNA_sample_R1.fastq.gz new file mode 100644 index 00000000..2746ec82 Binary files /dev/null and b/test/rna/human/pdx/pdx_RNA_sample_R1.fastq.gz differ diff --git a/test/rna/human/pdx/pdx_RNA_sample_R2.fastq.gz b/test/rna/human/pdx/pdx_RNA_sample_R2.fastq.gz new file mode 100644 index 00000000..1d320fb1 Binary files /dev/null and b/test/rna/human/pdx/pdx_RNA_sample_R2.fastq.gz differ diff --git a/test/wes/human/hg38_WES_sample_R1.fastq.gz b/test/wes/human/hg38_WES_sample_R1.fastq.gz index 4c5aafbe..5ec973ba 100644 Binary files a/test/wes/human/hg38_WES_sample_R1.fastq.gz and b/test/wes/human/hg38_WES_sample_R1.fastq.gz differ diff --git a/test/wes/human/hg38_WES_sample_R2.fastq.gz 
b/test/wes/human/hg38_WES_sample_R2.fastq.gz index f4a8efb3..36046400 100644 Binary files a/test/wes/human/hg38_WES_sample_R2.fastq.gz and b/test/wes/human/hg38_WES_sample_R2.fastq.gz differ diff --git a/test/wes/human/pdx/pdx_WES_sample_R1.fastq.gz b/test/wes/human/pdx/pdx_WES_sample_R1.fastq.gz new file mode 100644 index 00000000..6ba04ec2 Binary files /dev/null and b/test/wes/human/pdx/pdx_WES_sample_R1.fastq.gz differ diff --git a/test/wes/human/pdx/pdx_WES_sample_R2.fastq.gz b/test/wes/human/pdx/pdx_WES_sample_R2.fastq.gz new file mode 100644 index 00000000..38506851 Binary files /dev/null and b/test/wes/human/pdx/pdx_WES_sample_R2.fastq.gz differ diff --git a/test/wes/mouse/mm10_WES_sample_R1.fastq.gz b/test/wes/mouse/mm10_WES_sample_R1.fastq.gz index b1f86d2a..f53e3a4f 100644 Binary files a/test/wes/mouse/mm10_WES_sample_R1.fastq.gz and b/test/wes/mouse/mm10_WES_sample_R1.fastq.gz differ diff --git a/test/wes/mouse/mm10_WES_sample_R2.fastq.gz b/test/wes/mouse/mm10_WES_sample_R2.fastq.gz index 058cbc1a..10eb4f15 100644 Binary files a/test/wes/mouse/mm10_WES_sample_R2.fastq.gz and b/test/wes/mouse/mm10_WES_sample_R2.fastq.gz differ diff --git a/test/wgs/human/hg38_WGS_sample_R1.fastq.gz b/test/wgs/human/hg38_WGS_sample_R1.fastq.gz index 315aadf3..dc7e001e 100644 Binary files a/test/wgs/human/hg38_WGS_sample_R1.fastq.gz and b/test/wgs/human/hg38_WGS_sample_R1.fastq.gz differ diff --git a/test/wgs/human/hg38_WGS_sample_R2.fastq.gz b/test/wgs/human/hg38_WGS_sample_R2.fastq.gz index 6f3c0a91..88cd8939 100644 Binary files a/test/wgs/human/hg38_WGS_sample_R2.fastq.gz and b/test/wgs/human/hg38_WGS_sample_R2.fastq.gz differ diff --git a/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz b/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz index 7e74bfd4..2a85e51e 100644 Binary files a/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz and b/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz differ diff --git a/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz b/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz 
index 37e17a1c..266dd096 100644 Binary files a/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz and b/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz differ diff --git a/workflows/amplicon.nf b/workflows/amplicon.nf new file mode 100644 index 00000000..2118abd0 --- /dev/null +++ b/workflows/amplicon.nf @@ -0,0 +1,163 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/amplicon.nf" +include {param_log} from "${projectDir}/bin/log/amplicon.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" +include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" +include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {TRIM_FASTQ as CUTADAPT} from "${projectDir}/modules/cutadapt/cutadapt_trim_fastq" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {SAMTOOLS_SORT as SAMTOOLS_SORT_PRIMERCLIP; + SAMTOOLS_SORT as SAMTOOLS_SORT_CALLING} from "${projectDir}/modules/samtools/samtools_sort" +include {PRIMERCLIP} from "${projectDir}/modules/primerclip/primerclip" +include {TARGET_COVERAGE_METRICS} from "${projectDir}/modules/bedtools/bedtools_amplicon_metrics" +include {SNPSIFT_ANNOTATE} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {PICARD_COLLECTTARGETPCRMETRICS} from "${projectDir}/modules/picard/picard_collecttargetpcrmetrics" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include 
{GATK_HAPLOTYPECALLER} from "${projectDir}/modules/gatk/gatk_haplotypecaller" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log params +param_log() + +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +// prepare reads channel +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + + if (params.read_type == 'PE'){ + read_ch = Channel + .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) + .map { file, file1, file2 -> tuple(getLibraryId(file), file1, file2) } + .groupTuple() + } + else if (params.read_type == 'SE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}", checkExists:true, size:1 ) + .map { file, file1 -> tuple(getLibraryId(file), file1) } + .groupTuple() + .map{t-> [t[0], t[1].flatten()]} + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} else { + + if (params.read_type == 'PE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) + } + else if (params.read_type == 'SE'){ + read_ch = 
Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} +} + +workflow AMPLICON { + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + + CUTADAPT(read_ch) + + FASTQC(CUTADAPT.out.paired_trimmed_fastq) + + // Step 2: Get Read Group Information + READ_GROUPS(CUTADAPT.out.paired_trimmed_fastq, "gatk") + + // Step 3: BWA-MEM Alignment + bwa_mem_mapping = CUTADAPT.out.paired_trimmed_fastq.join(READ_GROUPS.out.read_groups) + BWA_MEM(bwa_mem_mapping) + + SAMTOOLS_SORT_PRIMERCLIP(BWA_MEM.out.sam, '-O sam -n', 'sam') + + PRIMERCLIP(SAMTOOLS_SORT_PRIMERCLIP.out.sorted_file) + + SAMTOOLS_SORT_CALLING(PRIMERCLIP.out.sam, '-O bam', 'bam') + + PICARD_COLLECTTARGETPCRMETRICS(SAMTOOLS_SORT_CALLING.out.sorted_file) + + TARGET_COVERAGE_METRICS(SAMTOOLS_SORT_CALLING.out.sorted_file) + + /* + Important: While the use 
of the Picard tool, MarkDuplicates, is a common quality control step to identify + low-complexity libraries, MarkDuplicates cannot be used on data derived from PCR-based target enrichment + methods such as the xGen Amplicon Panels. Since these targeted panels contain high numbers of identical + library fragments (particularly regarding alignment start position), MarkDuplicates cannot appropriately + analyze Amplicon libraries. + https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/application-note/primerclip-a-tool-for-trimming-primer-sequences-application-note.pdf?sfvrsn=cf83e107_14 + */ + + GATK_BASERECALIBRATOR(SAMTOOLS_SORT_CALLING.out.sorted_file) + + GATK_APPLYBQSR(SAMTOOLS_SORT_CALLING.out.sorted_file.join(GATK_BASERECALIBRATOR.out.table)) + + GATK_HAPLOTYPECALLER(GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai), '') + + SNPSIFT_ANNOTATE(GATK_HAPLOTYPECALLER.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + + // MultiQC + // coverage metrics? + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(CUTADAPT.out.cutadapt_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTTARGETPCRMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PRIMERCLIP.out.log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TARGET_COVERAGE_METRICS.out.qc_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) +} \ No newline at end of file diff --git a/workflows/atac.nf b/workflows/atac.nf index 5ed9e5ad..d696bb5c 100755 --- a/workflows/atac.nf +++ b/workflows/atac.nf @@ -5,15 +5,18 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/atac.nf" include {param_log} from "${projectDir}/bin/log/atac.nf" include 
{getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" include {TRIM_FASTQ} from "${projectDir}/modules/cutadapt/cutadapt_trim_fastq" include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {ALIGN_TRIMMED_FASTQ} from "${projectDir}/modules/bowtie2/bowtie2_align_trimmed_fastq" -include {SORT as SORT_ALIGN_TRIM; - SORT as SORT_SHIFTED_BAM; - SORT as SORT_MARK_DUP_BAM; - SORT as SORT_LIFTOVER_BAM } from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_SORT as SORT_ALIGN_TRIM; + SAMTOOLS_SORT as SORT_SHIFTED_BAM; + SAMTOOLS_SORT as SORT_MARK_DUP_BAM; + SAMTOOLS_SORT as SORT_LIFTOVER_BAM } from "${projectDir}/modules/samtools/samtools_sort" include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" include {REMOVE_DUPLICATE_READS} from "${projectDir}/modules/samtools/samtools_remove_duplicate_reads" include {CALC_MTDNA_FILTER_CHRM} from "${projectDir}/modules/samtools/samtools_calc_mtdna_filter_chrm" @@ -33,9 +36,10 @@ include {PEAK_COVERAGE} from "${projectDir}/modules/macs2/macs2_peak_coverage" include {FEATURE_COUNTS} from "${projectDir}/modules/subread/subread_feature_counts" include {FEATURE_COUNT2BED} from "${projectDir}/modules/bedtools/bedtools_feature_count2bed" include {QUALITY_CHECKS} from "${projectDir}/modules/samtools/samtools_quality_checks" -include {FRAG_LEN_PLOT} from "${projectDir}/modules/rstudio/rstudio_frag_len_plot" +include {FRAG_LEN_PLOT} from "${projectDir}/modules/r/frag_len_plot" include {CALC_PBC_METRICS} from 
"${projectDir}/modules/bedtools/bedtools_calc_pbc_metrics" include {LOG_PARSER} from "${projectDir}/modules/python/python_log_parser" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" // help if needed if (params.help){ @@ -46,8 +50,25 @@ if (params.help){ // log params param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -60,33 +81,52 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: 
${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - - // main workflow workflow ATAC { - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} } + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. 
+ if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + // Step 1: Trim_Fastq TRIM_FASTQ(read_ch) @@ -97,7 +137,7 @@ workflow ATAC { ALIGN_TRIMMED_FASTQ(TRIM_FASTQ.out.paired_trimmed_fastq) // Step 4: Sort alignment file - SORT_ALIGN_TRIM(ALIGN_TRIMMED_FASTQ.out.sam, '') + SORT_ALIGN_TRIM(ALIGN_TRIMMED_FASTQ.out.sam, '-O bam', 'bam') // Step 5: Flag pcr duplicates PICARD_MARKDUPLICATES(SORT_ALIGN_TRIM.out) @@ -118,34 +158,35 @@ workflow ATAC { FILTER_REMOVE_MULTI_SIEVE(FILTER_REMOVE_MULTI_SHIFT.out[0]) // Step 10: Re-sort shifted bam - SORT_SHIFTED_BAM(FILTER_REMOVE_MULTI_SIEVE.out[0], '') + SORT_SHIFTED_BAM(FILTER_REMOVE_MULTI_SIEVE.out[0], '-O bam', 'bam') // If Mouse if (params.gen_org=='mouse'){ // Step 11: Convert peak coordinates // Step occurs when chain != null || chain != false - CHAIN_CONVERT(SORT_SHIFTED_BAM.out[0]) + CHAIN_CONVERT(SORT_SHIFTED_BAM.out.sorted_file) // Step 12: Sort bam by coordinates - SORT_LIFTOVER_BAM(CHAIN_CONVERT.out[0], '') + SORT_LIFTOVER_BAM(CHAIN_CONVERT.out.converted_bam, '-O bam', 'bam') // Step 13: Extract a list of 'bad reads' - CHAIN_EXTRACT_BADREADS(SORT_LIFTOVER_BAM.out[0]) + CHAIN_EXTRACT_BADREADS(SORT_LIFTOVER_BAM.out.sorted_file) // Step 14: Remove 'bad reads' from bam file CHAIN_BAD2UNIQ_READS(CHAIN_EXTRACT_BADREADS.out.bad_reads) // Step 15: Filter list to unique names - filter_chain_reads = SORT_LIFTOVER_BAM.out[0].join(CHAIN_BAD2UNIQ_READS.out.uniq_reads) + filter_chain_reads = SORT_LIFTOVER_BAM.out.sorted_file.join(CHAIN_BAD2UNIQ_READS.out.uniq_reads) CHAIN_FILTER_READS(filter_chain_reads) // Step 16: Sort fixmate bam and filter mitochondrial reads - CHAIN_SORT_FIXMATE_BAM(CHAIN_FILTER_READS.out[0]) + 
CHAIN_SORT_FIXMATE_BAM(CHAIN_FILTER_READS.out.bam) // Step 17: Reference strain samples, filter mitochondrial, unplaced/unlocalized reads and reindex // Step occurs when chain == null || chain == false - NON_CHAIN_REINDEX(SORT_SHIFTED_BAM.out[0]) + + NON_CHAIN_REINDEX(SORT_SHIFTED_BAM.out.sorted_file) // Step 18 : Mix chain and non-chain @@ -155,10 +196,10 @@ workflow ATAC { // Step 17 will only run when `--chain` is not used (controlled via modules). // A bam file is required in the next step. `mix` ensures that one OR the other output is used. // When '--gen_org == human' data_ch is set to the tuple output in step 10. - + } else if (params.gen_org=='human'){ - data_ch = SORT_SHIFTED_BAM.out[0] + data_ch = SORT_SHIFTED_BAM.out.sorted_file } // Step 19: Peak calling @@ -197,14 +238,27 @@ workflow ATAC { FRAG_LEN_PLOT(QUALITY_CHECKS.out) // Step 28: Sort markduplicates bam by read names - SORT_MARK_DUP_BAM(PICARD_MARKDUPLICATES.out.dedup_bam, '-n ') + SORT_MARK_DUP_BAM(PICARD_MARKDUPLICATES.out.dedup_bam, '-n -O bam', 'bam') // Step 29: Calculating PBC Metrics - CALC_PBC_METRICS(SORT_MARK_DUP_BAM.out[0]) + CALC_PBC_METRICS(SORT_MARK_DUP_BAM.out.sorted_file) // Step 30: Log Parser log_agg = TRIM_FASTQ.out.cutadapt_log.join(ALIGN_TRIMMED_FASTQ.out.bowtie_log).join(PICARD_MARKDUPLICATES.out.dedup_metrics).join(CALC_MTDNA_FILTER_CHRM.out.mtdna_log).join(CALC_PBC_METRICS.out).join(FINAL_CALC_FRIP.out) - LOG_PARSER(log_agg) + LOG_PARSER(log_agg) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TRIM_FASTQ.out.cutadapt_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ALIGN_TRIMMED_FASTQ.out.bowtie_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = 
ch_multiqc_files.mix(CALC_MTDNA_FILTER_CHRM.out.mtdna_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CALC_PBC_METRICS.out.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FINAL_CALC_FRIP.out.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FRAG_LEN_PLOT.out.spline_table.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) } - diff --git a/workflows/chipseq.nf b/workflows/chipseq.nf new file mode 100755 index 00000000..08abf5ac --- /dev/null +++ b/workflows/chipseq.nf @@ -0,0 +1,370 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/chipseq.nf" +include {param_log} from "${projectDir}/bin/log/chipseq.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {CHECK_DESIGN} from "${projectDir}/modules/utility_modules/chipseq_check_design" +include {SAMTOOLS_FAIDX} from "${projectDir}/modules/samtools/samtools_faidx" +include {MAKE_GENOME_FILTER} from "${projectDir}/modules/utility_modules/chipseq_make_genome_filter" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {TRIM_GALORE} from "${projectDir}/modules/trim_galore/trim_galore" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {SAMTOOLS_FILTER} from "${projectDir}/modules/samtools/samtools_filter" +include {SAMTOOLS_SORT; + SAMTOOLS_SORT as PAIR_SORT; + SAMTOOLS_SORT as NAME_SORT} from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_INDEX} from "${projectDir}/modules/samtools/samtools_index" +include {SAMTOOLS_STATS; + SAMTOOLS_STATS as SAMTOOLS_STATS_MD; + SAMTOOLS_STATS as SAMTOOLS_STATS_FILTERED; + SAMTOOLS_STATS as SAMTOOLS_STATS_BF} from "${projectDir}/modules/samtools/samtools_stats" +include {PICARD_MERGESAMFILES} from "${projectDir}/modules/picard/picard_mergesamfiles" +include 
{PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {SAMTOOLS_MERGEBAM_FILTER} from "${projectDir}/modules/samtools/samtools_mergebam_filter" +include {BAMTOOLS_FILTER} from "${projectDir}/modules/bamtools/bamtools_filter" +include {BAMPE_RM_ORPHAN} from "${projectDir}/modules/utility_modules/chipseq_bampe_rm_orphan" +include {PRESEQ} from "${projectDir}/modules/preseq/preseq" +include {PICARD_COLLECTMULTIPLEMETRICS} from "${projectDir}/modules/picard/picard_collectmultiplemetrics" +include {BEDTOOLS_GENOMECOV} from "${projectDir}/modules/bedtools/bedtools_genomecov" +include {UCSC_BEDGRAPHTOBIGWIG} from "${projectDir}/modules/ucsc/ucsc_bedgraphtobigwig" +include {DEEPTOOLS_COMPUTEMATRIX} from "${projectDir}/modules/deeptools/deeptools_computematrix" +include {DEEPTOOLS_PLOTPROFILE} from "${projectDir}/modules/deeptools/deeptools_plotprofile" +include {DEEPTOOLS_PLOTHEATMAP} from "${projectDir}/modules/deeptools/deeptools_plotheatmap" +include {PHANTOMPEAKQUALTOOLS} from "${projectDir}/modules/phantompeakqualtools/phantompeakqualtools" +include {MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS} from "${projectDir}/modules/multiqc/multiqc_custom_phantompeakqualtools" +include {DEEPTOOLS_PLOTFINGERPRINT} from "${projectDir}/modules/deeptools/deeptools_plotfingerprint" +include {PEAK_CALLING_CHIPSEQ} from "${projectDir}/modules/macs2/macs2_peak_calling_chipseq" +include {FRIP_SCORE} from "${projectDir}/modules/utility_modules/frip_score" +include {HOMER_ANNOTATEPEAKS; + HOMER_ANNOTATEPEAKS as CONSENSUS_PEAKS_ANNOTATE} from "${projectDir}/modules/homer/homer_annotatepeaks" +include {PLOT_MACS2_QC} from "${projectDir}/modules/macs2/plot_macs2_qc" +include {PLOT_HOMER_ANNOTATEPEAKS} from "${projectDir}/modules/homer/plot_homer_annotatepeaks" +include {MACS2_CONSENSUS} from "${projectDir}/modules/macs2/macs2_consensus" +include {ANNOTATE_BOOLEAN_PEAKS} from "${projectDir}/modules/homer/annotate_boolean_peaks" +include {SUBREAD_FEATURECOUNTS} from 
"${projectDir}/modules/subread/subread_feature_counts_chipseq" +include {DESEQ2_QC} from "${projectDir}/modules/utility_modules/deseq2_qc" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log params +param_log() + +// main workflow +workflow CHIPSEQ { + + if (params.input) { ch_input = file(params.input, checkIfExists: true) } else { exit 1, 'Samples design file not specified!' } + + // Step 1: CHECK_DESIGN + CHECK_DESIGN(ch_input) + + /* + * Create channels for input fastq files + */ + + if (params.read_type == 'SE'){ + read_ch = CHECK_DESIGN.out.sample_reads + .splitCsv(header:true, sep:',') + .map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true) ] ] } + } else { + read_ch = CHECK_DESIGN.out.sample_reads + .splitCsv(header:true, sep:',') + .map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true), file(row.fastq_2, checkIfExists: true) ] ] } + } + + /* + * Create a channel with [sample_id, control id, antibody, replicatesExist, multipleGroups] + */ + control_ch = CHECK_DESIGN.out.study_design + .splitCsv(header:true, sep:',') + .map { row -> [ row.sample_id, row.control_id, row.antibody, row.replicatesExist.toBoolean(), row.multipleGroups.toBoolean() ] } + + + // Header files for MultiQC + ch_spp_nsc_header = file("${projectDir}/bin/shared/multiqc/chipseq/spp_nsc_header.txt", checkIfExists: true) + ch_spp_rsc_header = file("${projectDir}/bin/shared/multiqc/chipseq/spp_rsc_header.txt", checkIfExists: true) + ch_spp_correlation_header = file("${projectDir}/bin/shared/multiqc/chipseq/spp_correlation_header.txt", checkIfExists: true) + ch_peak_count_header = file("${projectDir}/bin/shared/multiqc/chipseq/peak_count_header.txt", checkIfExists: true) + ch_frip_score_header = file("${projectDir}/bin/shared/multiqc/chipseq/frip_score_header.txt", checkIfExists: true) + ch_peak_annotation_header = 
file("${projectDir}/bin/shared/multiqc/chipseq/peak_annotation_header.txt", checkIfExists: true) + ch_deseq2_pca_header = file("${projectDir}/bin/shared/multiqc/chipseq/deseq2_pca_header.txt", checkIfExists: true) + ch_deseq2_clustering_header = file("${projectDir}/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt", checkIfExists: true) + + // Reference genome + ch_fasta = file(params.ref_fa, checkIfExists: true) + ch_gtf = file(params.gtf, checkIfExists: true) + + // genes.bed + if (params.gene_bed) { ch_gene_bed = file(params.gene_bed, checkIfExists: true) } + + // Step 2: Make genome filter + SAMTOOLS_FAIDX(ch_fasta) + MAKE_GENOME_FILTER(SAMTOOLS_FAIDX.out, params.blacklist) + + // Step 3: Fastqc + FASTQC(read_ch) + + // Step 4: Trim Galore + TRIM_GALORE(read_ch) + + // Step 5: Get Read Group Information + READ_GROUPS(TRIM_GALORE.out.trimmed_fastq, "gatk") + + // Step 6: BWA-MEM + bwa_mem_mapping = TRIM_GALORE.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + BWA_MEM(bwa_mem_mapping) + + // Step 7: Samtools Removing Unmapped + SAMTOOLS_FILTER(BWA_MEM.out, '-F 0x0100') + + // Step 8: Samtools Sort + SAMTOOLS_SORT(SAMTOOLS_FILTER.out.bam, '-O bam', 'bam') + + // Step 9: Samtools Stats + SAMTOOLS_STATS(SAMTOOLS_SORT.out.sorted_file) + + // Step 10: Merge BAM files + // Merge techincal replicates of sample replicates (if tech reps exist). + // BAM files for all libraries from same techincal sample replicate. + // i.e., merge multiple lanes per sample or resequenceing of 1 sample. + // see: https://github.com/nf-core/chipseq/blob/1.2.2/docs/usage.md#multiple-runs-of-the-same-library + + ch_sort_bam_merge = SAMTOOLS_SORT.out.sorted_file + .map { it -> [ it[0].split('_')[0..-2].join('_'), it[1] ] } + .groupTuple(by: [0]) + .map { it -> [ it[0], it[1].flatten() ] } + // The design script adds 2 fields to sample names: R (replicate), and T (treatment), which are delimited by '_'. 
+ // The first step, splits off the T identifier, and groups all remaining samples by the new ID. + // This allows all samples that are techincal replicataes, i.e., share the same + // sampleID and replicate ID, to be joined and then merged in the next step. + + PICARD_MERGESAMFILES(ch_sort_bam_merge) + + // Step 11: Mark Duplicates + PICARD_MARKDUPLICATES(PICARD_MERGESAMFILES.out.bam) + + // Step 12: Samtools Stats + SAMTOOLS_STATS_MD(PICARD_MARKDUPLICATES.out.dedup_bam) + + // Step 13: Samtools Mergebam Filter + SAMTOOLS_MERGEBAM_FILTER(PICARD_MARKDUPLICATES.out.dedup_bam, MAKE_GENOME_FILTER.out.bed) + // Note: genome filter file is generic and used by all samples. + + // JSON files required by BAMTools for alignment filtering + if (params.read_type == 'SE'){ + ch_bamtools_filter_config = file(params.bamtools_filter_se_config, checkIfExists: true) + } else { + ch_bamtools_filter_config = file(params.bamtools_filter_pe_config, checkIfExists: true) + } + + // Step 14: Bamtools Filter + BAMTOOLS_FILTER(SAMTOOLS_MERGEBAM_FILTER.out.bam, ch_bamtools_filter_config) + + // Step 15: Samtools Stats + SAMTOOLS_STATS_BF(BAMTOOLS_FILTER.out.bam) + + if (params.read_type == 'SE'){ + + filtered_sorted_bam = BAMTOOLS_FILTER.out.bam + // Note: the output BAM from the preceding step was coordinate sorted in step 8, + // and the sort was maintained in the optional merge step and in markduplicates step. + + } else { + // Step 16: Samtools Name Sort + NAME_SORT(BAMTOOLS_FILTER.out.bam, '-n -O bam', 'bam') + // Name sorting is required to remove orphaned singletons in PE data. + + // Step 17: Remove singleton reads from paired-end BAM file + BAMPE_RM_ORPHAN(NAME_SORT.out.sorted_file) + + // Step 18 : Samtools Pair Sort + PAIR_SORT(BAMPE_RM_ORPHAN.out.bam, '-O bam', 'bam') + // Coordinate sorting must be used for next steps. 
+ + filtered_sorted_bam = PAIR_SORT.out.sorted_file + + } + + // Step 19 : Samtools Stats + SAMTOOLS_STATS_FILTERED(filtered_sorted_bam) + + // Step 20 : Preseq + PRESEQ(PICARD_MARKDUPLICATES.out.dedup_bam) + //Note: preseq package is aimed at predicting and estimating the complexity of a genomic sequencing library + + // Step 21 : Collect Multiple Metrics + + SAMTOOLS_INDEX(filtered_sorted_bam) + + PICARD_COLLECTMULTIPLEMETRICS(filtered_sorted_bam) + + // Step 22 : Bedtools Genome Coverage + BEDTOOLS_GENOMECOV(filtered_sorted_bam.join(SAMTOOLS_STATS_FILTERED.out.flagstat)) + + // Step 23 : USCS Bedgraph to bigwig + UCSC_BEDGRAPHTOBIGWIG(BEDTOOLS_GENOMECOV.out.bedgraph, MAKE_GENOME_FILTER.out.sizes) + // Note: genome filter is a generic file used for all samples. + + // Step 24 : Deeptools Compute matrix + DEEPTOOLS_COMPUTEMATRIX(UCSC_BEDGRAPHTOBIGWIG.out.bigwig, ch_gene_bed) + // Note: ch_gene_bed is a generic file used for all samples. + + // Step 25 : Deeptools Plot Profile + DEEPTOOLS_PLOTPROFILE(DEEPTOOLS_COMPUTEMATRIX.out.matrix) + + // Step 26 : Deeptools Plot Heatmap + DEEPTOOLS_PLOTHEATMAP(DEEPTOOLS_COMPUTEMATRIX.out.matrix) + + // Step 27 : Phantompeakqualtools + PHANTOMPEAKQUALTOOLS(filtered_sorted_bam) + + // Step 28 : Multiqc Custom Phantompeakqualtools + mcp_ch = PHANTOMPEAKQUALTOOLS.out.spp.join(PHANTOMPEAKQUALTOOLS.out.rdata, by: [0]) + MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS(mcp_ch, ch_spp_nsc_header, ch_spp_rsc_header, ch_spp_correlation_header) + + // Create channel linking IP bams with control bams + ch_genome_bam_bai = filtered_sorted_bam.join(SAMTOOLS_INDEX.out.bai) + .map{it -> [it[0], [it[1], it[2]]]} + // next step requires a tuple with [sampleID, [bam, bai]] + + ch_genome_bam_bai = ch_genome_bam_bai + .combine(ch_genome_bam_bai) + // this combine step genenerates pairs of samples, which are then refined in the next step. 
+ + ch_group_bam = control_ch + .combine(ch_genome_bam_bai ) + .filter { it[0] == it[5] && it[1] == it[7] } + .join(SAMTOOLS_STATS_FILTERED.out.flagstat) + .map { it -> it[2..-1] } + // Generate combinations of all study design objects: + // [SPT5_T0_R1, SPT5_INPUT_R1, SPT5, true, true] + // with all combined bams from the 'combine' above: + // [SPT5_T0_R1, [/.../SPT5_T0_R1.mLb.clN.sorted.bam, /.../SPT5_T0_R1.mLb.clN.sorted.bam.bai], SPT5_INPUT_R1, [/.../SPT5_INPUT_R2.mLb.clN.sorted.bam, /.../SPT5_INPUT_R2.mLb.clN.sorted.bam.bai]] + // the combinations between design and all pairs, has combinations that are not relavent to the study design. Therefore, the combinations are filtered to cases where: + // it[0] == it[5] (e.g., SPT5_T0_R1 == SPT5_T0_R1) AND it[1] == it[7] (e.g., SPT5_INPUT_R1 == SPT5_INPUT_R1) + // it then adjust the output tuple to remove the extra sample IDs: + // [SPT5, true, true, SPT5_T0_R2, [/../SPT5_T0_R2.mLb.clN.sorted.bam, /../SPT5_T0_R2.mLb.clN.sorted.bam.bai], SPT5_INPUT_R2, [/../SPT5_INPUT_R2.mLb.clN.sorted.bam, /../SPT5_INPUT_R2.mLb.clN.sorted.bam.bai], /../SPT5_T0_R2.mLb.clN.sorted.bam.flagstat] + + // Step 29 : Deeptools plotFingerprint + DEEPTOOLS_PLOTFINGERPRINT(ch_group_bam) + + // Step 30 : Call peaks with MACS2 + PEAK_CALLING_CHIPSEQ(ch_group_bam, ch_peak_count_header, ch_frip_score_header) + // Note: ch_peak_count_header is a generic file used for all samples. ch_frip_score_header is a generic file used for all samples. + + // Step 31 : Calculate FRiP score + frip_input = ch_group_bam + .map{it -> [it[3], it[0], it[1], it[2], it[3], it[4], it[5], it[6], it[7]]} + .join(PEAK_CALLING_CHIPSEQ.out.peak) + .map{it -> it[1..-1]} + // 'ch_group_bam' is indexed on antibody. peak calling is indexed on the IP sample. + // This map adjusts the tuple to put IP in the index position + // Joins 'ch_group_bam' to the peak file by IP sample ID, + // and then readjusts the tuple to place antibody in the index position. 
+ + FRIP_SCORE(frip_input, ch_peak_count_header, ch_frip_score_header) + // Note: ch_peak_count_header is a generic file used for all samples. ch_frip_score_header is a generic file used for all samples. + + // Step 32 : Homer Annotate Peaks + HOMER_ANNOTATEPEAKS(PEAK_CALLING_CHIPSEQ.out.ip_control_peak, ch_fasta, ch_gtf) + + // Step 33 : Plot Macs2 QC + PLOT_MACS2_QC(PEAK_CALLING_CHIPSEQ.out.peak.collect{ it[-1] }) + // Note: *collect{ it[-1] } collects all peak files, and passes those to the module. + + // Step 34 : Plot Homer Annotate Peaks + PLOT_HOMER_ANNOTATEPEAKS(HOMER_ANNOTATEPEAKS.out.txt.collect{ it[-1] }, ch_peak_annotation_header, '_peaks.annotatePeaks.txt') + // Note: *collect{ it[-1] } collects all peak files, and passes those to the module. + + // Step 35 : Consensus peaks across samples, create boolean filtering file, SAF file + + // Create channel for CONSENSUS PEAKS ANALYSIS + // Group by antibody from this point and carry forward boolean variables + + ch_macs_consensus = PEAK_CALLING_CHIPSEQ.out.ip_control_peak + .map { it -> [ it[0], it[1], it[2], it[-1] ] } + .groupTuple() + .map { it -> [ it[0], it[1][0], it[2][0], it[3].toSorted( { a, b -> a.getName() <=> b.getName() } ) ] } + // Note: re-order the output tuple from PEAK_CALLING_CHIPSEQ: + // [SPT5, true, true, SPT5_T15_R2, SPT5_INPUT_R2, /.../SPT5_T15_R2_peaks.broadPeak] + // to remove the case and control sample IDs: + // [SPT5, true, true, /.../SPT5_T15_R1_peaks.broadPeak] + // Then group by antibody. Map: keep only the first index position of replicatesExist, multipleGroups + // as the remaining array for those are duplicate values. sort the broadpeak file array by file name. + + MACS2_CONSENSUS(ch_macs_consensus) + // Note: this step will not run when replicatesExist || multipleGroups are false. + // Subequently all steps beyond this point will not run as they rely on output from this step. 
+ + // Step 36 : Consensus peaks annotation + CONSENSUS_PEAKS_ANNOTATE(MACS2_CONSENSUS.out.bed, ch_fasta, ch_gtf) + // Note: ch_fasta and ch_gtf are generic files and shared by all samples. + + // Step 37 : Annotate boolean peaks + ANNOTATE_BOOLEAN_PEAKS(MACS2_CONSENSUS.out.boolean_txt.join(CONSENSUS_PEAKS_ANNOTATE.out.txt)) + + // Get BAM and SAF files for each antibody + + ch_group_bam // [antibody, replicatesExist, multipleGroups, sample_id, [bam, bai], control_id, [bam, bai], sample_id bam.flagstat] + .map { it -> [ it[3], [ it[0], it[1], it[2] ] ] } // [sample_id, [antibody, replicatesExist, multipleGroups]] + .join(filtered_sorted_bam) // [sample_id, [antibody, replicatesExist, multipleGroups], final filtered sample_id indexed bam] + .map { it -> [ it[1][0], it[1][1], it[1][2], it[2] ] } // [antibody, replicatesExist, multipleGroups, OR sample_id bam] + .groupTuple() + .map { it -> [ it[0], it[1][0], it[2][0], it[3].flatten().sort() ] } // [antibody, replicatesExist, multipleGroups, [OR sample_id1 R1 bam, OR sample_id1 R2 bam, OR sample_id2 R1 bam, OR sample_id2 R2 bam]] + .join(MACS2_CONSENSUS.out.saf) // [antibody, replicatesExist, multipleGroups, [OR sample_id1 R1 bam, OR sample_id1 R2 bam, OR sample_id2 R1 bam, OR sample_id2 R2 bam], SAF] + .set { ch_group_bam } + + // Step 38 : Count reads in consensus peaks with featureCounts + SUBREAD_FEATURECOUNTS(ch_group_bam) + + // Step 39 : Differential analysis with DESeq2 + DESEQ2_QC(SUBREAD_FEATURECOUNTS.out.counts, ch_deseq2_pca_header, ch_deseq2_clustering_header) + // note: ch_deseq2_pca_header, ch_deseq2_clustering_header are generic files used for all samples. 
+ + // Create channels for multi input files + ch_multiqc_files = Channel.empty() + + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TRIM_GALORE.out.trim_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TRIM_GALORE.out.trimmed_fastqc.collect{it[1]}.ifEmpty([])) + + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS.out.flagstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS.out.idxstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_MD.out.flagstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_MD.out.idxstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_MD.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_FILTERED.out.flagstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_FILTERED.out.idxstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_FILTERED.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTMULTIPLEMETRICS.out.metrics.collect{it[1]}.ifEmpty([])) + + ch_multiqc_files = ch_multiqc_files.mix(FRIP_SCORE.out.tsv.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PLOT_HOMER_ANNOTATEPEAKS.out.tsv.collect()) + ch_multiqc_files = ch_multiqc_files.mix(SUBREAD_FEATURECOUNTS.out.summary.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(DESEQ2_QC.out.pca_multiqc.collect()) + ch_multiqc_files = ch_multiqc_files.mix(DESEQ2_QC.out.dists_multiqc.collect()) + + ch_multiqc_files = 
ch_multiqc_files.mix(PRESEQ.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(DEEPTOOLS_PLOTFINGERPRINT.out.raw.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(DEEPTOOLS_PLOTPROFILE.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PHANTOMPEAKQUALTOOLS.out.spp.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS.out.nsc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS.out.rsc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS.out.correlation.collect{it[1]}.ifEmpty([])) + + + // Step 41 : MultiQC + MULTIQC ( + ch_multiqc_files.collect() + ) + +} diff --git a/workflows/pdx_wes.nf b/workflows/pdx_wes.nf new file mode 100755 index 00000000..bed77d59 --- /dev/null +++ b/workflows/pdx_wes.nf @@ -0,0 +1,246 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/pdx_wes.nf" +include {param_log} from "${projectDir}/bin/log/pdx_wes.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" +include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" +include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT as FASTQ_SORT_HUMAN; + FASTQ_SORT as FASTQ_SORT_MOUSE} from "${projectDir}/modules/fastq-tools/fastq-sort" 
+include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include {GATK_GETSAMPLENAME} from "${projectDir}/modules/gatk/gatk_getsamplename_noMeta" +include {GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration_mutect2" +include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; + GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" +include {GATK_MUTECT2} from "${projectDir}/modules/gatk/gatk_mutect2_tumorOnly" +include {GATK_FILTERMUECTCALLS} from "${projectDir}/modules/gatk/gatk_filtermutectcalls_tumorOnly" +include {MSISENSOR2_MSI} from "${projectDir}/modules/msisensor2/msisensor2_tumorOnly" +include {GATK_MERGEVCF as GATK_MERGEVCF_UNANNOTATED; + GATK_MERGEVCF as GATK_MERGEVCF_ANNOTATED} from "${projectDir}/modules/gatk/gatk_mergevcf" +include {COSMIC_ANNOTATION as COSMIC_ANNOTATION_SNP; + COSMIC_ANNOTATION as COSMIC_ANNOTATION_INDEL} from "${projectDir}/modules/cosmic/cosmic_annotation" +include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {SNPEFF as SNPEFF_SNP; + SNPEFF as SNPEFF_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_snpeff" +include {SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; + SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_INDEL} from 
"${projectDir}/modules/snpeff_snpsift/snpeff_oneperline" +include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" +include {SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_SNP; + SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpsift_dbnsfp" +include {PICARD_COLLECTHSMETRICS} from "${projectDir}/modules/picard/picard_collecthsmetrics" +include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wes" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log params +param_log() + +// prepare reads channel + +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +if (params.gen_org == 'mouse') { + exit 1, "PDX workflow was called; however, `--gen_org` was set to: ${params.gen_org}. This is an invalid parameter combination. `--gen_org` must == 'human' for PDX analysis." 
+} + +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + + if (params.read_type == 'PE'){ + read_ch = Channel + .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) + .map { file, file1, file2 -> tuple(getLibraryId(file), file1, file2) } + .groupTuple() + } + else if (params.read_type == 'SE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}", checkExists:true, size:1 ) + .map { file, file1 -> tuple(getLibraryId(file), file1) } + .groupTuple() + .map{t-> [t[0], t[1].flatten()]} + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} else { + + if (params.read_type == 'PE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) + } + else if (params.read_type == 'SE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} + +workflow PDX_WES { + + // Step 0: Download data and concat Fastq files if needed. 
+ if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from directory if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + + // Step 1: Qual_Stat + JAX_TRIMMER(read_ch) + + xenome_input = JAX_TRIMMER.out.trimmed_fastq + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) + + // Step 2: Xenome classify and sort. 
+ XENOME_CLASSIFY(xenome_input) + + // Xenome Read Sort + FASTQ_SORT_HUMAN(XENOME_CLASSIFY.out.xenome_fastq, 'human') + FASTQ_SORT_MOUSE(XENOME_CLASSIFY.out.xenome_mouse_fastq, 'mouse') + + // Step 3: Get Read Group Information + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") + + // Step 4: BWA-MEM Alignment + bwa_mem_mapping = FASTQ_SORT_HUMAN.out.sorted_fastq.join(READ_GROUPS.out.read_groups) + + BWA_MEM(bwa_mem_mapping) + + // Step 5: Variant Preprocessing - Part 1 + PICARD_SORTSAM(BWA_MEM.out.sam) + PICARD_MARKDUPLICATES(PICARD_SORTSAM.out.bam) + + // Step 6: Variant Pre-Processing - Part 2 + GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) + + apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) + GATK_APPLYBQSR(apply_bqsr) + + // Step 7: Variant Pre-Processing - Part 3 + collect_metrics = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) + PICARD_COLLECTHSMETRICS(collect_metrics) + + // Step 8: MSI + MSISENSOR2_MSI(GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai)) + + // Step 9: Get sample names + GATK_GETSAMPLENAME(collect_metrics) + + // ** Variant Calling + mutect2_caller_input = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai).join(GATK_GETSAMPLENAME.out.sample_name) + + // Step 10: Mutect2 + GATK_MUTECT2(mutect2_caller_input) + GATK_FILTERMUECTCALLS(GATK_MUTECT2.out.vcf_tbi_stats) + + // Step 8: Variant Filtration + // SNP + GATK_SELECTVARIANTS_SNP(GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi, 'SNP', 'selected_SNP') + + var_filter_snp = GATK_SELECTVARIANTS_SNP.out.vcf.join(GATK_SELECTVARIANTS_SNP.out.idx) + GATK_VARIANTFILTRATION_SNP(var_filter_snp, 'SNP') + + // INDEL + GATK_SELECTVARIANTS_INDEL(GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi, 'INDEL', 'selected_INDEL') + + var_filter_indel = GATK_SELECTVARIANTS_INDEL.out.vcf.join(GATK_SELECTVARIANTS_INDEL.out.idx) + GATK_VARIANTFILTRATION_INDEL(var_filter_indel, 'INDEL') + + // Step 9: Post Variant Calling Processing - Part 1 + // + 
SNPSIFT_ANNOTATE_SNP_DBSNP(GATK_VARIANTFILTRATION_SNP.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + SNPSIFT_ANNOTATE_SNP_COSMIC(SNPSIFT_ANNOTATE_SNP_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') + SNPEFF_SNP(SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf, 'SNP', 'vcf') + SNPSIFT_DBNSFP_SNP(SNPEFF_SNP.out.vcf, 'SNP') + SNPEFF_ONEPERLINE_SNP(SNPSIFT_DBNSFP_SNP.out.vcf, 'SNP') + + // INDEL + SNPSIFT_ANNOTATE_INDEL_DBSNP(GATK_VARIANTFILTRATION_INDEL.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + SNPSIFT_ANNOTATE_INDEL_COSMIC(SNPSIFT_ANNOTATE_INDEL_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') + SNPEFF_INDEL(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf, 'INDEL', 'vcf') + SNPSIFT_DBNSFP_INDEL(SNPEFF_INDEL.out.vcf, 'INDEL') + SNPEFF_ONEPERLINE_INDEL(SNPSIFT_DBNSFP_INDEL.out.vcf, 'INDEL') + + // Step 10: Post Variant Calling Processing - Part 2 + vcf_files_unannotated = SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf) + GATK_MERGEVCF_UNANNOTATED (vcf_files_unannotated, 'SNP_INDEL_filtered_unannotated_final') + + vcf_files_annotated = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) + GATK_MERGEVCF_ANNOTATED(vcf_files_annotated, 'SNP_INDEL_filtered_annotated_final') + + SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF_ANNOTATED.out.vcf) + + agg_stats = JAX_TRIMMER.out.quality_stats.join(PICARD_COLLECTHSMETRICS.out.hsmetrics).join(PICARD_MARKDUPLICATES.out.dedup_metrics) + + // Step 11: Aggregate Stats + AGGREGATE_STATS(agg_stats) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTHSMETRICS.out.hsmetrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = 
ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(XENOME_CLASSIFY.out.xenome_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_FILTERMUECTCALLS.out.stats.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + +} diff --git a/workflows/pta.nf b/workflows/pta.nf new file mode 100644 index 00000000..fa7e77df --- /dev/null +++ b/workflows/pta.nf @@ -0,0 +1,909 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/pta.nf" +include {param_log} from "${projectDir}/bin/log/pta.nf" +include {CONCATENATE_PTA_FASTQ} from "${projectDir}/subworkflows/concatenate_pta_fastq" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT} from "${projectDir}/modules/fastq-tools/fastq-sort" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {SHORT_ALIGNMENT_MARKING} from "${projectDir}/modules/nygc-short-alignment-marking/short_alignment_marking" +include {PICARD_CLEANSAM} from "${projectDir}/modules/picard/picard_cleansam" +include {PICARD_FIX_MATE_INFORMATION} from "${projectDir}/modules/picard/picard_fix_mate_information" +include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" + +include {PICARD_COLLECTALIGNMENTSUMMARYMETRICS} from "${projectDir}/modules/picard/picard_collectalignmentsummarymetrics" +include {PICARD_COLLECTWGSMETRICS} from 
"${projectDir}/modules/picard/picard_collectwgsmetrics" + +include {CONPAIR_PILEUP as CONPAIR_TUMOR_PILEUP; + CONPAIR_PILEUP as CONPAIR_NORMAL_PILEUP} from "${projectDir}/modules/conpair/conpair_pileup" +include {CONPAIR} from "${projectDir}/modules/conpair/conpair" + +include {GATK_HAPLOTYPECALLER_SV_GERMLINE} from "${projectDir}/modules/gatk/gatk_haplotypecaller_sv_germline" +include {GATK_SORTVCF_GERMLINE as GATK_SORTVCF_GERMLINE; + GATK_SORTVCF_GERMLINE as GATK_SORTVCF_GENOTYPE} from "${projectDir}/modules/gatk/gatk_sortvcf_germline" +include {GATK_GENOTYPE_GVCF} from "${projectDir}/modules/gatk/gatk_genotype_gvcf" +include {GATK_CNNSCORE_VARIANTS} from "${projectDir}/modules/gatk/gatk_cnnscorevariants" +include {GATK_FILTER_VARIANT_TRANCHES} from "${projectDir}/modules/gatk/gatk_filtervarianttranches" +include {GATK_VARIANTFILTRATION_AF} from "${projectDir}/modules/gatk/gatk_variantfiltration_af" +include {BCFTOOLS_GERMLINE_FILTER} from "${projectDir}/modules/bcftools/bcftools_germline_filter" +include {BCFTOOLS_SPLITMULTIALLELIC_REGIONS} from "${projectDir}/modules/bcftools/bcftools_split_multiallelic_regions" +include {VEP_GERMLINE} from "${projectDir}/modules/ensembl/varianteffectpredictor_germline" +include {BCFTOOLS_REMOVESPANNING} from "${projectDir}/modules/bcftools/bcftools_remove_spanning" +include {COSMIC_ANNOTATION} from "${projectDir}/modules/cosmic/cosmic_annotation" +include {COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE} from "${projectDir}/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline" +include {GERMLINE_VCF_FINALIZATION} from "${projectDir}/modules/python/python_germline_vcf_finalization" +include {GATK_GETSAMPLENAME as GATK_GETSAMPLENAME_NORMAL; + GATK_GETSAMPLENAME as GATK_GETSAMPLENAME_TUMOR} from "${projectDir}/modules/gatk/gatk_getsamplename" + +include {GATK_MUTECT2} from "${projectDir}/modules/gatk/gatk_mutect2" +include {GATK_MERGEMUTECTSTATS} from "${projectDir}/modules/gatk/gatk_mergemutectstats" +include 
{GATK_FILTERMUECTCALLS} from "${projectDir}/modules/gatk/gatk_filtermutectcalls" +include {MANTA} from "${projectDir}/modules/illumina/manta" +include {STRELKA2} from "${projectDir}/modules/illumina/strelka2" +include {LANCET} from "${projectDir}/modules/nygenome/lancet" +include {GATK_SORTVCF as GATK_SORTVCF_MUTECT; + GATK_SORTVCF as GATK_SORTVCF_LANCET; + GATK_SORTVCF as GATK_SORTVCF_TOOLS; + GATK_SORTVCF as GATK_SORTVCF_TOOLS_LANCET} from "${projectDir}/modules/gatk/gatk_sortvcf_somatic_tools" +include {GRIDSS_PREPROCESS} from "${projectDir}/modules/gridss/gridss_preprocess" +include {GRIDSS_ASSEMBLE} from "${projectDir}/modules/gridss/gridss_assemble" +include {GRIDSS_CALLING} from "${projectDir}/modules/gridss/gridss_calling" +include {GRIDSS_CHROM_FILTER} from "${projectDir}/modules/gridss/gridss_chrom_filter" +include {GRIPSS_SOMATIC_FILTER} from "${projectDir}/modules/gridss/gripss_somatic_filter" +include {SAMTOOLS_STATS_INSERTSIZE as SAMTOOLS_STATS_INSERTSIZE_NORMAL; + SAMTOOLS_STATS_INSERTSIZE as SAMTOOLS_STATS_INSERTSIZE_TUMOR} from "${projectDir}/modules/samtools/samtools_stats_insertsize" +include {SAMTOOLS_FILTER_UNIQUE as SAMTOOLS_FILTER_UNIQUE_NORMAL; + SAMTOOLS_FILTER_UNIQUE as SAMTOOLS_FILTER_UNIQUE_TUMOR} from "${projectDir}/modules/samtools/samtools_filter_unique_reads" +include {BICSEQ2_NORMALIZE as BICSEQ2_NORMALIZE_NORMAL; + BICSEQ2_NORMALIZE as BICSEQ2_NORMALIZE_TUMOR} from "${projectDir}/modules/biqseq2/bicseq2_normalize" +include {BICSEQ2_SEG} from "${projectDir}/modules/biqseq2/bicseq2_seg" +include {BICSEQ2_SEG_UNPAIRED} from "${projectDir}/modules/biqseq2/bicseq2_seg_unpaired" +include {MSISENSOR2_MSI} from "${projectDir}/modules/msisensor2/msisensor2" + +include {RENAME_METADATA; + RENAME_METADATA as RENAME_METADATA_LANCET} from "${projectDir}/modules/python/python_rename_metadata" +include {MERGE_PREP; + MERGE_PREP as MERGE_PREP_LANCET} from "${projectDir}/modules/python/python_merge_prep" +include {RENAME_VCF; + RENAME_VCF as 
RENAME_VCF_LANCET;} from "${projectDir}/modules/python/python_rename_vcf" +include {COMPRESS_INDEX_VCF; + COMPRESS_INDEX_VCF as COMPRESS_INDEX_VCF_LANCET; + COMPRESS_INDEX_VCF as COMPRESS_INDEX_VCF_REGION_LANCET} from "${projectDir}/modules/tabix/compress_vcf" +include {BCFTOOLS_SPLITMULTIALLELIC; + BCFTOOLS_SPLITMULTIALLELIC as BCFTOOLS_SPLITMULTIALLELIC_LANCET} from "${projectDir}/modules/bcftools/bcftools_split_multiallelic" +include {SPLIT_MNV; + SPLIT_MNV as SPLIT_MNV_LANCET} from "${projectDir}/modules/python/python_split_mnv" +include {REMOVE_CONTIG} from "${projectDir}/modules/python/python_remove_contig" + +include {BCFTOOLS_MERGECALLERS; + BCFTOOLS_MERGECALLERS as BCFTOOLS_MERGECALLERS_FINAL} from "${projectDir}/modules/bcftools/bcftools_merge_callers" +include {BEDTOOLS_STARTCANDIDATES} from "${projectDir}/modules/bedtools/bedtools_start_candidates" +include {GET_CANDIDATES} from "${projectDir}/modules/python/python_get_candidates" +include {VCF_TO_BED} from "${projectDir}/modules/python/python_vcf_to_bed" +include {LANCET_CONFIRM} from "${projectDir}/modules/nygenome/lancet_confirm" +include {COMPRESS_INDEX_VCF_REGION; + COMPRESS_INDEX_VCF_REGION as COMPRESS_INDEX_VCF_ALL_CALLERS; + COMPRESS_INDEX_VCF_REGION as COMPRESS_INDEX_VCF_MERGED} from "${projectDir}/modules/tabix/compress_vcf_region" +include {BCFTOOLS_INTERSECTVCFS} from "${projectDir}/modules/bcftools/bcftools_intersect_lancet_candidates" + +include {MERGE_COLUMNS} from "${projectDir}/modules/python/python_merge_columns" +include {ADD_NYGC_ALLELE_COUNTS} from "${projectDir}/modules/python/python_add_nygc_allele_counts" +include {ADD_FINAL_ALLELE_COUNTS} from "${projectDir}/modules/python/python_add_final_allele_counts" +include {FILTER_PON} from "${projectDir}/modules/python/python_filter_pon" +include {FILTER_VCF} from "${projectDir}/modules/python/python_filter_vcf" +include {SNV_TO_MNV_FINAL_FILTER} from "${projectDir}/modules/python/python_snv_to_mnv_final_filter" + +include 
{GATK_SORTVCF_SOMATIC} from "${projectDir}/modules/gatk/gatk_sortvcf_somatic_merge" +include {REORDER_VCF_COLUMNS} from "${projectDir}/modules/python/python_reorder_vcf_columns" +include {COMPRESS_INDEX_MERGED_VCF} from "${projectDir}/modules/tabix/compress_merged_vcf" +include {VEP_SOMATIC} from "${projectDir}/modules/ensembl/varianteffectpredictor_somatic" +include {COSMIC_ANNOTATION_SOMATIC} from "${projectDir}/modules/cosmic/cosmic_annotation_somatic" +include {COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC} from "${projectDir}/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic" +include {SOMATIC_VCF_FINALIZATION} from "${projectDir}/modules/python/python_somatic_vcf_finalization" +include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_DBSNP_GERMLINE; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_DBSNP_SOMATIC} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {ANNOTATE_BICSEQ2_CNV} from "${projectDir}/modules/r/annotate_bicseq2_cnv" +include {MERGE_SV} from "${projectDir}/modules/r/merge_sv" +include {ANNOTATE_SV; + ANNOTATE_SV as ANNOTATE_SV_SUPPLEMENTAL} from "${projectDir}/modules/r/annotate_sv" +include {ANNOTATE_GENES_SV; + ANNOTATE_GENES_SV as ANNOTATE_GENES_SV_SUPPLEMENTAL} from "${projectDir}/modules/r/annotate_genes_sv" +include {ANNOTATE_SV_WITH_CNV; + ANNOTATE_SV_WITH_CNV as ANNOTATE_SV_WITH_CNV_SUPPLEMENTAL} from "${projectDir}/modules/r/annotate_sv_with_cnv" +include {FILTER_BEDPE; + FILTER_BEDPE as FILTER_BEDPE_SUPPLEMENTAL} from "${projectDir}/modules/r/filter_bedpe" + +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log paramiter info +param_log() + +// main workflow +workflow PTA { + + if (params.csv_input) { + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + // Concat local Fastq files from CSV input if required. 
+ CONCATENATE_PTA_FASTQ(ch_input_sample) + CONCATENATE_PTA_FASTQ.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_PTA_FASTQ.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // ** Step 1: Qual_Stat + JAX_TRIMMER(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) + + // ** Step 2: Get Read Group Information + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") + + // PDX CASES TO ADD AND VALIDATE: + // Normal samples should PASS the PDX step. + + // ** Step 2a: Xenome if PDX data used. + ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + if (params.pdx){ + // Xenome Classification + XENOME_CLASSIFY(JAX_TRIMMER.out.trimmed_fastq) + ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats // set log file for multiqc + + // Xenome Read Sort + FASTQ_SORT(XENOME_CLASSIFY.out.xenome_fastq, 'human') + bwa_mem_mapping = FASTQ_SORT.out.sorted_fastq.join(READ_GROUPS.out.read_groups) + + } else { + bwa_mem_mapping = JAX_TRIMMER.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + } + + // ** Step 3: BWA-MEM Alignment + BWA_MEM(bwa_mem_mapping) + + // ** Step 4: Sort mapped reads + PICARD_SORTSAM(BWA_MEM.out.sam) + + // ** Step 5: Remove short mapping 'artifacts': https://github.com/nygenome/nygc-short-alignment-marking + SHORT_ALIGNMENT_MARKING(PICARD_SORTSAM.out.bam) + + // ** Step 6: Clean BAM to set MAPQ = 0 when read is unmapped (issue introduced in step 5) + // NOTE(review): PICARD_CLEANSAM below reads PICARD_SORTSAM.out.bam, not the output of SHORT_ALIGNMENT_MARKING (step 5), so the step 5 result is not consumed here -- confirm this is intended. + PICARD_CLEANSAM(PICARD_SORTSAM.out.bam) + + // ** Step 7: Fix mate information (fix pair flags due to mapping adjustment in step 5) + PICARD_FIX_MATE_INFORMATION(PICARD_CLEANSAM.out.cleaned_bam) + + // ** Step 8: Markduplicates + PICARD_MARKDUPLICATES(PICARD_FIX_MATE_INFORMATION.out.fixed_mate_bam) + + // ** Step 9: Calculate BQSR + GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) + + // ** Step 10: Apply BQSR + apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) + GATK_APPLYBQSR(apply_bqsr) + + // Step 12: Nextflow 
channel processing + // https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L854 + + GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai).join(meta_ch).branch{ + normal: it[3].status == 0 + tumor: it[3].status == 1 + }.set{ch_final_bam} + // re-join the sampleID to metadata information. Split normal and tumor samples into 2 different paths. + // Process tumor and normal BAMs separately for conpair. For calling, use mapped and crossed data. + + // ** Get alignment and WGS metrics + PICARD_COLLECTALIGNMENTSUMMARYMETRICS(GATK_APPLYBQSR.out.bam) + PICARD_COLLECTWGSMETRICS(GATK_APPLYBQSR.out.bam) + + // ** NEXTFLOW OPERATORS::: Establish channels with sample pairs and individual input objects for downstream calling + + // get sample names, and join to bams. + GATK_GETSAMPLENAME_NORMAL(ch_final_bam.normal.map{ id, bam, bai, meta -> [id, meta, bam, bai] }) + GATK_GETSAMPLENAME_TUMOR(ch_final_bam.tumor.map{ id, bam, bai, meta -> [id, meta, bam, bai] }) + + ch_normal_to_cross = ch_final_bam.normal.join(GATK_GETSAMPLENAME_NORMAL.out.sample_name).map{ id, bam, bai, meta, readID -> [meta.patient, meta, bam, bai, readID] } + ch_tumor_to_cross = ch_final_bam.tumor.join(GATK_GETSAMPLENAME_TUMOR.out.sample_name).map{ id, bam, bai, meta, readID -> [meta.patient, meta, bam, bai, readID] } + + /* + The above map statements adjust the channels for normal and tumor samples to organize them by patient IDs. + A common key ID is needed to cross tumor by normal samples when multiples of each patient are run together. + + NOTE!!!! that if a common patient key is then used as 'sampleID' across samples, + downstream results will have name collisions and results will be overwritten in multiple tumor to normal mappings. + + e.g., patient: foo; tumor1 = bar, tumor2 = baz; normal = fizz. + + Common key: sampleID == patient, results = foo.calls.vcf for both bar--fizz and baz--fizz, and results are mangled. 
+ + Unique key: sampleID == patient--tumor--normal, results == foo--bar--fizz.calls.vcf & foo--baz--fizz.calls.vcf. Results OK. + + Therefore, the above ch_*_to_cross should ONLY be used in crossing samples. + A different channel is made below for cases when needed in callers. + */ + + // Cross all normal and tumor by common patient ID. + ch_paired_samples = ch_normal_to_cross.cross(ch_tumor_to_cross) + .map { normal, tumor -> + def meta = [:] + meta.patient = normal[0] + meta.normal_id = normal[1].sampleID + meta.tumor_id = tumor[1].sampleID + meta.sex = normal[1].sex + meta.id = "${meta.patient}--${meta.tumor_id}--${meta.normal_id}".toString() + + [meta.id, meta, normal[2], normal[3], normal[4], tumor[2], tumor[3], tumor[4]] + } + /* + normal[0] is patient ID, + normal[1] and tumor[1] are meta info + normal[2] is normal bam, normal[3] is bai, normal[4] is read group ID. + tumor[2] is bam, tumor[3] is bai, tumor[4] is read group ID. + */ + + // Restore un-paired tumor samples, and add NA12878 as pairing in those cases + ch_paired_samples = ch_tumor_to_cross + .mix(ch_paired_samples) + .map{it -> [it[1].patient, it[1], it[2], it[3], it[4]]}.groupTuple().filter{it[2].size() == 1} + // it[0] = patient ID (grouping key), it[1] = meta, it[2] = bam, it[3] = bai, it[4] = sampleReadID. + // unknown group size, no 'size' statement can be used in groupTuple + .map{tumor -> + def meta = [:] + meta.patient = tumor[1][0].patient + meta.normal_id = 'NA12878' + meta.tumor_id = tumor[1][0].sampleID + meta.sex = tumor[1][0].sex + meta.id = "${meta.patient}--${meta.tumor_id}--${meta.normal_id}".toString() + + [meta.id, meta, params.na12878_bam, params.na12878_bai, params.na12878_sampleName, tumor[2][0], tumor[3][0], tumor[4][0]] + } + .mix(ch_paired_samples) + + /* SAMPLE PAIRING CASES AND NOTES: + 1. Paired only for all samples: Managed by the initial cross statement. + 2. Tumor only provided for all samples: Managed by the initial cross and subsequent remapping of non-crossed samples. + 3. 
Some samples have a pair and others do not: Managed by the initial cross, and subsequent remapping of non-crossed samples. + 4. Multiple tumors per normal, or multiple normals per tumor, or a mixture of these: See note below. + + Notes: + The cross statement manages one normal to many tumors, many normals to one tumor, and many normals to many tumors. + E.g.,: + [foo--t_bar--n_baz, [patient:foo, normal_id:n_baz, tumor_id:t_bar, sex:XX, id:foo--t_bar--n_baz], ....bam, ....bai, normal_readID, ....bam, ....bai, tumor_readID] + [foo--t_qux--n_baz, [patient:foo, normal_id:n_baz, tumor_id:t_qux, sex:XX, id:foo--t_qux--n_baz], ....bam, ....bai, normal_readID, ....bam, ....bai, tumor_readID] + ... + + When samples are provided without a pair, they will not be paired in the cross statement and are dropped from the first 'ch_paired_samples' instantiation. + To recover un-paired tumors, pair them with NA12878, and pass them downstream with the paired samples, + the group of all tumor samples, 'ch_tumor_to_cross', is mixed with the paired samples, 'ch_paired_samples'. + Cases where tumor samples were paired are then filtered out. Tumors that were paired in the cross will appear 2 or more times in the mix results, and are removed via the 'filter it[2].size()==1' statement. + The resulting tumor-only samples are mapped into the format seen in the cross statement, with NA12878 being added via parameters as the 'normal' sample. 
+ */ + + + ch_ind_samples = ch_paired_samples + .filter{it[4] != params.na12878_sampleName} + .multiMap{it -> + normal: ["${it[1].patient}--${it[1].normal_id}".toString(), it[1], it[2], it[3], it[4]] + tumor: ["${it[1].patient}--${it[1].tumor_id}".toString(), it[1], it[5], it[6], it[7]] + } + ch_normal_samples = ch_ind_samples.normal.unique{it[0]} + ch_tumor_samples = ch_ind_samples.tumor.unique{it[0]} + + ch_tumor_only = ch_paired_samples + .filter{it[4] == params.na12878_sampleName} + .map{it -> ["${it[1].patient}--${it[1].tumor_id}".toString(), it[1], it[5], it[6], it[7]]} + .unique{it[0]} + + ch_msisensor2_input = ch_paired_samples + .map{["${it[1].patient}--${it[1].tumor_id}".toString(), it[1], it[5], it[6], it[7]]} + .unique{it[0]} + + /* + The above establishes channels needed for germline calling, bicseq2 and MSIsensor2. + Those steps require BAM, index and readID. + Here sampleID is reset to the original ID from the CSV parser which is: 'patient--sample' + Note that NA12878 is filtered for germline and can be filtered for bicseq2 / conpair, + as CNA and sample comparison analysis may not make sense for that pairing. + All tumor samples are passed to MSIsensor2 as it runs in tumor-only mode. + */ + + + + // ** Step 13: Conpair pileup for T/N true pairs. + // Step not run on tumor-only samples, as contamination analysis is not biologically relevant. + + conpair_input = ch_paired_samples + .filter{it[4] != params.na12878_sampleName} + .multiMap{it -> + normal: [it[1].patient, "${it[1].normal_id}".toString(), it[2], it[3]] + tumor: [it[1].patient, "${it[1].tumor_id}".toString(), it[5], it[6]] + } + /* + Remap the paired samples to required normal/tumor inputs for conpair, and filter NA12878 paired samples. + it[1] = metadata, it[2] = normal BAM, it[3] = normal BAI. + it[5] = tumor BAM, it[6] = tumor BAI. + Patient ID is used here because samples must be re-crossed after the pileup to match all tumors and normals. 
+ */ + + CONPAIR_NORMAL_PILEUP(conpair_input.normal.unique{it[2]}, 'normal') + CONPAIR_TUMOR_PILEUP(conpair_input.tumor.unique{it[2]}, 'tumor') + + conpair_input = CONPAIR_NORMAL_PILEUP.out.pileup.cross(CONPAIR_TUMOR_PILEUP.out.pileup) + .map { normal, tumor -> [normal[0], "${normal[0]}--${tumor[1]}--${normal[1]}".toString(), normal[2], tumor[2]] + } + // normal[0] is patientID or 'sampleID', normal[2] is normal pileup, tumor[2] is tumor pileup. + + CONPAIR(conpair_input) + + + // ** Step 14: Germline Calling and Annotation + + // Find the paths of all `scattered.interval_list` files, and make tuples with an index value. + // This is used for HaplotypeCaller variant regions and GenotypeGVCF + + // Loads paths from a directory, collects into a list, sorts, + // maps to a set with indices, flattens the map [file, index, file, index ...], + // collates the flattened map into pairs, then remaps the pairs to tuples + + intervals = Channel.fromPath( params.chrom_intervals+'/*/scattered.interval_list' ) + .collect() + .sort() + .map { items -> items.withIndex() } + .flatten() + .collate(2) + .map { item, idx -> tuple( item, idx + 1 ) } + // https://stackoverflow.com/a/67084467/18557826 + interval_count = files( params.chrom_intervals+'/*/scattered.interval_list' ).size() + // interval count is used in groupTuple size statements. + + // Applies scatter intervals from above to the BAM file channel prior to variant calling. + chrom_channel = ch_normal_samples.combine(intervals).filter{it[4] != params.na12878_sampleName} + + // Read a list of chromosome names from a parameter. These are provided to several tools. + chroms = Channel + .fromPath("${params.chrom_contigs}") + .splitText() + .map{it -> it.trim()} + + // Get a list of primary chromosomes and exclude chrM (dropRight(1)) + chrom_list = chroms.collect().dropRight(1) + chrom_list_noY = chrom_list.dropRight(1) + + + + // Variant calling. 
+ GATK_HAPLOTYPECALLER_SV_GERMLINE(chrom_channel) + + // Applies gather to scattered haplotype calls. + GATK_SORTVCF_GERMLINE(GATK_HAPLOTYPECALLER_SV_GERMLINE.out.vcf.groupTuple(size: interval_count), 'gvcf') + + // Applies scatter intervals from above to the merged file, and genotype. + genotype_channel = GATK_SORTVCF_GERMLINE.out.vcf_idx.combine(intervals) + + GATK_GENOTYPE_GVCF(genotype_channel) + GATK_CNNSCORE_VARIANTS(GATK_GENOTYPE_GVCF.out.vcf_idx) + + // Applies gather to genotyped/cnn called vcfs prior to tranche filtering. + GATK_SORTVCF_GENOTYPE(GATK_CNNSCORE_VARIANTS.out.vcf.groupTuple(size: interval_count), 'vcf') + + // Variant tranche filtering. + GATK_FILTER_VARIANT_TRANCHES(GATK_SORTVCF_GENOTYPE.out.vcf_idx) + + // Allele frequency and call refinement filtering. + GATK_VARIANTFILTRATION_AF(GATK_FILTER_VARIANT_TRANCHES.out.vcf_idx) + BCFTOOLS_GERMLINE_FILTER(GATK_VARIANTFILTRATION_AF.out.vcf) + + // Germline annotation - Filtered + // 1. SplitMultiAllelicRegions & compress & index + BCFTOOLS_SPLITMULTIALLELIC_REGIONS(BCFTOOLS_GERMLINE_FILTER.out.vcf_idx, chrom_list_noY) + // 2. vepPublicSvnIndel + VEP_GERMLINE(BCFTOOLS_SPLITMULTIALLELIC_REGIONS.out.vcf_idx) + // 3. RemoveSpanning + BCFTOOLS_REMOVESPANNING(VEP_GERMLINE.out.vcf) + // 4. AddCosmic + COSMIC_ANNOTATION(BCFTOOLS_REMOVESPANNING.out.vcf) + // 5. AddCancerResistanceMutations and dbsnpIDs + COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE(COSMIC_ANNOTATION.out.vcf) + SNPSIFT_ANNOTATE_DBSNP_GERMLINE(COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE.out.vcf, params.dbSNP, params.dbSNP_index, 'intermediate') + // 6. 
AnnotateId & RenameCsqVcf + GERMLINE_VCF_FINALIZATION(SNPSIFT_ANNOTATE_DBSNP_GERMLINE.out.vcf, 'filtered') + + + + // ** Step 15: Somatic Calling + + // Applies scatter intervals from above to the BQSR bam file + somatic_calling_channel = ch_paired_samples.combine(intervals) + + /* Applies scatter intervals from above to the BQSR bam file + somatic_calling_channel = ch_paired_samples.combine(chroms) + NOTE: The above code line will split Mutect2 calling by individual 'chroms'. + Entire chromosomes are scattered. For WGS, this is computationally intensive. + We changed calling to be done based on the same intervals passed to the germline caller. + These intervals are based on the 'NoN' file made by BROAD/GATK. + If complete chromosomes are required, the above line of code can be uncommented. + */ + + // ** Mutect2 - SNP/InDEL Calling + // STEPS: Call on each chromosome / interval. + // Prior to 'filtermutectcalls' vcfs must be merged (GATK best practice). + // NOTE: The group and map statement ensures that VCFs are organized by sampleID, and that the toolID is carried through the process. + // Prior to 'filtermutectcalls' "stats" files from mutect2 must be merged (GATK best practice). + // Merged vcfs and stats must be Nextflow joined prior to 'filtermutectcalls' to avoid samples being confounded. + + GATK_MUTECT2(somatic_calling_channel) + + sort_merge_input_mutect2VCF = GATK_MUTECT2.out.vcf + .groupTuple(size: interval_count) + .map { sampleID, vcf, meta, normal, tumor, tool -> tuple( sampleID, vcf, meta.unique()[0], normal.unique()[0], tumor.unique()[0], tool.unique()[0] ) } + + GATK_SORTVCF_MUTECT(sort_merge_input_mutect2VCF) + GATK_MERGEMUTECTSTATS(GATK_MUTECT2.out.stats.groupTuple(size: interval_count)) + + filter_mutect_input = GATK_SORTVCF_MUTECT.out.vcf_tbi.join(GATK_MERGEMUTECTSTATS.out.stats) + + GATK_FILTERMUECTCALLS(filter_mutect_input) + + // ** Lancet - SNP/InDEL Calling + // Generate a list of chromosome beds. 
This is generated in the same manner as the calling `intervals` variable above. + lancet_beds = Channel.fromPath( params.lancet_beds_directory+'/*.bed' ) + .collect() + .sort() + .map { items -> items.withIndex() } + .flatten() + .collate(2) + .map { item, idx -> tuple( item, idx + 1 ) } + // https://stackoverflow.com/a/67084467/18557826 + lancet_beds_count = files( params.lancet_beds_directory+'/*.bed' ).size() + // bed file count is used in groupTuple size statements. + + // Applies scatter intervals from above to the BQSR bam file + lancet_calling_channel = ch_paired_samples.combine(lancet_beds) + LANCET(lancet_calling_channel) + + sort_merge_input_lancetVCF = LANCET.out.vcf + .groupTuple(size: lancet_beds_count) + .map { sampleID, vcf, meta, normal, tumor, tool -> tuple( sampleID, vcf, meta.unique()[0], normal.unique()[0], tumor.unique()[0], tool.unique()[0] ) } + + GATK_SORTVCF_LANCET(sort_merge_input_lancetVCF) + + // ** Manta - SV Calling + MANTA(ch_paired_samples) + // FilterNonpass can be used with `SelectVariants` and `--exclude-filtered`. However, hard filtering excluded for now. + + // ** Strelka2 - SNP/InDEL Calling + strekla2_input = ch_paired_samples.join(MANTA.out.manta_smallindel_vcf_tbi) + STRELKA2(strekla2_input) + + // ** Gridss - SV Calling + GRIDSS_PREPROCESS(ch_paired_samples) + gridss_assemble_input = ch_paired_samples.join(GRIDSS_PREPROCESS.out.gridss_preproc) + GRIDSS_ASSEMBLE(gridss_assemble_input) + gridss_call_input = ch_paired_samples.join(GRIDSS_ASSEMBLE.out.gridss_assembly) + GRIDSS_CALLING(gridss_call_input) + GRIDSS_CHROM_FILTER(GRIDSS_CALLING.out.gridss_vcf, chrom_list) + GRIPSS_SOMATIC_FILTER(GRIDSS_CHROM_FILTER.out.gridss_chrom_vcf) + // NOTE: this filtering tool is hard coded for GRCh38 based on PON naming. + + // ** BicSeq2 - CNV Calling + /* This step does not run on unpaired samples. + CNV of tumor samples against an unrelated normal will produce spurious results. 
+ NA12878 paired samples can be filtered from ch_normal_samples and ch_tumor_samples channels at their creation. + */ + SAMTOOLS_STATS_INSERTSIZE_NORMAL(ch_normal_samples) + SAMTOOLS_STATS_INSERTSIZE_TUMOR(ch_tumor_samples.mix(ch_tumor_only)) + + SAMTOOLS_FILTER_UNIQUE_NORMAL(ch_normal_samples, chrom_list) + SAMTOOLS_FILTER_UNIQUE_TUMOR(ch_tumor_samples.mix(ch_tumor_only), chrom_list) + + biqseq_norm_input_normal = SAMTOOLS_FILTER_UNIQUE_NORMAL.out.uniq_seq.join(SAMTOOLS_STATS_INSERTSIZE_NORMAL.out.read_length_insert_size) + // sampleID, individual_chr_seq_files, meta, read_ID, read_length, insert_size. + biqseq_norm_input_tumor = SAMTOOLS_FILTER_UNIQUE_TUMOR.out.uniq_seq.join(SAMTOOLS_STATS_INSERTSIZE_TUMOR.out.read_length_insert_size) + // sampleID, individual_chr_seq_files, meta, read_ID, read_length, insert_size. + + fasta_files = Channel.fromPath( file(params.ref_fa).parent + '/*_chr*' ) + .collect() + // collect individual chr fasta files. These are located in the same directory as the main reference. + // if the extension of `name_chr#.fa` changes this match will break. + + BICSEQ2_NORMALIZE_NORMAL(biqseq_norm_input_normal, fasta_files) + BICSEQ2_NORMALIZE_TUMOR(biqseq_norm_input_tumor, fasta_files) + // note: this can not be split by chrom, even though bicseq2 norm acts on chroms in turn, + // it needs all chroms to parameterize the normalization. + // reported error will be in these cases: "Error in bin_read: bin file is in incorrect format." 
+ + bicseq_normal = BICSEQ2_NORMALIZE_NORMAL.out.normalized_output + .map{it -> [it[2].patient, it[1], it[2], it[3]]} + + bicseq_tumor = BICSEQ2_NORMALIZE_TUMOR.out.normalized_output + .map{it -> [it[2].patient, it[1], it[2], it[3]]} + + bicseq2_seg_input = bicseq_normal.cross(bicseq_tumor) + .map{normal, tumor -> + def meta = [:] + meta.patient = normal[2].patient + meta.normal_id = normal[2].sampleID + meta.tumor_id = tumor[2].sampleID + meta.sex = normal[2].sex + meta.id = "${tumor[2].patient}--${tumor[2].tumor_id}--${tumor[2].normal_id}".toString() + + ["${tumor[2].patient}--${tumor[2].tumor_id}--${tumor[2].normal_id}".toString(), normal[1], tumor[1], normal[2], normal[3], tumor[3]]} + // sampleID, individual_normal_norm_bin_files, individual_tumor_norm_bin_files, metadata, norm_readID, tumor_readID. + // The metadata object here is reset following the cross, so that the ID matches up again. + // NOTE(review): the 'meta' map built inside the closure is not part of the emitted tuple; normal[2] is passed as the metadata element instead -- confirm this is intended. + // It is possible that in many to many or one to many crosses, the ID field will not reflect the crossed samples. + + BICSEQ2_SEG(bicseq2_seg_input) + // NOTE: with insufficient coverage, the segmentation will fail because the 'lambda' factor cannot be properly optimized. + + bicseq2_tumoronly_input = BICSEQ2_NORMALIZE_TUMOR.out.normalized_output + .filter{it[2].normal_id == 'NA12878'} + + BICSEQ2_SEG_UNPAIRED(bicseq2_tumoronly_input) + + bicseq2_calls = BICSEQ2_SEG_UNPAIRED.out.bicseq2_sv_calls + .map{it -> [it[3].id, it[1], it[2], it[3], it[4], it[5], it[6]]} + .mix(BICSEQ2_SEG.out.bicseq2_sv_calls) + // remap output from unpaired bicseq2 to standard format for bicseq2 paired, and mix both channel outputs. This is passed to annotation. 
+ + // Step 15: MSI + MSISENSOR2_MSI(ch_msisensor2_input) + + /* + The following are the harmonized output channels for each tool: + + Manta + MANTA.out.manta_somaticsv_tbi + + Strelka_SNV + STRELKA2.out.strelka_snv_vcf_tbi + + Strelka_INDEL + STRELKA2.out.strelka_indel_vcf_tbi + + Mutect2 + GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi + + Lancet + GATK_SORTVCF_LANCET.out.vcf_tbi + + Gridss + GRIPSS_SOMATIC_FILTER.out.gripss_filtered_bgz + + Bicseq2 + BICSEQ2_SEG.out.bicseq2_sv_calls + */ + + /* + NOTE: + The call merging and annotation sections of this workflow become highly complex. + Files from each caller are passed through a set of 'merge prep' steps. + These steps apply various functions to manipulate the VCF header, and also calls within the VCFs. + Once the VCFs are prepared, a merge occurs. Following the merge, non-exonic regions are parsed out, + and calls in those regions are passed to Lancet for confirmation/rescue. + Following this, confirmed calls are used as 'support' and merged back to the full caller call set. + Additional manipulations are done on the VCF, and then the 'final' VCF + is passed through to the annotation steps. Additional and different annotations are done on SV and CNV + calls. The steps are commented to facilitate understanding of what is being done. + */ + + somatic_caller_concat = MANTA.out.manta_somaticsv_tbi.concat( STRELKA2.out.strelka_snv_vcf_tbi, + STRELKA2.out.strelka_indel_vcf_tbi, + GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi, + GATK_SORTVCF_LANCET.out.vcf_tbi ) + + // Merge prep: + // 1. Rename VCF header to include tool name: + RENAME_METADATA(somatic_caller_concat) + + // 2. Order samples in VCF to 'normal', 'tumor' and prep for merge. + // See script for list of changes applied to the VCF: + MERGE_PREP(RENAME_METADATA.out.rename_metadata_vcf) + + // 3. Rename VCF header to specified 'normal' and 'tumor' names, add tool prefix to sampleIDs. + RENAME_VCF(MERGE_PREP.out.merge_prep_vcf) + + // 4. 
Compress and Index VCF: + COMPRESS_INDEX_VCF(RENAME_VCF.out.rename_vcf) + + // 5. Split out multi-allelic calls: + BCFTOOLS_SPLITMULTIALLELIC(COMPRESS_INDEX_VCF.out.compressed_vcf_tbi) + + // 6. Split MNV calls: + SPLIT_MNV(BCFTOOLS_SPLITMULTIALLELIC.out.vcf) + + // 7. Sort VCF: + GATK_SORTVCF_TOOLS(SPLIT_MNV.out.split_mnv_vcf) + + callers_for_merge = GATK_SORTVCF_TOOLS.out.vcf_tbi + .groupTuple(size: 5) + .map{sampleID, vcf, idx, meta, normal_sample, tumor_sample, tool_list -> tuple( sampleID, vcf, idx, meta.unique()[0] ) } + .combine(chrom_list_noY.flatten()) + // The above collects all callers on sampleID, then maps to avoid duplication of data and to drop the tool list, which is not needed anymore. + // Note that this could be done using 'by' in the groupTuple statement. However, the map is still required to remove the tool list. + // 'size: 5' corresponds to the 5 callers used in the workflow. If additional callers are added, this must be changed. + + // Merge Callers, Extract non-exonic calls and try to confirm those with Lancet, + // then prep confirmed calls for merged back to full merge set: + + // ** Make all caller merge set, and compress and index: + BCFTOOLS_MERGECALLERS(callers_for_merge) + COMPRESS_INDEX_VCF_ALL_CALLERS(BCFTOOLS_MERGECALLERS.out.vcf) + + // ** Extract non-exonic, and try to confirm with Lancet. + // 1. Intersect with '-v' against a list of exonic regions. This step subsets calls to non-exonic regions. + BEDTOOLS_STARTCANDIDATES(BCFTOOLS_MERGECALLERS.out.vcf) + + // 2. Get candidates from intersected, using rules outlined in get_candidates.py (script docs provided by original dev). + // Compress and index the resulting VCF. + GET_CANDIDATES(BEDTOOLS_STARTCANDIDATES.out.vcf) + COMPRESS_INDEX_VCF_REGION(GET_CANDIDATES.out.vcf) + + // 3. VCF to BED + VCF_TO_BED(GET_CANDIDATES.out.vcf) + + // 4. Confirm extracted calls with Lancet: + // Compress and index the resulting VCF. 
+ lancet_confirm_input = VCF_TO_BED.out.bed + .combine(ch_paired_samples, by: 0) + .map{sampleID, bed, meta, chrom, meta2, normal_bam, normal_bai, normal_name, tumor_bam, tumor_bai, tumor_name -> tuple( sampleID, bed, meta, normal_bam, normal_bai, normal_name, tumor_bam, tumor_bai, tumor_name, chrom ) } + // The above combines output by sampleID with BAM files. Then maps to avoid duplication of data, and set input tuples for the steps that follow. + // Note that "combine" here, combines each output stream from VCF_TO_BED with ch_paired_samples, keeping the scattered chrom separate. + + LANCET_CONFIRM(lancet_confirm_input) + COMPRESS_INDEX_VCF_REGION_LANCET(LANCET_CONFIRM.out.vcf) + + // 5. Intersect Lancet Confirm with candidate extractions. + candidate_lancet_intersect_input = COMPRESS_INDEX_VCF_REGION.out.compressed_vcf_tbi + .join(COMPRESS_INDEX_VCF_REGION_LANCET.out.compressed_vcf_tbi, by: [0,6]) + .map{sampleID, chrom, vcf, tbi, meta, empty_name, empty_name2, vcf2, tbi2, meta2, normal_name, tumor_name -> tuple( sampleID, vcf, tbi, vcf2, tbi2, meta, normal_name, tumor_name, chrom )} + // The above joins candidate VCF with Lancet Confirm VCF by sampleID and chrom. Then maps to avoid duplication of data, and set input tuples for the steps that follow. + // Note: A. The 'by' statement here, joins on sampleID and chrom, which correspond to index values 0 and 6 in the output tuples. + // B. 'empty_name' is used here because 'normal_name' and 'tumor_name' are not required/used in the candidate steps. + // C. 'normal_name' and 'tumor_name' are needed to match input tuple expectations for the steps that follow. 
+ + BCFTOOLS_INTERSECTVCFS(candidate_lancet_intersect_input) + + lancet_confirm_mergePrep_input = BCFTOOLS_INTERSECTVCFS.out.vcf.map{sampleID, vcf, index, meta, normal_name, tumor_name -> tuple(sampleID, vcf, index, meta, normal_name, tumor_name, 'lancet_support')} + // The above remaps the output tuple from BCFTOOLS_INTERSECTVCFS to include the tool name 'lancet_support', which is needed for the steps that follow. + // 'lancet_support' is used to trigger `--support` in the MERGE_PREP_LANCET statement. Logic is present in RENAME_VCF_LANCET to set the header to 'lancet' rather than 'lancet_support'. + + // ** Prep calls for merge back to all caller merge set. + // 1. Rename VCF header to include tool name: + RENAME_METADATA_LANCET(lancet_confirm_mergePrep_input) + + // 2. Order samples in VCF to 'normal', 'tumor' and prep for merge. + // See script for list of changes applied to the VCF: + // This step is done as `--support` + MERGE_PREP_LANCET(RENAME_METADATA_LANCET.out.rename_metadata_vcf) + + // 3. Rename VCF header to specified 'normal' and 'tumor' names, add tool prefix to sampleIDs. + RENAME_VCF_LANCET(MERGE_PREP_LANCET.out.merge_prep_vcf) + + // 4. Compress and Index VCF: + COMPRESS_INDEX_VCF_LANCET(RENAME_VCF_LANCET.out.rename_vcf) + + // 5. Split out multi-allelic calls: + BCFTOOLS_SPLITMULTIALLELIC_LANCET(COMPRESS_INDEX_VCF_LANCET.out.compressed_vcf_tbi) + + // 6. Split MNV calls: + SPLIT_MNV_LANCET(BCFTOOLS_SPLITMULTIALLELIC_LANCET.out.vcf) + + // 7. Remove contig descriptions: + REMOVE_CONTIG(SPLIT_MNV_LANCET.out.split_mnv_vcf) + + // 8. Sort VCF. + GATK_SORTVCF_TOOLS_LANCET(REMOVE_CONTIG.out.remove_contig_vcf) + + // ** Merge lancet confirmed back to all merged callers. Compress and index merged calls. 
+ allCalls_lancetConfirm_merge_input = COMPRESS_INDEX_VCF_ALL_CALLERS.out.compressed_vcf_tbi + .join(GATK_SORTVCF_TOOLS_LANCET.out.vcf_tbi, by: [0,6]) + .map{sampleID, chrom, vcf, tbi, meta, empty_name, empty_name2, vcf2, tbi2, meta2, normal_name, tumor_name -> tuple( sampleID, [vcf, vcf2], [tbi, tbi2], meta, chrom )} + // BCFTOOLS_MERGE requires an input tuple as follows: [val(sampleID), file(vcf), file(idx), val(meta), val(chrom)] + // Join the output streams on sampleID and chrom, and then map to the required tuple structure. Note that [vcf, vcf2] makes a list that is understood by the module. + + BCFTOOLS_MERGECALLERS_FINAL(allCalls_lancetConfirm_merge_input) + COMPRESS_INDEX_VCF_MERGED(BCFTOOLS_MERGECALLERS_FINAL.out.vcf) + + // ** Manipulation of VCF into final file to be passed to annotation modules. + // 1. Merge Columns. + // See script merge_columns.py for the three features used in merge (script docs provided by original dev). + MERGE_COLUMNS(COMPRESS_INDEX_VCF_MERGED.out.compressed_vcf_tbi) + + // 2. Add Allele Count to VCF. + // "Runs pileup on tumor and normal bam files to compute allele counts for bi-allelic SNV and Indel variants in VCF file and adds pileup format columns to the VCF file." + addAlleleCounts_confirm_input = MERGE_COLUMNS.out.mergeColumn_vcf + .combine(ch_paired_samples, by: 0) + .map{sampleID, vcf, meta, chrom, meta2, normal_bam, normal_bai, normal_name, tumor_bam, tumor_bai, tumor_name -> tuple( sampleID, vcf, meta, normal_bam, normal_bai, tumor_bam, tumor_bai, chrom ) } + ADD_NYGC_ALLELE_COUNTS(addAlleleCounts_confirm_input) + + // 3. Add Final Allele Counts to VCF + ADD_FINAL_ALLELE_COUNTS(ADD_NYGC_ALLELE_COUNTS.out.vcf) + + // 4. Filter VCF based on PON + FILTER_PON(ADD_FINAL_ALLELE_COUNTS.out.vcf) + + // 5. Filter VCF based on gnomad and "ALL_GRCh38_sites" + FILTER_VCF(FILTER_PON.out.vcf) + + // 6. 
"SnvstomnvsCountsbasedfilterAnnotatehighconf" + // Parses file and converts adjacent SNVs to MNVs if they have they match the MNV_ID and called_by fields. + SNV_TO_MNV_FINAL_FILTER(FILTER_VCF.out.vcf) + + // ** Collect and Merge Chroms. + num_intervals = file(params.chrom_contigs).countLines().toInteger() - 2 + // number of chrom intervals split on during the above steps. A 'value' variable used in groupTuple size statement. MT and Y are removed, hence '- 2' + chrom_merge_input = SNV_TO_MNV_FINAL_FILTER.out.vcf + .groupTuple(size: num_intervals) + .map{sampleID, vcf, meta, chrom -> tuple( sampleID, vcf, meta.unique()[0] ) } + // Collect scattered chroms, remap to tuple without chrom names. + + GATK_SORTVCF_SOMATIC(chrom_merge_input) + REORDER_VCF_COLUMNS(GATK_SORTVCF_SOMATIC.out.vcf_idx) + // output tuple = val(sampleID), path("*_mergePrep.vcf"), val(meta). + // meta = [patient:test, normal_id:test, tumor_id:test2, sex:XX, id:test2_vs_test] + // This named list can be accessed in the script section prior to """ via calls like: meta.patient + + // Compress and index the merged vcf + COMPRESS_INDEX_MERGED_VCF(REORDER_VCF_COLUMNS.out.vcf) + + // ** Annotation of somatic indels and snps + + VEP_SOMATIC(COMPRESS_INDEX_MERGED_VCF.out.compressed_vcf_tbi) + COSMIC_ANNOTATION_SOMATIC(VEP_SOMATIC.out.vcf) + COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC(COSMIC_ANNOTATION_SOMATIC.out.vcf) + + SNPSIFT_ANNOTATE_DBSNP_SOMATIC(COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC.out.vcf.map{it -> [it[0], it[1]]}, params.dbSNP, params.dbSNP_index, 'intermediate') + // note: existing module requires only sampleID and VCF. input remapped to required tuple. + + somatic_finalization_input = SNPSIFT_ANNOTATE_DBSNP_SOMATIC.out.vcf.join(COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC.out.vcf).map{it -> [it[0], it[1], it[3], it[4], it[5]]} + // re-join dbSNP ID annotated VCF output with [meta], normalID, tumorID. 
+ + SOMATIC_VCF_FINALIZATION(somatic_finalization_input, 'filtered') + + // ** Annotation of somatic CNV and SV + + ANNOTATE_BICSEQ2_CNV(bicseq2_calls, chrom_list_noY) + + // note: joining on the sampleID, metadata, tumor_name, and normal_name for + // safety. This re-arranges the values in the channel to: + // tuple val(sampleID), val(normal_name), val(tumor_name), file(manta_vcf), file(manta_vcf_tbi), val(meta_manta), val(manta), file(gridss_bgz), val(no_idx), val(meta_gripss), val(gridss) + // Downstream, just including sampleID, normal_name, and tumor_name to simplify a similar join that is necessary + + merge_sv_input = MANTA.out.manta_somaticsv_tbi.join(GRIPSS_SOMATIC_FILTER.out.gripss_filtered_bgz, by : [0,4,5]) + MERGE_SV(merge_sv_input, chrom_list) + + ANNOTATE_SV(MERGE_SV.out.merged, "main") + ANNOTATE_SV_SUPPLEMENTAL(MERGE_SV.out.merged_suppl, "supplemental") + ANNOTATE_GENES_SV(ANNOTATE_SV.out.annot_sv_bedpe, "main") + ANNOTATE_GENES_SV_SUPPLEMENTAL(ANNOTATE_SV_SUPPLEMENTAL.out.annot_sv_bedpe, "supplemental") + + // note: joining on the sampleID, normal_name, and tumor_name for + // safety. 
This re-arranges the values in the channel to: + // tuple val(sampleID), val(normal_name), val(tumor_name), file(bicseq_annot), file(annot_sv_genes_bedpe) + + annot_sv_cnv_input = ANNOTATE_BICSEQ2_CNV.out.bicseq_annot.join(ANNOTATE_GENES_SV.out.annot_sv_genes_bedpe, by: [0,2,3]) + ANNOTATE_SV_WITH_CNV(annot_sv_cnv_input, "main") + + // See notes on previous step + annot_sv_cnv_suppl_input = ANNOTATE_BICSEQ2_CNV.out.bicseq_annot.join(ANNOTATE_GENES_SV_SUPPLEMENTAL.out.annot_sv_genes_bedpe, by: [0,2,3]) + ANNOTATE_SV_WITH_CNV_SUPPLEMENTAL(annot_sv_cnv_suppl_input, "supplemental") + + FILTER_BEDPE(ANNOTATE_SV_WITH_CNV.out.sv_genes_cnv_bedpe, "main") + FILTER_BEDPE_SUPPLEMENTAL(ANNOTATE_SV_WITH_CNV_SUPPLEMENTAL.out.sv_genes_cnv_bedpe, "supplemental") + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTWGSMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CONPAIR.out.concordance.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CONPAIR.out.contamination.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + +} + +// Function to extract information (meta data + file(s)) from csv file(s) +// https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L1084 +def extract_csv(csv_file) { + + // check that the sample sheet is not 1 line or less, because it'll skip all subsequent checks if so. 
+ file(csv_file).withReader('UTF-8') { reader -> + def line, numberOfLinesInSampleSheet = 0; + while ((line = reader.readLine()) != null) {numberOfLinesInSampleSheet++} + if (numberOfLinesInSampleSheet < 2) { + log.error "Samplesheet had less than two lines. The sample sheet must be a csv file with a header, so at least two lines." + System.exit(1) + } + } + + // Additional check of sample sheet: + // 1. Each row should specify a lane and the same combination of patient, sample and lane shouldn't be present in different rows. (NOTE: this check is not implemented below; 'patient_sample_lane_combinations_in_samplesheet' is declared but never used.) + // 2. The same sample shouldn't be listed for different patients. + def patient_sample_lane_combinations_in_samplesheet = [] + def sample2patient = [:] + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!sample2patient.containsKey(row.sampleID.toString())) { + sample2patient[row.sampleID.toString()] = row.patient.toString() + } else if (sample2patient[row.sampleID.toString()] != row.patient.toString()) { + log.error('The sample "' + row.sampleID.toString() + '" is registered for both patient "' + row.patient.toString() + '" and "' + sample2patient[row.sampleID.toString()] + '" in the sample sheet.') + System.exit(1) + } + } + + sample_count_all = 0 + sample_count_normal = 0 + sample_count_tumor = 0 + + Channel.from(csv_file).splitCsv(header: true) + // Retrieves number of lanes by grouping together by patient and sample and counting how many entries there are for this combination + .map{ row -> + sample_count_all++ + if (!(row.patient && row.sampleID)){ + log.error "Missing field in csv file header. The csv file must have fields named 'patient' and 'sampleID'." 
+ System.exit(1) + } + [[row.patient.toString(), row.sampleID.toString()], row] + }.groupTuple() + .map{ meta, rows -> + def size = rows.size() // 'def' keeps this local to the closure instead of creating a shared script-level variable + [rows, size] + }.transpose() + .map{ row, numLanes -> // from here do the usual thing for csv parsing + + def meta = [:] + + // Meta data to identify samplesheet + // Both patient and sample are mandatory + // Several samples can belong to the same patient + // Sample should be unique for the patient + if (row.patient) meta.patient = row.patient.toString() + if (row.sampleID) meta.sampleID = row.sampleID.toString() + + // If no sex specified, sex is not considered + // sex is only mandatory for somatic CNV + if (row.sex) meta.sex = row.sex.toString() + else meta.sex = 'NA' + + // If no status specified, sample is assumed normal + if (row.status) meta.status = row.status.toInteger() + else meta.status = 0 + + if (meta.status == 0) sample_count_normal++ + else sample_count_tumor++ + + // join meta to fastq + if (row.fastq_2) { + meta.id = "${row.patient}--${row.sampleID}".toString() + def fastq_1 = file(row.fastq_1, checkIfExists: true) + def fastq_2 = file(row.fastq_2, checkIfExists: true) + + meta.size = 1 // default number of split fastq + + return [meta.id, meta, [fastq_1, fastq_2]] + + } else { + log.error "Missing or unknown field in csv file header. 
Please check your samplesheet" + System.exit(1) + } + } +} \ No newline at end of file diff --git a/workflows/rna_fusion.nf b/workflows/rna_fusion.nf new file mode 100644 index 00000000..776f2361 --- /dev/null +++ b/workflows/rna_fusion.nf @@ -0,0 +1,170 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/rna_fusion.nf" +include {param_log} from "${projectDir}/bin/log/rna_fusion.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" +include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" +include {GUNZIP} from "${projectDir}/modules/utility_modules/gunzip" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT as FASTQ_SORT_HUMAN; + FASTQ_SORT as FASTQ_SORT_MOUSE} from "${projectDir}/modules/fastq-tools/fastq-sort" +include {STAR_ALIGN as STAR_ARRIBA; + STAR_ALIGN as STAR_SQUID; + STAR_ALIGN as STAR_STARFUSION} from "${projectDir}/modules/star/star_align" +include {SAMTOOLS_SORT as SORT_ARRIBA; + SAMTOOLS_SORT as SORT_SQUID} from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_INDEX as INDEX_ARRIBA} from "${projectDir}/modules/samtools/samtools_index" +include {ARRIBA} from "${projectDir}/modules/arriba/arriba" +include {FUSIONCATCHER} from "${projectDir}/modules/fusioncatcher/fusioncatcher" +include {JAFFA} from "${projectDir}/modules/jaffa/jaffa" +include {KALLISTO_QUANT} from "${projectDir}/modules/kallisto/kallisto_quant" +include {KALLISTO_INSERT_SIZE} from "${projectDir}/modules/kallisto/kallisto_insert_size" +include {PIZZLY} from "${projectDir}/modules/pizzly/pizzly" +include {SQUID} from "${projectDir}/modules/squid/squid_call" +include 
{SQUID_ANNOTATE} from "${projectDir}/modules/squid/squid_annotate" +include {SAMTOOLS_VIEW as SAMTOOLS_VIEW_SQUID} from "${projectDir}/modules/samtools/samtools_view" +include {STAR_FUSION as STAR_FUSION} from "${projectDir}/modules/star-fusion/star-fusion" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {FUSION_REPORT} from "${projectDir}/modules/fusion_report/fusion_report" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// log params +param_log() + +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +if (params.pdx && params.gen_org == 'mouse') { + exit 1, "PDX analysis was specified with `--pdx`. `--gen_org` was set to: ${params.gen_org}. This is an invalid parameter combination. `--gen_org` must == 'human' for PDX analysis." +} + +if (params.gen_org == 'mouse') { + exit 1, "This pipeline currently only supports human data analysis." +} + +if (params.read_type == 'SE') { + exit 1, "This pipeline supports only paired end data." 
+} + +// prepare reads channel +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + +} else if (params.concat_lanes){ + + read_ch = Channel + .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkIfExists:true, flat:true ) + .map { file, file1, file2 -> tuple(getLibraryId(file), file1, file2) } + .groupTuple() + + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} else { + + read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkIfExists:true ) + + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} + +// main workflow +workflow RNA_FUSION { + + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 0: Concat local Fastq files if required. 
+ if (params.concat_lanes && !params.csv_input){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } + + GUNZIP(read_ch) + + FASTQC(GUNZIP.out.gunzip_fastq) + + // Step 1a: Xenome if PDX data used. + ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + if (params.pdx){ + // Xenome Classification + XENOME_CLASSIFY(GUNZIP.out.gunzip_fastq) + ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats //set log file for multiqc + + // Xenome Read Sort + FASTQ_SORT_HUMAN(XENOME_CLASSIFY.out.xenome_fastq, 'human') + FASTQ_SORT_MOUSE(XENOME_CLASSIFY.out.xenome_mouse_fastq, 'mouse') + fusion_tool_input = FASTQ_SORT_HUMAN.out.sorted_fastq + + } else { + fusion_tool_input = GUNZIP.out.gunzip_fastq + } + + // Step 3: Callers: + // arriba + STAR_ARRIBA(fusion_tool_input, params.arriba_star_args, params.gencode_gtf) + SORT_ARRIBA(STAR_ARRIBA.out.bam, '-O bam', 'bam') + INDEX_ARRIBA(SORT_ARRIBA.out.sorted_file) + arriba_input = SORT_ARRIBA.out.sorted_file.join(INDEX_ARRIBA.out.bai) + ARRIBA(arriba_input, params.gencode_gtf) + + // fusioncatcher + FUSIONCATCHER(fusion_tool_input) + + // jaffa + JAFFA(fusion_tool_input) + + // pizzly + KALLISTO_QUANT(fusion_tool_input) + KALLISTO_INSERT_SIZE(KALLISTO_QUANT.out.kallisto_abundance) + pizzly_input = KALLISTO_QUANT.out.kallisto_fusions.join(KALLISTO_INSERT_SIZE.out.kallisto_insert_size) + PIZZLY(pizzly_input, params.ensembl_gtf) + + // squid + STAR_SQUID(fusion_tool_input, params.squid_star_args, params.gencode_gtf) + SAMTOOLS_VIEW_SQUID(STAR_SQUID.out.sam, '-Sb', '_chimeric') // NOTE: The sam file from STAR_SQUID contains chimeric reads. Per STAR passed arguments. 
+ SORT_SQUID(SAMTOOLS_VIEW_SQUID.out.bam, '-O bam', 'bam') + squid_input = STAR_SQUID.out.bam_sorted.join(SORT_SQUID.out.sorted_file ) + SQUID(squid_input) + SQUID_ANNOTATE(SQUID.out.squid_fusions, params.gencode_gtf) + + // star-fusion + STAR_FUSION(fusion_tool_input) + + // Step 4: Fusion Reporter + fusion_report_input = ARRIBA.out.arriba_fusions.join(FUSIONCATCHER.out.fusioncatcher_fusions).join(JAFFA.out.jaffa_fusions).join(PIZZLY.out.pizzly_fusions).join(SQUID_ANNOTATE.out.squid_fusions_annotated).join(STAR_FUSION.out.star_fusion_fusions) + FUSION_REPORT(fusion_report_input) + + // Step 5: MultiQC + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(FUSION_REPORT.out.summary_fusions_mq.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) +} diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index a782f292..da0a6543 100644 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -5,17 +5,24 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/rnaseq" include {param_log} from "${projectDir}/bin/log/rnaseq" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {GET_READ_LENGTH} from "${projectDir}/modules/utility_modules/get_read_length" +include {PDX_RNASEQ} from "${projectDir}/subworkflows/pdx_rnaseq" +include {JAX_TRIMMER} from 
"${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {CHECK_STRANDEDNESS} from "${projectDir}/modules/python/python_check_strandedness" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" -include {RNA_SUMMARY_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_rna" -include {BAMTOOLS_STATS} from "${projectDir}/modules/bamtools/bamtools_stats" include {RSEM_ALIGNMENT_EXPRESSION} from "${projectDir}/modules/rsem/rsem_alignment_expression" -include {QUALITY_STATISTICS} from "${projectDir}/modules/utility_modules/quality_stats" include {PICARD_ADDORREPLACEREADGROUPS} from "${projectDir}/modules/picard/picard_addorreplacereadgroups" include {PICARD_REORDERSAM} from "${projectDir}/modules/picard/picard_reordersam" -include {PICARD_COLLECTRNASEQMETRICS} from "${projectDir}/modules/picard/picard_collectrnaseqmetrics" include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_COLLECTRNASEQMETRICS} from "${projectDir}/modules/picard/picard_collectrnaseqmetrics" +include {RNA_SUMMARY_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_rna" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" // help if needed if (params.help){ @@ -26,8 +33,29 @@ if (params.help){ // log paramiter info param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +if (params.pdx && params.gen_org == 'mouse') { + exit 1, "PDX analysis was specified with `--pdx`. `--gen_org` was set to: ${params.gen_org}. This is an invalid parameter combination. `--gen_org` must == 'human' for PDX analysis." 
+} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -40,65 +68,104 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - -// downstream resources (only load once so do it here) -if (params.rsem_aligner == "bowtie2") { - rsem_ref_files = file("${params.rsem_ref_files}/bowtie2/*") -} -else if (params.rsem_aligner == "star") { - rsem_ref_files = file("${params.rsem_ref_files}/STAR/${params.rsem_star_prefix}/*") -} -else error 
"${params.rsem_aligner} is not valid, use 'bowtie2' or 'star'" - // main workflow workflow RNASEQ { - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } } + + // ** MAIN workflow starts: - // Step 1: Qual_Stat - QUALITY_STATISTICS(read_ch) + // If samples are PDX, run the PDX RNAseq workflow. + // Otherwise, run the standard workflow. 
- // Step 2: RSEM - RSEM_ALIGNMENT_EXPRESSION(QUALITY_STATISTICS.out.trimmed_fastq, rsem_ref_files) + if (params.pdx){ + + PDX_RNASEQ(read_ch) - //Step 3: Get Read Group Information - READ_GROUPS(QUALITY_STATISTICS.out.trimmed_fastq, "picard") + } else { - // Step 4: Picard Alignment Metrics - add_replace_groups = READ_GROUPS.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION.out.bam) - PICARD_ADDORREPLACEREADGROUPS(add_replace_groups) + // Step 1: Qual_Stat + JAX_TRIMMER(read_ch) + + GET_READ_LENGTH(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) - PICARD_REORDERSAM(PICARD_ADDORREPLACEREADGROUPS.out.bam) + // Check strand setting + CHECK_STRANDEDNESS(JAX_TRIMMER.out.trimmed_fastq) - // Step 5: Picard Alignment Metrics - PICARD_SORTSAM(PICARD_REORDERSAM.out.bam) - // need to sort out ref_flat and ribo_intervals (may break mouse now) - PICARD_COLLECTRNASEQMETRICS(PICARD_SORTSAM.out.bam) + rsem_input = JAX_TRIMMER.out.trimmed_fastq.join(CHECK_STRANDEDNESS.out.strand_setting).join(GET_READ_LENGTH.out.read_length) - // Step 6: Summary Stats + // Step 2: RSEM + RSEM_ALIGNMENT_EXPRESSION(rsem_input, params.rsem_ref_files, params.rsem_star_prefix, params.rsem_ref_prefix) - agg_stats = RSEM_ALIGNMENT_EXPRESSION.out.rsem_stats.join(QUALITY_STATISTICS.out.quality_stats).join(PICARD_COLLECTRNASEQMETRICS.out.picard_metrics) + //Step 3: Get Read Group Information + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "picard") - RNA_SUMMARY_STATS(agg_stats) + // Step 4: Picard Alignment Metrics + add_replace_groups = READ_GROUPS.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION.out.bam) + PICARD_ADDORREPLACEREADGROUPS(add_replace_groups) + PICARD_REORDERSAM(PICARD_ADDORREPLACEREADGROUPS.out.bam, params.picard_dict) + + // Step 5: Picard Alignment Metrics + PICARD_SORTSAM(PICARD_REORDERSAM.out.bam) + + PICARD_COLLECTRNASEQMETRICS(PICARD_SORTSAM.out.bam.join(CHECK_STRANDEDNESS.out.strand_setting), params.ref_flat, params.ribo_intervals) + + // Step 6: Summary Stats + + agg_stats = 
RSEM_ALIGNMENT_EXPRESSION.out.rsem_stats.join(JAX_TRIMMER.out.quality_stats).join(PICARD_COLLECTRNASEQMETRICS.out.picard_metrics) + + RNA_SUMMARY_STATS(agg_stats) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION.out.rsem_cnt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTRNASEQMETRICS.out.picard_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + } } diff --git a/workflows/rrbs.nf b/workflows/rrbs.nf index 13e391bb..849b4b17 100644 --- a/workflows/rrbs.nf +++ b/workflows/rrbs.nf @@ -4,11 +4,16 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/rrbs" include {param_log} from "${projectDir}/bin/log/rrbs" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {TRIM_GALORE} from "${projectDir}/modules/trim_galore/trim_galore" include {BISMARK_ALIGNMENT} from "${projectDir}/modules/bismark/bismark_alignment" +include {SAMTOOLS_SORT} from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_INDEX} from "${projectDir}/modules/samtools/samtools_index" include {BISMARK_DEDUPLICATION} from "${projectDir}/modules/bismark/bismark_deduplication" include {BISMARK_METHYLATION_EXTRACTION} from 
"${projectDir}/modules/bismark/bismark_methylation_extraction" include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" @@ -22,8 +27,25 @@ if (params.help){ // log paramiter info param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -36,38 +58,61 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if 
channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - // main workflow workflow RRBS { + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} } + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + FASTQC(read_ch) + // Note: fastqc is run prior to trimming, as trim galor outputs fastqc level data. 
TRIM_GALORE(read_ch) BISMARK_ALIGNMENT(TRIM_GALORE.out.trimmed_fastq) + SAMTOOLS_SORT(BISMARK_ALIGNMENT.out.bam, '-O bam', 'bam') + SAMTOOLS_INDEX(SAMTOOLS_SORT.out.sorted_file) + ch_BISMARK_DEDUPLICATION_multiqc = Channel.empty() if (params.skip_deduplication) { diff --git a/workflows/wes.nf b/workflows/wes.nf index 1644f84c..1be5c199 100755 --- a/workflows/wes.nf +++ b/workflows/wes.nf @@ -5,41 +5,47 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/wes.nf" include {param_log} from "${projectDir}/bin/log/wes.nf" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" -include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" -include {SAMTOOLS_INDEX} from "${projectDir}/modules/samtools/samtools_index" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" -include {QUALITY_STATISTICS} from "${projectDir}/modules/utility_modules/quality_stats" -include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wes" -include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; - SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; - SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; - SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" 
include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" include {PICARD_COLLECTHSMETRICS} from "${projectDir}/modules/picard/picard_collecthsmetrics" +include {GATK_HAPLOTYPECALLER; + GATK_HAPLOTYPECALLER as GATK_HAPLOTYPECALLER_GVCF} from "${projectDir}/modules/gatk/gatk_haplotypecaller" +include {GATK_VARIANTFILTRATION; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" +include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; + GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" +include {GATK_MERGEVCF as GATK_MERGEVCF_UNANNOTATED; + GATK_MERGEVCF as GATK_MERGEVCF_ANNOTATED} from "${projectDir}/modules/gatk/gatk_mergevcf" +include {GATK_INDEXFEATUREFILE} from "${projectDir}/modules/gatk/gatk_indexfeaturefile" +include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_DBSNP; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" include {SNPEFF; SNPEFF as SNPEFF_SNP; SNPEFF as SNPEFF_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_snpeff" -include {SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; +include {SNPEFF_ONEPERLINE; + SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_oneperline" include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" include {SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_SNP; SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_INDEL} from 
"${projectDir}/modules/snpeff_snpsift/snpsift_dbnsfp" -include {GATK_HAPLOTYPECALLER; - GATK_HAPLOTYPECALLER as GATK_HAPLOTYPECALLER_GVCF} from "${projectDir}/modules/gatk/gatk_haplotypecaller" -include {GATK_INDEXFEATUREFILE} from "${projectDir}/modules/gatk/gatk_indexfeaturefile" -include {GATK_VARIANTFILTRATION; - GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; - GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" -include {GATK_VARIANTANNOTATOR} from "${projectDir}/modules/gatk/gatk_variantannotator" -include {GATK_MERGEVCF} from "${projectDir}/modules/gatk/gatk_mergevcf" -include {GATK_SELECTVARIANTS; - GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; - GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" -include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" -include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wes" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + // help if needed if (params.help){ @@ -50,8 +56,25 @@ if (params.help){ // log params param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." 
+} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -64,39 +87,60 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - // main workflow workflow WES { - // Step 0: Concatenate Fastq files if required. 
- if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } } // Step 1: Qual_Stat - QUALITY_STATISTICS(read_ch) + JAX_TRIMMER(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) // Step 2: Get Read Group Information - READ_GROUPS(QUALITY_STATISTICS.out.trimmed_fastq, "gatk") + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") // Step 3: BWA-MEM Alignment - bwa_mem_mapping = QUALITY_STATISTICS.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + bwa_mem_mapping = JAX_TRIMMER.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + BWA_MEM(bwa_mem_mapping) // Step 4: Variant Preprocessing - Part 1 @@ -104,10 +148,12 @@ workflow WES { PICARD_MARKDUPLICATES(PICARD_SORTSAM.out.bam) // If Human: Step 5-10 + ch_GATK_BASERECALIBRATOR_multiqc = Channel.empty() //optional log file for human only. 
if (params.gen_org=='human'){ // Step 5: Variant Pre-Processing - Part 2 GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) + ch_GATK_BASERECALIBRATOR_multiqc = GATK_BASERECALIBRATOR.out.table // set log file for multiqc apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) GATK_APPLYBQSR(apply_bqsr) @@ -120,20 +166,22 @@ workflow WES { haplotype_caller = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) GATK_HAPLOTYPECALLER(haplotype_caller, 'variant') - haplotype_caller_gvcf = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) - GATK_HAPLOTYPECALLER_GVCF(haplotype_caller_gvcf, 'gvcf') + if (params.run_gvcf) { + haplotype_caller_gvcf = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) + GATK_HAPLOTYPECALLER_GVCF(haplotype_caller_gvcf, 'gvcf') + } // Step 8: Variant Filtration // SNP select_var_snp = GATK_HAPLOTYPECALLER.out.vcf.join(GATK_HAPLOTYPECALLER.out.idx) - GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP') + GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP', 'selected_SNP') var_filter_snp = GATK_SELECTVARIANTS_SNP.out.vcf.join(GATK_SELECTVARIANTS_SNP.out.idx) GATK_VARIANTFILTRATION_SNP(var_filter_snp, 'SNP') // INDEL select_var_indel = GATK_HAPLOTYPECALLER.out.vcf.join(GATK_HAPLOTYPECALLER.out.idx) - GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL') + GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL', 'selected_INDEL') var_filter_indel = GATK_SELECTVARIANTS_INDEL.out.vcf.join(GATK_SELECTVARIANTS_INDEL.out.idx) GATK_VARIANTFILTRATION_INDEL(var_filter_indel, 'INDEL') @@ -154,10 +202,13 @@ workflow WES { SNPEFF_ONEPERLINE_INDEL(SNPSIFT_DBNSFP_INDEL.out.vcf, 'INDEL') // Step 10: Post Variant Calling Processing - Part 2 - vcf_files = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) - GATK_MERGEVCF(vcf_files) + vcf_files_unannotated = SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf) + GATK_MERGEVCF_UNANNOTATED (vcf_files_unannotated, 
'SNP_INDEL_filtered_unannotated_final') + + vcf_files_annotated = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) + GATK_MERGEVCF_ANNOTATED(vcf_files_annotated, 'SNP_INDEL_filtered_annotated_final') - SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF.out.vcf) + SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF_ANNOTATED.out.vcf) } else if (params.gen_org=='mouse'){ @@ -165,28 +216,56 @@ workflow WES { collecths_metric = PICARD_MARKDUPLICATES.out.dedup_bam.join(PICARD_MARKDUPLICATES.out.dedup_bai) PICARD_COLLECTHSMETRICS(collecths_metric) - // Step 7: Variant Calling haplotype_caller = PICARD_MARKDUPLICATES.out.dedup_bam.join(PICARD_MARKDUPLICATES.out.dedup_bai) GATK_HAPLOTYPECALLER(haplotype_caller, 'variant') - + + if (params.run_gvcf) { + haplotype_caller_gvcf = PICARD_MARKDUPLICATES.out.dedup_bam.join(PICARD_MARKDUPLICATES.out.dedup_bai) + GATK_HAPLOTYPECALLER_GVCF(haplotype_caller_gvcf, 'gvcf') + } + // Step 8: Variant Filtration - var_filter = GATK_HAPLOTYPECALLER.out.vcf.join(GATK_HAPLOTYPECALLER.out.idx) + + SNPSIFT_ANNOTATE_DBSNP(GATK_HAPLOTYPECALLER.out.vcf, params.dbSNP, params.dbSNP_index, 'intermediate') + + GATK_INDEXFEATUREFILE(SNPSIFT_ANNOTATE_DBSNP.out.vcf) + + var_filter = SNPSIFT_ANNOTATE_DBSNP.out.vcf.join(GATK_INDEXFEATUREFILE.out.idx) + GATK_VARIANTFILTRATION(var_filter, 'BOTH') + // SNP for final save + select_var_snp = GATK_VARIANTFILTRATION.out.vcf.join(GATK_VARIANTFILTRATION.out.idx) + GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP', 'SNP_filtered_dbsnpID') + + // INDEL for final save + select_var_indel = GATK_VARIANTFILTRATION.out.vcf.join(GATK_VARIANTFILTRATION.out.idx) + GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL', 'INDEL_filtered_dbsnpID') + // Step 9: Post Variant Calling Processing - SNPEFF(GATK_VARIANTFILTRATION.out.vcf, 'BOTH', 'gatk') + SNPEFF(GATK_VARIANTFILTRATION.out.vcf, 'BOTH', 'vcf') - merged_vcf_files = GATK_VARIANTFILTRATION.out.vcf.join(SNPEFF.out.vcf) - GATK_VARIANTANNOTATOR(merged_vcf_files) + 
SNPEFF_ONEPERLINE(SNPEFF.out.vcf, 'BOTH') - SNPSIFT_EXTRACTFIELDS(GATK_VARIANTANNOTATOR.out.vcf) + SNPSIFT_EXTRACTFIELDS(SNPEFF_ONEPERLINE.out.vcf) } - agg_stats = QUALITY_STATISTICS.out.quality_stats.join(PICARD_COLLECTHSMETRICS.out.hsmetrics).join(PICARD_MARKDUPLICATES.out.dedup_metrics) + agg_stats = JAX_TRIMMER.out.quality_stats.join(PICARD_COLLECTHSMETRICS.out.hsmetrics).join(PICARD_MARKDUPLICATES.out.dedup_metrics) // Step 11: Aggregate Stats AGGREGATE_STATS(agg_stats) + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_GATK_BASERECALIBRATOR_multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTHSMETRICS.out.hsmetrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + } diff --git a/workflows/wgs.nf b/workflows/wgs.nf index b23a93b7..c6030269 100644 --- a/workflows/wgs.nf +++ b/workflows/wgs.nf @@ -5,42 +5,49 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/wgs.nf" include {param_log} from "${projectDir}/bin/log/wgs.nf" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from 
"${projectDir}/modules/fastqc/fastqc" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" include {BWA_MEM_HLA} from "${projectDir}/modules/bwa/bwa_mem_hla" +include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include {PICARD_COLLECTALIGNMENTSUMMARYMETRICS} from "${projectDir}/modules/picard/picard_collectalignmentsummarymetrics" +include {PICARD_COLLECTWGSMETRICS} from "${projectDir}/modules/picard/picard_collectwgsmetrics" +include {GATK_HAPLOTYPECALLER_INTERVAL; + GATK_HAPLOTYPECALLER_INTERVAL as GATK_HAPLOTYPECALLER_INTERVAL_GVCF} from "${projectDir}/modules/gatk/gatk_haplotypecaller_interval" +include {MAKE_VCF_LIST} from "${projectDir}/modules/utility_modules/make_vcf_list" +include {GATK_MERGEVCF_LIST} from "${projectDir}/modules/gatk/gatk_mergevcf_list" +include {GATK_COMBINEGVCFS} from "${projectDir}/modules/gatk/gatk_combinegvcfs" +include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; + GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" +include {GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" +include {GATK_MERGEVCF; + GATK_MERGEVCF as GATK_MERGEVCF_UNANNOTATED; + GATK_MERGEVCF as GATK_MERGEVCF_ANNOTATED} from "${projectDir}/modules/gatk/gatk_mergevcf" include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" 
-include {VCF_ANNOTATE as VCF_ANNOTATE_SNP; - VCF_ANNOTATE as VCF_ANNOTATE_INDEL} from "${projectDir}/modules/vcftools/vcf_annotate" include {SNPEFF; SNPEFF as SNPEFF_SNP; SNPEFF as SNPEFF_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_snpeff" -include {SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; +include {SNPEFF_ONEPERLINE; + SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_oneperline" -include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" include {SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_SNP; SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpsift_dbnsfp" +include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wgs" -include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" -include {QUALITY_STATISTICS} from "${projectDir}/modules/utility_modules/quality_stats" -include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" -include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" -include {PICARD_COLLECTALIGNMENTSUMMARYMETRICS} from "${projectDir}/modules/picard/picard_collectalignmentsummarymetrics" -include {PICARD_COLLECTWGSMETRICS} from "${projectDir}/modules/picard/picard_collectwgsmetrics" -include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" -include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" -include {GATK_MERGEVCF} from "${projectDir}/modules/gatk/gatk_mergevcf" -include {GATK_MERGEVCF_LIST} from "${projectDir}/modules/gatk/gatk_mergevcf_list" -include {GATK_VARIANTANNOTATOR} from "${projectDir}/modules/gatk/gatk_variantannotator" -include {GATK_HAPLOTYPECALLER_INTERVAL} from "${projectDir}/modules/gatk/gatk_haplotypecaller_interval" 
-include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; - GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" -include {GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; - GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" -include {MAKE_VCF_LIST} from "${projectDir}/modules/utility_modules/make_vcf_list" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" // help if needed if (params.help){ @@ -51,8 +58,25 @@ if (params.help){ // log params param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -65,37 +89,58 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = 
Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - // main workflow workflow WGS { - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. 
+ if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + // Step 1: Qual_Stat - QUALITY_STATISTICS(read_ch) + JAX_TRIMMER(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) // Step 2: Get Read Group Information - READ_GROUPS(QUALITY_STATISTICS.out.trimmed_fastq, "gatk") + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") - bwa_mem_mapping = QUALITY_STATISTICS.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + bwa_mem_mapping = JAX_TRIMMER.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) // Step 3: BWA-MEM Alignment if (params.gen_org=='mouse'){ @@ -111,9 +156,11 @@ workflow WGS { PICARD_MARKDUPLICATES(PICARD_SORTSAM.out.bam) // If Human + ch_GATK_BASERECALIBRATOR_multiqc = Channel.empty() //optional log file for human only. if (params.gen_org=='human'){ GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) - + ch_GATK_BASERECALIBRATOR_multiqc = GATK_BASERECALIBRATOR.out.table // set log file for multiqc + apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) GATK_APPLYBQSR(apply_bqsr) @@ -131,15 +178,25 @@ workflow WGS { .splitText() .map{it -> it.trim()} + num_chroms = file(params.chrom_contigs).countLines().toInteger() + // number of intervals split on during calling. A 'value' variable used in groupTuple size statement. 
+ // Applies scatter intervals from above to the BQSR bam file chrom_channel = data.combine(chroms) // Use the Channel in HaplotypeCaller - GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel) + GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel, '') // Gather intervals from scattered HaplotypeCaller operations into one // common stream for output - MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(),chroms.toList()) + + MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(size: num_chroms),chroms.toList()) GATK_MERGEVCF_LIST(MAKE_VCF_LIST.out.list) + + if (params.run_gvcf) { + // Use the Channel in HaplotypeCaller_GVCF + GATK_HAPLOTYPECALLER_INTERVAL_GVCF(chrom_channel,'gvcf') + GATK_COMBINEGVCFS(GATK_HAPLOTYPECALLER_INTERVAL_GVCF.out.vcf.groupTuple(size: num_chroms)) + } } // If Mouse @@ -157,81 +214,98 @@ workflow WGS { .splitText() .map{it -> it.trim()} + num_chroms = file(params.chrom_contigs).countLines().toInteger() + // number of intervals split on during calling. A 'value' variable used in groupTuple size statement. 
+ // Applies scatter intervals from above to the BQSR bam file chrom_channel = data.combine(chroms) // Use the Channel in HaplotypeCaller - GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel) + GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel, '') // Gather intervals from scattered HaplotypeCaller operations into one // common stream for output - MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(), chroms.toList()) + + + MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(size: num_chroms), chroms.toList()) // Sort VCF within MAKE_VCF_LIST GATK_MERGEVCF_LIST(MAKE_VCF_LIST.out.list) - } + if (params.run_gvcf) { + // Use the Channel in HaplotypeCaller_GVCF + GATK_HAPLOTYPECALLER_INTERVAL_GVCF(chrom_channel,'gvcf') + GATK_COMBINEGVCFS(GATK_HAPLOTYPECALLER_INTERVAL_GVCF.out.vcf.groupTuple(size: num_chroms)) + } + } // SNP select_var_snp = GATK_MERGEVCF_LIST.out.vcf.join(GATK_MERGEVCF_LIST.out.idx) - GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP') + GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP', 'selected_SNP') var_filter_snp = GATK_SELECTVARIANTS_SNP.out.vcf.join(GATK_SELECTVARIANTS_SNP.out.idx) GATK_VARIANTFILTRATION_SNP(var_filter_snp, 'SNP') // INDEL select_var_indel = GATK_MERGEVCF_LIST.out.vcf.join(GATK_MERGEVCF_LIST.out.idx) - GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL') + GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL', 'selected_INDEL') var_filter_indel = GATK_SELECTVARIANTS_INDEL.out.vcf.join(GATK_SELECTVARIANTS_INDEL.out.idx) GATK_VARIANTFILTRATION_INDEL(var_filter_indel, 'INDEL') - // Cat Output to vcf-annotate* and add dbSNP annotations. 
- VCF_ANNOTATE_SNP(GATK_VARIANTFILTRATION_SNP.out.vcf, 'SNP') - VCF_ANNOTATE_INDEL(GATK_VARIANTFILTRATION_INDEL.out.vcf, 'INDEL') - -// Final Post-Processing Steps Differ for Human and Mouse + SNPSIFT_ANNOTATE_SNP_DBSNP(GATK_VARIANTFILTRATION_SNP.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + SNPSIFT_ANNOTATE_INDEL_DBSNP(GATK_VARIANTFILTRATION_INDEL.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') // If Human if (params.gen_org=='human'){ // SNP - SNPSIFT_ANNOTATE_SNP_DBSNP(VCF_ANNOTATE_SNP.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') SNPSIFT_ANNOTATE_SNP_COSMIC(SNPSIFT_ANNOTATE_SNP_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') SNPEFF_SNP(SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf, 'SNP', 'vcf') SNPSIFT_DBNSFP_SNP(SNPEFF_SNP.out.vcf, 'SNP') SNPEFF_ONEPERLINE_SNP(SNPSIFT_DBNSFP_SNP.out.vcf, 'SNP') // INDEL - SNPSIFT_ANNOTATE_INDEL_DBSNP(VCF_ANNOTATE_INDEL.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') SNPSIFT_ANNOTATE_INDEL_COSMIC(SNPSIFT_ANNOTATE_INDEL_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') SNPEFF_INDEL(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf, 'INDEL', 'vcf') SNPSIFT_DBNSFP_INDEL(SNPEFF_INDEL.out.vcf, 'INDEL') SNPEFF_ONEPERLINE_INDEL(SNPSIFT_DBNSFP_INDEL.out.vcf, 'INDEL') // Merge SNP and INDEL and Aggregate Stats - vcf_files = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) - GATK_MERGEVCF(vcf_files) + vcf_files_unannotated = SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf) + GATK_MERGEVCF_UNANNOTATED(vcf_files_unannotated, 'SNP_INDEL_filtered_unannotated_final') - SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF.out.vcf) + vcf_files_annotated = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) + GATK_MERGEVCF_ANNOTATED(vcf_files_annotated, 'SNP_INDEL_filtered_annotated_final') + + SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF_ANNOTATED.out.vcf) } // If Mouse if (params.gen_org=='mouse'){ // Merge SNP and INDEL - vcf_files = 
VCF_ANNOTATE_SNP.out.vcf.join(VCF_ANNOTATE_INDEL.out.vcf) - - GATK_MERGEVCF(vcf_files) + vcf_files = SNPSIFT_ANNOTATE_SNP_DBSNP.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_DBSNP.out.vcf) - SNPEFF(GATK_MERGEVCF.out.vcf, 'BOTH', 'gatk') + GATK_MERGEVCF(vcf_files, 'SNP_INDEL_filtered_unannotated_final') - merged_vcf_files = GATK_MERGEVCF.out.vcf.join(SNPEFF.out.vcf) + SNPEFF(GATK_MERGEVCF.out.vcf, 'BOTH', 'vcf') - GATK_VARIANTANNOTATOR(merged_vcf_files) - - SNPSIFT_EXTRACTFIELDS(GATK_VARIANTANNOTATOR.out.vcf) + SNPEFF_ONEPERLINE(SNPEFF.out.vcf, 'BOTH') + SNPSIFT_EXTRACTFIELDS(SNPEFF_ONEPERLINE.out.vcf) } - agg_stats = QUALITY_STATISTICS.out.quality_stats.join(PICARD_MARKDUPLICATES.out.dedup_metrics).join(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt).join(PICARD_COLLECTWGSMETRICS.out.txt) + agg_stats = JAX_TRIMMER.out.quality_stats.join(PICARD_MARKDUPLICATES.out.dedup_metrics).join(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt).join(PICARD_COLLECTWGSMETRICS.out.txt) - // may replace with multiqc AGGREGATE_STATS(agg_stats) + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_GATK_BASERECALIBRATOR_multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTWGSMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + + }