diff --git a/.gitignore b/.gitignore index 29e6c952..6bfdf18e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,7 @@ work/ *trace* *.out .DS_Store +design.csv +sv_input.csv +test.csv +test2.csv diff --git a/ReleaseNotes.md b/ReleaseNotes.md index cd57937a..9b40003b 100644 --- a/ReleaseNotes.md +++ b/ReleaseNotes.md @@ -1,5 +1,194 @@ # RELEASE NOTES +## Release 0.3.0 + +In this major release we have added two additional pipelines, added flexibility for specifying inputs via sample sheets, support for downloading remote input data, support for GRCm39, support for PDX data, and many more changes detailed below. Additionally, we have added the concept of "subworkflows" for tasks that are more complex than a module and/or involve multiple containers, yet can be potentially re-used in multiple pipelines. + +### Pipelines Added: + +1. ChIP-seq - human, mouse +2. Paired Tumor Analysis (somatic/germline WGS) - human, PDX + +### Subworkflows Added: + +1. Aria download for remote input data +2. Concatenate paired tumor/normal FASTQ files +3. RNA-seq for PDX input data + +### Modules Added: + +1. arriba/arriba.nf +2. bamtools/bamtools_filter.nf +3. bcftools/bcftools_germline_filter.nf +4. bcftools/bcftools_intersect_lancet_candidates.nf +5. bcftools/bcftools_merge_callers.nf +6. bcftools/bcftools_remove_spanning.nf +7. bcftools/bcftools_split_multiallelic_regions.nf +8. bcftools/bcftools_split_multiallelic.nf +9. bedtools/bedtools_amplicon_metrics.nf +10. bedtools/bedtools_genomecov.nf +11. bedtools/bedtools_start_candidates.nf +12. biqseq2/bicseq2_normalize.nf +13. biqseq2/bicseq2_seg_unpaired.nf +14. biqseq2/bicseq2_seg.nf +15. conpair/conpair_pileup.nf +16. conpair/conpair.nf +17. cosmic/cosmic_add_cancer_resistance_mutations_germline.nf +18. cosmic/cosmic_add_cancer_resistance_mutations_somatic.nf +19. cosmic/cosmic_annotation_somatic.nf +20. cosmic/cosmic_annotation.nf +21. deeptools/deeptools_computematrix.nf +22. deeptools/deeptools_plotfingerprint.nf +23. deeptools/deeptools_plotheatmap.nf +24. deeptools/deeptools_plotprofile.nf +25. ensembl/varianteffectpredictor_germline.nf +26. ensembl/varianteffectpredictor_somatic.nf +27. fastq-tools/fastq-pair.nf +28. fastq-tools/fastq-sort.nf +29. fusion_report/fusion_report.nf +30. fusioncatcher/fusioncatcher.nf +31. gatk/gatk_cnnscorevariants.nf +32. gatk/gatk_combinegvcfs.nf +33. gatk/gatk_filtermutectcalls_tumorOnly.nf +34. gatk/gatk_filtermutectcalls.nf +35. gatk/gatk_filtervarianttranches.nf +36. gatk/gatk_genotype_gvcf.nf +37. gatk/gatk_getsamplename_noMeta.nf +38. gatk/gatk_getsamplename.nf +39. gatk/gatk_haplotypecaller_sv_germline.nf +40. gatk/gatk_mergemutectstats.nf +41. gatk/gatk_mutect2_tumorOnly.nf +42. gatk/gatk_mutect2.nf +43. gatk/gatk_sortvcf_germline.nf +44. gatk/gatk_sortvcf_somatic_merge.nf +45. gatk/gatk_sortvcf_somatic_tools.nf +46. gatk/gatk_variantfiltration_af.nf +47. gatk/gatk_variantfiltration_mutect2.nf +48. gatk/gatk3_applyrecalibration.nf +49. gatk/gatk3_genotypegvcf.nf +50. gatk/gatk3_haplotypecaller.nf +51. gatk/gatk3_indelrealigner.nf +52. gatk/gatk3_realignertargetcreator.nf +53. gatk/gatk3_variantannotator.nf +54. gatk/gatk3_variantrecalibrator.nf +55. gridss/gridss_assemble.nf +56. gridss/gridss_calling.nf +57. gridss/gridss_chrom_filter.nf +58. gridss/gridss_preprocess.nf +59. gridss/gripss_somatic_filter.nf +60. homer/annotate_boolean_peaks.nf +61. homer/homer_annotatepeaks.nf +62. homer/plot_homer_annotatepeaks.nf +63. illumina/manta.nf +64. illumina/strelka2.nf +65. jaffa/jaffa.nf +66. 
kallisto/kallisto_insert_size.nf +67. kallisto/kallisto_quant.nf +68. lumpy_sv/lumpy_sv.nf +69. macs2/macs2_consensus.nf +70. macs2/macs2_peak_calling_chipseq.nf +71. macs2/plot_macs2_qc.nf +72. msisensor2/msisensor2_tumorOnly.nf +73. msisensor2/msisensor2.nf +74. multiqc/multiqc_custom_phantompeakqualtools.nf +75. novocraft/novosort.nf +76. nygc-short-alignment-marking/short_alignment_marking.nf +77. nygenome/lancet_confirm.nf +78. nygenome/lancet.nf +79. phantompeakqualtools/phantompeakqualtools.nf +80. picard/picard_cleansam.nf +81. picard/picard_collectmultiplemetrics.nf +82. picard/picard_collecttargetpcrmetrics.nf +83. picard/picard_fix_mate_information.nf +84. picard/picard_mergesamfiles.nf +85. pizzly/pizzly.nf +86. preseq/preseq.nf +87. primerclip/primerclip.nf +88. python/python_add_final_allele_counts.nf +89. python/python_add_nygc_allele_counts.nf +90. python/python_check_strandedness.nf +91. python/python_filter_pon.nf +92. python/python_filter_vcf.nf +93. python/python_germline_vcf_finalization.nf +94. python/python_get_candidates.nf +95. python/python_merge_columns.nf +96. python/python_merge_prep.nf +97. python/python_remove_contig.nf +98. python/python_rename_metadata.nf +99. python/python_rename_vcf.nf +100. python/python_reorder_vcf_columns.nf +101. python/python_snv_to_mnv_final_filter.nf +102. python/python_somatic_vcf_finalization.nf +103. python/python_split_mnv.nf +104. python/python_vcf_to_bed.nf +105. r/annotate_bicseq2_cnv.nf +106. r/annotate_genes_sv.nf +107. r/annotate_sv_with_cnv.nf +108. r/annotate_sv.nf +109. r/filter_bedpe.nf +110. r/frag_len_plot.nf +111. r/merge_sv.nf +112. samtools/samtools_faidx.nf +113. samtools/samtools_filter_unique_reads.nf +114. samtools/samtools_filter.nf +115. samtools/samtools_mergebam_filter.nf +116. samtools/samtools_stats_insertsize.nf +117. samtools/samtools_stats.nf +118. samtools/samtools_view.nf +119. squid/squid_annotate.nf +120. squid/squid_call.nf +121. star/star_align.nf +122. star-fusion/star-fusion.nf +123. subread/subread_feature_counts_chipseq.nf +124. svaba/svaba.nf +125. tabix/compress_merged_vcf.nf +126. tabix/compress_vcf_region.nf +127. tabix/compress_vcf.nf +128. ucsc/ucsc_bedgraphtobigwig.nf +129. utility_modules/aria_download.nf +130. utility_modules/chipseq_bampe_rm_orphan.nf +131. utility_modules/chipseq_check_design.nf +132. utility_modules/chipseq_make_genome_filter.nf +133. utility_modules/concatenate_reads_sampleSheet.nf +134. utility_modules/deseq2_qc.nf +135. utility_modules/frip_score.nf +136. utility_modules/get_read_length.nf +137. utility_modules/gunzip.nf +138. utility_modules/jax_trimmer.nf +139. utility_modules/parse_extracted_sv_table.nf +140. xenome/xenome.nf + +### Pipeline Changes: + +1. WES, RNA-seq, and RNA-fusion added support for PDX data +2. WES, RNA-seq, WGS, ATAC, RRBS, ChIP added support for GRCm39 +3. Support for input specification using sample sheets for ATAC, RNA-seq, RRBS, WES, WGS +4. Support for downloading input data for ATAC, RNA-seq, RRBS, WES, WGS +5. Added MULTIQC to ATAC, RNA-seq, RRBS, WES, WGS +6. Added assessment of strandedness using python/python_check_strandedness.nf rather than requiring specification via parameters +7. Added assessment of read length in RNA-seq for STAR index selection rather than requiring specification via parameters +8. Modified variant annotations in WES and WGS +9. Added GVCF support for WES and WGS + +### Module Changes: + +1. 
errorStrategy modified for all modules to catch and report instances where tasks fail due to walltime or memory constraints. Previously this required a deep reading of the subtask SLURM logs; these failures are now reported in the top-level SLURM log, which is more user-friendly +2. Removed log.info statements from modules to avoid noisy disruption of log files +3. ChIP-seq support for bwa/bwa_mem.nf, fastqc/fastqc.nf, picard/picard_markduplicates.nf, trim_galore/trim_galore.nf +4. Corrected emit statements for g2gtools/g2gtools_chain_convert_peak.nf +5. Corrected emit statements for gatk/gatk_chain_filter_reads.nf +6. Modified gatk/gatk_haplotypecaller_interval.nf and gatk/gatk_haplotypecaller.nf for optional GVCF support +7. Generalized multiqc/multiqc.nf via a parameter for the MultiQC config +8. Removed --METRIC_ACCUMULATION_LEVEL ALL_READS and --VALIDATION_STRINGENCY LENIENT parameters from picard/picard_collectalignmentsummarymetrics.nf +9. Modified strand specification logic for picard/picard_collectrnaseqmetrics.nf +10. Updated rsem/rsem_alignment_expression.nf to reflect changes in strandedness detection, reorganized outputs, and added capture of log files for MultiQC +11. Changes to output text for mtDNA content in samtools/samtools_calc_mtdna_filter_chrm.nf +12. Changes to output text from samtools/samtools_final_calc_frip.nf +13. Changes to output formatting for samtools/samtools_quality_checks.nf +14. Updated snpEff container to v5.1d to support GRCm39 +15. Changes to output fields for mouse and human from snpeff_snpsift/snpsift_extractfields.nf +16. Added missing container to utility_modules/concatenate_reads_PE.nf and utility_modules/concatenate_reads_SE.nf + ## Release 0.2.2 * Change WES and WGS COMSIC annotation to use SNPsift. diff --git a/bin/atac/LogParser.py b/bin/atac/LogParser.py index 69b0e672..eaed3a7e 100644 --- a/bin/atac/LogParser.py +++ b/bin/atac/LogParser.py @@ -53,25 +53,29 @@ for file in glob.glob("*mtDNA_Content.txt"): with open(file) as f: - for line in f: - print(line.rstrip('\n')) + lines = f.readlines()[1:] + for line in lines: + input_reads = line.split(sep='\t') + print("mtDNA Percent:\t" + str(input_reads[1]).rstrip('\n')) print("----NRF and PBC Log----") for file in glob.glob("*pbc.qc"): with open(file) as f: - for line in f: - line = line.rstrip('\n') - input_reads = line.split(sep='\t') - print("Non-Redundant Fraction (NRF): " + str(input_reads[4])) - print("PCR Bottlenecking Coefficient 1 (PBC1):\t" + str(input_reads[5])) - print("PCR Bottlenecking Coefficient 2 (PBC2):\t" + str(input_reads[6])) + lines = f.readlines()[1:] + for line in lines: + line = line.rstrip('\n') + input_reads = line.split(sep='\t') + print("Non-Redundant Fraction (NRF): " + str(input_reads[5])) + print("PCR Bottlenecking Coefficient 1 (PBC1):\t" + str(input_reads[6])) + print("PCR Bottlenecking Coefficient 2 (PBC2):\t" + str(input_reads[7])) print("----Fraction Reads in Peak----") for file in glob.glob("*Fraction_reads_in_peak.txt"): with open(file) as f: - for line in f: - line.rstrip('\n') - input_reads = line.split(sep='\t') - print('Filtered Read Count:\t' + input_reads[1], end='') - print('Fraction Reads in Peak:\t' + input_reads[0]) + lines = f.readlines()[1:] + for line in lines: + line.rstrip('\n') + input_reads = line.split(sep='\t') + print('Filtered Read Count:\t' + input_reads[2], end='') + print('Fraction Reads in Peak:\t' + input_reads[1]) diff --git a/bin/atac/fragment_length_plot.R b/bin/atac/fragment_length_plot.R index 3ae53619..0351720d 100644 --- 
a/bin/atac/fragment_length_plot.R +++ b/bin/atac/fragment_length_plot.R @@ -4,9 +4,9 @@ library(ggplot2) args = commandArgs(trailingOnly=TRUE) -frag_length <- read.table(args[1], header=F, sep=" ", row.names=NULL, check.names=F, na.strings = '.') +frag_length <- read.table(args[1], header=F, sep="\t", row.names=NULL, check.names=F, na.strings = '.') -spline_int <- as.data.frame(spline(frag_length$V2, frag_length$V1)) +spline_int <- as.data.frame(spline(frag_length$V3, frag_length$V2)) pdf(file='fraglen_plot.pdf') ggplot(frag_length) + @@ -15,3 +15,9 @@ ggplot(frag_length) + xlab("Insert Size (bp)") + ylab("Read Count") dev.off() + +temp_df <- t(data.frame('x-axis' = spline_int$x, 'y-axis' = spline_int$y)) + +rownames(temp_df) <- c(unique(frag_length$V1), unique(frag_length$V1)) + +write.table(temp_df, quote = F, row.names = T, file = args[2], sep = '\t', col.names = F) diff --git a/bin/chipseq/bampe_rm_orphan.py b/bin/chipseq/bampe_rm_orphan.py new file mode 100755 index 00000000..5b0a6f72 --- /dev/null +++ b/bin/chipseq/bampe_rm_orphan.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python + +############################################################################### +############################################################################### +## Created on February 1st 2017 to remove singletons from paired-end BAM file +############################################################################### +############################################################################### + +import os +import pysam +import errno +import argparse + +############################################ +############################################ +## PARSE ARGUMENTS +############################################ +############################################ + +Description = 'Remove singleton reads from paired-end BAM file i.e if read1 is present in BAM file without read 2 and vice versa.' 
+Epilog = """Example usage: bampe_rm_orphan.py """ + +argParser = argparse.ArgumentParser(description=Description, epilog=Epilog) + +## REQUIRED PARAMETERS +argParser.add_argument('BAM_INPUT_FILE', help="Input BAM file sorted by name.") +argParser.add_argument('BAM_OUTPUT_FILE', help="Output BAM file sorted by name.") + +## OPTIONAL PARAMETERS +argParser.add_argument('-fr', '--only_fr_pairs', dest="ONLY_FR_PAIRS", help="Only keeps pairs that are in FR orientation on same chromosome.",action='store_true') +args = argParser.parse_args() + +############################################ +############################################ +## HELPER FUNCTIONS +############################################ +############################################ + +def makedir(path): + + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +############################################ +############################################ +## MAIN FUNCTION +############################################ +############################################ + +def bampe_rm_orphan(BAMIn,BAMOut,onlyFRPairs=False): + + ## SETUP DIRECTORY/FILE STRUCTURE + OutDir = os.path.dirname(BAMOut) + makedir(OutDir) + + ## COUNT VARIABLES + totalReads = 0; totalOutputPairs = 0; totalSingletons = 0; totalImproperPairs = 0 + + ## ITERATE THROUGH BAM FILE + EOF = 0 + SAMFin = pysam.AlignmentFile(BAMIn, "rb") + SAMFout = pysam.AlignmentFile(BAMOut, "wb", header=SAMFin.header) + iter = SAMFin.fetch(until_eof=True) + currRead = next(iter) + for read in iter: + totalReads += 1 + if currRead.qname == read.qname: + pair1 = currRead; pair2 = read + + ## FILTER FOR READS ON SAME CHROMOSOME IN FR ORIENTATION + if onlyFRPairs: + if pair1.tid == pair2.tid: + + ## READ1 FORWARD AND READ2 REVERSE STRAND + if not pair1.is_reverse and pair2.is_reverse: + if pair1.reference_start <= pair2.reference_start: + totalOutputPairs += 1 + SAMFout.write(pair1) + SAMFout.write(pair2) + else: + totalImproperPairs += 1 + + ## READ1 REVERSE AND READ2 FORWARD STRAND + elif pair1.is_reverse and not pair2.is_reverse: + if pair2.reference_start <= pair1.reference_start: + totalOutputPairs += 1 + SAMFout.write(pair1) + SAMFout.write(pair2) + else: + totalImproperPairs += 1 + + else: + totalImproperPairs += 1 + else: + totalImproperPairs += 1 + else: + totalOutputPairs += 1 + SAMFout.write(pair1) + SAMFout.write(pair2) + + ## RESET COUNTER + try: + totalReads += 1 + currRead = next(iter) + except: + StopIteration + EOF = 1 + + ## READS WHERE ONLY ONE OF A PAIR IS IN FILE + else: + totalSingletons += 1 + pair1 = currRead + currRead = read + + if not EOF: + totalReads += 1 + totalSingletons += 1 + pair1 = currRead + + ## CLOSE ALL FILE HANDLES + SAMFin.close() + SAMFout.close() + + LogFile = os.path.join(OutDir,'%s_bampe_rm_orphan.log' % (os.path.basename(BAMOut[:-4]))) + SamLogFile = open(LogFile,'w') + SamLogFile.write('\n##############################\n') + SamLogFile.write('FILES/DIRECTORIES') + SamLogFile.write('\n##############################\n\n') + SamLogFile.write('Input File: ' + BAMIn + '\n') + SamLogFile.write('Output File: ' + BAMOut + '\n') + SamLogFile.write('\n##############################\n') + SamLogFile.write('OVERALL COUNTS') + SamLogFile.write('\n##############################\n\n') + SamLogFile.write('Total Input Reads = ' + str(totalReads) + '\n') + SamLogFile.write('Total Output Pairs = ' + str(totalOutputPairs) + '\n') + SamLogFile.write('Total Singletons Excluded = ' + 
str(totalSingletons) + '\n') + SamLogFile.write('Total Improper Pairs Excluded = ' + str(totalImproperPairs) + '\n') + SamLogFile.write('\n##############################\n') + SamLogFile.close() + +############################################ +############################################ +## RUN FUNCTION +############################################ +############################################ + +bampe_rm_orphan(BAMIn=args.BAM_INPUT_FILE,BAMOut=args.BAM_OUTPUT_FILE,onlyFRPairs=args.ONLY_FR_PAIRS) + +############################################ +############################################ +############################################ +############################################ diff --git a/bin/chipseq/check_design.py b/bin/chipseq/check_design.py new file mode 100755 index 00000000..51a99375 --- /dev/null +++ b/bin/chipseq/check_design.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python + +####################################################################### +####################################################################### +## Created on April 4th 2019 to check nf-core/chipseq design file +####################################################################### +####################################################################### + +import os +import sys +import argparse + +############################################ +############################################ +## PARSE ARGUMENTS +############################################ +############################################ + +Description = 'Reformat nf-core/chipseq design file and check its contents.' +Epilog = """Example usage: python check_design.py """ + +argParser = argparse.ArgumentParser(description=Description, epilog=Epilog) + +## REQUIRED PARAMETERS +argParser.add_argument('DESIGN_FILE', help="Input design file.") +argParser.add_argument('READ_MAPPING_FILE', help="Output design file containing sample ids and reads.") +argParser.add_argument('CONTROL_MAPPING_FILE', help="Output design file containing ip vs control mappings.") +args = argParser.parse_args() + +############################################ +############################################ +## MAIN FUNCTION +############################################ +############################################ + +def reformat_design(DesignFile,ReadMappingFile,ControlMappingFile): + + ERROR_STR = 'ERROR: Please check design file' + HEADER = ['group', 'replicate', 'fastq_1', 'fastq_2', 'antibody', 'control'] + + ## CHECK HEADER + fin = open(DesignFile,'r') + header = fin.readline().strip().split(',') + if header != HEADER: + print("{} header: {} != {}".format(ERROR_STR,','.join(header),','.join(HEADER))) + sys.exit(1) + + numColList = [] + sampleMappingDict = {} + antibodyDict = {} + while True: + line = fin.readline() + if line: + lspl = [x.strip() for x in line.strip().split(',')] + group,replicate,fastQFiles,antibody,control = lspl[0],lspl[1],[x for x in lspl[2:-2] if x],lspl[-2],lspl[-1] + + ## CHECK VALID NUMBER OF COLUMNS PER SAMPLE + numCols = len(lspl) + if numCols not in [6]: + print("{}: Invalid number of columns (should be 6)!\nLine: '{}'".format(ERROR_STR,line.strip())) + sys.exit(1) + numColList.append(numCols) + + ## CHECK GROUP ID DOESNT CONTAIN SPACES + if group.find(' ') != -1: + print("{}: Group id contains spaces!\nLine: '{}'".format(ERROR_STR,line.strip())) + sys.exit(1) + + ## CHECK REPLICATE COLUMN IS INTEGER + if not replicate.isdigit(): + print("{}: Replicate id not an integer!\nLine: '{}'".format(ERROR_STR,line.strip())) + sys.exit(1) + replicate = 
int(replicate) + + for fastq in fastQFiles: + ## CHECK FASTQ FILE EXTENSION + if fastq[-9:] != '.fastq.gz' and fastq[-6:] != '.fq.gz': + print("{}: FastQ file has incorrect extension (has to be '.fastq.gz' or 'fq.gz') - {}\nLine: '{}'".format(ERROR_STR,fastq,line.strip())) + sys.exit(1) + + ## CREATE GROUP MAPPING DICT = {GROUP_ID: {REPLICATE_ID:[[FASTQ_FILES]]} + if group not in sampleMappingDict: + sampleMappingDict[group] = {} + if replicate not in sampleMappingDict[group]: + sampleMappingDict[group][replicate] = [] + sampleMappingDict[group][replicate].append(fastQFiles) + + ## CHECK BOTH ANTIBODY AND CONTROL COLUMNS HAVE VALID VALUES + if antibody: + if antibody.find(' ') != -1: + print("{}: Antibody id contains spaces!\nLine: '{}'".format(ERROR_STR,line.strip())) + sys.exit(1) + if not control: + print("{}: both Antibody and Control must be specified!\nLine: '{}'".format(ERROR_STR,line.strip())) + sys.exit(1) + if control: + if control.find(' ') != -1: + print("{}: Control id contains spaces!\nLine: '{}'".format(ERROR_STR,line.strip())) + sys.exit(1) + if not antibody: + print("{}: both Antibody and Control must be specified!\nLine: '{}'".format(ERROR_STR,line.strip())) + sys.exit(1) + + ## CREATE ANTIBODY MAPPING CONTROL DICT + if antibody and control: + antibodyDict[group] = (antibody,control) + + else: + fin.close() + break + + ## CHECK IF DATA IS PAIRED-END OR SINGLE-END AND NOT A MIXTURE + if min(numColList) != max(numColList): + print("{}: Mixture of paired-end and single-end reads!".format(ERROR_STR)) + sys.exit(1) + + ## CHECK IF ANTIBODY AND CONTROL COLUMNS HAVE BEEN SPECIFIED AT LEAST ONCE + if len(antibodyDict) == 0: + print("{}: Antibody and Control must be specified at least once!".format(ERROR_STR)) + sys.exit(1) + + ## WRITE READ MAPPING FILE + antibodyGroupDict = {} + fout = open(ReadMappingFile,'w') + fout.write(','.join(['sample_id','fastq_1','fastq_2']) + '\n') + for group in sorted(sampleMappingDict.keys()): + + ## CHECK THAT REPLICATE IDS ARE IN FORMAT 1.. 
+ uniq_rep_ids = set(sampleMappingDict[group].keys()) + if len(uniq_rep_ids) != max(uniq_rep_ids): + print("{}: Replicate IDs must start with 1..\nGroup: {}, Replicate IDs: {}".format(ERROR_STR,group,list(uniq_rep_ids))) + sys.exit(1) + + ## RECONSTRUCT LINE FOR SAMPLE IN DESIGN + for replicate in sorted(sampleMappingDict[group].keys()): + for idx in range(len(sampleMappingDict[group][replicate])): + fastQFiles = sampleMappingDict[group][replicate][idx] + + ## GET SAMPLE_ID,FASTQ_1,FASTQ_2 COLUMNS + sample_id = "{}_R{}_T{}".format(group,replicate,idx+1) + oList = [sample_id] + fastQFiles + if len(fastQFiles) == 1: + oList += [''] + fout.write(','.join(oList) + '\n') + + ## EXTRAPOLATE CONTROL COLUMN + if group in antibodyDict: + antibody,control = antibodyDict[group] + if control in sampleMappingDict.keys(): + control_id = "{}_R1".format(control) + if replicate in sampleMappingDict[control]: + control_id = "{}_R{}".format(control,replicate) + if antibody not in antibodyGroupDict: + antibodyGroupDict[antibody] = {} + if group not in antibodyGroupDict[antibody]: + antibodyGroupDict[antibody][group] = [] + antibodyList = [sample_id[:-3],control_id] + if not antibodyList in antibodyGroupDict[antibody][group]: + antibodyGroupDict[antibody][group].append(antibodyList) + else: + print("{}: Control id not a valid group\nControl id: {}, Valid Groups: {}".format(ERROR_STR,control,sorted(sampleMappingDict.keys()))) + sys.exit(1) + fout.close() + + ## WRITE SAMPLE TO CONTROL MAPPING FILE + fout = open(ControlMappingFile,'w') + fout.write(','.join(['sample_id','control_id','antibody','replicatesExist','multipleGroups']) + '\n') + for antibody in sorted(antibodyGroupDict.keys()): + repsExist = '0' + if max([len(x) for x in antibodyGroupDict[antibody].values()]) > 1: + repsExist = '1' + multipleGroups = '0' + if len(antibodyGroupDict[antibody].keys()) > 1: + multipleGroups = '1' + for group in sorted(antibodyGroupDict[antibody].keys()): + for antibodyList in antibodyGroupDict[antibody][group]: + fout.write(','.join(antibodyList+[antibody,repsExist,multipleGroups]) + '\n') + fout.close() + +############################################ +############################################ +## RUN FUNCTION +############################################ +############################################ + +reformat_design(DesignFile=args.DESIGN_FILE,ReadMappingFile=args.READ_MAPPING_FILE,ControlMappingFile=args.CONTROL_MAPPING_FILE) + +############################################ +############################################ +############################################ +############################################ diff --git a/bin/chipseq/deseq2_qc.r b/bin/chipseq/deseq2_qc.r new file mode 100755 index 00000000..e8c2617f --- /dev/null +++ b/bin/chipseq/deseq2_qc.r @@ -0,0 +1,247 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## REQUIREMENTS ## +################################################ +################################################ + +## PCA, HEATMAP AND SCATTERPLOTS FOR SAMPLES IN COUNTS FILE +## - SAMPLE NAMES HAVE TO END IN e.g. "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS. 
+## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R + +################################################ +################################################ +## LOAD LIBRARIES ## +################################################ +################################################ + +library(optparse) +library(DESeq2) +library(ggplot2) +library(RColorBrewer) +library(pheatmap) + +################################################ +################################################ +## PARSE COMMAND-LINE PARAMETERS ## +################################################ +################################################ + +option_list <- list( + make_option(c("-i", "--count_file" ), type="character", default=NULL , metavar="path" , help="Count file matrix where rows are genes and columns are samples." ), + make_option(c("-f", "--count_col" ), type="integer" , default=2 , metavar="integer", help="First column containing sample count data." ), + make_option(c("-d", "--id_col" ), type="integer" , default=1 , metavar="integer", help="Column containing identifiers to be used." ), + make_option(c("-r", "--sample_suffix" ), type="character", default='' , metavar="string" , help="Suffix to remove after sample name in columns e.g. '.rmDup.bam' if 'DRUG_R1.rmDup.bam'."), + make_option(c("-o", "--outdir" ), type="character", default='./' , metavar="path" , help="Output directory." ), + make_option(c("-p", "--outprefix" ), type="character", default='deseq2', metavar="string" , help="Output prefix." ), + make_option(c("-v", "--vst" ), type="logical" , default=FALSE , metavar="boolean", help="Run vst transform instead of rlog." ), + make_option(c("-c", "--cores" ), type="integer" , default=1 , metavar="integer", help="Number of cores." ) +) + +opt_parser <- OptionParser(option_list=option_list) +opt <- parse_args(opt_parser) + +if (is.null(opt$count_file)){ + print_help(opt_parser) + stop("Please provide a counts file.", call.=FALSE) +} + +################################################ +################################################ +## READ IN COUNTS FILE ## +################################################ +################################################ + +count.table <- read.delim(file=opt$count_file,header=TRUE, row.names=NULL, skip=1, check.names=FALSE) +rownames(count.table) <- count.table[,opt$id_col] +count.table <- count.table[,opt$count_col:ncol(count.table),drop=FALSE] +colnames(count.table) <- gsub(opt$sample_suffix,"",colnames(count.table)) +colnames(count.table) <- gsub(pattern='\\.$', replacement='', colnames(count.table)) + +################################################ +################################################ +## RUN DESEQ2 ## +################################################ +################################################ + +if (file.exists(opt$outdir) == FALSE) { + dir.create(opt$outdir, recursive=TRUE) +} +setwd(opt$outdir) + +samples.vec <- colnames(count.table) +name_components <- strsplit(samples.vec, "_") +n_components <- length(name_components[[1]]) +decompose <- n_components!=1 && all(sapply(name_components, length)==n_components) +coldata <- data.frame(samples.vec, sample=samples.vec, row.names=1) +if (decompose) { + groupings <- as.data.frame(lapply(1:n_components, function(i) sapply(name_components, "[[", i))) + names(groupings) <- paste0("Group", 1:n_components) + n_distinct <- sapply(groupings, function(grp) length(unique(grp))) + groupings <- groupings[n_distinct!=1 & n_distinct!=length(samples.vec)] + if (ncol(groupings)!=0) { + coldata <- 
cbind(coldata, groupings) + } else { + decompose <- FALSE + } +} + +DDSFile <- paste(opt$outprefix,".dds.RData",sep="") + +counts <- count.table[,samples.vec,drop=FALSE] +dds <- DESeqDataSetFromMatrix(countData=round(counts), colData=coldata, design=~ 1) +dds <- estimateSizeFactors(dds) +if (min(dim(count.table))<=1) { # No point if only one sample, or one gene + save(dds,file=DDSFile) + saveRDS(dds, file=sub("\\.dds\\.RData$", ".rds", DDSFile)) + warning("Not enough samples or genes in counts file for PCA.", call.=FALSE) + quit(save = "no", status = 0, runLast = FALSE) +} +if (!opt$vst) { + vst_name <- "rlog" + rld <- rlog(dds) +} else { + vst_name <- "vst" + rld <- varianceStabilizingTransformation(dds) +} + +assay(dds, vst_name) <- assay(rld) +save(dds,file=DDSFile) +saveRDS(dds, file=sub("\\.dds\\.RData$", ".rds", DDSFile)) + +################################################ +################################################ +## PLOT QC ## +################################################ +################################################ + +##' PCA pre-processeor +##' +##' Generate all the necessary information to plot PCA from a DESeq2 object +##' in which an assay containing a variance-stabilised matrix of counts is +##' stored. Copied from DESeq2::plotPCA, but with additional ability to +##' say which assay to run the PCA on. +##' +##' @param object The DESeq2DataSet object. +##' @param ntop number of top genes to use for principla components, selected by highest row variance. +##' @param assay the name or index of the assay that stores the variance-stabilised data. +##' @return A data.frame containing the projected data alongside the grouping columns. +##' A 'percentVar' attribute is set which includes the percentage of variation each PC explains, +##' and additionally how much the variation within that PC is explained by the grouping variable. 
+##' @author Gavin Kelly +plotPCA_vst <- function (object, ntop = 500, assay=length(assays(object))) { + rv <- rowVars(assay(object, assay)) + select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] + pca <- prcomp(t(assay(object, assay)[select, ]), center=TRUE, scale=FALSE) + percentVar <- pca$sdev^2/sum(pca$sdev^2) + df <- cbind( as.data.frame(colData(object)), pca$x) + #Order points so extreme samples are more likely to get label + ord <- order(abs(rank(df$PC1)-median(df$PC1)), abs(rank(df$PC2)-median(df$PC2))) + df <- df[ord,] + attr(df, "percentVar") <- data.frame(PC=seq(along=percentVar), percentVar=100*percentVar) + return(df) +} + +PlotFile <- paste(opt$outprefix,".plots.pdf",sep="") + +pdf(file=PlotFile, onefile=TRUE, width=7, height=7) +## PCA +ntop <- c(500, Inf) +for (n_top_var in ntop) { + pca.data <- plotPCA_vst(dds, assay=vst_name, ntop=n_top_var) + percentVar <- round(attr(pca.data, "percentVar")$percentVar) + plot_subtitle <- ifelse(n_top_var==Inf, "All genes", paste("Top", n_top_var, "genes")) + pl <- ggplot(pca.data, aes(PC1, PC2, label=paste0(" ", sample, " "))) + + geom_point() + + geom_text(check_overlap=TRUE, vjust=0.5, hjust="inward") + + xlab(paste0("PC1: ",percentVar[1],"% variance")) + + ylab(paste0("PC2: ",percentVar[2],"% variance")) + + labs(title = paste0("First PCs on ", vst_name, "-transformed data"), subtitle = plot_subtitle) + + theme(legend.position="top", + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1)) + print(pl) + + if (decompose) { + pc_names <- paste0("PC", attr(pca.data, "percentVar")$PC) + long_pc <- reshape(pca.data, varying=pc_names, direction="long", sep="", timevar="component", idvar="pcrow") + long_pc <- subset(long_pc, component<=5) + long_pc_grp <- reshape(long_pc, varying=names(groupings), direction="long", sep="", timevar="grouper") + long_pc_grp <- subset(long_pc_grp, grouper<=5) + long_pc_grp$component <- paste("PC", long_pc_grp$component) + long_pc_grp$grouper <- paste0(long_pc_grp$grouper, c("st","nd","rd","th","th")[long_pc_grp$grouper], " prefix") + pl <- ggplot(long_pc_grp, aes(x=Group, y=PC)) + + geom_point() + + stat_summary(fun=mean, geom="line", aes(group = 1)) + + labs(x=NULL, y=NULL, subtitle = plot_subtitle, title="PCs split by sample-name prefixes") + + facet_grid(component~grouper, scales="free_x") + + scale_x_discrete(guide = guide_axis(n.dodge = 3)) + print(pl) + } +} # at end of loop, we'll be using the user-defined ntop if any, else all genes + +## WRITE PC1 vs PC2 VALUES TO FILE +pca.vals <- pca.data[,c("PC1","PC2")] +colnames(pca.vals) <- paste0(colnames(pca.vals), ": ", percentVar[1:2], '% variance') +pca.vals <- cbind(sample = rownames(pca.vals), pca.vals) +write.table(pca.vals, file = paste(opt$outprefix, ".pca.vals.txt", sep=""), + row.names = FALSE, col.names = TRUE, sep = "\t", quote = TRUE) + +## SAMPLE CORRELATION HEATMAP +sampleDists <- dist(t(assay(dds, vst_name))) +sampleDistMatrix <- as.matrix(sampleDists) +colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) +pheatmap( + sampleDistMatrix, + clustering_distance_rows=sampleDists, + clustering_distance_cols=sampleDists, + col=colors, + main=paste("Euclidean distance between", vst_name, "of samples") +) + +## WRITE SAMPLE DISTANCES TO FILE +write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=paste(opt$outprefix, ".sample.dists.txt", sep=""), + row.names=FALSE, col.names=TRUE, 
sep="\t", quote=FALSE) +dev.off() + +################################################ +################################################ +## SAVE SIZE FACTORS ## +################################################ +################################################ + +SizeFactorsDir <- "size_factors/" +if (file.exists(SizeFactorsDir) == FALSE) { + dir.create(SizeFactorsDir, recursive=TRUE) +} + +NormFactorsFile <- paste(SizeFactorsDir,opt$outprefix, ".size_factors.RData", sep="") + +normFactors <- sizeFactors(dds) +save(normFactors, file=NormFactorsFile) + +for (name in names(sizeFactors(dds))) { + sizeFactorFile <- paste(SizeFactorsDir,name, ".txt", sep="") + write(as.numeric(sizeFactors(dds)[name]), file=sizeFactorFile) +} + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +RLogFile <- "R_sessionInfo.log" + +sink(RLogFile) +a <- sessionInfo() +print(a) +sink() + +################################################ +################################################ +################################################ +################################################ diff --git a/bin/chipseq/featurecounts_deseq2.r b/bin/chipseq/featurecounts_deseq2.r new file mode 100755 index 00000000..b75335a5 --- /dev/null +++ b/bin/chipseq/featurecounts_deseq2.r @@ -0,0 +1,301 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## REQUIREMENTS ## +################################################ +################################################ + +## DIFFERENTIAL ANALYSIS, SCATTERPLOTS AND PCA FOR SAMPLES IN FEATURECOUNTS FILE + ## - FIRST SIX COLUMNS OF FEATURECOUNTS_FILE SHOULD BE INTERVAL INFO. REMAINDER OF COLUMNS SHOULD BE SAMPLES-SPECIFIC COUNTS. + ## - SAMPLE NAMES HAVE TO END IN "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS. + ## - BAM_SUFFIX IS PORTION OF FILENAME AFTER SAMPLE NAME IN FEATURECOUNTS COLUMN SAMPLE NAMES E.G. ".rmDup.bam" if "DRUG_R1.rmDup.bam" + ## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R + +################################################ +################################################ +## LOAD LIBRARIES ## +################################################ +################################################ + +library(optparse) +library(DESeq2) +library(vsn) +library(ggplot2) +library(RColorBrewer) +library(pheatmap) +library(lattice) +library(BiocParallel) + +################################################ +################################################ +## PARSE COMMAND-LINE PARAMETERS ## +################################################ +################################################ + +option_list <- list(make_option(c("-i", "--featurecount_file"), type="character", default=NULL, help="Feature count file generated by the SubRead featureCounts command.", metavar="path"), + make_option(c("-b", "--bam_suffix"), type="character", default=NULL, help="Portion of filename after sample name in featurecount file header e.g. 
'.rmDup.bam' if 'DRUG_R1.rmDup.bam'", metavar="string"), + make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"), + make_option(c("-p", "--outprefix"), type="character", default='differential', help="Output prefix", metavar="string"), + make_option(c("-s", "--outsuffix"), type="character", default='', help="Output suffix for comparison-level results", metavar="string"), + make_option(c("-v", "--vst"), type="logical", default=FALSE, help="Run vst transform instead of rlog", metavar="boolean"), + make_option(c("-c", "--cores"), type="integer", default=1, help="Number of cores", metavar="integer")) + +opt_parser <- OptionParser(option_list=option_list) +opt <- parse_args(opt_parser) + +if (is.null(opt$featurecount_file)){ + print_help(opt_parser) + stop("Please provide featurecount file.", call.=FALSE) +} +if (is.null(opt$bam_suffix)){ + print_help(opt_parser) + stop("Please provide bam suffix in header of featurecount file.", call.=FALSE) +} + +################################################ +################################################ +## READ IN COUNTS FILE ## +################################################ +################################################ + +count.table <- read.delim(file=opt$featurecount_file,header=TRUE,skip=1) +colnames(count.table) <- gsub(opt$bam_suffix,"",colnames(count.table)) +colnames(count.table) <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1))) +rownames(count.table) <- count.table$Geneid +interval.table <- count.table[,1:6] +count.table <- count.table[,7:ncol(count.table),drop=FALSE] + +################################################ +################################################ +## RUN DESEQ2 ## +################################################ +################################################ + +if (file.exists(opt$outdir) == FALSE) { + dir.create(opt$outdir,recursive=TRUE) +} +setwd(opt$outdir) + +samples.vec <- sort(colnames(count.table)) +groups <- sub("_[^_]+$", "", samples.vec) +print(unique(groups)) +if (length(unique(groups)) == 1) { + quit(save = "no", status = 0, runLast = FALSE) +} + +DDSFile <- paste(opt$outprefix,".dds.rld.RData",sep="") +if (file.exists(DDSFile) == FALSE) { + counts <- count.table[,samples.vec,drop=FALSE] + coldata <- data.frame(row.names=colnames(counts),condition=groups) + dds <- DESeqDataSetFromMatrix(countData = round(counts), colData = coldata, design = ~ condition) + dds <- DESeq(dds, parallel=TRUE, BPPARAM=MulticoreParam(opt$cores)) + if (!opt$vst) { + rld <- rlog(dds) + } else { + rld <- vst(dds) + } + save(dds,rld,file=DDSFile) +} + +################################################ +################################################ +## PLOT QC ## +################################################ +################################################ + +PlotFile <- paste(opt$outprefix,".plots.pdf",sep="") +if (file.exists(PlotFile) == FALSE) { + pdf(file=PlotFile,onefile=TRUE,width=7,height=7) + + ## PCA + pca.data <- DESeq2::plotPCA(rld,intgroup=c("condition"),returnData=TRUE) + percentVar <- round(100 * attr(pca.data, "percentVar")) + plot <- ggplot(pca.data, aes(PC1, PC2, color=condition)) + + geom_point(size=3) + + xlab(paste0("PC1: ",percentVar[1],"% variance")) + + ylab(paste0("PC2: ",percentVar[2],"% variance")) + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_blank(), + panel.border = element_rect(colour = "black", fill=NA, size=1)) + 
print(plot) + + ## WRITE PC1 vs PC2 VALUES TO FILE + pca.vals <- pca.data[,1:2] + colnames(pca.vals) <- paste(colnames(pca.vals),paste(percentVar,'% variance',sep=""), sep=": ") + pca.vals <- cbind(sample = rownames(pca.vals), pca.vals) + write.table(pca.vals,file=paste(opt$outprefix,".pca.vals.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=TRUE) + + ## SAMPLE CORRELATION HEATMAP + sampleDists <- dist(t(assay(rld))) + sampleDistMatrix <- as.matrix(sampleDists) + colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) + pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors) + + ## WRITE SAMPLE DISTANCES TO FILE + write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=paste(opt$outprefix,".sample.dists.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE) + + dev.off() +} + +################################################ +################################################ +## SAVE SIZE FACTORS ## +################################################ +################################################ + +SizeFactorsDir <- "sizeFactors/" +if (file.exists(SizeFactorsDir) == FALSE) { + dir.create(SizeFactorsDir,recursive=TRUE) +} + +NormFactorsFile <- paste(SizeFactorsDir,opt$outprefix,".sizeFactors.RData",sep="") +if (file.exists(NormFactorsFile) == FALSE) { + normFactors <- sizeFactors(dds) + save(normFactors,file=NormFactorsFile) + + for (name in names(sizeFactors(dds))) { + sizeFactorFile <- paste(SizeFactorsDir,name,opt$outsuffix,".sizeFactor.txt",sep="") + if (file.exists(sizeFactorFile) == FALSE) { + write(as.numeric(sizeFactors(dds)[name]),file=sizeFactorFile) + } + } +} + +################################################ +################################################ +## WRITE LOG FILE ## +################################################ +################################################ + +LogFile <- paste(opt$outprefix,".log",sep="") +if (file.exists(LogFile) == FALSE) { + cat("\nSamples =",samples.vec,"\n\n",file=LogFile,append=TRUE,sep=', ') + cat("Groups =",groups,"\n\n",file=LogFile,append=TRUE,sep=', ') + cat("Dimensions of count matrix =",dim(counts),"\n\n",file=LogFile,append=FALSE,sep=' ') + cat("\n",file=LogFile,append=TRUE,sep='') +} + +################################################ +################################################ +## LOOP THROUGH COMPARISONS ## +################################################ +################################################ + +ResultsFile <- paste(opt$outprefix,".results.txt",sep="") +if (file.exists(ResultsFile) == FALSE) { + + raw.counts <- counts(dds,normalized=FALSE) + colnames(raw.counts) <- paste(colnames(raw.counts),'raw',sep='.') + pseudo.counts <- counts(dds,normalized=TRUE) + colnames(pseudo.counts) <- paste(colnames(pseudo.counts),'pseudo',sep='.') + + deseq2_results_list <- list() + comparisons <- combn(unique(groups),2) + for (idx in 1:ncol(comparisons)) { + + control.group <- comparisons[1,idx] + treat.group <- comparisons[2,idx] + CompPrefix <- paste(control.group,treat.group,sep="vs") + cat("Saving results for ",CompPrefix," ...\n",sep="") + + CompOutDir <- paste(CompPrefix,'/',sep="") + if (file.exists(CompOutDir) == FALSE) { + dir.create(CompOutDir,recursive=TRUE) + } + + control.samples <- samples.vec[which(groups == control.group)] + treat.samples <- samples.vec[which(groups == treat.group)] + comp.samples <- c(control.samples,treat.samples) + + comp.results <- 
results(dds,contrast=c("condition",c(control.group,treat.group))) + comp.df <- as.data.frame(comp.results) + comp.table <- cbind(interval.table, as.data.frame(comp.df), raw.counts[,paste(comp.samples,'raw',sep='.')], pseudo.counts[,paste(comp.samples,'pseudo',sep='.')]) + + ## WRITE RESULTS FILE + CompResultsFile <- paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.results.txt",sep="") + write.table(comp.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) + + ## FILTER RESULTS BY FDR & LOGFC AND WRITE RESULTS FILE + pdf(file=paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.plots.pdf",sep=""),width=10,height=8) + if (length(comp.samples) > 2) { + for (MIN_FDR in c(0.01,0.05)) { + + ## SUBSET RESULTS BY FDR + pass.fdr.table <- subset(comp.table, padj < MIN_FDR) + pass.fdr.up.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange > 0) + pass.fdr.down.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange < 0) + + ## SUBSET RESULTS BY FDR AND LOGFC + pass.fdr.logFC.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1) + pass.fdr.logFC.up.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange > 0) + pass.fdr.logFC.down.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange < 0) + + ## WRITE RESULTS FILE + CompResultsFile <- paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.FDR",MIN_FDR,".results.txt",sep="") + CompBEDFile <- paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.FDR",MIN_FDR,".results.bed",sep="") + write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) + write.table(pass.fdr.table[,c("Chr","Start","End","Geneid","log2FoldChange","Strand")], file=CompBEDFile, col.names=FALSE, row.names=FALSE, sep='\t', quote=FALSE) + + ## MA PLOT & VOLCANO PLOT + DESeq2::plotMA(comp.results, main=paste("MA plot FDR <= ",MIN_FDR,sep=""), ylim=c(-2,2),alpha=MIN_FDR) + plot(comp.table$log2FoldChange, -1*log10(comp.table$padj), col=ifelse(comp.table$padj<=MIN_FDR, "red", "black"), xlab="logFC", ylab="-1*log10(FDR)", main=paste("Volcano plot FDR <=",MIN_FDR,sep=" "), pch=20) + + ## ADD COUNTS TO LOGFILE + cat(CompPrefix," genes with FDR <= ",MIN_FDR,": ",nrow(pass.fdr.table)," (up=",nrow(pass.fdr.up.table),", down=",nrow(pass.fdr.down.table),")","\n",file=LogFile,append=TRUE,sep="") + cat(CompPrefix," genes with FDR <= ",MIN_FDR," & FC > 2: ",nrow(pass.fdr.logFC.table)," (up=",nrow(pass.fdr.logFC.up.table),", down=",nrow(pass.fdr.logFC.down.table),")","\n",file=LogFile,append=TRUE,sep="") + + } + cat("\n",file=LogFile,append=TRUE,sep="") + } + + ## SAMPLE CORRELATION HEATMAP + rld.subset <- assay(rld)[,comp.samples] + sampleDists <- dist(t(rld.subset)) + sampleDistMatrix <- as.matrix(sampleDists) + colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) + pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors) + + ## SCATTER PLOT FOR RLOG COUNTS + combs <- combn(comp.samples,2,simplify=FALSE) + clabels <- sapply(combs,function(x){paste(x,collapse=' & ')}) + plotdat <- data.frame(x=unlist(lapply(combs, function(x){rld.subset[, x[1] ]})),y=unlist(lapply(combs, function(y){rld.subset[, y[2] ]})),comp=rep(clabels, each=nrow(rld.subset))) + plot <- xyplot(y~x|comp,plotdat, + panel=function(...){ + panel.xyplot(...) 
+ panel.abline(0,1,col="red") + }, + par.strip.text=list(cex=0.5)) + print(plot) + dev.off() + + colnames(comp.df) <- paste(CompPrefix,".",colnames(comp.df),sep="") + deseq2_results_list[[idx]] <- comp.df + + } + + ## WRITE RESULTS FROM ALL COMPARISONS TO FILE + deseq2_results_table <- cbind(interval.table,do.call(cbind, deseq2_results_list),raw.counts,pseudo.counts) + write.table(deseq2_results_table, file=ResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) + +} + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +RLogFile <- "R_sessionInfo.log" +if (file.exists(RLogFile) == FALSE) { + sink(RLogFile) + a <- sessionInfo() + print(a) + sink() +} + +################################################ +################################################ +################################################ +################################################ diff --git a/bin/chipseq/gtf2bed b/bin/chipseq/gtf2bed new file mode 100755 index 00000000..c2a8bbee --- /dev/null +++ b/bin/chipseq/gtf2bed @@ -0,0 +1,123 @@ +#!/usr/bin/perl + +# Copyright (c) 2011 Erik Aronesty (erik@q32.com) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT. + +use Getopt::Long; + +my $extended; +GetOptions("x"=>\$extended); + +$in = shift @ARGV; + +my $in_cmd =($in =~ /\.gz$/ ? "gunzip -c $in|" : $in =~ /\.zip$/ ? "unzip -p $in|" : "$in") || die "Can't open $in: $!\n"; +open IN, $in_cmd; + +while () { + $gff = 2 if /^##gff-version 2/; + $gff = 3 if /^##gff-version 3/; + next if /^#/ && $gff; + + s/\s+$//; + # 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr + my @f = split /\t/; + if ($gff) { + # most ver 2's stick gene names in the id field + ($id) = $f[8]=~ /\bID="([^"]+)"/; + # most ver 3's stick unquoted names in the name field + ($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3; + } else { + ($id) = $f[8]=~ /transcript_id "([^"]+)"/; + } + + next unless $id && $f[0]; + + if ($f[2] eq 'exon') { + die "no position at exon on line $." if ! 
$f[3]; + # gff3 puts :\d in exons sometimes + $id =~ s/:\d+$// if $gff == 3; + push @{$exons{$id}}, \@f; + # save lowest start + $trans{$id} = \@f if !$trans{$id}; + } elsif ($f[2] eq 'start_codon') { + #optional, output codon start/stop as "thick" region in bed + $sc{$id}->[0] = $f[3]; + } elsif ($f[2] eq 'stop_codon') { + $sc{$id}->[1] = $f[4]; + } elsif ($f[2] eq 'miRNA' ) { + $trans{$id} = \@f if !$trans{$id}; + push @{$exons{$id}}, \@f; + } +} + +for $id ( + # sort by chr then pos + sort { + $trans{$a}->[0] eq $trans{$b}->[0] ? + $trans{$a}->[3] <=> $trans{$b}->[3] : + $trans{$a}->[0] cmp $trans{$b}->[0] + } (keys(%trans)) ) { + my ($chr, undef, undef, undef, undef, undef, $dir, undef, $attr, undef, $cds, $cde) = @{$trans{$id}}; + my ($cds, $cde); + ($cds, $cde) = @{$sc{$id}} if $sc{$id}; + + # sort by pos + my @ex = sort { + $a->[3] <=> $b->[3] + } @{$exons{$id}}; + + my $beg = $ex[0][3]; + my $end = $ex[-1][4]; + + if ($dir eq '-') { + # swap + $tmp=$cds; + $cds=$cde; + $cde=$tmp; + $cds -= 2 if $cds; + $cde += 2 if $cde; + } + + # not specified, just use exons + $cds = $beg if !$cds; + $cde = $end if !$cde; + + # adjust start for bed + --$beg; --$cds; + + my $exn = @ex; # exon count + my $exst = join ",", map {$_->[3]-$beg-1} @ex; # exon start + my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex; # exon size + + my $gene_id; + my $extend = ""; + if ($extended) { + ($gene_id) = $attr =~ /gene_name "([^"]+)"/; + ($gene_id) = $attr =~ /gene_id "([^"]+)"/ unless $gene_id; + $extend="\t$gene_id"; + } + # added an extra comma to make it look exactly like ucsc's beds + print "$chr\t$beg\t$end\t$id\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,$extend\n"; +} + + +close IN; diff --git a/bin/chipseq/igv_files_to_session.py b/bin/chipseq/igv_files_to_session.py new file mode 100755 index 00000000..48e749c8 --- /dev/null +++ b/bin/chipseq/igv_files_to_session.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python + +####################################################################### +####################################################################### +## Created on July 4th 2018 to create IGV session file from file list +####################################################################### +####################################################################### + +import os +import errno +import argparse + +############################################ +############################################ +## PARSE ARGUMENTS +############################################ +############################################ + +Description = 'Create IGV session file from a list of files and associated colours - ".bed", ".bw", ".bigwig", ".tdf", ".gtf" files currently supported.' +Epilog = """Example usage: python igv_files_to_session.py """ + +argParser = argparse.ArgumentParser(description=Description, epilog=Epilog) + +## REQUIRED PARAMETERS +argParser.add_argument('XML_OUT', help="XML output file.") +argParser.add_argument('LIST_FILE', help="Tab-delimited file containing two columns i.e. file_name\tcolour. Header isnt required.") +argParser.add_argument('GENOME', help="Full path to genome fasta file or shorthand for genome available in IGV e.g. 
hg19.") + +## OPTIONAL PARAMETERS +argParser.add_argument('-pp', '--path_prefix', type=str, dest="PATH_PREFIX", default='', help="Path prefix to be added at beginning of all files in input list file.") +args = argParser.parse_args() + +############################################ +############################################ +## HELPER FUNCTIONS +############################################ +############################################ + +def makedir(path): + + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +############################################ +############################################ +## MAIN FUNCTION +############################################ +############################################ + +def igv_files_to_session(XMLOut,ListFile,Genome,PathPrefix=''): + + makedir(os.path.dirname(XMLOut)) + + fileList = [] + fin = open(ListFile,'r') + while True: + line = fin.readline() + if line: + ifile,colour = line.strip().split('\t') + if len(colour.strip()) == 0: + colour = '0,0,178' + fileList.append((PathPrefix.strip()+ifile,colour)) + else: + break + fout.close() + + ## ADD RESOURCES SECTION + XMLStr = '\n' + XMLStr += '\n' % (Genome) + XMLStr += '\t\n' + for ifile,colour in fileList: + XMLStr += '\t\t\n' % (ifile) + XMLStr += '\t\n' + + ## ADD PANEL SECTION + XMLStr += '\t\n' + for ifile,colour in fileList: + extension = os.path.splitext(ifile)[1].lower() + if extension in ['.bed','.broadpeak','.narrowpeak']: + XMLStr += '\t\t --is_narrow_peak --min_replicates 1""" + +argParser = argparse.ArgumentParser(description=Description, epilog=Epilog) + +## REQUIRED PARAMETERS +argParser.add_argument('MERGED_INTERVAL_FILE', help="Merged MACS2 interval file created using linux sort and mergeBed.") +argParser.add_argument('SAMPLE_NAME_LIST', help="Comma-separated list of sample names as named in individual MACS2 broadPeak/narrowPeak output file e.g. 
SAMPLE_R1 for SAMPLE_R1_peak_1.") +argParser.add_argument('OUTFILE', help="Full path to output directory.") + +## OPTIONAL PARAMETERS +argParser.add_argument('-in', '--is_narrow_peak', dest="IS_NARROW_PEAK", help="Whether merged interval file was generated from narrow or broad peak files (default: False).",action='store_true') +argParser.add_argument('-mr', '--min_replicates', type=int, dest="MIN_REPLICATES", default=1, help="Minumum number of replicates per sample required to contribute to merged peak (default: 1).") +args = argParser.parse_args() + +############################################ +############################################ +## HELPER FUNCTIONS +############################################ +############################################ + +def makedir(path): + + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +############################################ +############################################ +## MAIN FUNCTION +############################################ +############################################ + +## MergedIntervalTxtFile is file created using commands below: +## 1) broadPeak +## sort -k1,1 -k2,2n | mergeBed -c 2,3,4,5,6,7,8,9 -o collapse,collapse,collapse,collapse,collapse,collapse,collapse,collapse > merged_peaks.txt +## 2) narrowPeak +## sort -k1,1 -k2,2n | mergeBed -c 2,3,4,5,6,7,8,9,10 -o collapse,collapse,collapse,collapse,collapse,collapse,collapse,collapse,collapse > merged_peaks.txt + +def macs2_merged_expand(MergedIntervalTxtFile,SampleNameList,OutFile,isNarrow=False,minReplicates=1): + + makedir(os.path.dirname(OutFile)) + + combFreqDict = {} + totalOutIntervals = 0 + SampleNameList = sorted(SampleNameList) + fin = open(MergedIntervalTxtFile,'r') + fout = open(OutFile,'w') + oFields = ['chr','start','end','interval_id','num_peaks','num_samples'] + [x+'.bool' for x in SampleNameList] + [x+'.fc' for x in SampleNameList] + [x+'.qval' for x in SampleNameList] + [x+'.pval' for x in SampleNameList] + [x+'.start' for x in SampleNameList] + [x+'.end' for x in SampleNameList] + if isNarrow: + oFields += [x+'.summit' for x in SampleNameList] + fout.write('\t'.join(oFields) + '\n') + while True: + line = fin.readline() + if line: + lspl = line.strip().split('\t') + + chromID = lspl[0]; mstart = int(lspl[1]); mend = int(lspl[2]); + starts = [int(x) for x in lspl[3].split(',')]; ends = [int(x) for x in lspl[4].split(',')] + names = lspl[5].split(','); fcs = [float(x) for x in lspl[8].split(',')] + pvals = [float(x) for x in lspl[9].split(',')]; qvals = [float(x) for x in lspl[10].split(',')] + summits = [] + if isNarrow: + summits = [int(x) for x in lspl[11].split(',')] + + ## GROUP SAMPLES BY REMOVING TRAILING *_R* + groupDict = {} + for sID in ['_'.join(x.split('_')[:-2]) for x in names]: + gID = '_'.join(sID.split('_')[:-1]) + if gID not in groupDict: + groupDict[gID] = [] + if sID not in groupDict[gID]: + groupDict[gID].append(sID) + + ## GET SAMPLES THAT PASS REPLICATE THRESHOLD + passRepThreshList = [] + for gID,sIDs in groupDict.items(): + if len(sIDs) >= minReplicates: + passRepThreshList += sIDs + + ## GET VALUES FROM INDIVIDUAL PEAK SETS + fcDict = {}; qvalDict = {}; pvalDict = {}; startDict = {}; endDict = {}; summitDict = {} + for idx in range(len(names)): + sample = '_'.join(names[idx].split('_')[:-2]) + if sample in passRepThreshList: + if sample not in fcDict: + fcDict[sample] = [] + fcDict[sample].append(str(fcs[idx])) + if sample not in qvalDict: + qvalDict[sample] = 
[] + qvalDict[sample].append(str(qvals[idx])) + if sample not in pvalDict: + pvalDict[sample] = [] + pvalDict[sample].append(str(pvals[idx])) + if sample not in startDict: + startDict[sample] = [] + startDict[sample].append(str(starts[idx])) + if sample not in endDict: + endDict[sample] = [] + endDict[sample].append(str(ends[idx])) + if isNarrow: + if sample not in summitDict: + summitDict[sample] = [] + summitDict[sample].append(str(summits[idx])) + + samples = sorted(fcDict.keys()) + if samples != []: + numSamples = len(samples) + boolList = ['TRUE' if x in samples else 'FALSE' for x in SampleNameList] + fcList = [';'.join(fcDict[x]) if x in samples else 'NA' for x in SampleNameList] + qvalList = [';'.join(qvalDict[x]) if x in samples else 'NA' for x in SampleNameList] + pvalList = [';'.join(pvalDict[x]) if x in samples else 'NA' for x in SampleNameList] + startList = [';'.join(startDict[x]) if x in samples else 'NA' for x in SampleNameList] + endList = [';'.join(endDict[x]) if x in samples else 'NA' for x in SampleNameList] + oList = [str(x) for x in [chromID,mstart,mend,'Interval_'+str(totalOutIntervals+1),len(names),numSamples]+boolList+fcList+qvalList+pvalList+startList+endList] + if isNarrow: + oList += [';'.join(summitDict[x]) if x in samples else 'NA' for x in SampleNameList] + fout.write('\t'.join(oList) + '\n') + + tsamples = tuple(sorted(samples)) + if tsamples not in combFreqDict: + combFreqDict[tsamples] = 0 + combFreqDict[tsamples] += 1 + totalOutIntervals += 1 + + else: + fin.close() + fout.close() + break + + ## WRITE FILE FOR INTERVAL INTERSECT ACROSS SAMPLES. + ## COMPATIBLE WITH UPSETR PACKAGE. + fout = open(OutFile[:-4]+'.intersect.txt','w') + combFreqItems = sorted([(combFreqDict[x],x) for x in combFreqDict.keys()],reverse=True) + for k,v in combFreqItems: + fout.write('%s\t%s\n' % ('&'.join(v),k)) + fout.close() + +############################################ +############################################ +## RUN FUNCTION +############################################ +############################################ + +macs2_merged_expand(MergedIntervalTxtFile=args.MERGED_INTERVAL_FILE,SampleNameList=args.SAMPLE_NAME_LIST.split(','),OutFile=args.OUTFILE,isNarrow=args.IS_NARROW_PEAK,minReplicates=args.MIN_REPLICATES) + +############################################ +############################################ +############################################ +############################################ diff --git a/bin/chipseq/markdown_to_html.py b/bin/chipseq/markdown_to_html.py new file mode 100755 index 00000000..57cc4263 --- /dev/null +++ b/bin/chipseq/markdown_to_html.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +from __future__ import print_function +import argparse +import markdown +import os +import sys + +def convert_markdown(in_fn): + input_md = open(in_fn, mode="r", encoding="utf-8").read() + html = markdown.markdown( + "[TOC]\n" + input_md, + extensions = [ + 'pymdownx.extra', + 'pymdownx.b64', + 'pymdownx.highlight', + 'pymdownx.emoji', + 'pymdownx.tilde', + 'toc' + ], + extension_configs = { + 'pymdownx.b64': { + 'base_path': os.path.dirname(in_fn) + }, + 'pymdownx.highlight': { + 'noclasses': True + }, + 'toc': { + 'title': 'Table of Contents' + } + } + ) + return html + +def wrap_html(contents): + header = """ + + + + + +
+ """ + footer = """ +
+ + + """ + return header + contents + footer + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('mdfile', type=argparse.FileType('r'), nargs='?', + help='File to convert. Defaults to stdin.') + parser.add_argument('-o', '--out', type=argparse.FileType('w'), + default=sys.stdout, + help='Output file name. Defaults to stdout.') + return parser.parse_args(args) + +def main(args=None): + args = parse_args(args) + converted_md = convert_markdown(args.mdfile.name) + html = wrap_html(converted_md) + args.out.write(html) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/bin/chipseq/plot_homer_annotatepeaks.r b/bin/chipseq/plot_homer_annotatepeaks.r new file mode 100755 index 00000000..4a867d8f --- /dev/null +++ b/bin/chipseq/plot_homer_annotatepeaks.r @@ -0,0 +1,170 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## LOAD LIBRARIES ## +################################################ +################################################ + +library(optparse) +library(ggplot2) +library(reshape2) +library(scales) + +################################################ +################################################ +## PARSE COMMAND-LINE PARAMETERS ## +################################################ +################################################ + +option_list <- list(make_option(c("-i", "--homer_files"), type="character", default=NULL, help="Comma-separated list of homer annotated text files.", metavar="path"), + make_option(c("-s", "--sample_ids"), type="character", default=NULL, help="Comma-separated list of sample ids associated with homer annotated text files. Must be unique and in same order as homer files input.", metavar="string"), + make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"), + make_option(c("-p", "--outprefix"), type="character", default='homer_annotation', help="Output prefix", metavar="string")) + +opt_parser <- OptionParser(option_list=option_list) +opt <- parse_args(opt_parser) + +if (is.null(opt$homer_files)){ + print_help(opt_parser) + stop("At least one homer annotated file must be supplied", call.=FALSE) +} +if (is.null(opt$sample_ids)){ + print_help(opt_parser) + stop("Please provide sample ids associated with homer files.", call.=FALSE) +} + +if (file.exists(opt$outdir) == FALSE) { + dir.create(opt$outdir,recursive=TRUE) +} + +HomerFiles <- unlist(strsplit(opt$homer_files,",")) +SampleIDs <- unlist(strsplit(opt$sample_ids,",")) +if (length(HomerFiles) != length(SampleIDs)) { + print_help(opt_parser) + stop("Number of sample ids must equal number of homer annotated files.", call.=FALSE) +} + +################################################ +################################################ +## READ IN DATA ## +################################################ +################################################ + +plot.dat <- data.frame() +plot.dist.dat <- data.frame() +plot.feature.dat <- data.frame() +for (idx in 1:length(HomerFiles)) { + + sampleid = SampleIDs[idx] + anno.dat <- read.csv(HomerFiles[idx], sep="\t", header=TRUE) + anno.dat <- anno.dat[,c("Annotation","Distance.to.TSS","Nearest.PromoterID")] + + ## REPLACE UNASSIGNED FEATURE ENTRIES WITH SENSIBLE VALUES + unassigned <- which(is.na(as.character(anno.dat$Distance.to.TSS))) + anno.dat$Distance.to.TSS[unassigned] <- 1000000 + + anno.dat$Annotation <- as.character(anno.dat$Annotation) + anno.dat$Annotation[unassigned] <- 
"Unassigned" + anno.dat$Annotation <- as.factor(anno.dat$Annotation) + + anno.dat$Nearest.PromoterID <- as.character(anno.dat$Nearest.PromoterID) + anno.dat$Nearest.PromoterID[unassigned] <- "Unassigned" + anno.dat$Nearest.PromoterID <- as.factor(anno.dat$Nearest.PromoterID) + + anno.dat$name <- rep(sampleid,nrow(anno.dat)) + anno.dat$Distance.to.TSS <- abs(anno.dat$Distance.to.TSS) + 1 + plot.dat <- rbind(plot.dat,anno.dat) + + ## GET ANNOTATION COUNTS + anno.freq <- as.character(lapply(strsplit(as.character(anno.dat$Annotation)," "), function(x) x[1])) + anno.freq <- as.data.frame(table(anno.freq)) + colnames(anno.freq) <- c("feature",sampleid) + anno.melt <- melt(anno.freq) + plot.feature.dat <- rbind(plot.feature.dat,anno.melt) + + ## GET CLOSEST INSTANCE OF GENE TO ANY GIVEN PEAK + unique.gene.dat <- anno.dat[order(anno.dat$Distance.to.TSS),] + unique.gene.dat <- unique.gene.dat[!duplicated(unique.gene.dat$Nearest.PromoterID), ] + dist.freq <- rep("> 10kb",nrow(unique.gene.dat)) + dist.freq[which(unique.gene.dat$Distance.to.TSS < 10000)] <- "< 10kb" + dist.freq[which(unique.gene.dat$Distance.to.TSS < 5000)] <- "< 5kb" + dist.freq[which(unique.gene.dat$Distance.to.TSS < 2000)] <- "< 2kb" + dist.freq <- as.data.frame(table(dist.freq)) + colnames(dist.freq) <- c("distance",sampleid) + dist.melt <- melt(dist.freq) + plot.dist.dat <- rbind(plot.dist.dat,dist.melt) + +} +plot.dat$name <- factor(plot.dat$name, levels=sort(unique(as.character(plot.dat$name)))) +plot.dist.dat$variable <- factor(plot.dist.dat$variable, levels=sort(unique(as.character(plot.dist.dat$variable)))) +plot.feature.dat$variable <- factor(plot.feature.dat$variable, levels=sort(unique(as.character(plot.feature.dat$variable)))) + +summary.dat <- dcast(plot.feature.dat, variable ~ feature, value.var="value") +colnames(summary.dat)[1] <- "sample" +write.table(summary.dat,file=file.path(opt$outdir,paste(opt$outprefix,".summary.txt",sep="")),sep="\t",row.names=F,col.names=T,quote=F) + +################################################ +################################################ +## PLOTS ## +################################################ +################################################ + +PlotFile <- file.path(opt$outdir,paste(opt$outprefix,".plots.pdf",sep="")) +pdf(PlotFile,height=6,width=3*length(HomerFiles)) + +## FEATURE COUNT STACKED BARPLOT +plot <- ggplot(plot.feature.dat, aes(x=variable, y=value, group=feature)) + + geom_bar(stat="identity", position = "fill", aes(colour=feature,fill=feature), alpha = 0.3) + + xlab("") + + ylab("% Feature") + + ggtitle("Peak Location Relative to Annotation") + + scale_y_continuous(labels = percent_format()) + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_blank(), + axis.text.y = element_text(colour="black"), + axis.text.x= element_text(colour="black",face="bold"), + axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"), + axis.line.y = element_line(size = 1, colour = "black", linetype = "solid")) +print(plot) + +## DISTANCE TO CLOSEST GENE ACROSS ALL PEAKS STACKED BARPLOT +plot <- ggplot(plot.dist.dat, aes(x=variable, y=value, group=distance)) + + geom_bar(stat="identity", position = "fill", aes(colour=distance,fill=distance), alpha = 0.3) + + xlab("") + + ylab("% Unique genes to closest peak") + + ggtitle("Distance of Closest Peak to Gene") + + scale_y_continuous(labels = percent_format()) + + theme(panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background 
= element_blank(), + axis.text.y = element_text(colour="black"), + axis.text.x= element_text(colour="black",face="bold"), + axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"), + axis.line.y = element_line(size = 1, colour = "black", linetype = "solid")) +print(plot) + +## VIOLIN PLOT OF PEAK DISTANCE TO TSS +plot <- ggplot(plot.dat, aes(x=name, y=Distance.to.TSS)) + + geom_violin(aes(colour=name,fill=name), alpha = 0.3) + + geom_boxplot(width=0.1) + + xlab("") + + ylab(expression(log[10]*" distance to TSS")) + + ggtitle("Peak Distribution Relative to TSS") + + scale_y_continuous(trans='log10',breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + + theme(legend.position="none", + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_blank(), + axis.text.y = element_text(colour="black"), + axis.text.x= element_text(colour="black",face="bold"), + axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"), + axis.line.y = element_line(size = 1, colour = "black", linetype = "solid")) +print(plot) +dev.off() + +################################################ +################################################ +################################################ +################################################ diff --git a/bin/chipseq/plot_macs_qc.r b/bin/chipseq/plot_macs_qc.r new file mode 100755 index 00000000..b8e25d56 --- /dev/null +++ b/bin/chipseq/plot_macs_qc.r @@ -0,0 +1,155 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## LOAD LIBRARIES ## +################################################ +################################################ + +library(optparse) +library(ggplot2) +library(reshape2) +library(scales) + +################################################ +################################################ +## PARSE COMMAND-LINE PARAMETERS ## +################################################ +################################################ + +option_list <- list(make_option(c("-i", "--peak_files"), type="character", default=NULL, help="Comma-separated list of peak files.", metavar="path"), + make_option(c("-s", "--sample_ids"), type="character", default=NULL, help="Comma-separated list of sample ids associated with peak files. 
Must be unique and in same order as peaks files input.", metavar="string"), + make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"), + make_option(c("-p", "--outprefix"), type="character", default='macs2_peakqc', help="Output prefix", metavar="string")) + +opt_parser <- OptionParser(option_list=option_list) +opt <- parse_args(opt_parser) + +if (is.null(opt$peak_files)){ + print_help(opt_parser) + stop("At least one peak file must be supplied", call.=FALSE) +} +if (is.null(opt$sample_ids)){ + print_help(opt_parser) + stop("Please provide sample ids associated with peak files.", call.=FALSE) +} + +if (file.exists(opt$outdir) == FALSE) { + dir.create(opt$outdir,recursive=TRUE) +} + +PeakFiles <- unlist(strsplit(opt$peak_files,",")) +SampleIDs <- unlist(strsplit(opt$sample_ids,",")) +if (length(PeakFiles) != length(SampleIDs)) { + print_help(opt_parser) + stop("Number of sample ids must equal number of homer annotated files.", call.=FALSE) +} + +################################################ +################################################ +## READ IN DATA ## +################################################ +################################################ + +plot.dat <- data.frame() +summary.dat <- data.frame() +for (idx in 1:length(PeakFiles)) { + + sampleid = SampleIDs[idx] + isNarrow <- FALSE + header <- c("chrom","start","end","name","pileup", "strand", "fold", "-log10(pvalue)","-log10(qvalue)") + fsplit <- unlist(strsplit(basename(PeakFiles[idx]), split='.',fixed=TRUE)) + if (fsplit[length(fsplit)] == 'narrowPeak') { + isNarrow <- TRUE + header <- c(header,"summit") + } + peaks <- read.table(PeakFiles[idx], sep="\t", header=FALSE) + colnames(peaks) <- header + + ## GET SUMMARY STATISTICS + peaks.dat <- peaks[,c('fold','-log10(qvalue)','-log10(pvalue)')] + peaks.dat$length <- (peaks$end - peaks$start) + for (cname in colnames(peaks.dat)) { + sdat <- summary(peaks.dat[,cname]) + sdat["num_peaks"] <- nrow(peaks.dat) + sdat["measure"] <- cname + sdat["sample"] <- sampleid + sdat <- t(data.frame(x=matrix(sdat),row.names=names(sdat))) + summary.dat <- rbind(summary.dat,sdat) + } + colnames(peaks.dat) <- c('fold','fdr','pvalue','length') + peaks.dat$name <- rep(sampleid,nrow(peaks.dat)) + plot.dat <- rbind(plot.dat,peaks.dat) +} +plot.dat$name <- factor(plot.dat$name, levels=sort(unique(as.character(plot.dat$name)))) + +SummaryFile <- file.path(opt$outdir,paste(opt$outprefix,".summary.txt",sep="")) +write.table(summary.dat,file=SummaryFile,quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE) + +################################################ +################################################ +## PLOTS ## +################################################ +################################################ + +## RETURNS VIOLIN PLOT OBJECT +violin.plot <- function(plot.dat,x,y,ylab,title,log) { + + plot <- ggplot(plot.dat, aes_string(x=x, y=y)) + + geom_violin(aes_string(colour=x,fill=x), alpha = 0.3) + + geom_boxplot(width=0.1) + + xlab("") + + ylab(ylab) + + ggtitle(title) + + theme(legend.position="none", + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_blank(), + axis.text.y = element_text(colour="black"), + axis.text.x= element_text(colour="black",face="bold"), + axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"), + axis.line.y = element_line(size = 1, colour = "black", linetype = "solid")) + if (log == 10) { + plot <- plot + 
scale_y_continuous(trans='log10',breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + } + if (log == 2) { + plot <- plot + scale_y_continuous(trans='log2',breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + } + return(plot) +} + +############################ + +PlotFile <- file.path(opt$outdir,paste(opt$outprefix,".plots.pdf",sep="")) +pdf(PlotFile,height=6,width=3*length(unique(plot.dat$name))) + +## PEAK COUNT PLOT +peak.count.dat <- as.data.frame(table(plot.dat$name)) +colnames(peak.count.dat) <- c("name","count") +plot <- ggplot(peak.count.dat, aes(x=name, y=count)) + + geom_bar(stat="identity",aes(colour=name,fill=name), position = "dodge", width = 0.8, alpha = 0.3) + + xlab("") + + ylab("Number of peaks") + + ggtitle("Peak count") + + theme(legend.position="none", + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + panel.background = element_blank(), + axis.text.y = element_text(colour="black"), + axis.text.x= element_text(colour="black",face="bold"), + axis.line.x = element_line(size = 1, colour = "black", linetype = "solid"), + axis.line.y = element_line(size = 1, colour = "black", linetype = "solid")) + + geom_text(aes(label = count, x = name, y = count), position = position_dodge(width = 0.8), vjust = -0.6) +print(plot) + +## VIOLIN PLOTS +print(violin.plot(plot.dat=plot.dat,x="name",y="length",ylab=expression(log[10]*" peak length"),title="Peak length distribution",log=10)) +print(violin.plot(plot.dat=plot.dat,x="name",y="fold",ylab=expression(log[2]*" fold-enrichment"),title="Fold-change distribution",log=2)) +print(violin.plot(plot.dat=plot.dat,x="name",y="fdr",ylab=expression(-log[10]*" qvalue"),title="FDR distribution",log=-1)) +print(violin.plot(plot.dat=plot.dat,x="name",y="pvalue",ylab=expression(-log[10]*" pvalue"),title="Pvalue distribution",log=-1)) +dev.off() + +################################################ +################################################ +################################################ +################################################ diff --git a/bin/chipseq/plot_peak_intersect.r b/bin/chipseq/plot_peak_intersect.r new file mode 100755 index 00000000..513e44b3 --- /dev/null +++ b/bin/chipseq/plot_peak_intersect.r @@ -0,0 +1,78 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## LOAD LIBRARIES ## +################################################ +################################################ + +library(optparse) +library(UpSetR) + +################################################ +################################################ +## PARSE COMMAND-LINE PARAMETERS ## +################################################ +################################################ + +option_list <- list(make_option(c("-i", "--input_file"), type="character", default=NULL, help="Path to tab-delimited file containing two columns i.e sample1&sample2&sample3 indicating intersect between samples set size.", metavar="path"), + make_option(c("-o", "--output_file"), type="character", default=NULL, help="Path to output file with '.pdf' extension.", metavar="path")) + +opt_parser <- OptionParser(option_list=option_list) +opt <- parse_args(opt_parser) + +if (is.null(opt$input_file)){ + print_help(opt_parser) + stop("Input file must be supplied.", call.=FALSE) +} +if (is.null(opt$output_file)){ + print_help(opt_parser) + stop("Output pdf file must be supplied.", 
call.=FALSE) +} + +OutDir <- dirname(opt$output_file) +if (file.exists(OutDir) == FALSE) { + dir.create(OutDir,recursive=TRUE) +} + +################################################ +################################################ +## PLOT DATA ## +################################################ +################################################ + +comb.dat <- read.table(opt$input_file,sep="\t",header=FALSE) +comb.vec <- comb.dat[,2] +comb.vec <- setNames(comb.vec,comb.dat[,1]) +sets <- sort(unique(unlist(strsplit(names(comb.vec),split='&'))), decreasing = TRUE) + +nintersects = length(names(comb.vec)) +if (nintersects > 70) { + nintersects <- 70 + comb.vec <- sort(comb.vec, decreasing = TRUE)[1:70] + sets <- sort(unique(unlist(strsplit(names(comb.vec),split='&'))), decreasing = TRUE) +} + +pdf(opt$output_file,onefile=F,height=10,width=20) + +upset( + fromExpression(comb.vec), + nsets = length(sets), + nintersects = nintersects, + sets = sets, + keep.order = TRUE, + sets.bar.color = "#56B4E9", + point.size = 3, + line.size = 1, + mb.ratio = c(0.55, 0.45), + order.by = "freq", + number.angles = 30, + text.scale = c(1.5, 1.5, 1.5, 1.5, 1.5, 1.2) +) + +dev.off() + +################################################ +################################################ +################################################ +################################################ diff --git a/bin/chipseq/scrape_software_versions.py b/bin/chipseq/scrape_software_versions.py new file mode 100755 index 00000000..6f61aa86 --- /dev/null +++ b/bin/chipseq/scrape_software_versions.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +from __future__ import print_function +from collections import OrderedDict +import re + +regexes = { + 'nf-core/chipseq': ['v_pipeline.txt', r"(\S+)"], + 'Nextflow': ['v_nextflow.txt', r"(\S+)"], + 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], + 'Trim Galore!': ['v_trim_galore.txt', r"version (\S+)"], + 'BWA': ['v_bwa.txt', r"Version: (\S+)"], + 'Samtools': ['v_samtools.txt', r"samtools (\S+)"], + 'BEDTools': ['v_bedtools.txt', r"bedtools v(\S+)"], + 'BamTools': ['v_bamtools.txt', r"bamtools (\S+)"], + 'deepTools': ['v_deeptools.txt', r"plotFingerprint (\S+)"], + 'Picard': ['v_picard.txt', r"\n(\S+)"], + 'R': ['v_R.txt', r"R version (\S+)"], + 'Pysam': ['v_pysam.txt', r"(\S+)"], + 'MACS2': ['v_macs2.txt', r"macs2 (\S+)"], + 'HOMER': ['v_homer.txt', r"(\S+)"], + 'featureCounts': ['v_featurecounts.txt', r"featureCounts v(\S+)"], + 'Preseq': ['v_preseq.txt', r"Version: (\S+)"], + 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], +} + +results = OrderedDict() +results['nf-core/chipseq'] = 'N/A' +results['Nextflow'] = 'N/A' +results['FastQC'] = 'N/A' +results['Trim Galore!'] = 'N/A' +results['BWA'] = 'N/A' +results['Samtools'] = 'N/A' +results['BEDTools'] = 'N/A' +results['BamTools'] = 'N/A' +results['deepTools'] = 'N/A' +results['Picard'] = 'N/A' +results['R'] = 'N/A' +results['Pysam'] = 'N/A' +results['MACS2'] = 'N/A' +results['HOMER'] = False +results['featureCounts'] = 'N/A' +results['Preseq'] = 'N/A' +results['MultiQC'] = 'N/A' + +# Search each file using its regex +for k, v in regexes.items(): + try: + with open(v[0]) as x: + versions = x.read() + match = re.search(v[1], versions) + if match: + results[k] = "v{}".format(match.group(1)) + except IOError: + results[k] = False + +# Remove software set to false in results +for k in list(results): + if not results[k]: + del(results[k]) + +# Dump to YAML +print (''' +id: 'software_versions' +section_name: 'nf-core/chipseq Software Versions' 
+section_href: 'https://github.com/nf-core/chipseq'
+plot_type: 'html'
+description: 'are collected at run time from the software output.'
+data: |
+    <dl class="dl-horizontal">
+''')
+for k,v in results.items():
+    print("        <dt>{}</dt><dd><samp>{}</samp></dd>".format(k,v))
+print ("    </dl>
") + +# Write out regexes as csv file: +with open('software_versions.csv', 'w') as f: + for k,v in results.items(): + f.write("{}\t{}\n".format(k,v)) diff --git a/bin/help/amplicon.nf b/bin/help/amplicon.nf new file mode 100644 index 00000000..b24bc6d4 --- /dev/null +++ b/bin/help/amplicon.nf @@ -0,0 +1,9 @@ +def help(){ + println ''' +Parameter | Default | Description + +WORKFLOW NOT OFFICALLY SUPPORTED AT THIS TIME. + +''' +} + diff --git a/bin/help/atac.nf b/bin/help/atac.nf index e193b241..12698cc9 100644 --- a/bin/help/atac.nf +++ b/bin/help/atac.nf @@ -11,11 +11,14 @@ Parameter | Default | Description --extension | .fastq.gz | The expected extension for the input read files. --pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz --read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE). ---concat_lanes | false | Options: false and true. Default: false. If this boolean is specific, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes. +--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes. +--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified. +--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs. ---gen_org | mouse | Options: mouse and human. +--gen_org | mouse | Options: mouse or human. +--genome_build | 'GRCm38' | Options: GRCm38 or GRCm39 ---effective_genome_size | The length of the “mappable” genome. | Mouse only - Please see : 'https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html'. +--effective_genome_size | The length of the “mappable” genome. | See : 'https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html'. --chain | null | The default value for Mouse Reference Strain - g2gtools chain file to adjust coordinates to reference diff --git a/bin/help/chipseq.nf b/bin/help/chipseq.nf new file mode 100644 index 00000000..259fdbcc --- /dev/null +++ b/bin/help/chipseq.nf @@ -0,0 +1,65 @@ +def help(){ + println ''' +Parameter | Default | Description + +--pubdir | / | The directory that the saved outputs will be stored. +--organize_by | sample | How to organize the output folder structure. Options: sample or analysis. +--cacheDir | /projects/omics_share/meta/containers | This is directory that contains cached Singularity containers. JAX users should not change this parameter. +-w | / | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /fastscratch or other directory with ample storage. + +--input | / | The path to the design file that contains all the samples to be run by the pipeline. - For design file format, please see : 'https://nf-co.re/chipseq/1.2.1/usage' +--extension | .fastq.gz | The expected extension for the input read files. 
+--pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz +--read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE). + +--gen_org | mouse | Options: mouse and human. +--genome_build | 'GRCm38' | Mouse specific. Options: GRCm38 or GRCm39. If gen_org == human, build defaults to GRCm38. + +--fragment_size | 200 | Number of base pairs to extend single-end reads when creating bigWig files (Default: 200) +--fingerprint_bins | 500000 | Number of genomic bins to use when generating the deepTools fingerprint plot. Larger numbers will give a smoother profile, but take longer to run (Default: 500000) +--gtf | The full path to GTF file for annotating peaks and the GTF file should resemble the Ensembl format +--gene_bed | The full path to BED file for genome-wide gene intervals + +--ref_fa | Mouse: '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.fa' + | Human: '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' +--ref_fa_indices | Mouse: '/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.primary_assembly.fa' | The default value for mm10. + | Human: '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + | Pre-compiled BWA index files, points to human reference when --gen_org human. + +--macs_gsize | Effective genome size parameter required by MACS2 | if this parameter is not specified then the MACS2 peak-calling and differential analysis will be skipped +--blacklist | The BED format file, if provided, alignments that overlap with the regions in this file will be filtered out | Please see : 'https://sites.google.com/site/anshulkundaje/projects/blacklists' + +--trimLength | 30 | Discard reads that became shorter than length 'INT' because of either quality or adapter trimming. A value of 0 effectively disables this behaviour. +--qualThreshold | 30 | Trim low-quality ends from reads in addition to adapter removal. For RRBS samples, quality trimming will be performed first, and adapter trimming is carried in a second round. Other files are quality and adapter trimmed in a single pass. The algorithm is the same as the one used by BWA (Subtract INT from all qualities; compute partial sums from all indices to the end of the sequence; cut sequence at the index at which the sum is minimal). +--adapOverlap | 1 | Stringency for overlap with adapter sequence required to trim a sequence. Defaults to a very stringent setting of 1, i.e. even a single base pair of overlapping sequence will be trimmed of the 3' end of any read. +--adaptorSeq | 'AGATCGGAAGAGC' | Adapter sequence to be trimmed. This sequence is the standard Illumina adapter sequence. + +--mismatch_penalty | '' | The BWA penalty for a mismatch. 
+--bwa_min_score | false | Don’t output BWA MEM alignments with score lower than this parameter (Default: false) +--keep_dups | false | Duplicate reads are not filtered from alignments (Default: false) +--keep_multi_map | false | Reads mapping to multiple locations in the genome are not filtered from alignments (Default: false) +--bamtools_filter_pe_config | / | The path to bamtools_filter_pe.json for paired end (PE) +--bamtools_filter_se_config | / | The path to bamtools_filter_se.json for single end (SE) + | The configuration file used while running bamtools filter + +--narrow_peak | false | MACS2 is run by default with the --broad flag. Specify this flag to call peaks in narrowPeak mode (Default: false) +--broad_cutoff | 0.1 | Specifies broad cut-off value for MACS2. Only used when --narrow_peak isnt specified (Default: 0.1) + +--skip_preseq | false | Skip Preseq +--skip_peak_qc | false | Skip MACS2 peak QC plot generation (Default: false) +--skip_peak_annotation | false | Skip MACS2 peak QC plot generation (Default: false) +--skip_consensus_peaks | false | Skip consensus peak generation, annotation and counting (Default: false) +--skip_diff_analysis | false | Skip differential binding analysis with DESeq2 (Default: false) +--deseq2_vst | false | Use vst transformation instead of rlog with DESeq2. (Default: false) +--macs_fdr | false | Minimum FDR (q-value) cutoff for peak detection, --macs_fdr and --macs_pvalue are mutually exclusive (Default: false) +--macs_pvalue | false | p-value cutoff for peak detection (Default: false). +--min_reps_consensus | 1 | Number of biological replicates required from a given condition for a peak to contribute to a consensus peak (Default: 1) +--save_macs_pileup | false | Instruct MACS2 to create bedGraph files using the -B --SPMR parameters (Default: false). + +--multiqc_config | / | The path to chipseq.yaml + | The configuration file used while running MultiQC + +--tmpdir | / | Temporary directory to store temp files. +''' +} + diff --git a/bin/help/pdx_wes.nf b/bin/help/pdx_wes.nf new file mode 100644 index 00000000..7a9aa519 --- /dev/null +++ b/bin/help/pdx_wes.nf @@ -0,0 +1,48 @@ +def help(){ + println ''' +Parameter | Default | Description + +--pubdir | / | The directory that the saved outputs will be stored. +--organize_by | sample | How to organize the output folder structure. Options: sample or analysis. +--cacheDir | /projects/omics_share/meta/containers | This is directory that contains cached Singularity containers. JAX users should not change this parameter. +-w | / | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /fastscratch or other directory with ample storage. + +--sample_folder | / | The path to the folder that contains all the samples to be run by the pipeline. The files in this path can also be symbolic links. +--extension | .fastq.gz | The expected extension for the input read files. +--pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz +--read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE). +--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes. 
+--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified. +--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs. + +--gen_org | human | Options: human only. + +--ref_fa | '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' | The reference fasta to be used throughout the process for alignment as well as any downstream analysis. JAX users should not change this parameter. + +--ref_fa_indices | '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' | Pre-compiled BWA index files. JAX users should not change this parameter. + +--min_pct_hq_reads | 0.0 | The minimum percent of high-quality reads passing when trimming the fastq files. + +--target_gatk | '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed' | A bed file with WES target intervals as defined in the capture array used in the data. NOTE: This file MUST reflect the capture array used to generate your data. + +--target_picard | '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.picard.interval_list' | A GATK interval file covering WES target intervals. Used in calculating coverage metrics. NOTE: This file MUST reflect the capture array used to generate your data. + +--bait_picard | '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.picard.interval_list' | A GATK interval file covering WES target intervals. Used in calculating coverage metrics. This file can be the same as the interval file, NOTE: This file MUST reflect the capture array used to generate your data. + +--mismatch_penalty | -B 8 | The BWA penalty for a mismatch. +--call_val | 50 | The minimum phred-scaled confidence threshold at which variants should be called. +--ploidy_val | '-ploidy 2' | Sample ploidy + +--dbSNP | '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' | The dbSNP database contains known single nucleotide polymorphisms, and is used in the annotation of known variants. Points to human dbSNP when --gen_org human. JAX users should not change this parameter. + +--gen_ver | 'hg38' | snpEff genome version. Sets to 'hg38' when --gen_org human JAX users should not change this parameter. + +--snpEff_config | Human: '/projects/omics_share/human/GRCh38/genome/indices/snpEff_5_1/snpEff.config' | The configuration file used while running snpEff, points to human snpEff file when --gen_org human. JAX users should not change this parameter. + +--gold_std_indels | '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz’ | Human Only - Used in GATK BaseRecalibrator. JAX users should not change this parameter. +--phase1_1000G | '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' | Human Only - Used in GATK BaseRecalibrator. JAX users should not change this parameter. 
+--dbNSFP | '/projects/omics_share/human/GRCh38/genome/annotation/function/dbNSFP4.2a.gatk_formatted.txt.gz' | Human Only - Used in variant annotation. +--cosmic | '/projects/omics_share/human/GRCh38/genome/annotation/function/COSMICv95_Coding_Noncoding.gatk_formatted.vcf' | Human Only - Used in variant annotation. +''' +} + diff --git a/bin/help/pta.nf b/bin/help/pta.nf new file mode 100644 index 00000000..f839aae7 --- /dev/null +++ b/bin/help/pta.nf @@ -0,0 +1,86 @@ +def help(){ + println ''' +Parameter | Default | Description + +--pubdir | / | The directory that the saved outputs will be stored. +--organize_by | sample | How to organize the output folder structure. Options: sample or analysis. +--cacheDir | /projects/omics_share/meta/containers | This is directory that contains cached Singularity containers. JAX users should not change this parameter. +-w | / | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /fastscratch or other directory with ample storage. + +--csv_input | / | CSV delimited sample sheet that controls how samples are processed. The required input header is: patient,sex,status,sampleID,lane,fastq_1,fastq_2. See the repository wiki (https://github.com/TheJacksonLaboratory/cs-nf-pipelines/wiki) for additional information. + +--xenome_prefix | /projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25| Xenome index for deconvolution of human and mouse reads. Used when `--pdx` is run. +--pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. + +--min_pct_hq_reads | 0.0 | The minimum percent of high-quality reads passing when trimming the fastq files to continue with the analysis. 0.0 disables this filter. +--hq_pct | 70 | The percentage of bases within a read that must be high quality for the read to pass filtering" + +--ref_fa | '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' | The reference fasta to be used throughout the process for alignment as well as any downstream analysis, points to human reference when --gen_org human. JAX users should not change this parameter. +--ref_fa_indices | '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' | Pre-compiled BWA index files. JAX users should not change this parameter. + +--ref_fa_dict | '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.dict' | FASTA dictonary file. JAX users should not change this parameter. +--combined_reference_set | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/combined_ref_set/Homo_sapiens_assembly38.fasta' | Several tools (GRIDSS, SVABA) requires reference and bwa index files in same directory. Links used within this directory to avoid duplication of fasta and bwa indicies. See note in directory. + +--mismatch_penalty | -B 8 | The BWA penalty for a mismatch. + +--gold_std_indels | '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz’ | Used in GATK BaseRecalibrator and variant tranche recalibration derived from the GATK resource bundle. JAX users should not change this parameter. 
+--phase1_1000G | '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' | Used in GATK BaseRecalibrator derived from the GATK resource bundle. JAX users should not change this parameter. +--dbSNP | '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' | Used in variant annotation, GATK BaseRecalibrator, variant tranche recalibration, and by SVABA. JAX users should not change this parameter. +--dbSNP_index | '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' | Index associated with the dbsnp file. + +--chrom_contigs | '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.primaryChr.contig_list' | Contig list used for scatter / gather in calling and annotation. +--chrom_intervals | '/projects/omics_share/human/GRCh38/genome/annotation/intervals/hg38_calling_intervals/' | Chromosome intervals used for scatter gather in calling. + +--call_val | 50 | The minimum phred-scaled confidence threshold at which variants should be called. +--ploidy_val | '-ploidy 2' | Sample ploidy used by Haplotypecaller in germline small variant calling. + +--excludeIntervalList | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/hg38_haplotypeCaller_skip.interval_list' | Germline caller exclusion list. +--hapmap | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/hapmap_3.3.hg38.vcf.gz' | variant tranche recalibration requirement derived from the GATK resource bundle. +--omni | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_omni2.5.hg38.vcf.gz' | variant tranche recalibration requirement derived from GATK resource bundle. + +--pon_bed | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/WGS_1000g_GRCh38.pon.bed' | Panel of normal samples used in in snp and indel filtering. +--intervalListBed | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/SureSelect_V6plusCOSMIC.target.GRCh38_full_analysis_set_plus_decoy_hla.interval_list.bed' | This file is used to extract small variants in non-exonic regions. Such calls are then attempted to be recovered via Lancet calls. + +--lancet_beds_directory | '/projects/omics_share/human/GRCh38/genome/annotation/intervals/lancet_chr_beds/' | Lancet interval bed files used in calling by that tool. + +--mappability_directory | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/mappability' | Bicseq2 input requirement. Derived from the tool developer resource pack. +--bicseq2_chromList | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/configs/sampleId.bicseq2.config' | Bicseq2 config requirement. Derived from the tool developer resource pack. +--bicseq2_no_scaling | false | false: estimate 'lamda' smoothing factor from data for CNV profile calling. true: Use standard 'lamda | 4' smoothing for CNV profile calling. If BicSeq2 fails with an error, set this parameter to 'true'. + +--germline_filtering_vcf | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/gnomad-and-ALL_GRCh38_sites.20170504.normalized.modified.PASS.vcf.gz' | Germline reference file used in Gridss SV call filtering. Provided by the tool developer resource pack. +--gripss_pon | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/gripss_pon' | Panel of normal files for Gripss SV call filering. Provided by the tool developer resource pack. 
+ +--callRegions | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/GRCh38.callregions.bed.gz' | Manta calling regions. Provided by the tool developer resource pack. + +--strelka_config | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/configs/configureStrelkaSomaticWorkflow.py.ini' | Strelka input configuration. Provided by the tool developer resource pack. + +--msisensor_model | '/projects/compsci/omics_share/human/GRCh38/supporting_files/msisensor2/models_hg38' | Model files for MSI calling via MSIsensor2. Provided by the tool developer resource pack. + +--vep_cache_directory | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/vep_data' | VEP annotation cache. Cache provided is for Ensembl v109. +--vep_fasta | '/projects/compsci/omics_share/human/GRCh38/genome/sequence/ensembl/GRCh38.p13/Homo_sapiens.GRCh38.dna.primary_assembly.fa' | VEP requires an ensembl based fasta. GRCh38.p13 is used for v97-v109. + +--cosmic_cgc | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/function/cancer_gene_census_v97.csv' | COSMIC Cancer Gene Census annotation file. Index for file required within same location. +--cosmic_cancer_resistance_muts | '/projects/compsci/omics_share/human/GRCh38/genome/annotation/function/CosmicResistanceMutations.tsv.gz' | COSMIC Resistance Mutations file. Index for file required within same location. + +--ensembl_entrez | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh39.p13_ensemblv109_entrez_id_map.csv' | Ensembl to Entrez gene ID to HGNC symbol mapping file. used in somatic vcf finalization. + +--cytoband | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh38.cytoBand.UCSC.chr.sorted.txt' | File used in bicseq2 annotations +--dgv | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/DGV.GRCh38_hg38_variants_2020-02-25.bed' | File used in bicseq2 annotations +--thousandG | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1KGP.CNV.GRCh38.canvas.merged.bed' | File used in bicseq2 annotations +--cosmicUniqueBed | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/CosmicCompleteCNA_uniqIntervals.bed' | File used in bicseq2 annotations +--cancerCensusBed | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/cancer_gene_census.GRCh38-v92.bed' | File used in bicseq2 annotations and SV annotation. +--ensemblUniqueBed | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/ensembl_genes_unique_sorted.final.v93.chr.sorted.bed' | File used in bicseq2 annotations and SV annotation. +--gap | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh38.gap.UCSC.annotated.chr.sorted.bed' | File used in SV annotation. +--dgvBedpe | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/DGV.GRCh38_hg38_variants_2020-02-25.bedpe' | File used in SV annotation. +--thousandGVcf | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1KGP.pruned_wAFs.PASS_and_MULTIALLELIC_Mosaic.GRCh38.vcf' | File used in SV annotation. +--svPon | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1000G-SV-PON.survivor-merged.GRCh38.filtered.bedpe' | File used in SV annotation. 
+--cosmicBedPe | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/cosmic-sv-GRCh38-v92.bedpe' | File used in SV annotation. + +--na12878_bam | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/NA12878/NA12878_realigned_BQSR.bam' | NA12878 BAM file. Used in un-paired sample analysis. +--na12878_bai | '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/NA12878/NA12878_realigned_BQSR.bai' | NA12878 BAM index file. Used in un-paired sample analysis. +--na12878_sampleName | 'ERR194147_1.fastq.gz_filtered_trimmed' | NA12878 sample name within the NA12878 BAM file. + +--read_type | PE | Only 'PE' is accepted for this workflow. + +''' +} diff --git a/bin/help/rna_fusion.nf b/bin/help/rna_fusion.nf new file mode 100644 index 00000000..dc17726b --- /dev/null +++ b/bin/help/rna_fusion.nf @@ -0,0 +1,54 @@ +def help(){ + println ''' +Parameter | Default | Description + +--pubdir | / | The directory that the saved outputs will be stored. +--organize_by | sample | How to organize the output folder structure. Options: sample or analysis. +--cacheDir | /projects/omics_share/meta/containers | This is directory that contains cached Singularity containers. JAX users should not change this parameter. +-w | / | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /fastscratch or other directory with ample storage. + +--sample_folder | / | The path to the folder that contains all the samples to be run by the pipeline. The files in this path can also be symbolic links. +--extension | .fastq.gz | The expected extension for the input read files. +--pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz +--read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE). +--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes. +--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified. +--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs. + +--gen_org | mouse | Options: mouse and human. + +--xenome_prefix | /projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25| Xenome index for deconvolution of human and mouse reads. Used when `--pdx` is run. +--read_length | 150 | Options: 75, 100, 150. Changed relative to sample read length. +--star_index | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/star/star-2.7.4a-150bp | STAR index used by several tools. Change the index relative to sample read length. Read length options: 75, 100, 150. +--star_fusion_star_index | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/starfusion/star-150 | STAR-fusion index. Change the index relative to sample read length. Read length options: 75, 100, 150. 
+ +--gencode_gtf | /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/gencode/gencode.v37.annotation.gtf.revised.custom.gtf | GTF file used by several callers (Arriba, Pizzly, Squid, StarFusion). This file is used to build STAR refrences for these callers and it should not be changed unless other indicies are also updated. +--ensembl_gtf | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.gtf | GTF file used by JAFFA. Jaffa uses a slightly different implmentation of calling etc., and this is the recommended GTF. +--fasta | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.all.fa | Genomic FASTA file used by fusion callers. STAR refrences were built from this file, and it should not be changed unless other indicies are also updated. + +--arriba_star_args | | Arriba recommended argument string for STAR alignment. See the rna_fusion.config file for specific arguments used. +--arriba_blacklist | /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/blacklist_hg38_GRCh38_v2.4.0.tsv.gz | Arriba provided blacklist of difficult regions for fusion calling: https://arriba.readthedocs.io/en/latest/input-files/ +--arriba_known_fusions | /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/known_fusions_hg38_GRCh38_v2.4.0.tsv.gz | Arriba provided list of known fusions: https://arriba.readthedocs.io/en/latest/input-files/ +--arriba_protein_domains | /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/protein_domains_hg38_GRCh38_v2.4.0.gff3 | Arriba provided known protein domains: https://arriba.readthedocs.io/en/latest/input-files/ + +--fusioncatcher_ref | /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/fusioncatcher/human_v102 | Fusion catcher provided reference files: http://sourceforge.net/projects/fusioncatcher/files/data/ +--fusioncatcher_limitSjdbInsertNsj | 2000000 | STAR option used by fusioncatcher: maximum number of junction to be inserted to the genome on the fly at the mapping stage + +--jaffa_ref_dir | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/jaffa/ | Jaffa provided reference files: https://github.com/Oshlack/JAFFA/wiki/Download + +--kallisto_index | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/pizzly/Homo_sapiens.GRCh38.102.cdna.all.kallisto-0.48.0.index | Kallisto alignment index. +--transcript_fasta | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.cdna.all.fa.gz | Transcriptome FASTA file used by Pizzly. + +--squid_star_args | | Squid recommended argument string for STAR alignment. See the rna_fusion.config file for specific arguments used. + +--star_fusion_ref | /projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/starfusion/ctat_genome_lib_build_dir | star-fusion reference file set. Build from the above GTF and FASTA. +--star_fusion_opt | null | Additional star-fusion options can be provided. + +--fusion_report_opt | null | Additional fusion-report options can be provided. +--databases | /projects/compsci/omics_share/human/GRCh38/supporting_files/rna_fusion_dbs | Fusion-report databases of known fusion events. Used in report generation only. + +--pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human only reads are used in analysis. 
+ +''' +} + diff --git a/bin/help/rnaseq.nf b/bin/help/rnaseq.nf index 71168e8d..67480817 100644 --- a/bin/help/rnaseq.nf +++ b/bin/help/rnaseq.nf @@ -11,13 +11,26 @@ Parameter | Default | Description --extension | .fastq.gz | The expected extension for the input read files. --pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz --read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE). ---concat_lanes | false | Options: false and true. Default: false. If this boolean is specific, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes. +--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes. +--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified. +--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs. --gen_org | mouse | Options: mouse and human. +--genome_build | 'GRCm38' | Mouse specific. Options: GRCm38 or GRCm39. If gen_org == human, build defaults to GRCm38. ---read_prep | 'reverse_stranded' | Options: 'reverse_stranded', 'forward_stranded' or 'non_stranded'. This determines how RNA quantification is done, and statistics are calculated. It is based on the library strandedness. +--pdx | false | Options: true or false. If 'true' Xenome is run to remove mouse reads from samples. +--xenome_prefix | '/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/trans_human_GRCh38_84_NOD_based_on_mm10_k25' | Pre-compiled Xenome classification index files. Used if PDX analysis is specified. + +--min_pct_hq_reads | 0.0 | The minimum percent of high-quality reads passing when trimming the fastq files to continue with the analysis. 0.0 disables this filter. +--hq_pct | 70 | The percentage of bases within a read that must be high quality for the read to pass filtering" + +--strandedness_ref | Mouse: '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104/kallisto/kallisto_index' + | Human: '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104/kallisto/kallisto_index' + | Modfied kallisto index file used in strandedness determination. +--strandedness_gtf | Mouse: '/projects/compsci/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.gtf' + | Human: '/projects/compsci/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.gtf' + | GTF file used with kallisto index file used in strandedness determination. ---min_pct_hq_reads| '0.0' | The minimum percent of high-quality reads passing when trimming the fastq files. --rsem_ref_files | /projects/omics_share/mouse/GRCm38/transcriptome/indices/ensembl/v102/bowtie2 | Pre-compiled index files. Refers to human indices when --gen_org human. JAX users should not change this, unless using STAR indices. 
--rsem_ref_prefix | 'Mus_musculus.GRCm38.dna.toplevel' | Prefix for index files. JAX users should not change this, unless using STAR indices. Refers to human indices when --gen_org human.

--seed_length | 25 | 'Seed length used by the read aligner. Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter.'
@@ -35,6 +48,8 @@ Parameter | Default | Description
                | Human: '/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr_patch_hapl_scaff.rRNA.interval_list'
                | The coverage metric calculation step requires this file. Refers to human assembly when --gen_org human. JAX users should not change this parameter.

+--pdx | false | Options: false, true. If specified, 'Xenome' is run on reads to deconvolute human and mouse reads. Human-only reads are used in analysis.
+
There are two additional parameters that are human specific. They are:

Parameter| Default| Description
diff --git a/bin/help/rrbs.nf b/bin/help/rrbs.nf
index b00920ad..df0ab0b4 100644
--- a/bin/help/rrbs.nf
+++ b/bin/help/rrbs.nf
@@ -11,11 +11,14 @@ Parameter | Default | Description
--extension | .fastq.gz | The expected extension for the input read files.
--pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz
--read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE).
---concat_lanes | false | Options: false and true. Default: false. If this boolean is specific, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
+--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
+--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
+--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
--gen_org | mouse | Options: mouse and human.
+--genome_build | 'GRCm38' | Mouse specific. Options: GRCm38 or GRCm39. If gen_org == human, build defaults to GRCm38.

---non_direction | true | Options: true and false. Selecting this option for non-directional RRBS libraries will screen quality-trimmed sequences for CAA or CGA at the start of the read and, if found, removes the first two base pairs.
+--non_directional | true | Options: true and false. Selecting this option for non-directional RRBS libraries will screen quality-trimmed sequences for CAA or CGA at the start of the read and, if found, remove the first two base pairs.
--trimLength | 30 | Discard reads that became shorter than length 'INT' because of either quality or adapter trimming. A value of 0 effectively disables this behaviour.
--qualThreshold | 30 | Trim low-quality ends from reads in addition to adapter removal. For RRBS samples, quality trimming will be performed first, and adapter trimming is carried in a second round. Other files are quality and adapter trimmed in a single pass.
The algorithm is the same as the one used by BWA (Subtract INT from all qualities; compute partial sums from all indices to the end of the sequence; cut sequence at the index at which the sum is minimal).
diff --git a/bin/help/wes.nf b/bin/help/wes.nf
index ecf16134..56c191ef 100644
--- a/bin/help/wes.nf
+++ b/bin/help/wes.nf
@@ -11,19 +11,25 @@ Parameter | Default | Description
--extension | .fastq.gz | The expected extension for the input read files.
--pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz
--read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE).
---concat_lanes | false | Options: false and true. Default: false. If this boolean is specific, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
+--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
+--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
+--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
+
+--run_gvcf | false | Options: false and true. Default: false. If this boolean is specified, GVCF output will be generated.
--gen_org | mouse | Options: mouse and human.
+--genome_build | 'GRCm38' | Mouse specific. Options: GRCm38 or GRCm39. If gen_org == human, build defaults to GRCm38.
--ref_fa | Mouse: '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa'
         | Human: '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta'
         | The reference fasta to be used throughout the process for alignment as well as any downstream analysis, points to human reference when --gen_org human. JAX users should not change this parameter.
--ref_fa_indices | Mouse: '/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.toplevel.fa'
-         | Human: '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta.64'
+         | Human: '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta'
         | Pre-compiled BWA index files, points to human reference when --gen_org human. JAX users should not change this parameter.
---min_pct_hq_reads | 0.0 | The minimum percent of high-quality reads passing when trimming the fastq files.
+--min_pct_hq_reads | 0.0 | The minimum percent of high-quality reads passing when trimming the fastq files to continue with the analysis. 0.0 disables this filter.
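The BWA-style quality-trimming rule quoted above (subtract the quality cutoff from every base quality, form partial sums from each position to the end of the read, and cut at the position where that sum is minimal) can be written out directly. The sketch below is a literal Python rendering of that description for illustration only; it assumes numeric Phred scores and is not the pipeline's actual trimmer implementation.

    def quality_trim_index(quals, cutoff):
        """Return the index at which to cut the read; bases [0:index) are kept.

        Direct rendering of the description above: for each position i, take the
        partial sum of (q - cutoff) over bases i..end, and cut at the i where that
        sum is minimal. If no suffix sum is negative, nothing is trimmed.
        """
        best_index = len(quals)   # default: keep the whole read
        best_sum = 0
        running = 0
        for i in reversed(range(len(quals))):
            running += quals[i] - cutoff   # suffix sum of (q - cutoff) starting at i
            if running < best_sum:
                best_sum = running
                best_index = i
        return best_index

    # Example: a good read with a low-quality tail, cutoff Q30.
    quals = [38] * 20 + [10, 8, 12, 6]
    print(quality_trim_index(quals, 30))   # -> 20, i.e. the last four bases are trimmed
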
+--hq_pct | 70 | The percentage of bases within a read that must be high quality for the read to pass filtering.

--target_gatk | Mouse: '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.bed'
         | Human: '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed'
diff --git a/bin/help/wgs.nf b/bin/help/wgs.nf
index c187abc4..67efd429 100644
--- a/bin/help/wgs.nf
+++ b/bin/help/wgs.nf
@@ -11,23 +11,30 @@ Parameter | Default | Description
--extension | .fastq.gz | The expected extension for the input read files.
--pattern | '*_R{1,2}*' | The expected R1 / R2 matching pattern. The default value will match reads with names like this READ_NAME_R1_MoreText.fastq.gz or READ_NAME_R1.fastq.gz
--read_type | PE | Options: PE and SE. Default: PE. Type of reads: paired end (PE) or single end (SE).
---concat_lanes | false | Options: false and true. Default: false. If this boolean is specific, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
+--concat_lanes | false | Options: false and true. Default: false. If this boolean is specified, FASTQ files will be concatenated by sample. This option is used in cases where samples are divided across individual sequencing lanes.
+--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. Fastq_2 is optional and used only in PE data. Fastq files can either be absolute paths to local files, or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
+--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
+
+--run_gvcf | false | Options: false and true. Default: false. If this boolean is specified, GVCF output will be generated.
--gen_org | mouse | Options: mouse and human.
+--genome_build | 'GRCm38' | Mouse specific. Options: GRCm38 or GRCm39. If gen_org == human, build defaults to GRCm38.
--ref_fa | Mouse: '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa'
         | Human: '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta'
         | The reference fasta to be used throughout the process for alignment as well as any downstream analysis, points to human reference when --gen_org human. JAX users should not change this parameter.
--ref_fa_indices | Mouse: '/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.toplevel.fa'
-         | Human: '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta.64'
+         | Human: '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta'
         | Pre-compiled BWA index files, points to human reference when --gen_org human. JAX users should not change this parameter.
--chrom_contigs | Mouse: '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.primaryChr.contig_list'
         | Human: '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.primaryChr.contig_list'
         | A list of all chromosomes, unplaced, and unlocalized contigs present in the reference file, points to human reference when --gen_org human. Used to scatter variant calling by chromosome. JAX users should not change this parameter.
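On the two read-quality thresholds listed above (--hq_pct, the per-read percentage of high-quality bases, and --min_pct_hq_reads, the per-sample percentage of reads that must pass for the analysis to continue, with 0.0 disabling the check), a minimal Python sketch of that two-level filter is shown below. It assumes Phred+33 quality strings and a Q30 per-base cutoff purely for illustration; the pipeline's trimmer may use different cutoffs and bookkeeping.

    def read_is_high_quality(qual_string, hq_pct=70, base_q_cutoff=30):
        """True if at least hq_pct percent of bases meet the per-base cutoff."""
        if not qual_string:
            return False
        n_hq = sum(1 for c in qual_string if (ord(c) - 33) >= base_q_cutoff)
        return 100.0 * n_hq / len(qual_string) >= hq_pct

    def sample_passes(qual_strings, min_pct_hq_reads=0.0, hq_pct=70):
        """True if enough reads in the sample are high quality to continue."""
        if min_pct_hq_reads == 0.0:   # 0.0 disables the filter
            return True
        if not qual_strings:
            return False
        n_pass = sum(read_is_high_quality(q, hq_pct) for q in qual_strings)
        return 100.0 * n_pass / len(qual_strings) >= min_pct_hq_reads
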
---min_pct_hq_reads | 0.0 | The minimum percent of high-quality reads passing when trimming the fastq files.
+--min_pct_hq_reads | 0.0 | The minimum percent of high-quality reads passing when trimming the fastq files to continue with the analysis. 0.0 disables this filter.
+--hq_pct | 70 | The percentage of bases within a read that must be high quality for the read to pass filtering.
+
--mismatch_penalty | -B 8 | The BWA penalty for a mismatch.
--call_val | 50 | The minimum phred-scaled confidence threshold at which variants should be called.
--ploidy_val | '-ploidy 2' | Sample ploidy
diff --git a/bin/log/amplicon.nf b/bin/log/amplicon.nf
new file mode 100644
index 00000000..e792ab81
--- /dev/null
+++ b/bin/log/amplicon.nf
@@ -0,0 +1,69 @@
+import Logos
+
+logo = new Logo()
+println '\n'
+println logo.show()
+
+def param_log(){
+log.info """
+AMPLICON PARAMETER LOG
+
+--comment: ${params.comment}
+
+Results Published to: ${params.pubdir}
+______________________________________________________
+--workflow ${params.workflow}
+
+WORKFLOW NOT OFFICIALLY SUPPORTED AT THIS TIME.
+
+    // Shared params
+    gen_org = 'human'
+    extension='.fastq.gz'
+    pattern="*_R{1,2}*"
+    read_type = 'PE' // SE
+    sample_folder = null
+    concat_lanes = false
+    download_data = false
+    csv_input = null
+
+    multiqc_config = "${projectDir}/bin/shared/multiqc/amplicon_multiqc.yaml"
+
+    cutadaptMinLength = 20
+    cutadaptQualCutoff = 20
+    cutadaptAdapterR1 = 'CTGTCTCTTATACACATCTCCGAGCCCACGAGAC'
+    cutadaptAdapterR2 = 'CTGTCTCTTATACACATCTGACGCTGCCGACGA'
+
+
+    ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta'
+    ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta'
+    mismatch_penalty = "-B 8"
+
+    masterfile = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_masterfile.txt'
+
+    amplicon_primer_intervals = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_primers.interval_list'
+    amplicon_target_intervals = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_merged_targets.interval_list'
+
+    gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'
+    phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz'
+    dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz'
+
+    ploidy_val = '-ploidy 2' // variable in haplotypecaller. not required for amplicon, but present in module.
+ target_gatk = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_merged_targets.bed' + params.call_val = "50.0" + + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' + dbSNP_index = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' + + tmpdir = "/fastscratch/${USER}" + bwa_min_score = null + + + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" + +} diff --git a/bin/log/atac.nf b/bin/log/atac.nf index 31f22433..1b237b00 100644 --- a/bin/log/atac.nf +++ b/bin/log/atac.nf @@ -1,9 +1,13 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + def param_log(){ if (params.gen_org=='human') - log.info """ -______________________________________________________ - - ATAC PARAMETER LOG +log.info """ +ATAC PARAMETER LOG --comment: ${params.comment} @@ -11,11 +15,14 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --pattern ${params.pattern} --extension ${params.extension} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} -w ${workDir} -c ${params.config} --pubdir ${params.pubdir} @@ -30,13 +37,14 @@ ______________________________________________________ --tmpdir ${params.tmpdir} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ else log.info """ -______________________________________________________ - - ATAC PARAMETER LOG +ATAC PARAMETER LOG --comment: ${params.comment} @@ -44,11 +52,14 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --pattern ${params.pattern} --extension ${params.extension} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} -w ${workDir} -c ${params.config} --pubdir ${params.pubdir} @@ -65,6 +76,9 @@ ______________________________________________________ --tmpdir ${params.tmpdir} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ diff --git a/bin/log/chipseq.nf b/bin/log/chipseq.nf new file mode 100644 index 00000000..4f6bb717 --- /dev/null +++ b/bin/log/chipseq.nf @@ -0,0 +1,120 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + +def param_log(){ +if (params.gen_org=='human') + log.info """ +CHIPSEQ PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ +--workflow ${params.workflow} +--gen_org ${params.gen_org} +--genome_build ${params.genome_build} +--read_type ${params.read_type} +--input ${params.input} +-w ${workDir} +-c ${params.config} +--pubdir ${params.pubdir} +--organize_by ${params.organize_by} +--fragment_size ${params.fragment_size} +--fingerprint_bins ${params.fingerprint_bins} +--gtf ${params.gtf} +--gene_bed ${params.gene_bed} +--ref_fa ${params.ref_fa} 
+--ref_fa_indices ${params.ref_fa_indices} +--macs_gsize ${params.macs_gsize} +--blacklist ${params.blacklist} +--trimLength ${params.trimLength} +--qualThreshold ${params.qualThreshold} +--adapOverlap ${params.adapOverlap} +--adaptorSeq ${params.adaptorSeq} +--mismatch_penalty ${params.mismatch_penalty} +--bwa_min_score ${params.bwa_min_score} +--keep_dups ${params.keep_dups} +--keep_multi_map ${params.keep_multi_map} +--bamtools_filter_pe_config ${params.bamtools_filter_pe_config} +--bamtools_filter_se_config ${params.bamtools_filter_se_config} +--narrow_peak ${params.narrow_peak} +--broad_cutoff ${params.broad_cutoff} +--skip_preseq ${params.skip_preseq} +--skip_peak_qc ${params.skip_peak_qc} +--skip_peak_annotation ${params.skip_peak_annotation} +--skip_consensus_peaks ${params.skip_consensus_peaks} +--skip_diff_analysis ${params.skip_diff_analysis} +--deseq2_vst ${params.deseq2_vst} +--macs_fdr ${params.macs_fdr} +--macs_pvalue ${params.macs_pvalue} +--min_reps_consensus ${params.min_reps_consensus} +--save_macs_pileup ${params.save_macs_pileup} +--multiqc_config ${params.multiqc_config} +--tmpdir ${params.tmpdir} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" +else +log.info """ +CHIPSEQ PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ +--workflow ${params.workflow} +--gen_org ${params.gen_org} +--genome_build ${params.genome_build} +--read_type ${params.read_type} +--input ${params.input} +-w ${workDir} +-c ${params.config} +--pubdir ${params.pubdir} +--organize_by ${params.organize_by} +--fragment_size ${params.fragment_size} +--fingerprint_bins ${params.fingerprint_bins} +--gtf ${params.gtf} +--gene_bed ${params.gene_bed} +--ref_fa ${params.ref_fa} +--ref_fa_indices ${params.ref_fa_indices} +--macs_gsize ${params.macs_gsize} +--blacklist ${params.blacklist} +--trimLength ${params.trimLength} +--qualThreshold ${params.qualThreshold} +--adapOverlap ${params.adapOverlap} +--adaptorSeq ${params.adaptorSeq} +--mismatch_penalty ${params.mismatch_penalty} +--bwa_min_score ${params.bwa_min_score} +--keep_dups ${params.keep_dups} +--keep_multi_map ${params.keep_multi_map} +--bamtools_filter_pe_config ${params.bamtools_filter_pe_config} +--bamtools_filter_se_config ${params.bamtools_filter_se_config} +--narrow_peak ${params.narrow_peak} +--broad_cutoff ${params.broad_cutoff} +--skip_preseq ${params.skip_preseq} +--skip_peak_qc ${params.skip_peak_qc} +--skip_peak_annotation ${params.skip_peak_annotation} +--skip_consensus_peaks ${params.skip_consensus_peaks} +--skip_diff_analysis ${params.skip_diff_analysis} +--deseq2_vst ${params.deseq2_vst} +--macs_fdr ${params.macs_fdr} +--macs_pvalue ${params.macs_pvalue} +--min_reps_consensus ${params.min_reps_consensus} +--save_macs_pileup ${params.save_macs_pileup} +--multiqc_config ${params.multiqc_config} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" + +} diff --git a/bin/log/pdx_wes.nf b/bin/log/pdx_wes.nf new file mode 100644 index 00000000..24e7cb5d --- /dev/null +++ b/bin/log/pdx_wes.nf @@ -0,0 +1,55 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + +def param_log(){ +log.info """ +WES PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ 
+--workflow ${params.workflow} +--gen_org ${params.gen_org} +--read_type ${params.read_type} +--sample_folder ${params.sample_folder} +--pattern ${params.pattern} +--extension ${params.extension} +--concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} +-w ${workDir} +--keep_intermediate ${params.keep_intermediate} +-c ${params.config} +--pubdir ${params.pubdir} +--organize_by ${params.organize_by} +--xenome_index ${params.xenome_prefix} +--ref_fa ${params.ref_fa} +--ref_fa_indices ${params.ref_fa_indices} +--min_pct_hq_reads ${params.min_pct_hq_reads} +--dbSNP ${params.dbSNP} +--target_gatk ${params.target_gatk} +--target_picard ${params.target_picard} +--bait_picard ${params.bait_picard} +--snpEff_config ${params.snpEff_config} +--mismatch_penalty ${params.mismatch_penalty} +--call_val ${params.call_val} +--gen_ver ${params.gen_ver} +--gold_std_indels ${params.gold_std_indels} +--phase1_1000G ${params.phase1_1000G} +--dbNSFP ${params.dbNSFP} +--cosmic ${params.cosmic} +--snpEff_config ${params.snpEff_config} + + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" + +} diff --git a/bin/log/pta.nf b/bin/log/pta.nf new file mode 100644 index 00000000..5fd1f8f3 --- /dev/null +++ b/bin/log/pta.nf @@ -0,0 +1,82 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + +def param_log(){ +log.info """ +PTA PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ +--workflow ${params.workflow} +--csv_input ${params.csv_input} +--organize_by ${params.organize_by} +--pubdir ${params.pubdir} +-w ${workDir} +--keep_intermediate ${params.keep_intermediate} +-c ${params.config} +--gen_org ${params.gen_org} +--pdx ${params.pdx} +--read_type ${params.read_type} +--min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--xenome_prefix ${params.xenome_prefix} +--ref_fa ${params.ref_fa} +--ref_fa_indices ${params.ref_fa_indices} +--ref_fa_dict ${params.ref_fa_dict} +--combined_reference_set ${params.combined_reference_set} +--mismatch_penalty ${params.mismatch_penalty} +--gold_std_indels ${params.gold_std_indels} +--phase1_1000G ${params.phase1_1000G} +--dbSNP ${params.dbSNP} +--dbSNP_index ${params.dbSNP_index} +--chrom_contigs ${params.chrom_contigs} +--chrom_intervals ${params.chrom_intervals} +--call_val ${params.call_val} +--ploidy_val ${params.ploidy_val} +--excludeIntervalList ${params.excludeIntervalList} +--hapmap ${params.hapmap} +--omni ${params.omni} +--pon_bed ${params.pon_bed} +--intervalListBed ${params.intervalListBed} +--lancet_beds_directory ${params.lancet_beds_directory} +--mappability_directory ${params.mappability_directory} +--bicseq2_chromList ${params.bicseq2_chromList} +--bicseq2_no_scaling ${params.bicseq2_no_scaling} +--germline_filtering_vcf ${params.germline_filtering_vcf} +--gripss_pon ${params.gripss_pon} +--callRegions ${params.callRegions} +--strelka_config ${params.strelka_config} +--msisensor_model ${params.msisensor_model} +--vep_cache_directory ${params.vep_cache_directory} +--vep_fasta ${params.vep_fasta} +--cosmic_cgc ${params.cosmic_cgc} +--cosmic_cancer_resistance_muts ${params.cosmic_cancer_resistance_muts} +--ensembl_entrez ${params.ensembl_entrez} +--cytoband ${params.cytoband} +--dgv ${params.dgv} +--thousandG ${params.thousandG} +--cosmicUniqueBed ${params.cosmicUniqueBed} +--cancerCensusBed 
${params.cancerCensusBed} +--ensemblUniqueBed ${params.ensemblUniqueBed} +--gap ${params.gap} +--dgvBedpe ${params.dgvBedpe} +--thousandGVcf ${params.thousandGVcf} +--svPon ${params.svPon} +--cosmicBedPe ${params.cosmicBedPe} +--na12878_bam ${params.na12878_bam} +--na12878_bai ${params.na12878_bai} +--na12878_sampleName ${params.na12878_sampleName} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" + +} diff --git a/bin/log/rna_fusion.nf b/bin/log/rna_fusion.nf new file mode 100644 index 00000000..fbe1cfb5 --- /dev/null +++ b/bin/log/rna_fusion.nf @@ -0,0 +1,60 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + +def param_log(){ +log.info """ +RNA FUSION PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ +--workflow ${params.workflow} +--gen_org ${params.gen_org} +--read_type ${params.read_type} +--sample_folder ${params.sample_folder} +--extension ${params.extension} +--pattern ${params.pattern} +--concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} +--organize_by ${params.organize_by} +--pubdir ${params.pubdir} +-w ${workDir} +--keep_intermediate ${params.keep_intermediate} +-c ${params.config} +--multiqc_config ${params.multiqc_config} +--xenome_prefix ${params.xenome_prefix} +--read_length ${params.read_length} +--star_index ${params.star_index} +--star_fusion_star_index ${params.star_fusion_star_index} +--gencode_gtf ${params.gencode_gtf} +--ensembl_gtf ${params.ensembl_gtf} +--fasta ${params.fasta} +--arriba_star_args ${params.arriba_star_args} +--arriba_blacklist ${params.arriba_blacklist} +--arriba_known_fusions ${params.arriba_known_fusions} +--arriba_protein_domains ${params.arriba_protein_domains} +--fusioncatcher_ref ${params.fusioncatcher_ref} +--fusioncatcher_limitSjdbInsertNsj ${params.fusioncatcher_limitSjdbInsertNsj} +--jaffa_ref_dir ${params.jaffa_ref_dir} +--kallisto_index ${params.kallisto_index} +--transcript_fasta ${params.transcript_fasta} +--squid_star_args ${params.squid_star_args} +--star_fusion_ref ${params.star_fusion_ref} +--star_fusion_opt ${params.star_fusion_opt} +--fusion_report_opt ${params.fusion_report_opt} +--databases ${params.databases} +--pdx ${params.pdx} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" + +} diff --git a/bin/log/rnaseq.nf b/bin/log/rnaseq.nf index ebbd44b7..a87ba3d4 100644 --- a/bin/log/rnaseq.nf +++ b/bin/log/rnaseq.nf @@ -1,9 +1,134 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + def param_log(){ -if (params.gen_org=='human' && params.rsem_aligner=='bowtie2') - log.info """ + +if (params.rsem_aligner != "bowtie2" && params.rsem_aligner != "star") { + error "'--rsem_aligner': \"${params.rsem_aligner}\" is not valid, supported options are 'bowtie2' or 'star'" +} + +if (params.gen_org != "mouse" && params.gen_org != "human") { + error "'--gen_org': \"${params.gen_org}\" is not valid, supported options are 'mouse' or 'human'" +} + +if (params.pdx && params.rsem_aligner=='bowtie2') +log.info """ +RNASEQ PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ +--workflow ${params.workflow} +--gen_org ${params.gen_org} +--genome_build ${params.genome_build} 
+--read_type ${params.read_type} +--sample_folder ${params.sample_folder} +--extension ${params.extension} +--pattern ${params.pattern} +--concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} +--organize_by ${params.organize_by} +--pubdir ${params.pubdir} +-w ${workDir} +--keep_intermediate ${params.keep_intermediate} +-c ${params.config} +--min_pct_hq_reads ${params.min_pct_hq_reads} +--seed_length ${params.seed_length} + +--pdx ${params.pdx} +--xenome_prefix ${params.xenome_prefix} + +--strandedness_ref ${params.strandedness_ref} +--strandedness_gtf ${params.strandedness_gtf} + +--rsem_aligner ${params.rsem_aligner} + +Human specific files: +--rsem_ref_prefix_human ${params.rsem_ref_prefix_human} +--rsem_ref_files_human ${params.rsem_ref_files_human} +--picard_dict_human ${params.picard_dict_human} +--ref_flat_human ${params.ref_flat_human} +--ribo_intervals_human ${params.ribo_intervals_human} + +Mouse specific files: +--rsem_ref_prefix_mouse ${params.rsem_ref_prefix_mouse} +--rsem_ref_files_mouse ${params.rsem_ref_files_mouse} +--picard_dict_mouse ${params.picard_dict_mouse} +--ref_flat_mouse ${params.ref_flat_mouse} +--ribo_intervals_mouse ${params.ribo_intervals_mouse} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ - RNASEQ PARAMETER LOG +""" +else if (params.pdx && params.rsem_aligner=='star') +log.info """ +RNASEQ PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ +--workflow ${params.workflow} +--gen_org ${params.gen_org} +--genome_build ${params.genome_build} +--read_type ${params.read_type} +--sample_folder ${params.sample_folder} +--extension ${params.extension} +--pattern ${params.pattern} +--concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} +--organize_by ${params.organize_by} +--pubdir ${params.pubdir} +-w ${workDir} +--keep_intermediate ${params.keep_intermediate} +-c ${params.config} +--min_pct_hq_reads ${params.min_pct_hq_reads} +--seed_length ${params.seed_length} + +--pdx ${params.pdx} +--xenome_prefix ${params.xenome_prefix} + +--strandedness_ref ${params.strandedness_ref} +--strandedness_gtf ${params.strandedness_gtf} + +--rsem_aligner ${params.rsem_aligner} + +Human specific files: +--rsem_ref_prefix_human ${params.rsem_ref_prefix_human} +--rsem_ref_files_human ${params.rsem_ref_files_human} +--rsem_star_prefix_human ${params.rsem_star_prefix_human} +--picard_dict_human ${params.picard_dict_human} +--ref_flat_human ${params.ref_flat_human} +--ribo_intervals_human ${params.ribo_intervals_human} + +Mouse specific files: +--rsem_ref_prefix_mouse ${params.rsem_ref_prefix_mouse} +--rsem_ref_files_mouse ${params.rsem_ref_files_mouse} +--rsem_star_prefix_mouse ${params.rsem_star_prefix_mouse} +--picard_dict_mouse ${params.picard_dict_mouse} +--ref_flat_mouse ${params.ref_flat_mouse} +--ribo_intervals_mouse ${params.ribo_intervals_mouse} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ + +""" +else if (params.gen_org=='human' && params.rsem_aligner=='bowtie2') +log.info """ +RNASEQ PARAMETER LOG --comment: ${params.comment} @@ -11,20 +136,23 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org 
${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --extension ${params.extension} --pattern ${params.pattern} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} --organize_by ${params.organize_by} --pubdir ${params.pubdir} -w ${workDir} --keep_intermediate ${params.keep_intermediate} -c ${params.config} ---read_prep ${params.read_prep} ---ref_fa ${params.ref_fa} ---ref_fai ${params.ref_fai} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--strandedness_ref ${params.strandedness_ref} +--strandedness_gtf ${params.strandedness_gtf} --seed_length ${params.seed_length} --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} @@ -34,14 +162,15 @@ ______________________________________________________ --ribo_intervals ${params.ribo_intervals} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ else if (params.gen_org=='human' && params.rsem_aligner=='star') - log.info """ -______________________________________________________ - - RNASEQ PARAMETER LOG +log.info """ +RNASEQ PARAMETER LOG --comment: ${params.comment} @@ -49,20 +178,23 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --extension ${params.extension} --pattern ${params.pattern} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} --organize_by ${params.organize_by} --pubdir ${params.pubdir} -w ${workDir} --keep_intermediate ${params.keep_intermediate} -c ${params.config} ---read_prep ${params.read_prep} ---ref_fa ${params.ref_fa} ---ref_fai ${params.ref_fai} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--strandedness_ref ${params.strandedness_ref} +--strandedness_gtf ${params.strandedness_gtf} --seed_length ${params.seed_length} --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} @@ -73,14 +205,15 @@ ______________________________________________________ --ribo_intervals ${params.ribo_intervals} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ else if (params.gen_org=='mouse' && params.rsem_aligner=='bowtie2') - log.info """ -______________________________________________________ - - RNASEQ PARAMETER LOG +log.info """ +RNASEQ PARAMETER LOG --comment: ${params.comment} @@ -88,19 +221,23 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --extension ${params.extension} --pattern ${params.pattern} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} --pubdir ${params.pubdir} --organize_by ${params.organize_by} -w ${workDir} --keep_intermediate ${params.keep_intermediate} -c ${params.config} ---read_prep ${params.read_prep} ---ref_fa ${params.ref_fa} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--strandedness_ref ${params.strandedness_ref} 
+--strandedness_gtf ${params.strandedness_gtf} --seed_length ${params.seed_length} --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} @@ -108,14 +245,15 @@ ______________________________________________________ --picard_dict ${params.picard_dict} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ else if (params.gen_org=='mouse' && params.rsem_aligner=='star') - log.info """ -______________________________________________________ - - RNASEQ PARAMETER LOG +log.info """ +RNASEQ PARAMETER LOG --comment: ${params.comment} @@ -123,19 +261,23 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --extension ${params.extension} --pattern ${params.pattern} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} --pubdir ${params.pubdir} --organize_by ${params.organize_by} -w ${workDir} --keep_intermediate ${params.keep_intermediate} -c ${params.config} ---read_prep ${params.read_prep} ---ref_fa ${params.ref_fa} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--strandedness_ref ${params.strandedness_ref} +--strandedness_gtf ${params.strandedness_gtf} --seed_length ${params.seed_length} --rsem_ref_prefix ${params.rsem_ref_prefix} --rsem_ref_files ${params.rsem_ref_files} @@ -144,9 +286,12 @@ ______________________________________________________ --picard_dict ${params.picard_dict} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ -else error "invalid parameters in ${params.gen_org} and/or ${params.rsem_aligner}" +else error "Invalid parameters in '--gen_org': ${params.gen_org} and/or in '--rsem_aligner': ${params.rsem_aligner}. Supported options are 'mouse' or 'human' and 'bowtie2' or 'star'." 
} diff --git a/bin/log/rrbs.nf b/bin/log/rrbs.nf index 899a4c36..8ced2ebc 100644 --- a/bin/log/rrbs.nf +++ b/bin/log/rrbs.nf @@ -1,9 +1,13 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + def param_log(){ if (params.gen_org=='human') - log.info """ -______________________________________________________ - - RRBS PARAMETER LOG +log.info """ +RRBS PARAMETER LOG --comment: ${params.comment} @@ -11,11 +15,14 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --extension ${params.extension} --pattern ${params.pattern} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} --organize_by ${params.organize_by} --pubdir ${params.pubdir} -w ${workDir} @@ -37,13 +44,14 @@ ______________________________________________________ --comprehensive ${params.comprehensive} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ else log.info """ -______________________________________________________ - - RRBS PARAMETER LOG +RRBS PARAMETER LOG --comment: ${params.comment} @@ -51,11 +59,14 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --extension ${params.extension} --pattern ${params.pattern} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} --pubdir ${params.pubdir} --organize_by ${params.organize_by} -w ${workDir} @@ -77,6 +88,9 @@ ______________________________________________________ --comprehensive ${params.comprehensive} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ diff --git a/bin/log/wes.nf b/bin/log/wes.nf index ef4cc88b..6fe2dc75 100644 --- a/bin/log/wes.nf +++ b/bin/log/wes.nf @@ -1,9 +1,13 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + def param_log(){ if (params.gen_org=='human') - log.info """ -______________________________________________________ - - WES PARAMETER LOG +log.info """ +WES PARAMETER LOG --comment: ${params.comment} @@ -11,11 +15,14 @@ Results Published to: ${params.pubdir} ______________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --pattern ${params.pattern} --extension ${params.extension} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} -w ${workDir} --keep_intermediate ${params.keep_intermediate} -c ${params.config} @@ -24,6 +31,8 @@ ______________________________________________________ --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--run_gvcf ${params.run_gvcf} --dbSNP ${params.dbSNP} --target_gatk ${params.target_gatk} --target_picard ${params.target_picard} @@ -41,13 +50,14 @@ ______________________________________________________ Project Directory: ${projectDir} + +Command line call: 
+${workflow.commandLine} ______________________________________________________ """ else log.info """ -______________________________________________________ - - WES PARAMETER LOG +WES PARAMETER LOG --comment: ${params.comment} @@ -60,6 +70,8 @@ ______________________________________________________ --pattern ${params.pattern} --extension ${params.extension} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} -w ${workDir} --keep_intermediate ${params.keep_intermediate} -c ${params.config} @@ -68,6 +80,8 @@ ______________________________________________________ --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--run_gvcf ${params.run_gvcf} --dbSNP ${params.dbSNP} --target_gatk ${params.target_gatk} --target_picard ${params.target_picard} @@ -78,6 +92,9 @@ ______________________________________________________ --ploidy_val ${params.ploidy_val} Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} ______________________________________________________ """ diff --git a/bin/log/wgs.nf b/bin/log/wgs.nf index 0b39a499..450c60cb 100644 --- a/bin/log/wgs.nf +++ b/bin/log/wgs.nf @@ -1,22 +1,29 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + def param_log(){ if (params.gen_org=='human') - log.info """ -______________________________________________________ - - WGS PARAMETER LOG +log.info """ +WGS PARAMETER LOG --comment: ${params.comment} Results Published to: ${params.pubdir} -______________________________________________________ +________________________________________________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --gen_ver ${params.gen_ver} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --pattern ${params.pattern} --extension ${params.extension} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} -w ${workDir} -c ${params.config} --pubdir ${params.pubdir} @@ -24,6 +31,8 @@ ______________________________________________________ --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--run_gvcf ${params.run_gvcf} --dbSNP ${params.dbSNP} --snpEff_config ${params.snpEff_config} --mismatch_penalty ${params.mismatch_penalty} @@ -37,26 +46,30 @@ ______________________________________________________ Project Directory: ${projectDir} -______________________________________________________ + +Command line call: +${workflow.commandLine} +________________________________________________________________________________________ """ else log.info """ -______________________________________________________ - - WGS PARAMETER LOG +WGS PARAMETER LOG --comment: ${params.comment} Results Published to: ${params.pubdir} -______________________________________________________ +________________________________________________________________________________________ --workflow ${params.workflow} --gen_org ${params.gen_org} +--genome_build ${params.genome_build} --gen_ver ${params.gen_ver} --read_type ${params.read_type} --sample_folder ${params.sample_folder} --pattern ${params.pattern} --extension ${params.extension} --concat_lanes ${params.concat_lanes} +--csv_input ${params.csv_input} +--download_data ${params.download_data} -w ${workDir} -c 
${params.config} --pubdir ${params.pubdir} @@ -64,6 +77,8 @@ ______________________________________________________ --ref_fa ${params.ref_fa} --ref_fa_indices ${params.ref_fa_indices} --min_pct_hq_reads ${params.min_pct_hq_reads} +--hq_pct ${params.hq_pct} +--run_gvcf ${params.run_gvcf} --dbSNP ${params.dbSNP} --snpEff_config ${params.snpEff_config} --mismatch_penalty ${params.mismatch_penalty} @@ -71,7 +86,10 @@ ______________________________________________________ --ploidy_val ${params.ploidy_val} Project Directory: ${projectDir} -______________________________________________________ + +Command line call: +${workflow.commandLine} +________________________________________________________________________________________ """ } diff --git a/bin/pta/SNVsToMNVs_CountsBasedFilter_AnnotateHighConf.py b/bin/pta/SNVsToMNVs_CountsBasedFilter_AnnotateHighConf.py new file mode 100644 index 00000000..0e4429a6 --- /dev/null +++ b/bin/pta/SNVsToMNVs_CountsBasedFilter_AnnotateHighConf.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python + +import argparse +import sys +import os +import re + +class ArgumentParser(argparse.ArgumentParser): + def error(self, message): + self.print_help(sys.stderr) + self.exit(2, '\nERROR: %s\n\n' % (message)) + +class Variant: + def __init__(self,chrom,pos,identity,ref,alt,qual,filter_value): + self.chrom=chrom + self.pos=pos + self.identity=identity + self.ref=ref + self.alt=alt + self.qual=qual + self.filter=filter_value + self.info_dict=dict() + self.info_text="." + self.tumor=dict() + self.normal=dict() + + def print_variant(self): + sorted_format_keys=sorted(self.tumor.keys()) + tumor_format=[] + normal_format=[] + for field in sorted_format_keys: + if field in self.tumor: + tumor_format.append(str(self.tumor[field])) + else: + tumor_format.append(".") + if field in self.normal: + normal_format.append(str(self.normal[field])) + else: + normal_format.append(".") + self.tumor_format_text=":".join(tumor_format) + self.normal_format_text=":".join(normal_format) + self.format_keys_text = ":".join(sorted_format_keys) + if len(self.info_dict.keys())>0: + info_text_list=[] + for key in sorted(self.info_dict.keys()): + info_text_list.append('{0}={1}'.format(key,self.info_dict[key])) + self.info_text=";".join(info_text_list) + return "\t".join([self.chrom, self.pos, self.identity, self.ref, self.alt, self.qual, self.filter, self.info_text, self.format_keys_text, self.normal_format_text, self.tumor_format_text]) + + +def assemble_header(header, header_keys=["fileformat","FILTER","FORMAT","INFO","contig","cmdline","col_headers"]): + header_text="" + for k in header_keys: + if k in header: + if k in ["FILTER","FORMAT","INFO"]: + header_text=header_text+"\n".join(sorted(set(header[k])))+"\n" + else: + header_text=header_text+"\n".join(header[k])+"\n" + return header_text + +def __main__(): + parser = ArgumentParser(prog=os.path.basename(sys.argv[0]), description='Parses file and converts adjacent SNVs to MNVs if they have they match the MNV_ID and called_by fields.', epilog='', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=100, width=150)) + parser.add_argument('-i', '--input_vcf', help = 'Input VCF file.', required=True) + parser.add_argument('-o', '--output_vcf', help = 'Output VCF file.', required=True) + parser.add_argument('--min_tumor_vaf', help='Minimum VAF in tumor sample', type=float,default=0.0001) + parser.add_argument('--max_normal_vaf', help='Maximum VAF in normal sample', type=float,default=0.2) + 
parser.add_argument('--min_tumor_dp', help='Minimum depth in tumor sample', type=int,default=2) + parser.add_argument('--min_normal_dp', help='Minimum depth in normal sample', type=int,default=2) + args=parser.parse_args() + INPUT=args.input_vcf + OUTPUT=args.output_vcf + MIN_T_VAF=args.min_tumor_vaf + MAX_N_VAF=args.max_normal_vaf + MIN_T_DP=args.min_tumor_dp + MIN_N_DP=args.min_normal_dp + if not os.path.isfile(INPUT): + print("ERROR: Required file {0} does not exist. Cannot run.".format(INPUT)) + sys.exit(1) + f=open(INPUT) + content=f.readlines() + o=open(OUTPUT,"w") + info_pattern=re.compile('(\w+)\=([^;]+)') + MNV=dict() + SNV=dict() + mnv_filter=dict() + uac_filter=dict() + add_supported_by=dict() + header=dict() + + ## Parse input VCF and identify SNVs to merge + ## Expects sorted VCF + + seen_filter_header=0 + seen_info_header=0 + for line in content: + line=line.strip() + # parse metadata + if line.startswith("##"): + meta_pattern = re.compile(r'''##(?P.+?)=(?P.+)''') + m=meta_pattern.match(line) + keyid=m.group('key') + if keyid not in header: + header[keyid]=[] + header[keyid].append(line) + # parse chrom header + elif line.startswith("#CHROM"): + header['col_headers']=[line] + # parse records + else: + (CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR)=line.strip().split("\t") + info_pattern=re.compile('(\w+)\=([^;]+)') + info_dict=dict(info_pattern.findall(INFO)) + normal_format_dict=dict(zip(FORMAT.split(":"),NORMAL.split(":"))) + tumor_format_dict=dict(zip(FORMAT.split(":"),TUMOR.split(":"))) + varID=":".join([CHROM,POS,REF,ALT]) + ###### Allele count specific filters ####### + if 'AF' in tumor_format_dict \ + and 'AF' in normal_format_dict \ + and 'DP' in tumor_format_dict \ + and 'DP' in normal_format_dict \ + and tumor_format_dict['AF'] != "." \ + and normal_format_dict['AF'] != "." \ + and tumor_format_dict['DP'] != "." \ + and normal_format_dict['DP'] != ".": + if float(tumor_format_dict['AF']) < MIN_T_VAF: + uac_filter[varID]=uac_filter[varID]+";LowTumorVAF" if varID in uac_filter else "LowTumorVAF" + if float(tumor_format_dict['AF'])>0 \ + and (float(normal_format_dict['AF']) > MAX_N_VAF \ + or float(normal_format_dict['AF'])>=float(tumor_format_dict['AF'])): + uac_filter[varID]=uac_filter[varID]+";HighNormalVAF" if varID in uac_filter else "HighNormalVAF" + if int(tumor_format_dict['DP']) < MIN_T_DP: + uac_filter[varID]=uac_filter[varID]+";LowTumorDP" if varID in uac_filter else "LowTumorDP" + if int(normal_format_dict['DP']) < MIN_N_DP: + uac_filter[varID]=uac_filter[varID]+";LowNormalDP" if varID in uac_filter else "LowNormalDP" + if varID in uac_filter: + print(varID, uac_filter[varID], tumor_format_dict['DP'], + normal_format_dict['DP'], tumor_format_dict['AF'], + normal_format_dict['AF']) + ### check if variant is an MNV or part of MNV ## + if "MNV_ID" in info_dict \ + and "TYPE" in info_dict: + #varID=":".join([CHROM,POS,REF,ALT]) + # type may not be MNV if an SNV also existed in the merge + if info_dict["TYPE"]=="MNV": + MNV[varID]=info_dict + MNV[varID]["SNVs"]=[] + for i in range(len(REF)): + ref=REF[i] + alt=ALT[i] + if not ref == alt: + pos=int(POS)+i + MNV[varID]["SNVs"].append(":".join([CHROM,str(pos),ref,alt])) + elif info_dict["TYPE"]=="SNV": + SNV[varID]=info_dict + # for every varID that goes to type MNV + for varID in MNV.keys(): + called_by=[] # list callers for SNV supporting this MNV + supported_by=[] # list support callers for SNV supporting this MNV + MNV_ID=[] # list MNV_IDs for this group? 
+# keep_mnv=1 + # for every varID that goes to a SNV supporting this MNV + for snvID in MNV[varID]["SNVs"]: + if "called_by" in SNV[snvID]: + called_by.append(SNV[snvID]["called_by"]) + if "supported_by" in SNV[snvID]: + supported_by.append(SNV[snvID]["supported_by"]) + MNV_ID.append(SNV[snvID]["MNV_ID"]) + # Split into SNVs if different callers are calling some of the site + # note each *ed_by is a string which can become a comma separated list + # order is conserved in the list + if len(set(called_by)) > 1 \ + or len(set(supported_by)) > 1 \ + or len(set(MNV_ID)) > 1 \ + or set(MNV_ID) != set([MNV[varID]["MNV_ID"]]): + mnv_filter[varID]="SplitToSNVs" + else: + # keep MNV together because no lone evidence supporting + # any one individual SNVs was reported + for snvID in MNV[varID]["SNVs"]: + mnv_filter[snvID]="PartOfMNV" + # if the MNV has a called_by update it with support from anything else + if "called_by" in MNV[varID]: + snv_callers = called_by[0].split(",") + mnv_callers=MNV[varID]["called_by"].split(",") # check who called the MNV + if len(set(mnv_callers)) < len(set(snv_callers)): # check for new callers + additional_snv_callers = list(set(mnv_callers)^set(snv_callers)) + add_supported_by[varID] = ",".join([s + "_SNVs" for s in additional_snv_callers]) + + header["FILTER"].append('##FILTER=') + header["FILTER"].append('##FILTER=') + header["FILTER"].append('##FILTER='.format(MIN_T_VAF)) + header["FILTER"].append('##FILTER='.format(MIN_T_DP)) + header["FILTER"].append('##FILTER='.format(MIN_N_DP)) + header["FILTER"].append('##FILTER='.format(MAX_N_VAF)) + header["INFO"].append('##INFO=') + o.write(assemble_header(header)) + + ## Read VCF for the second time ## + for line in content: + if line.startswith("#"): + continue + else: + (CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR) = line.strip().split("\t") + info_pattern=re.compile('(\w+)\=([^;]+)') + info_dict=dict(info_pattern.findall(INFO)) + varID=":".join([CHROM,POS,REF,ALT]) + if varID in mnv_filter: + if FILTER != "PASS": + FILTER=FILTER+";"+mnv_filter[varID] + else: + FILTER=mnv_filter[varID] + if varID in uac_filter: + if FILTER != "PASS": + FILTER=FILTER+";"+uac_filter[varID] + else: + FILTER=uac_filter[varID] + if varID in add_supported_by: + if "supported_by" in info_dict: + info_dict["supported_by"] = info_dict["supported_by"]+","+add_supported_by[varID] + else: + info_dict["supported_by"] = add_supported_by[varID] + info_text_list=[] + for key in sorted(info_dict.keys()): + info_text_list.append('{0}={1}'.format(key,info_dict[key])) + INFO=";".join(info_text_list) + if "num_callers" in info_dict \ + and FILTER=="PASS": + num_callers=int(info_dict["num_callers"]) + if num_callers>1 or "supported_by" in info_dict: + INFO=INFO+";HighConfidence" + o.write("\t".join([CHROM,POS,ID,REF, + ALT,QUAL,FILTER, + INFO,FORMAT,NORMAL, + TUMOR])+"\n") + o.close() + f.close() + +if __name__ == "__main__": + __main__() + \ No newline at end of file diff --git a/bin/pta/add_cancer_gene_census.py b/bin/pta/add_cancer_gene_census.py new file mode 100644 index 00000000..f4c6e416 --- /dev/null +++ b/bin/pta/add_cancer_gene_census.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python +# USAGE: python cancer_gene_census.py cancer_gene_census.csv VCF VCF_OUT +# DESCRIPTION: Annotates files by adding information about the +# Cosmic Genome Census entry for the gene symbol. 
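For context on the VEP annotation handling in this script (the get_csq_columns and get_csqs helpers that follow, and the same pattern in add_cancer_resistance_mutations.py further down): the CSQ INFO field packs one pipe-separated annotation string per ALT allele, and the column names for those values come from the last whitespace-separated token of the CSQ header description. A toy Python illustration, with a made-up header description and record value rather than data from this pipeline, looks like:

    # Toy example of CSQ parsing; the header text and record value are made up.
    csq_description = "Consequence annotations from Ensembl VEP. Format: Allele|Consequence|SYMBOL|Gene"
    csq_columns = csq_description.split()[-1].split("|")   # -> ['Allele', 'Consequence', 'SYMBOL', 'Gene']

    csq_value = "A|missense_variant|TP53|ENSG00000141510"   # one such string per ALT allele
    csq_dict = dict(zip(csq_columns, csq_value.split("|")))
    print(csq_dict["SYMBOL"])   # -> TP53
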
+################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + +import sys +import os +import logging as log +import pandas as pd +import pysam +import numpy as np +########################################################################## +############## Custom functions ############ +########################################################################## + + +def get_record(gene, key, cancer_gene_census): + ''' + Get value for field from Cancer Gene Census records. + Ignores gene that are associated with multiple loci (e.g T-cell + receptor (TR*) and immunoglobin (IG*) genes) + ''' + if gene in ['C15orf65', 'CTNNA2', 'HMGN2P46', 'IGH', 'IGK', + 'IGL', 'KAT6A', 'TRA', 'TRB', 'TRD']: + value = '' + else: + try: + value = cancer_gene_census[(cancer_gene_census['Gene Symbol'] == gene)][key].values[0] + except (KeyError, IndexError): + value = '' + return value + + +def convert_numpy(x): + ''' + Convert Numpy nan to blank + ''' + if isinstance(x, float): + if np.isnan(x): + return '' + return str(x) + + +def get_csq_columns(bcf_in): + ''' + get column names from the bar + separated CSQ VEP annotation + results. CSQ are Consequence + annotations from Ensembl VEP. + ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. + ''' + alt_count = len(record.alts) + csq_dicts = {} + spanning_deletion_offset = 0 + for i in range(alt_count): + j = i - spanning_deletion_offset + if record.alts[j] == '*': + csq_dicts[i] = {csq_column : '' for csq_column in csq_columns} + spanning_deletion_offset += 1 + else: + try: + csq_line = record.info['CSQ'][j] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[j] + csq_values = csq_line.split('|') + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def get_CancerGeneCensus(record, csq_columns, cancer_gene_census): + ''' + Get new INFO field results. 
+ ''' +# Example results: +# ('Mutation Types', 'A') +# ('Mutation Types', 'A, O, Mis') +# ('Mutation Types', 'F, Mis') +# ('Mutation Types', 'F; Mis') +# ('Mutation Types', 'Promoter Mis') +# grab the gene symbol from the first annotation ALT + CancerGeneCensus_lists = [] + csq_dicts = get_csqs(record, csq_columns) + for i, alt in enumerate(record.alts): + CancerGeneCensus_list = [] + for key in ['Tier', 'Hallmark', 'Somatic', 'Germline', + 'Tumour Types(Somatic)', 'Tumour Types(Germline)', + 'Cancer Syndrome', 'Tissue Type', 'Molecular Genetics', + 'Role in Cancer', 'Mutation Types']: + result = get_record(csq_dicts[i]['SYMBOL'], key, cancer_gene_census) + if key == 'Hallmark': + if result == 'Yes': + result = 'https://cancer.sanger.ac.uk/cosmic/census-page/' + csq_dicts[i]['SYMBOL'] + else: + result = '' + result = convert_numpy(result).replace(', ', ',') + result = result.replace('; ', ',') # database has highly variable entries + result = result.strip() # remove leading or trailing whitespace + result = result.replace(' ', '_') + CancerGeneCensus_list.append(result) + CancerGeneCensus = ('|').join(CancerGeneCensus_list) + CancerGeneCensus_lists.append(CancerGeneCensus) + CancerGeneCensus_line = ','.join(CancerGeneCensus_lists) + return CancerGeneCensus_line + + +def modify_header(bcf_in): + ''' + Add new INFO field + ''' + bcf_in.header.info.add(id='CancerGeneCensus', number='.', + type='String', + description='Consequence annotations from Cancer Gene Census. Format: Tier|Hallmark|Somatic|Germline|Tumour Types(Somatic)|Tumour Types(Germline)|Cancer Syndrome|Tissue Type|Molecular Genetics|Role in Cancer|Mutation Types' + ) + return bcf_in + + +def modify_record(record, cancer_gene_census, csq_columns): + ''' + Add new INFO field to each record + ''' + record.info['CancerGeneCensus'] = get_CancerGeneCensus(record, csq_columns, + cancer_gene_census) + return record + + +def get_census(cancer_gene_census_file): + ''' + Read in the Cancer Gene Census file + ''' + cancer_gene_census = pd.read_csv(cancer_gene_census_file) + return cancer_gene_census + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. 
+ ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def write_vcf(bcf_in, vcf_out_file, cancer_gene_census, csq_columns): + ''' + Write out the download + ''' + bcf_out = pysam.VariantFile(vcf_out_file, 'w', header=bcf_in.header) + for record in bcf_in.fetch(): + record = modify_record(record, cancer_gene_census, csq_columns) + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + +def prep_resitance_mutations(): + ''' + Convert to new record from tsv file + ''' + + +def main(): + ''' + Annotates files by adding information about the + Cosmic Genome Census entry for the gene symbol + ''' + cancer_gene_census_file = sys.argv[1] + vcf_file = sys.argv[2] + vcf_out_file = sys.argv[3] + cancer_gene_census = get_census(cancer_gene_census_file) + bcf_in = read_vcf(vcf_file) + bcf_in = modify_header(bcf_in) + csq_columns = get_csq_columns(bcf_in) + write_vcf(bcf_in, vcf_out_file, cancer_gene_census, csq_columns) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/add_cancer_resistance_mutations.py b/bin/pta/add_cancer_resistance_mutations.py new file mode 100644 index 00000000..3572282d --- /dev/null +++ b/bin/pta/add_cancer_resistance_mutations.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python + # USAGE: python add_cancer_resistance_mutations.py cosmic_resistance_file genome vcf_file out +# DESCRIPTION: Annotates from Cosmic DB of resistance mutations. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + +import sys +import os +import logging as log +import pysam +import pandas as pd +import re +########################################################################## +############## Custom functions ############ +########################################################################## + + +def modify_header(bcf_in): + ''' + Add new INFO field + ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier'] + ''' + bcf_in.header.info.add(id='CosmicResistanceMutation', number='.', + type='String', + description='Consequence annotations from Cancer Gene Census. Format: MUTATION_ID|GENOMIC_MUTATION_ID|Drug Name|Tier') + return bcf_in + + + +def get_csq_columns(bcf_in): + ''' + get column names from the bar + separated CSQ VEP annotation + results. CSQ are Consequence + annotations from Ensembl VEP. + ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. 
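Both annotators follow the same pysam pattern: open the input VCF, register the new INFO definition on its header, then stream records into an output VariantFile built from that header. A stripped-down sketch of that loop; the paths and the INFO key are placeholders, not the scripts' actual arguments:

import pysam

# Placeholder file names and INFO key, for illustration only.
bcf_in = pysam.VariantFile('input.vcf.gz')
bcf_in.header.info.add(id='MyAnnotation', number='.', type='String',
                       description='Example per-record annotation')

bcf_out = pysam.VariantFile('annotated.vcf', 'w', header=bcf_in.header)
for record in bcf_in.fetch():
    record.info['MyAnnotation'] = 'example_value'   # computed per record in the real scripts
    bcf_out.write(record)
bcf_out.close()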
+ ''' + alt_count = len(record.alts) + csq_dicts = {} + for i in range(alt_count): + try: + csq_line = record.info['CSQ'][i] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[i] + csq_values = csq_line.split('|') + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def read_cosmic(cosmic_resistance_file): + ''' + Read in annotated VCF file. + ''' + cosmic_resistance = pd.read_csv(cosmic_resistance_file, sep='\t') + return cosmic_resistance + + +def get_record(match_Mutation, key, region_cosmic_resistance, match_key): + ''' + Get value for field from Cancer Gene Census records + ''' + try: + value = region_cosmic_resistance[(region_cosmic_resistance[match_key] == match_Mutation)][key].values[0] + except (KeyError, IndexError): + value = '' + return value + + +def match_cosmic(cosmic_resistance, record, csq_columns): + ''' + Check if HGVSp or HGVSc is at the same position + in the Cosmic Restance database return values + for each ALT in list. + ''' + # ======================= + # Get VEP annotation + # ======================= + csq_dicts = get_csqs(record, csq_columns) + cosmic_resistance_annotation = {} + for i, alt in enumerate(record.alts): + cosmic_resistance_annotation[i] = {} + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = '' + # ======================= + # Check for id match + # ======================= + kind = 'CosmicCoding' + id = csq_dicts[i][kind] + legacy_id = csq_dicts[i][kind +'_LEGACY_ID'] + match = cosmic_resistance[cosmic_resistance.GENOMIC_MUTATION_ID == id].copy() + legacy_match = cosmic_resistance[cosmic_resistance.LEGACY_MUTATION_ID == legacy_id].copy() + # ID Mutation + if not match.empty: + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = match[key].tolist()[0] + + elif not legacy_match.empty: + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = legacy_match[key].tolist()[0] + else: + # ======================= + # Check for non coding id match + # ======================= + kind = 'CosmicNonCoding' + id = csq_dicts[i][kind] + legacy_id = csq_dicts[i][kind +'_LEGACY_ID'] + match = cosmic_resistance[cosmic_resistance.GENOMIC_MUTATION_ID == id].copy() + legacy_match = cosmic_resistance[cosmic_resistance.LEGACY_MUTATION_ID == legacy_id].copy() + # ID Mutation + if not match.empty: + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = match[key].tolist()[0] + elif not legacy_match.empty: + for key in ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']: + cosmic_resistance_annotation[i][key] = legacy_match[key].tolist()[0] + # ======================= + # Compose annotation text + # ======================= + cosmic_resistance_annotation_lines = [] + for i in cosmic_resistance_annotation: + to_join = [str(cosmic_resistance_annotation[i][annotation]) for annotation in ['MUTATION_ID', + 'GENOMIC_MUTATION_ID', + 'Drug Name', 'Tier']] + cosmic_resistance_annotation_lines.append('|'.join(to_join)) + cosmic_resistance_annotation_line = 
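match_cosmic() keys the resistance table on the VEP-reported COSMIC genomic ID, falls back to the legacy ID, and only then tries the non-coding equivalents. A condensed pandas sketch of that lookup order, using an invented two-row table rather than the real COSMIC export:

import pandas as pd

# Invented example table; the real file is loaded by read_cosmic() above.
cosmic_resistance = pd.DataFrame({
    'GENOMIC_MUTATION_ID': ['COSV12345', 'COSV67890'],
    'LEGACY_MUTATION_ID':  ['COSM111',   'COSM222'],
    'MUTATION_ID':         [1, 2],
    'Drug Name':           ['imatinib', 'gefitinib'],
    'Tier':                [1, 2],
})
KEYS = ['MUTATION_ID', 'GENOMIC_MUTATION_ID', 'Drug Name', 'Tier']

def lookup(genomic_id, legacy_id):
    match = cosmic_resistance[cosmic_resistance.GENOMIC_MUTATION_ID == genomic_id]
    if match.empty:
        match = cosmic_resistance[cosmic_resistance.LEGACY_MUTATION_ID == legacy_id]
    if match.empty:
        return {key: '' for key in KEYS}
    return {key: match[key].tolist()[0] for key in KEYS}

print(lookup('COSV67890', ''))   # matches the second row on the genomic ID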
','.join(cosmic_resistance_annotation_lines) + return cosmic_resistance_annotation_line + + +def write_file(bcf_in, out, csq_columns, + cosmic_resistance): + ''' + Write out the header + ''' + bcf_out = pysam.VariantFile(out, 'w', header=bcf_in.header) + for record in bcf_in.fetch(): + cosmic_resistance_annotation = match_cosmic(cosmic_resistance, record, csq_columns) + record.info['CosmicResistanceMutation'] = cosmic_resistance_annotation + exit_status = bcf_out.write(record) + if exit_status != 0: + print('exit_status', exit_status) + + +def main(): + cosmic_resistance_file = sys.argv[1] + vcf_file = sys.argv[2] + out = sys.argv[3] + assert os.path.isfile(vcf_file), 'Failed to find caller VCF call file :' + vcf_file + cosmic_resistance = read_cosmic(cosmic_resistance_file) + bcf_in = read_vcf(vcf_file) + bcf_in = modify_header(bcf_in) + csq_columns = get_csq_columns(bcf_in) + write_file(bcf_in, out, csq_columns, + cosmic_resistance) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/add_final_allele_counts_to_vcf.py b/bin/pta/add_final_allele_counts_to_vcf.py new file mode 100644 index 00000000..53b7663c --- /dev/null +++ b/bin/pta/add_final_allele_counts_to_vcf.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python + +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. + +# Version: 0.1 (2018-12-06) +# Author: Kanika Arora (karora@nygenome.org) +##################### COPYRIGHT ################################################ +################################################################################ + + +import sys +import argparse +import os +import re + +class ArgumentParser(argparse.ArgumentParser): + def error(self, message): + self.print_help(sys.stderr) + self.exit(2, '\nERROR: %s\n\n' % (message)) + + +def check_if_exists(file_or_dir_path, type="file"): + if type=="file": + if not os.path.isfile(file_or_dir_path): + print("ERROR: Required file {0} does not exist. Cannot run.".format(file_or_dir_path)) + sys.exit(1) + elif type=="directory": + if not os.path.isdir(file_or_dir_path): + print("ERROR: Required file {0} does not exist. Cannot run.".format(file_or_dir_path)) + sys.exit(1) + else: + if not os.path.exists(file_or_dir_path): + print("ERROR: {0} does not exist. Cannot run.".format(file_or_dir_path)) + sys.exit(1) + +def compute_vaf(alt_count,dp): + ''' + Compute VAF from dp and alt_count. + ''' + if not isinstance(alt_count, int): + if str(int(alt_count)) == alt_count: + alt_count=int(alt_count) + else: + raise ValueError("alt_count should be an integer. "+alt_count+" provided. Cannot run") + if not isinstance(dp, int): + if str(int(dp)) == dp: + dp=int(dp) + else: + raise ValueError("dp should be an integer. "+dp+" provided. 
Cannot run") + if alt_count > dp: + raise ValueError("alt_count {0} is greater than depth {1}.".format(alt_count,dp)) + return (0 if dp==0 else round(float(alt_count)/dp,4)) + +def parse_format_return_allele_counts(ref,alt,format_dict,caller): + ''' + Return allele counts, depth and allele fraction as reported by the given caller. + If these fields are not present as is, they are computed from other fields from that caller. + ''' + AD = format_dict[caller+"_AD"] if caller+"_AD" in format_dict else "." + DP = format_dict[caller+"_DP"] if caller+"_DP" in format_dict else "." + AF = format_dict[caller+"_AF"] if caller+"_AF" in format_dict else "." + if caller == "lancet": + AF=str(compute_vaf(AD.split(",")[1],DP)) + if caller == "strelka2": + ## compute as suggested in https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic + if len(ref)==1 and len(alt)==1: + ## SNV + AD_ref=format_dict["strelka2_"+ref+"U"].split(",")[0] + AD_alt=format_dict["strelka2_"+alt+"U"].split(",")[0] + DP=str(int(format_dict["strelka2_AU"].split(",")[0])+int(format_dict["strelka2_CU"].split(",")[0])+int(format_dict["strelka2_GU"].split(",")[0])+int(format_dict["strelka2_TU"].split(",")[0])) + else: + ## INDEL + AD_ref=format_dict["strelka2_TAR"].split(",")[0] + AD_alt=format_dict["strelka2_TIR"].split(",")[0] + DP=str(int(AD_ref)+int(AD_alt)) + AD=AD_ref+","+AD_alt + AF=str(compute_vaf(AD_alt,DP)) + return (AD,DP,AF) + + + +def __main__(): + parser = ArgumentParser(prog='add_final_allele_counts', + description='Picks final values for AD and DP based on set caller priority.', epilog='', + formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=100, width=150)) + parser.add_argument('-v', '--vcf', help = 'SNV VCF file.', required=True) + parser.add_argument('-o', '--output', help = 'Output VCF file.', required=True) + parser.add_argument('-p', '--priority', help = 'Comma-separated prioritized list of sources (callers) for picking final allele counts for the variants.', default='nygc,strelka2,mutect2,lancet') + args=parser.parse_args() + VCF=args.vcf + OUT=args.output + check_if_exists(VCF) + CALLER_PRIORITY=args.priority.split(",") + f=open(VCF) + o=open(OUT,"w") + seen=0 + for line in f: + if line.startswith("#"): + o.write(line) + if line.startswith("##FORMAT") and seen==0: + o.write('##INFO=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + seen=1 + else: + toks=line.strip().split("\t") + info_pattern=r"(\w+)=([^;]+);*" + matches = re.findall(info_pattern,toks[7]) + info_dict = {a:b for a,b in matches} + normal_format=dict(zip(toks[8].split(":"),toks[9].split(":"))) + tumor_format=dict(zip(toks[8].split(":"),toks[10].split(":"))) + if 'called_by' in info_dict: + called_by=info_dict["called_by"].split(",") + else: + called_by = [] + if "nygc_AD" in normal_format: + called_by.append("nygc") + chosen_caller="" + for caller in CALLER_PRIORITY: + if caller in called_by: + chosen_caller=caller + break + if chosen_caller=="": + o.write(line) + else: + toks[7]=toks[7]+";AlleleCountSource="+chosen_caller + (normal_AD,normal_DP,normal_AF)=parse_format_return_allele_counts(toks[3],toks[4],normal_format,chosen_caller) + (tumor_AD,tumor_DP,tumor_AF)=parse_format_return_allele_counts(toks[3],toks[4],tumor_format,chosen_caller) + toks[8]=toks[8]+":AD:DP:AF" + toks[9]=toks[9]+':{0}:{1}:{2}'.format(normal_AD,normal_DP,normal_AF) + toks[10]=toks[10]+':{0}:{1}:{2}'.format(tumor_AD,tumor_DP,tumor_AF) + o.write("\t".join(toks)+"\n") + f.close() + 
o.close() + +if __name__ == "__main__": + __main__() \ No newline at end of file diff --git a/bin/pta/add_nygc_allele_counts_to_vcf.py b/bin/pta/add_nygc_allele_counts_to_vcf.py new file mode 100644 index 00000000..81e3ef2a --- /dev/null +++ b/bin/pta/add_nygc_allele_counts_to_vcf.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python + +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. + +# Version: 0.5 (2018-09-05) +# Author: Kanika Arora (karora@nygenome.org) +##################### COPYRIGHT ################################################ +################################################################################ + +#### PLEASE NOTE THAT THIS SCRIPT EXPECTS THE FORMAT COLUMN FOR THE NORMAL SAMPLE +#### TO BE COLUMN 10 AND TUMOR SAMPLE TO BE COLUMN 11 +#### IT WILL ADD INCORRECT ALLELE COUNTS IF THAT ORDER OF SAMPLES IS NOT TRUE + +import pysam +import sys +import argparse +import os +import re + +header=dict() + +def add_new_header_field(KEY, VALUE): + if KEY not in header: + header[KEY]='' + header[KEY]=header[KEY]+"##{0}={1}\n".format(KEY,VALUE) + +class ArgumentParser(argparse.ArgumentParser): + def error(self, message): + self.print_help(sys.stderr) + self.exit(2, '\nERROR: %s\n\n' % (message)) + +def check_if_exists(file_or_dir_path, type="file"): + if type=="file": + if not os.path.isfile(file_or_dir_path): + print("ERROR: Required file {0} does not exist. Cannot run.".format(file_or_dir_path)) + sys.exit(1) + elif type=="directory": + if not os.path.isdir(file_or_dir_path): + print("ERROR: Required file {0} does not exist. Cannot run.".format(file_or_dir_path)) + sys.exit(1) + else: + if not os.path.exists(file_or_dir_path): + print("ERROR: {0} does not exist. Cannot run.".format(file_or_dir_path)) + sys.exit(1) + +def compute_vaf(alt_count,dp): + ''' + Compute VAF from dp and alt_count. + ''' + if not isinstance(alt_count,int): + if str(int(alt_count)) == alt_count: + alt_count=int(alt_count) + else: + raise ValueError("alt_count should be an integer. "+alt_count+" provided. Cannot run") + if not isinstance(dp,int): + if str(int(dp)) == dp: + dp=int(dp) + else: + raise ValueError("dp should be an integer. "+dp+" provided. Cannot run") + if alt_count > dp: + raise ValueError("alt_count {0} is greater than depth {1}.".format(alt_count,dp)) + return (0 if dp==0 else round(float(alt_count)/dp,4)) + + +def infer_variant_type(ref, alt): + ''' + Infers whether the variant is a SNV,MNV,INDEL or COMPLEX (delin). + ''' + variant_type="COMPLEX" + if len(ref) == len(alt): + if len(ref)==1: + variant_type="SNV" + else: + variant_type="MNV" + elif (len(ref) == 1 or len(alt) == 1) and ref[0] == alt[0]: + ## VCF file has anchor bases for indels + variant_type="INDEL" + return variant_type + +def is_too_long(ref, alt, variant_type, MAX_INDEL_LEN): + ''' + Test if an INDEL or COMPLEX event is too long for computing allele counts using NYGC's pileup method given the length cut off. 
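infer_variant_type() in add_nygc_allele_counts_to_vcf.py classifies each record purely from the REF/ALT lengths plus the shared anchor base, and the pileup counting below branches on that label. A few worked cases with invented alleles, restating the same decision rule:

def classify(ref, alt):
    # Same decision rule as infer_variant_type() above.
    if len(ref) == len(alt):
        return 'SNV' if len(ref) == 1 else 'MNV'
    if (len(ref) == 1 or len(alt) == 1) and ref[0] == alt[0]:
        return 'INDEL'            # VCF indels carry an anchor base
    return 'COMPLEX'              # delin / mixed event

assert classify('A', 'T') == 'SNV'
assert classify('AT', 'GC') == 'MNV'
assert classify('A', 'AT') == 'INDEL'      # insertion
assert classify('ATG', 'A') == 'INDEL'     # deletion
assert classify('AC', 'T') == 'COMPLEX'    # delin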
+ ''' + too_long = False + if (len(ref) > MAX_INDEL_LEN or len(alt) > MAX_INDEL_LEN) and (variant_type == "INDEL" or variant_type == "COMPLEX"): + too_long = True + return too_long + + +def read_pileup_return_count(samfile, chr, pos, ref, + alt, variant_type, MIN_MQ=10, MIN_BQ=10, + testing=False): + ref_reads = [] + alt_reads = [] + other_reads = [] + f1r2_reads = [] + f2r1_reads = [] + properly_paired_reads = [] + fwd_reads = [] + rev_reads = [] + pileup = samfile.pileup(chr, pos - 1, pos) + possible_complex=False + anchor_mismatch=0 + for pileupcolumn in pileup: + if pileupcolumn.pos == pos - 1: + for pileupread in pileupcolumn.pileups: + # if the position in the read is .is_del pos is none so take next + pos_in_read = pileupread.query_position + # skip reads where the position is already a deletion (is_del) + if not pos_in_read: + continue +# print ('pos_in_read', pos_in_read, pos, ref, alt) +# print('VERSION', pysam.__version__) + # ========================== + # pysam filters secondary, dup, and qcfail by default unless nofilter is used + # ========================== + BQ = pileupread.alignment.query_qualities[pos_in_read] + MQ = pileupread.alignment.mapping_quality + if testing: + if pileupread.alignment.is_duplicate: + print('is_duplicate') + sys.exit(0) + if pileupread.alignment.is_qcfail: + print('is_qcfail') + sys.exit(0) + if BQ < MIN_BQ \ + or MQ < MIN_MQ \ + or pileupread.alignment.is_supplementary: + continue + read_name = pileupread.alignment.query_name + # Check strand of reads + if pileupread.alignment.is_reverse is False: + fwd_reads.append(read_name) + else: + rev_reads.append(read_name) + #Check if properly paired + if pileupread.alignment.is_proper_pair is True: + properly_paired_reads.append(read_name) + #If properly paired, check if read-pair in F1R2 or F2R1 orientation + if pileupread.alignment.is_reverse is False and pileupread.alignment.is_read1: + f1r2_reads.append(read_name) + else: + f2r1_reads.append(read_name) + # filter reads that don't span the indel + if pos_in_read + len(ref) > pileupread.alignment.query_alignment_end \ + or pos_in_read + len(alt) > pileupread.alignment.query_alignment_end: + continue + + ## Check if read has ref allele or alt allele + if variant_type == "SNV" or variant_type == "MNV": + if pileupread.alignment.query_sequence[pos_in_read:pos_in_read + len(ref)] == ref: + ref_reads.append(read_name) + elif pileupread.alignment.query_sequence[pos_in_read:pos_in_read + len(alt)] == alt: + alt_reads.append(read_name) + if pileupread.indel != 0: + anchor_mismatch+=1 + else: + other_reads.append(read_name) + elif variant_type == "COMPLEX": + ### Exact length and sequence of allele match required for complex events. Example: if the variant is AC>T, and if a read has deletion of C but the nt at anchor position is A, it will go into other_reads ### + if pileupread.indel == 0 and pileupread.alignment.query_sequence[pos_in_read:pos_in_read + len(ref)] == ref: + ref_reads.append(read_name) + elif pileupread.indel == len(alt) - len(ref) and pileupread.alignment.query_sequence[pos_in_read:pos_in_read + len(alt)] == alt: + alt_reads.append(read_name) + else: + other_reads.append(read_name) + else: + #### Variant type is "INDEL" #### + ############################# PLEASE NOTE ####################################### + ### Variant calling for indels: For insertion, check whether the length of the + ### insertion and sequence matches alt allele. 
If there is no indel at the anchor + ### position (even if the base at the anchor position doesn't match), we consider + ### the read as adding support to the reference. For deletions, if the length of + ### deletion matches alt allele, we consider that read supporting the alt allele, + ### and if there is no deletion at that position (even if there are mismatches in + ### the bases spanning the deletion), it's considered to support reference allele. + ### Examples for insertions: + ### Let's say that the variant is chr1:12345 A > AT + ### Scenario1: The read has a C at chr1:12345 along with insertion of T + ### This read will be used to add support to the alternate allele + ### Scenario2: Read has a mismatch (let's say 'C') at chr1:12345, but no indel + ### This read will be used to add support to the reference allele + ### Scenario3: Read has a different insertion, let's say 'G' insted of 'T' + ### This read will go into the other_reads category. + ### Example for deletions: + ### Let's say the variant is chr1:12345 AT > A + ### Scenario1: The read has a C at chr1:12345 along with deletion of T + ### It will be used to add support to the alt allele. + ### Scenario2: The read has a A at chr1:12345 followed by a 2nt deletion + ### This read will go into the other_reads_category + ################################################################################### + if len(ref)==1: + #Variant is an insertion + if pileupread.indel == 0: ### and pileupread.alignment.query_sequence[pos_in_read:pos_in_read + 1] == ref: + ref_reads.append(read_name) + elif pileupread.indel == len(alt) - len(ref) and pileupread.alignment.query_sequence[pos_in_read+1:pos_in_read + len(alt)] == alt[1:]: + alt_reads.append(read_name) + if pileupread.alignment.query_sequence[pos_in_read] != alt[0]: + anchor_mismatch+=1 + else: + other_reads.append(read_name) + else: + # Variant is a deletion (len(ref)>1 and len(alt)==1) + if pileupread.indel == 0: ## and pileupread.alignment.query_sequence[pos_in_read+1:pos_in_read + len(ref)] == ref[1:]: + ref_reads.append(read_name) + elif pileupread.indel == len(alt) - len(ref): ## and pileupread.alignment.query_sequence[pos_in_read:pos_in_read + len(alt)] == alt: + alt_reads.append(read_name) + if pileupread.alignment.query_sequence[pos_in_read] != alt[0]: + anchor_mismatch+=1 + else: + other_reads.append(read_name) + + ## If there are more than 2 reads that support the alternate allele of an indel variant, but the anchor base does not match, we report that as a PossiblyComplex event ## + ## Similarly, if there are more than 2 reads that support alt allele of an SNV, but have an indel immediately following the SNV variant, we report that as a PossiblyComplex event ## + if anchor_mismatch > 2: + possible_complex=True + # check sets to make sure reads don't show up in multiple sets + # supporting multiple calls + set_ref_raw = set(ref_reads) + set_alt_raw = set(alt_reads) + set_other_raw = set(other_reads) + ref_reads_set = set_ref_raw - set_alt_raw - set_other_raw + alt_reads_set = set_alt_raw - set_ref_raw - set_other_raw + other_reads_set = set_other_raw - set_ref_raw - set_alt_raw + all_reads_set = alt_reads_set|ref_reads_set|other_reads_set + # make read-type sets + f1r2_reads_set = set(f1r2_reads) + f2r1_reads_set = set(f2r1_reads) + fwd_reads_set = set(fwd_reads) + rev_reads_set = set(rev_reads) + properly_paired_reads_set = set(properly_paired_reads) + # tally set in ref and alt, non-ref/alt, all reads + ref_count = len(ref_reads_set) + alt_count = len(alt_reads_set) + # 
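The insertion and deletion scenarios spelled out in the comment block above reduce to comparing pysam's per-read pileupread.indel value (positive for an insertion, negative for a deletion, zero for neither) against len(alt) - len(ref). A compact sketch of that per-read decision, with the read-level values invented:

def classify_read(ref, alt, read_indel, read_seq_at_alt):
    # Sketch of the indel branch above; read_indel stands in for pileupread.indel and
    # read_seq_at_alt for the read bases starting at the variant position.
    expected_indel = len(alt) - len(ref)       # >0 insertion, <0 deletion
    if read_indel == 0:
        return 'ref'                           # no indel at the anchor: counts as reference support
    if len(ref) == 1:                          # insertion: inserted bases must also match
        if read_indel == expected_indel and read_seq_at_alt[1:len(alt)] == alt[1:]:
            return 'alt'
    else:                                      # deletion: matching length alone is enough
        if read_indel == expected_indel:
            return 'alt'
    return 'other'

# chr1:12345 A>AT insertion, read carries a 1 bp 'T' insertion -> alt support
print(classify_read('A', 'AT', 1, 'AT'))
# same variant, read carries a 2 bp insertion instead -> other
print(classify_read('A', 'AT', 2, 'AGT'))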
other_count=len(other_reads_set) # not used + dp = len(all_reads_set) + # get VAF + vaf = compute_vaf(alt_count, dp) + # get read set count by pair info ref/alt + f1r2_ref = len(ref_reads_set.intersection(f1r2_reads_set)) + f1r2_alt = len(alt_reads_set.intersection(f1r2_reads_set)) + f2r1_ref = len(ref_reads_set.intersection(f2r1_reads_set)) + f2r1_alt = len(alt_reads_set.intersection(f2r1_reads_set)) + # get read set count by orientation ref/alt + fwd_ref = len(ref_reads_set.intersection(fwd_reads_set)) + fwd_alt = len(alt_reads_set.intersection(fwd_reads_set)) + rev_ref = len(ref_reads_set.intersection(rev_reads_set)) + rev_alt = len(alt_reads_set.intersection(rev_reads_set)) + # tally properly paired sets ref/alt + proper_paired_ref = len(ref_reads_set.intersection(properly_paired_reads_set)) + proper_paired_alt = len(alt_reads_set.intersection(properly_paired_reads_set)) + # tally not properly paired sets ref/alt + not_proper_paired_ref = len(ref_reads_set) - proper_paired_ref + not_proper_paired_alt = len(alt_reads_set) - proper_paired_alt + return ('{0},{1}'.format(ref_count, alt_count), + str(dp), str(vaf), + '{0},{1}'.format(f1r2_ref, f1r2_alt), + '{0},{1}'.format(f2r1_ref, f2r1_alt), + '{0},{1}'.format(fwd_ref, fwd_alt), + '{0},{1}'.format(rev_ref,rev_alt), + '{0},{1}'.format(proper_paired_ref, proper_paired_alt), + '{0},{1}'.format(not_proper_paired_ref, not_proper_paired_alt),possible_complex) + + +def __main__(): + parser = ArgumentParser(prog='add_nygc_allele_counts', + description='Runs pileup on tumor and normal bam files to compute allele counts for bi-allelic SNV and Indel variants in VCF file and adds pileup format columns to the VCF file.', epilog='', + formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=100, width=150)) + parser.add_argument('-t', '--tumor_bam', help = 'Tumor BAM file.', required=True) + parser.add_argument('-n', '--normal_bam', help = 'Normal BAM file.', required=True) + parser.add_argument('-v', '--vcf', help = 'SNV VCF file.', required=True) + parser.add_argument('-o', '--output', help = 'Output VCF file.', required=True) + parser.add_argument('-b', '--min_base_quality', help='Minimum base quality', default=10, type=int) + parser.add_argument('-m', '--min_mapping_quality', help='Minimum mapping quality', + default=10, type=int) + parser.add_argument('-i', '--max_indel_len_for_count', + help='Maximum indel or delin (complex event) length for generating counts', + default=10, type=int) + args=parser.parse_args() + # name variables + TBAM=args.tumor_bam + NBAM=args.normal_bam + VCF=args.vcf + OUT=args.output + MIN_BQ=args.min_base_quality + MIN_MQ=args.min_mapping_quality + MAX_INDEL_LEN=args.max_indel_len_for_count + # test files + check_if_exists(TBAM) + check_if_exists(NBAM) + check_if_exists(VCF) + + TumorSamFile=pysam.AlignmentFile(TBAM, "rb") + NormalSamFile=pysam.AlignmentFile(NBAM, "rb") + + f=open(VCF) + o=open(OUT,"w") + seen=0 + for line in f: + if line.startswith("#"): + if line.startswith("##FORMAT") and seen==0: + o.write('##INFO=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + o.write('##FORMAT=\n') + seen=1 + o.write(line) + else: + new_ids=["nygc_AD", "nygc_DP", "nygc_AF", "nygc_F1R2", "nygc_F2R1", "nygc_FWD", "nygc_REV", + "nygc_PROPER_PAIRED", "nygc_NOT_PROPER_PAIRED"] + line=line.strip() + toks=line.split("\t") + chrom = toks[0] + pos = 
toks[1] + ref = toks[3] + alt = toks[4] + variant_type=infer_variant_type(ref,alt) + + too_long = is_too_long(ref, alt, variant_type, MAX_INDEL_LEN) + if too_long: + o.write("\t".join(toks)+"\n") + else: + toks[8] = toks[8]+":"+":".join(new_ids) + (AD,DP,AF,F1R2,F2R1,FWD,REV,PROPER_PAIRED,NOT_PROPER_PAIRED,possibly_complex) = read_pileup_return_count(NormalSamFile, chrom, + int(pos), + ref, alt, variant_type, + MIN_MQ=10, MIN_BQ=10) + toks[9]=toks[9]+":"+":".join([AD, DP, AF, F1R2, F2R1, FWD, REV, + PROPER_PAIRED, NOT_PROPER_PAIRED]) + (AD,DP,AF,F1R2,F2R1,FWD,REV,PROPER_PAIRED,NOT_PROPER_PAIRED, possibly_complex) = read_pileup_return_count(TumorSamFile, chrom, + int(pos), + ref, alt, variant_type, + MIN_MQ=10, MIN_BQ=10) + toks[10]=toks[10]+":"+":".join([AD, DP, AF, F1R2, F2R1, FWD, REV, + PROPER_PAIRED, NOT_PROPER_PAIRED]) + if possibly_complex is True: + toks[7]=toks[7]+";PossiblyComplex" + o.write("\t".join(toks)+"\n") + f.close() + o.close() + +if __name__ == "__main__": + __main__() \ No newline at end of file diff --git a/bin/pta/annotate-bedpe-with-cnv.r b/bin/pta/annotate-bedpe-with-cnv.r new file mode 100644 index 00000000..982b9b9f --- /dev/null +++ b/bin/pta/annotate-bedpe-with-cnv.r @@ -0,0 +1,145 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Annotate a merged & annotated BEDPE with closest CNV changepoint +libs = c('optparse', 'StructuralVariantAnnotation', 'VariantAnnotation', 'rtracklayer', 'stringr') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + + + +## Handle non-standard bedpe columns better +readBEDPE = function(f) { + + ## Read file as Pairs object + x = rtracklayer::import(f, format='bedpe') + + ## Update metadata column names + x.mcol.names = colnames(read.csv(f, h=T, stringsAsFactors=F, sep='\t', check.names=F)) + colnames(mcols(x))[3:ncol(mcols(x))] = x.mcol.names[11:length(x.mcol.names)] + mcols(x)$type = mcols(x)$name + + + ## Convert to breakpoint ranges + x = StructuralVariantAnnotation::pairs2breakpointgr(x) + + return(x) + +} + + + +## Read a headered, tab-delimited CNV file into a GRanges object +readCNV = function(f) { + + x = read.csv(f, h=F, stringsAsFactors=F, sep='\t', comment.char='#') + colnames(x)[1:3] = c('chr','start','end') + x = makeGRangesFromDataFrame(x) + + return(x) + +} + + + +## Find the nearest copy number changepoints to each breakend +annotateWithClosestChangepoint = function(sv, cnv) { + + sv$cnv = '' + cnv.str = paste0(as.character(seqnames(cnv)), ':', start(cnv), '-', end(cnv)) + nearest.cnv = GenomicRanges::nearest(sv, cnv) + + ## Make sure NAs (i.e. 
no nearest neightbor) are preserved as blanks + idx.na = which(is.na(nearest.cnv)) + nearest.cnv[idx.na] = 1 + + sv$cnv = cnv.str[nearest.cnv] + sv$cnv[idx.na] = '' + + return(sv) + +} + + + +## Convert breakpointRanges to BEDPE +vcfToBedpe = function(vcf) { + + sqn = as.character(seqnames(vcf)) + strand = as.character(strand(vcf)) + res = c() + processed = c() + + for (i in 1:length(vcf)) { + bnd = names(vcf)[i] + partner = vcf$partner[i] + partner.idx = which(names(vcf) == partner) + + ## If we don't have exactly one partner, exclude this variant + if (length(partner.idx) != 1) { + warning('Missing partner for breakend ', bnd) + next + } + + ## Check to see if we've alrady processed this or it's partner + if (any(c(bnd, partner) %in% processed)) { + next + } + + + ## Combine breakends in single line + res.i = c(sqn[i], start(vcf)[i], end(vcf)[i], ## chr1, start1, end1 + sqn[partner.idx], start(vcf)[partner.idx], end(vcf)[partner.idx], ## chr2, start2, end 2 + vcf$type[i], '.', strand[i], strand[partner.idx], ## type, score, strand1, strand2 + vcf$evidence[i], vcf$tools[i], vcf$`tumor--normal`[i], vcf$info[i], ## evidence, tools, TN, info + vcf$cnv[i], vcf$cnv[partner.idx]) ## changepoint1 , changpoint2 + ## Add to result, keep track of processed breakends + res = rbind(res, res.i) + processed = c(processed, bnd, partner) + } + + + ## Add colnames and fill in simple event classifications + colnames(res) = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type', + 'score', 'strand1', 'strand2', 'evidence', 'tools', 'tumor--normal', + 'info', 'cnv_changepoint_1', 'cnv_changepoint_2') + res = as.data.frame(res, stringsAsFactors=F) + + + ## Fix coordinates (have to subtract when starting from a bedpe) + res$start1 = as.numeric(res$start1) - 1 + res$start2 = as.numeric(res$start2) - 1 + + + colnames(res)[1] = paste0('#', colnames(res)[1]) + + return(res) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-b", "--bedpe"), type='character', help="Input BEDPE"), + make_option(c("-c", "--cnv"), type='character', help="BED file containing CNV intervals"), + make_option(c("-o", "--out_file"), type='character', help="Output BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + + + +## Read bedpe +sv = readBEDPE(opt$bedpe) + +## Read CNV +cnv = readCNV(opt$cnv) + +## Annotate breakpoints with closest changepoint +sv = annotateWithClosestChangepoint(sv=sv, cnv=cnv) + +## Convert to bedpe +res = vcfToBedpe(sv) + +## Write result +write.table(res, opt$out_file, row.names=F, col.names=T, sep='\t', quote=F) diff --git a/bin/pta/annotate-bedpe-with-databases.r b/bin/pta/annotate-bedpe-with-databases.r new file mode 100644 index 00000000..385d21e1 --- /dev/null +++ b/bin/pta/annotate-bedpe-with-databases.r @@ -0,0 +1,254 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Annotate a merged bedpe with arbitrary databases +libs = c('optparse', 'StructuralVariantAnnotation', 'VariantAnnotation', 'rtracklayer', 'stringr', 'gUtils') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + + + +## Handle non-standard bedpe columns better +readBEDPE = function(f) { + + ## Read file as Pairs object + x = rtracklayer::import(f, format='bedpe') + + ## Update metadata column names + x.mcol.names = 
colnames(read.csv(f, h=T, stringsAsFactors=F, sep='\t', check.names=F)) + colnames(mcols(x))[3:ncol(mcols(x))] = x.mcol.names[11:length(x.mcol.names)] + mcols(x)$type = mcols(x)$name + + + ## Convert to breakpoint ranges + x = StructuralVariantAnnotation::pairs2breakpointgr(x) + + return(x) + +} + + + +## Handle both BEDPE and BED files (headerless) +readDB = function(f) { + + is.bed = grepl('\\.bed\\.gz$|\\.bed$', f) + is.bedpe = grepl('\\.bedpe\\.gz$|\\.bedpe$', f) + is.vcf = grepl('\\.vcf\\.gz$|\\.vcf$', f) + + if (is.bed || is.bedpe) { + x = rtracklayer::import(f) + + ## If this is a BEDPE, convert to a breakpointRanges object + ## The package seems to misname SV type so update that + if (is.bedpe) { + x = StructuralVariantAnnotation::pairs2breakpointgr(x) + x$type = x$sourceId + } + + } + + if (is.vcf) { + + x = VariantAnnotation::readVcf(f) + x = StructuralVariantAnnotation::breakpointRanges(x, nominalPosition=T) + + if ('svtype' %in% colnames(mcols(x))) { + x$type = x$svtype + } else { + x$type = x$sourceId + } + + + } + + + return(x) + +} + + + +## Check overlaps between a breakpointRanges and a GRanges, +pairInBed = function(query, subject) { + + overlaps = rep(NA, length(query)) + + ## For each breakend + processed = c() + for (i in 1:length(query)) { + + partner.idx = which(names(query) == query$partner[i]) + is.translocation = as.character(seqnames(query[i])) != as.character(seqnames(query[partner.idx])) + + ## Check if we've already procesed the full breakpoint + if (i %in% processed) { + next + } + + if (is.translocation) { + + overlap = any(query[c(i, partner.idx)] %^% subject) + + } else { + + bkpt = GRanges(as.character(seqnames(query))[i], IRanges(start(query)[i], end(query)[partner.idx])) + overlap = any(bkpt %^% subject) + + } + + overlaps[c(i, partner.idx)] = overlap + processed = c(processed, i, partner.idx) + + } + + return(overlaps) + +} + + + +## Annotate breakpointRanges object with breakpointRanges or GRanges +annotateDB = function(x, db, name, slop, ignore.strand=F) { + + + ## Use different overlap method depending on whether DB is BED or BEDPE + if ('partner' %in% colnames(mcols(db))) { + overlaps = StructuralVariantAnnotation::findBreakpointOverlaps(query=x, + subject=db, + maxgap=slop, + sizemargin=0.8, + restrictMarginToSizeMultiple=0.8, + ignore.strand=ignore.strand) + overlaps = queryHits(overlaps) + + } else { + # overlaps = GenomicRanges::findOverlaps(query=x, subject=db) + overlaps = pairInBed(query=x, subject=db) + } + + ## Annotate with hits if there are any + x$db[overlaps] = paste0(dta$db[overlaps],name,',') + + return(x) + +} + + + +## Convert breakpointRanges to BEDPE +vcfToBedpe = function(vcf) { + + sqn = as.character(seqnames(vcf)) + strand = as.character(strand(vcf)) + res = c() + processed = c() + + for (i in 1:length(vcf)) { + bnd = names(vcf)[i] + partner = vcf$partner[i] + partner.idx = which(names(vcf) == partner) + + ## If we don't have exactly one partner, exclude this variant + if (length(partner.idx) != 1) { + warning('Missing partner for breakend ', bnd) + next + } + + ## Check to see if we've alrady processed this or it's partner + if (any(c(bnd, partner) %in% processed)) { + next + } + + ## Aggregate database string + dbstr = unique(unlist(strsplit(c(vcf$db[i], vcf$db[partner.idx]), ',', fixed=T))) + dbstr = paste(dbstr, collapse=',') + dbstr = paste0('known=',dbstr,';') + + + ## Combine breakends in single line + res.i = c(sqn[i], start(vcf)[i], end(vcf)[i], ## chr1, start1, end1 + sqn[partner.idx], start(vcf)[partner.idx], 
end(vcf)[partner.idx], ## chr2, start2, end 2 + vcf$type[i], '.', strand[i], strand[partner.idx], ## type, score, strand1, strand2 + vcf$evidence[i], vcf$tools[i], vcf$`tumor--normal`[i], dbstr) ## evidence, tools, TN, info + + ## Add to result, keep track of processed breakends + res = rbind(res, res.i) + processed = c(processed, bnd, partner) + } + + + ## Add colnames and fill in simple event classifications + colnames(res) = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type', 'score', 'strand1', 'strand2', 'evidence', 'tools', 'tumor--normal','info') + res = as.data.frame(res, stringsAsFactors=F) + + + ## Fix coordinates (have to subtract when starting from a bedpe) + res$start1 = as.numeric(res$start1) - 1 + res$start2 = as.numeric(res$start2) - 1 + + + colnames(res)[1] = paste0('#', colnames(res)[1]) + + return(res) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-b", "--bedpe"), type='character', help="Input BEDPE"), + make_option(c("-n", "--db_names"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"), + make_option(c("-f", "--db_files"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"), + make_option(c("-i", "--db_ignore_strand"), type='character', help="Comma-delimited list of database names to ignore strand orientation for when overlapping? Should be present in --db_names"), + make_option(c("-s", "--slop"), type='numeric', help="Padding to use when comparing breakpoints"), + make_option(c("-o", "--out_file"), type='character', help="Output BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + + +## Unpack arguments +opt$db_names = unlist(strsplit(opt$db_names, ',', fixed=T)) +opt$db_files = unlist(strsplit(opt$db_files, ',', fixed=T)) + +if (!is.null(opt$db_ignore_strand)) { + opt$db_ignore_strand = unlist(strsplit(opt$db_ignore_strand, ',', fixed=T)) +} + + + +## Sanity-check ignore-strand option +if (!is.null(opt$db_ignore_strand) && !all(opt$db_ignore_strand %in% opt$db_names)) { + missing = paste(setdiff(opt$db_ignore_strand, opt$db_names), collapse=',') + stop('Databases present in --db_ignore_strand not present in --db_names: ', missing) +} + + +## Read bedpe +dta = readBEDPE(opt$bedpe) +dta$db = '' + + +## Annotate with databases +for (i in 1:length(opt$db_names)) { + + db.name = opt$db_names[i] + db.file = opt$db_files[i] + is = !is.null(opt$db_ignore_strand) && db.name %in% opt$db_ignore_strand + + db = readDB(db.file) + dta = annotateDB(x=dta, + db=db, + name=db.name, + slop=opt$slop, + ignore.strand=is) + +} + + +## Convert to bedpe +res = vcfToBedpe(dta) + +## Write result +write.table(res, opt$out_file, row.names=F, col.names=T, sep='\t', quote=F) diff --git a/bin/pta/annotate-bedpe-with-genes.r b/bin/pta/annotate-bedpe-with-genes.r new file mode 100644 index 00000000..652ed67d --- /dev/null +++ b/bin/pta/annotate-bedpe-with-genes.r @@ -0,0 +1,344 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Annotate a merged & annotated BEDPE with gene information +libs = c('optparse', 'StructuralVariantAnnotation', 'VariantAnnotation', 'rtracklayer', 'gUtils') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + + +CLOSEST_MAX_DISTANCE = 2e4 
## For intergenic CNVs, ignore distanceToNearest() hits farther than this + + + +## Handle non-standard bedpe columns better +readBEDPE = function(f) { + + ## Read file as Pairs object + x = rtracklayer::import(f, format='bedpe') + + ## Update metadata column names + x.mcol.names = colnames(read.csv(f, h=T, stringsAsFactors=F, sep='\t', check.names=F)) + colnames(mcols(x))[3:ncol(mcols(x))] = x.mcol.names[11:length(x.mcol.names)] + mcols(x)$type = mcols(x)$name + + + ## Convert to breakpoint ranges + x = StructuralVariantAnnotation::pairs2breakpointgr(x) + + return(x) + +} + + + +## Read Ensembl gene info +readEnsembl = function(f) { + + ## Read and get rid of unnecessary columns + x = read.csv(f, h=F, stringsAsFactors=F, sep='\t') + colnames(x) = c('gene_chr', 'gene_start', 'gene_end', 'strand', 'name', 'na1', 'na2', 'exons', 'exon_starts', 'exon_ends', 'na3', 'intron_starts', 'intron_ends') + x = x[, !grepl('na[0-9]$', colnames(x))] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='gene_chr', + start.field='gene_start', + end.field='gene_end') + + x$intron_starts = gsub(',$','',x$intron_starts) + x$intron_ends = gsub(',$','',x$intron_ends) + + return(x) + +} + + + +## Check if breakpoints fall within genes +annotateWithDisruptions = function(sv, genes) { + + genes$disrupt = genes$name + sv = gr.val(query=sv, target=genes, val='disrupt') + sv$disrupt = gsub(' ', '', sv$disrupt, fixed=T) + + return(sv) + +} + + + +readCancerCensus = function(f) { + + x = read.csv(f, h=T, stringsAsFactors=F, sep='\t') + colnames(x) = c('chrom', 'start', 'end', 'name', 'locus') + + + # x$name = gsub('\\|.*$', '', x$name) + x = x[, !colnames(x) %in% 'locus'] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='chrom', + start.field='start', + end.field='end') + + return(x) + +} + + + +## Check if breakends fall within introns +annotateWithIntronic = function(sv=sv, genes=genes) { + + sv$intronic = '' + + ## Subset gene list to those that have introns and are already known to overlap with breakpoints + genes = genes[genes$intron_starts != '-' & genes$name %in% unique(unlist(strsplit(sv$disrupt, ',')))] + + ## Expand introns to GRanges object (keeping track of what genes they belong to) + introns = base::mapply(function(x, y, z, n) GRanges(x, IRanges(as.numeric(y), as.numeric(z), name=rep(n, length(y)))), + x=as.character(seqnames(genes)), + y=strsplit(genes$intron_starts, ',', fixed=T), + z=strsplit(genes$intron_ends, ',', fixed=T), + n=genes$name) + + introns = Reduce(c, introns) + + + ## Do SV breakends overlap any introns? + processed = c() + for (i in 1:length(sv)) { + partner.idx = which(names(sv) == sv$partner[i]) + + ## Check if we've already procesed the full breakpoint + if (i %in% processed) { + next + } + + ## Check which introns these breakpoints overlap + intron.hits = findOverlaps(query=c(sv[c(i, partner.idx)]), subject=introns) + + ## Do we have any duplicated intron hits? 
If so, these breakends fall within the same intron + gene.names = names(introns)[subjectHits(intron.hits)[duplicated(subjectHits(intron.hits))]] + sv$intronic[c(i, partner.idx)] = paste(gene.names, collapse=',') + + processed = c(processed, i, partner.idx) + + } + + return(sv) + +} + + + +## Find the nearest copy number changepoints to each breakend +annotateWithClosest = function(sv, genes, closest.max.distance=CLOSEST_MAX_DISTANCE) { + + sv$closest = '' + + ## For each breakend + for (i in 1:length(sv)) { + + ## Find the nearest non-overlapping gene, i.e. + ## exclude any disrupted/contained gene(s) from the comparisons + disrupt = unlist(strsplit(sv$disrupt[i], ',')) + # contains = unlist(strsplit(sv$contains[i], ',')) + intronic = unlist(strsplit(sv$intronic[i], ',')) + + if (length(c(disrupt, intronic)) > 0) { + genes.i = genes[-which(genes$name %in% c(disrupt, intronic))] + } else { + genes.i = genes + } + + + ## Add closest gene, subject to distance cutoff (default 20kb) + closest = GenomicRanges::distanceToNearest(x=sv[i], subject=genes.i, ignore.strand=T) + closest = closest[mcols(closest)$distance <= closest.max.distance] + + if (length(closest) > 0) { + sv$closest[i] = genes.i$name[subjectHits(closest)] + } + + } + + return(sv) + +} + + + +annotateWithContained = function(sv, genes, sv.colname='contains', allow.partial.overlap=F) { + + mcols(sv)[,sv.colname] = '' + + ## For each breakend + processed = c() + for (i in 1:length(sv)) { + + partner.idx = which(names(sv) == sv$partner[i]) + is.translocation = as.character(seqnames(sv[i])) != as.character(seqnames(sv[partner.idx])) + contains = '' + + ## Check if we've already procesed the full breakpoint + if (i %in% processed) { + next + } + + ## Only check contained genes if this is intrachromosomal + if (!is.translocation) { + + ## If allow.partial.overlap=T, allow partially overlapping intervals, otherwise, just a + ## simple coordinate check to see if intervals are fully contained + if (allow.partial.overlap) { + + bkpt = GRanges(as.character(seqnames(sv))[i], IRanges(start(sv)[i], end(sv)[partner.idx])) + contains = genes$name[genes %^% bkpt] + + } else { + + contains = genes$name[start(sv)[i] <= start(genes) & + end(genes) <= end(sv)[partner.idx] & + as.character(seqnames(genes)) == as.character(seqnames(sv[i]))] + + } + + + + } else if (allow.partial.overlap && is.translocation) { + + contains = genes$name[genes %^% sv[c(i, partner.idx)]] + + } + + mcols(sv)[i, sv.colname] = paste(contains, collapse=',') + processed = c(processed, i, partner.idx) + + } + + return(sv) + +} + + + +## Convert breakpointRanges to BEDPE +vcfToBedpe = function(vcf, supplemental) { + + sqn = as.character(seqnames(vcf)) + strand = as.character(strand(vcf)) + res = c() + processed = c() + + for (i in 1:length(vcf)) { + bnd = names(vcf)[i] + partner = vcf$partner[i] + partner.idx = which(names(vcf) == partner) + + ## If we don't have exactly one partner, exclude this variant + if (length(partner.idx) != 1) { + warning('Missing partner for breakend ', bnd) + next + } + + ## Check to see if we've alrady processed this or it's partner + if (any(c(bnd, partner) %in% processed)) { + next + } + + + ## Add disrupt/closest/intronic to info field + disrupt.l = paste0('DisruptL=', vcf$disrupt[i]) + disrupt.r = paste0('DisruptR=', vcf$disrupt[partner.idx]) + closest.l = paste0('ClosestL=', vcf$closest[i]) + closest.r = paste0('ClosestR=', vcf$closest[partner.idx]) + intronic = paste0('Intronic=', vcf$intronic[i]) + contained = paste0('Contained=', 
vcf$contains[i]) + + if (supplemental) { + gene.str = paste(disrupt.l, disrupt.r, closest.l, closest.r, intronic, contained, sep=';') + } else { + gene.str = paste(disrupt.l, disrupt.r, closest.l, closest.r, intronic, sep=';') + } + + + + ## Add cancer census genes only if they exist + if (vcf$cgc[i] != '' || vcf$cgc[partner.idx] !='') { + cgc.str = paste0('Cancer_census=', paste(setdiff(vcf$cgc[c(i,partner.idx)],''),collapse=',')) + gene.str = paste(gene.str, cgc.str, sep=';') + } + + vcf$info[i] = paste0(vcf$info[i], gene.str) + + + ## Combine breakends in single line + res.i = c(sqn[i], start(vcf)[i], end(vcf)[i], ## chr1, start1, end1 + sqn[partner.idx], start(vcf)[partner.idx], end(vcf)[partner.idx], ## chr2, start2, end 2 + vcf$type[i], '.', strand[i], strand[partner.idx], ## type, score, strand1, strand2 + vcf$evidence[i], vcf$tools[i], vcf$`tumor--normal`[i], vcf$info[i]) ## evidence, tools, TN, info + + ## Add to result, keep track of processed breakends + res = rbind(res, res.i) + processed = c(processed, bnd, partner) + } + + + ## Add colnames and fill in simple event classifications + colnames(res) = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type', + 'score', 'strand1', 'strand2', 'evidence', 'tools', 'tumor--normal', + 'info') + res = as.data.frame(res, stringsAsFactors=F) + + ## Fix coordinates (have to subtract when starting from a bedpe) + res$start1 = as.numeric(res$start1) - 1 + res$start2 = as.numeric(res$start2) - 1 + + + colnames(res)[1] = paste0('#', colnames(res)[1]) + + return(res) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-b", "--bedpe"), type='character', help="Input BEDPE"), + make_option(c("-e", "--ensembl"), type='character', help="Ensembl gene list"), + make_option(c("-c", "--cancer_census"), type='character', help="Cancer census gene list"), + make_option(c("-s", "--supplemental"), action='store_true', default=F, help="Add supplementary gene annotations?"), + make_option(c("-o", "--out_file"), type='character', help="Output BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + + +## Read bedpe +sv = readBEDPE(opt$bedpe) + +## Read gene lists +genes = readEnsembl(opt$ensembl) +cgc = readCancerCensus(opt$cancer_census) + +## Add contained ensembl and cgc genes +sv = annotateWithContained(sv=sv, genes=genes, allow.partial.overlap=F) +sv = annotateWithContained(sv=sv, genes=cgc, sv.colname='cgc', allow.partial.overlap=T) + +## Add disrupted genes +sv = annotateWithDisruptions(sv=sv, genes=genes) + +## Which breakpoints fall within introns? 
+sv = annotateWithIntronic(sv=sv, genes=genes) + +## Add closest (non-contained) genes +sv = annotateWithClosest(sv=sv, genes=genes) + +## Convert to bedpe +res = vcfToBedpe(sv, supplemental=opt$supplemental) + +## Write result +write.table(res, opt$out_file, row.names=F, col.names=T, sep='\t', quote=F) diff --git a/bin/pta/annotate-cnv.r b/bin/pta/annotate-cnv.r new file mode 100644 index 00000000..814847a4 --- /dev/null +++ b/bin/pta/annotate-cnv.r @@ -0,0 +1,331 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Annotate a merged bedpe with arbitrary databases +libs = c('optparse', 'gUtils', 'GenomicRanges', 'rtracklayer') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + +## TODO: Move to config? +CLOSEST_MAX_DISTANCE = 2e4 ## For intergenic CNVs, ignore nearest() hits farther than this +LARGESCALE_MIN = 3e6 ## Any events smaller than this are considered focal +DUP_LOG2 = 0.2 ## log2 ratio cutoff for considering an event a duplication +DEL_LOG2 = -0.235 ## log2 ratio cutoff for considering an event a deletion + +## Read BIC-Seq2 output into a GRanges object +## Optionally subset to CNVs in chr +readCNV = function(f, chr=NULL) { + + x = read.csv(f, h=T, stringsAsFactors=F, sep='\t') + colnames(x)[colnames(x) == 'log2.copyRatio'] = 'log2' + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='chrom', + start.field='start', + end.field='end') + + + if (!is.null(chr)) { + x = x[as.character(seqnames(x)) %in% chr] + } + + return(x) + +} + +## Read cytoband into a GRanges object +readCytoband = function(f) { + + x = read.csv(f, h=F, stringsAsFactors=F, sep='\t') + colnames(x) = c('chrom', 'start', 'end', 'cytoband', 'stain') + + x = x[, !colnames(x) %in% 'stain'] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='chrom', + start.field='start', + end.field='end') + + return(x) + +} + +readDB = function(f) { + + x <- import(f, format = 'BED') + + return(x) + +} + +readCancerCensus = function(f) { + + x = read.csv(f, h=T, stringsAsFactors=F, sep='\t') + colnames(x) = c('chrom', 'start', 'end', 'cgc', 'locus') + + # x$cgc = gsub('\\|.*$', '', x$cgc) + x = x[, !colnames(x) %in% 'locus'] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='chrom', + start.field='start', + end.field='end') + + return(x) + +} + +## Read Ensembl +readEnsembl = function(f) { + + ## Read and get rid of unnecessary columns + x = read.csv(f, h=F, stringsAsFactors=F, sep='\t') + colnames(x) = c('gene_chr', 'gene_start', 'gene_end', 'strand', 'name', 'na1', 'na2', 'exons', 'exon_starts', 'exon_ends', 'na3', 'intron_starts', 'intron_ends') + x = x[, !grepl('na[0-9]$', colnames(x))] + + x = GenomicRanges::makeGRangesFromDataFrame(x, + keep.extra.columns=T, + seqnames.field='gene_chr', + start.field='gene_start', + end.field='gene_end') + + x$intron_starts = gsub(',$','',x$intron_starts) + x$intron_ends = gsub(',$','',x$intron_ends) + + return(x) + +} + +## Simplify comma-delimited cytoband list to only the first and last cytobands +.simplifyCytoband = function(x, delim=', ', collapse='-') { + + x = unlist(strsplit(x, delim, fixed=T)) + + if (length(x) > 1) { + x = paste0(x[1], collapse, x[length(x)]) + } + + return(x) + +} + 
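Further down, annotate-cnv.r labels every segment with the log2-ratio cutoffs defined at the top of the file (DUP above 0.2, DEL below -0.235, NEU otherwise) and flags segments narrower than 3 Mb as focal. The script itself is R; the following is only a plain-Python re-expression of those thresholds:

DUP_LOG2, DEL_LOG2, LARGESCALE_MIN = 0.2, -0.235, 3e6

def classify_segment(start, end, log2):
    # Mirrors the DUP/DEL/NEU and focal/large-scale labels assigned later in this script.
    if log2 > DUP_LOG2:
        cnv_type = 'DUP'
    elif log2 < DEL_LOG2:
        cnv_type = 'DEL'
    else:
        cnv_type = 'NEU'
    focal = 'yes' if (end - start) < LARGESCALE_MIN else 'no'
    return cnv_type, focal

print(classify_segment(1_000_000, 2_500_000, 0.8))     # ('DUP', 'yes')  focal duplication
print(classify_segment(10_000_000, 60_000_000, -0.5))  # ('DEL', 'no')   large-scale deletion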
+## Annotate with cytoband +annotateCytoband = function(cnv, cytoband) { + + ## Pull in cytoband info + cnv = cnv %$% cytoband + + ## Simplify comma-delimited representation to hyphenated if necessary + cnv$cytoband = sapply(cnv$cytoband, .simplifyCytoband) + + ## Add chromosome information + cnv$cytoband = paste0(as.character(seqnames(cnv)), cnv$cytoband) + + return(cnv) + +} + +## Annotate with databases, subject to reciprocal overlap criteria +annotateDB = function(x, db, name, overlap) { + + ## Find hits + hits = GenomicRanges::findOverlaps(query=x, subject=db) + + ## Compute overlap + mcols(hits)$intersection = width(pintersect(x[queryHits(hits)], db[subjectHits(hits)])) + mcols(hits)$overlap_query = mcols(hits)$intersection / width(x[queryHits(hits)]) + mcols(hits)$overlap_subject = mcols(hits)$intersection / width(db[subjectHits(hits)]) + + ## Hits should meet the minimum reciprocal overlap cutoff + ## To match bedtools::intersect's implementation of reciprocal oerlap, + ## The fraction overlap should be at least the same in each direction + hits = hits[mcols(hits)$overlap_query >= overlap & mcols(hits)$overlap_subject >= overlap] + + ## Annotate any hits we get + x$db[queryHits(hits)] = paste0(x$db[queryHits(hits)], ',', name) + x$db = gsub('^,', '', x$db) + + return(x) + +} + +## Compare GRanges x to GRanges gene mcols intron_start, intron_end +.isIntronic = function(x, gene) { + + if (gene$intron_starts == '-') { + + is.intronic = F + + } else { + + introns = GRanges(as.character(seqnames(gene)), + IRanges(as.numeric(unlist(strsplit(gene$intron_starts, ',', fixed=T))), + as.numeric(unlist(strsplit(gene$intron_ends, ',', fixed=T))))) + + is.intronic = any(start(introns) <= start(x) && end(x) <= end(introns)) + + } + + return(is.intronic) + +} + +## Annotate with ensembl genes +annotateEnsembl = function(x, ens, closest.max.distance=CLOSEST_MAX_DISTANCE) { + + ## Init empty columns + mcols(x)[, c('disrupt.l', 'disrupt.r', 'contains', 'intronic', 'intergenic', 'closest')] = '' + + ## Find hits + hits = GenomicRanges::findOverlaps(query=x, subject=ens) + + ## Check contains, disruption on each side, intronic + x.start = start(x[queryHits(hits)]) + x.end = end(x[queryHits(hits)]) + gene.start = start(ens[subjectHits(hits)]) + gene.end = end(ens[subjectHits(hits)]) + + contains = x.start <= gene.start & gene.end <= x.end + disrupt.r = gene.start <= x.end & x.end <= gene.end + disrupt.l = gene.start <= x.start & x.start <= gene.end + intronic = F + + + ## We only need to check CNVs in introns if they don't contain their hit + ## and intersect the gene on both CNV ends + for (i in which(!contains & disrupt.r & disrupt.l)) { + + message('Checking potential intronic variant...') + intronic[i] = .isIntronic(x=x[queryHits(hits[i])], gene=ens[subjectHits(hits[i])]) + + } + + ## Concatenate genes and store + ## The tapply() aggregates genes by query index (i.e. 
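annotateDB() keeps a database hit only when the intersection covers at least the requested fraction of both the query CNV and the database interval, matching bedtools intersect's reciprocal-overlap behaviour. A plain-Python sketch of that reciprocal test on toy intervals (the R script does this with GenomicRanges):

def reciprocal_overlap(query, subject, min_fraction):
    # query/subject are (start, end) intervals on the same chromosome; the intersection
    # must cover at least min_fraction of BOTH intervals for the hit to be kept.
    inter = min(query[1], subject[1]) - max(query[0], subject[0])
    if inter <= 0:
        return False
    return (inter / (query[1] - query[0]) >= min_fraction and
            inter / (subject[1] - subject[0]) >= min_fraction)

print(reciprocal_overlap((1000, 2000), (1100, 1900), 0.5))   # True: 80% / 100% overlap
print(reciprocal_overlap((1000, 2000), (1500, 9000), 0.5))   # False: only ~7% of the subject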
x index), which we use to map back to x + contains = tapply(ens[subjectHits(hits)]$name[contains], queryHits(hits)[contains], paste, collapse=',') + x$contains[as.numeric(names(contains))] = contains + + disrupt.r = tapply(ens[subjectHits(hits)]$name[disrupt.r], queryHits(hits)[disrupt.r], paste, collapse=',') + x$disrupt.r[as.numeric(names(disrupt.r))] = disrupt.r + + disrupt.l = tapply(ens[subjectHits(hits)]$name[disrupt.l], queryHits(hits)[disrupt.l], paste, collapse=',') + x$disrupt.l[as.numeric(names(disrupt.l))] = disrupt.l + + intronic = tapply(ens[subjectHits(hits)]$name[intronic], queryHits(hits)[intronic], paste, collapse=',') + x$intronic[as.numeric(names(intronic))] = intronic + + + ## Add intergenic and closest gene + x$intergenic[setdiff(1:length(x), queryHits(hits))] = 'yes' + + + ## Add closest gene, subject to distance cutoff + dist.to.nearest = GenomicRanges::distanceToNearest(x=x, subject=ens) + dist.to.nearest = dist.to.nearest[mcols(dist.to.nearest)$distance <= closest.max.distance] + x$closest[queryHits(dist.to.nearest)] = ens$name[subjectHits(dist.to.nearest)] + + return(x) + +} + +## Collect arguments +option_list = list( + make_option(c("-c", "--cnv"), type='character', help="Input CNV calles"), + make_option(c("-a", "--caller"), type='character', help="Name of tool used to call CNVs in --cnv (only bicseq2 is currently supported)"), + make_option(c("-t", "--tumor"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"), + make_option(c("-n", "--normal"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"), + make_option(c("-b", "--cytoband"), type='character', help="Cytoband file: headerless tab-delimited files with chr, start, end, cytoband, stain"), + make_option(c("-d", "--db_names"), type='character', help="Comma-delimited list of database names corresponding to the order in --db_files"), + make_option(c("-s", "--db_files"), type='character', help="Comma-delimited list of database files corresponding to the order in --db_names"), + make_option(c("-e", "--ensembl"), type='character', help="Ensembl gene list"), + make_option(c("-l", "--allowed_chr"), type='character', help="Comma-delimited list of chromosomes to keep"), + make_option(c("-g", "--cancer_census"), type='character', help="Cancer census gene list"), + make_option(c("-f", "--overlap_fraction"), type='numeric', help="Fraction that database hits must overlap query interval"), + make_option(c("-o", "--out_file_main"), type='character', help="Main output BED"), + make_option(c("-p", "--out_file_supplemental"), type='character', help="Supplemental output BED")) +opt = parse_args(OptionParser(option_list=option_list)) + + +## Unpack arguments +opt$db_names = unlist(strsplit(opt$db_names, ',', fixed=T)) +opt$db_files = unlist(strsplit(opt$db_files, ',', fixed=T)) +opt$allowed_chr = unlist(strsplit(opt$allowed_chr, ',', fixed=T)) + + +## Read files +cnv = readCNV(opt$cnv, chr=opt$allowed_chr) +cyto = readCytoband(opt$cytoband) +cgc = readCancerCensus(opt$cancer_census) +ensembl = readEnsembl(opt$ensembl) + +## Add cytoband annotation +cnv = annotateCytoband(cnv=cnv, cytoband=cyto) + +## Add tumor-normal id, caller info +cnv$`tumor--normal` = paste0(opt$tumor,'--',opt$normal) +cnv$tool = opt$caller + + +## Annotate focal/large-scale +cnv$focal = ifelse(width(cnv) < LARGESCALE_MIN, 'yes', 'no') + + +## Annotate dup/del/neu +cnv$type = 'NEU' +cnv$type[cnv$log2 > DUP_LOG2] = 'DUP' +cnv$type[cnv$log2 < 
DEL_LOG2] = 'DEL' + + +## Annotate with databases +cnv$db = '' +for (i in 1:length(opt$db_names)) { + + db.name = opt$db_names[i] + db.file = opt$db_files[i] + + print(db.name) + + db = readDB(db.file) + cnv = annotateDB(x=cnv, db=db, name=db.name, overlap=opt$overlap_fraction) + +} + +## Annotate with CGC genes +cnv = cnv %$% cgc +cnv$cgc = gsub(' ', '', cnv$cgc) + +## Annotate with Ensembl genes +cnv = annotateEnsembl(x=cnv, ens=ensembl) + +## Subtract 1 from the output start to adhere to BED standard +start(cnv) = start(cnv) - 1 + +## Rename chr, convert to data frame +cnv = as.data.frame(cnv) +cnv$`#chr` = cnv$seqnames + +## Build info field +cnv$info = paste0('known=',cnv$db, ';Cancer_census=',cnv$cgc, ';DisruptL=',cnv$disrupt.l, ';DisruptR=', cnv$disrupt.r) +cnv$info[cnv$intergenic == 'yes'] = paste0(cnv$info[cnv$intergenic == 'yes'], ';Intergenic') +cnv$info[cnv$intergenic == 'yes'] = paste0(cnv$info[cnv$intergenic == 'yes'], ';Closest=', cnv$closest[cnv$intergenic == 'yes']) + +## Fields included in main/supplemental are slightly different +for (i in c('main', 'supplemental')) { + + cnv.i = cnv[, c('#chr', 'start', 'end', 'type', 'log2', 'tool', 'tumor..normal', 'info', 'focal', 'cytoband')] + colnames(cnv.i) = gsub('..', '--', colnames(cnv.i), fixed=T) + outfile = ifelse(i == 'main', opt$out_file_main, opt$out_file_supplemental) + + if (i=='supplemental') { + + cnv.i$info = paste0(cnv$info,';Contained=',cnv$contains) + + } + + write.table(cnv.i, outfile, row.names=F, col.names=T, sep='\t', quote=F) + +} diff --git a/bin/pta/annotate_id.py b/bin/pta/annotate_id.py new file mode 100644 index 00000000..3eac8237 --- /dev/null +++ b/bin/pta/annotate_id.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +# USAGE: python annotate_id.py VCF VCF_OUT +# DESCRIPTION: Annotates files by adding information about the +# CosmicID to the ID field. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + + +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def get_csq_columns(bcf_in): + ''' + get column names from the bar + separated CSQ VEP annotation + results. CSQ are Consequence + annotations from Ensembl VEP. + ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. 
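+    Returns a dict keyed by ALT-allele index; each value maps the CSQ
+    column names parsed from the VCF header to that allele's
+    pipe-delimited VEP annotation values.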
+ ''' + alt_count = len(record.alts) + csq_dicts = {} + for i in range(alt_count): + try: + csq_line = record.info['CSQ'][i] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[i] + csq_values = csq_line.split('|') + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def get_ID(record, csq_columns): + ''' + Get new ID results is a Cosmic Coding or non-coding result is available. + ''' + csq_dicts = get_csqs(record, csq_columns) + alt_count = len(record.alts) + coding_ids = [] + noncoding_ids = [] + for i in range(alt_count): + if 'CosmicCoding' in csq_dicts[i]: + coding_ids += [csq_dicts[i]['CosmicCoding'].replace('&', ';')] + if 'CosmicNonCoding' in csq_dicts[i]: + noncoding_ids += [csq_dicts[i]['CosmicNonCoding'].replace('&', ';')] + ids = ';'.join([id for id in coding_ids + noncoding_ids if not id == '']) + return ids + + +def fix_gt(gt): + ''' + change GT 0/0/0/1/0, 0/0/0/1, 0/0/1, etc to 0/1 + ''' + if len(gt.split('/')) > 2: + gt = '0/1' + return gt + + +def modify_record(record, csq_columns): + ''' + Add new ID field to records as needed + ''' + # 'CosmicCoding', 'CosmicNonCoding' + gts = [key for key in record.samples[0].keys() if key.endswith('_GT')] + for key in gts: + gt = record.samples[0][key] + record.samples[0][key] = fix_gt(gt) + gt = record.samples[1][key] + record.samples[1][key] = fix_gt(gt) + ids = get_ID(record, csq_columns) + if ids: + if record.id: + record.id = record.id + ';' + ids + else: + record.id = ids + return record + +def write_vcf(bcf_in, vcf_out_file, csq_columns): + ''' + Write out the VCF + ''' + bcf_out = pysam.VariantFile(vcf_out_file, 'w', header=bcf_in.header) + for record in bcf_in.fetch(): + record = modify_record(record, csq_columns) + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + + +def main(): + ''' + Annotates files by adding information about the + Cosmic coding and non-coding entries to the ID column. + Also changes GT 0/0/0/1/0, 0/0/0/1, 0/0/1, etc to 0/1. 
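+    Usage: python annotate_id.py <input.vcf> <output.vcf>
+    For example, fix_gt('0/0/0/1') is rewritten to '0/1', while a
+    diploid '0/1' is left unchanged.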
+ ''' + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + assert os.path.isfile(vcf_file), 'Failed to find caller VCF call file :' + vcf_file + bcf_in = read_vcf(vcf_file) + csq_columns = get_csq_columns(bcf_in) + write_vcf(bcf_in, vcf_out_file, csq_columns) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/bicseq2_config_writer.py b/bin/pta/bicseq2_config_writer.py new file mode 100644 index 00000000..56d7fd25 --- /dev/null +++ b/bin/pta/bicseq2_config_writer.py @@ -0,0 +1,109 @@ +import pandas as pd +import os +import argparse +from os import listdir +from os.path import isfile, join +import glob + +class Bicseq2Prep(): + def __init__(self, sample_id, + fa_files, + out_file, + mappability_directory, + norm_bicseq2_config, + temp_seqs): + self.out_file = out_file + self.sample_id = sample_id + self.norm_bicseq2_config = norm_bicseq2_config + self.mappability_directory = mappability_directory + self.fa_files = fa_files + self.temp_seqs = temp_seqs + self.write_sample_configs() + + def match_fa_file(self, row): + for fa_file in self.fa_files: + if fa_file.endswith('_' + str(row.chrom_name) + '.fasta'): + return fa_file + + def match_mappability_file(self, row): + mappability_files = (glob.glob(self.mappability_directory+"/*")) + # from the directory provided, find all files. + for mappability_file in mappability_files: + if os.path.basename(mappability_file) == str(row.chrom_name) + '.uniq.txt': + return mappability_file + + def match_seq_file(self, row): + for temp_seq in self.temp_seqs: + if os.path.basename(temp_seq) == str(self.sample_id) + '_' + str(row.chrom_name) + '.seq': + return temp_seq + + def match_norm_file(self, row): + for temp_seq in self.temp_seqs: + if os.path.splitext(os.path.basename(temp_seq))[0] + '.norm.bin.txt' == str(self.sample_id) + '_' + str(row.chrom_name) + '.norm.bin.txt': + return os.path.splitext(os.path.basename(temp_seq))[0] + '.norm.bin.txt' + # Modified to programtically set this output. + + def prep(self): + '''initial file should start with one column named chrom_name ''' + data = pd.read_csv(self.norm_bicseq2_config, sep='\t') + assert ''.join(data.columns.tolist()) == 'chrom_name', 'Error: initial config file should start with one column named chrom_name' + data['fa_file'] = data.apply(lambda row: self.match_fa_file(row), axis=1) + data['mappability'] = data.apply(lambda row: self.match_mappability_file(row), axis=1) + data['readPosFile'] = data.apply(lambda row: self.match_seq_file(row), axis=1) + data['bin_file_normalized'] = data.apply(lambda row: self.match_norm_file(row), axis=1) + return data + + def write_sample_configs(self): + '''write configs for the normalization step''' + # tumor + data = self.prep() + config = data.to_csv(self.out_file, sep='\t', index=False) + + +def get_args(): + '''Parse input flags + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--fa-files', + help='List of chrom fasta files. ', + required=True, + nargs='*' + ) + parser.add_argument('--mappability-directory', + help='Directory containing mappability files. 
', + required=True + ) + parser.add_argument('--temp-seqs', + help='List of file paths ${sample_id}_${chr}.seq ' + '(readPosFile files output from samtools getUnique) ', + required=True, + nargs='*' + ) + parser.add_argument('--norm-bicseq2-config', + help='Pre filled file for ${sample_id}.bicseq2.config. ' + 'Fasta-specific but sample-independent portion of config file.', + required=True + ) + parser.add_argument('--out-file', + help='Output config filename', + required=True + ) + parser.add_argument('--sample-id', + help='sample id', + required=True + ) + args_namespace = parser.parse_args() + return args_namespace.__dict__ + +def main(): + args = get_args() + bicseq = Bicseq2Prep(sample_id=args['sample_id'], + fa_files=args['fa_files'], + out_file=args['out_file'], + mappability_directory=args['mappability_directory'], + norm_bicseq2_config=args['norm_bicseq2_config'], + temp_seqs=args['temp_seqs']) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/pta/bicseq2_seg_config_writer.py b/bin/pta/bicseq2_seg_config_writer.py new file mode 100644 index 00000000..787c39da --- /dev/null +++ b/bin/pta/bicseq2_seg_config_writer.py @@ -0,0 +1,88 @@ +import pandas as pd +import os +import argparse + + +class Bicseq2Prep(): + def __init__(self, pair_id, + out_file, + seg_bicseq2_config, + tumor_norms, + normal_norms): + self.out_file = out_file + self.pair_id = pair_id + self.seg_bicseq2_config = seg_bicseq2_config + self.tumor_norms = tumor_norms + self.normal_norms = normal_norms + self.write_sample_configs() + + def match_file(self, row, files): + for temp_norm in files: + if os.path.basename(temp_norm).endswith('_' + str(row.chrom_name) + '.norm.bin.txt'): + return temp_norm + + def prep_pair(self): + ''' file: tumor--normal.bicseq2.seg.config + prep fasta-specific but sample independent portion of config file''' + data = pd.read_csv(self.seg_bicseq2_config, sep='\t') + # assert ''.join(data.columns.tolist()) == 'chr', 'Error: initial config file should start with one column named chr' + assert ''.join(data.columns.tolist()) == 'chrom_name', 'Error: initial config file should start with one column named chrom_name' + + data['case'] = data.apply(lambda row: self.match_file(row, + files=self.tumor_norms), axis=1) + data['control'] = data.apply(lambda row: self.match_file(row, + files=self.normal_norms), axis=1) + return data + + + def write_sample_configs(self): + '''create and upload configs for the normalization step''' + # tumor + data = self.prep_pair() + config = data.to_csv(self.out_file, sep='\t', index=False) + + +def get_args(): + '''Parse input flags + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--tumor-norms', + help='List of file paths ${tumor}_${chrom_name}.norm.bin.txt ' + ' (Output from Bicseq2Norm ).', + required=True, + nargs='*' + ) + parser.add_argument('--normal-norms', + help='List of file paths ${normal}_${chrom_name}.norm.bin.txt ' + ' (Output from Bicseq2Norm ).', + required=True, + nargs='*' + ) + parser.add_argument('--seg-bicseq2-config', + help='Pre filled file for ${pair_id}.bicseq2.seg.config. 
' + 'Fasta-specific but sample-independent portion of config file.', + required=True + ) + parser.add_argument('--out-file', + help='Output config filename', + required=True + ) + parser.add_argument('--pair-id', + help='pair id', + required=True + ) + args_namespace = parser.parse_args() + return args_namespace.__dict__ + + +def main(): + args = get_args() + bicseq = Bicseq2Prep(pair_id=args['pair_id'], + out_file=args['out_file'], + seg_bicseq2_config=args['seg_bicseq2_config'], + tumor_norms=args['tumor_norms'], + normal_norms=args['normal_norms']) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/pta/bicseq2_seg_config_writer_unpaired.py b/bin/pta/bicseq2_seg_config_writer_unpaired.py new file mode 100644 index 00000000..71c3bc28 --- /dev/null +++ b/bin/pta/bicseq2_seg_config_writer_unpaired.py @@ -0,0 +1,77 @@ +import pandas as pd +import os +import argparse + + +class Bicseq2Prep(): + def __init__(self, pair_id, + out_file, + seg_bicseq2_config, + tumor_norms): + self.out_file = out_file + self.pair_id = pair_id + self.seg_bicseq2_config = seg_bicseq2_config + self.tumor_norms = tumor_norms + self.write_sample_configs() + + def match_file(self, row, files): + for temp_norm in files: + if os.path.basename(temp_norm).endswith('_' + str(row.chrom_name) + '.norm.bin.txt'): + return temp_norm + + def prep_pair(self): + ''' file: tumor.bicseq2.seg.config + prep fasta-specific but sample independent portion of config file''' + data = pd.read_csv(self.seg_bicseq2_config, sep='\t') + # assert ''.join(data.columns.tolist()) == 'chr', 'Error: initial config file should start with one column named chr' + assert ''.join(data.columns.tolist()) == 'chrom_name', 'Error: initial config file should start with one column named chrom_name' + + data['case'] = data.apply(lambda row: self.match_file(row, + files=self.tumor_norms), axis=1) + return data + + + def write_sample_configs(self): + '''create and upload configs for the normalization step''' + # tumor + data = self.prep_pair() + config = data.to_csv(self.out_file, sep='\t', index=False) + + +def get_args(): + '''Parse input flags + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--tumor-norms', + help='List of file paths ${tumor}_${chrom_name}.norm.bin.txt ' + ' (Output from Bicseq2Norm ).', + required=True, + nargs='*' + ) + parser.add_argument('--seg-bicseq2-config', + help='Pre filled file for ${pair_id}.bicseq2.seg.config. 
' + 'Fasta-specific but sample-independent portion of config file.', + required=True + ) + parser.add_argument('--out-file', + help='Output config filename', + required=True + ) + parser.add_argument('--pair-id', + help='pair id', + required=True + ) + args_namespace = parser.parse_args() + return args_namespace.__dict__ + + +def main(): + args = get_args() + bicseq = Bicseq2Prep(pair_id=args['pair_id'], + out_file=args['out_file'], + seg_bicseq2_config=args['seg_bicseq2_config'], + tumor_norms=args['tumor_norms']) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/pta/filter-bedpe.r b/bin/pta/filter-bedpe.r new file mode 100644 index 00000000..14ab7bef --- /dev/null +++ b/bin/pta/filter-bedpe.r @@ -0,0 +1,82 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Filter a bedpe for somatic variants (i.e., not in specified germline databases), and +## high-confidence variants (2+ callers or 1 caller with a nearby copy number changepoint) +libs = c('optparse', 'GenomicRanges') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + + +## Check if databases db are in info string x +inDatabase = function(x, db) { + + ## Split info field, look for database entry + x = unlist(strsplit(x, ';', fixed=T)) + x = grep('known=', x, fixed=T, value=T) + x = gsub('known=', '', x, fixed=T) + x = unlist(strsplit(x, ',', fixed=T)) + + return(any(x %in% db)) + +} + + + +makeGRangesFromChangepoint = function(x) { + + x = unlist(strsplit(x, ':|-')) + GRanges(seqnames=x[1], ranges=IRanges(as.numeric(x[2:3]), as.numeric(x[2:3]))) + +} + + + +## Is variant x a high-confidence variant? +## Meant to be used with apply(,2,) +isHighConfidence = function(x, cpmax) { + + ## Is there support from multiple callers? + multi.caller = grepl(',', x['tools']) + + ## Is either breakpoint close enough to its nearest changepoint? 
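+    ## The changepoint is supplied as a 'chr:start-end' string and parsed into two
+    ## single-base ranges; the breakpoint is rescued into the high-confidence set
+    ## when its distance to either position is at most cpmax (--max_changepoint_distance).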
+ x1.gr = GRanges(seqnames=x['#chr1'], ranges=IRanges(as.numeric(x['start1']), as.numeric(x['end1']))) + ch1.gr = makeGRangesFromChangepoint(x['cnv_changepoint_1']) + near.ch1 = any(GenomicRanges::distance(x1.gr, ch1.gr) <= cpmax) + + x2.gr = GRanges(seqnames=x['chr2'], ranges=IRanges(as.numeric(x['start2']), as.numeric(x['end2']))) + ch2.gr = makeGRangesFromChangepoint(x['cnv_changepoint_2']) + near.ch2 = any(GenomicRanges::distance(x2.gr, ch2.gr) <= cpmax) + + return(multi.caller || near.ch1 || near.ch2) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-b", "--bedpe"), type='character', help="Input BEDPE"), + make_option(c("-m", "--max_changepoint_distance"), type='numeric', help="Maximum distance a changepoint can be from a breakpoint to 'rescue' it into the high-confidence set"), + make_option(c("-f", "--filter_databases"), type='character', help="Comma-separated list of databases to filter, looking in the info field"), + make_option(c("-s", "--out_file_somatic"), type='character', help="Output somatic BEDPE"), + make_option(c("-o", "--out_file_highconf"), type='character', help="Output high-confidence BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + +## Unpack arguments +opt$filter_databases = unlist(strsplit(opt$filter_databases, ',', fixed=T)) + + +## Read bedpe, filter for known germline variants +x = read.csv(opt$bedpe, h=T, stringsAsFactors=F, sep='\t', check.names=F) +x = x[!sapply(x$info, inDatabase, opt$filter_databases), ] + +## Write out somatic variants +write.table(x, opt$out_file_somatic, row.names=F, col.names=T, sep='\t', quote=F) + +## Filter for high confidence +x = x[apply(x, 1, isHighConfidence, opt$max_changepoint_distance), ] + +## Write result +write.table(x, opt$out_file_highconf, row.names=F, col.names=T, sep='\t', quote=F) diff --git a/bin/pta/filter_bam b/bin/pta/filter_bam new file mode 100755 index 00000000..e5252cdc Binary files /dev/null and b/bin/pta/filter_bam differ diff --git a/bin/pta/filter_pon.py b/bin/pta/filter_pon.py new file mode 100644 index 00000000..a2d8e1db --- /dev/null +++ b/bin/pta/filter_pon.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +# USAGE: python filter_pon.py +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pandas as pd +import pysam +import argparse +########################################################################## +############## Custom functions ############ +########################################################################## + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def read_bed(bed_file, chrom): + ''' + Read in BED file. Require TotalUniqueSamples be integers. 
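+    Expects a tab-delimited header with at least '#CHROM' and
+    'TotalUniqueSamples' columns (an 'END' column is used downstream);
+    rows may optionally be restricted to a single chromosome.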
+ ''' + bed_in = pd.read_csv(bed_file, sep = '\t', dtype={'#CHROM': str}) # auto-detect input format + assert 'TotalUniqueSamples' in list(bed_in.columns), 'BED file missing TotalUniqueSamples column' + if chrom: + bed_in = bed_in[(bed_in['#CHROM'] == chrom)].copy() + bed_in['TotalUniqueSamples'] = bed_in['TotalUniqueSamples'].astype('int64') + return bed_in + + +def compare_num(observed, rule, value): + ''' + Compare two numbers. Available rules are lt, gt and eq + ''' + observed = int(observed) + if rule == 'lt': + if observed < value: + return False + elif rule == 'gt': + if observed > value: + return False + elif rule == 'eq': + if observed == value: + return False + return True + + +def compare_str(observed, rule, value): + ''' + Compare two strings to see that they are the same 'eq' + or are not the same 'ne' + ''' + if rule == 'eq': + if value == observed: + return False + elif rule == 'ne': + if value != observed: + return False + return True + + +def test_rules(filter_func, rule): + ''' + Test that rule matches combination. + ''' + combos = {compare_num : ['lt', 'gt', 'eq'], + compare_str : ['eq', 'ne']} + assert rule in combos[filter_func], 'rule not is possible rules for given function. Possible rules: ' + str(combos[filter_func]) + ' Rule : ' + rule + + +def custom(row, filter_func, key, value, rule): + ''' + Run custom filter for bed file row + ''' + observed = row[key] + return filter_func(observed, rule, value) + + +def filter_bed(bed_in, + filter_func, key, + value, rule): + ''' + Filter based on column and rule. + ''' + bed_in['fail'] = bed_in.apply(lambda row: custom(row, filter_func, key, value, rule), axis=1) + bed_in_filtered = bed_in[(bed_in.fail == False)].copy() + return bed_in_filtered + + +def get_bad_pos(bed_in_filtered): + ''' + Grab filtered position info + ''' + bed_in_filtered['key'] = bed_in_filtered.apply(lambda row: row['#CHROM'] + '{' + str(row.END), axis=1) + bad_pos = set(bed_in_filtered.key) + return bad_pos + + +def add_filter_header(bcf_out, + id, + description): + ''' + Add new FILTER field + ''' + bcf_out.header.filters.add(id=id, + number=None, + type=None, + description=description) + return bcf_out + + +def compose(bcf_in, bad_pos): + ''' + Filter based on the PON bad positions. + ''' + passing = False + for record in bcf_in.fetch(): + key = record.chrom + '{' + str(record.pos) + if key in bad_pos: + filters = record.filter.keys() + if len(filters) == 1 and filters[0] in ['PASS', 'SUPPORT']: + record.filter.clear() + record.filter.add('PON') + yield record + + +def write_file(bcf_out, record): + ''' + Write to a VCF. + ''' + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + return bcf_out + + +def main(): + ''' + Filter VCF for start positions which match + ''' + # ========================== + # Input variables + # ========================== + parser = argparse.ArgumentParser( + description='DESCRIPTION: Filters a VCF file \ + if a start position of a VCF matches the \ + start + 1 position of the bed file. Command-line \ + options that may be omitted (i.e. 
are NOT \ + required) are shown in square brackets.') + # Documentation parameters + parser.add_argument('-v', '--vcf', + dest='vcf_file', + help='VCF file', + required=True) + parser.add_argument('-o', '--out', + dest='out', + help='Output VCF file', + required=True) + parser.add_argument('-b', '--bed', + dest='bed_file', + help='Input BED file to use to filter', + required=True) + parser.add_argument('-d', '--default', + dest='default', + help='Default filter for TotalUniqueSamples. Result \ + must be greater than this value [1]', + default='1') + parser.add_argument('-f', '--filter', + dest='filter_func', + choices=['str', + 'num'], + nargs='+', + help='Filter type(s)', + default=[False]) + parser.add_argument('-r', '--rule', + dest='rule', + nargs='+', + help='Rule(s) to filter based on value. Acceptable options \ + are "lt", "gt", "eq" for "num" and \ + "eq" and "ne" for "str"', + default=[False]) + parser.add_argument('-val', '--value', + dest='value', + nargs='+', + help='Value(s) used to compare to custom filter', + default=[False]) + parser.add_argument('-c', '--chrom', + dest='chrom', + help='Chrom used for filtering', + default=False) + args = parser.parse_args() + assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file + assert os.path.isfile(args.bed_file), 'Failed to find BED file ' + args.bed_file + if args.filter_func[0]: + assert args.key[0], 'key is required for custom filter' + assert args.value[0], 'value is required for custom filter' + assert args.rule[0], 'rule is required for custom filter' + # ========================== + # Filter PON + # ========================== + functions = {'num' : compare_num, + 'str' : compare_str} + # default filter + bed_file = args.bed_file + filter_func = functions['num'] + key = 'TotalUniqueSamples' + rule = 'gt' + value = int(args.default) + + bed_in = read_bed(bed_file, args.chrom) + test_rules(filter_func, rule) + bed_in_filtered = filter_bed(bed_in, filter_func, key, value, rule) + # optional_filters + if args.filter_func[0]: + for i in range(len(args.filter_func)): + filter_func = functions[args.filter_func[i]] + key = args.key[i] + rule = args.rule[i] + value = args.value[i] + test_rules(filter_func, rule) + bed_in_filtered = filter_bed(bed_in_filtered, filter_func, key, value, rule) + bad_pos = get_bad_pos(bed_in_filtered) + # ========================== + # Filter with PON + # ========================== + bcf_in = read_vcf(args.vcf_file) + bcf_in = add_filter_header(bcf_out=bcf_in, + id='PON', + description='Variant in panel of normal database') + bcf_out = pysam.VariantFile(args.out, 'w', header=bcf_in.header) + for record in compose(bcf_in, bad_pos): + bcf_out = write_file(bcf_out, record) + + + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/filter_vcf.py b/bin/pta/filter_vcf.py new file mode 100644 index 00000000..6800ef4b --- /dev/null +++ b/bin/pta/filter_vcf.py @@ -0,0 +1,88 @@ +import pysam +import numpy as np +import argparse +import logging as log +import re + +chrom_pattern = re.compile('[\[\]](.*):') + +class FilterNonChroms(): + def __init__(self, vcf_file, + out_file, + chroms): + ''' + Requires + NYGC column headers: + #chr start end type log2 tool pair_id info focal cytoband 
+ Without nygc columns the step returns an empty table + ''' + self.chroms = chroms + self.bcf_in = self.read_vcf(vcf_file) + self.bcf_out = self.start_vcf(out_file) + self.filter_vcf() + + def read_vcf(self, vcf_file): + bcf_in = pysam.VariantFile(vcf_file) + return bcf_in + + def start_vcf(self, out_file): + bcf_out = pysam.VariantFile(out_file, 'w', header=self.bcf_in.header) + return bcf_out + + def filter_non_chroms(self, record): + ''' + Filter calls to leave calls that are only on chroms. + ''' + if record.contig in self.chroms: + for alt in record.alts: + result = re.search(chrom_pattern, alt) + if result and result[1] not in self.chroms: + return True + else: + return True + return False + + def filter_vcf(self): + ''' + Only print calls that have: + 1) ref in chroms + 2) alt in chroms + ''' + for record in self.bcf_in.fetch(): + if not self.filter_non_chroms(record): + exit_status = self.bcf_out.write(record) + if exit_status != 0: + print(exit_status) + + +def get_args(): + '''Parse input flags + ''' + parser = argparse.ArgumentParser() + parser.add_argument('--vcf-file', + help='SV VCF file', + required=True + ) + parser.add_argument('--output', + help='Output VCF file', + required=True + ) + parser.add_argument('--chroms', + help='A space separated list of chroms to plot.', + required=False, + nargs='*', + default=False + ) + args_namespace = parser.parse_args() + return args_namespace.__dict__ + + +def main(): + args = get_args() + FilterNonChroms(args['vcf_file'], + args['output'], + chroms=args['chroms']) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/pta/get_candidates.py b/bin/pta/get_candidates.py new file mode 100644 index 00000000..62c4b243 --- /dev/null +++ b/bin/pta/get_candidates.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# USAGE: python get_candidates.py VCF_FILE OUT_FILE +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def filter_call(record): + ''' + Filter calls to leave calls that may be + supported by a second Lancet run. 
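+    Assumes the input VCF carries the 'called_by', 'num_callers' and
+    'supported_by' INFO fields from the caller merge; a record is kept
+    as a candidate only when it was called by a single non-Lancet
+    caller and has no supporting caller.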
+ ''' + if 'called_by' in record.info.keys() and \ + not 'lancet' in record.info['called_by'] and \ + record.info['num_callers'] == 1: + if 'supported_by' in record.info.keys(): + if record.info['supported_by']: + return True + return False + return True + + +def filter_vcf(bcf_in, bcf_out): + ''' + Only print calls that are: + 1) Not already called by Lancet + 2) Only supported by on caller + 3) Not Supported by a support caller + ''' + for record in bcf_in.fetch(): + if not filter_call(record): + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + + +def main(): + ''' + Only print calls that are: + 1) Not already called by Lancet + 2) Only supported by on caller #MWL NOTE: on caller = one caller? + 3) Not Supported by a support caller + ''' + vcf_file = sys.argv[1] + out_file = sys.argv[2] + assert os.path.isfile(vcf_file), 'Failed to find somatic VCF call file :' + vcf_file + bcf_in = read_vcf(vcf_file) + bcf_out = pysam.VariantFile(out_file, 'w', header=bcf_in.header) + filter_vcf(bcf_in, bcf_out) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/make_maf.py b/bin/pta/make_maf.py new file mode 100644 index 00000000..a1bde93f --- /dev/null +++ b/bin/pta/make_maf.py @@ -0,0 +1,490 @@ +#!/usr/bin/env python +# USAGE: python make_maf.py VCF MAF LIBRARY GENOME +# DESCRIPTION: Makes MAF file from VCF file. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.2 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + +import sys +import os +import re +import logging as log +import pysam +import mygene +import argparse +import pandas as pd +########################################################################## +############## Custom functions ############ +########################################################################## + + +def ensembl_gene_id_entrez_id(ensembl_gene_id, mg): + ''' + Returns entrez id from ensemble. + False is used for regions that do not + correspond to a gene region or Ensembl ID + ''' + entrez_id = 0 + if ensembl_gene_id != '': + results = mg.query(ensembl_gene_id) + try: + entrez_id = str(results['hits'][0]['entrezgene']) + except (KeyError, IndexError): + pass +# sys.stderr.write('WARNING: entrezgene not found for ' + str(ensembl_gene_id) + ' in ' + str(results) + '\n') + return entrez_id + + +def ensembl_gene_entrez_local(ensembl_gene_id, data): + ''' + Returns entrez id from ensemble. 
+ False is used for regions that do not + correspond to a gene region or Ensembl ID + ''' + try: + entrez_id = data[(data['Gene stable ID'] == ensembl_gene_id)]['NCBI gene (formerly Entrezgene) ID'].values[0] + except IndexError: + entrez_id = 0 + if str(entrez_id) == 'nan': + entrez_id = 0 + return int(entrez_id) + +def get_csq_columns(bcf_in): + ''' + get column names from the bar + separated CSQ VEP annotation + results. CSQ are Consequence + annotations from Ensembl VEP. + ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. + ''' + alt_count = len(record.alts) + csq_values = [] + csq_dicts = {} + for i in range(alt_count): + try: + csq_line = record.info['CSQ'][i] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[i] + csq_values = csq_line.split('|') + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def group_mnv(record): + ''' + Convert inhouse MNV and SNV type to GDC-like calls. + This does not work for multiple alts. + Only the first alt will be considered because + type only takes one value. + ''' + if record.info['TYPE'] == 'SNV': + record.info['TYPE'] = 'SNP' + elif record.info['TYPE'] == 'MNV': + if len(record.ref) == 2 \ + and len(record.alts[0]) == 2: + record.info['TYPE'] = 'DNP' + elif len(record.ref) == 3 \ + and len(record.alts[0]) == 3: + record.info['TYPE'] = 'TNP' + elif len(record.ref) > 3 \ + and len(record.alts[0]) > 3 \ + and len(record.ref) == len(record.alts[0]): + record.info['TYPE'] = 'ONP' + return record + + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def get_dbsnp_rs(Existing_variation): + ''' + Remove Cosmic IDs and split by comma. + ''' + ids = Existing_variation.split('&') + good_ids = [id for id in ids if id.startswith('rs')] + return ','.join(good_ids) + + +def set_frame_shift(type): + ''' + Set the variant classification based on whether the variant is + an insertion or a deletion. + ''' + if type == 'DEL': + variant_classification = 'Frame_Shift_Del' + elif type == 'INS': + variant_classification = 'Frame_Shift_Ins' + else: + variant_classification = False + return variant_classification + + +def set_protein_altering(type, ref, alt): + ''' + Set the variant classification based on whether the variant is + an insertion or a deletion. 
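+    The call is in-frame (In_Frame_Del/In_Frame_Ins) when the ref/alt
+    length difference is a multiple of 3, and a frameshift
+    (Frame_Shift_Del/Frame_Shift_Ins) otherwise.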
+ ''' + inframe = False + if abs(len(ref) - len(alt)) % 3 == 0: + inframe = True + if inframe: + if type == 'DEL': + variant_classification = 'In_Frame_Del' + elif type == 'INS': + variant_classification = 'In_Frame_Ins' + else: + variant_classification = False + else: + if type == 'DEL': + variant_classification = 'Frame_Shift_Del' + elif type == 'INS': + variant_classification = 'Frame_Shift_Ins' + else: + variant_classification = False + return variant_classification + + +def shorten_AA(AAMutation): + ''' + Lengthen to the three letter AA code + ''' + AA_dict = {'Cys': 'C', 'Asp': 'D', 'Ser': 'S', 'Gln': 'Q', 'Lys': 'K', + 'Ile': 'I', 'Pro': 'P', 'Thr': 'T', 'Phe': 'F', 'Asn': 'N', + 'Gly': 'G', 'His': 'H', 'Leu': 'L', 'Arg': 'R', 'Trp': 'W', + 'Ala': 'A', 'Val':'V', 'Glu': 'E', 'Tyr': 'Y', 'Met': 'M', + 'Ter' : '*'} + short_mutation = [] + skip_until = -1 + for i, char in enumerate(AAMutation): + if i > skip_until: + if AAMutation[i:i + 3] in AA_dict: + short_mutation += AA_dict[AAMutation[i:i + 3]] + skip_until = i + 2 + else: + short_mutation += char + return ''.join(short_mutation) + + +def get_variant_classification(consequences, type, ref, alt): + ''' + Convert VEP consequences to MAF variant_classification. + ''' + consequences_to_class = {'intergenic_variant' : 'Silent', + 'upstream_gene_variant' : 'Silent', + '5_prime_UTR_variant' : 'Silent', + 'splice_acceptor_variant' : 'Splice_Site', + 'splice_donor_variant' : 'Splice_Site', + 'splice_region_variant' : 'Splice_Site', + 'missense_variant' : 'Missense_Mutation', + 'synonymous_variant' : 'Silent', + 'frameshift_variant' : set_frame_shift(type), + 'protein_altering_variant' : set_protein_altering(type, ref, alt), + 'inframe_insertion' : 'In_Frame_Ins', + 'inframe_deletion' : 'In_Frame_Del', + 'stop_gained' : 'Nonsense_Mutation', + 'stop_retained_variant' : 'Silent', + 'stop_lost' : 'Nonstop_Mutation', + 'intron_variant' : 'Silent', + '3_prime_UTR_variant' : 'Silent', + 'downstream_gene_variant' : 'Silent', + 'initiator_codon_variant' : 'Translation_Start_Site', + 'regulatory_region_variant' : 'Silent', + 'TF_binding_site_variant' : 'Silent', + 'mature_miRNA_variant' : 'RNA', + 'regulatory_region_ablation' : 'Silent', + 'regulatory_region_amplification' : 'Silent', + 'TFBS_ablation' : 'Silent', + 'TFBS_amplification' : 'Silent', + 'non_coding_transcript_variant' : 'Silent', + 'NMD_transcript_variant' : 'Silent', + 'incomplete_terminal_codon_variant' : 'Silent', + 'non_coding_transcript_exon_variant' : 'RNA', + 'transcript_ablation' : 'Splice_Site', + 'transcript_amplification' : 'Silent', + 'feature_elongation' : False, + 'feature_truncation' : False, + 'start_lost' : 'Translation_Start_Site', + 'start_retained_variant' : 'Silent', + 'coding_sequence_variant' : 'Missense_Mutation', + 'splice_polypyrimidine_tract_variant' : 'Splice_Site', + 'splice_donor_5th_base_variant' : 'Splice_Site', + 'splice_donor_region_variant' : 'Splice_Site' + } # https://useast.ensembl.org/info/genome/variation/prediction/predicted_data.html + return consequences_to_class[consequences] + +def get_HGVSp_Short(HGVSp_string, HGVSc_string, csq_term): + ''' + Convert HGVSp to HGVSp_Short: + derive HGVSp_Short from HGVSp. 
if Consequence is splice acceptor/donor variants, generate HGVSp_Short + ''' + aa_to_short = {'Ala': 'A', + 'Arg': 'R', + 'Asn': 'N', + 'Asp': 'D', + 'Asx': 'B', + 'Cys': 'C', + 'Glu': 'E', + 'Gln': 'Q', + 'Glx': 'Z', + 'Gly': 'G', + 'His': 'H', + 'Ile': 'I', + 'Leu': 'L', + 'Lys': 'K', + 'Met': 'M', + 'Phe': 'F', + 'Pro': 'P', + 'Ser': 'S', + 'Thr': 'T', + 'Trp': 'W', + 'Tyr': 'Y', + 'Val': 'V', + 'Xxx': 'X', + 'Xaa': 'X', + 'Ter': '*' + } + HGVSp_Short = '' + if csq_term == 'splice_acceptor_variant' \ + or csq_term == 'splice_donor_variant': + if len(HGVSc_string.split(':'))>1: + HGVSc_string = HGVSc_string.split(':')[1] + HGVSc_coding = re.findall('^c.(\d+)',HGVSc_string) + if len(HGVSc_coding) > 0: + input_pos = float(HGVSc_coding[0]) + if input_pos < 1: + input_pos = 1 + corrected_pos = (input_pos + input_pos % 3)/3 + HGVSp_Short = 'p.X' + str(int(corrected_pos)) + '_splice' + return HGVSp_Short + elif len(HGVSp_string) > 0: + HGVSp_Short = HGVSp_string.split(':')[1] + for item in aa_to_short.keys(): + HGVSp_Short = re.sub(item, aa_to_short[item], HGVSp_Short) + return HGVSp_Short + else: + return HGVSp_Short + +def make_row(record, csq_columns, bcf_in, library, + normal, tumor, VEP_version='GRCh38', + mg=False, + ensembl_entrez=False): + ''' + Fill in MAF row + ''' + # ======================= + # Get VEP annotation + # ======================= + csq_dicts = get_csqs(record, csq_columns) + cosmic_resistance_annotation = {} + if normal == bcf_in.header.samples[1]: + normal_index = 1 + tumor_index = 0 + elif tumor == bcf_in.header.samples[1]: + normal_index = 0 + tumor_index = 1 + for i, alt in enumerate(record.alts): + if csq_dicts[i]['SYMBOL_SOURCE'] == 'HGNC': + hugo = csq_dicts[i]['SYMBOL'] + else: + hugo = 'Unknown' + ensembl_gene_id = csq_dicts[i]['Gene'] + if mg: + entrez_id = ensembl_gene_id_entrez_id(ensembl_gene_id, mg) # default '0' + else: + entrez_id = ensembl_gene_entrez_local(ensembl_gene_id, ensembl_entrez) + center = 'NYGenome' + ncbi_build = VEP_version + chrom = record.chrom + if record.info['TYPE'] == 'DEL': + start = record.pos + 1 # skip anchor base + else: + start = record.pos + # get end position for 1-based inclusive coordinates + if record.info['TYPE'] == 'SNP': + end = record.pos + if record.info['TYPE'] == 'INS': + end = record.pos + 1 + elif record.info['TYPE'] == 'DEL': + end = (record.pos + 1) + len(record.ref) - len(alt) - 1 # add one to skip anchor + else: + end = record.pos + len(record.ref) - 1 + strand = '+' + record = group_mnv(record) + variant_type = record.info['TYPE'] + if record.info['TYPE'] == 'INS': + reference_allele = '-' + Tumor_Seq_Allele1 = '-' + Tumor_Seq_Allele2 = alt[1:] + elif record.info['TYPE'] == 'DEL': + reference_allele = record.ref[1:] + Tumor_Seq_Allele1 = record.ref[1:] + Tumor_Seq_Allele2 = '-' + else: + reference_allele = record.ref + Tumor_Seq_Allele1 = record.ref + Tumor_Seq_Allele2 = alt + dbSNP_RS = get_dbsnp_rs(csq_dicts[i]['Existing_variation']) + dbSNP_Val_Status = 'bySubmitter' + Tumor_Sample_Barcode = bcf_in.header.samples[tumor_index] + Matched_Norm_Sample_Barcode = bcf_in.header.samples[normal_index] + Match_Norm_Seq_Allele1 = '' + Match_Norm_Seq_Allele2 = '' + Tumor_Validation_Allele1 = '' + Tumor_Validation_Allele2 = '' + Match_Norm_Validation_Allele1 = '' + Match_Norm_Validation_Allele2 = '' + Verification_Status = 'Unknown' + Validation_Status = 'Untested' + Mutation_Status = 'Somatic' + Sequencing_Phase = 'Phase_I' + Sequence_Source = library + Validation_Method = 'none' + Score = '' + BAM_file= '' + Sequencer 
= 'Illumina' + if 'AD' in record.samples[bcf_in.header.samples[1]].keys() \ + and 'AD' in record.samples[bcf_in.header.samples[0]].keys(): + t_alt_count = record.samples[bcf_in.header.samples[tumor_index]]['AD'][1] + t_ref_count = record.samples[bcf_in.header.samples[tumor_index]]['AD'][0] + n_alt_count = record.samples[bcf_in.header.samples[normal_index]]['AD'][1] + n_ref_count = record.samples[bcf_in.header.samples[normal_index]]['AD'][0] + else: + t_alt_count = '' + t_ref_count = '' + n_alt_count = '' + n_ref_count = '' + HGVSc = csq_dicts[i]['HGVSc'] + HGVSp = csq_dicts[i]['HGVSp'] + SYMBOL_SOURCE = csq_dicts[i]['SYMBOL_SOURCE'] + SYMBOL = csq_dicts[i]['SYMBOL'] + IMPACT = csq_dicts[i]['IMPACT'] + return_line = [] + for csq_term in csq_dicts[i]['Consequence'].split('&'): + variant_classification = get_variant_classification(csq_term, + record.info['TYPE'], + record.ref, + alt) + HGVSp_Short = get_HGVSp_Short(HGVSp,HGVSc,csq_term) + line = [hugo, entrez_id, center, ncbi_build, chrom, start, end, strand, + variant_classification, variant_type, reference_allele, + Tumor_Seq_Allele1, Tumor_Seq_Allele2, dbSNP_RS, + dbSNP_Val_Status, Tumor_Sample_Barcode, + Matched_Norm_Sample_Barcode, Match_Norm_Seq_Allele1, + Match_Norm_Seq_Allele2, Tumor_Validation_Allele1, + Tumor_Validation_Allele2, + Match_Norm_Validation_Allele1, Match_Norm_Validation_Allele2, + Verification_Status, Validation_Status, Mutation_Status, + Sequencing_Phase, + Sequence_Source, Validation_Method, Score, BAM_file, + Sequencer, t_alt_count, t_ref_count, + n_alt_count, n_ref_count, HGVSc, HGVSp, HGVSp_Short, SYMBOL, SYMBOL_SOURCE, IMPACT] + joined_line = '\t'.join([str(x) for x in line]) + '\n' + joined_line = joined_line.replace('&', ',') + return_line.append(joined_line) + yield ''.join(set(return_line)) + + +def write_file(bcf_in, out, csq_columns, library, + normal, tumor, VEP_version, ensembl_entrez=False): + ''' + Write out the header + ''' + if ensembl_entrez: + data = pd.read_csv(ensembl_entrez) + mg = False + else: + data = False + mg = mygene.MyGeneInfo() + with open(out, 'w') as o: + header = ['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome', 'Start_Position', 'End_Position', 'Strand', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS', 'dbSNP_Val_Status', 'Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1', 'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2', 'Verification_Status', 'Validation_Status', 'Mutation_Status', 'Sequencing_Phase', 'Sequence_Source', 'Validation_Method', 'Score', 'BAM_file', 'Sequencer', 't_alt_count', 't_ref_count', 'n_alt_count', 'n_ref_count', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'SYMBOL', 'SYMBOL_SOURCE','IMPACT'] + o.write('\t'.join(header) + '\n') + for record in bcf_in.fetch(): + if record.info['HighConfidence']: + for alt_line in make_row(record, csq_columns, bcf_in, library, + normal, tumor, VEP_version=VEP_version, + mg=mg, + ensembl_entrez=data): + o.write(alt_line) + + + +def main(): + ''' + Script to make MAF file from VEP annotated VCF files + ''' + ###################################################################### + ############ Get commandline arguments ############ + ###################################################################### + parser = argparse.ArgumentParser( + description='DESCRIPTION: Takes in a VCF \ + file and returns a MAF. 
Command-line \ + options that may be omitted (i.e. are NOT \ + required) are shown in square brackets.') + # Documentation parameters + # Parameter options + parser.add_argument('-v', '--vcf', + dest='vcf_file', + help='Annotated VCF file') + parser.add_argument('-m', '--maf', + dest='maf', + help='MAF output file') + parser.add_argument('-l', '--library', + dest='library', + help='Sequence library type', + choices=['WGS', 'Exome']) + parser.add_argument('-vep', '--vep-version', + dest='VEP_version', + help='VEP genome version', + choices=['GRCh37', 'GRCh38']) + parser.add_argument('-t', '--tumor', + dest='tumor', + help='Tumor sample name') + parser.add_argument('-n', '--normal', + dest='normal', + help='Normal sample name') + parser.add_argument('-e', '--ensembl-entrez', + dest='ensembl_entrez', + default=False, + help='Map of ensembl ids to entrez ids') + args = parser.parse_args() + assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file + bcf_in = read_vcf(args.vcf_file) + csq_columns = get_csq_columns(bcf_in) + write_file(bcf_in, args.maf, csq_columns, args.library, + args.normal, args.tumor, args.VEP_version, + args.ensembl_entrez) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/make_main_vcf.py b/bin/pta/make_main_vcf.py new file mode 100644 index 00000000..d61d809f --- /dev/null +++ b/bin/pta/make_main_vcf.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +# USAGE: python cancer_gene_census.py cancer_gene_census.csv VCF VCF_OUT +# DESCRIPTION: Annotates files by adding information about the +# Cosmic Genome Census entry for the nearest gene. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.2 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + +import sys +import os +import logging as log +import pysam +import pprint +from collections import OrderedDict +########################################################################## +############## Custom functions ############ +########################################################################## + + +def remove_info(bcf_in, csq_columns): + ''' + Remove a INFO field from VCF. + ''' + for id in bcf_in.header.info.keys(): + if not id in ['HighConfidence','TYPE', 'called_by', 'num_callers', + 'supported_by', 'CSQ', 'CancerGeneCensus'] + csq_columns: + bcf_in.header.info.remove_header(id) + return bcf_in + + +def remove_format(bcf_in): + ''' + Remove a FORMAT field from VCF. + ''' + for id in bcf_in.header.formats.keys(): + if not id in ['AD','DP', 'AF']: + bcf_in.header.formats.remove_header(id) + return bcf_in + + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. 
+ ''' + alt_count = len(record.alts) + csq_dicts = {} + for i in range(alt_count): + try: + csq_line = record.info['CSQ'][i] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[i] + csq_values = csq_line.split('|') + assert len(csq_columns) == len(csq_values), 'failed because lengths do not match' + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def modify_record(record, csq_columns, good_fields): + ''' + Shorten CSQ fields + ''' + csq_dicts = get_csqs(record, csq_columns) + csq_out = '|'.join([csq_dicts[0][key] for key in good_fields]) + return csq_out + + +def check_build(bcf_in): + ''' + Check if genome is in a list of supprted non-human genomes. + ''' + VEP_line = [metadata.value for metadata in bcf_in.header.records if metadata.key == 'VEP'][0] + vep_info = {entry.split('=')[0] : entry.split('=')[-1] for entry in VEP_line.split(' ')} + if vep_info['assembly'] in ['"GRCm38.p6"']: + return False + else: + return True + + +class Variant(object): + + + def __init__(self, record, csq_out, human=True): + self.record = record + self.csq_out = csq_out + self.human = human + self.line = str(self.record).rstrip() + self.parts = self.line.split('\t') + # VCF columns + self.chrom = self.parts[0] + self.pos = self.parts[1] + self.id = self.parts[2] + self.ref = self.parts[3] + self.alts = self.parts[4].split(',') + self.qual = self.parts[5] + self.filters = self.parts[6].split(';') + self.info = self.parts[7].split(';') + self.format = self.parts[8].split(':') + self.samples = self.parts[9:] + # modify + self.good_format = ['AD','DP', 'AF'] + if self.human: + self.good_info = ['HighConfidence','TYPE', 'called_by', 'num_callers', + 'supported_by', 'CSQ', 'CancerGeneCensus'] + else: + self.good_info = ['HighConfidence','TYPE', 'called_by', 'num_callers', + 'supported_by', 'CSQ'] + self.info_dict = self.get_info() + self.samples[0] = self.fix_format(self.samples[0].split(':')) + self.samples[1] = self.fix_format(self.samples[1].split(':')) + self.format = [key for key in self.good_format if key in self.format] + + def get_info(self): + ''' + Get current info line and add prefix if needed + ''' + info_dict = OrderedDict() + for item in self.info: + if item.split('=')[0] in self.good_info: + if item.split('=')[0] == 'CSQ': + info_dict.update({'CSQ' : self.csq_out}) + elif '=' in item: + info_dict.update({item.split('=')[0] : item.split('=')[1]}) + else: + info_dict.update({item : None }) + return info_dict + + def fix_format(self, sample): + ''' + Reduce to good formats + ''' + format_dict = dict(zip(self.format, sample)) + new_format = [format_dict[key] for key in self.good_format if key in format_dict] + return ':'.join(new_format) + + def write(self): + if ':'.join(self.format) == '': + self.format = '.' + self.samples = ['.', '.'] + if ';'.join(self.filters) == 'PASS': + line = [self.chrom, + self.pos, + self.id, + self.ref, + ','.join(self.alts), + str(self.qual), + ';'.join(self.filters), + ';'.join(['='.join([x for x in [key, self.info_dict[key]] if x != None]) for key in self.info_dict]), + ':'.join(self.format)] + line += self.samples + self.new_line = '\t'.join(line) + return self.new_line + else: + return False + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. 
+ ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def write_vcf(bcf_in, vcf_out_file, csq_columns, human=True): + ''' + Write out the download + ''' + # CSQ to keep + if human: + good_fields = ['Gene', 'BIOTYPE', 'CLIN_SIG', 'Consequence', + 'CosmicCoding', 'CosmicCoding_AA', 'CosmicNonCoding', + 'Existing_variation', 'GnomadExomes_AF', 'GnomadGenomes_AF', + 'HGVSc', 'HGVSp', 'IMPACT', 'Polyphen2_HVAR_pred', + 'FATHMM_pred', 'fathmm-MKL_coding_pred', + 'SIFT4G_pred', 'SIFT_pred', 'SYMBOL', 'SYMBOL_SOURCE', 'AF_1000G'] + else: + good_fields = ['Gene', 'BIOTYPE', 'Consequence', + 'Existing_variation', 'HGVSc', 'HGVSp', 'IMPACT', + 'SIFT4G_pred', 'SIFT_pred', 'SYMBOL', 'SYMBOL_SOURCE'] + # Import the header after removal of extra metadata + header = str(bcf_in.header).rstrip() + csq_format = '|'.join(good_fields) + # Write new header with corrected CSQ and fewer metadata keys overall + with open(vcf_out_file, 'w') as vcf_out: + for line in header.split('\n'): + if 'ID=CSQ' in line: + line = '##INFO=' + vcf_out.write(line + '\n') + for record in bcf_in: + csq_out = modify_record(record, csq_columns, good_fields) + line = Variant(record, csq_out, human).write() + if line: + vcf_out.write(line + '\n') + + +def main(): + ''' + Reduce metadata in VCF for main VCF output + ''' + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + bcf_in = read_vcf(vcf_file) + human = check_build(bcf_in) + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + bcf_in = remove_format(bcf_in) + bcf_in = remove_info(bcf_in, csq_columns) + write_vcf(bcf_in, vcf_out_file, csq_columns, human=human) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/make_txt.py b/bin/pta/make_txt.py new file mode 100644 index 00000000..8d5b9200 --- /dev/null +++ b/bin/pta/make_txt.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# USAGE: python make_maf.py --vcf VCF --txt TXT -n NORMAL -t TUMOR +# DESCRIPTION: Makes MAF file from VCF file. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.2 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + +import sys +import os +import logging as log +import pysam +import argparse +########################################################################## +############## Custom functions ############ +########################################################################## + + +def get_csq_columns(bcf_in): + ''' + get column names from the bar + separated CSQ VEP annotation + results. CSQ are Consequence + annotations from Ensembl VEP. 
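+    For example (hypothetical header), a description ending in
+    'Format: Allele|Consequence|IMPACT|SYMBOL' yields
+    ['Allele', 'Consequence', 'IMPACT', 'SYMBOL'].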
+ ''' + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + return csq_columns + + +def get_csqs(record, csq_columns): + ''' + Get new INFO field results. + ''' + alt_count = len(record.alts) + csq_values = [] + csq_dicts = {} + for i in range(alt_count): + try: + csq_line = record.info['CSQ'][i] + except UnicodeDecodeError: # for names with accents and other unexpected characters (rare) + line = str(record) + csq_line = line.split('\t')[7].split('CSQ=')[1] + csq_line = csq_line.split(';')[0] + csq_line = csq_line.split(',')[i] + csq_values = csq_line.split('|') + csq_dict = dict(zip(csq_columns, csq_values)) + csq_dicts[i] = csq_dict + return csq_dicts + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def check_build(bcf_in): + ''' + Check if genome is in a list of supprted non-human genomes. + ''' + VEP_line = [metadata.value for metadata in bcf_in.header.records if metadata.key == 'VEP'][0] + vep_info = {entry.split('=')[0] : entry.split('=')[-1] for entry in VEP_line.split(' ')} + if vep_info['assembly'] in ['"GRCm38.p6"']: + return False + else: + return True + + +def make_row(record, csq_columns, bcf_in, + normal, tumor, human=True): + ''' + Fill in MAF row + ''' + # ======================= + # Get VEP annotation + # ======================= + csq_dicts = get_csqs(record, csq_columns) + if normal == bcf_in.header.samples[1]: + normal_index = 1 + tumor_index = 0 + elif tumor == bcf_in.header.samples[1]: + normal_index = 0 + tumor_index = 1 + else: + log.error('VCF sample names do not match listed tumor or normal name') + sys.exit(1) + id = record.id + if record.id == None: + id = '.' + for i, alt in enumerate(record.alts): + consequence = csq_dicts[i]['Consequence'] + impact = csq_dicts[i]['IMPACT'] + GENE_SYMBOL = csq_dicts[i]['SYMBOL'] + HGVSc = csq_dicts[i]['HGVSc'] + HGVSp = csq_dicts[i]['HGVSp'] + type = record.info['TYPE'] + if human: + PolyPhen = csq_dicts[i]['Polyphen2_HVAR_pred'] + AF_1000G = csq_dicts[i]['AF_1000G'] + GnomadExomes_AF = csq_dicts[i]['GnomadExomes_AF'] + GnomadGenomes_AF = csq_dicts[i]['GnomadGenomes_AF'] + CosmicCoding = csq_dicts[i]['CosmicCoding'] + CosmicCoding_AA = csq_dicts[i]['CosmicCoding_AA'] + CosmicNonCoding = csq_dicts[i]['CosmicNonCoding'] + fathmm = csq_dicts[i]['FATHMM_pred'] + fathmm_MKL_coding = csq_dicts[i]['fathmm-MKL_coding_pred'] + sift = csq_dicts[i]['SIFT_pred'] + sift_4g = csq_dicts[i]['SIFT4G_pred'] + HighConfidence = record.info['HighConfidence'] + if 'called_by' in record.info: + called_by = ','.join(record.info['called_by']) + else: + called_by = '' + if 'supported_by' in record.info: + supported_by = ','.join(record.info['supported_by']) + else: + supported_by = '' + if 'AD' in record.samples[bcf_in.header.samples[1]].keys() \ + and 'AD' in record.samples[bcf_in.header.samples[0]].keys(): + t_alt_count = record.samples[bcf_in.header.samples[tumor_index]]['AD'][1] + t_ref_count = record.samples[bcf_in.header.samples[tumor_index]]['AD'][0] + n_alt_count = record.samples[bcf_in.header.samples[normal_index]]['AD'][1] + n_ref_count = record.samples[bcf_in.header.samples[normal_index]]['AD'][0] + else: + t_alt_count = '' + t_ref_count = '' + n_alt_count = '' + n_ref_count = '' + if 'AF' in record.samples[bcf_in.header.samples[1]].keys(): + t_VAF = record.samples[bcf_in.header.samples[tumor_index]]['AF'][i] + else: + t_VAF = '' + if human: + line = [record.chrom, record.pos, id, 
record.ref, alt, + consequence, impact, GENE_SYMBOL, HGVSc, HGVSp, type, + PolyPhen, AF_1000G, GnomadExomes_AF, GnomadGenomes_AF, + CosmicCoding, CosmicCoding_AA, CosmicNonCoding, + n_ref_count, n_alt_count, t_ref_count, t_alt_count, + t_VAF, fathmm, fathmm_MKL_coding, sift, sift_4g, + HighConfidence, called_by, supported_by] + else: + line = [record.chrom, record.pos, id, record.ref, alt, + consequence, impact, GENE_SYMBOL, HGVSc, HGVSp, type, + n_ref_count, n_alt_count, t_ref_count, t_alt_count, + t_VAF, sift, sift_4g, HighConfidence, called_by, supported_by] + line = [str(part).replace('&', ',') for part in line] + line = [str(part).replace(';', ',') for part in line] + yield line + + +def write_file(bcf_in, out, csq_columns, + normal, tumor, human=True): + ''' + Write out the header + ''' + with open(out, 'w') as o: + if human: + header = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'Consequence', 'IMPACT', + 'GENE_SYMBOL', 'HGVSc', 'HGVSp', 'TYPE', 'PolyPhen', 'AF_1000G', + 'GnomadExomes_AF', 'GnomadGenomes_AF', 'CosmicCoding', + 'CosmicCoding_AA', 'CosmicNonCoding', + 'n_ref_count', 'n_alt_count', 't_ref_count', 't_alt_count', + 't_VAF', 'FATHMM', 'fathmm_MKL_coding', 'SIFT', 'SIFT4G', 'HighConfidence', + 'called_by', 'supported_by'] + else: + header = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'Consequence', 'IMPACT', + 'GENE_SYMBOL', 'HGVSc', 'HGVSp', 'TYPE', + 'n_ref_count', 'n_alt_count', 't_ref_count', 't_alt_count', + 't_VAF', 'SIFT', 'SIFT4G', 'HighConfidence', + 'called_by', 'supported_by'] + o.write('\t'.join(header) + '\n') + header_len = len(header) + for record in bcf_in.fetch(): + for alt_line in make_row(record, csq_columns, bcf_in, + normal, tumor, human): + assert len(alt_line) == header_len, "columns don't equal header names" + joined_line = '\t'.join([x for x in alt_line]) + '\n' + o.write(joined_line) + + + +def main(): + ''' + Script to make TEXT file from VEP annotated VCF files + ''' + ###################################################################### + ############ Get commandline arguments ############ + ###################################################################### + parser = argparse.ArgumentParser( + description='DESCRIPTION: Takes in a VCF \ + file and returns a TEXT file. Command-line \ + options that may be omitted (i.e. 
are NOT \ + required) are shown in square brackets.') + # Documentation parameters + # Parameter options + parser.add_argument('-v', '--vcf', + dest='vcf_file', + help='Annotated VCF file') + parser.add_argument('--txt', + dest='txt', + help='TEXT output file') + parser.add_argument('-t', '--tumor', + dest='tumor', + help='Tumor sample name') + parser.add_argument('-n', '--normal', + dest='normal', + help='Normal sample name') + args = parser.parse_args() + assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file + bcf_in = read_vcf(args.vcf_file) + human = check_build(bcf_in) + csq_columns = get_csq_columns(bcf_in) + write_file(bcf_in, args.txt, csq_columns, + args.normal, args.tumor, human=human) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/merge-caller-vcfs.r b/bin/pta/merge-caller-vcfs.r new file mode 100644 index 00000000..2232334c --- /dev/null +++ b/bin/pta/merge-caller-vcfs.r @@ -0,0 +1,425 @@ +## BJS Note: this script was located in the root path of the Docker container +## gcr.io/nygc-public/sv_cnv@sha256:1c14a50d131323a2a4bab323cf224879776af8de37f93df79292fd2e63269274 +## It is reproduced below as it exists there without modification + +## Merge arbitrary number of VCFs, annotate with simple event type +libs = c('optparse', 'StructuralVariantAnnotation', 'VariantAnnotation', 'rtracklayer', 'stringr') +invisible(suppressPackageStartupMessages(sapply(libs, require, character.only=T, quietly=T))) +options(width=200, scipen=999) + + +SUPPORTED_CALLERS = c('manta', 'lumpy', 'svaba', 'gridss') ## Update this flag when adding support for new callers +SVABA_MIN_LENGTH = 1001 ## Svaba-unique calls shorter than this appear to be artifactual + + +## Callers have different names for the same pieces of evidence, +## For now handle each case separately +## TODO: Add support for GRIDSS +getReadSupport = function(vcf, caller, sample_id, supplementary=FALSE, supported_callers=SUPPORTED_CALLERS) { + + ## Don't try to process genotype info if we don't know how + if (!caller %in% supported_callers) { + stop('Caller ', caller, ' is not currently supported. 
Supported callers: ', paste(supported_callers, collapse=',')) + } + + ## It's a possibility that the sample names in the VCF will be + ## the full path to the BAM used instead of just the sample ID + ## Just grab the index of the correct column + if (!sample_id %in% colnames(geno(vcf)[[1]])) { + sample_id = which(gsub('\\.final\\.bam$','',basename(colnames(geno(vcf)[[1]]))) %in% sample_id) + } + + + if (caller == 'manta') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + sr = sapply(sr, `[`, 2) + pe = geno(vcf)$PR[, sample_id] + pe = sapply(pe, `[`, 2) + + ## Supplementary info + supp_string = paste0(caller,'_SOMATICSCORE=', info(vcf)$SOMATICSCORE) + + } else if (caller == 'svaba') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + pe = geno(vcf)$DR[, sample_id] + + ## Supplementary info + ad = paste0(caller,'_AD=', geno(vcf)$AD[, sample_id]) + dp = paste0(caller,'_DP=', geno(vcf)$DP[, sample_id]) + lo = paste0(caller,'_LO=', geno(vcf)$LO[, sample_id]) + gt = paste0(caller,'_GT=', geno(vcf)$GT[, sample_id]) + supp_string = paste(ad, dp, lo, gt, sep=',') + + } else if (caller == 'lumpy') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + sr = unlist(sr) + pe = geno(vcf)$PE[, sample_id] + pe = unlist(pe) + + ## Supplementary info + ro = paste0(caller,'_RO=', geno(vcf)$RO[, sample_id]) + ao = paste0(caller,'_AO=', geno(vcf)$AO[, sample_id]) + dp = paste0(caller,'_DP=', geno(vcf)$DP[, sample_id]) + gt = paste0(caller,'_GT=', geno(vcf)$GT[, sample_id]) + supp_string = paste(ro, ao, dp, gt, sep=',') + + + } else if (caller == 'gridss') { + + ## Common info + sr = geno(vcf)$SR[, sample_id] + pe = geno(vcf)$RP[, sample_id] + + ## Supplementary info + vf = paste0(caller,'_VF=', geno(vcf)$VF[, sample_id]) + asq = paste0(caller,'_ASQ=', geno(vcf)$ASQ[, sample_id]) + qual = paste0(caller,'_QUAL=', geno(vcf)$QUAL[, sample_id]) + supp_string = paste(vf, asq, qual, sep=',') + + } + + ## Set NA to 0 + ## TODO: Keep this? 
+ sr[is.na(sr)] = 0 + pe[is.na(pe)] = 0 + + ## Build output string + if (supplementary) { + res = paste0('[',caller,'_SR=',sr,',', caller,'_PE=', pe,',', supp_string,']') + } else { + res = paste0('[',caller,'_SR=',sr,',', caller,'_PE=', pe,']') + } + + return(res) + +} + + + +sumSupport = function(x) { + sapply(str_extract_all(x, '(?<=\\=)[0-9]+(?=,|\\])'), function(y) sum(as.numeric(y))) +} + + + +removeRedundantBreakpoints = function(x) { + + ## Find duplicates + key = unlist(strsplit(x$breakendPosID,',')) + key.count = table(key) + key.dup = key.count[key.count > 1] + + + ## If there aren't duplicates we don't have anything to do + if (length(key.dup) == 0) { + return(x) + } + + + ## For each set of duplicate breakends, select the one with the higher score + x.idx.rm = c() + for (i in names(key.dup)) { + + ## Subset to breakends of interest + x.idx = grep(i, x$breakendPosID, fixed=T) + xi = x[x.idx] + + ## Collect support + xi$read.support = sumSupport(xi$support) + xi$multicaller.support = grepl('],[',xi$support,fixed=T) + + + + ## Automatically keep breakends with multi-caller support + idx.multi = which(xi$multicaller.support) + if (length(idx.multi) > 0) { + x.idx.rm = c(x.idx.rm, x.idx[-idx.multi]) + next + } + + + ## Automatically discard breakends with the lowest support + idx.max = which(xi$read.support %in% max(xi$read.support)) + if (length(idx.max) > 0) { + x.idx.rm = c(x.idx.rm, x.idx[-idx.max]) + x.idx = x.idx[idx.max] + xi = xi[idx.max] + } + + + ## If there are multiple breakends tied for highest read support + if (length(xi) > 1) { + + if (all(!is.na(xi$svLen))) { + + ## If all are non-TRA take longest SV + x.idx.rm = c(x.idx.rm, x.idx[-which.max(abs(xi$svLen))]) + + } else if (all(is.na(xi$svLen)) && length(unique(as.character(seqnames(xi)))) == 1) { + + ## If all TRA to the same chr select rightmost coordinate + partners = x[names(x) %in% xi$partner] + partner.keep = names(partners)[which.max(start(partners))] + x.idx.rm = c(x.idx.rm, x.idx[!xi$partner %in% partner.keep]) + + } + + ## Otherwise, just keep tied SVs + + } + + } + + + ## Remove breakends and their partners if we have any to remove + if (length(x.idx.rm) > 0) { + x = x[-x.idx.rm] + x = x[names(x) %in% x$partner] + } + + return(x) + +} + + + +## Compute error between query and subject for a hits object +computeError = function(query, subject, hits) { + + ## Init result dataframe + error = data.frame(local=rep(NA, length(queryHits(hits))), + remote=rep(NA, length(queryHits(hits)))) + + + ## For each hit + for (i in 1:length(queryHits(hits))) { + + ## Compute local error (error between breakends at hit i) and remote error (error between the partners of + ## the breakends at hit i) + error$local[i] = StructuralVariantAnnotation:::.distance(query[queryHits(hits)[i]], subject[subjectHits(hits)[i]])$min + error$remote[i] = StructuralVariantAnnotation:::.distance(query[names(query) == query[queryHits(hits)[i]]$partner], + subject[names(subject) == subject[subjectHits(hits)[i]]$partner] + )$min + } + + return(error) + +} + + + +## Take the union of callsets a and b, both breakpointRanges objects +## If multiple hits found in b for a, choose the closest match, measured +## as the mean distance between breakends +mergeCallsets = function(a, b, slop) { + + ## Find overlaps + overlaps = StructuralVariantAnnotation::findBreakpointOverlaps(query=a, + subject=b, + maxgap=slop, + sizemargin=0.8, + restrictMarginToSizeMultiple=0.8) + + + + ## If we have any duplicate query hits, choose hit based on match quality + 
if(anyDuplicated(queryHits(overlaps))) { + + ## Compute local and remote breakend basepair error on matches + error = computeError(query=a, subject=b, hits=overlaps) + + ## Get duplicate hits + dup.query.hits = table(queryHits(overlaps)) + dup.query.hits = names(dup.query.hits[dup.query.hits > 1]) + + ## Determine which hits we're removing + idx.hits.rm = c() + for (d in dup.query.hits) { + + idx.dup.query.hits = which(queryHits(overlaps) %in% d) + + local.error = error$local[idx.dup.query.hits] + remote.error = error$remote[idx.dup.query.hits] + mean.error = rowMeans(cbind(local.error, remote.error)) + + ## Keep the query hit with the smallest mean error + idx.hits.rm = c(idx.hits.rm, idx.dup.query.hits[which.max(mean.error)]) + + } + + overlaps = overlaps[-idx.hits.rm] + + } + + ## For matching SVs, merge caller support + a$support[queryHits(overlaps)] = paste0(a$support[queryHits(overlaps)],',',b$support[subjectHits(overlaps)]) + a$breakendPosID[queryHits(overlaps)] = paste0(a$breakendPosID[queryHits(overlaps)],',',b$breakendPosID[subjectHits(overlaps)]) + + ## Pull in non-matching SVs from b + res = c(a, b[-subjectHits(overlaps)]) + + return(res) + +} + + + +## Convert breakpointRanges to BEDPE +vcfToBedpe = function(vcf, supplemental=F) { + + sqn = as.character(seqnames(vcf)) + strand = as.character(strand(vcf)) + res = c() + processed = c() + + for (i in 1:length(vcf)) { + bnd = names(vcf)[i] + partner = vcf$partner[i] + partner.idx = which(names(vcf) == partner) + + ## If we don't have exactly one partner, exclude this variant + if (length(partner.idx) != 1) { + warning('Missing partner for breakend ', bnd) + next + } + + ## Check to see if we've alrady processed this or it's partner + if (any(c(bnd, partner) %in% processed)) { + next + } + + ## Which support column should we use? 
+ if (supplemental) { + support = vcf$supplemental[i] + } else { + support = vcf$support[i] + } + + + ## Combine breakends in single line + res.i = c(sqn[i], start(vcf)[i], end(vcf)[i], ## chr1, start1, end1 + sqn[partner.idx], start(vcf)[partner.idx], end(vcf)[partner.idx], ## chr2, start2, end 2 + 'BND', '.', strand[i], strand[partner.idx], support) ## type, score, strand1, strand2, support + + ## Add to result, keep track of processed breakends + res = rbind(res, res.i) + processed = c(processed, bnd, partner) + } + + + ## Add colnames and fill in simple event classifications + colnames(res) = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'type', 'score', 'strand1', 'strand2', 'evidence') + res = as.data.frame(res, stringsAsFactors=F) + + res$type[res$strand1 == '+' & res$strand2 == '-'] = 'DEL' + res$type[res$strand1 == '-' & res$strand2 == '+'] = 'DUP' + res$type[res$strand1 == '-' & res$strand2 == '-'] = 'INV' + res$type[res$strand1 == '+' & res$strand2 == '+'] = 'INV' + res$type[res$chr1 != res$chr2] = 'TRA' + + ## Sort by chromosome + res = res[order(factor(res$chr1, levels=levels(seqnames(vcf))), res$start1, res$end1, decreasing=F), ] + + ## Simplify coordinates + res$end1 = as.numeric(res$start1) + 1 + res$end2 = as.numeric(res$start2) + 1 + + + ## Extract tool info from read support column + res$tools = sapply(res$evidence, function(x) paste(unlist(stringr::str_extract_all(x, '(?<=\\[)[a-z]+(?=_)')), collapse=',')) + + colnames(res)[1] = paste0('#', colnames(res)[1]) + + return(res) + +} + + + +## Collect arguments +option_list = list( + make_option(c("-v", "--vcf"), type='character', help="Comma-delimited list of breakend notation VCFs"), + make_option(c("-c", "--callers"), type='character', help="Comma-delimited list of SV caller names corresponding to the order of VCFs given in --vcf"), + make_option(c("-t", "--tumor"), type='character', help="Tumor sample ID"), + make_option(c("-n", "--normal"), type='character', help="Normal sample ID"), + make_option(c("-b", "--build"), type='character', help="Genome build"), + make_option(c("-s", "--slop"), type='numeric', help="Padding to use when comparing breakpoints"), + make_option(c("-l", "--min_sv_length"), type='numeric', help="Filter SVs shorter than this length"), + make_option(c("-a", "--allowed_chr"), type='character', help="Comma-delimited list of chromosomes to keep"), + make_option(c("-o", "--out_file"), type='character', help="Output BEDPE"), + make_option(c("-p", "--out_file_supplemental"), type='character', help="Output supplemental BEDPE")) +opt = parse_args(OptionParser(option_list=option_list)) + + + +## Unpack arguments +opt$vcf = unlist(strsplit(opt$vcf, ',', fixed=T)) +opt$callers = unlist(strsplit(opt$callers, ',', fixed=T)) +opt$allowed_chr = unlist(strsplit(opt$allowed_chr, ',', fixed=T)) + + + +## Iteratively merge VCFs +res = NULL +for (i in 1:length(opt$vcf)) { + + ## Read VCF + caller = opt$caller[i] + vcf = VariantAnnotation::readVcf(opt$vcf[i], genome=opt$build) + + ## Get read support + rowRanges(vcf)$support = getReadSupport(vcf=vcf, caller=caller, sample_id=opt$tumor) + rowRanges(vcf)$supplemental = getReadSupport(vcf=vcf, caller=caller, sample_id=opt$tumor, supplementary=T ) + + ## Convert to breakpointRanges object, don't adjust for CIPOS uncertainty (i.e. 
keep nominalPosition) + vcf = StructuralVariantAnnotation::breakpointRanges(vcf, nominalPosition=T) + + ## Add breakendPosID for later redundancy checks + vcf$breakendPosID = paste0('[',caller,'=',as.character(seqnames(vcf)),':',start(vcf),':',strand(vcf),']') + + ## Overlap if this isn't the first callset + if (i == 1) { + res = vcf + } else { + res = mergeCallsets(a=res, b=vcf, slop=opt$slop) + } + +} + + + +## Handle breakpoints with duplicate start or end positions +res = removeRedundantBreakpoints(res) + + + +## Convert to bedpe, apply some filters +for (i in c('main','supplemental')) { + + outfile = ifelse(i=='main', opt$out_file, opt$out_file_supplemental) + + ## Convert to BEDPE format + res.i = vcfToBedpe(res, supplemental=i=='supplemental') + res.i$`tumor--normal` = paste0(opt$tumor,'--',opt$normal) + + ## Filter non-TRA variants for minimum length opt$min_sv_length + sv.lengths = abs(as.numeric(res.i$start2) - as.numeric(res.i$start1)) + res.i = res.i[res.i$type == 'TRA' | sv.lengths >= opt$min_sv_length, ] + + ## Filter non-TRA svaba-unique variants less than SVABA_MIN_LENGTH + sv.lengths = abs(as.numeric(res.i$start2) - as.numeric(res.i$start1)) + res.i = res.i[(res.i$tools != 'svaba' | res.i$type == 'TRA') | (res.i$tools == 'svaba' & sv.lengths >= SVABA_MIN_LENGTH), ] + + ## Filter SVs not occurring in allowed chromosomes (i.e. autosomes and sex chromosomes) + res.i = res.i[res.i$`#chr1` %in% opt$allowed_chr & res.i$chr2 %in% opt$allowed_chr, ] + + ## Write result + write.table(res.i, outfile, row.names=F, col.names=T, sep='\t', quote=F) + +} diff --git a/bin/pta/merge_columns.py b/bin/pta/merge_columns.py new file mode 100644 index 00000000..76b77b25 --- /dev/null +++ b/bin/pta/merge_columns.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python +# USAGE: python merge_columns.py +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +import collections +from functools import reduce +########################################################################## +############## Custom functions ############ +########################################################################## +class Naming(object): + ''' + Split based on tumor or normal identity. Assumes id is + _. + ''' + + def __init__(self, samples, tumor, normal): + self.samples = samples + self.normal = normal + self.tumor = tumor + self.get_lists() + + + def get_lists(self): + ''' + Make list of tumor-only and normal-only sample names + NOTE: N/T order is reversed by bcftools merge. + + MWL NOTE: This does not appear to be true! Sanity check is needed with real data. 
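+ In practice the order is determined empirically below: the first merged sample name is stripped of its tool prefix and compared to the normal ID, and the tumor/normal lists are then sliced from the sample list accordingly.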
+ + ''' + print(self.get_originals(self.samples[0])) + if self.get_originals(self.samples[0]) == self.normal: + self.tumor_samples = self.samples[1::2] + self.normal_samples = self.samples[::2] + elif self.get_originals(self.samples[0]) == self.tumor: + self.tumor_samples = self.samples[::2] + self.normal_samples = self.samples[1::2] + self.pairs = [sample_name for sample_name in zip(self.normal_samples, self.tumor_samples)] + + + def get_originals(self, sample): + ''' + get original sample names. NOTE: N/T order is + reversed by bcftools merge if alpha order is reversed. + ''' + return '_'.join(sample.split('_')[1:]).replace('indel_', '').replace('support_','').replace('sv_','') + # // MWL NOTE: replace statements added to remove additions to sample names that were made to clarify what tools calls originated from. + # i.e., lancet_support_, strelka2_sv_, strelka2_indel_. + +class Variant(Naming): + ''' + Import a pysam record. and write out from record. + The class allows editing of fixed elements like + the min number of samples in the VCF. + ''' + + def __init__(self, record, samples, tumor, normal): + self.record = record + self.line = str(record) + self.samples = samples + self.tumor = tumor + self.normal = normal + Naming.__init__(self, self.samples, self.tumor, self.normal) + self.parts = self.line.split('\t') + # GT fields found + self.gt_tools = set() + # VCF columns + self.chrom = self.parts[0] + self.pos = self.parts[1] + self.id = self.parts[2] + self.ref = self.parts[3] + self.alts = self.parts[4] + self.qual = self.parts[5] + self.filters = self.parts[6] + self.info = self.parts[7] + self.format = ':'.join([key for key in self.get_uniq_keys()]) + self.find_tools() + + + def replace_empty(self, value, key, sep=','): + ''' + Replace an empty field with ".". In pysam this will be + a tuple with None as the only value. + ''' + if isinstance(value, tuple): + if len(value) == 1 and value[0] == None: + return '.' + if len(value) > 1 and all(map(lambda x: x != None, value)) and key == 'GT': + return '/'.join(['.' if x == None else str(x) for x in value]) + elif value == None: + return '.' + if key == 'GT': + return '/'.join(['.' if x == None else str(x) for x in value]) + if (isinstance(value, collections.Iterable) and not isinstance(value, str)) or \ + isinstance(value, tuple): + joined = [] + for i in value: + if i == None: + joined.append('') + else: + joined.append(str(i)) + return sep.join(joined) + return str(value) + + + def not_empty(self, value, key): + ''' + Test if a format field is not empty. In pysam this will be + a tuple with None as the only value. + ''' + if isinstance(value, tuple): + if len(value) == 1 and value[0] == None: + return False + if len(value) > 1 and all(map(lambda x: x == None, value)) and key == 'GT': + return False + if value == None: + return False + if value == '.': + return False + return True + + + def deuniqify_gt(self, key, sample_name): + ''' + Remove tool prefix from key (only if it was added for merge). + ''' + format_keys = self.record.samples[sample_name].keys() + if key not in format_keys and \ + key.endswith('GT') and \ + 'GT' in format_keys: + return 'GT' + return key + + + def find_tools(self): + ''' + Return list of samples with keys. 
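+ Populates final_normal_samples and final_tumor_samples with each (normal, tumor) pair for which at least one of the merged FORMAT keys carries a non-empty value in this record.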
+ ''' + self.final_normal_samples = [] + self.final_tumor_samples = [] + for pair in self.pairs: + found_keys = [] + for sample_name in pair: + found_keys += [key for key in self.uniq_keys if self.not_empty(self.record.samples[sample_name][self.deuniqify_gt(key, sample_name)], self.deuniqify_gt(key, sample_name))] + if len(found_keys) > 0: + self.final_normal_samples.append(pair[0]) + self.final_tumor_samples.append(pair[1]) + + + def uniqify_gt(self, key, sample_name): + ''' + Add tool prefix to key. + ''' + if key == 'GT': + tool = sample_name.split('_')[0] + tool = tool.split(':')[-1] + self.gt_tools.update(set([tool])) + return tool + '_' + key + return key + + + def find_keys(self, record, sample_name): + ''' + Return list of keys with values for FORMAT + ''' + format_keys = record.samples[sample_name].keys() + found_keys = [self.uniqify_gt(key, sample_name) for key in format_keys if self.not_empty(record.samples[sample_name][key], key)] + return found_keys + + + + def get_uniq_keys(self): + ''' + Return a key/value pairs for any key with a value for on tool's results. + ''' + seen = set() + seen_add = seen.add + found_keys = reduce(list.__add__, + [self.find_keys(self.record, sample_name) for sample_name in self.samples], + []) + self.uniq_keys = [key for key in found_keys if not (key in seen or seen_add(key))] + return self.uniq_keys + + + def reduce_samples(self, samples): + ''' + Find all keys with a value for any sample. Return '.' for missing values. + ''' + uniq_keys = self.get_uniq_keys() + sample_result = [] + for sample_name in samples: + tool = sample_name.split('_')[0] + tool = tool.split(':')[-1] + sample_result.append(':'.join([self.replace_empty(self.record.samples[sample_name][self.deuniqify_gt(key, sample_name)], self.deuniqify_gt(key, sample_name)) for key in uniq_keys if tool == key.split('_')[0]])) + return ':'.join(sample_result) + + + def write(self): + ''' + Return a reformatted string from a pysam object. + ''' + line = [self.chrom, + self.pos, + self.id, + self.ref, + self.alts, + self.qual, + self.filters, + self.info, + self.format] + line += [self.reduce_samples(self.final_normal_samples)] + line += [self.reduce_samples(self.final_tumor_samples)] + self.new_line = '\t'.join(line) + '\n' + return self.new_line + + +def modify_header(bcf_in, tool): + ''' + Add new FORMAT field + ''' + bcf_in.header.formats.add(id=tool + '_GT', number='1', + type='String', + description='Genotype from ' + tool) + return bcf_in + + +def load_header(bcf_in): + ''' + Load a VCF file header as a list of lines. + ''' + header = '\n'.join(str(bcf_in.header).split('\n')[:-2]) + '\n' + return header + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. 
+ ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def vcf_writer(bcf_in, vcf_out_file, tumor, normal): + ''' + Write out the VCF file with corrected information + ''' + with open(vcf_out_file, 'w') as vcf_out: + samples = [sample_name for sample_name in bcf_in.header.samples] + names = Naming(samples, tumor, normal) + lines = [] + gt_tools = set() + for record in bcf_in.fetch(): + out = Variant(record, samples, tumor, normal) + gt_tools.update(out.gt_tools) + lines.append(out.write()) + # ========================== + # Add GT + # ========================== + for tool in gt_tools: + bcf_in = modify_header(bcf_in, tool) + header = load_header(bcf_in) + # ========================== + # Write header + # ========================== + for line in header: + vcf_out.write(line) + vcf_out.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', + 'QUAL', 'FILTER', 'INFO', 'FORMAT', + names.normal, names.tumor]) + '\n') + # ========================== + # Write variants + # ========================== + for line in lines: + vcf_out.write(line) + + +def main(): + ''' + Merge the VCF columns by: + 1) Getting all INFO fields with values + 2) Getting all FORMAT fields from any sample with non-empty values + 3) Uniqify GT now that bcftools merge is done + ''' + # ========================== + # Input variables + # ========================== + vcf_in = sys.argv[1] + vcf_out_file = sys.argv[2] + normal = sys.argv[3] + tumor = sys.argv[4] + + # NOTE: Order was changed to be consistent with prior scripts. + + assert os.path.isfile(vcf_in), 'Failed to find caller VCF call file :' + vcf_in + # ========================== + # Run prep + # ========================== + bcf_in = read_vcf(vcf_in) + vcf_writer(bcf_in, vcf_out_file, tumor, normal) + + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/merge_prep.py b/bin/pta/merge_prep.py new file mode 100644 index 00000000..5f2ebd07 --- /dev/null +++ b/bin/pta/merge_prep.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python +# USAGE: python merge_prep.py +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +import pandas as pd +import re +import argparse +########################################################################## +############## Custom functions ############ +########################################################################## +base_pattern = re.compile(r'^[ACGTN]+$') + + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. 
+ ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def add_info_header(bcf_out, + id, + number, + type, + description): + ''' + Add new INFO field + ''' + bcf_out.header.info.add(id=id, + number=number, + type=type, + description=description) + return bcf_out + + +def add_filter_header(bcf_out, + id, + description): + ''' + Add new FILTER field + ''' + bcf_out.header.filters.add(id=id, + number=None, + type=None, + description=description) + return bcf_out + + +def remove_filters_header(bcf_out): + ''' + Remove all filters except PASS + ''' + for id in bcf_out.header.filters.keys(): + if not id in ['PASS','SUPPORT']: + bcf_out.header.filters.remove_header(id) + return bcf_out + + +def pass_alleles(record, base_pattern=base_pattern): + ''' + Pass lines that have no special characters in REF/ALT + ''' + passed = True + alleles = list(record.alts) + [record.ref] + for allele in alleles: + if not re.match(base_pattern, allele): + passed = False + return passed + + +def prep_record(record, tool, passing, support): + ''' + Pass lines that have no special characters in ALT/REF + ''' + record.id = None + record.qual = None + if passing: + if support: + if tool == 'manta': + tool_supported_by = tool + 'SV' + else: + tool_supported_by = tool + record.info['supported_by'] = (tool_supported_by,) + else: + record.info['called_by'] = (tool,) + record.info['num_callers'] = 1 + return record + + +def write_file(bcf_in, bcf_out, tool, filter=True, support=False): + ''' + Filter based on FILTER column, + also filter lines with special characters in ALT/REF + ''' + passing = False + for record in bcf_in.fetch(): + filters = record.filter.keys() + # ==================== + # Passing variants + # ==================== + if len(filters) == 1 and \ + filters[0] == 'PASS': + passing = True + write = True + if support: + record.filter.clear() + record.filter.add('SUPPORT') + # ==================== + # Failing variants + # ==================== + else: + if filter: + write = False + else: + write = True + if write and not pass_alleles(record): + write = False + if write: + record = prep_record(record, tool, passing, support) + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + return True + + +def main(): + ''' + Prepare the VCF file for merging by: + 1) 'TYPE' is added to header + 2) 'called_by' is added to header + 3) 'num_callers' is added to header + 4) filter lines with special characters in REF/ALT (e.g. in manta) + 5) fill in 'called_by', 'num_callers' + 6) 'SUPPORT' FILTER line is added + 7) non 'PASS'/'SUPPORT' FILTER lines are removed (if not skip-filter) + ''' + # ========================== + # Input variables + # ========================== + parser = argparse.ArgumentParser( + description='DESCRIPTION: Takes in a VCF \ + file and preps the file by: \ + 1) "TYPE" is added to header \ + 2) "called_by" is added to header \ + 3) "num_callers" is added to header \ + 4) filter lines with special characters in REF/ALT (e.g. in manta) \ + 5) fill in "called_by", "num_callers" \ + 6) "SUPPORT" FILTER line is added \ + 7) non "PASS"/"SUPPORT" FILTER lines are removed (if not skip-filter) \ + . Command-line \ + options that may be omitted (i.e. 
are NOT \ + required) are shown in square brackets.') + # Documentation parameters + parser.add_argument('-v', '--vcf', + dest='vcf_file', + help='VCF file', + required=True) + parser.add_argument('-o', '--out', + dest='out', + help='Output VCF file', + required=True) + parser.add_argument('-t', '--tool', + dest='tool', + choices=['strelka2_sv', + 'strelka2_indel', + 'mutect2', + 'svaba', + 'lancet', + 'manta'], + help='Tool name', + required=True) + parser.add_argument('-s', '--support', + dest='support', + help='Use if calls are only support calls', + action='store_true') + parser.add_argument('-f', '--skip-filter', + dest='skip_filter', + help='Remove calls that are not PASS or SUPPORT', + action='store_true') + args = parser.parse_args() + filter = True + if args.skip_filter: + filter = False + assert os.path.isfile(args.vcf_file), 'Failed to find caller VCF call file :' + args.vcf_file + # ========================== + # Run prep + # ========================== + bcf_in = read_vcf(args.vcf_file) + bcf_in = add_info_header(bcf_out=bcf_in, + id='called_by', + number='.', + type='String', + description='Name of the variant caller(s) that the variant was called by') + bcf_in = add_info_header(bcf_out=bcf_in, + id='num_callers', + number='1', + type='Integer', + description='Number of callers') + bcf_in = add_info_header(bcf_out=bcf_in, + id='supported_by', + number='.', + type='String', + description='Name of the tool(s) apart from the main variant callers in the pipeline that support the variant') + bcf_in = add_filter_header(bcf_out=bcf_in, + id='SUPPORT', + description='Variant from Validation caller') + bcf_out = pysam.VariantFile(args.out, 'w', header=bcf_in.header) + if filter: + bcf_out = remove_filters_header(bcf_out) + write_file(bcf_in, + bcf_out, + tool=args.tool, + filter=filter, + support=args.support) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/remove_contig.py b/bin/pta/remove_contig.py new file mode 100644 index 00000000..e87f12f6 --- /dev/null +++ b/bin/pta/remove_contig.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +# USAGE: remove_contig.py VCF_IN VCF_OUT +# DESCRIPTION: Print a VCF file skipping contig descriptions (for use on bad contig descriptions) +# Version 1.0 +import sys +import shutil + + +def remove_contig(vcf): + ''' + Skips line if starts with ##contig= to remove Lumpy + VCF line with contig name but witout required length value + ''' + for line in vcf: + if not line.startswith('##contig='): + yield line + + +def vcf_writer(vcf_file, vcf_out_file): + ''' + Write out the VCF file with corrected information + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # write non-contig lines + # ===================== + with open(vcf_out_file, 'w') as vcf_out: + with open(vcf_file) as vcf: + for line in remove_contig(vcf): + vcf_out.write(line) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + + + +# ===================== +# Main +# ===================== +if __name__ == "__main__": + vcf_file = sys.argv[1] + vcf_out_file = 
sys.argv[2] + vcf_writer(vcf_file, vcf_out_file) \ No newline at end of file diff --git a/bin/pta/rename_csq_vcf.py b/bin/pta/rename_csq_vcf.py new file mode 100644 index 00000000..e458ab5a --- /dev/null +++ b/bin/pta/rename_csq_vcf.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python +# USAGE: python cancer_gene_census.py cancer_gene_census.csv VCF VCF_OUT +# DESCRIPTION: Annotates files by adding information about the +# Cosmic Genome Census entry for the nearest gene. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ + + +import sys +import os +import logging as log +import pysam +import pprint +from collections import OrderedDict +########################################################################## +############## Custom functions ############ +########################################################################## + + +def add_info_header(bcf_out, + id, + number, + type, + description): + ''' + Add new INFO field + ''' + bcf_out.header.info.add(id=id, + number=number, + type=type, + description=description) + return bcf_out + + +def af_1000g(bcf_out): + ''' + Add description for VEP annotation. + ''' + for af in ['AF_1000G', 'AFR_AF_1000G', 'AMR_AF_1000G', + 'EAS_AF_1000G', 'EUR_AF_1000G', 'SAS_AF_1000G']: + description = af + ' field from phase3 1000genomes' + bcf_out = add_info_header(bcf_out, + id=af, + number='.', + type='String', + description=description) + return bcf_out + + +def get_good_fields(csq_columns, suffix='_1000G'): + ''' + rename CSQ fields as needed. 
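+ Only the 1000 Genomes population AF keys listed in 'change' below receive the suffix; all other CSQ column names pass through unchanged.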
+ ''' + change = ['AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF', 'SAS_AF'] + good = [] + for key in csq_columns: + if key in change: + key += suffix + good.append(key) + return good + +class Variant(object): + + + def __init__(self, record): + self.record = record + self.line = str(self.record).rstrip() + self.parts = self.line.split('\t') + # VCF columns + self.chrom = self.parts[0] + self.pos = self.parts[1] + self.id = self.parts[2] + self.ref = self.parts[3] + self.alts = self.parts[4].split(',') + self.qual = self.parts[5] + self.filters = self.parts[6].split(';') + self.info = self.parts[7].split(';') + self.format = self.parts[8].split(':') + self.samples = self.parts[9:] + # modify + self.info_dict = self.get_info() + + + def get_info(self): + ''' + Get current info line and add prefix if needed + ''' + info_dict = OrderedDict() + for item in self.info: + if '=' in item: + info_dict.update({item.split('=')[0] : '='.join(item.split('=')[1:])}) + else: + info_dict.update({item : None }) + return info_dict + + + + def write(self): + line = [self.chrom, + self.pos, + self.id, + self.ref, + ','.join(self.alts), + str(self.qual), + ';'.join(self.filters), + ';'.join(['='.join([x for x in [key, self.info_dict[key]] if x != None]) for key in self.info_dict]), + ':'.join(self.format)] + line += self.samples + self.new_line = '\t'.join(line) + return self.new_line + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def write_vcf(bcf_in, vcf_out_file, csq_columns): + ''' + Write out the renamed VCF + ''' + # CSQ to keep + good_fields = get_good_fields(csq_columns) + # Import the header after removal of extra metadata + header = str(bcf_in.header).rstrip() + csq_format = '|'.join(good_fields) + # Write new header with corrected CSQ and fewer metadata keys overall + with open(vcf_out_file, 'w') as vcf_out: + for line in header.split('\n'): + if 'ID=CSQ' in line: + # NOTE: rebuild the CSQ definition to list the renamed fields (standard VEP-style Description assumed) + line = '##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: ' + csq_format + '">' + vcf_out.write(line + '\n') + for record in bcf_in: + line = Variant(record).write() + if line: + vcf_out.write(line + '\n') + + +def main(): + ''' + Rename 1000 Genomes AF fields in the CSQ annotation and rewrite the VCF for the main VCF output + ''' + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + bcf_in = read_vcf(vcf_file) + bcf_in = af_1000g(bcf_out=bcf_in) + csq_columns = bcf_in.header.info['CSQ'].description.split()[-1].split('|') # grab the definitions + write_vcf(bcf_in, vcf_out_file, csq_columns) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() diff --git a/bin/pta/rename_metadata.py b/bin/pta/rename_metadata.py new file mode 100644 index 00000000..d4cf28f1 --- /dev/null +++ b/bin/pta/rename_metadata.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python +# USAGE: rename_metadata.py VCF_IN VCF_OUT PREFIX +# DESCRIPTION: Takes in a VCF and a tool name +# and preps the file by: +# 1) add tool + "_" to all INFO and FORMAT def lines (unless the key is "GT") + +import sys +import shutil +import logging as log +from collections import OrderedDict +import re +import os + + +class Variant(object): + + + def __init__(self, line, prefix=''): + self.line = line + self.prefix = prefix + self.parts = line.split('\t') + # VCF columns + self.chrom = self.parts[0] + self.pos = self.parts[1] + self.id = self.parts[2] + self.ref = self.parts[3] + self.alts = self.parts[4].split(',') + self.qual = self.parts[5] + self.filters = self.parts[6].split(';') + self.info = self.parts[7].split(';') + self.format = self.parts[8].split(':') + self.samples = self.parts[9:] + # modify + self.info_dict = self.get_info() + self.fix_format() + + + def get_info(self): + ''' + Get current info line and add prefix if needed + ''' + info_dict = OrderedDict() + for item in self.info: + if '=' in item: + info_dict.update({self.prefix + item.split('=')[0] : item.split('=')[1]}) + else: + info_dict.update({self.prefix + item : None}) + return info_dict + + + def add_prefix(self, key): + ''' + Add prefix unless the key is GT. + ''' + if key != 'GT': + return self.prefix + key + return key + + + def fix_format(self): + ''' + Add the prefix to FORMAT entries. Skips GT because, + in at least one arrangement (the GT key appearing first without its + full name), bcftools combines other GT-like fields (e.g. Mutect2's + PGT) into the first position and writes bad VCF files. + This: + "0/0:37,1:0.098:10,0:27,1:12:224,349:25:2:0|1:16759500_G_C:.:." + becomes (with non-ASCII characters changed to *): + "0/00/1!%I9**=m**=! + :37,1:0.098:10,0:27,1:12:224,349:25:2:0|1:16759500_G_C:.:." + ''' + new_format = [self.add_prefix(key) for key in self.format] + self.format = new_format + + + def write(self): + line = [self.chrom, + self.pos, + self.id, + self.ref, + ','.join(self.alts), + str(self.qual), + ';'.join(self.filters), + ';'.join(['='.join([x for x in [key, self.info_dict[key]] if x != None]) for key in self.info_dict]), + ':'.join(self.format)] + line += self.samples + self.new_line = '\t'.join(line) + return self.new_line + + +def fix_header(line, prefix): + ''' + Add prefix as needed. Replace AD with the standard AD line because + GATK also makes this replacement and it conflicts with Lancet's similar + but differently worded definition. + ''' + if '##FORMAT=' in line or \ + '##INFO=' in line: + # pull the ID value out of the header definition line + id = re.search('ID=(?P<id>[^>,]+)', line) + if id == None: + log.error('FORMAT or INFO line missing ID field: ' + line) + sys.exit() + if id.group(1) == 'AD' and '##FORMAT=' in line: + # NOTE: standard GATK-style AD definition assumed here + line = '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">\n' + if id.group(1) != 'GT': + line = line.replace('ID=' + id.group(1), + 'ID=' + prefix + id.group(1)) + return line + + +def load_header(vcf_in, prefix): + ''' + Load a VCF file header as a list of lines. + ''' + with open(vcf_in) as vcf: + header = [fix_header(line, prefix) for line in vcf if line.startswith('#')] + return header + + +def load_vcf(vcf_in, prefix): + ''' + Load a VCF file and fix its lines.
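+ Yields each non-header line rewritten with the tool prefix applied to its INFO and FORMAT keys.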
+ ''' + with open(vcf_in) as vcf: + for line in vcf: + if not line.startswith('#'): + yield Variant(line, prefix).write() + + + + +def rename_metadata(vcf_file, vcf_out_file, prefix): + ''' + Add prefix to FORMAT and INFO keys + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # rename + # ===================== + header = load_header(vcf_file, prefix) + vcf_reader = load_vcf(vcf_file, prefix) + # ===================== + # rewrite + # ===================== + vcf_writer(header, vcf_reader, vcf_out_file) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + return True + + +def correct_ad_line(): + ''' + ''' + +def vcf_writer(header, vcf_reader, vcf_out_file): + ''' + Write out a VCF file with the a prefix added to INFO and FORMAT keys + ''' + with open(vcf_out_file, 'w') as vcf_out: + for line in header: + vcf_out.write(line) + for line in vcf_reader: + vcf_out.write(line) + return True + + +def main(): + ''' + DESCRIPTION: Takes in a VCF and a tool name + and preps the file by: + 1) add tool + "_" to all INFO and FORMAT def lines (unless the key is "GT") + ''' + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + prefix = sys.argv[3] + '_' + assert os.path.isfile(vcf_file), 'Failed to find prep caller VCF call file :' + vcf_file + rename_metadata(vcf_file, vcf_out_file, prefix) + + +# ===================== +# Main +# ===================== + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/bin/pta/rename_vcf.py b/bin/pta/rename_vcf.py new file mode 100644 index 00000000..c1c43e31 --- /dev/null +++ b/bin/pta/rename_vcf.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# USAGE: rename_vcf.py VCF_IN VCF_OUT NORMAL TUMOR PREFIX +# DESCRIPTION: Print a VCF file with the sample order indicated in the 3rd +# and 4th arguments and a prefix added to the sample names + +import sys +import pandas as pd +import shutil +import logging as log +import os + + +def load_header(vcf_in): + ''' + Load a VCF file header as a list of lines. + ''' + with open(vcf_in) as vcf: + header = [line for line in vcf if line.startswith('#')] + return header + + +def load_vcf(vcf_in, header, reorder, paired, normal, tumor, prefix): + ''' + Load a VCF file as an pandas dataframe. 
+ ''' + names = header[-1].rstrip().replace('^#', '').split('\t') + if paired: + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', + prefix + '_' + normal, prefix + '_' + tumor] + vcf_reader = pd.read_csv(vcf_in, comment='#', + names=names, sep='\t') + return vcf_reader + + +def order_wrong(last_header, normal, tumor): + ''' + Check if the order is perfect + ''' + header_parts = last_header.rstrip().split('\t') + if header_parts[9] == tumor and \ + header_parts[10] == normal: + return True + else: + return False + + +def check_paired(last_header): + ''' + Return False if VCF has only a single sample + ''' + header_parts = last_header.rstrip().split('\t') + if len(header_parts) == 10: + return False + return True + + +def rename(vcf_file, vcf_out_file, normal, tumor, prefix): + ''' + Add prefix to sample name + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # reorder + # ===================== + header = load_header(vcf_file) + last_header = header[-1] + paired = check_paired(last_header) + if not paired: + reorder = False + else: + reorder = order_wrong(last_header, normal, tumor) + if reorder: + log.error('VCF must start with expected sample names in the order normal, tumor.') + sys.exit(1) + vcf_reader = load_vcf(vcf_file, header, reorder, + paired, normal, tumor, prefix) + # ===================== + # rewrite + # ===================== + vcf_writer(header, vcf_reader, vcf_out_file) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + return True + + +def vcf_writer(header, vcf_reader, vcf_out_file): + ''' + Write out the VCF file with corrected sample names + ''' + with open(vcf_out_file, 'w') as vcf_out: + for line in header[:-1]: + vcf_out.write(line) + vcf_reader.to_csv(vcf_out_file, sep='\t', + mode='a', index=False) + + +def main(): + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + normal = sys.argv[3] + tumor = sys.argv[4] + prefix = sys.argv[5] + assert os.path.isfile(vcf_file), 'Failed to find prep caller VCF call file :' + vcf_file + rename(vcf_file, vcf_out_file, normal, tumor, prefix) + + +# ===================== +# Main +# ===================== + + +if __name__ == "__main__": + main() diff --git a/bin/pta/reorder_vcf.py b/bin/pta/reorder_vcf.py new file mode 100644 index 00000000..2f70b30b --- /dev/null +++ b/bin/pta/reorder_vcf.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# USAGE: reorder_vcf.py VCF_IN VCF_OUT NORMAL TUMOR +# DESCRIPTION: Print a VCF file with the sample order indicated in the 3rd +# and 4th arguments + + +# ## MWL NOTE: +# This script requires the header and input 'tumor/normal' names in the 3rd and 4th arg to match. +# If you pass names NOT present in the header, it will simply emit the file AS IS. +# The script DOES NOT inform the user of if a change has been made in the sample order. +# NOTE ALSO: if the header already contains the strings 'TUMOR' and 'NORMAL, +# 'TUMOR and NORMAL are RENAMED to string provided in 3rd and 4th args. + +import sys +import pandas as pd +import shutil + +def load_header(vcf_in): + ''' + Load a VCF file header as a list of lines. 
+ ''' + with open(vcf_in) as vcf: + header = [line for line in vcf if line.startswith('#')] + return header + + +def load_vcf(vcf_in, header, reorder, paired, normal, tumor): + ''' + Load a VCF file as an pandas dataframe. + ''' + names = header[-1].rstrip().replace('^#', '').split('\t') + if paired and reorder: + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', tumor, normal] + elif paired and not reorder: + names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', normal, tumor] + vcf_reader = pd.read_csv(vcf_in, comment='#', + names=names, sep='\t', + dtype={'#CHROM' : str}) + return vcf_reader + + +def reorder_vcf(vcf_reader, reorder): + ''' + reorder corrected names. + ''' + if reorder: + vcf_reader = vcf_reader[['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', + 'INFO', 'FORMAT', normal, tumor]] + return vcf_reader + + +def order_wrong(last_header, normal, tumor): + ''' + Check if the order is good + ''' + header_parts = last_header.rstrip().split('\t') + if header_parts[9] in [tumor, 'TUMOR'] and \ + header_parts[10] in [normal, 'NORMAL']: + return True + elif tumor in header_parts[9] and \ + normal in header_parts[10]: + return True + else: + return False + + +def check_paired(last_header): + ''' + Return False if VCF has only a single sample + ''' + header_parts = last_header.rstrip().split('\t') + if len(header_parts) == 10: + return False + return True + + +def reorder_column(vcf_file, vcf_out_file, normal, tumor): + ''' + Order columns Normal and then Tumor + ''' + # ===================== + # test if renaming should occur + # ===================== + rename = False + if vcf_out_file == vcf_file: + vcf_out_file = vcf_file + '_tmp.vcf' + rename = True + # ===================== + # reorder + # ===================== + header = load_header(vcf_file) + last_header = header[-1] + paired = check_paired(last_header) + if not paired: + reorder = False + else: + reorder = order_wrong(last_header, normal, tumor) + vcf_reader = load_vcf(vcf_file, header, reorder, paired, normal, tumor) + if reorder: + vcf_reader = reorder_vcf(vcf_reader, reorder) + # ===================== + # rewrite + # ===================== + vcf_writer(header, vcf_reader, vcf_out_file) + # ===================== + # rename output VCF + # ===================== + if rename: + shutil.move(vcf_out_file, vcf_file) + return True + + +def vcf_writer(header, vcf_reader, vcf_out_file): + ''' + Write out the VCF file with corrected information + ''' + with open(vcf_out_file, 'w') as vcf_out: + for line in header[:-1]: + vcf_out.write(line) + vcf_reader.to_csv(vcf_out_file, sep='\t', + mode='a', index=False) + + +# ===================== +# Main +# ===================== +if __name__ == "__main__": + vcf_file = sys.argv[1] + vcf_out_file = sys.argv[2] + normal = sys.argv[3] + tumor = sys.argv[4] + reorder_column(vcf_file, vcf_out_file, normal, tumor) + diff --git a/bin/pta/split_annotations.py b/bin/pta/split_annotations.py new file mode 100644 index 00000000..2384330d --- /dev/null +++ b/bin/pta/split_annotations.py @@ -0,0 +1,11 @@ +import pandas as pd +import sys + +df = pd.read_csv(sys.argv[1], sep='\t') + 
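+# Expand the pipe-delimited annotation strings (the VEP CSQ field plus the CancerGeneCensus and CosmicResistanceMutation custom annotations) into one column per sub-field, then drop the packed source columns.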
+df[['Allele','Consequence','IMPACT','SYMBOL','Gene','Feature_type','Feature','BIOTYPE','EXON','INTRON','HGVSc','HGVSp','cDNA_position','CDS_position','Protein_position','Amino_acids','Codons','Existing_variation','DISTANCE','STRAND','FLAGS','SYMBOL_SOURCE','HGNC_ID','REFSEQ_MATCH','REFSEQ_OFFSET','SOURCE','HGVS_OFFSET','AF_1000G','AFR_AF_1000G','AMR_AF_1000G','EAS_AF_1000G','EUR_AF_1000G','SAS_AF_1000G','gnomADe_AF','gnomADe_AFR_AF','gnomADe_AMR_AF','gnomADe_ASJ_AF','gnomADe_EAS_AF','gnomADe_FIN_AF','gnomADe_NFE_AF','gnomADe_OTH_AF','gnomADe_SAS_AF','MAX_AF','MAX_AF_POPS','CLIN_SIG','SOMATIC','PHENO','ada_score','rf_score','MaxEntScan_alt','MaxEntScan_diff','MaxEntScan_ref','CADD_phred','FATHMM_pred','GERP++_RS','LRT_pred','MetaSVM_pred','MutationAssessor_pred','MutationTaster_pred','PROVEAN_pred','Polyphen2_HVAR_pred','PrimateAI_pred','REVEL_score','SIFT4G_pred','SIFT_pred','fathmm-MKL_coding_pred','phyloP100way_vertebrate','CosmicCoding','CosmicCoding_GENOMIC_ID','CosmicCoding_LEGACY_ID','CosmicCoding_CNT','CosmicCoding_CDS','CosmicCoding_AA','CosmicNonCoding','CosmicNonCoding_GENOMIC_ID','CosmicNonCoding_LEGACY_ID','CosmicNonCoding_CNT','CosmicNonCoding_CDS','CosmicNonCoding_AA','NYGC','NYGC_AF','NYGC_Samples','NYGC_AC_Het','NYGC_AC_Hom','CLN_Overlap','CLN_Overlap_CLIN_ID','CLN_Overlap_CLNSIG','CLN_Overlap_CLNREVSTAT','CLN_Overlap_CLNDN','CLN_Exact','CLN_Exact_CLIN_ID','CLN_Exact_CLNSIG','CLN_Exact_CLNREVSTAT','CLN_Exact_CLNDN','GnomadExomes','GnomadExomes_AF','GnomadExomes_nhomalt','GnomadGenomes','GnomadGenomes_AF','GnomadGenomes_nhomalt','CHD_GENES','CHD_GENES_GENE','CHD_EVOLVING','CHD_EVOLVING_GENE','chd_whitelist','chd_whitelist_END','INTRONIC','INTRONIC_INTRONIC','CLINVAR_INTRONIC','CLINVAR_INTRONIC_INTRONIC','mm','mm_GENE','mm_HGVSG','mm_MMCNT1','mm_MMCNT2','mm_MMCNT3','mm_MMID3','mm_MMURI3','SPLICEAI','SPLICEAI_DS_AG','SPLICEAI_DS_AL','SPLICEAI_DS_DG','SPLICEAI_DS_DL','PLI','PLI_pLI','PLI_mis_z','Domino','Domino_Domino_Score','AR','AR_AR_GENE','ACMG59','ACMG59_GENE','ACMG59_DISEASE','DIALS','DIALS_DIALS_GENE','PGx','PGx_pgx_rsid','IMMUNO','IMMUNO_IMMUNO_Gene','NEURO','NEURO_NEURO_Gene','CARDIO','CARDIO_CARDIO_Gene','N19','N19_NYGC_CUR','R19','R19_NYGC_REPORTED_SAMPLE','R19_NYGC_CLASS','R19_NYGC_DISEASE']] = df['CSQ'].str.split('|',expand=True) +df[['CancerGeneCensus_Tier','Hallmark','Somatic','Germline','Tumour_Types_Somatic','Tumour_Types_Germline','Cancer_Syndrome','Tissue_Type','Molecular_Genetics','Role_in_Cancer','Mutation_Types']] = df['CancerGeneCensus'].str.split('|',expand=True) +df[['MUTATION_ID','GENOMIC_MUTATION_ID','Drug_Name','CosmicResistanceMutation_Tier']] = df['CosmicResistanceMutation'].str.split('|',expand=True) +df.drop(['CSQ', 'CancerGeneCensus', 'CosmicResistanceMutation'], axis=1, inplace = True) + +df.to_csv(sys.argv[2], sep='\t', index = False) \ No newline at end of file diff --git a/bin/pta/split_mnv.py b/bin/pta/split_mnv.py new file mode 100644 index 00000000..c5d326a6 --- /dev/null +++ b/bin/pta/split_mnv.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +# USAGE: python split_mnv.py +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. 
The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.1 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def add_info_header(bcf_out, + id, + number, + type, + description): + ''' + Add new INFO field + ''' + bcf_out.header.info.add(id=id, + number=number, + type=type, + description=description) + return bcf_out + + +def get_type(record): + ''' + Fill in the type field. + ''' + if len(record.ref) == 1 and len(record.alts[0]) == 1: + type = 'SNV' + elif len(record.alts) > 1: + type = 'MULTI' + elif len(record.ref) == 1 and len(record.alts[0]) > 1 and record.ref[0] == record.alts[0][0]: + type = 'INS' + elif len(record.ref) > 1 and len(record.alts[0]) == 1 and record.ref[0] == record.alts[0][0]: + type = 'DEL' + else: + type = 'COMPLEX' + return type + + +def print_record(record, bcf_out): + ''' + Write out a vcf record. + ''' + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + + +def determine_anchor_base(record): + ''' + determine anchor base + ''' + # without anchor base + start_pos = 0 + # with anchor base + for alt in record.alts: + if alt[0] == record.ref[0]: + start_pos = 1 + return start_pos + + +def write_file(bcf_in, bcf_out, tool): + ''' + Split MNV records + ''' + splits = [] + for record in bcf_in.fetch(): + if len(record.alts) > 1: + log.error('VCF file must have only one ALT per line') + sys.exit(1) + if len(record.ref) > 1 \ + and len(record.ref) == len(record.alts[0]): + start_pos = determine_anchor_base(record) + mnv_id = '_'.join([tool, record.contig, str(record.pos), + record.ref, record.alts[0]]) + refs = list(record.ref) + alts = list(record.alts[0]) + orig_record = record.copy() + # print full record + record.pos = record.pos + start_pos + record.ref = record.ref[start_pos:] + record.alts = [record.alts[0][start_pos:]] + record.info['TYPE'] = 'MNV' + record.info['MNV_ID'] = [mnv_id] # change if never multi-allelic + print_record(record, bcf_out) + # print split records + for new_record in split_records(start_pos, refs, + orig_record, alts, + mnv_id): + print_record(new_record, bcf_out) + splits.append('|'.join([new_record.chrom, str(new_record.pos), new_record.ref, new_record.alts[0]])) + return set(splits) + + +def write_file_non_mnv(bcf_in, splits, bcf_out, tool): + ''' + Split MNV records + ''' + for record in bcf_in.fetch(): + if not (len(record.ref) > 1 \ + and len(record.ref) == len(record.alts[0])): + # print non-MNVs + id = '|'.join([record.chrom, str(record.pos), record.ref, record.alts[0]]) + if not id in splits: + record.info['TYPE'] = get_type(record) # change if multi-allelic + print_record(record, bcf_out) + else: + print('duplicate...', id) + return True + +def split_records(start_pos, refs, orig_record, alts, mnv_id): + ''' + print split records (skipping the first anchor base) + (if start_pos is 1) + ''' + for i in range(start_pos, len(refs)): + if refs[i] != alts[i]: + new_record = 
orig_record.copy() + new_record.ref = refs[i] + new_record.alts = [alts[i]] + new_record.pos = orig_record.pos + i + new_record.info['TYPE'] = 'SNV' # change if multi-allelic + new_record.info['MNV_ID'] = [mnv_id] # change if never multi-allelic + yield new_record + + +def main(): + ''' + Prepare the VCF file for merging by: + 1) Split MNV records to one line per nucleotide + 2) Skip any line that has an SNV called by an MNV for the same tool + ''' + # ========================== + # Input variables + # ========================== + vcf_file = sys.argv[1] + out = sys.argv[2] + tool = sys.argv[3] + + assert os.path.isfile(vcf_file), 'Failed to find caller VCF call file :' + vcf_file + # ========================== + # Run prep + # ========================== + bcf_in = read_vcf(vcf_file) + bcf_in = add_info_header(bcf_out=bcf_in, + id='MNV_ID', + number='.', + type='String', + description='ID of multi-nucleotide variant (MNV) that the SNV is part of') + bcf_in = add_info_header(bcf_out=bcf_in, + id='TYPE', + number='1', + type='String', + description='Variant type (SNV,INS,DEL,MNV,COMPLEX,MULTI)') + bcf_out = pysam.VariantFile(out, 'w', header=bcf_in.header) + splits = write_file(bcf_in, + bcf_out, + tool) + write_file_non_mnv(bcf_in, + splits, + bcf_out, + tool) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/vcf_filter.py b/bin/pta/vcf_filter.py new file mode 100644 index 00000000..f0fcd9a1 --- /dev/null +++ b/bin/pta/vcf_filter.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# USAGE: python vcf_filter.py GRM_FILE VCF_FILE OUT_FILE +# DESCRIPTION: +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. +# Version: 0.2 +# Author: Kanika Arora (karora@nygenome.org) and Jennifer M Shelton +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. + ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def add_filter_header(bcf_out, + id, + description): + ''' + Add new FILTER field + ''' + bcf_out.header.filters.add(id=id, + number=None, + type=None, + description=description) + return bcf_out + + +def test_af(record, alt_index, af=0.01): + ''' + Test if AF > 0.01 in one germline database. PASS + variants that don't have AF listed (for example + in records from mouse databases) + Pass variants that don't have sample columns (e.g. 
mouse variants from 00-All.normalized.vcf.gz) + ''' + samples = record.samples.keys() + none_count = 0 + if len(samples) > 0: + for sample_name in samples: + if not 'AF' in record.format.keys(): + return True + elif record.samples[sample_name]['AF'][alt_index]: + if record.samples[sample_name]['AF'][alt_index] > af: + return True + else: + none_count += 1 + if none_count < len(samples): + return False + return True + + +def is_germline(germ_in, record, alt, af=0.01): + ''' + Check if matching variant is in the GRM VCF. + ''' + for germ_record in germ_in.fetch(record.contig, record.pos - 1, record.pos): + if germ_record.ref == record.ref: + for alt_index, germ_alt in enumerate(germ_record.alts): + if test_af(germ_record, alt_index, af=af): + if germ_alt == alt: + return True + return False + + +def filter_vcf(bcf_in, bcf_out, germ_in): + ''' + Change filter column from PASS to GRM or + add GRM to filters. + ''' + for record in bcf_in.fetch(): + filters = record.filter.keys() + filter = False + for alt in record.alts: + if is_germline(germ_in, record, alt): + filter = True + if filter: + if len(filters) == 1 and filters[0] == 'PASS': + record.filter.clear() + record.filter.add('GRM') + exit_status = bcf_out.write(record) + if exit_status != 0: + print(exit_status) + + +def main(): + ''' + Change filter column from PASS to GRM or + add GRM to filters. + ''' + germ_file = sys.argv[1] + vcf_file = sys.argv[2] + out_file = sys.argv[3] + assert os.path.isfile(germ_file), 'Failed to find germline VCF call file :' + germ_file + assert os.path.isfile(vcf_file), 'Failed to find somatic VCF call file :' + vcf_file + germ_in = read_vcf(germ_file) + bcf_in = read_vcf(vcf_file) + bcf_in = add_filter_header(bcf_in, + id='GRM', + description='Known germline variant') + bcf_out = pysam.VariantFile(out_file, 'w', header=bcf_in.header) + filter_vcf(bcf_in, bcf_out, germ_in) + + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/pta/vcf_to_bed.py b/bin/pta/vcf_to_bed.py new file mode 100644 index 00000000..5068ea79 --- /dev/null +++ b/bin/pta/vcf_to_bed.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# USAGE: vcf_to_bed.py +# DESCRIPTION: Make bed file from VCF. +################################################################################ +##################### COPYRIGHT ################################################ +# New York Genome Center + +# SOFTWARE COPYRIGHT NOTICE AGREEMENT +# This software and its documentation are copyright (2018) by the New York +# Genome Center. All rights are reserved. This software is supplied without +# any warranty or guaranteed support whatsoever. The New York Genome Center +# cannot be responsible for its use, misuse, or functionality. + +# Version: 1.0 +# Author: Jennifer M Shelton, Andre Corvelo +##################### /COPYRIGHT ############################################### +################################################################################ +import sys +import os +import logging as log +import pysam +########################################################################## +############## Custom functions ############ +########################################################################## + +def read_vcf(vcf_file): + ''' + Read in annotated VCF file. 
+ ''' + bcf_in = pysam.VariantFile(vcf_file) # auto-detect input format + return bcf_in + + +def feed_vcf(bcf_in): + ''' + Generate relevant columns for BED. + Converts to 0-based intervals + ''' + for record in bcf_in.fetch(): + yield record.chrom, record.pos - 1, record.pos + len(record.alts[0]) - 1 + + +def make_bed(vcf_file): + ''' + Make BED file from the interval list + ''' + log.info('#######################################') + log.info('# Making bed...') + log.info('#######################################') + bcf_in = read_vcf(vcf_file) + for chrom, start, end in feed_vcf(bcf_in): + sys.stdout.write('\t'.join([chrom, str(start), str(end), '.']) + '\n') + log.info('#######################################') + log.info('# Done making bed.') + log.info('#######################################') + + +def main(): + ''' + Makes BED file from a VCF. + ''' + assert os.path.isfile(sys.argv[1]); 'Failed to open reference VCF file' + vcf_file = sys.argv[1] + make_bed(vcf_file) + +########################################################################## +##### Execute main unless script is simply imported ############ +##### for individual functions ############ +########################################################################## +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/bin/rna_fusion/compute_insert_size.py b/bin/rna_fusion/compute_insert_size.py new file mode 100644 index 00000000..a2dae74c --- /dev/null +++ b/bin/rna_fusion/compute_insert_size.py @@ -0,0 +1,22 @@ +import h5py +import numpy as np +import sys + +fn = sys.argv[1] +f = h5py.File(fn) +x = np.asarray(f['aux']['fld'], dtype='float64') +y = np.cumsum(x)/np.sum(x) +cutoff = np.argmax(y > .95) +# 95% CI insert size. See: https://github.com/pmelsted/pizzly/issues/45 +print(cutoff) + +# cutoff = np.argmax(y) +# max insert size. If needed. + +## Mean insert size. If needed. +# fn = sys.argv[1] +# f = h5py.File(fn) +# x = np.asarray(f['aux']['fld'], dtype='float64') +# t = np.arange(0,len(x),1) +# mean = np.sum(x*t)/np.sum(x) +# print(mean) diff --git a/bin/shared/bamtools/bamtools_filter_pe.json b/bin/shared/bamtools/bamtools_filter_pe.json new file mode 100755 index 00000000..323c186c --- /dev/null +++ b/bin/shared/bamtools/bamtools_filter_pe.json @@ -0,0 +1,18 @@ +{ + "filters" : [ + { "id" : "insert_min", + "insertSize" : ">=-2000" + }, + + { "id" : "insert_max", + "insertSize" : "<=2000" + }, + + { "id" : "mismatch", + "tag" : "NM:<=4" + } + ], + + "rule" : " insert_min & insert_max & mismatch " + +} diff --git a/bin/shared/bamtools/bamtools_filter_se.json b/bin/shared/bamtools/bamtools_filter_se.json new file mode 100755 index 00000000..0b21d3e9 --- /dev/null +++ b/bin/shared/bamtools/bamtools_filter_se.json @@ -0,0 +1,10 @@ +{ + "filters" : [ + { "id" : "mismatch", + "tag" : "NM:<=4" + } + ], + + "rule" : " mismatch " + +} diff --git a/bin/shared/extract_csv.nf b/bin/shared/extract_csv.nf new file mode 100644 index 00000000..0cb09b9f --- /dev/null +++ b/bin/shared/extract_csv.nf @@ -0,0 +1,77 @@ +// Function to extract information (meta data + file(s)) from csv file(s) +// https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L1084 +def extract_csv(csv_file) { + + // check that the sample sheet is not 1 line or less, because it'll skip all subsequent checks if so. 
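The body of extract_csv() that follows groups sample-sheet rows by sampleID and carries the per-sample row count (the number of lanes) along with each FASTQ pair. A plain-Python sketch of the same grouping idea may help when reading the channel operators; the field names (sampleID, lane, fastq_1, fastq_2) come from the function itself, but the sheet contents and file names below are invented and this snippet is not part of the pipeline.

```python
# Plain-Python sketch of the grouping performed by extract_csv() below,
# using an invented sample sheet with the fields the function reads.
import csv
import io
from collections import defaultdict

sheet = io.StringIO(
    "sampleID,lane,fastq_1,fastq_2\n"
    "S1,L001,S1_L001_R1.fastq.gz,S1_L001_R2.fastq.gz\n"
    "S1,L002,S1_L002_R1.fastq.gz,S1_L002_R2.fastq.gz\n"
    "S2,L001,S2_L001_R1.fastq.gz,S2_L001_R2.fastq.gz\n"
)

groups = defaultdict(list)
for row in csv.DictReader(sheet):
    groups[row['sampleID']].append(row)

for sample_id, rows in groups.items():
    for row in rows:
        meta = {
            'sampleID': sample_id,
            'lane': row.get('lane') or 'NA',  # mirrors the 'NA' fallback below
            'id': sample_id,
            'size': len(rows),                # number of lanes for the sample
        }
        print(meta, row['fastq_1'], row['fastq_2'])
```

In the Nextflow version the same result is produced with splitCsv, groupTuple, and transpose, and rows without a fastq_2 value are emitted as single-end tuples.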
+ file(csv_file).withReader('UTF-8') { reader -> + def line, numberOfLinesInSampleSheet = 0; + while ((line = reader.readLine()) != null) {numberOfLinesInSampleSheet++} + if (numberOfLinesInSampleSheet < 2) { + log.error "Samplesheet had less than two lines. The sample sheet must be a csv file with a header, so at least two lines." + System.exit(1) + } + } + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!(row.sampleID)){ + log.error "Missing field in csv file header. The csv file must have a field named 'sampleID'." + System.exit(1) + } + [row.sampleID.toString(), row] + }.groupTuple() + .map{ meta, rows -> + size = rows.size() + [rows, size] + }.transpose() + .map{ row, numLanes -> //from here do the usual thing for csv parsing + + def meta = [:] + + // Meta data to identify samplesheet + if (row.sampleID) meta.sampleID = row.sampleID.toString() + + // If no lane specified, lane is not considered + if (row.lane) meta.lane = row.lane.toString() + else meta.lane = 'NA' + + /* + NOTE: Additional metadata parsing could be added here. This function is a minimal implimentation of a csv parser. + */ + + meta.id = row.sampleID.toString() + /* + NOTE: Additional ID parsing could be added here. For example a concatenation of patient and sample, if those fields were added to the csv sheet. + */ + meta.size = size + // defines the number of lanes for each sample. + + // join meta to fastq + + if (row.fastq_2) { + + return [meta.id, meta, row.fastq_1, row.fastq_2] + + } else { + return [meta.id, meta, row.fastq_1] + + } + } +} + +/* + // Additional check of sample sheet: + // 1. Each row should specify a lane and the same combination of patient, sample and lane shouldn't be present in different rows. + // 2. The same sample shouldn't be listed for different patients. + def sample2patient = [:] + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!sample2patient.containsKey(row.sample.toString())) { + sample2patient[row.sample.toString()] = row.patient.toString() + } else if (sample2patient[row.sample.toString()] != row.patient.toString()) { + log.error('The sample "' + row.sample.toString() + '" is registered for both patient "' + row.patient.toString() + '" and "' + sample2patient[row.sample.toString()] + '" in the sample sheet.') + System.exit(1) + } + } +*/ \ No newline at end of file diff --git a/bin/shared/multiqc/JAX_logo_rgb_transparentback.png b/bin/shared/multiqc/JAX_logo_rgb_transparentback.png new file mode 100644 index 00000000..9ea8a5cf Binary files /dev/null and b/bin/shared/multiqc/JAX_logo_rgb_transparentback.png differ diff --git a/bin/shared/multiqc/amplicon_multiqc.yaml b/bin/shared/multiqc/amplicon_multiqc.yaml new file mode 100644 index 00000000..a38f1076 --- /dev/null +++ b/bin/shared/multiqc/amplicon_multiqc.yaml @@ -0,0 +1,30 @@ +title: "Amplicon QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
+ +export_plots: true + +module_order: + - cutadapt + - fastqc + - primerclip + - gatk + - coverage_metrics + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_paired" + - "_001" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/atac_multiqc.yaml b/bin/shared/multiqc/atac_multiqc.yaml new file mode 100644 index 00000000..a427165d --- /dev/null +++ b/bin/shared/multiqc/atac_multiqc.yaml @@ -0,0 +1,94 @@ +title: "ATAC-Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - fastqc + - cutadapt + - bowtie2 + - picard + - custom_content: + - fraglen_plot + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +table_columns_placement: + PCR_statistics: + NRF: 1010 + PBC1: 1020 + PBC2: 1030 + MT_content: + 'Perc mtDNA': 1040 + FRiP: + FRiP: 1050 + +extra_fn_clean_exts: + - "_bowtie2" + - "_R2" + +custom_data: + PCR_statistics: + file_format: 'tsv' + plot_type: 'generalstats' + pconfig: + - NRF: + description: 'Non Redundant Fraction' + scale: False + - PBC1: + description: 'PCR Bottlenecking Coefficient 1' + scale: False + - PBC2: + description: 'PCR Bottlenecking Coefficient 2' + scale: False + format: '{:,.2f}' + MT_content: + file_format: 'tsv' + plot_type: 'generalstats' + pconfig: + - 'Perc mtDNA': + description: 'Percent mtDNA' + scale: False + suffix: "%" + FRiP: + file_format: 'tsv' + plot_type: 'generalstats' + pconfig: + - FRiP: + description: 'Fraction of Reads in Peak' + scale: False + - 'Filtered Reads': + description: 'Total Filtered Reads' + scale: False + format: '{:,.0f}' + fraglen_plot: + file_format: "tsv" + section_name: "Fragment Length" + description: "This plot comes from files acommpanied by a mutliqc_config.yaml file for configuration" + plot_type: "linegraph" + pconfig: + id: "example_coverage_lineplot" + title: "Fragment Length Plot" + ylab: "Read Count" + xlab: "Insert Size (bp)" + + +sp: + PCR_statistics: + fn: '*.pbc.qc' + MT_content: + fn: '*mtDNA_Content.txt' + FRiP: + fn: '*Fraction_reads_in_peak.txt' + fraglen_plot: + fn: '*spline_table.txt' + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/chipseq.yaml b/bin/shared/multiqc/chipseq.yaml new file mode 100644 index 00000000..0fd756a6 --- /dev/null +++ b/bin/shared/multiqc/chipseq.yaml @@ -0,0 +1,173 @@ +report_comment: > + This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
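The NRF, PBC1, and PBC2 columns that the ATAC report configuration above adds to the MultiQC general-statistics table are the standard ENCODE library-complexity metrics. The sketch below only restates their usual definitions on toy data; it is not the pipeline's implementation, which produces the *.pbc.qc files upstream.

```python
# Illustrative definitions (ENCODE conventions) of the library-complexity
# metrics surfaced in the ATAC MultiQC report: NRF, PBC1 and PBC2. Toy data.
from collections import Counter

# One entry per read: the (chrom, start, strand) position it maps to.
positions = [
    ('chr1', 100, '+'), ('chr1', 100, '+'),   # a duplicated position
    ('chr1', 250, '-'),
    ('chr2', 500, '+'),
]

counts = Counter(positions)
total_reads = len(positions)
distinct = len(counts)                                # positions seen at least once
one_read = sum(1 for c in counts.values() if c == 1)  # positions seen exactly once
two_read = sum(1 for c in counts.values() if c == 2)  # positions seen exactly twice

nrf = distinct / total_reads                               # Non-Redundant Fraction
pbc1 = one_read / distinct                                 # PCR Bottlenecking Coefficient 1
pbc2 = one_read / two_read if two_read else float('inf')  # PCR Bottlenecking Coefficient 2

print(f"NRF={nrf:.2f} PBC1={pbc1:.2f} PBC2={pbc2:.2f}")
```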
+ +export_plots: true + +run_modules: + - custom_content + - fastqc + - cutadapt + - samtools + - picard + - preseq + - featureCounts + - deeptools + - phantompeakqualtools + +exclude_modules: + - "general_stats" + +module_order: + - fastqc: + name: "LIB: FastQC (raw)" + info: "This section of the report shows FastQC results before adapter trimming for individual libraries." + path_filters: + - "./fastqc/*.zip" + - cutadapt: + name: "LIB: cutadapt (trimmed)" + info: "This section of the report shows the length of trimmed reads by cutadapt for individual libraries." + - fastqc: + name: "LIB: FastQC (trimmed)" + info: "This section of the report shows FastQC results after adapter trimming for individual libraries." + path_filters: + - "./trimgalore/fastqc/*.zip" + - samtools: + name: "LIB: SAMTools" + info: "This section of the report shows SAMTools results for individual libraries." + path_filters: + - "./alignment/library/*" + - samtools: + name: "MERGED LIB: SAMTools (unfiltered)" + info: "This section of the report shows SAMTools results after merging libraries and before filtering." + path_filters: + - "./alignment/mergedLibrary/unfiltered/*.mLb.mkD.sorted.bam*" + - picard: + name: "MERGED LIB: Picard (unfiltered)" + info: "This section of the report shows picard results after merging libraries and before filtering." + path_filters: + - "./alignment/mergedLibrary/unfiltered/picard_metrics/*" + - preseq: + name: "MERGED LIB: Preseq (unfiltered)" + info: "This section of the report shows Preseq results after merging libraries and before filtering." + - samtools: + name: "MERGED LIB: SAMTools (filtered)" + info: "This section of the report shows SAMTools results after merging libraries and after filtering." + path_filters: + - "./alignment/mergedLibrary/filtered/*.mLb.clN.sorted.bam*" + - picard: + name: "MERGED LIB: Picard (filtered)" + info: "This section of the report shows picard results after merging libraries and after filtering." + path_filters: + - "./alignment/mergedLibrary/filtered/picard_metrics/*" + - deeptools: + name: "MERGED LIB: deepTools" + anchor: "mlib_deeptools" + info: "This section of the report shows ChIP-seq QC plots generated by deepTools." + - featureCounts: + name: "MERGED LIB: featureCounts" + anchor: "mlib_featurecounts" + info: "This section of the report shows featureCounts results for the number of reads assigned to merged library consensus peaks." 
+ path_filters: + - "./macs2/featurecounts/*.summary" + +report_section_order: + peak_count: + before: mlib_deeptools + frip_score: + before: peak_count + peak_annotation: + before: frip_score + strand_shift_correlation: + before: peak_annotation + nsc_coefficient: + before: strand_shift_correlation + rsc_coefficient: + before: nsc_coefficient + mlib_featurecounts: + before: rsc_coefficient + deseq2_pca_1: + order: -1600 + deseq2_pca_2: + order: -1700 + deseq2_pca_3: + order: -1800 + deseq2_pca_4: + order: -1900 + deseq2_pca_5: + order: -2000 + deseq2_pca_6: + order: -2100 + deseq2_pca_7: + order: -2200 + deseq2_pca_8: + order: -2300 + deseq2_pca_9: + order: -2400 + deseq2_pca_10: + order: -2500 + deseq2_clustering_1: + order: -2600 + deseq2_clustering_2: + order: -2700 + deseq2_clustering_3: + order: -2800 + deseq2_clustering_4: + order: -2900 + deseq2_clustering_5: + order: -3000 + deseq2_clustering_6: + order: -3100 + deseq2_clustering_7: + order: -3200 + deseq2_clustering_8: + order: -3300 + deseq2_clustering_9: + order: -3400 + deseq2_clustering_10: + order: -3500 + software_versions: + order: -3600 + nf-core-chipseq-summary: + order: -3700 + +custom_plot_config: + picard_insert_size: + cpswitch_c_active: False + smooth_points: 1000 + featurecounts: + cpswitch_c_active: False + +extra_fn_clean_exts: + - "fastq.gz" + - "_trimmed" + - "_val" + - "sorted.bam" + - ".Lb" + - "mkD" + - "clN" + - "mLb" + - "_peaks" + - ".FRiP" + - ".peak" + - "_spp" + - ".spp" + - "lc_extrap" + +# # Customise the module search patterns to speed up execution time +# # - Skip module sub-tools that we are not interested in +# # - Replace file-content searching with filename pattern searching +# # - Don't add anything that is the same as the MultiQC default +# # See https://multiqc.info/docs/#optimise-file-search-patterns for details +sp: + cutadapt: + fn: "*trimming_report.txt" + preseq: + fn: "*.lc_extrap.txt" + deeptools/plotFingerprintOutRawCounts: + fn: "*plotFingerprint*" + deeptools/plotProfile: + fn: "*plotProfile*" + phantompeakqualtools/out: + fn: "*.spp.out" diff --git a/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt b/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt new file mode 100644 index 00000000..f7bb33d8 --- /dev/null +++ b/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt @@ -0,0 +1,12 @@ +#id: 'deseq2_clustering' +#section_name: 'MERGED LIB: DESeq2 sample similarity' +#description: "Matrix is generated from clustering with Euclidean distances between +# DESeq2 +# rlog values for each sample +# in the deseq2_qc.r script." +#plot_type: 'heatmap' +#anchor: 'deseq2_clustering' +#pconfig: +# title: 'DESeq2: Heatmap of the sample-to-sample distances' +# xlab: True +# reverseColors: True diff --git a/bin/shared/multiqc/chipseq/deseq2_pca_header.txt b/bin/shared/multiqc/chipseq/deseq2_pca_header.txt new file mode 100644 index 00000000..250c1cb7 --- /dev/null +++ b/bin/shared/multiqc/chipseq/deseq2_pca_header.txt @@ -0,0 +1,11 @@ +#id: 'deseq2_pca' +#section_name: 'MERGED LIB: DESeq2 PCA plot' +#description: "PCA plot of the samples in the experiment. +# These values are calculated using DESeq2 +# in the deseq2_qc.r script." 
+#plot_type: 'scatter' +#anchor: 'deseq2_pca' +#pconfig: +# title: 'DESeq2: Principal component plot' +# xlab: PC1 +# ylab: PC2 diff --git a/bin/shared/multiqc/chipseq/frip_score_header.txt b/bin/shared/multiqc/chipseq/frip_score_header.txt new file mode 100644 index 00000000..82902115 --- /dev/null +++ b/bin/shared/multiqc/chipseq/frip_score_header.txt @@ -0,0 +1,13 @@ +#id: 'frip_score' +#section_name: 'MERGED LIB: MACS2 FRiP score' +#description: "is generated by calculating the fraction of all mapped reads that fall +# into the MACS2 called peak regions. A read must overlap a peak by at least 20% to be counted. +# See FRiP score." +#plot_type: 'bargraph' +#anchor: 'frip_score' +#pconfig: +# title: 'FRiP score' +# ylab: 'FRiP score' +# ymax: 1 +# ymin: 0 +# tt_decimals: 2 diff --git a/bin/shared/multiqc/chipseq/peak_annotation_header.txt b/bin/shared/multiqc/chipseq/peak_annotation_header.txt new file mode 100644 index 00000000..2b3ee938 --- /dev/null +++ b/bin/shared/multiqc/chipseq/peak_annotation_header.txt @@ -0,0 +1,9 @@ +#id: 'peak_annotation' +#section_name: 'MERGED LIB: HOMER peak annotation' +#description: "is generated by calculating the proportion of peaks assigned to genomic features by +# HOMER annotatePeaks.pl." +#plot_type: 'bargraph' +#anchor: 'peak_annotation' +#pconfig: +# title: 'Peak to feature proportion' +# ylab: 'Peak count' diff --git a/bin/shared/multiqc/chipseq/peak_count_header.txt b/bin/shared/multiqc/chipseq/peak_count_header.txt new file mode 100644 index 00000000..aa4dd346 --- /dev/null +++ b/bin/shared/multiqc/chipseq/peak_count_header.txt @@ -0,0 +1,9 @@ +#id: 'peak_count' +#section_name: 'MERGED LIB: MACS2 peak count' +#description: "is calculated from total number of peaks called by +# MACS2" +#plot_type: 'bargraph' +#anchor: 'peak_count' +#pconfig: +# title: 'Total peak count' +# ylab: 'Peak count' diff --git a/bin/shared/multiqc/chipseq/spp_correlation_header.txt b/bin/shared/multiqc/chipseq/spp_correlation_header.txt new file mode 100644 index 00000000..ad571563 --- /dev/null +++ b/bin/shared/multiqc/chipseq/spp_correlation_header.txt @@ -0,0 +1,12 @@ +#id: 'strand_shift_correlation' +#section_name: 'MERGED LIB: spp strand-shift correlation' +#description: "generated using run_spp.R script from +# phantompeakqualtools." +#plot_type: 'linegraph' +#anchor: 'strand_shift_correlation' +#pconfig: +# title: 'Strand-shift correlation plot' +# ylab: 'Cross-correlation' +# xlab: 'Strand-shift (bp)' +# xDecimals: False +# tt_label: 'Strand-shift (bp) {point.x}: {point.y:.2f} Cross-correlation' diff --git a/bin/shared/multiqc/chipseq/spp_nsc_header.txt b/bin/shared/multiqc/chipseq/spp_nsc_header.txt new file mode 100644 index 00000000..43370f32 --- /dev/null +++ b/bin/shared/multiqc/chipseq/spp_nsc_header.txt @@ -0,0 +1,11 @@ +#id: 'nsc_coefficient' +#section_name: 'MERGED LIB: spp NSC coefficient' +#description: "generated using run_spp.R script from +# phantompeakqualtools." +#plot_type: 'bargraph' +#anchor: 'nsc_coefficient' +#pconfig: +# title: 'Normalized strand cross-correlation coefficient' +# ylab: 'NSC coefficient' +# ymin: 1 +# tt_decimals: 1 diff --git a/bin/shared/multiqc/chipseq/spp_rsc_header.txt b/bin/shared/multiqc/chipseq/spp_rsc_header.txt new file mode 100644 index 00000000..bab5e09b --- /dev/null +++ b/bin/shared/multiqc/chipseq/spp_rsc_header.txt @@ -0,0 +1,11 @@ +#id: 'rsc_coefficient' +#section_name: 'MERGED LIB: spp RSC coefficient' +#description: "generated using run_spp.R script from +# phantompeakqualtools." 
+#plot_type: 'bargraph' +#anchor: 'rsc_coefficient' +#pconfig: +# title: 'Relative strand cross-correlation coefficient' +# ylab: 'RSC coefficient' +# ymin: 0 +# tt_decimals: 1 diff --git a/bin/shared/multiqc/pdx_wes_multiqc.yaml b/bin/shared/multiqc/pdx_wes_multiqc.yaml new file mode 100644 index 00000000..7a9750bd --- /dev/null +++ b/bin/shared/multiqc/pdx_wes_multiqc.yaml @@ -0,0 +1,30 @@ +title: "PDX Whole Exome Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - xenome + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/pta_multiqc.yaml b/bin/shared/multiqc/pta_multiqc.yaml new file mode 100644 index 00000000..076ead18 --- /dev/null +++ b/bin/shared/multiqc/pta_multiqc.yaml @@ -0,0 +1,35 @@ +title: "Paired Tumor Analysis QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - xenome + - conpair + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - "_concordance" + - "_contamination" + - ".final_sorted" + - ".R1" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/rna_fusion_multiqc.yaml b/bin/shared/multiqc/rna_fusion_multiqc.yaml new file mode 100644 index 00000000..3ba15cca --- /dev/null +++ b/bin/shared/multiqc/rna_fusion_multiqc.yaml @@ -0,0 +1,24 @@ +title: "RNA Fusion QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
+ +export_plots: true + +module_order: + - fastqc + - xenome + - custom_content + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/rnaseq_multiqc.yaml b/bin/shared/multiqc/rnaseq_multiqc.yaml new file mode 100644 index 00000000..dc8f8c08 --- /dev/null +++ b/bin/shared/multiqc/rnaseq_multiqc.yaml @@ -0,0 +1,25 @@ +title: "RNA-Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - fastqc + - xenome + - rsem + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/rrbs_multiqc.yaml b/bin/shared/multiqc/rrbs_multiqc.yaml new file mode 100644 index 00000000..5f2328e3 --- /dev/null +++ b/bin/shared/multiqc/rrbs_multiqc.yaml @@ -0,0 +1,15 @@ +title: "RRBS QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - fastqc + - cutadapt + - bismark + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/wes_multiqc.yaml b/bin/shared/multiqc/wes_multiqc.yaml new file mode 100644 index 00000000..c916444d --- /dev/null +++ b/bin/shared/multiqc/wes_multiqc.yaml @@ -0,0 +1,30 @@ +title: "Whole Exome Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. + +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - "_dedup" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/multiqc/wgs_multiqc.yaml b/bin/shared/multiqc/wgs_multiqc.yaml new file mode 100644 index 00000000..2efc95cf --- /dev/null +++ b/bin/shared/multiqc/wgs_multiqc.yaml @@ -0,0 +1,30 @@ +title: "Whole Genome Seq QC Summary Report" +intro_text: This report has been generated by the JAX NGS Operations Nextflow DSL2 Pipelines + analysis pipeline. For information about how to interpret these results, please see the + wiki documentation. 
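Several of the report configurations in this diff (WES, WGS, PDX WES, PTA) list pipeline-specific suffixes such as "_sortsam" and "_realigned_BQSR" under extra_fn_clean_exts. MultiQC trims these when deriving sample names from file names, so metrics produced at different steps collapse onto a single row per sample. The snippet below only illustrates that intent with a simplified stripping rule; MultiQC's real cleaning logic is more involved.

```python
# Simplified illustration (not MultiQC code) of why step-specific suffixes are
# listed in extra_fn_clean_exts: stripping them lets outputs from different
# steps share one sample name in the report.
SUFFIXES = ['_sortsam', '_realigned_BQSR', '_FilterTrim', '_dedup']

def clean_sample_name(filename: str) -> str:
    name = filename.split('.')[0]          # drop the extension(s)
    for suffix in SUFFIXES:
        if name.endswith(suffix):
            name = name[: -len(suffix)]
    return name

# Both files end up attributed to the same sample, 'sampleA'.
print(clean_sample_name('sampleA_realigned_BQSR.metrics.txt'))
print(clean_sample_name('sampleA_dedup.bam'))
```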
+ +export_plots: true + +module_order: + - jax_trimmer + - fastqc + - gatk + - picard + +table_columns_visible: + FastQC: + percent_duplicates: False + percent_gc: False + total_sequences: False + +extra_fn_clean_exts: + - "_sortsam" + - "_realigned_BQSR" + - "_FilterTrim" + - "_dedup" + - type: "truncate" + pattern: ".fastq" + +custom_logo: "./JAX_logo_rgb_transparentback.png" +custom_logo_url: "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" +custom_logo_title: "The Jackson Laboratory - NGS Operations Nextflow DSL2 Pipelines" diff --git a/bin/shared/read_group_from_fastq.py b/bin/shared/read_group_from_fastq.py index 702779d3..5d198620 100644 --- a/bin/shared/read_group_from_fastq.py +++ b/bin/shared/read_group_from_fastq.py @@ -39,6 +39,8 @@ def parse_args(): help="Sample is tumor in a tumor/normal pair") parser.add_argument('-n', '--normal', action='store_true', help="Sample is normal in a tumor/normal pair") + parser.add_argument('-s', '--sample_id', dest="sample_id", + help="SampleID of file") parser.add_argument('-o', '--output', dest="output_file", help="Output file name [STDOUT]") parser.add_argument('fastq', nargs="+", @@ -115,15 +117,17 @@ def main(): pos = n break if pos == -1: - # Didn't find the GES marker. Use the filename up to the end name. - match = re.search('(.*)[._]R[12]_.*',fn) - if match is not None: - fn = match.group(1) - else: - # something is seriously odd here, but we'll just use the - # whole filename - pass - + if args.sample_id: + fn = args.sample_id + else: + # Didn't find the GES marker. Use the filename up to the end name. + match = re.search('(.*)[._]R[12]_.*',fn) + if match is not None: + fn = match.group(1) + else: + # something is seriously odd here, but we'll just use the + # whole filename + pass cust_id = ges_id = fn else: cust_id = '_'.join(fn_parts[:pos]) diff --git a/config/amplicon.config b/config/amplicon.config new file mode 100644 index 00000000..249272ac --- /dev/null +++ b/config/amplicon.config @@ -0,0 +1,50 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "amplicon" + description = 'Pipeline for Processing xGEN amplicon panel data' + author = 'Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson Copyright Jackson Laboratory 2021' +} + +params { + // Shared params + gen_org = 'human' + extension='.fastq.gz' + pattern="*_R{1,2}*" + read_type = 'PE' // SE + sample_folder = null + concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/amplicon_multiqc.yaml" + + cutadaptMinLength = 20 + cutadaptQualCutoff = 20 + cutadaptAdapterR1 = 'CTGTCTCTTATACACATCTCCGAGCCCACGAGAC' + cutadaptAdapterR2 = 'CTGTCTCTTATACACATCTGACGCTGCCGACGA' + + + ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + mismatch_penalty = "-B 8" + + masterfile = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_masterfile.txt' + + amplicon_primer_intervals = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_primers.interval_list' + amplicon_target_intervals = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_merged_targets.interval_list' + + gold_std_indels = 
'/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' + phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' + + ploidy_val = '-ploidy 2' // variable in haplotypecaller. not required for amplicon, but present in module. + target_gatk = '/projects/compsci/omics_share/human/GRCh38/supporting_files/capture_kit_files/IDT/xGen_sampleID_amplicon/hg38Lifted_xGen_SampleID_merged_targets.bed' + params.call_val = "50.0" + + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' + dbSNP_index = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' + + tmpdir = "/fastscratch/${USER}" + bwa_min_score = null +} \ No newline at end of file diff --git a/config/atac.config b/config/atac.config index 6d9c49e2..3937214b 100644 --- a/config/atac.config +++ b/config/atac.config @@ -3,7 +3,7 @@ manifest { name = "atac" description = 'Pipeline for ATAC Seq Samples' - author = 'Sai Lek, Copyright Jackson Laboratory 2022' + author = 'Sai Lek, Michael Lloyd, Anuj Srivastava, Copyright Jackson Laboratory 2022' version = "0.1.0" } @@ -13,10 +13,16 @@ manifest { params { // Shared params gen_org = 'mouse' + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // 'SE' concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/atac_multiqc.yaml" // Reference bowtie2Index = '/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bowtie2/Mus_musculus.GRCm38.dna.primary_assembly.fa' @@ -39,7 +45,15 @@ params { if (params.gen_org=='human'){ - // Reference + // Reference + params.genome_build = 'GRCh38' params.bowtie2Index = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bowtie2/hg38_noalt' } +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + // Reference + params.bowtie2Index = '/projects/compsci/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bowtie2/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.effective_genome_size = '2654621783' + +} diff --git a/config/chipseq.config b/config/chipseq.config new file mode 100644 index 00000000..2476d8e0 --- /dev/null +++ b/config/chipseq.config @@ -0,0 +1,118 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "chipseq" + description = 'Pipeline for ChIP-Seq Samples. 
Adapted from: https://nf-co.re/chipseq, which is available under MIT License' + author = 'Sai Lek, Copyright Jackson Laboratory 2022' + version = "0.1.0" +} + + +// Default to Mouse, If gen_org == 'human' parameters are overwritten with values +// in the "Defaults for Human" section below + +params { + // Shared params + gen_org = 'mouse' // human + read_type = 'PE' // 'SE' + genome_build = 'GRCm38' // 'GRCm38' or 'GRCm39' + + // Reference fasta + ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.fa' + ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.primary_assembly.fa' + + // GTF & BED annotation + gtf = '/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.gtf' + gene_bed = '/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.bed' + + // Global default params, used in configs + + // Options: Generic + input = '' + fragment_size = 200 + fingerprint_bins = 500000 + + // Mac2 Effective Genome Size - based on GRCm38 ensembl primary_assembly chroms and MT only + macs_gsize = 2725537669 + + // Blacklist regions: + blacklist = "" + + // Trim-Galore settings. + trimLength = '30' + qualThreshold = '30' + adapOverlap = '1' + adaptorSeq = 'AGATCGGAAGAGC' + + // bwa parameters + mismatch_penalty = "" + bwa_min_score = false + + // samtools merge bam filter parameters + keep_dups = false + keep_multi_map = false + + // bamtools filter + bamtools_filter_pe_config = "$projectDir/bin/shared/bamtools/bamtools_filter_pe.json" + bamtools_filter_se_config = "$projectDir/bin/shared/bamtools/bamtools_filter_se.json" + + // preseq paramters + skip_preseq = false + + // Options: Peaks + narrow_peak = false + broad_cutoff = 0.05 + macs_fdr = false + macs_pvalue = false + min_reps_consensus = 1 + save_macs_pileup = false + skip_peak_qc = false + skip_peak_annotation = false + skip_consensus_peaks = false + + // Options: Differential analysis + deseq2_vst = false + skip_diff_analysis = false + + // MultiQC + multiqc_config = "${projectDir}/bin/shared/multiqc/chipseq.yaml" + + tmpdir = "/fastscratch/${USER}" + extension = null // not used in this workflow + pattern = null // not used in this workflow + concat_lanes = false // not used in this workflow + non_directional = '' // not used in this workflow + +} + +if (params.gen_org=='human'){ + + params.genome_build = 'GRCh38' + + // Mac2 Effective Genome Size - based on GRCh38 GATK assembly chroms and MT only + params.macs_gsize = 3088286401 + + // Reference fasta + params.ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + + // GTF & BED annotation + params.gtf = '/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.GATKchrom.gtf' + params.gene_bed = '/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.GATKchrom.bed' +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + // Mac2 Effective Genome Size - based on GRCm38 ensembl primary_assembly chroms and MT only + params.macs_gsize = 2723431143 + + // Reference fasta + params.ref_fa = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.ref_fa_indices = 
'/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bwa/Mus_musculus.GRCm39.dna.primary_assembly.fa' + + // GTF & BED annotation + gtf = '/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.gtf' + gene_bed = '/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.bed' + +} \ No newline at end of file diff --git a/config/pdx_wes.config b/config/pdx_wes.config new file mode 100644 index 00000000..83ae6f88 --- /dev/null +++ b/config/pdx_wes.config @@ -0,0 +1,63 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "pdx_wes" + description = 'Pipeline for Processing PDX Whole Exome Samples' + author = 'Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson Copyright Jackson Laboratory 2021' +} + +// Default to Mouse, If gen_org == 'human' parameters are overwritten with values +// in the "Defaults for Human" section below + +params { + // Shared params + gen_org = 'human' // human + extension='.fastq.gz' + pattern="*_R{1,2}*" + read_type = 'PE' // SE + sample_folder = null + concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/pdx_wes_multiqc.yaml" + + // Reference fasta + ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + + // Quality Stats params + min_pct_hq_reads = '0.0' + hq_pct = '70' + + // Xenome index + xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // WES capture array BED and GATK intervals lists + target_gatk = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed' + target_picard = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.picard.interval_list' + bait_picard = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.picard.interval_list' + + // Variant calling parameters + mismatch_penalty = "-B 8" + call_val = "50.0" + + gnomad_ref='/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/af-only-gnomad.hg38.vcf.gz' + pon_ref='/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/1000g_pon.hg38.vcf.gz' + + msisensor_model='/projects/compsci/omics_share/human/GRCh38/supporting_files/msisensor2/models_hg38' + + // VCF annotation + gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' + phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' + dbSNP_index = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' + dbNSFP='/projects/omics_share/human/GRCh38/genome/annotation/function/dbNSFP4.2a.gatk_formatted.txt.gz' + cosmic = '/projects/omics_share/human/GRCh38/genome/annotation/function/COSMICv95_Coding_Noncoding.gatk_formatted.vcf.gz' + cosmic_index = '/projects/omics_share/human/GRCh38/genome/annotation/function/COSMICv95_Coding_Noncoding.gatk_formatted.vcf.gz.tbi' + gen_ver = "hg38" + snpEff_config = 
'/projects/omics_share/human/GRCh38/genome/indices/snpEff_5_1/snpEff.config' + + tmpdir = "/fastscratch/${USER}" + bwa_min_score = null +} \ No newline at end of file diff --git a/config/profiles/elion.config b/config/profiles/elion.config index 3eae133d..dc1c7c05 100644 --- a/config/profiles/elion.config +++ b/config/profiles/elion.config @@ -14,18 +14,16 @@ process { } executor { - $slurm { - queueSize = 250 - // The number of tasks the executor will handle in a parallel manner - submitRateLimit = '1 / 2 s' - // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. - } + name = 'slurm' + // The number of tasks the executor will handle in a parallel manner + queueSize = 100 + submitRateLimit = '1 / 2 s' + // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. } env { NXF_ANSI_SUMMARY = true NXF_ANSI_LOG = true - NXF_DEBUG = 2 } trace { diff --git a/config/profiles/sumner.config b/config/profiles/sumner.config index bce233e0..6135db21 100644 --- a/config/profiles/sumner.config +++ b/config/profiles/sumner.config @@ -14,18 +14,16 @@ process { } executor { - $slurm { - queueSize = 250 - // The number of tasks the executor will handle in a parallel manner - submitRateLimit = '1 / 2 s' - // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. - } + name = 'slurm' + // The number of tasks the executor will handle in a parallel manner + queueSize = 150 + submitRateLimit = '1 / 2 s' + // Determines the max rate of job submission per time unit, for example '10sec' eg. max 10 jobs per second or '1/2 s' i.e. 1 job submissions every 2 seconds. } env { NXF_ANSI_SUMMARY = true NXF_ANSI_LOG = true - NXF_DEBUG = 2 } trace { diff --git a/config/pta.config b/config/pta.config new file mode 100644 index 00000000..6ff86dee --- /dev/null +++ b/config/pta.config @@ -0,0 +1,115 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "Patient Tumor Analysis" + description = 'Pipeline for processing of germline and somatic SNP/InDEL and somatic structural variants and copy number alterations.' + author = 'Anuj Srivastava, Michael Lloyd, Brian Sanderson, Harshpreet Chandok, Peter Fields, Copyright Jackson Laboratory 2023' +} + +params { + // PDX sample: + pdx = false + + multiqc_config = "${projectDir}/bin/shared/multiqc/pta_multiqc.yaml" + + // Quality Stats params + + min_pct_hq_reads = '0.0' + hq_pct = '70' + + // Xenome index + xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // Reference fasta + ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' + ref_fa_indices = '/projects/compsci/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' + ref_fa_dict = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.dict' + combined_reference_set = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/combined_ref_set/Homo_sapiens_assembly38.fasta' // Several tools (GRIDSS, SVABA) requires reference and bwa index files in same directory. Links used within this directory to avoid duplication. See note in directory. 
+ + // BWA params + mismatch_penalty = "-B 8" + + // Known Sites for BQSR + gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' // used in variant recal, and variant tranche recal. GATK resource bundle. + phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' // used in variant recal, and variant tranche recal. GATK resource bundle. + dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' // used in annotation, variant recal, variant tranche recal, and by SVABA. + dbSNP_index = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz.tbi' + + // Chromosome contig lists, used in scatter / gather operations. + chrom_contigs = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.primaryChr.contig_list' // Contig list used for scatter / gather in calling and annotation. + chrom_intervals = '/projects/omics_share/human/GRCh38/genome/annotation/intervals/hg38_calling_intervals/' // Chromosome intervals used for scatter gather in calling. + + + // Germline Haplotypecaller and downstream filtering. + call_val = 50.0 + ploidy_val = "-ploidy 2" + excludeIntervalList = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/hg38_haplotypeCaller_skip.interval_list' // Germline caller exclusion list. + hapmap = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/hapmap_3.3.hg38.vcf.gz' // variant tranche requirement. GATK resource bundle. + omni = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_omni2.5.hg38.vcf.gz' // variant tranche requirement. GATK resource bundle. + + // Somatic SNP/InDEL filtering + pon_bed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/WGS_1000g_GRCh38.pon.bed' // used in snp/indel filtering. + intervalListBed='/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/SureSelect_V6plusCOSMIC.target.GRCh38_full_analysis_set_plus_decoy_hla.interval_list.bed' // used to extract non-exonic regions, to attempt recovery with Lancet calls. + + // Lancet: + lancet_beds_directory = '/projects/omics_share/human/GRCh38/genome/annotation/intervals/lancet_chr_beds/' // Lancet requirement + + // Bicseq2 + mappability_directory = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/mappability' // Bicseq2 requirement. + bicseq2_chromList = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/configs/sampleId.bicseq2.config' // bicseq2 requirement + bicseq2_no_scaling = false + + // Gridss and Gripss (filtering) + germline_filtering_vcf = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/filtering/gnomad-and-ALL_GRCh38_sites.20170504.normalized.modified.PASS.vcf.gz' // used in gridss call filtering. + gripss_pon = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/gripss_pon' // gripss requirement + + // Manta + callRegions = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/intervals/GRCh38.callregions.bed.gz' // manta requirement. + + // Strelka + strelka_config = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/configs/configureStrelkaSomaticWorkflow.py.ini' // strelka requirement. 
+ + // MSIsensor2 + msisensor_model='/projects/compsci/omics_share/human/GRCh38/supporting_files/msisensor2/models_hg38' // model files for MSIsensor2 + + // Annotations: + // VEP + vep_cache_directory = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/vep_data' // VEP annotation cache. Note this directory contains additional annotation cache files. + vep_fasta = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/ensembl/GRCh38.p13/Homo_sapiens.GRCh38.dna.primary_assembly.fa' // VEP is ensembl based, and requires a separate reference file. + + // Cosmic. + cosmic_cgc = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/function/cancer_gene_census_v97.csv' + cosmic_cancer_resistance_muts = '/projects/compsci/omics_share/human/GRCh38/genome/annotation/function/CosmicResistanceMutations.tsv.gz' + + // Additional somatic annotations + ensembl_entrez='/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh39.p13_ensemblv109_entrez_id_map.csv' // used in somatic vcf finalization. + + // CNV and SV annotations and filtering files. + cytoband = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh38.cytoBand.UCSC.chr.sorted.txt' // used in bicseq2 annotations + dgv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/DGV.GRCh38_hg38_variants_2020-02-25.bed' // used in bicseq2 annotations + thousandG = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1KGP.CNV.GRCh38.canvas.merged.bed' // used in bicseq2 annotations + cosmicUniqueBed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/CosmicCompleteCNA_uniqIntervals.bed' // used in bicseq2 annotations + cancerCensusBed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/cancer_gene_census.GRCh38-v92.bed' // used in bicseq2 annotations and SV annotation. + ensemblUniqueBed = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/ensembl_genes_unique_sorted.final.v93.chr.sorted.bed' // used in bicseq2 annotations and SV annotation. + gap = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/GRCh38.gap.UCSC.annotated.chr.sorted.bed' // used in SV annotation. + dgvBedpe = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/DGV.GRCh38_hg38_variants_2020-02-25.bedpe' // used in SV annotation. + thousandGVcf = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1KGP.pruned_wAFs.PASS_and_MULTIALLELIC_Mosaic.GRCh38.vcf' // used in SV annotation. + svPon = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/1000G-SV-PON.survivor-merged.GRCh38.filtered.bedpe' // used in SV annotation. + cosmicBedPe = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/annotations/cosmic-sv-GRCh38-v92.bedpe' // used in SV annotation. + + // NA12878 BAM file. For use in tumor-only processing. + na12878_bam = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/NA12878/NA12878_realigned_BQSR.bam' + na12878_bai = '/projects/compsci/omics_share/human/GRCh38/supporting_files/PTA_inputs/NA12878/NA12878_realigned_BQSR.bai' + na12878_sampleName = 'NA12878' + + // General ngs-ops arguments unsed in this workflow. + read_type = 'PE' // Only PE accepted. + gen_org='human' // Only human accepted. + tmpdir = '' + sample_folder = null // not used, csv input required. 
+ extension='' // not used, csv input required. + pattern="" // not used, csv input required. + concat_lanes = false // not used, csv input required. + csv_input = null + bwa_min_score = null +} \ No newline at end of file diff --git a/config/rna_fusion.config b/config/rna_fusion.config new file mode 100644 index 00000000..a5167672 --- /dev/null +++ b/config/rna_fusion.config @@ -0,0 +1,89 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "rna_fusion" + description = 'Pipeline for processing of PDX RNASeq samples to call RNA Fusions, contains xenome step for processing PDX samples' + author = 'Michael Lloyd, Sai Lek, Brian Sanderson Copyright Jackson Laboratory 2022' + version = "0.1.0" +} + +params { + + //Shared params + extension='.fastq.gz' + pattern="*{.,_,-}R{1,2}*" + read_type = 'PE' // PE only supported. + concat_lanes = false + sample_folder = null + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/rna_fusion_multiqc.yaml" + + // Xenome index + xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // READ LENGTH ADJUSTMENTS: + read_length = 150 // change relative to sample being processed. 75, 100, 125, and 150 are supported. + star_index = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/star/star-index-150bp' // change relative to read length. 75, 100, 125, and 150 are supported. + + // GTF Annotation File. + gencode_gtf = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/gencode/gencode.v37.annotation.gtf.revised.custom.gtf' + + // FASTA + fasta = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.all.fa' + + // Arriba Options + arriba_star_args = '--outSAMtype BAM Unsorted \ + --outSAMunmapped Within \ + --outBAMcompression 0 \ + --outFilterMultimapNmax 50 \ + --peOverlapNbasesMin 10 \ + --alignSplicedMateMapLminOverLmate 0.5 \ + --alignSJstitchMismatchNmax 5 -1 5 5 \ + --chimSegmentMin 10 \ + --chimOutType WithinBAM HardClip \ + --chimJunctionOverhangMin 10 \ + --chimScoreDropMax 30 \ + --chimScoreJunctionNonGTAG 0 \ + --chimScoreSeparation 1 \ + --chimSegmentReadGapMax 3 \ + --chimMultimapNmax 50' + + arriba_blacklist = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/blacklist_hg38_GRCh38_v2.4.0.tsv.gz' + arriba_known_fusions = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/known_fusions_hg38_GRCh38_v2.4.0.tsv.gz' + arriba_protein_domains = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/arriba/protein_domains_hg38_GRCh38_v2.4.0.gff3' + + // Fusioncatcher Options + fusioncatcher_ref = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/fusioncatcher/human_v102' + fusioncatcher_limitSjdbInsertNsj = 2000000 + + // Jaffa Options + jaffa_ref_dir = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/jaffa/' + + // Pizzly Options + kallisto_index = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/pizzly/Homo_sapiens.GRCh38.102.cdna.all.kallisto-0.48.0.index' + transcript_fasta = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.cdna.all.fa.gz' + ensembl_gtf = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/ensembl/Homo_sapiens.GRCh38.102.gtf' + + // Squid Options + squid_star_args = '--twopassMode Basic \ + 
--chimOutType SeparateSAMold \ + --chimSegmentMin 20 \ + --chimJunctionOverhangMin 12 \ + --alignSJDBoverhangMin 10 \ + --outReadsUnmapped Fastx \ + --outSAMstrandField intronMotif \ + --outSAMtype BAM SortedByCoordinate' + + //Star-Fusion Options + star_fusion_ref = '/projects/omics_share/human/GRCh38/transcriptome/indices/rna_fusion/starfusion/ctat_genome_lib_build_dir' + star_fusion_opt = '' + + // Fusion Report Options: + fusion_report_opt = false + + //Fusion-report databases + databases = '/projects/compsci/omics_share/human/GRCh38/supporting_files/rna_fusion_dbs' + +} diff --git a/config/rnaseq.config b/config/rnaseq.config index f406fbb1..f61496e0 100644 --- a/config/rnaseq.config +++ b/config/rnaseq.config @@ -13,52 +13,128 @@ params { //Shared params gen_org='mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // SE concat_lanes = false - read_prep = 'reverse_stranded' // 'reverse_stranded, forward_stranded, or non_stranded' - ref_fa='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa' + download_data = false + csv_input = null + + + pdx = false // if PDX, gen_org == human and xenome is run to remove mouse reads from the sample(s). + + multiqc_config = "${projectDir}/bin/shared/multiqc/rnaseq_multiqc.yaml" //Quality Stats params min_pct_hq_reads='0.0' + hq_pct = '70' + + // strand check - used only for QC check, not for mapping. + strandedness_ref = '/projects/compsci/omics_share/mouse/GRCm38/transcriptome/indices/ensembl/v102/kallisto/kallisto_index' + strandedness_gtf = '/projects/compsci/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.gtf' //RSEM params seed_length = '25' rsem_aligner = 'bowtie2' // 'star' - rsem_ref_prefix = 'Mus_musculus.GRCm38.dna.toplevel' + rsem_ref_prefix = 'Mus_musculus.GRCm38.dna.primary_assembly' rsem_ref_files = '/projects/omics_share/mouse/GRCm38/transcriptome/indices/ensembl/v102' - rsem_star_prefix = 'toplevel/GRCm38_100' // 'toplevel/GRCm38_75' or 'toplevel/GRCm38_150' + rsem_star_prefix = 'primary' // 'primary' or 'top_level' - //Picard params - picard_dict='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.dict' + picard_dict='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.dict' - ref_flat='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr_patch_hapl_scaff.refFlat.txt' - ribo_intervals='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr_patch_hapl_scaff.rRNA.interval_list' + ref_flat='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.refFlat.txt' + ribo_intervals='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.rRNA.interval_list' tmpdir = "/fastscratch/${USER}" + } // Defaults for Human if (params.gen_org=='human'){ - params.ref_fa='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.fa' - params.ref_fai='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.fa.fai' + params.genome_build = 'GRCh38' //Quality Stats params params.min_pct_hq_reads='0.0' + params.hq_pct = '70' + + // strand check + params.strandedness_ref = 
'/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104/kallisto/kallisto_index' + params.strandedness_gtf = '/projects/compsci/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.gtf' // RSEM params.seed_length = '25' params.rsem_aligner = 'bowtie2' // 'star' - params.rsem_ref_prefix = 'Homo_sapiens.GRCh38.dna.toplevel' + params.rsem_ref_prefix = 'Homo_sapiens.GRCh38.dna.primary_assembly' params.rsem_ref_files = '/projects/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104' - params.rsem_star_prefix = 'toplevel/GRCh38_100' // 'toplevel/GRCh38_75' or 'toplevel/GRCh38_150' + params.rsem_star_prefix = 'primary' // 'primary' or 'top_level' // Picard - params.picard_dict='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.toplevel.dict' - params.ref_flat='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr_patch_hapl_scaff.refFlat.txt' - params.ribo_intervals='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr_patch_hapl_scaff.rRNA.interval_list' + params.picard_dict='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.primary_assembly.dict' + params.ref_flat='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.refFlat.txt' + params.ribo_intervals='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.rRNA.interval_list' + +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + //RSEM params + params.rsem_ref_prefix = 'Mus_musculus.GRCm39.dna.primary_assembly' + params.rsem_ref_files = '/projects/omics_share/mouse/GRCm39/transcriptome/indices/ensembl/v105' + params.rsem_star_prefix = 'primary' // 'primary' or 'top_level' + + //Picard params + params.picard_dict='/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.dict' + + params.ref_flat='/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.refFlat.txt' + params.ribo_intervals='/projects/omics_share/mouse/GRCm39/transcriptome/annotation/ensembl/v105/Mus_musculus.GRCm39.105.rRNA.interval_list' + +} + +// Defaults for PDX +if (params.gen_org=='human' && params.pdx){ + + params.rsem_ref_prefix = null // zero out params to avoid accidental collision + params.rsem_ref_files = null // zero out params to avoid accidental collision + params.rsem_star_prefix = null // zero out params to avoid accidental collision + // strand check + params.strandedness_ref = '/projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104/kallisto/kallisto_index' + params.strandedness_gtf = '/projects/compsci/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.gtf' + + //Quality Stats params + params.min_pct_hq_reads='0.0' + + // Xenome + params.xenome_prefix='/projects/compsci/omics_share/human/GRCh38/supporting_files/xenome/hg38_broad_NOD_based_on_mm10_k25' + + // General RSEM + params.seed_length = '25' + params.rsem_aligner = 'bowtie2' // 'star' + + // Human RSEM + params.rsem_ref_prefix_human = 'Homo_sapiens.GRCh38.dna.primary_assembly' + params.rsem_ref_files_human = '/projects/omics_share/human/GRCh38/transcriptome/indices/ensembl/v104' + params.rsem_star_prefix_human = 'primary' // 'primary' or 'top_level' + + // Human Picard + 
params.picard_dict_human='/projects/omics_share/human/GRCh38/genome/sequence/ensembl/v104/Homo_sapiens.GRCh38.dna.primary_assembly.dict' + params.ref_flat_human='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.refFlat.txt' + params.ribo_intervals_human='/projects/omics_share/human/GRCh38/transcriptome/annotation/ensembl/v104/Homo_sapiens.GRCh38.104.chr.rRNA.interval_list' + + // Mouse RSEM + params.rsem_ref_prefix_mouse = 'Mus_musculus.GRCm38.dna.primary_assembly' + params.rsem_ref_files_mouse = '/projects/omics_share/mouse/GRCm38/transcriptome/indices/ensembl/v102' + params.rsem_star_prefix_mouse = 'primary' // 'primary' or 'top_level' + + // Mouse Picard + params.picard_dict_mouse='/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.dict' + + params.ref_flat_mouse='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.refFlat.txt' + params.ribo_intervals_mouse='/projects/omics_share/mouse/GRCm38/transcriptome/annotation/ensembl/v102/Mus_musculus.GRCm38.102.chr.rRNA.interval_list' + } \ No newline at end of file diff --git a/config/rrbs.config b/config/rrbs.config index a8322e5a..dd910b90 100644 --- a/config/rrbs.config +++ b/config/rrbs.config @@ -11,10 +11,16 @@ manifest { params { //Shared params gen_org='mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // SE concat_lanes = false + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/rrbs_multiqc.yaml" // Trimming & Bismark Setting non_directional = true @@ -46,6 +52,8 @@ params { if (params.gen_org=='human'){ + params.genome_build = 'GRCh38' + // Trimming & Bismark Setting params.non_directional = true @@ -69,4 +77,11 @@ if (params.gen_org=='human'){ // Bismark Methylation extraction settings. params.cytosine_report = false params.comprehensive = true +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + // Bismark Mapping settings. 
+ params.ref_fa_index = '/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bismark/bowtie2' + } \ No newline at end of file diff --git a/config/wes.config b/config/wes.config index b955a3aa..dc83c45c 100644 --- a/config/wes.config +++ b/config/wes.config @@ -12,23 +12,32 @@ manifest { params { // Shared params gen_org = 'mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" + sample_folder = null read_type = 'PE' // SE concat_lanes = false - + download_data = false + csv_input = null - // Reference fasta - ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa' - ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.toplevel.fa' + multiqc_config = "${projectDir}/bin/shared/multiqc/wes_multiqc.yaml" + // Reference fasta + ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.fa' + ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.primary_assembly.fa' + // Quality Stats params min_pct_hq_reads = '0.0' + hq_pct = '70' + + // GVCF + run_gvcf = false // WES capture array BED and GATK intervals lists target_gatk = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.bed' - target_picard = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.interval_list' - bait_picard = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.interval_list' + target_picard = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.primary_assembly.interval_list' + bait_picard = '/projects/omics_share/mouse/GRCm38/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.bare.picard.primary_assembly.interval_list' // Variant calling parameters mismatch_penalty = "-B 8" @@ -37,21 +46,27 @@ params { // VCF annotation dbSNP = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz' + dbSNP_index = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz.tbi' + gen_ver = "GRCm38.99" snpEff_config = "/projects/omics_share/mouse/GRCm38/genome/indices/snpEff_5_1/snpEff.config" tmpdir = "/fastscratch/${USER}" + bwa_min_score = null } -// Defaults for Human (Default HG38 PE) Should we have a switch for other versions? 
+// Defaults for Human (Default HG38 PE) if (params.gen_org=='human'){ + params.genome_build = 'GRCh38' + // Reference fasta params.ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' - params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta.64' + params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' // Quality Stats params params.min_pct_hq_reads = '0.0' + params.hq_pct = '70' // WES capture array BED and GATK intervals lists params.target_gatk = '/projects/omics_share/human/GRCh38/supporting_files/capture_kit_files/agilent/v7/S31285117_MergedProbes_no_gene_names.bed' @@ -64,7 +79,6 @@ if (params.gen_org=='human'){ params.ploidy_val = "-ploidy 2" // VCF annotation - // These gold standard snp & indel files are bgzipped and tabixed, but old versions were not. Is this an issue? params.gold_std_indels = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' params.phase1_1000G = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/1000G_phase1.snps.high_confidence.hg38.vcf.gz' params.dbSNP = '/projects/omics_share/human/GRCh38/genome/annotation/snps_indels/dbsnp_151.vcf.gz' @@ -75,4 +89,26 @@ if (params.gen_org=='human'){ params.gen_ver = "hg38" params.snpEff_config = '/projects/omics_share/human/GRCh38/genome/indices/snpEff_5_1/snpEff.config' +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + // Reference fasta + params.ref_fa = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.ref_fa_indices = '/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bwa/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.chrom_contigs = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.primaryChr.contig_list' + + // WES capture array BED and GATK intervals lists + params.target_gatk = '/projects/omics_share/mouse/GRCm39/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.mm39.bare.bed' + params.target_picard = '/projects/omics_share/mouse/GRCm39/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.mm39.picard.interval_list' + params.bait_picard = '/projects/omics_share/mouse/GRCm39/supporting_files/capture_kit_files/agilent/v2/S32371113_mouse_exon_V2.mm39.picard.interval_list' + + // VCF annotation + params.dbSNP = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz' + params.dbSNP_index = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz.tbi' + params.gen_ver = 'GRCm39.105' + params.snpEff_config = '/projects/omics_share/mouse/GRCm39/genome/indices/snpEff_5_1d/snpEff.config' + params.comment = 'This script will run whole exome sequencing on mouse samples using default GRCm39' + } \ No newline at end of file diff --git a/config/wgs.config b/config/wgs.config index 070e32b4..aa5b4cfc 100644 --- a/config/wgs.config +++ b/config/wgs.config @@ -12,22 +12,33 @@ manifest { params { // Shared params gen_org = 'mouse' // human + genome_build = 'GRCm38' // GRCm39 extension='.fastq.gz' pattern="*_R{1,2}*" read_type = 'PE' // SE + sample_folder = null concat_lanes = false - + download_data = false + csv_input = null + + multiqc_config = "${projectDir}/bin/shared/multiqc/wgs_multiqc.yaml" + + // 
Reference fasta - ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.fa' - ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.toplevel.fa' - chrom_contigs = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.toplevel.primaryChr.contig_list' + ref_fa = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.fa' + ref_fa_indices='/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bwa/Mus_musculus.GRCm38.dna.primary_assembly.fa' + chrom_contigs = '/projects/omics_share/mouse/GRCm38/genome/sequence/ensembl/v102/Mus_musculus.GRCm38.dna.primary_assembly.primaryChr.contig_list' // Quality Stats params min_pct_hq_reads = '0.0' + hq_pct = '70' + // GVCF + run_gvcf = false + // VCF annotation gen_ver = "GRCm38.99" dbSNP = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz' + dbSNP_index = '/projects/omics_share/mouse/GRCm38/genome/annotation/snps_indels/GCA_000001635.6_current_ids.vcf.gz.tbi' snpEff_config = '/projects/omics_share/mouse/GRCm38/genome/indices/snpEff_5_1/snpEff.config' // Variant calling parameters @@ -35,14 +46,17 @@ params { ploidy_val = "-ploidy 2" call_val = "50.0" - tmpdir = "/fastscratch/${USER}" + tmpdir = "/fastscratch/${USER}" + bwa_min_score = null } if (params.gen_org=='human'){ + params.genome_build = 'GRCh38' + // Reference fasta params.ref_fa = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' - params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta.64' + params.ref_fa_indices = '/projects/omics_share/human/GRCh38/genome/indices/gatk/bwa/Homo_sapiens_assembly38.fasta' params.chrom_contigs = '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.primaryChr.contig_list' // Variant calling parameters @@ -60,4 +74,21 @@ if (params.gen_org=='human'){ params.cosmic_index = '/projects/omics_share/human/GRCh38/genome/annotation/function/COSMICv95_Coding_Noncoding.gatk_formatted.vcf.gz.tbi' params.gen_ver = "hg38" params.snpEff_config = '/projects/omics_share/human/GRCh38/genome/indices/snpEff_5_1/snpEff.config' +} + +// Defaults for GRCm39 build +if (params.genome_build=='GRCm39'){ + + // Reference fasta + params.ref_fa = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.ref_fa_indices = '/projects/omics_share/mouse/GRCm39/genome/indices/ensembl/v105/bwa/Mus_musculus.GRCm39.dna.primary_assembly.fa' + params.chrom_contigs = '/projects/omics_share/mouse/GRCm39/genome/sequence/ensembl/v105/Mus_musculus.GRCm39.dna.primary_assembly.primaryChr.contig_list' + + // VCF annotation + params.dbSNP = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz' + params.dbSNP_index = '/projects/omics_share/mouse/GRCm39/genome/annotation/snps_indels/GCA_000001635.9_current_ids.vcf.gz.tbi' + params.gen_ver = 'GRCm39.105' + params.snpEff_config = '/projects/omics_share/mouse/GRCm39/genome/indices/snpEff_5_1d/snpEff.config' + params.comment = 'This script will run whole genome sequencing on mouse samples using default GRCm39' + } \ No newline at end of file diff --git a/lib/Logos.groovy b/lib/Logos.groovy new file mode 100644 index 00000000..448063f0 --- /dev/null +++ b/lib/Logos.groovy @@ -0,0 +1,88 @@ +class 
Colors { + final non = "\033[0m" + final dim = "\033[2m" + final blk = "\033[0;30m" + final grn = "\033[0;32m" + final ylw = "\033[0;33m" + final blu = "\033[0;34m" + final pur = "\033[0;35m" + final cyn = "\033[0;36m" + final wht = "\033[0;37m" +} + +class Logo { + def c = new Colors() + + def logColors() { + // Mod Log colors if not ANSI + final mono = !env.NXF_ANSI_LOG + //final mono = true + if ( mono ) { + for (k in c.keySet()) { + c[k] = '' + } + } + } + + private def frameLogo(String logo, String color=c.blu) { + // return framed lines of passed logo text + def logos = logo.split('[\n\r]') + int maxLen = ( logos.collect{ it.length() } ).max() + 4 + def frameit = { it * maxLen } + // def bar = "${c.dim}.${c.non}" + def bar = "" + def logoLines = '' + for ( line in logos ) { + (line =~ /\S/) && \ + (logoLines += "${bar}${color} ${line} ${c.non}${bar}\n") + } + logos = "${c.dim}.${frameit('.')}.${c.non}\n" \ + + logoLines \ + + "${bar}${c.dim}${frameit('.')}${bar}" + logos = logoLines + return logos.stripIndent() + } + + public def show(String logo=this.logo, String clr=c.non) { + // return framed lines of chosen org logo in chosen color + frameLogo(logo, clr) + } + +/* +ASCII Name Art Options: http://patorjk.com/software/taag/ +*/ + +static def logo = this.logo_jaxngsops_cyber + +static def logo_jaxgm_ansi_regular = $/ + ██ █████ ██ ██ ██████ ███████ ███ ██ ██████ ███████ ██████ ██████ ███████ + ██ ██ ██ ██ ██ ██ ██ ████ ██ ██ ██ ██ ██ ██ ██ ██ + ██ ███████ ███ █████ ██ ███████ █████ ██ ██ ██ ██ ███ ███████ █████ ██ ██ ██████ ███████ +██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ ██ + █████ ██ ██ ██ ██ ██████ ███████ ██ ████ ██████ ███████ ██████ ██ ███████ +/$ + +static def logo_jaxcsngsops_big = $/ + _ _ __ __ ____ ____ _ _ ____ ____ ___ ____ ____ + | | / \ \ \/ / / ___/ ___| | \ | |/ ___/ ___| / _ \| _ \/ ___| + _ | |/ _ \ \ /_____| | \___ \ _____| \| | | _\___ \ _____| | | | |_) \___ \ +| |_| / ___ \ / |_____| |___ ___) |_____| |\ | |_| |___) |_____| |_| | __/ ___) | + \___/_/ \_/_/\_\ \____|____/ |_| \_|\____|____/ \___/|_| |____/ +/$ + +static def logo_jaxngsops_mini = $/ + _ __ __ __ _ _ __ + | /\ \/ __ / (_ __ |\ | /__ (_ __ / \ |_) (_ + \_| /--\ /\ \_ __) | \| \_| __) \_/ | __) +/$ + + +static def logo_jaxngsops_cyber = $/ +_____ _______ _ _ _______ _______ __ _ ______ _______ _____ _____ _______ + | |_____| \___/ ___ | |______ ___ | \ | | ____ |______ ___ | | |_____| |______ +__| | | _/ \_ |_____ ______| | \_| |_____| ______| |_____| | ______| +/$ + + + +} diff --git a/main.nf b/main.nf index b802a15c..b4a86776 100644 --- a/main.nf +++ b/main.nf @@ -6,19 +6,36 @@ nextflow.enable.dsl=2 if (params.workflow == "rnaseq"){ include {RNASEQ} from './workflows/rnaseq' } -if (params.workflow == "wes"){ +else if (params.workflow == "wes"){ include {WES} from './workflows/wes' } -if (params.workflow == "wgs"){ +else if (params.workflow == "pdx_wes"){ + include {PDX_WES} from './workflows/pdx_wes' +} +else if (params.workflow == "wgs"){ include {WGS} from './workflows/wgs' } -if (params.workflow == "rrbs"){ +else if (params.workflow == "rrbs"){ include {RRBS} from './workflows/rrbs' } -if (params.workflow == "atac"){ +else if (params.workflow == "atac"){ include {ATAC} from './workflows/atac' } -// conditional to kick off appropriate workflow +else if (params.workflow == "chipseq"){ + include {CHIPSEQ} from './workflows/chipseq' +} +else if (params.workflow == "pta"){ + include {PTA} from './workflows/pta' +} +else if (params.workflow == "rna_fusion"){ + include {RNA_FUSION} from 
'./workflows/rna_fusion' +} +else { + // if workflow name is not supported: + exit 1, "ERROR: No valid pipeline called. '--workflow ${params.workflow}' is not a valid workflow name." +} + +// conditional to launch appropriate workflow workflow{ if (params.workflow == "rnaseq"){ RNASEQ() @@ -26,6 +43,9 @@ workflow{ if (params.workflow == "wes"){ WES() } + if (params.workflow == "pdx_wes"){ + PDX_WES() + } if (params.workflow == "wgs"){ WGS() } @@ -35,4 +55,13 @@ workflow{ if (params.workflow == "atac"){ ATAC() } + if (params.workflow == "chipseq"){ + CHIPSEQ() + } + if (params.workflow == "pta"){ + PTA() + } + if (params.workflow == "rna_fusion"){ + RNA_FUSION() + } } diff --git a/modules/arriba/arriba.nf b/modules/arriba/arriba.nf new file mode 100644 index 00000000..0a93548d --- /dev/null +++ b/modules/arriba/arriba.nf @@ -0,0 +1,43 @@ +process ARRIBA { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/arriba:2.4.0--ha04fe3b_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/fusions' : 'arriba' }", pattern: "*.{tsv,txt}", mode:'copy' + + input: + tuple val(sampleID), path(bam), path(bai) + path(gtf) + + output: + tuple val(sampleID), path("*_arriba_fusions.tsv"), emit: arriba_fusions + tuple val(sampleID), path("*_arriba_fusions_discarded.tsv"), emit: arriba_fusions_fail + + script: + + """ + arriba \\ + -x ${bam} \\ + -a ${params.fasta} \\ + -g ${gtf} \\ + -o ${sampleID}_arriba_fusions.tsv \\ + -O ${sampleID}_arriba_fusions_discarded.tsv \\ + -b ${params.arriba_blacklist} \\ + -k ${params.arriba_known_fusions} \\ + -t ${params.arriba_known_fusions} \\ + -p ${params.arriba_protein_domains} + """ +} + +/* +From the documentation: + Note: In this execution, the same file is passed to the parameters -k and -t, because it is used for two purposes: + applying sensitive filtering parameters to known fusions (-k) and tagging known fusions in the tags column (-t). + However, it is possible to use different files for these two parameters if a user wants to separate the two tasks. +*/ \ No newline at end of file diff --git a/modules/bamtools/bamtools_filter.nf b/modules/bamtools/bamtools_filter.nf new file mode 100644 index 00000000..a0a34ccf --- /dev/null +++ b/modules/bamtools/bamtools_filter.nf @@ -0,0 +1,25 @@ +process BAMTOOLS_FILTER { + + tag "$sampleID" + + cpus 1 + memory 8.GB + time '12:00:00' + + container 'quay.io/biocontainers/bamtools:2.5.1--h9a82719_9' + + + input: + tuple val(sampleID), file(bam) + file(bamtools_filter_config) + + output: + tuple val(sampleID), file("*.sorted.bam"), emit: bam + + script: + prefix = params.read_type == 'SE' ? "${sampleID}.mLb.clN" : "${sampleID}.mLb.flT" + """ + bamtools filter -in ${bam} -script ${bamtools_filter_config} -out ${prefix}.sorted.bam + """ + +} diff --git a/modules/bamtools/bamtools_stats.nf b/modules/bamtools/bamtools_stats.nf index 088827ff..d2ef0376 100644 --- a/modules/bamtools/bamtools_stats.nf +++ b/modules/bamtools/bamtools_stats.nf @@ -5,7 +5,8 @@ process BAMTOOLS_STATS { cpus 1 memory 8.GB time '12:00:00' - + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + container 'quay.io/biocontainers/bamtools:2.5.1--h9a82719_9' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'quality_stats' }", pattern:"*.txt", mode:'copy' @@ -17,7 +18,6 @@ process BAMTOOLS_STATS { tuple val(sampleID), file("*metrics.txt"), emit: picard_metrics script: - log.info "----- Bamtools Stats Running on: ${sampleID} -----" if (params.read_type == "PE") """ diff --git a/modules/bcftools/bcftools_germline_filter.nf b/modules/bcftools/bcftools_germline_filter.nf new file mode 100644 index 00000000..e888ec2e --- /dev/null +++ b/modules/bcftools/bcftools_germline_filter.nf @@ -0,0 +1,137 @@ +process BCFTOOLS_GERMLINE_FILTER { + // This module is a port of the NYGC germline filtering scheme found at this site: + // https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/germline/germline.wdl?at=7.4.0 + + tag "$sampleID" + + cpus = 1 + memory = 2.GB + time = '00:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'bedtools' }", pattern: "*haplotypecaller.gatk.filtered.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*haplotypecaller.gatk.filtered.vcf.gz"), file("*haplotypecaller.gatk.filtered.vcf.gz.tbi"), emit: vcf_idx + + + // NOTE: These are hard coded to resources provided at: https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/config/fasta_references.json + // Many of the files used in the filtering here are used again by VEP. Therefore, the resource sets were combined to reduce the number of params. 
+ + script: + """ + bgzip ${vcf} + tabix -p vcf ${vcf}.gz + + ## Remove existing AF annotations from merged VCF + bcftools annotate \ + -x INFO/AF \ + -Oz \ + ${vcf}.gz \ + > noaf.vcf.gz + + tabix -p vcf noaf.vcf.gz + + ## Annotate with NYGC AF for filtering + bcftools annotate \ + --annotations ${params.vep_cache_directory}/annotations/04142020_NYGC_samples.vcf.gz \ + --columns 'INFO/AF,INFO/AC_Hom' \ + -Oz \ + noaf.vcf.gz \ + > ${sampleID}.final.annotated.vcf.gz + + tabix -p vcf ${sampleID}.final.annotated.vcf.gz + + ## filter variants >3% AF and >10 Homozygotes in NYGC vars + bcftools filter \ + --exclude 'INFO/AF[*] > 0.03 || INFO/AC_Hom[*] > 10' \ + ${sampleID}.final.annotated.vcf.gz \ + > ${sampleID}.pop.filtered.vcf + + bgzip ${sampleID}.pop.filtered.vcf + tabix -p vcf ${sampleID}.pop.filtered.vcf.gz + + ## select whitelist variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/vep_whitelist_38.20201118.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.whitelist.filtered.vcf.gz + + tabix -p vcf ${sampleID}.whitelist.filtered.vcf.gz + + ## select pgx variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/pgx_vep_hg38.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.pgx.filtered.vcf.gz + + tabix -p vcf ${sampleID}.pgx.filtered.vcf.gz + + ## select chd whitelist variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/chd_whitelist.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.chdwhitelist.filtered.vcf.gz + + tabix -p vcf ${sampleID}.chdwhitelist.filtered.vcf.gz + + ## select rwgs pgx variants + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/rWGS_PGx.bed.gz \ + ${vcf}.gz \ + > ${sampleID}.rwgspgx.filtered.vcf.gz + + tabix -p vcf ${sampleID}.rwgspgx.filtered.vcf.gz + + ## Select deep intronics + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/deep_intronic_whitelist_08132020.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.deep_intronics.filtered.vcf.gz + + tabix -p vcf ${sampleID}.deep_intronics.filtered.vcf.gz + + ## Select clinvar intronics + bcftools view \ + -Oz \ + -R ${params.vep_cache_directory}/annotations/clinvar_deep_intronics_09012020.vcf.gz \ + ${vcf}.gz \ + > ${sampleID}.clinvar_intronics.filtered.vcf.gz + + tabix -p vcf ${sampleID}.clinvar_intronics.filtered.vcf.gz + + bcftools query -l ${vcf}.gz > samples.txt + + ## merge all filtered files for further processing + bcftools concat \ + -a \ + -d all \ + ${sampleID}.pop.filtered.vcf.gz \ + ${sampleID}.whitelist.filtered.vcf.gz \ + ${sampleID}.pgx.filtered.vcf.gz \ + ${sampleID}.chdwhitelist.filtered.vcf.gz \ + ${sampleID}.rwgspgx.filtered.vcf.gz \ + ${sampleID}.deep_intronics.filtered.vcf.gz \ + ${sampleID}.clinvar_intronics.filtered.vcf.gz \ + | \ + bcftools view \ + -i 'GT[@samples.txt]="alt"' \ + | \ + bcftools sort \ + -Oz \ + > ${sampleID}_haplotypecaller.gatk.filtered.vcf.gz + + tabix -p vcf ${sampleID}_haplotypecaller.gatk.filtered.vcf.gz + + """ +} \ No newline at end of file diff --git a/modules/bcftools/bcftools_intersect_lancet_candidates.nf b/modules/bcftools/bcftools_intersect_lancet_candidates.nf new file mode 100644 index 00000000..d6fdd1c2 --- /dev/null +++ b/modules/bcftools/bcftools_intersect_lancet_candidates.nf @@ -0,0 +1,32 @@ +process BCFTOOLS_INTERSECTVCFS { + tag "$sampleID" + + cpus = 8 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(candidate_vcf), file(candidate_tbi), file(lancet_confirm_vcf), file(lancet_confirm_tbi), val(meta), val(normal_name), val(tumor_name), val(chrom) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.tbi"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + """ + bcftools \ + isec \ + -w 1 \ + -c none \ + -n =2 \ + --threads ${task.cpus} \ + ${lancet_confirm_vcf} \ + ${candidate_vcf} \ + > ${sampleID}_confirmed_lancet_merged_${chrom}.vcf + + bgzip ${sampleID}_confirmed_lancet_merged_${chrom}.vcf + tabix ${sampleID}_confirmed_lancet_merged_${chrom}.vcf.gz + """ +} diff --git a/modules/bcftools/bcftools_merge_callers.nf b/modules/bcftools/bcftools_merge_callers.nf new file mode 100644 index 00000000..d2fd237c --- /dev/null +++ b/modules/bcftools/bcftools_merge_callers.nf @@ -0,0 +1,61 @@ +process BCFTOOLS_MERGECALLERS { + tag "$sampleID" + + cpus = 8 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf), file(idx), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + + """ + bcftools \ + merge \ + -r ${chrom} \ + --force-samples \ + --no-version \ + --threads ${task.cpus} \ + -f PASS,SUPPORT \ + -F x \ + -m none \ + -o ${sampleID}_mergedCallers_${chrom}.vcf \ + -i called_by:join,num_callers:sum,MNV_ID:join,supported_by:join \ + ${vcf} +""" +} + + +/* + +About: Merge multiple VCF/BCF files from non-overlapping sample sets to create one multi-sample file. + Note that only records from different files can be merged, never from the same file. For + "vertical" merge take a look at "bcftools norm" instead. +Usage: bcftools merge [options] [...] + +Options: + --force-samples resolve duplicate sample names + --print-header print only the merged header and exit + --use-header use the provided header + -0 --missing-to-ref assume genotypes at missing sites are 0/0 + -f, --apply-filters require at least one of the listed FILTER strings (e.g. "PASS,.") + -F, --filter-logic remove filters if some input is PASS ("x"), or apply all filters ("+") [+] + -g, --gvcf <-|ref.fa> merge gVCF blocks, INFO/END tag is expected. 
Implies -i QS:sum,MinDP:min,I16:sum,IDV:max,IMF:max + -i, --info-rules rules for merging INFO fields (method is one of sum,avg,min,max,join) or "-" to turn off the default [DP:sum,DP4:sum] + -l, --file-list read file names from the file + -m, --merge allow multiallelic records for , see man page for details [both] + --no-version do not append version and command line to the header + -o, --output write output to a file [standard output] + -O, --output-type 'b' compressed BCF; 'u' uncompressed BCF; 'z' compressed VCF; 'v' uncompressed VCF [v] + -r, --regions restrict to comma-separated list of regions + -R, --regions-file restrict to regions listed in a file + --threads number of extra output compression threads [0] + +*/ \ No newline at end of file diff --git a/modules/bcftools/bcftools_remove_spanning.nf b/modules/bcftools/bcftools_remove_spanning.nf new file mode 100644 index 00000000..5ab088d0 --- /dev/null +++ b/modules/bcftools/bcftools_remove_spanning.nf @@ -0,0 +1,28 @@ +process BCFTOOLS_REMOVESPANNING { + tag "$sampleID" + + cpus = 4 + memory = 2.GB + time = '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + + script: + + """ + bcftools \ + view \ + --exclude 'ALT="*"' \ + --threads ${task.cpus} \ + -o ${sampleID}_nospanning_calls.vcf \ + ${vcf} + """ + +} \ No newline at end of file diff --git a/modules/bcftools/bcftools_sort.nf b/modules/bcftools/bcftools_sort.nf index 371d9de7..2912d493 100644 --- a/modules/bcftools/bcftools_sort.nf +++ b/modules/bcftools/bcftools_sort.nf @@ -1,9 +1,10 @@ -process BCF_SORT { +process BCFTOOLS_SORT { tag "$sampleID" cpus = 1 memory = 6.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' @@ -14,7 +15,6 @@ process BCF_SORT { tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- BCFTools Sort Running on: ${sampleID} -----" """ bcftools sort -o ${sampleID}_only_${indel_snp}.vcf ${vcf} diff --git a/modules/bcftools/bcftools_split_multiallelic.nf b/modules/bcftools/bcftools_split_multiallelic.nf new file mode 100644 index 00000000..b5872c3f --- /dev/null +++ b/modules/bcftools/bcftools_split_multiallelic.nf @@ -0,0 +1,30 @@ +process BCFTOOLS_SPLITMULTIALLELIC { + tag "$sampleID" + + cpus = 8 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf), file(tbi), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: vcf + + script: + output_name = vcf.getBaseName().replace('.vcf', '') + """ + bcftools \ + norm \ + -m \ + -any \ + --threads ${task.cpus} \ + --no-version \ + -f ${params.ref_fa} \ + -o ${output_name}_multiAllelicSplit.vcf \ + ${vcf} +""" +} \ No newline at end of file diff --git a/modules/bcftools/bcftools_split_multiallelic_regions.nf b/modules/bcftools/bcftools_split_multiallelic_regions.nf new file mode 100644 index 00000000..54c6e323 --- /dev/null +++ b/modules/bcftools/bcftools_split_multiallelic_regions.nf @@ -0,0 +1,42 @@ +process BCFTOOLS_SPLITMULTIALLELIC_REGIONS { + tag "$sampleID" + + cpus = 4 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bcftools:1.15--h0ea216a_2' + + input: + tuple val(sampleID), file(vcf), file(index) + val(chrom_list) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), emit: vcf_idx + + script: + + listOfChroms = chrom_list.collect { "$it" }.join(',') + + """ + bcftools \ + norm \ + -m \ + -any \ + --threads ${task.cpus} \ + --regions ${listOfChroms} \ + --no-version \ + -f ${params.ref_fa} \ + -o ${sampleID}_split.vcf \ + ${vcf} + + bgzip \ + -c \ + ${sampleID}_split.vcf > ${sampleID}_split.vcf.gz + + tabix ${sampleID}_split.vcf.gz + + """ + +} \ No newline at end of file diff --git a/modules/bedtools/bedtools_amplicon_metrics.nf b/modules/bedtools/bedtools_amplicon_metrics.nf new file mode 100644 index 00000000..60a2bd6e --- /dev/null +++ b/modules/bedtools/bedtools_amplicon_metrics.nf @@ -0,0 +1,45 @@ +process TARGET_COVERAGE_METRICS { + tag "$sampleID" + + cpus 4 + memory 20.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bedtools' }", pattern: "*coverage_metrics.txt", mode: 'copy' + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' // note: version difference over other bedtools modules. The 2.23.0 container was failing to parse bed target file. + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*coverage_metrics.txt"), emit: qc_metrics + + shell: + ''' + ### total bases (B) that map/align to the on-target (OT) region + bases_on_target=$(coverageBed -a !{params.target_gatk} -b !{bam} | awk '{if($7>0) total+=$7}END{print total}') + + ### Total length covered by BAM alignment. 
+ total_bases_covered=$(genomeCoverageBed -ibam !{bam} -bg | awk '{if($4>0) total += ($3-$2)}END{print total}') + + ## Bot / Btot: write the on-target percentage as the first line of the metrics file + awk -v a="$bases_on_target" -v b="$total_bases_covered" 'BEGIN { printf "on_target_percent\\t%s\\n", (a/b)*100 }' > !{sampleID}_amplicon_coverage_metrics.txt + + ## Get the depth of 20% of the Average coverage + perc_mean=$(coverageBed -d -a !{params.target_gatk} -b !{bam} | awk '{if($7>0) total+=1;s+=$7}END{print (s/total)*.2}') + + ### total capture array bases + total_target_bases=$(awk -F'\\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}' !{params.target_gatk}) + + ### compute total bases that exceed 20% coverage in capture target region, and calculate coverage uniformity + coverageBed -d -a !{params.target_gatk} -b !{bam} | awk -v percmean=$perc_mean -v totalbases=$total_target_bases '{if($7>percmean) total+=1;s+=$7}END{print "coverage_uniformity\\t"(total/totalbases)*100}' >> !{sampleID}_amplicon_coverage_metrics.txt + + ''' +} + + +/* +Calculations Per: https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/application-note/primerclip-a-tool-for-trimming-primer-sequences-application-note.pdf?sfvrsn=cf83e107_14 +*/ \ No newline at end of file diff --git a/modules/bedtools/bedtools_calc_pbc_metrics.nf b/modules/bedtools/bedtools_calc_pbc_metrics.nf index f16a81d5..6d046b99 100644 --- a/modules/bedtools/bedtools_calc_pbc_metrics.nf +++ b/modules/bedtools/bedtools_calc_pbc_metrics.nf @@ -4,6 +4,7 @@ process CALC_PBC_METRICS { cpus 4 memory 20.GB time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bedtools' }", pattern: "*.pbc.qc", mode: 'copy' container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' @@ -15,7 +16,6 @@ process CALC_PBC_METRICS { tuple val(sampleID), file("*.pbc.qc") shell: - log.info "----- Calculate PBC Metrics on ${sampleID} -----" ''' { # try @@ -24,9 +24,9 @@ process CALC_PBC_METRICS { -i !{tmp_bams[0]} \ | awk 'BEGIN{OFS="\\t"}{print $1,$2,$4,$6,$9,$10}' \ | grep -v 'MT' | sort | uniq -c \ - | awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} \ + | awk 'BEGIN{mt=0;m0=0;m1=0;m2=0;sample=!{sampleID}}($1==1){m1=m1+1} \ ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} \ - END{printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n", mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}' \ + END{printf "SAMPLEID\\tMT\\tM0\\tM1\\tM2\\tNRF\\tPBC1\\tPBC2\\n!{sampleID}\\t%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}' \ > !{sampleID}.pbc.qc } || { # catch diff --git a/modules/bedtools/bedtools_feature_count2bed.nf b/modules/bedtools/bedtools_feature_count2bed.nf index 6ae39a51..9baeef91 100644 --- a/modules/bedtools/bedtools_feature_count2bed.nf +++ b/modules/bedtools/bedtools_feature_count2bed.nf @@ -4,7 +4,7 @@ process FEATURE_COUNT2BED { cpus 1 memory 4.GB time = '04:00:00' - + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'bash-utils' }", pattern: "*_peaks_countMatrix.mm10.bed", mode: 'copy' container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' @@ -16,7 +16,6 @@ process FEATURE_COUNT2BED { tuple val(sampleID), file("*_peaks_countMatrix.mm10.bed") shell: - log.info "----- Feature Count to Bed on ${sampleID} -----" ''' tail -n +3 !{peak_cnt_matrx} \ | awk -F $'\\t' 'BEGIN {OFS = FS} { print $2, $3, $4, $7, $6 }' \ diff --git a/modules/bedtools/bedtools_frip_reads_in_peaks.nf b/modules/bedtools/bedtools_frip_reads_in_peaks.nf index 6759bec2..79cb27b1 100644 --- a/modules/bedtools/bedtools_frip_reads_in_peaks.nf +++ b/modules/bedtools/bedtools_frip_reads_in_peaks.nf @@ -4,6 +4,7 @@ process FRIP_READS_IN_PEAKS { cpus 2 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' @@ -14,7 +15,6 @@ process FRIP_READS_IN_PEAKS { tuple val(sampleID), file("reads_in_peaks.tmp.ba*") script: - log.info "----- Fraction of reads in peaks (FRiP) on ${sampleID} -----" """ bedtools sort \ -i ${narrow_peaks} \ diff --git a/modules/bedtools/bedtools_genomecov.nf b/modules/bedtools/bedtools_genomecov.nf new file mode 100644 index 00000000..fbf99ee4 --- /dev/null +++ b/modules/bedtools/bedtools_genomecov.nf @@ -0,0 +1,35 @@ +process BEDTOOLS_GENOMECOV { + tag "$sampleID" + + cpus 2 + memory 4.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples' : 'immuno_precip_samples') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+'/'+sampleID+'/bigwig' : 'bedtools'}" + }, pattern: "*.txt", mode: 'copy' + + + container 'quay.io/jaxcompsci/bedtools-sv_refs:2.30.0--hc088bd4_0' + + + input: + tuple val(sampleID), path(bam), path(flagstat) + + output: + tuple val(sampleID), path("*.bedGraph"), emit: bedgraph + tuple val(sampleID), path("*.txt"), emit: scale_factor + + + script: + pe_fragment = params.read_type == 'SE' ? '' : '-pc' + extend = (params.read_type == 'SE' && params.fragment_size > 0) ? "-fs ${params.fragment_size}" : '' + """ + SCALE_FACTOR=\$(grep '[0-9] mapped (' $flagstat | awk '{print 1000000/\$1}') + echo \$SCALE_FACTOR > ${sampleID}.scale_factor.txt + + bedtools genomecov -ibam ${bam[0]} -bg -scale \$SCALE_FACTOR $pe_fragment $extend | sort -T '.' -k1,1 -k2,2n > ${sampleID}.bedGraph + """ + +} diff --git a/modules/bedtools/bedtools_start_candidates.nf b/modules/bedtools/bedtools_start_candidates.nf new file mode 100644 index 00000000..82a90606 --- /dev/null +++ b/modules/bedtools/bedtools_start_candidates.nf @@ -0,0 +1,28 @@ +process BEDTOOLS_STARTCANDIDATES { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + + """ + bedtools \ + intersect \ + -header \ + -a ${vcf} \ + -b ${params.intervalListBed} \ + -v \ + > ${sampleID}_startCand_merged_${chrom}.vcf +""" +} \ No newline at end of file diff --git a/modules/biqseq2/bicseq2_normalize.nf b/modules/biqseq2/bicseq2_normalize.nf new file mode 100644 index 00000000..718e4b90 --- /dev/null +++ b/modules/biqseq2/bicseq2_normalize.nf @@ -0,0 +1,102 @@ +process BICSEQ2_NORMALIZE { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v3' + // publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'biqseq2' }", pattern:".txt", mode:'copy' + + input: + tuple val(sampleID), path(individual_chr_seq_files), val(meta), val(read_ID), val(read_length), val(insert_size) + val(fasta_file_list) + + output: + tuple val(sampleID), path("*.norm.bin.txt"), val(meta), val(read_ID), emit: normalized_output + + script: + + // fasta and mappability are set file lists. mappability is set by by read length of sample. + // tempSeqs are the .seq files from the prior step. + // tempnormpaths are the output bins. + // bicseq2config file is a file with just the list of chroms. + // sampleID is sampleID + // out-file is the configuration file used in the next step. + + // `bicseq2_config_writer` will sort lists by chromosome name, and omit invalid chr names. + // Chromosome names in file names must have `chr` in the name. OR the bicseq2config file must be changed to exclude it. + + fasta_files = fasta_file_list.collect { "$it" }.join(' ') + + if( read_length == '100' || read_length == '101') { + mappability_path = params.mappability_directory + '/100' + } else if( read_length == '125') { + mappability_path = params.mappability_directory + '/125' + } else if( read_length == '150' || read_length == '151' ) { + mappability_path = params.mappability_directory + '/151' + } else if( read_length == '250') { + mappability_path = params.mappability_directory + '/250' + } else { + log.info("\nUnsupported read length " + read_length + " in BicSeq2 normalization. 
This step is about to fail gracefully.\n\n") + mappability_path = 'error' + } + + seq_file_list = individual_chr_seq_files.collect { "$it" }.join(' ') + + + """ + if [ "${mappability_path}" = "error" ]; then exit 1; fi + + mappability_file_list=`echo ${mappability_path}` + + python3 \ + ${projectDir}/bin/pta/bicseq2_config_writer.py \ + --fa-files ${fasta_files} \ + --mappability-directory ${mappability_path} \ + --temp-seqs ${seq_file_list} \ + --norm-bicseq2-config ${params.bicseq2_chromList} \ + --sample-id ${read_ID} \ + --out-file configuration_file.txt + + rounded_length=`echo ${insert_size} | awk '{print int(\$1+0.5)}'` + + /NBICseq-norm_v0.2.4/NBICseq-norm.pl \ + -l=${read_length} \ + -s=\${rounded_length} \ + -fig=${sampleID}.GCvsRD.pdf \ + -tmp=${sampleID}.tmp \ + configuration_file.txt \ + ${sampleID}.params.out + """ + + stub: + """ + touch ${read_ID}_chr1.norm.bin.txt + touch ${read_ID}_chr2.norm.bin.txt + touch ${read_ID}_chr3.norm.bin.txt + touch ${read_ID}_chr4.norm.bin.txt + touch ${read_ID}_chr5.norm.bin.txt + touch ${read_ID}_chr6.norm.bin.txt + touch ${read_ID}_chr7.norm.bin.txt + touch ${read_ID}_chr8.norm.bin.txt + touch ${read_ID}_chr9.norm.bin.txt + touch ${read_ID}_chr10.norm.bin.txt + touch ${read_ID}_chr11.norm.bin.txt + touch ${read_ID}_chr12.norm.bin.txt + touch ${read_ID}_chr13.norm.bin.txt + touch ${read_ID}_chr14.norm.bin.txt + touch ${read_ID}_chr15.norm.bin.txt + touch ${read_ID}_chr16.norm.bin.txt + touch ${read_ID}_chr17.norm.bin.txt + touch ${read_ID}_chr18.norm.bin.txt + touch ${read_ID}_chr19.norm.bin.txt + touch ${read_ID}_chr20.norm.bin.txt + touch ${read_ID}_chr21.norm.bin.txt + touch ${read_ID}_chr22.norm.bin.txt + touch ${read_ID}_chrX.norm.bin.txt + touch ${read_ID}_chrY.norm.bin.txt + """ + +} \ No newline at end of file diff --git a/modules/biqseq2/bicseq2_seg.nf b/modules/biqseq2/bicseq2_seg.nf new file mode 100644 index 00000000..770e248d --- /dev/null +++ b/modules/biqseq2/bicseq2_seg.nf @@ -0,0 +1,53 @@ +process BICSEQ2_SEG { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v3' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'biqseq2' }", pattern:"{*.txt,*.png}", mode:'copy' + + input: + tuple val(sampleID), file(individual_normal_norm_bin_files), file(individual_tumor_norm_bin_files), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*.bicseq2.png"), val('no_idx'), val(meta), val(normal_name), val(tumor_name), val('bicseq2'), emit: bicseq2_png + tuple val(sampleID), file("*.bicseq2.txt"), val('no_idx'), val(meta), val(normal_name), val(tumor_name), val('bicseq2'), emit: bicseq2_sv_calls + + script: + + normal_norm_list = individual_normal_norm_bin_files.collect { "$it" }.join(' ') + tumor_norm_list = individual_tumor_norm_bin_files.collect { "$it" }.join(' ') + + scale = params.bicseq2_no_scaling ? 
"--noscale" : "" + + """ + + python3 \ + ${projectDir}/bin/pta/bicseq2_seg_config_writer.py \ + --normal-norms ${normal_norm_list} \ + --tumor-norms ${tumor_norm_list} \ + --seg-bicseq2-config ${params.bicseq2_chromList} \ + --out-file configuration_file.txt \ + --pair-id ${sampleID} + + perl /NBICseq-seg_v0.7.2/NBICseq-seg.pl \ + --control \ + --tmp ${sampleID} \ + --fig ${sampleID}.bicseq2.png \ + --title ${sampleID} \ + --lambda 4 \ + ${scale} \ + configuration_file.txt \ + ${sampleID}.bicseq2.txt + + """ + + stub: + """ + touch ${sampleID}.bicseq2.png + touch ${sampleID}.bicseq2.txt + """ +} diff --git a/modules/biqseq2/bicseq2_seg_unpaired.nf b/modules/biqseq2/bicseq2_seg_unpaired.nf new file mode 100644 index 00000000..032e13ef --- /dev/null +++ b/modules/biqseq2/bicseq2_seg_unpaired.nf @@ -0,0 +1,55 @@ +process BICSEQ2_SEG_UNPAIRED { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v3' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'biqseq2' }", pattern:"{*.txt,*.png}", mode:'copy' + + input: + tuple val(sampleID), file(individual_tumor_norm_bin_files), val(meta), val(tumor_name) + + output: + tuple val(sampleID), file("*.bicseq2.png"), val('no_idx'), val(meta), val(params.na12878_sampleName), val(tumor_name), val('bicseq2'), emit: bicseq2_png + tuple val(sampleID), file("*.bicseq2.txt"), val('no_idx'), val(meta), val(params.na12878_sampleName), val(tumor_name), val('bicseq2'), emit: bicseq2_sv_calls + + script: + + tumor_norm_list = individual_tumor_norm_bin_files.collect { "$it" }.join(' ') + + scale = params.bicseq2_no_scaling ? "--noscale" : "" + + """ + + python3 \ + ${projectDir}/bin/pta/bicseq2_seg_config_writer_unpaired.py \ + --tumor-norms ${tumor_norm_list} \ + --seg-bicseq2-config ${params.bicseq2_chromList} \ + --out-file configuration_file.txt \ + --pair-id ${sampleID} + + perl /NBICseq-seg_v0.7.2/NBICseq-seg.pl \ + --tmp ${sampleID} \ + --fig ${sampleID}.bicseq2.png \ + --title ${sampleID} \ + --lambda 4 \ + ${scale} \ + configuration_file.txt \ + ${sampleID}.bicseq2.txt + + """ + + stub: + """ + touch ${sampleID}.bicseq2.png + touch ${sampleID}.bicseq2.txt + """ +} + + + + + diff --git a/modules/bismark/bismark_alignment.nf b/modules/bismark/bismark_alignment.nf index 4dc75b71..e365bad4 100644 --- a/modules/bismark/bismark_alignment.nf +++ b/modules/bismark/bismark_alignment.nf @@ -2,14 +2,13 @@ process BISMARK_ALIGNMENT { tag "$sampleID" cpus 20 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bismark:0.23.1--hdfd78af_0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/alignment' : 'bismark_align' }", pattern: "*.bam", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/alignment' : 'bismark_align' }", pattern: "*.bam", mode:'copy', enabled: params.keep_intermediate publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bismark_align' }", pattern: "*report.txt", mode:'copy' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/alignment' : 'bismark_align' }", pattern: "*unmapped*", mode:'copy', enabled: params.keep_intermediate publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/alignment' : 'bismark_align' }", pattern: "*ambiguous*", mode:'copy', enabled: params.keep_intermediate @@ -24,7 +23,6 @@ process BISMARK_ALIGNMENT { tuple val(sampleID), file("*unmapped*"), emit: unmapped_reads script: - log.info "----- Bismark Alignment Running on: ${sampleID} -----" inputfq = params.read_type == 'PE' ? "-1 ${fq_reads[0]} -2 ${fq_reads[1]}" : "-1 ${fq_reads[0]}" directionality = params.non_directional ? '--non_directional': '' diff --git a/modules/bismark/bismark_deduplication.nf b/modules/bismark/bismark_deduplication.nf index 86dc5527..e9e0a466 100644 --- a/modules/bismark/bismark_deduplication.nf +++ b/modules/bismark/bismark_deduplication.nf @@ -2,10 +2,9 @@ process BISMARK_DEDUPLICATION { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bismark:0.23.1--hdfd78af_0' @@ -20,7 +19,6 @@ process BISMARK_DEDUPLICATION { tuple val(sampleID), file("*report.txt"), emit: dedup_report script: - log.info "----- Bismark Deduplication Running on: ${sampleID} -----" fq_type = params.read_type == 'PE' ? '-p' : '-s' diff --git a/modules/bismark/bismark_methylation_extraction.nf b/modules/bismark/bismark_methylation_extraction.nf index df661e59..774ab6b0 100644 --- a/modules/bismark/bismark_methylation_extraction.nf +++ b/modules/bismark/bismark_methylation_extraction.nf @@ -2,10 +2,9 @@ process BISMARK_METHYLATION_EXTRACTION { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bismark:0.23.1--hdfd78af_0' @@ -21,7 +20,6 @@ process BISMARK_METHYLATION_EXTRACTION { tuple val(sampleID), file("*.{png,gz}"), emit: extractor_png_gz script: - log.info "----- Bismark Methylation Extractor Running on: ${sampleID} -----" comprehensive = params.comprehensive ? '--comprehensive --merge_non_CpG' : '' cytosine_report = params.cytosine_report ? 
"--cytosine_report --genome_folder ${params.ref_fa_index}" : '' diff --git a/modules/bowtie2/bowtie2_align_trimmed_fastq.nf b/modules/bowtie2/bowtie2_align_trimmed_fastq.nf index 28cd63f1..a9c9a621 100644 --- a/modules/bowtie2/bowtie2_align_trimmed_fastq.nf +++ b/modules/bowtie2/bowtie2_align_trimmed_fastq.nf @@ -4,6 +4,7 @@ process ALIGN_TRIMMED_FASTQ { cpus 16 memory 30.GB time '48:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'bowtie2' }", pattern: "*.log", mode: 'copy' container 'biocontainers/bowtie2:v2.4.1_cv1' @@ -16,7 +17,6 @@ process ALIGN_TRIMMED_FASTQ { tuple val(sampleID), file("*_bowtie2.log"), emit: bowtie_log script: - log.info "----- Bowtie2 Running on: ${sampleID} -----" String options = params.bowtieVSensitive == 'true' ? '--very-sensitive' : '' """ bowtie2 \ diff --git a/modules/bwa/bwa_mem.nf b/modules/bwa/bwa_mem.nf index 14681487..dc679a54 100644 --- a/modules/bwa/bwa_mem.nf +++ b/modules/bwa/bwa_mem.nf @@ -2,14 +2,17 @@ process BWA_MEM { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bwakit:0.7.17.dev1--hdfd78af_1' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'bwa_mem' }", pattern: "*.sam", mode:'copy', enabled: params.keep_intermediate + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID : 'bwa_mem'}" + }, pattern: "*.sam", mode: 'copy', enabled: params.keep_intermediate + input: tuple val(sampleID), file(fq_reads), file(read_groups) @@ -18,7 +21,6 @@ process BWA_MEM { tuple val(sampleID), file("*.sam"), emit: sam script: - log.info "----- BWA-MEM Alignment Running on: ${sampleID} -----" if (params.read_type == "SE"){ inputfq="${fq_reads[0]}" @@ -27,9 +29,11 @@ process BWA_MEM { inputfq="${fq_reads[0]} ${fq_reads[1]}" } + score = params.bwa_min_score ? "-T ${params.bwa_min_score}" : '' + split_hits = params.workflow == "chipseq" ? "-M" : '' """ rg=\$(cat $read_groups) bwa mem -R \${rg} \ - -t $task.cpus ${params.mismatch_penalty} ${params.ref_fa_indices} $inputfq > ${sampleID}.sam + -t $task.cpus $split_hits ${params.mismatch_penalty} $score ${params.ref_fa_indices} $inputfq > ${sampleID}.sam """ -} \ No newline at end of file +} diff --git a/modules/bwa/bwa_mem_hla.nf b/modules/bwa/bwa_mem_hla.nf index 569772d4..4dfbdb88 100644 --- a/modules/bwa/bwa_mem_hla.nf +++ b/modules/bwa/bwa_mem_hla.nf @@ -2,10 +2,9 @@ process BWA_MEM_HLA { tag "$sampleID" cpus 8 - memory {60.GB * task.attempt} - time {30.hour * task.attempt} - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 30.hour + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bwakit:0.7.17.dev1--hdfd78af_1' @@ -18,7 +17,6 @@ process BWA_MEM_HLA { tuple val(sampleID), file("*.bam"), emit: bam script: - log.info "----- BWA-MEM Alignment Running on: ${sampleID} -----" if (params.read_type == "SE"){ inputfq="${fq_reads[0]}" diff --git a/modules/conpair/conpair.nf b/modules/conpair/conpair.nf new file mode 100644 index 00000000..93a1e92d --- /dev/null +++ b/modules/conpair/conpair.nf @@ -0,0 +1,31 @@ +process CONPAIR { + tag "$pairName" + + cpus 1 + memory 4.GB + time '10:00:00' + container 'quay.io/jaxcompsci/conpair:v0.2' + errorStrategy 'ignore' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$pairName" : 'conpair' }", pattern:"*.txt", mode:'copy' + + input: + tuple val(sampleID), val(pairName), file(tumor_pileup), file(normal_pileup) + + output: + tuple val(pairName), file("*_concordance.txt"), emit: concordance + tuple val(pairName), file("*_contamination.txt"), emit: contamination + + script: + """ + python2 /Conpair-0.2/scripts/verify_concordance.py -T ${tumor_pileup} -N ${normal_pileup} --outfile ${pairName}_concordance.txt -M /Conpair-0.2/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover.txt + + python2 /Conpair-0.2/scripts/estimate_tumor_normal_contamination.py -T ${tumor_pileup} -N ${normal_pileup} --outfile ${pairName}_contamination.txt -M /Conpair-0.2/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover.txt + """ + + stub: + """ + touch ${pairName}_concordance.txt + touch ${pairName}_contamination.txt + """ +} diff --git a/modules/conpair/conpair_pileup.nf b/modules/conpair/conpair_pileup.nf new file mode 100644 index 00000000..c267776f --- /dev/null +++ b/modules/conpair/conpair_pileup.nf @@ -0,0 +1,24 @@ +process CONPAIR_PILEUP { + tag "$sampleName" + + cpus 1 + memory 4.GB + time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/conpair:v0.2' + + input: + tuple val(sampleID), val(sampleName), file(bam), file(bai) + val(type) + + output: + tuple val(sampleID), val(sampleName), file("*pileup.txt"), emit: pileup + + script: + """ + python2 /Conpair-0.2/scripts/run_gatk_pileup_for_sample.py -B ${bam} -O ${sampleName}_${type}_pileup.txt --reference ${params.ref_fa} --markers /Conpair-0.2/data/markers/GRCh38.autosomes.phase3_shapeit2_mvncall_integrated.20130502.SNV.genotype.sselect_v4_MAF_0.4_LD_0.8.liftover.bed + """ +} + +// Marker file inputs: `--markers /Conpair-0.2/data/markers/...` are located in the container. 
\ No newline at end of file diff --git a/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf new file mode 100644 index 00000000..ab00911f --- /dev/null +++ b/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline.nf @@ -0,0 +1,30 @@ +process COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE { + tag "$sampleID" + + cpus 1 + memory 5.GB + time 1.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_resistance_mutations.py \ + ${params.cosmic_cancer_resistance_muts} \ + ${vcf} \ + ${sampleID}_germline_snv_indel_annotated_supplemental.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672933745&Signature=nQ9AFGONT4rDKfM4UZ1cmN4J%2F%2BM%3D" --output CosmicResistanceMutations.tsv.gz \ No newline at end of file diff --git a/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic.nf b/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic.nf new file mode 100644 index 00000000..74f00643 --- /dev/null +++ b/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic.nf @@ -0,0 +1,30 @@ +process COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC { + tag "$sampleID" + + cpus 1 + memory 40.GB + time 20.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*_somatic_vep_cosmic_cancerResitMut_annotated.vcf"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_resistance_mutations.py \ + ${params.cosmic_cancer_resistance_muts} \ + ${vcf} \ + ${sampleID}_somatic_vep_cosmic_cancerResitMut_annotated.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/CosmicResistanceMutations.tsv.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672933745&Signature=nQ9AFGONT4rDKfM4UZ1cmN4J%2F%2BM%3D" --output CosmicResistanceMutations.tsv.gz \ No newline at end of file diff --git a/modules/cosmic/cosmic_annotation.nf b/modules/cosmic/cosmic_annotation.nf new file mode 100644 index 00000000..43e4066a --- /dev/null +++ b/modules/cosmic/cosmic_annotation.nf @@ -0,0 +1,30 @@ +process COSMIC_ANNOTATION { + tag "$sampleID" + + cpus 1 + memory 1.GB + time 5.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_gene_census.py \ + ${params.cosmic_cgc} \ + ${vcf} \ + ${sampleID}_germline_vep_cosmic_annotated.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/cancer_gene_census.csv +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/cancer_gene_census.csv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672931317&Signature=PK8YAGC%2Bh9veZqc7mIZzywkOSf0%3D" --output cancer_gene_census.csv diff --git a/modules/cosmic/cosmic_annotation_somatic.nf b/modules/cosmic/cosmic_annotation_somatic.nf new file mode 100644 index 00000000..84a70435 --- /dev/null +++ b/modules/cosmic/cosmic_annotation_somatic.nf @@ -0,0 +1,30 @@ +process COSMIC_ANNOTATION_SOMATIC { + tag "$sampleID" + + cpus 1 + memory 40.GB + time 20.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*_somatic_vep_cosmic_annotated.vcf"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_cancer_gene_census.py \ + ${params.cosmic_cgc} \ + ${vcf} \ + ${sampleID}_somatic_vep_cosmic_annotated.vcf + """ +} + +// cosmic for 'pta' pipeline comes from: +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/cancer_gene_census.csv +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/cancer_gene_census.csv?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672931317&Signature=PK8YAGC%2Bh9veZqc7mIZzywkOSf0%3D" --output cancer_gene_census.csv diff --git a/modules/cutadapt/cutadapt_trim_fastq.nf b/modules/cutadapt/cutadapt_trim_fastq.nf index 9ecff88e..6077524a 100644 --- a/modules/cutadapt/cutadapt_trim_fastq.nf +++ b/modules/cutadapt/cutadapt_trim_fastq.nf @@ -2,6 +2,7 @@ process TRIM_FASTQ { cpus 8 memory 10.GB time '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'cutadapt' }", pattern: "*.log", mode: 'copy' container 'quay.io/biocontainers/cutadapt:2.3--py37h14c3975_0' @@ -14,7 +15,6 @@ process TRIM_FASTQ { tuple val(sampleID), file("*.log"), emit: cutadapt_log script: - log.info "----- Cutadapt Running on: ${sampleID} -----" paired_end = params.read_type == 'PE' ? "-p ${sampleID}_R2_paired_trimmed.fq" : '' diff --git a/modules/deeptools/deeptools_bam_coverage_bigwig.nf b/modules/deeptools/deeptools_bam_coverage_bigwig.nf index 3a10bfc4..fbc267c1 100644 --- a/modules/deeptools/deeptools_bam_coverage_bigwig.nf +++ b/modules/deeptools/deeptools_bam_coverage_bigwig.nf @@ -4,8 +4,9 @@ process BAM_COVERAGE_BIGWIG { cpus 8 memory 10.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'deeptools' }", pattern: "*.bigwig", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/deeptools' : 'deeptools' }", pattern: "*.bigwig", mode: 'copy' container 'quay.io/biocontainers/deeptools:3.3.2--py_1' input: @@ -15,7 +16,6 @@ process BAM_COVERAGE_BIGWIG { tuple val(sampleID), file("*.bigwig") script: - log.info "----- Running deeptools bamCoverage bigwig on ${sampleID} -----" """ bamCoverage \ --numberOfProcessors $task.cpus \ diff --git a/modules/deeptools/deeptools_computematrix.nf b/modules/deeptools/deeptools_computematrix.nf new file mode 100644 index 00000000..c4ee39af --- /dev/null +++ b/modules/deeptools/deeptools_computematrix.nf @@ -0,0 +1,32 @@ +process DEEPTOOLS_COMPUTEMATRIX { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + input: + tuple val(sampleID), file(bigwig) + file(bed) + + output: + tuple val(sampleID), file("*.mat.gz") , emit: matrix + tuple val(sampleID), file("*.mat.tab"), emit: table + + script: + """ + computeMatrix scale-regions \\ + --regionsFileName $bed \\ + --scoreFileName $bigwig \\ + --outFileName ${sampleID}.computeMatrix.mat.gz \\ + --outFileNameMatrix ${sampleID}.computeMatrix.vals.mat.tab \\ + --regionBodyLength 1000 \\ + --beforeRegionStartLength 3000 \\ + --afterRegionStartLength 3000 \\ + --skipZeros \\ + --smartLabels \\ + --numberOfProcessors $task.cpus + """ +} diff --git a/modules/deeptools/deeptools_filter_remove_multi_sieve.nf b/modules/deeptools/deeptools_filter_remove_multi_sieve.nf index 7797a17a..c8d2eb32 100644 --- a/modules/deeptools/deeptools_filter_remove_multi_sieve.nf +++ b/modules/deeptools/deeptools_filter_remove_multi_sieve.nf @@ -4,6 +4,7 @@ process FILTER_REMOVE_MULTI_SIEVE { cpus 8 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/deeptools:3.3.2--py_1' @@ -14,7 +15,6 @@ process FILTER_REMOVE_MULTI_SIEVE { tuple val(sampleID), file("*.shift.tmp.ba*") script: - log.info "----- Running deeptools alignmentSieve on ${sampleID} -----" """ alignmentSieve \ --numberOfProcessors $task.cpus \ diff --git a/modules/deeptools/deeptools_plotfingerprint.nf b/modules/deeptools/deeptools_plotfingerprint.nf new file mode 100644 index 00000000..f6ba9eb5 --- /dev/null +++ b/modules/deeptools/deeptools_plotfingerprint.nf @@ -0,0 +1,38 @@ +process DEEPTOOLS_PLOTFINGERPRINT { + tag "${ip} vs ${control}" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/'+ip+'_vs_'+control+'/deeptools' : 'deeptools' }", pattern: "*.pdf", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/'+ip+'_vs_'+control+'/deeptools' : 'deeptools' }", pattern: "*.raw.txt", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
'immuno_precip_samples/'+ip+'_vs_'+control+'/deeptools' : 'deeptools' }", pattern: "*.qcmetrics.txt", mode: 'copy' + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), file(ipbam), val(control), file(controlbam), file(ipflagstat) + + output: + tuple val(ip), file("*.pdf"), emit : pdf + tuple val(ip), file("*.raw.txt"), emit : raw + tuple val(ip), file("*.qcmetrics.txt"), emit : qc + + + script: + extend = (params.read_type == 'SE' && params.fragment_size > 0) ? "--extendReads ${params.fragment_size}" : '' + """ + plotFingerprint \\ + --bamfiles ${ipbam[0]} ${controlbam[0]} \\ + --plotFile ${ip}.plotFingerprint.pdf \\ + $extend \\ + --labels $ip $control \\ + --outRawCounts ${ip}.plotFingerprint.raw.txt \\ + --outQualityMetrics ${ip}.plotFingerprint.qcmetrics.txt \\ + --skipZeros \\ + --JSDsample ${controlbam[0]} \\ + --numberOfProcessors $task.cpus \\ + --numberOfSamples $params.fingerprint_bins + """ +} diff --git a/modules/deeptools/deeptools_plotheatmap.nf b/modules/deeptools/deeptools_plotheatmap.nf new file mode 100644 index 00000000..aaab3770 --- /dev/null +++ b/modules/deeptools/deeptools_plotheatmap.nf @@ -0,0 +1,29 @@ +process DEEPTOOLS_PLOTHEATMAP { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/deeptools' : 'deeptools'}" + }, pattern: "*.pdf", mode: 'copy' + + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + input: + tuple val(sampleID), file(matrix) + + output: + tuple val(sampleID), path("*.pdf"), emit: pdf + tuple val(sampleID), path("*.tab"), emit: table + + script: + """ + plotHeatmap --matrixFile $matrix \\ + --outFileName ${sampleID}.plotHeatmap.pdf \\ + --outFileNameMatrix ${sampleID}.plotHeatmap.mat.tab + """ +} diff --git a/modules/deeptools/deeptools_plotprofile.nf b/modules/deeptools/deeptools_plotprofile.nf new file mode 100644 index 00000000..f179d517 --- /dev/null +++ b/modules/deeptools/deeptools_plotprofile.nf @@ -0,0 +1,30 @@ +process DEEPTOOLS_PLOTPROFILE { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/deeptools' : 'deeptools'}" + }, pattern: "*.pdf", mode: 'copy' + + + container 'quay.io/biocontainers/deeptools:3.3.2--py_1' + + + input: + tuple val(sampleID), file(matrix) + + output: + tuple val(sampleID), path("*.pdf"), emit: pdf + tuple val(sampleID), path("*.tab"), emit: table + + script: + """ + plotProfile --matrixFile ${sampleID}.computeMatrix.mat.gz \\ + --outFileName ${sampleID}.plotProfile.pdf \\ + --outFileNameData ${sampleID}.plotProfile.tab + """ +} diff --git a/modules/ensembl/varianteffectpredictor_germline.nf b/modules/ensembl/varianteffectpredictor_germline.nf new file mode 100644 index 00000000..0af31fa6 --- /dev/null +++ b/modules/ensembl/varianteffectpredictor_germline.nf @@ -0,0 +1,131 @@ +process VEP_GERMLINE { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'ensemblorg/ensembl-vep:release_109.3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vep' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx) + + output: + tuple val(sampleID), file("*_vep_annotated.vcf"), emit: vcf + + script: + + """ + vep \ + --input_file ${vcf} \ + --output_file ${sampleID}_germline_vep_annotated.vcf \ + --fork ${task.cpus} \ + --buffer_size 50000 \ + --format vcf \ + --no_stats \ + --no_escape \ + --offline \ + --assembly GRCh38 \ + --cache \ + --dir_cache ${params.vep_cache_directory} \ + --refseq \ + --max_af \ + --af \ + --af_1kg \ + --af_gnomad \ + --exclude_predicted \ + --fasta ${params.vep_fasta} \ + --symbol \ + --hgvs \ + --check_existing \ + --vcf \ + --pick_allele_gene \ + --dir_plugins ${params.vep_cache_directory}/Plugins \ + --plugin dbscSNV,${params.vep_cache_directory}/Plugins/dbscSNV1.1/dbscSNV1.1_GRCh38.txt.gz \ + --plugin MaxEntScan,${params.vep_cache_directory}/Plugins/maxentscan \ + --plugin dbNSFP,${params.vep_cache_directory}/Plugins/dbNSFP/dbNSFP4.3a_grch38.gz,${params.vep_cache_directory}/Plugins/dbNSFP_replacement_logic,REVEL_score,SIFT_pred,SIFT4G_pred,LRT_pred,MutationTaster_pred,MutationAssessor_pred,FATHMM_pred,PROVEAN_pred,MetaSVM_pred,PrimateAI_pred,fathmm-MKL_coding_pred,GERP++_RS,phyloP100way_vertebrate,CADD_phred,Polyphen2_HVAR_pred \ + --custom ${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicCodingMuts.vcf.gz,CosmicCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom ${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicNonCodingVariants.normal.vcf.gz,CosmicNonCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom ${params.vep_cache_directory}/annotations/04142020_NYGC_samples.vcf.gz,NYGC,vcf,exact,0,AF,Samples,AC_Het,AC_Hom \ + --custom ${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Overlap,vcf,overlap,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom ${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Exact,vcf,exact,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom ${params.vep_cache_directory}/annotations/gnomad_exomes_subset_final.vcf.gz,GnomadExomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/gnomad_genomes_subset_final.vcf.gz,GnomadGenomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/chd_genes.vcf.gz,CHD_GENES,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_evolving.vcf.gz,CHD_EVOLVING,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_whitelist.vcf.gz,chd_whitelist,vcf,overlap,0,END \ + --custom ${params.vep_cache_directory}/annotations/deep_intronic_whitelist_08132020.vcf.gz,INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/clinvar_deep_intronics_09012020.vcf.gz,CLINVAR_INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/mastermind_cited_variants_reference-2021.01.02-grch38_fixed-contigs.vcf.gz,mm,vcf,exact,0,GENE,HGVSG,MMCNT1,MMCNT2,MMCNT3,MMID3,MMURI3 \ + --custom ${params.vep_cache_directory}/annotations/spliceai_scores.hg38.sorted.vcf.gz,SPLICEAI,vcf,exact,0,DS_AG,DS_AL,DS_DG,DS_DL \ + --custom 
${params.vep_cache_directory}/annotations/pli_hg38.vcf.gz,PLI,vcf,overlap,0,pLI,mis_z \ + --custom ${params.vep_cache_directory}/annotations/domino_genes_38.vcf.gz,Domino,vcf,overlap,0,Domino_Score \ + --custom ${params.vep_cache_directory}/annotations/ar_extended.vcf.gz,AR,vcf,overlap,0,AR_GENE \ + --custom ${params.vep_cache_directory}/annotations/ACMG59_2017-09-28.vcf.gz,ACMG59,vcf,overlap,0,GENE,DISEASE \ + --custom ${params.vep_cache_directory}/annotations/dials_genes_b38.vcf.gz,DIALS,vcf,overlap,0,DIALS_GENE \ + --custom ${params.vep_cache_directory}/annotations/pgx_vep.vcf.gz,PGx,vcf,exact,0,pgx_rsid \ + --custom ${params.vep_cache_directory}/annotations/sema4_immuno_genes_b38.vcf.gz,IMMUNO,vcf,overlap,0,IMMUNO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_neuro_genes_b38.vcf.gz,NEURO,vcf,overlap,0,NEURO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_cardio_genes_b38.vcf.gz,CARDIO,vcf,overlap,0,CARDIO_Gene \ + --custom ${params.vep_cache_directory}/annotations/nygc_curation_b38.vcf.gz,N19,vcf,overlap,0,NYGC_CUR \ + --custom ${params.vep_cache_directory}/annotations/nygc_reported_variants_b38.vcf.gz,R19,vcf,overlap,0,NYGC_REPORTED_SAMPLE,NYGC_CLASS,NYGC_DISEASE + """ +} + +// NOTE: Many of the resources are hard coded based on those provided in: +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/annotate/variantEffectPredictor.wdl +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/config/fasta_references.json +// For VEP cache, dbNSFP and dbscSNV resources were rebuild using VEPv108 and as noted below. + +// VEP Cache setup: + +// singularity pull --name vep.sif docker://ensemblorg/ensembl-vep:release_108.2 +// singularity exec vep.sif INSTALL.pl -c /PATH_TO_VEP/vep -a cfp -s homo_sapiens_refseq -y GRCh38 -g dbNSFP,dbscSNV,MaxEntScan +// ln -sf homo_sapiens homo_sapiens_refseq + +// In the plugin directory: + +// dbNSFP: +// wget https://dbnsfp.s3.amazonaws.com/dbNSFP4.3a.zip +// unzip dbNSFP4.3a.zip +// zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h +// mkdir temp +// zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | sort -T temp -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP4.3a.gz +// tabix -s 1 -b 2 -e 2 dbNSFP4.3a.gz +// rm -rf temp dbNSFP4.3a_variant.chr* h dbNSFP4.3_gene.gz dbNSFP4.3_gene.complete.gz dbNSFP4.3a.zip search_dbNSFP43a.readme.pdf search_dbNSFP43a.class search_dbNSFP43a.jar + + +// dbscSNV: +// wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip +// unzip dbscSNV1.1.zip +// head -n1 dbscSNV1.1.chr1 > h2 +// mkdir temp2 +// cat dbscSNV1.1.chr* | grep -v ^chr | sort -T temp2 -k5,5 -k6,6n | cat h2 - | awk '$5 != "."' | bgzip -c > dbscSNV1.1_GRCh38.txt.gz +// tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz +// rm dbscSNV1.1.chr* dbscSNV1.1.zip h2 + +// wget http://hollywood.mit.edu/burgelab/maxent/download/fordownload.tar.gz +// gunzip fordownload.tar.gz +// tar -xvf fordownload.tar +// mkdir maxentscan +// mv fordownload ./maxentscan/ + + +// COSMIC: +// mkdir COSMIC_v97 +// echo "@jax.org:" | base64 +// +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672843877&Signature=TSsIiQodqoKS5skE1ziS49zEWSU%3D" --output CosmicCodingMuts.vcf.gz + +// curl -H "Authorization: Basic ADD AUTHORIZATION" 
https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672844121&Signature=4JkeRizNMg0pv%2FChw4QAl268dVw%3D" --output CosmicNonCodingVariants.normal.vcf.gz + + +// ALL REMAINING ANNOTATIONS: +// Note: remaining annotations come from: gs://nygc-resources-public/ensembl_vep/annotations.tar.gz \ No newline at end of file diff --git a/modules/ensembl/varianteffectpredictor_somatic.nf b/modules/ensembl/varianteffectpredictor_somatic.nf new file mode 100644 index 00000000..7c84aad0 --- /dev/null +++ b/modules/ensembl/varianteffectpredictor_somatic.nf @@ -0,0 +1,131 @@ +process VEP_SOMATIC { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'ensemblorg/ensembl-vep:release_109.3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vep' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx), val(meta), val(normal_name), val(tumor_name) + + output: + tuple val(sampleID), file("*_vep_annotated.vcf"), val(meta), val(normal_name), val(tumor_name), emit: vcf + + script: + + """ + vep \ + --input_file ${vcf} \ + --output_file ${sampleID}_somatic_vep_annotated.vcf \ + --fork ${task.cpus} \ + --buffer_size 50000 \ + --format vcf \ + --no_stats \ + --no_escape \ + --offline \ + --assembly GRCh38 \ + --cache \ + --dir_cache ${params.vep_cache_directory} \ + --refseq \ + --max_af \ + --af \ + --af_1kg \ + --af_gnomad \ + --exclude_predicted \ + --fasta ${params.vep_fasta} \ + --symbol \ + --hgvs \ + --check_existing \ + --vcf \ + --pick_allele_gene \ + --dir_plugins ${params.vep_cache_directory}/Plugins \ + --plugin dbscSNV,${params.vep_cache_directory}/Plugins/dbscSNV1.1/dbscSNV1.1_GRCh38.txt.gz \ + --plugin MaxEntScan,${params.vep_cache_directory}/Plugins/maxentscan \ + --plugin dbNSFP,${params.vep_cache_directory}/Plugins/dbNSFP/dbNSFP4.3a_grch38.gz,${params.vep_cache_directory}/Plugins/dbNSFP_replacement_logic,REVEL_score,SIFT_pred,SIFT4G_pred,LRT_pred,MutationTaster_pred,MutationAssessor_pred,FATHMM_pred,PROVEAN_pred,MetaSVM_pred,PrimateAI_pred,fathmm-MKL_coding_pred,GERP++_RS,phyloP100way_vertebrate,CADD_phred,Polyphen2_HVAR_pred \ + --custom ${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicCodingMuts.vcf.gz,CosmicCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom ${params.vep_cache_directory}/annotations/COSMIC_v97/CosmicNonCodingVariants.normal.vcf.gz,CosmicNonCoding,vcf,exact,0,GENOMIC_ID,LEGACY_ID,CNT,CDS,AA \ + --custom ${params.vep_cache_directory}/annotations/04142020_NYGC_samples.vcf.gz,NYGC,vcf,exact,0,AF,Samples,AC_Het,AC_Hom \ + --custom ${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Overlap,vcf,overlap,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom ${params.vep_cache_directory}/annotations/clinvar.vep.vcf.gz,CLN_Exact,vcf,exact,0,CLIN_ID,CLNSIG,CLNREVSTAT,CLNDN \ + --custom 
${params.vep_cache_directory}/annotations/gnomad_exomes_subset_final.vcf.gz,GnomadExomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/gnomad_genomes_subset_final.vcf.gz,GnomadGenomes,vcf,exact,0,AF,nhomalt \ + --custom ${params.vep_cache_directory}/annotations/chd_genes.vcf.gz,CHD_GENES,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_evolving.vcf.gz,CHD_EVOLVING,vcf,overlap,0,GENE \ + --custom ${params.vep_cache_directory}/annotations/chd_whitelist.vcf.gz,chd_whitelist,vcf,overlap,0,END \ + --custom ${params.vep_cache_directory}/annotations/deep_intronic_whitelist_08132020.vcf.gz,INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/clinvar_deep_intronics_09012020.vcf.gz,CLINVAR_INTRONIC,vcf,exact,0,INTRONIC \ + --custom ${params.vep_cache_directory}/annotations/mastermind_cited_variants_reference-2021.01.02-grch38_fixed-contigs.vcf.gz,mm,vcf,exact,0,GENE,HGVSG,MMCNT1,MMCNT2,MMCNT3,MMID3,MMURI3 \ + --custom ${params.vep_cache_directory}/annotations/spliceai_scores.hg38.sorted.vcf.gz,SPLICEAI,vcf,exact,0,DS_AG,DS_AL,DS_DG,DS_DL \ + --custom ${params.vep_cache_directory}/annotations/pli_hg38.vcf.gz,PLI,vcf,overlap,0,pLI,mis_z \ + --custom ${params.vep_cache_directory}/annotations/domino_genes_38.vcf.gz,Domino,vcf,overlap,0,Domino_Score \ + --custom ${params.vep_cache_directory}/annotations/ar_extended.vcf.gz,AR,vcf,overlap,0,AR_GENE \ + --custom ${params.vep_cache_directory}/annotations/ACMG59_2017-09-28.vcf.gz,ACMG59,vcf,overlap,0,GENE,DISEASE \ + --custom ${params.vep_cache_directory}/annotations/dials_genes_b38.vcf.gz,DIALS,vcf,overlap,0,DIALS_GENE \ + --custom ${params.vep_cache_directory}/annotations/pgx_vep.vcf.gz,PGx,vcf,exact,0,pgx_rsid \ + --custom ${params.vep_cache_directory}/annotations/sema4_immuno_genes_b38.vcf.gz,IMMUNO,vcf,overlap,0,IMMUNO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_neuro_genes_b38.vcf.gz,NEURO,vcf,overlap,0,NEURO_Gene \ + --custom ${params.vep_cache_directory}/annotations/sema4_cardio_genes_b38.vcf.gz,CARDIO,vcf,overlap,0,CARDIO_Gene \ + --custom ${params.vep_cache_directory}/annotations/nygc_curation_b38.vcf.gz,N19,vcf,overlap,0,NYGC_CUR \ + --custom ${params.vep_cache_directory}/annotations/nygc_reported_variants_b38.vcf.gz,R19,vcf,overlap,0,NYGC_REPORTED_SAMPLE,NYGC_CLASS,NYGC_DISEASE + """ +} + +// NOTE: Many of the resources are hard coded based on those provided in: +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/annotate/variantEffectPredictor.wdl +// https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/config/fasta_references.json +// For VEP cache, dbNSFP and dbscSNV resources were rebuild using VEPv108 and as noted below. 
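// For orientation, the --dir_cache / --dir_plugins / --custom flags above imply a cache directory
// laid out roughly as follows. The tree is inferred from the command itself; it is a sketch of the
// expected layout, not a definitive manifest, so adjust it to the local installation:
//
//   ${params.vep_cache_directory}/
//     homo_sapiens_refseq/              VEP cache itself (built as described in the setup below)
//     Plugins/
//       dbNSFP/dbNSFP4.3a_grch38.gz     (plus tabix index)
//       dbNSFP_replacement_logic
//       dbscSNV1.1/dbscSNV1.1_GRCh38.txt.gz
//       maxentscan/
//     annotations/
//       COSMIC_v97/CosmicCodingMuts.vcf.gz, CosmicNonCodingVariants.normal.vcf.gz
//       clinvar.vep.vcf.gz, gnomad_*_subset_final.vcf.gz, spliceai_scores.hg38.sorted.vcf.gz,
//       and the remaining bgzipped/indexed VCFs named in the --custom flags above.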
+ +// VEP Cache setup: + +// singularity pull --name vep.sif docker://ensemblorg/ensembl-vep:release_108.2 +// singularity exec vep.sif INSTALL.pl -c /PATH_TO_VEP/vep -a cfp -s homo_sapiens_refseq -y GRCh38 -g dbNSFP,dbscSNV,MaxEntScan +// ln -sf homo_sapiens homo_sapiens_refseq + +// In the plugin directory: + +// dbNSFP: +// wget https://dbnsfp.s3.amazonaws.com/dbNSFP4.3a.zip +// unzip dbNSFP4.3a.zip +// zcat dbNSFP4.3a_variant.chr1.gz | head -n1 > h +// mkdir temp +// zgrep -h -v ^#chr dbNSFP4.3a_variant.chr* | sort -T temp -k1,1 -k2,2n - | cat h - | bgzip -c > dbNSFP4.3a.gz +// tabix -s 1 -b 2 -e 2 dbNSFP4.3a.gz +// rm -rf temp dbNSFP4.3a_variant.chr* h dbNSFP4.3_gene.gz dbNSFP4.3_gene.complete.gz dbNSFP4.3a.zip search_dbNSFP43a.readme.pdf search_dbNSFP43a.class search_dbNSFP43a.jar + + +// dbscSNV: +// wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbscSNV1.1.zip +// unzip dbscSNV1.1.zip +// head -n1 dbscSNV1.1.chr1 > h2 +// mkdir temp2 +// cat dbscSNV1.1.chr* | grep -v ^chr | sort -T temp2 -k5,5 -k6,6n | cat h2 - | awk '$5 != "."' | bgzip -c > dbscSNV1.1_GRCh38.txt.gz +// tabix -s 5 -b 6 -e 6 -c c dbscSNV1.1_GRCh38.txt.gz +// rm dbscSNV1.1.chr* dbscSNV1.1.zip h2 + +// wget http://hollywood.mit.edu/burgelab/maxent/download/fordownload.tar.gz +// gunzip fordownload.tar.gz +// tar -xvf fordownload.tar +// mkdir maxentscan +// mv fordownload ./maxentscan/ + + +// COSMIC: +// mkdir COSMIC_v97 +// echo "@jax.org:" | base64 +// +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicCodingMuts.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672843877&Signature=TSsIiQodqoKS5skE1ziS49zEWSU%3D" --output CosmicCodingMuts.vcf.gz + +// curl -H "Authorization: Basic ADD AUTHORIZATION" https://cancer.sanger.ac.uk/cosmic/file_download/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz +// the above command provides a URL for curl download +// curl "https://cog.sanger.ac.uk/cosmic/GRCh38/cosmic/v97/VCF/CosmicNonCodingVariants.normal.vcf.gz?AWSAccessKeyId=KRV7P7QR9DL41J9EWGA2&Expires=1672844121&Signature=4JkeRizNMg0pv%2FChw4QAl268dVw%3D" --output CosmicNonCodingVariants.normal.vcf.gz + + +// ALL REMAINING ANNOTATIONS: +// Note: remaining annotations come from: gs://nygc-resources-public/ensembl_vep/annotations.tar.gz \ No newline at end of file diff --git a/modules/fastq-tools/fastq-pair.nf b/modules/fastq-tools/fastq-pair.nf new file mode 100644 index 00000000..c09e4c9d --- /dev/null +++ b/modules/fastq-tools/fastq-pair.nf @@ -0,0 +1,23 @@ +process FASTQ_PAIR { + tag "$sampleID" + + cpus 1 + memory 50.GB + time { reads[0].size() < 35.GB ? 10.h : 18.h } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fastq-pair:1.0--h87f3376_3' + + input: + tuple val(sampleID), file(reads) + + output: + tuple val(sampleID), file("*.paired.fq"), emit: paired_fastq + tuple val(sampleID), file("*.single.fq"), emit: single_fastq + + script: + + """ + fastq_pair ${reads[0]} ${reads[1]} + """ +} diff --git a/modules/fastq-tools/fastq-sort.nf b/modules/fastq-tools/fastq-sort.nf new file mode 100644 index 00000000..e147835e --- /dev/null +++ b/modules/fastq-tools/fastq-sort.nf @@ -0,0 +1,28 @@ +process FASTQ_SORT { + + tag "$sampleID" + + cpus 1 + memory 50.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fastq-tools:0.8.3--hbd632db_2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/deconvoluted_reads': 'deconvoluted_reads' }", pattern: "*.fastq", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(reads) + val(suffix) + + output: + tuple val(sampleID), file("*sorted*{1,2}.fastq"), emit: sorted_fastq + + script: + command_two = params.read_type == 'PE' ? "fastq-sort --id ${reads[1]} > ${sampleID}_sorted_${suffix}_2.fastq" : '' + + """ + fastq-sort --id ${reads[0]} > ${sampleID}_sorted_${suffix}_1.fastq + ${command_two} + """ +} diff --git a/modules/fastqc/fastqc.nf b/modules/fastqc/fastqc.nf index 85ffa2db..af472cd4 100644 --- a/modules/fastqc/fastqc.nf +++ b/modules/fastqc/fastqc.nf @@ -5,9 +5,15 @@ process FASTQC { cpus 8 memory 4.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/fastqc:0.11.9--hdfd78af_1' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'fastqc' }", pattern: "*_fastqc.{zip,html}", mode:'copy' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? 'fastqc/' : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/stats' : 'fastqc'}" + }, pattern: "*_fastqc.{zip,html}", mode: 'copy' + input: tuple val(sampleID), file(fq_reads) @@ -17,9 +23,23 @@ process FASTQC { script: - log.info "----- FASTQC Running on: ${sampleID} -----" + if (params.workflow == "chipseq" && params.read_type == 'SE') + """ + [ ! -f ${sampleID}.fastq.gz ] && ln -s ${fq_reads} ${sampleID}.fastq.gz + + fastqc --quiet -t ${task.cpus} ${sampleID}.fastq.gz + """ + else if (params.workflow == "chipseq" && params.read_type == 'PE') + """ + [ ! -f ${sampleID}_1.fastq.gz ] && ln -s ${fq_reads[0]} ${sampleID}_1.fastq.gz + [ ! 
-f ${sampleID}_2.fastq.gz ] && ln -s ${fq_reads[1]} ${sampleID}_2.fastq.gz + fastqc --quiet -t ${task.cpus} ${sampleID}_1.fastq.gz + fastqc --quiet -t ${task.cpus} ${sampleID}_2.fastq.gz + """ + else """ fastqc --quiet -t ${task.cpus} ${fq_reads} """ + } diff --git a/modules/fusion_report/fusion_report.nf b/modules/fusion_report/fusion_report.nf new file mode 100644 index 00000000..db4f2776 --- /dev/null +++ b/modules/fusion_report/fusion_report.nf @@ -0,0 +1,36 @@ +process FUSION_REPORT { + tag "$sampleID" + + cpus 1 + memory 2.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fusion-report:2.1.5--pyhdfd78af_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/fusion-report/' : 'star-fusion' }", mode:'copy' + + input: + tuple val(sampleID), path(arriba), path(fusioncatcher), path(jaffa), path(pizzly), path(squid), path(starfusion) + + output: + tuple val(sampleID), file("${sampleID}_fusion_list.tsv"), emit: fusion_inspector_input_list + tuple val(sampleID), file("${sampleID}_fusion_genes_mqc.json"), emit: summary_fusions_mq + tuple val(sampleID), file("*"), emit: report + + script: + def extra_params = params.fusion_report_opt ? params.fusion_report_opt : '' + def tools = !arriba.empty() ? "--arriba ${arriba} " : '' + tools += !jaffa.empty() ? "--jaffa ${jaffa} " : '' + tools += !fusioncatcher.empty() ? "--fusioncatcher ${fusioncatcher} " : '' + tools += !pizzly.empty() ? "--pizzly ${pizzly} " : '' + tools += !squid.empty() ? "--squid ${squid} " : '' + tools += !starfusion.empty() ? "--starfusion ${starfusion} " : '' + + """ + fusion_report run ${sampleID} . ${params.databases} ${tools} ${extra_params} + mv fusion_list.tsv ${sampleID}_fusion_list.tsv + mv fusion_list_filtered.tsv ${sampleID}_fusion_list_filtered.tsv + mv fusion_genes_mqc.json ${sampleID}_fusion_genes_mqc.json + """ +} \ No newline at end of file diff --git a/modules/fusioncatcher/fusioncatcher.nf b/modules/fusioncatcher/fusioncatcher.nf new file mode 100644 index 00000000..26f03618 --- /dev/null +++ b/modules/fusioncatcher/fusioncatcher.nf @@ -0,0 +1,44 @@ +process FUSIONCATCHER { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/fusioncatcher:1.33--hdfd78af_4' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'fusioncatcher' }", pattern: "*.{tsv,txt}", mode:'copy' + + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*_fusioncatcher_fusions.txt"), optional:true, emit: fusioncatcher_fusions + tuple val(sampleID), path("*_fusioncatcher_summary.txt"), optional:true, emit: fusioncatcher_summary + tuple val(sampleID), path("*_fusioncatcher.log"), emit: fusioncatcher_log + + script: + + def input_reads = reads.toString().replace(" ", ",") + + """ + fusioncatcher.py \\ + -d ${params.fusioncatcher_ref} \\ + -i ${input_reads} \\ + -p ${task.cpus} \\ + -o . \\ + --skip-blat \\ + --limitSjdbInsertNsj ${params.fusioncatcher_limitSjdbInsertNsj} + + mv final-list_candidate-fusion-genes.txt ${sampleID}_fusioncatcher_fusions.txt + mv summary_candidate_fusions.txt ${sampleID}_fusioncatcher_summary.txt + mv fusioncatcher.log ${sampleID}_fusioncatcher.log + + """ + + + +} diff --git a/modules/g2gtools/g2gtools_chain_convert_peak.nf b/modules/g2gtools/g2gtools_chain_convert_peak.nf index 7202eb24..748184d9 100644 --- a/modules/g2gtools/g2gtools_chain_convert_peak.nf +++ b/modules/g2gtools/g2gtools_chain_convert_peak.nf @@ -4,6 +4,7 @@ process CHAIN_CONVERT { cpus 1 memory 10.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/g2gtools:0.1.31' @@ -13,13 +14,12 @@ process CHAIN_CONVERT { tuple val(sampleID), file(bam_shifted) output: - tuple val(sampleID), file("*.tmp.mm10.ba*") - tuple val(sampleID), file("*g2gconvert.log") + tuple val(sampleID), file("*.tmp.mm10.bam"), emit: converted_bam + tuple val(sampleID), file("*g2gconvert.log"), emit: log when: params.chain != null script: - log.info "----- Converting Coordinates to Reference on ${sampleID} -----" """ g2gtools convert \ -r -f bam -c ${params.chain} \ diff --git a/modules/gatk/gatk3_applyrecalibration.nf b/modules/gatk/gatk3_applyrecalibration.nf new file mode 100644 index 00000000..4c27da26 --- /dev/null +++ b/modules/gatk/gatk3_applyrecalibration.nf @@ -0,0 +1,38 @@ +process GATKv3_5_ApplyRecalibration { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"$sampleID" : 'gatk' }", pattern: "*.vcf", mode:'copy' + + input: + tuple val(sampleID), file(normal_germline_vcf) + tuple val(sampleID), file(normal_germline_vcf_index) + tuple val(sampleID), file(normal_germline_recal) + tuple val(sampleID), file(normal_germline_tranches) + + output: + tuple val(sampleID), file("*.*recalibrated.filtered.vcf"), emit: normal_germline_recalibrated_vcf + tuple val(sampleID), file("*.*recalibrated.filtered.vcf.idx"), emit: normal_germline_recalibrated_vcf_index + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar GenomeAnalysisTK.jar \ + -T ApplyRecalibration \ + -R ${params.ref_fa} \ + -input ${sampleID}_variants_raw.vcf \ + --ts_filter_level 99.6 \ + -tranchesFile ${sampleID}.tranches.txt \ + -recalFile ${sampleID}.recal.txt \ + -mode SNP + -o ${sampleID}_variants_raw.recalibrated.filtered.vcf + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk3_genotypegvcf.nf b/modules/gatk/gatk3_genotypegvcf.nf new file mode 100644 index 00000000..3eef231b --- /dev/null +++ b/modules/gatk/gatk3_genotypegvcf.nf @@ -0,0 +1,33 @@ +process GATKv3_5_GENOTYPEGVCF { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "sampleID" : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(normal_germline_gvcf) + tuple val(sampleID), file(normal_germline_gvcf_index) + + output: + tuple val(sampleID), file("*.*vcf"), emit: normal_germline_vcf + tuple val(sampleID), file("*.vcf.idx"), emit: normal_germline_vcf_index + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar GenomeAnalysisTK.jar \ + -T GenotypeGVCFs \ + -R ${params.ref_fa} \ + --variant ${sampleID}_variants_raw.gvcf \ + -o ${sampleID}_variants_raw.vcf + """ +} + \ No newline at end of file diff --git a/modules/gatk/gatk3_haplotypecaller.nf b/modules/gatk/gatk3_haplotypecaller.nf new file mode 100644 index 00000000..2e6982e1 --- /dev/null +++ b/modules/gatk/gatk3_haplotypecaller.nf @@ -0,0 +1,36 @@ +process GATKv3_5_HAPLOTYPECALLER { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
"$sampleID" : 'gatk' }", pattern: "*.gvcf", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai) + + output: + tuple val(sampleID), path("*.gvcf"), emit: normal_germline_gvcf + tuple val(sampleID), path("*.gvcf.idx"), emit: normal_germline_gvcf_index + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar /usr/GenomeAnalysisTK.jar \ + -T HaplotypeCaller \ + -R ${params.ref_fa} \ + -I ${normal_bam} \ + -o ${sampleID}_variants_raw.gvcf \ + -L ${params.target_gatk} \ + -stand_call_conf ${params.call_val} \ + -ERC GVCF \ + -variant_index_type LINEAR \ + -variant_index_parameter 128000 + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_indelrealigner.nf b/modules/gatk/gatk3_indelrealigner.nf similarity index 76% rename from modules/gatk/gatk_indelrealigner.nf rename to modules/gatk/gatk3_indelrealigner.nf index a4da1172..84eb22d8 100644 --- a/modules/gatk/gatk_indelrealigner.nf +++ b/modules/gatk/gatk3_indelrealigner.nf @@ -5,6 +5,7 @@ process GATK_INDELREALIGNER{ cpus = 1 memory = 35.GB time = '08:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} // Command Depricated in GATK 4 container 'broadinstitute/gatk3:3.6-0' @@ -21,7 +22,6 @@ process GATK_INDELREALIGNER{ tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- GATK IndelRealigner Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/gatk/gatk_realignertargetcreator.nf b/modules/gatk/gatk3_realignertargetcreator.nf similarity index 72% rename from modules/gatk/gatk_realignertargetcreator.nf rename to modules/gatk/gatk3_realignertargetcreator.nf index 6e9cf465..06c4cf79 100644 --- a/modules/gatk/gatk_realignertargetcreator.nf +++ b/modules/gatk/gatk3_realignertargetcreator.nf @@ -5,6 +5,7 @@ process GATK_REALIGNERTARGETCREATOR { cpus = 12 memory = 35.GB time = '12:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk3:3.6-0' @@ -17,7 +18,6 @@ process GATK_REALIGNERTARGETCREATOR { tuple val(sampleID), file("*.intervals"), emit: intervals script: - log.info "----- GATK RealignerTargetCreator Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/gatk/gatk_variantannotator.nf b/modules/gatk/gatk3_variantannotator.nf similarity index 63% rename from modules/gatk/gatk_variantannotator.nf rename to modules/gatk/gatk3_variantannotator.nf index 4abf05ba..f9e1c3fe 100644 --- a/modules/gatk/gatk_variantannotator.nf +++ b/modules/gatk/gatk3_variantannotator.nf @@ -4,12 +4,13 @@ process GATK_VARIANTANNOTATOR { cpus 1 memory 15.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} // Legacy Reasons Leave as GATK3 (public) // Flag --snpEffFile was removed in GATK4 container 'broadinstitute/gatk3:3.6-0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.gen_org=='mouse' ? true : params.keep_intermediate input: tuple val(sampleID), file(sample_vcf), file(snpeff_vcf) @@ -18,7 +19,6 @@ process GATK_VARIANTANNOTATOR { tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- GATK VariantAnnotator Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk3_variantrecalibrator.nf b/modules/gatk/gatk3_variantrecalibrator.nf new file mode 100644 index 00000000..76e547a6 --- /dev/null +++ b/modules/gatk/gatk3_variantrecalibrator.nf @@ -0,0 +1,42 @@ +process GATKv3_5_VARIANTRECALIBRATOR { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk3:3.5-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$sampleID" : 'gatk' }", pattern: "*.txt", mode:'copy' + + input: + tuple val(sampleID), file(normal_germline_vcf) + tuple val(sampleID), file(normal_germline_vcf_index) + + output: + tuple val(sampleID), file("*.*recal.txt"), emit: normal_germline_recal + tuple val(sampleID), file("*.*tranches.txt"), emit: normal_germline_tranches + tuple val(sampleID), file("*.*plot.R.txt"), emit: normal_germline_plot_R + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + java -Djava.io.tmpdir=$TMPDIR -Xmx${my_mem}G -jar /usr/GenomeAnalysisTK.jar \ + -T VariantRecalibrator \ + -R ${params.ref_fa} \ + -input ${normal_germline_vcf} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \ + -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \ + -resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_phase1.snps.high_confidence.vcf \ + -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \ + -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an InbreedingCoeff \ + -mode SNP \ + -tranche 99.6 \ + -recalFile ${sampleID}.recal.txt \ + -tranchesFile ${sampleID}.tranches.txt \ + -rscriptFile ${sampleID}.plots.R.txt + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_applybqsr.nf b/modules/gatk/gatk_applybqsr.nf index 3dc4face..dcefbdec 100644 --- a/modules/gatk/gatk_applybqsr.nf +++ b/modules/gatk/gatk_applybqsr.nf @@ -2,14 +2,13 @@ process GATK_APPLYBQSR { tag "$sampleID" cpus = 1 - memory = {40.GB * task.attempt} + memory = 40.GB time = '12:00:00' - errorStrategy 'retry' - maxRetries 1 + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'gatk' }", pattern: "*.bam", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'gatk' }", pattern: "*.ba*", mode:'copy' input: tuple val(sampleID), file(bam), file(table) @@ -19,7 +18,6 @@ process GATK_APPLYBQSR { tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- GATK ApplyBQSR Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_baserecalibrator.nf b/modules/gatk/gatk_baserecalibrator.nf index a52450e4..35e299ec 100644 --- a/modules/gatk/gatk_baserecalibrator.nf +++ b/modules/gatk/gatk_baserecalibrator.nf @@ -2,10 +2,9 @@ process GATK_BASERECALIBRATOR { tag "$sampleID" cpus = 1 - memory = {40.GB * task.attempt} - time = '12:00:00' - errorStrategy 'retry' - maxRetries 1 + memory { bam.size() < 60.GB ? 40.GB : 80.GB } + time { bam.size() < 60.GB ? '12:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' @@ -18,7 +17,6 @@ process GATK_BASERECALIBRATOR { tuple val(sampleID), file("*.table"), emit: table script: - log.info "----- GATK BaseRecalibrator Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_chain_extract_badreads.nf b/modules/gatk/gatk_chain_extract_badreads.nf index ac89330d..0e3c9d5f 100644 --- a/modules/gatk/gatk_chain_extract_badreads.nf +++ b/modules/gatk/gatk_chain_extract_badreads.nf @@ -4,11 +4,12 @@ process CHAIN_EXTRACT_BADREADS { cpus 2 memory 4.GB time = '04:00:00' + errorStrategy { [0,3,4].contains(task.exitStatus) ? 'ignore' : 'terminate' } publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'gatk' }", pattern: "*.log", mode: 'copy' + container 'broadinstitute/gatk:4.2.4.1' - errorStrategy { [0,3,4].contains(task.exitStatus) ? 'ignore' : 'terminate' } input: tuple val(sampleID), file(bam_sort_mm10) @@ -20,7 +21,6 @@ process CHAIN_EXTRACT_BADREADS { when: params.chain != null script: - log.info "----- Extracting a list of 'bad reads' on ${sampleID} -----" """ gatk ValidateSamFile \ -I ${bam_sort_mm10[0]} \ diff --git a/modules/gatk/gatk_chain_filter_reads.nf b/modules/gatk/gatk_chain_filter_reads.nf index 09782b2c..e012b3c7 100644 --- a/modules/gatk/gatk_chain_filter_reads.nf +++ b/modules/gatk/gatk_chain_filter_reads.nf @@ -4,6 +4,7 @@ process CHAIN_FILTER_READS { cpus 2 memory 4.GB time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'gatk' }", pattern: "*.log", mode: 'copy' container 'broadinstitute/gatk:4.2.4.1' @@ -13,13 +14,12 @@ process CHAIN_FILTER_READS { output: - tuple val(sampleID), file("*.tmp2.mm10.ba*") + tuple val(sampleID), path("*.tmp2.mm10.bam"), emit: bam tuple val(sampleID), file("*_FilterSamReads.log"), emit: filterReads_log when: params.chain != null script: - log.info "----- Filtering list to unique name on ${sampleID} -----" """ gatk FilterSamReads \ -I ${bam_sort_mm10[0]} \ diff --git a/modules/gatk/gatk_cnnscorevariants.nf b/modules/gatk/gatk_cnnscorevariants.nf new file mode 100644 index 00000000..9602e8b1 --- /dev/null +++ b/modules/gatk/gatk_cnnscorevariants.nf @@ -0,0 +1,31 @@ +process GATK_CNNSCORE_VARIANTS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(vcf_index), path(interval), val(index) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + tuple val(sampleID), file("*.idx"), emit: idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" CNNScoreVariants \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_${index}_haplotypecaller.annotated.vcf \ + -L ${interval} + """ +} diff --git a/modules/gatk/gatk_combinegvcfs.nf b/modules/gatk/gatk_combinegvcfs.nf new file mode 100644 index 00000000..2de1ae18 --- /dev/null +++ b/modules/gatk/gatk_combinegvcfs.nf @@ -0,0 +1,33 @@ +process GATK_COMBINEGVCFS { + tag "$sampleID" + + cpus 1 + memory 10.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.gvcf", mode:'copy' + + input: + tuple val(sampleID), path(gvcf) + + output: + tuple val(sampleID), file("*.gvcf"), emit: gvcf + tuple val(sampleID), file("*.idx"), emit: idx + + script: + // memory needs to be set explicitly + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = gvcf.collect { "--variant $it" }.join(' ') + + """ + gatk --java-options "-Xmx${my_mem}G" CombineGVCFs \ + -R ${params.ref_fa} \ + ${inputs} \ + -O ${sampleID}_GATKcombined_raw.gvcf + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_depthofcoverage.nf b/modules/gatk/gatk_depthofcoverage.nf index 6d7eec4c..e699d77d 100644 --- a/modules/gatk/gatk_depthofcoverage.nf +++ b/modules/gatk/gatk_depthofcoverage.nf @@ -5,9 +5,9 @@ process GATK_DEPTHOFCOVERAGE { cpus 1 memory 15.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' - file(params.ref_fai) input: tuple val(sampleID), file(bam), file(bai) @@ -17,7 +17,6 @@ process GATK_DEPTHOFCOVERAGE { tuple val(sampleID), file("*_gatk_temp.txt"), emit: txt script: - log.info "----- GATK Depth of Coverage Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_filtermutectcalls.nf b/modules/gatk/gatk_filtermutectcalls.nf new file mode 100644 index 00000000..deaa9f27 --- /dev/null +++ b/modules/gatk/gatk_filtermutectcalls.nf @@ -0,0 +1,32 @@ +process GATK_FILTERMUECTCALLS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gatk' }", pattern: "*_mutect2_somatic.filtered.vcf.gz", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/stats' : 'gatk' }", pattern: "*.filteringStats.tsv", mode:'copy' + + input: + tuple val(sampleID), path(vcf), path(tbi), val(meta), val(normal_name), val(tumor_name), val(tool), path(stats) + + output: + tuple val(sampleID), file("*_mutect2_somatic.filtered.vcf.gz"), file("*_mutect2_somatic.filtered.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('mutect2'), emit: mutect2_vcf_tbi + tuple val(sampleID), file("*.filteringStats.tsv"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + """ + gatk --java-options "-Xmx${my_mem}G" FilterMutectCalls \ + -R ${params.ref_fa} \ + -V ${vcf} \ + --stats ${stats} \ + -O ${sampleID}_mutect2_somatic.filtered.vcf.gz + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_filtermutectcalls_tumorOnly.nf b/modules/gatk/gatk_filtermutectcalls_tumorOnly.nf new file mode 100644 index 00000000..8de1c636 --- /dev/null +++ b/modules/gatk/gatk_filtermutectcalls_tumorOnly.nf @@ -0,0 +1,32 @@ +process GATK_FILTERMUECTCALLS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.4.0.0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*_mutect2_somatic.filtered.vcf.gz", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'gatk' }", pattern: "*.filteringStats.tsv", mode:'copy' + + input: + tuple val(sampleID), path(vcf), path(tbi), path(stats) + + output: + tuple val(sampleID), file("*_mutect2_somatic.filtered.vcf.gz"), file("*_mutect2_somatic.filtered.vcf.gz.tbi"), emit: mutect2_vcf_tbi + tuple val(sampleID), file("*.filteringStats.tsv"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + """ + gatk --java-options "-Xmx${my_mem}G" FilterMutectCalls \ + -R ${params.ref_fa} \ + -V ${vcf} \ + --stats ${stats} \ + -O ${sampleID}_mutect2_somatic.filtered.vcf.gz + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_filtervarianttranches.nf b/modules/gatk/gatk_filtervarianttranches.nf new file mode 100644 index 00000000..4ecd68dd --- /dev/null +++ b/modules/gatk/gatk_filtervarianttranches.nf @@ -0,0 +1,39 @@ +process GATK_FILTER_VARIANT_TRANCHES { + // This modules is a port of the NYGC germline filtering scheme found at this site: + // https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/germline/germline.wdl?at=7.4.0 + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '01:30:00' + errorStrategy 'ignore' + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(vcf_index) + + output: + tuple val(sampleID), file("*.*vcf"), file("*.idx"), emit: vcf_idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" FilterVariantTranches \ + -V ${vcf} \ + -O ${sampleID}_haplotypecaller.gatk.filtered.genotypedGVCFs.vcf \ + --snp-tranche 99.9 --snp-tranche 99.95 \ + --indel-tranche 99.0 --indel-tranche 99.4 \ + --resource ${params.hapmap} \ + --resource ${params.omni} \ + --resource ${params.phase1_1000G} \ + --resource ${params.dbSNP} \ + --info-key CNN_1D \ + --create-output-variant-index true + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_genotype_gvcf.nf b/modules/gatk/gatk_genotype_gvcf.nf new file mode 100644 index 00000000..a5c9e238 --- /dev/null +++ b/modules/gatk/gatk_genotype_gvcf.nf @@ -0,0 +1,30 @@ +process GATK_GENOTYPE_GVCF { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '01:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(vcf_index), path(interval), val(index) + + output: + tuple val(sampleID), file("*.*vcf"), file("*.idx"), path(interval), val(index), emit: vcf_idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" GenotypeGVCFs \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_${index}_genotypedGVCFs.vcf \ + -L ${interval} + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_getsamplename.nf b/modules/gatk/gatk_getsamplename.nf new file mode 100644 index 00000000..8b41dbd5 --- /dev/null +++ b/modules/gatk/gatk_getsamplename.nf @@ -0,0 +1,25 @@ +process GATK_GETSAMPLENAME { + tag "$sampleID" + + cpus = 1 + memory = 1.GB + time = '00:05:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + input: + tuple val(sampleID), val(meta), file(bam), file(bai) + + output: + tuple val(sampleID), stdout, emit: sample_name + + script: + """ + gatk GetSampleName \ + -I ${bam} \ + -O sample_name.txt + + cat sample_name.txt + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_getsamplename_noMeta.nf b/modules/gatk/gatk_getsamplename_noMeta.nf new file mode 100644 index 00000000..28ced711 --- /dev/null +++ b/modules/gatk/gatk_getsamplename_noMeta.nf @@ -0,0 +1,25 @@ +process GATK_GETSAMPLENAME { + tag "$sampleID" + + cpus = 1 + memory = 1.GB + time = '00:05:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + input: + tuple val(sampleID), file(bam), file(bai) + + output: + tuple val(sampleID), stdout, emit: sample_name + + script: + """ + gatk GetSampleName \ + -I ${bam} \ + -O sample_name.txt + + cat sample_name.txt + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_haplotypecaller.nf b/modules/gatk/gatk_haplotypecaller.nf index 5506970c..a39b222f 100644 --- a/modules/gatk/gatk_haplotypecaller.nf +++ b/modules/gatk/gatk_haplotypecaller.nf @@ -4,6 +4,7 @@ process GATK_HAPLOTYPECALLER { cpus = 1 memory = 15.GB time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' @@ -18,7 +19,6 @@ process GATK_HAPLOTYPECALLER { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK Haplotype Caller Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -27,7 +27,7 @@ process GATK_HAPLOTYPECALLER { output_suffix='gvcf' } else{ - delta="--dbsnp ${params.dbSNP} " + delta="--dbsnp ${params.dbSNP} -stand-call-conf ${params.call_val}" output_suffix='vcf' } @@ -38,7 +38,6 @@ process GATK_HAPLOTYPECALLER { -I ${bam} \ -O ${sampleID}_variants_raw.${output_suffix} \ -L ${params.target_gatk} \ - -stand-call-conf ${params.call_val} \ ${params.ploidy_val} \ ${delta} \ """ diff --git a/modules/gatk/gatk_haplotypecaller_interval.nf b/modules/gatk/gatk_haplotypecaller_interval.nf index 83555298..ea410bf7 100644 --- a/modules/gatk/gatk_haplotypecaller_interval.nf +++ b/modules/gatk/gatk_haplotypecaller_interval.nf @@ -4,28 +4,42 @@ process GATK_HAPLOTYPECALLER_INTERVAL { cpus = 1 memory = 15.GB - time = '05:30:00' + time 12.hour + errorStrategy 'finish' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + input: tuple val(sampleID), file(bam), file(bai), val(chrom) + val(gvcf) output: - tuple val(sampleID), file("*.vcf"), emit: vcf + tuple val(sampleID), file("*.*vcf"), emit: vcf tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK Haplotype Caller Running on Chromosome ${chrom} for sample: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] + + if (gvcf=='gvcf'){ + delta="-ERC GVCF" + output_suffix='gvcf' + } + else{ + delta="-stand-call-conf ${params.call_val}" + output_suffix='vcf' + } + """ gatk --java-options "-Xmx${my_mem}G" HaplotypeCaller \ -R ${params.ref_fa} \ -I ${bam} \ - -O ${sampleID}_HaplotypeCaller_${chrom}.vcf \ + -O ${sampleID}_HaplotypeCaller_${chrom}.${output_suffix} \ -L ${chrom} \ - -stand-call-conf ${params.call_val} + ${delta} \ """ } \ No newline at end of file diff --git a/modules/gatk/gatk_haplotypecaller_sv_germline.nf b/modules/gatk/gatk_haplotypecaller_sv_germline.nf new file mode 100644 index 00000000..022fbb74 --- /dev/null +++ b/modules/gatk/gatk_haplotypecaller_sv_germline.nf @@ -0,0 +1,38 @@ +process GATK_HAPLOTYPECALLER_SV_GERMLINE { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/callers' : 'gatk' }", pattern: "*.*vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(read_name), path(interval), val(index) + + output: + tuple val(sampleID), path("*.*vcf"), emit: vcf + tuple val(sampleID), path("*.idx"), emit: idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" HaplotypeCaller \ + -R ${params.ref_fa} \ + -I ${normal_bam} \ + -O ${sampleID}_${index}_variants_raw.gvcf \ + -L ${interval} \ + -XL ${params.excludeIntervalList} \ + -stand-call-conf ${params.call_val} \ + -G StandardAnnotation \ + -G StandardHCAnnotation \ + -G AS_StandardAnnotation \ + -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ + -ERC GVCF + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_indexfeaturefile.nf b/modules/gatk/gatk_indexfeaturefile.nf index 14c6f2eb..e5ff5a31 100644 --- a/modules/gatk/gatk_indexfeaturefile.nf +++ b/modules/gatk/gatk_indexfeaturefile.nf @@ -4,6 +4,7 @@ process GATK_INDEXFEATUREFILE { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' @@ -16,7 +17,6 @@ process GATK_INDEXFEATUREFILE { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK IndexFeatureFile Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ diff --git a/modules/gatk/gatk_mergemutectstats.nf b/modules/gatk/gatk_mergemutectstats.nf new file mode 100644 index 00000000..4ad056cf --- /dev/null +++ b/modules/gatk/gatk_mergemutectstats.nf @@ -0,0 +1,29 @@ + process GATK_MERGEMUTECTSTATS { + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + input: + tuple val(sampleID), path(list) + + output: + tuple val(sampleID), file("*.stats"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + stats = list.collect { "-stats $it" }.join(' ') + + """ + gatk --java-options "-Xmx${my_mem}G" MergeMutectStats \ + ${stats} \ + -O ${sampleID}_merged.stats + """ + } \ No newline at end of file diff --git a/modules/gatk/gatk_mergevcf.nf b/modules/gatk/gatk_mergevcf.nf index ebd56d6e..4537674a 100644 --- a/modules/gatk/gatk_mergevcf.nf +++ b/modules/gatk/gatk_mergevcf.nf @@ -4,6 +4,7 @@ process GATK_MERGEVCF { cpus 1 memory 15.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' @@ -11,12 +12,12 @@ process GATK_MERGEVCF { input: tuple val(sampleID), file(snp_vcf), file(indel_vcf) + val(suffix) output: tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- GATK MergeVcfs Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ @@ -24,6 +25,6 @@ process GATK_MERGEVCF { -R ${params.ref_fa} \ -I ${snp_vcf} \ -I ${indel_vcf} \ - -O ${sampleID}_GATKcombined.vcf + -O ${sampleID}_${suffix}.vcf """ } \ No newline at end of file diff --git a/modules/gatk/gatk_mergevcf_list.nf b/modules/gatk/gatk_mergevcf_list.nf index a544c238..f3150197 100644 --- a/modules/gatk/gatk_mergevcf_list.nf +++ b/modules/gatk/gatk_mergevcf_list.nf @@ -4,6 +4,7 @@ process GATK_MERGEVCF_LIST { cpus 1 memory 10.GB time '05:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' @@ -17,7 +18,6 @@ process GATK_MERGEVCF_LIST { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK MergeVcfs Running on: ${sampleID} -----" // memory needs to be set explicitly String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/gatk/gatk_mutect2.nf b/modules/gatk/gatk_mutect2.nf new file mode 100644 index 00000000..b1076eb1 --- /dev/null +++ b/modules/gatk/gatk_mutect2.nf @@ -0,0 +1,37 @@ +process GATK_MUTECT2 { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time 15.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/callers' : 'gatk' }", pattern: "*_somatic.vcf.gz", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), path(interval), val(interval_index) + + output: + tuple val(sampleID), path("*_somatic.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('mutect2'), emit: vcf + tuple val(sampleID), path("*_somatic.vcf.gz.tbi"), emit: tbi + tuple val(sampleID), path("*.stats"), emit: stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G -XX:ParallelGCThreads=${task.cpus}" Mutect2 \ + -R ${params.ref_fa} \ + -I ${tumor_bam} \ + -tumor ${tumor_name} \ + -I ${normal_bam} \ + -normal ${normal_name} \ + -L ${interval} \ + --native-pair-hmm-threads 4 \ + -O ${sampleID}_${interval_index}_somatic.vcf.gz + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_mutect2_tumorOnly.nf b/modules/gatk/gatk_mutect2_tumorOnly.nf new file mode 100644 index 00000000..b55e9c10 --- /dev/null +++ b/modules/gatk/gatk_mutect2_tumorOnly.nf @@ -0,0 +1,55 @@ +process GATK_MUTECT2 { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time 15.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.4.0.0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*_somatic.vcf.gz", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(tumor_bam), file(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), file("*_somatic.vcf.gz"), file("*_somatic.vcf.gz.tbi"), file("*.stats"), emit: vcf_tbi_stats + + script: + //Estimate somatic variants using Mutect2 + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G -XX:ParallelGCThreads=${task.cpus}" Mutect2 \ + -R ${params.ref_fa} \ + -I ${tumor_bam} \ + --germline-resource ${params.gnomad_ref} \ + --panel-of-normals ${params.pon_ref} \ + --genotype-germline-sites true \ + --genotype-pon-sites true \ + --pileup-detection \ + --dont-use-soft-clipped-bases false \ + -L ${params.target_gatk} \ + --native-pair-hmm-threads 4 \ + --annotation QualByDepth \ + --annotation RMSMappingQuality \ + --annotation FisherStrand \ + --annotation MappingQualityRankSumTest \ + --annotation ReadPosRankSumTest \ + --min-base-quality-score 20 \ + -O ${sampleID}_mutect2_somatic.vcf.gz + """ +} + +/* +As of v4.1, there is no longer a need to specify the tumor sample name with -tumor. You need only specify the normal sample name with -normal, if you include a normal. + +Starting with v4.0.4.0, GATK recommends the default setting of --af-of-alleles-not-in-resource, which the tool dynamically adjusts for different modes. +tumor-only calling sets the default to 5e-8, tumor-normal calling sets it to 1e-6 and mitochondrial mode sets it to 4e-3. +For previous versions, the default was 0.001, the average heterozygosity of humans. +For other organisms, change --af-of-alleles-not-in-resource to 1/(ploidy*samples in resource). 
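+For illustration only (the numbers below are assumptions, not pipeline defaults): with ploidy 2 and a germline resource built from 1000 samples, that rule of thumb gives 1/(2*1000) = 0.0005, which would be supplied as an additional argument, e.g.
+  gatk Mutect2 ... --af-of-alleles-not-in-resource 0.0005 ...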
+ +https://console.cloud.google.com/storage/browser/gatk-best-practices/somatic-hg38;tab=objects?prefix=&forceOnObjectsSortingFiltering=false +*/ \ No newline at end of file diff --git a/modules/gatk/gatk_selectvariants.nf b/modules/gatk/gatk_selectvariants.nf index 51fe520c..e6614882 100644 --- a/modules/gatk/gatk_selectvariants.nf +++ b/modules/gatk/gatk_selectvariants.nf @@ -4,21 +4,23 @@ process GATK_SELECTVARIANTS { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*filtered_dbsnpID.vcf", mode:'copy' input: tuple val(sampleID), file(vcf), file(idx) val(indel_snp) + val(suffix) output: tuple val(sampleID), file("*.vcf"), emit: vcf tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK Selectvariants Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ @@ -26,6 +28,6 @@ process GATK_SELECTVARIANTS { -R ${params.ref_fa} \ -V ${vcf} \ -select-type ${indel_snp} \ - -O ${sampleID}_selectedvariants_${indel_snp}.vcf + -O ${sampleID}_${suffix}.vcf """ } \ No newline at end of file diff --git a/modules/gatk/gatk_sortvcf_germline.nf b/modules/gatk/gatk_sortvcf_germline.nf new file mode 100644 index 00000000..b017944a --- /dev/null +++ b/modules/gatk/gatk_sortvcf_germline.nf @@ -0,0 +1,38 @@ +process GATK_SORTVCF_GERMLINE { + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '05:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + input: + tuple val(sampleID), path(list) + val(gvcf) + + output: + tuple val(sampleID), file("*.vcf"), file("*.idx"), emit: vcf_idx, optional: true + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = list.collect { "-I $it" }.join(' ') + + if (gvcf=='gvcf'){ + output_suffix='g.vcf' + } + else{ + output_suffix='vcf' + } + + """ + gatk --java-options "-Xmx${my_mem}G" SortVcf \ + -SD ${params.ref_fa_dict} \ + ${inputs} \ + -O ${sampleID}_merged.${output_suffix} + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_sortvcf_somatic_merge.nf b/modules/gatk/gatk_sortvcf_somatic_merge.nf new file mode 100644 index 00000000..0da71331 --- /dev/null +++ b/modules/gatk/gatk_sortvcf_somatic_merge.nf @@ -0,0 +1,30 @@ +process GATK_SORTVCF_SOMATIC { + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '05:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + input: + tuple val(sampleID), path(list), val(meta) + + output: + tuple val(sampleID), file("*.vcf"), file("*.idx"), val(meta), emit: vcf_idx, optional: true + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = list.collect { "-I $it" }.join(' ') + + """ + gatk --java-options "-Xmx${my_mem}G" SortVcf \ + -SD ${params.ref_fa_dict} \ + ${inputs} \ + -O ${sampleID}_mnv_final_filtered_merged.vcf + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_sortvcf_somatic_tools.nf b/modules/gatk/gatk_sortvcf_somatic_tools.nf new file mode 100644 index 00000000..a6ed490b --- /dev/null +++ b/modules/gatk/gatk_sortvcf_somatic_tools.nf @@ -0,0 +1,48 @@ +process GATK_SORTVCF { + + tag "$sampleID" + + cpus = 1 + memory = 15.GB + time = '05:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'lancet' }", pattern:"*_lancet_merged.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), path(list), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.tbi"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: vcf_tbi + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + inputs = list.collect { "-I $it" }.join(' ') + + if (tool == 'lancet_support') { + chrom_extract = (list =~ /\w+merged_(chr.+)_h.+/) + tool_name = "lancet_support_"+chrom_extract[0][1] + tool = chrom_extract[0][1] + // for final sort merge of lancet confirm, set 'tool_name' to include chrom. + // set tool to chrom. These steps are required for tuple build as input to final merge. + + } else { + tool_name = tool + } + + """ + gatk --java-options "-Xmx${my_mem}G" SortVcf \ + -SD ${params.ref_fa_dict} \ + ${inputs} \ + -O ${sampleID}_${tool_name}_merged.vcf + + bgzip -f -c ${sampleID}_${tool_name}_merged.vcf > ${sampleID}_${tool_name}_merged.vcf.gz + tabix ${sampleID}_${tool_name}_merged.vcf.gz + + """ +} + diff --git a/modules/gatk/gatk_variantfiltration.nf b/modules/gatk/gatk_variantfiltration.nf index 2b4ef3fc..4023d8e3 100644 --- a/modules/gatk/gatk_variantfiltration.nf +++ b/modules/gatk/gatk_variantfiltration.nf @@ -4,9 +4,12 @@ process GATK_VARIANTFILTRATION { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*SNP_INDEL_filtered_unannotated_final.vcf", mode:'copy' input: tuple val(sampleID), file(vcf), file(idx) @@ -17,7 +20,6 @@ process GATK_VARIANTFILTRATION { tuple val(sampleID), file("*.idx"), emit: idx script: - log.info "----- GATK VariantFiltration Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] if (indel_snp == 'INDEL'){ @@ -30,14 +32,14 @@ process GATK_VARIANTFILTRATION { } if (indel_snp == 'BOTH'){ fs = '60.0' - output_suffix = 'snp_indel_filtered.vcf' + output_suffix = 'SNP_INDEL_filtered_unannotated_final.vcf' } """ gatk --java-options "-Xmx${my_mem}G" VariantFiltration \ -R ${params.ref_fa} \ -V ${vcf} \ - -O ${sampleID}_variantfiltration_${output_suffix} \ + -O ${sampleID}_${output_suffix} \ --cluster-window-size 10 \ --filter-name "LowCoverage" --filter-expression "DP < 25" \ --filter-name "VeryLowQual" --filter-expression "QUAL < 30.0" \ diff --git a/modules/gatk/gatk_variantfiltration_af.nf b/modules/gatk/gatk_variantfiltration_af.nf new file mode 100644 index 00000000..1ab1df69 --- /dev/null +++ b/modules/gatk/gatk_variantfiltration_af.nf @@ -0,0 +1,54 @@ + +process GATK_VARIANTFILTRATION_AF { + // This modules is a port of the NYGC germline filtering scheme found at this site: + // https://bitbucket.nygenome.org/projects/WDL/repos/somatic_dna_wdl/browse/germline/germline.wdl?at=7.4.0 + + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx) + + output: + tuple val(sampleID), file("*haplotypecaller.gatk.af-gq-filtered.vcf"), emit: vcf + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + """ + ## Annotate FORMAT/AF + gatk --java-options "-Xmx${my_mem}G" VariantAnnotator \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_haplotypecaller.gatk.af.vcf.gz \ + -A AlleleFraction + + ## remove biallellic sites + zcat ${sampleID}_haplotypecaller.gatk.af.vcf.gz \ + | awk '(\$5 !~ ",")' \ + > ${sampleID}.biallellic.vcf + + ## Variant filtration + gatk --java-options "-Xmx${my_mem}G" VariantFiltration \ + -R ${params.ref_fa} \ + -V ${sampleID}.biallellic.vcf \ + -O ${sampleID}.haplotypecaller.af-gq-filtered.vcf.gz \ + --genotype-filter-name "AlleleFraction" \ + --genotype-filter-expression "(AF < 0.25 && AF > 0.0) || AF > 0.75" \ + --genotype-filter-name "GQ20" \ + --genotype-filter-expression "GQ < 20" + + ## filter with AF (deliver) + zcat ${sampleID}.haplotypecaller.af-gq-filtered.vcf.gz \ + | grep -v "AlleleFraction" \ + > ${sampleID}_haplotypecaller.gatk.af-gq-filtered.vcf + + """ +} \ No newline at end of file diff --git a/modules/gatk/gatk_variantfiltration_mutect2.nf b/modules/gatk/gatk_variantfiltration_mutect2.nf new file mode 100644 index 00000000..356b11ee --- /dev/null +++ b/modules/gatk/gatk_variantfiltration_mutect2.nf @@ -0,0 +1,46 @@ +process GATK_VARIANTFILTRATION { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(vcf), file(idx) + val(indel_snp) + + output: + tuple val(sampleID), file("*.vcf"), emit: vcf + tuple val(sampleID), file("*.idx"), emit: idx + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + if (indel_snp == 'INDEL'){ + fs='200.0' + output_suffix = 'INDEL_filtered.vcf' + } + if (indel_snp =='SNP'){ + fs ='60.0' + output_suffix = 'SNP_filtered.vcf' + } + if (indel_snp == 'BOTH'){ + fs = '60.0' + output_suffix = 'snp_indel_filtered.vcf' + } + + """ + gatk --java-options "-Xmx${my_mem}G" VariantFiltration \ + -R ${params.ref_fa} \ + -V ${vcf} \ + -O ${sampleID}_${output_suffix} \ + --cluster-window-size 10 \ + --filter-name "LowCoverage" --filter-expression "DP < 25" \ + --filter-name "StrandBias" --filter-expression "FS > ${fs}" + """ +} \ No newline at end of file diff --git a/modules/gridss/gridss_assemble.nf b/modules/gridss/gridss_assemble.nf new file mode 100644 index 00000000..d3611597 --- /dev/null +++ b/modules/gridss/gridss_assemble.nf @@ -0,0 +1,44 @@ +process GRIDSS_ASSEMBLE { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gridss:2.13.2-2_ln' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), val(gridss_preprocessed) + + output: + tuple val(sampleID), path('gridss_assemble/'), emit: gridss_assembly + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4]+'g' + + output_dir = 'gridss_assemble/' + + """ + # https://github.com/umccr/gridss-purple-linx-nf + # Create shadow directory with file symlinks of GRIDSS 'workingdir' to prevent NF cache invalidation (resume related) + # NOTE: for reasons that elude me, NF doesn't always stage in the workingdir; remove if it is present + mkdir -p "${output_dir}/work/" + lndir \$(readlink -f "${gridss_preprocessed}/") "${output_dir}/work" + if [[ -L "${gridss_preprocessed.name}" ]]; then + rm "${gridss_preprocessed}" + fi + + gridss \ + --jvmheap "${my_mem}" \ + --steps assemble \ + --reference "${params.combined_reference_set}" \ + --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \ + --threads ${task.cpus} \ + --workingdir "${output_dir}/work/" \ + --assembly ${output_dir}/${sampleID}.gridssassembly.bam \ + --picardoptions VALIDATION_STRINGENCY=LENIENT \ + ${normal_bam} ${tumor_bam} + """ +} \ No newline at end of file diff --git a/modules/gridss/gridss_calling.nf b/modules/gridss/gridss_calling.nf new file mode 100644 index 00000000..4b191c00 --- /dev/null +++ b/modules/gridss/gridss_calling.nf @@ -0,0 +1,47 @@ +process GRIDSS_CALLING { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gridss:2.13.2-2_ln' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/callers' : 'gridss' }", pattern: "*_gridss_sv.vcf.gz", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), val(gridss_assembled) + + output: + tuple val(sampleID), path('*_gridss_sv.vcf.gz'), val(meta), val(normal_name), val(tumor_name), emit: gridss_vcf + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4]+'g' + + output_dir = 'gridss_call/' + + """ + # https://github.com/umccr/gridss-purple-linx-nf + # Create shadow directory with file symlinks of GRIDSS 'workingdir' to prevent NF cache invalidation (resume related) + # NOTE: for reasons that elude me, NF doesn't always stage in the workingdir; remove if it is present + mkdir -p "${output_dir}" + lndir \$(readlink -f "${gridss_assembled}/") "${output_dir}/" + if [[ -L "${gridss_assembled.name}" ]]; then + rm "${gridss_assembled}" + fi + + gridss \ + --jvmheap "${my_mem}" \ + --steps call \ + --reference "${params.combined_reference_set}" \ + --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \ + --threads ${task.cpus} \ + --workingdir "${output_dir}/work/" \ + --assembly "${output_dir}/${sampleID}.gridssassembly.bam" \ + --output "${sampleID}_gridss_sv.vcf.gz" \ + --picardoptions VALIDATION_STRINGENCY=LENIENT \ + ${normal_bam} ${tumor_bam} + """ +} \ No newline at end of file diff --git a/modules/gridss/gridss_chrom_filter.nf b/modules/gridss/gridss_chrom_filter.nf new file mode 100644 index 00000000..88986086 --- /dev/null +++ b/modules/gridss/gridss_chrom_filter.nf @@ -0,0 +1,31 @@ +process GRIDSS_CHROM_FILTER { + tag "$sampleID" + + cpus = 1 + memory = 1.GB + time = '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/internal_tools:v1.0' + + stageInMode = 'copy' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gridss' }", pattern: "*_gridss_sv_unfiltered_chroms.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), path(vcf), val(meta), val(normal_name), val(tumor_name) + val(chroms) + + output: + tuple val(sampleID), path('*_gridss_sv_unfiltered_chroms.vcf'), val(meta), val(normal_name), val(tumor_name), emit: gridss_chrom_vcf + + script: + chrom_list = chroms.collect { "$it" }.join(' ') + + """ + python ${projectDir}/bin/pta/filter_vcf.py \ + --vcf-file ${vcf} \ + --output ${sampleID}_gridss_sv_unfiltered_chroms.vcf \ + --chroms ${chrom_list} + """ +} diff --git a/modules/gridss/gridss_preprocess.nf b/modules/gridss/gridss_preprocess.nf new file mode 100644 index 00000000..3d8449f3 --- /dev/null +++ b/modules/gridss/gridss_preprocess.nf @@ -0,0 +1,32 @@ +process GRIDSS_PREPROCESS { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gridss:2.13.2-2_ln' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path('gridss_preprocess/'), emit: gridss_preproc + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4]+'g' + """ + # https://github.com/umccr/gridss-purple-linx-nf + gridss \ + --jvmheap "${my_mem}" \ + --steps preprocess \ + --reference "${params.combined_reference_set}" \ + --jar /opt/gridss/gridss-2.13.2-gridss-jar-with-dependencies.jar \ + --threads ${task.cpus} \ + --workingdir gridss_preprocess/ \ + --picardoptions VALIDATION_STRINGENCY=LENIENT \ + ${normal_bam} ${tumor_bam} + """ +} \ No newline at end of file diff --git a/modules/gridss/gripss_somatic_filter.nf b/modules/gridss/gripss_somatic_filter.nf new file mode 100644 index 00000000..bfb9d5a9 --- /dev/null +++ b/modules/gridss/gripss_somatic_filter.nf @@ -0,0 +1,53 @@ + +process GRIPSS_SOMATIC_FILTER { + tag "$sampleID" + + cpus = 1 + memory = 5.GB + time = '01:00:00' + errorStrategy 'ignore' + + container 'quay.io/biocontainers/hmftools-gripss:2.3.2--hdfd78af_0' + + stageInMode = 'copy' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'gridss' }", pattern: "*gripss.filtered.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), path(vcf), val(meta), val(normal_name), val(tumor_name) + + + output: + tuple val(sampleID), path('*gripss.filtered.vcf.gz'),path('*gripss.filtered.vcf.gz.tbi'), val(meta), val(normal_name), val(tumor_name), val('gridss'), emit: gripss_filtered_bgz + //note: while this is "GRIPSS" filtering. + // GRIDSS was the caller and downstream + // scripts expect "gridss" as the tool name. + tuple val(sampleID), path('*.gripss.vcf.gz'), path('*.gripss.vcf.gz.tbi'), val(meta), val(normal_name), val(tumor_name), val('gridss'), emit: gripss_all_bgz + + script: + """ + gripss -Xmx5g \ + -sample ${tumor_name} \ + -reference ${normal_name} \ + -ref_genome_version 38 \ + -ref_genome ${params.ref_fa} \ + -pon_sgl_file ${params.gripss_pon}/sgl_pon.38.bed \ + -pon_sv_file ${params.gripss_pon}/sv_pon.38.bedpe \ + -known_hotspot_file ${params.gripss_pon}/known_fusions.38.bedpe \ + -repeat_mask_file ${params.gripss_pon}/repeat_mask_data.38.fa.gz \ + -vcf ${vcf} \ + -output_dir . 
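+
+ # GRIPSS names its outputs after the tumor sample passed to -sample;
+ # the renaming below moves them to sampleID-based filenames expected by downstream steps.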
+ + mv ${tumor_name}.gripss.filtered.vcf.gz ${sampleID}.gripss.filtered.vcf.gz + mv ${tumor_name}.gripss.filtered.vcf.gz.tbi ${sampleID}.gripss.filtered.vcf.gz.tbi + mv ${tumor_name}.gripss.vcf.gz ${sampleID}.gripss.vcf.gz + mv ${tumor_name}.gripss.vcf.gz.tbi ${sampleID}.gripss.vcf.gz.tbi + + """ + + stub: + """ + touch ${sampleID}_gripss.filtered.vcf.gz + touch ${sampleID}_gripss.filtered.vcf.gz.tbi + """ +} diff --git a/modules/homer/annotate_boolean_peaks.nf b/modules/homer/annotate_boolean_peaks.nf new file mode 100644 index 00000000..88badbaf --- /dev/null +++ b/modules/homer/annotate_boolean_peaks.nf @@ -0,0 +1,22 @@ +process ANNOTATE_BOOLEAN_PEAKS { + tag "${antibody}" + + cpus 1 + memory 5.GB + time '04:00:00' + + container 'ubuntu:20.04' + + input: + tuple val(antibody), path(boolean_txt), path(homer_peaks) + + output: + path '*.boolean.annotatePeaks.txt', emit: annotate_peaks_txt + + script: + prefix="\$(echo ${boolean_txt} | sed 's/.boolean.txt//g')" + """ + cut -f2- ${homer_peaks} | awk 'NR==1; NR > 1 {print \$0 | "sort -T '.' -k1,1 -k2,2n"}' | cut -f6- > tmp.txt + paste ${boolean_txt} tmp.txt > ${prefix}.boolean.annotatePeaks.txt + """ +} diff --git a/modules/homer/homer_annotatepeaks.nf b/modules/homer/homer_annotatepeaks.nf new file mode 100644 index 00000000..df2c7e68 --- /dev/null +++ b/modules/homer/homer_annotatepeaks.nf @@ -0,0 +1,40 @@ +process HOMER_ANNOTATEPEAKS { + tag "${run_tag}" + + cpus 2 + memory 10.GB + time '10:00:00' + + publishDir { + def type = "${ip}" ? "${'immuno_precip_samples/'+ip+'_vs_'+control+'/macs2'}" : "${'consensusCalling_'+antibody+'/macs2'}" + "${params.pubdir}/${ params.organize_by=='sample' ? type : 'macs2'}" + }, pattern: "*annotatePeaks.txt", mode: 'copy' + + container 'quay.io/biocontainers/homer:4.11--pl526hc9558a2_3' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), file(peak) + file(fasta) + file(gtf) + + when: + params.macs_gsize && !params.skip_peak_annotation + + output: + tuple val(tuple_tag), path("*annotatePeaks.txt"), emit: txt + + script: + prefix = peak =~ /bed/ ? "${antibody}.consensus_peaks" : "${ip}_peaks" + run_tag = ip ? "${ip} vs ${control}" : "${antibody}" + tuple_tag = ip ? ip : antibody + + """ + annotatePeaks.pl \\ + $peak \\ + $fasta \\ + -gid \\ + -gtf $gtf \\ + -cpu $task.cpus \\ + > ${prefix}.annotatePeaks.txt + """ +} diff --git a/modules/homer/plot_homer_annotatepeaks.nf b/modules/homer/plot_homer_annotatepeaks.nf new file mode 100644 index 00000000..eeeef450 --- /dev/null +++ b/modules/homer/plot_homer_annotatepeaks.nf @@ -0,0 +1,37 @@ +process PLOT_HOMER_ANNOTATEPEAKS { + + cpus 2 + memory 10.GB + time '10:00:00' + + + container 'quay.io/biocontainers/mulled-v2-ad9dd5f398966bf899ae05f8e7c54d0fb10cdfa7:05678da05b8e5a7a5130e90a9f9a6c585b965afa-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
'immuno_precip_samples/cross_sample_plots' : 'homer' }", pattern: "*.{pdf,txt}", mode: 'copy' + + input: + file(annos) + file(mqc_header) + val suffix //_peaks.annotatePeaks.txt + + when: + params.macs_gsize && !params.skip_peak_annotation && !params.skip_peak_qc + + output: + path '*.txt' , emit: txt + path '*.pdf' , emit: pdf + path '*.tsv' , emit: tsv + + script: // This script was bundled withing the nf-core/chipseq/bin/ directory + def prefix = "macs_annotatepeaks" + """ + ${projectDir}/bin/chipseq/plot_homer_annotatepeaks.r \\ + -i ${annos.join(',')} \\ + -s ${annos.join(',').replaceAll("${suffix}","")} \\ + -p $prefix \\ + -o ./ + + find ./ -type f -name "*summary.txt" -exec cat {} \\; | cat $mqc_header - > ${prefix}.summary_mqc.tsv + + """ +} diff --git a/modules/illumina/manta.nf b/modules/illumina/manta.nf new file mode 100644 index 00000000..3d4550dd --- /dev/null +++ b/modules/illumina/manta.nf @@ -0,0 +1,50 @@ +process MANTA { + tag "$sampleID" + + cpus = 4 + memory { normal_bam.size() < 60.GB ? 12.GB : 24.GB } + time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/manta:v1.5.0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$sampleID" + '/callers' : 'manta' }", pattern:"*.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path("*candidateSmallIndels.vcf.gz"), path("*candidateSmallIndels.vcf.gz.tbi"), emit: manta_smallindel_vcf_tbi + tuple val(sampleID), path("*diploidSV.vcf.gz"), path("*diploidSV.vcf.gz.tbi"), emit: manta_diploidsv_tbi + tuple val(sampleID), path("*somaticSV.vcf.gz"), path("*somaticSV.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('manta'), emit: manta_somaticsv_tbi + tuple val(sampleID), path("*candidateSV.vcf.gz"), path("*candidateSV.vcf.gz.tbi"), emit: manta_candidatesv_tbi + + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + # configure manta + python /usr/local/bin/configManta.py \ + --normalBam ${normal_bam} \ + --tumorBam ${tumor_bam} \ + --referenceFasta ${params.ref_fa} \ + --callRegions ${params.callRegions} \ + --runDir ${sampleID} + + # execute manta + python ${sampleID}/runWorkflow.py -j ${task.cpus} \ + --mode local \ + --memGb ${my_mem} + + mv ${sampleID}/results/variants/candidateSmallIndels.vcf.gz ${sampleID}_manta_candidateSmallIndels.vcf.gz + mv ${sampleID}/results/variants/candidateSmallIndels.vcf.gz.tbi ${sampleID}_manta_candidateSmallIndels.vcf.gz.tbi + mv ${sampleID}/results/variants/diploidSV.vcf.gz ${sampleID}_manta_diploidSV.vcf.gz + mv ${sampleID}/results/variants/diploidSV.vcf.gz.tbi ${sampleID}_manta_diploidSV.vcf.gz.tbi + mv ${sampleID}/results/variants/somaticSV.vcf.gz ${sampleID}_manta_somaticSV.vcf.gz + mv ${sampleID}/results/variants/somaticSV.vcf.gz.tbi ${sampleID}_manta_somaticSV.vcf.gz.tbi + mv ${sampleID}/results/variants/candidateSV.vcf.gz ${sampleID}_manta_candidateSV.vcf.gz + mv ${sampleID}/results/variants/candidateSV.vcf.gz.tbi ${sampleID}_manta_candidateSV.vcf.gz.tbi + """ +} \ No newline at end of file diff --git 
a/modules/illumina/strelka2.nf b/modules/illumina/strelka2.nf new file mode 100644 index 00000000..1a444e61 --- /dev/null +++ b/modules/illumina/strelka2.nf @@ -0,0 +1,48 @@ +process STRELKA2 { + tag "$sampleID" + + cpus = 4 + memory { normal_bam.size() < 60.GB ? 8.GB : 24.GB } + time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/strelka2:v2.9.3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$sampleID" + '/callers' : 'strelka' }", pattern:"*.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), path(candidateSmallIndels), path(candidateSmallIndels_tbi) + + output: + tuple val(sampleID), path("*indels.vcf.gz"), path("*indels.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('strelka2_indel'), emit: strelka_indel_vcf_tbi + tuple val(sampleID), path("*snvs.vcf.gz"), path("*snvs.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val('strelka2_sv'), emit: strelka_snv_vcf_tbi + + script: + + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + # configure strelka + python /usr/local/bin/configureStrelkaSomaticWorkflow.py \ + --normalBam ${normal_bam} \ + --tumorBam ${tumor_bam} \ + --callRegions ${params.callRegions} \ + --referenceFasta ${params.ref_fa} \ + --indelCandidates ${candidateSmallIndels} \ + --config ${params.strelka_config} \ + --runDir ${sampleID} + + # execute strelka + python ${sampleID}/runWorkflow.py \ + --mode local \ + --job ${task.cpus} \ + --memGb ${my_mem} + + mv ${sampleID}/results/variants/somatic.snvs.vcf.gz ${sampleID}_strelka_somatic.snvs.vcf.gz + mv ${sampleID}/results/variants/somatic.snvs.vcf.gz.tbi ${sampleID}_strelka_somatic.snvs.vcf.gz.tbi + mv ${sampleID}/results/variants/somatic.indels.vcf.gz ${sampleID}_strelka_somatic.indels.vcf.gz + mv ${sampleID}/results/variants/somatic.indels.vcf.gz.tbi ${sampleID}_strelka_somatic.indels.vcf.gz.tbi + + """ +} \ No newline at end of file diff --git a/modules/jaffa/jaffa.nf b/modules/jaffa/jaffa.nf new file mode 100644 index 00000000..68c81345 --- /dev/null +++ b/modules/jaffa/jaffa.nf @@ -0,0 +1,41 @@ +process JAFFA { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 10.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/jaffa:d1587c9' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/fusions': 'jaffa' }", pattern: "*_jaffa_fusions.csv", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'jaffa' }", pattern: "*_jaffa_fusions.fasta", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*_jaffa_fusions.csv"), emit: jaffa_fusions + tuple val(sampleID), path("*_jaffa_fusions.fasta"), emit: jaffa_fasta + + script: + ext = reads[0].getExtension() + + """ + + bpipe run -v \ + -n ${task.cpus} \ + -p fastqInputFormat='%_*.${ext}' \ + -p refBase=${params.jaffa_ref_dir} \ + -p genome=hg38 \ + -p annotation=genCode22 \ + /opt/JAFFA/JAFFA_direct.groovy \ + ${reads[0]} \ + ${reads[1]} + + mv jaffa_results.csv ${sampleID}_jaffa_fusions.csv + mv jaffa_results.fasta ${sampleID}_jaffa_fusions.fasta ; + + """ +} \ No newline at end of file diff --git a/modules/kallisto/kallisto_insert_size.nf b/modules/kallisto/kallisto_insert_size.nf new file mode 100644 index 00000000..a26a2af3 --- /dev/null +++ b/modules/kallisto/kallisto_insert_size.nf @@ -0,0 +1,23 @@ +process KALLISTO_INSERT_SIZE { + tag "$sampleID" + + cpus 1 + memory 1.GB + time '00:05:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + cache 'lenient' + + container 'quay.io/biocontainers/pizzly:0.37.3--h470a237_3' + + input: + tuple val(sampleID), val(kallisto_abundance) + + output: + tuple val(sampleID), path('insert_size.txt'), emit: kallisto_insert_size + + script: + """ + python ${projectDir}/bin/rna_fusion/compute_insert_size.py ${kallisto_abundance} > insert_size.txt + """ +} diff --git a/modules/kallisto/kallisto_quant.nf b/modules/kallisto/kallisto_quant.nf new file mode 100644 index 00000000..508f239f --- /dev/null +++ b/modules/kallisto/kallisto_quant.nf @@ -0,0 +1,33 @@ +process KALLISTO_QUANT { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/kallisto:0.48.0--h15996b6_2' + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*kallisto_quant.fusions.txt"), emit: kallisto_fusions + tuple val(sampleID), path("*abundance.h5"), emit: kallisto_abundance + + script: + """ + kallisto quant \ + -t $task.cpus \ + -i ${params.kallisto_index} \ + --fusion \ + -o . \ + ${reads} + mv fusion.txt ${sampleID}.kallisto_quant.fusions.txt + mv abundance.h5 ${sampleID}.abundance.h5 + """ +} +// NOTE: +// Index built with command: +// singularity run /projects/omics_share/meta/containers/quay.io-biocontainers-kallisto-0.48.0--h15996b6_2.img kallisto index -k 31 -i Homo_sapiens.GRCh38.102.cdna.all.kallisto-0.48.0.index Homo_sapiens.GRCh38.102.cdna.all.fa.gz diff --git a/modules/lumpy_sv/lumpy_sv.nf b/modules/lumpy_sv/lumpy_sv.nf new file mode 100644 index 00000000..ac03d439 --- /dev/null +++ b/modules/lumpy_sv/lumpy_sv.nf @@ -0,0 +1,25 @@ +process LUMPY_SV { + tag "$sampleID" + + cpus = 1 + memory { normal_bam.size() < 60.GB ? 8.GB : 24.GB } + time { normal_bam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/lumpy-sv:0.3.1--hdfd78af_3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/callers' : 'lumpy-sv' }", pattern:"*.vcf", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path("*_lumpy_sv.vcf"), val(meta), val(normal_name), val(tumor_name), val('lumpy'), emit: lumpy_sv_vcf + + script: + """ + lumpyexpress \ + -B ${tumor_bam},${normal_bam} \ + -o ${sampleID}_lumpy_sv.vcf + """ +} \ No newline at end of file diff --git a/modules/macs2/macs2_consensus.nf b/modules/macs2/macs2_consensus.nf new file mode 100644 index 00000000..d25c2cb5 --- /dev/null +++ b/modules/macs2/macs2_consensus.nf @@ -0,0 +1,67 @@ +/* + * Consensus peaks across samples, create boolean filtering file, SAF file for featureCounts + */ +process MACS2_CONSENSUS { + tag "${antibody}" + + cpus 8 + memory 10.GB + time '10:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'consensusCalling_'+antibody+'/macs2' : 'macs2' }", pattern: "*_peaks.*", mode: 'copy' + + container 'quay.io/biocontainers/mulled-v2-2f48cc59b03027e31ead6d383fe1b8057785dd24:5d182f583f4696f4c4d9f3be93052811b383341f-0' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), path(peaks) + + output: + tuple val(antibody), val(replicatesExist), val(multipleGroups), path('*.bed') , emit: ano + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(''), val(''), path('*.bed') , emit: bed + tuple val(antibody), path('*.saf') , emit: saf + tuple val(antibody), path("*.pdf") , emit: pdf + tuple val(antibody), path("*.antibody.txt") , emit: txt + tuple val(antibody), path("*.boolean.txt") , emit: boolean_txt + tuple val(antibody), path("*.intersect.txt"), emit: intersect_txt + + when: + params.macs_gsize && (replicatesExist || multipleGroups) && !params.skip_consensus_peaks + + script: + peak_type = params.narrow_peak ? 'narrowPeak' : 'broadPeak' + prefix = "${antibody}.consensus_peaks" + mergecols = params.narrow_peak ? (2..10).join(',') : (2..9).join(',') + collapsecols = params.narrow_peak ? (['collapse']*9).join(',') : (['collapse']*8).join(',') + expandparam = params.narrow_peak ? '--is_narrow_peak' : '' + """ + sort -T '.' 
-k1,1 -k2,2n ${peaks.collect{it.toString()}.sort().join(' ')} \\ + | mergeBed -c $mergecols -o $collapsecols > ${prefix}.txt + + ${projectDir}/bin/chipseq/macs2_merged_expand.py \\ + ${prefix}.txt \\ + ${peaks.collect{it.toString()}.sort().join(',').replaceAll("_peaks.${peak_type}","")} \\ + ${prefix}.boolean.txt \\ + --min_replicates $params.min_reps_consensus \\ + $expandparam + + awk -v FS='\t' -v OFS='\t' 'FNR > 1 { print \$1, \$2, \$3, \$4, "0", "+" }' ${prefix}.boolean.txt > ${prefix}.bed + + echo -e "GeneID\tChr\tStart\tEnd\tStrand" > ${prefix}.saf + awk -v FS='\t' -v OFS='\t' 'FNR > 1 { print \$4, \$1, \$2, \$3, "+" }' ${prefix}.boolean.txt >> ${prefix}.saf + + ${projectDir}/bin/chipseq/plot_peak_intersect.r -i ${prefix}.boolean.intersect.txt -o ${prefix}.boolean.intersect.plot.pdf + + echo "${prefix}.bed\t${antibody}/${prefix}.bed" > ${prefix}.antibody.txt + + """ + +} + +/* +IGV steps removed, re-add if IGV is needed: + + OUTPUT: tuple val(antibody), path("*.bed.igv.txt"), emit: igv_txt + + + SCRIPT: find * -type f -name "${prefix}.bed" -exec echo -e "macs2/"{}"\\t0,0,0" \\; > ${prefix}.bed.igv.txt +*/ diff --git a/modules/macs2/macs2_peak_calling.nf b/modules/macs2/macs2_peak_calling.nf index 857e5615..6d1c84d2 100644 --- a/modules/macs2/macs2_peak_calling.nf +++ b/modules/macs2/macs2_peak_calling.nf @@ -4,13 +4,14 @@ process PEAK_CALLING { cpus 2 memory 10.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/macs2:2.2.7.1--py39hbf8eff0_4' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'macs2' }", pattern: "*_peaks.narrowPeak", mode: 'copy' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'macs2' }", pattern: "*_summits.bed", mode: 'copy' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'macs2' }", pattern: "*.log", mode: 'copy' - container 'quay.io/biocontainers/macs2:2.2.7.1--py39hbf8eff0_4' - - + input: tuple val(sampleID), file(processed_bams) @@ -21,7 +22,6 @@ process PEAK_CALLING { script: - log.info "----- Performing Peak Calling on on ${sampleID} -----" String genome = params.gen_org == 'human' ? 'hs' : 'mm' """ macs2 callpeak \ diff --git a/modules/macs2/macs2_peak_calling_chipseq.nf b/modules/macs2/macs2_peak_calling_chipseq.nf new file mode 100644 index 00000000..78025734 --- /dev/null +++ b/modules/macs2/macs2_peak_calling_chipseq.nf @@ -0,0 +1,47 @@ +process PEAK_CALLING_CHIPSEQ { + tag "${ip} vs ${control}" + + cpus 2 + memory 10.GB + time '10:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
'immuno_precip_samples/'+ip+'_vs_'+control+'/macs2' : 'macs2' }", pattern: "*_peaks.*", mode: 'copy' + + container 'quay.io/biocontainers/macs2:2.2.7.1--py39hbf8eff0_4' + + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), file(ipbam), val(control), file(controlbam), file(ipflagstat) + file(peak_count_header) + file(frip_score_header) + + + output: + tuple val(antibody), val(replicatesExist), val(multipleGroups), file("*.{narrowPeak,broadPeak}"), emit: arm_peak + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), file("*.{narrowPeak,broadPeak}"), emit: ip_control_peak + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), val(control), emit: ip_control + tuple val(ip), file("*.{narrowPeak,broadPeak}"), emit: peak + tuple val(ip), file("*_peaks.gappedPeak"), emit: gapped, optional: true + tuple val(ip), file("*_peaks.xls"), emit: xls + + + script: + broad = params.narrow_peak ? '' : "--broad --broad-cutoff ${params.broad_cutoff}" + format = params.read_type == 'SE' ? 'BAM' : 'BAMPE' + pileup = params.save_macs_pileup ? '-B --SPMR' : '' + fdr = params.macs_fdr ? "--qvalue ${params.macs_fdr}" : '' + pvalue = params.macs_pvalue ? "--pvalue ${params.macs_pvalue}" : '' + """ + macs2 callpeak \\ + -t ${ipbam[0]} \\ + -c ${controlbam[0]} \\ + $broad \\ + -f $format \\ + -g $params.macs_gsize \\ + -n $ip \\ + $pileup \\ + $fdr \\ + $pvalue \\ + --keep-dup all + """ +} diff --git a/modules/macs2/macs2_peak_coverage.nf b/modules/macs2/macs2_peak_coverage.nf index 8d632fbd..8bb39381 100644 --- a/modules/macs2/macs2_peak_coverage.nf +++ b/modules/macs2/macs2_peak_coverage.nf @@ -4,6 +4,7 @@ process PEAK_COVERAGE { cpus = 1 memory 1.GB time '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/macs2:2.2.7.1--py39hbf8eff0_4' @@ -14,7 +15,6 @@ process PEAK_COVERAGE { tuple val(sampleID), file("*_peaks.narrowPeak.saf") shell: - log.info "----- Get coverage in each peak on ${sampleID} -----" ''' awk 'OFS="\\t" {print $1"."$2"."$3, $1, $2, $3, "."}' !{narrow_peaks} \ > !{sampleID}_peaks.narrowPeak.saf diff --git a/modules/macs2/plot_macs2_qc.nf b/modules/macs2/plot_macs2_qc.nf new file mode 100644 index 00000000..35f952ef --- /dev/null +++ b/modules/macs2/plot_macs2_qc.nf @@ -0,0 +1,30 @@ +process PLOT_MACS2_QC { + + cpus 2 + memory 10.GB + time '10:00:00' + + container 'quay.io/biocontainers/mulled-v2-ad9dd5f398966bf899ae05f8e7c54d0fb10cdfa7:05678da05b8e5a7a5130e90a9f9a6c585b965afa-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'immuno_precip_samples/cross_sample_plots' : 'macs2' }", mode: 'copy' + + input: + file(peaks) + + when: + params.macs_gsize && !params.skip_peak_annotation && !params.skip_peak_qc + + output: + path '*.txt' , emit: txt + path '*.pdf' , emit: pdf + + script: // This script was bundled withing the nf-core/chipseq/bin/ directory + def peak_type = params.narrow_peak ? 
'narrowPeak' : 'broadPeak' + """ + ${projectDir}/bin/chipseq/plot_macs_qc.r \\ + -i ${peaks.join(',')} \\ + -s ${peaks.join(',').replaceAll("_peaks.${peak_type}","")} \\ + -o ./ \\ + -p macs_peak + """ +} diff --git a/modules/msisensor2/msisensor2.nf b/modules/msisensor2/msisensor2.nf new file mode 100644 index 00000000..f1925912 --- /dev/null +++ b/modules/msisensor2/msisensor2.nf @@ -0,0 +1,31 @@ +process MSISENSOR2_MSI { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/msisensor2:0.1--hd03093a_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/msi' : 'msisensor2' }", pattern:"*msisensor", mode:'copy' + + input: + tuple val(sampleID), val(meta), file(bam), file(bai), val(seqID) + + output: + tuple val(sampleID), file("*msisensor"), emit: msisensor + file("${sampleID}_msisensor_dis") + file("${sampleID}_msisensor_somatic") + + script: + + """ + mkdir models + + cp -r ${params.msisensor_model} models + + msisensor2 msi -M models/models_hg38 -t ${bam} -o ${sampleID}_msisensor + + """ +} diff --git a/modules/msisensor2/msisensor2_tumorOnly.nf b/modules/msisensor2/msisensor2_tumorOnly.nf new file mode 100644 index 00000000..ec00a7f2 --- /dev/null +++ b/modules/msisensor2/msisensor2_tumorOnly.nf @@ -0,0 +1,31 @@ +process MSISENSOR2_MSI { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/msisensor2:0.1--hd03093a_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/msi' : 'msisensor2' }", pattern:"*msisensor", mode:'copy' + + input: + tuple val(sampleID), file(bam), file(bai) + + output: + tuple val(sampleID), file("*msisensor"), emit: msisensor + file("${sampleID}_msisensor_dis") + file("${sampleID}_msisensor_somatic") + + script: + + """ + mkdir models + + cp -r ${params.msisensor_model} models + + msisensor2 msi -M models/models_hg38 -t ${bam} -o ${sampleID}_msisensor + + """ +} diff --git a/modules/multiqc/multiqc.nf b/modules/multiqc/multiqc.nf index afff31ee..dadee379 100644 --- a/modules/multiqc/multiqc.nf +++ b/modules/multiqc/multiqc.nf @@ -1,7 +1,9 @@ process MULTIQC { + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - container 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' - + container 'quay.io/jaxcompsci/multiqc:v1.15.dev0' + //quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0 + publishDir "${params.pubdir}/multiqc", pattern: "*multiqc_report.html", mode:'copy' publishDir "${params.pubdir}/multiqc", pattern: "*_data", mode:'copy' @@ -14,9 +16,9 @@ process MULTIQC { path "*_plots" , optional:true, emit: plots script: - + def custom_config = params.multiqc_config ? " --config $params.multiqc_config " : '' """ - multiqc . + multiqc . ${custom_config} """ -} +} \ No newline at end of file diff --git a/modules/multiqc/multiqc_custom_phantompeakqualtools.nf b/modules/multiqc/multiqc_custom_phantompeakqualtools.nf new file mode 100644 index 00000000..8b1a72fa --- /dev/null +++ b/modules/multiqc/multiqc_custom_phantompeakqualtools.nf @@ -0,0 +1,25 @@ +process MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS { + tag "$sampleID" + + container 'quay.io/biocontainers/r-base:3.5.1' + + input: + tuple val(sampleID), file(spp), file(rdata) + file(nsc_header) + file(rsc_header) + file(correlation_header) + + output: + tuple val(sampleID), file("*.spp_nsc_mqc.tsv") , emit: nsc + tuple val(sampleID), file("*.spp_rsc_mqc.tsv") , emit: rsc + tuple val(sampleID), file("*.spp_correlation_mqc.tsv"), emit: correlation + + script: + """ + cp $correlation_header ${sampleID}.spp_correlation_mqc.tsv + Rscript --max-ppsize=500000 -e "load('$rdata'); write.table(crosscorr\\\$cross.correlation, file=\\"${sampleID}.spp_correlation_mqc.tsv\\", sep=",", quote=FALSE, row.names=FALSE, col.names=FALSE,append=TRUE)" + + awk -v OFS='\t' '{print "${sampleID}", \$9}' $spp | cat $nsc_header - > ${sampleID}.spp_nsc_mqc.tsv + awk -v OFS='\t' '{print "${sampleID}", \$10}' $spp | cat $rsc_header - > ${sampleID}.spp_rsc_mqc.tsv + """ +} diff --git a/modules/novocraft/novosort.nf b/modules/novocraft/novosort.nf new file mode 100644 index 00000000..981db78f --- /dev/null +++ b/modules/novocraft/novosort.nf @@ -0,0 +1,23 @@ +process NOVOSORT_markDuplicates { + tag "$sampleID" + + cpus = 1 + memory = 8.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/novosort:lastest' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'novosort' }", pattern:"*_fixed_mate_dup_marked.bam", mode:'copy' + + input: + tuple val(sampleID), file(fixed_mate_bam) + + output: + tuple val(sampleID), file("*_fixed_mate_dup_marked.bam"), emit: fixed_mate_dup_marked_bam + + script: + + """ + novosort -markduplicates -t . -m 8G \ + ${fixed_mate_bam} > ${sampleID}_fixed_mate_dup_marked.bam + """ diff --git a/modules/nygc-short-alignment-marking/short_alignment_marking.nf b/modules/nygc-short-alignment-marking/short_alignment_marking.nf new file mode 100644 index 00000000..3701e6e0 --- /dev/null +++ b/modules/nygc-short-alignment-marking/short_alignment_marking.nf @@ -0,0 +1,36 @@ +process SHORT_ALIGNMENT_MARKING { + tag "$sampleID" + + cpus 1 + memory 24.GB + time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'short_alignment_marking' }", pattern:"*.marked.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(aligned_bam) + + output: + tuple val(sampleID), file("*.marked.bam"), emit: marked_bam + + script: + // parses the bam file and marks as unmapped a read with alignment length below a user-defined threshold. Reads are not filtered from the bam file but kept as unmapped. + """ + ${projectDir}/bin/pta/filter_bam -I ${aligned_bam} -A1 30 -A2 30 -o ${sampleID}.marked.bam | samtools view -b -o ${sampleID}.marked.bam + """ +} + +/* +-A1, --ALN_LEN_PRIM ALN_LEN_PRIM + Primary (loose) alignment length +-A2, --ALN_LEN_SECOND ALN_LEN_SECOND + Supplementary (strict) alignment length +-o, --OUT_PREFIX OUT_PREFIX + Output file prefix + +NOTE: -o does not actually produce an output file. +NOTE: The BAM file produced here, is corrupt. It requires sorting and cleaning (non mapped reads have non 0 MAPQ) and mate information to be fixed. +*/ \ No newline at end of file diff --git a/modules/nygenome/lancet.nf b/modules/nygenome/lancet.nf new file mode 100644 index 00000000..f59b63c2 --- /dev/null +++ b/modules/nygenome/lancet.nf @@ -0,0 +1,33 @@ +process LANCET { + tag "$sampleID" + + cpus = 4 + memory = 15.GB + time = '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/lancet:v1.1.0' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$sampleID" : 'lancet' }", pattern:"*.vcf", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), path(bed), val(index) + + output: + tuple val(sampleID), path("*_lancet.vcf"), val(meta), val(normal_name), val(tumor_name), val('lancet'), emit: vcf + + script: + """ + lancet \ + --tumor ${tumor_bam} \ + --normal ${normal_bam} \ + --ref ${params.ref_fa} \ + --bed ${bed} \ + --min-k 11 \ + --low-cov 1 \ + --min-phred-fisher 5 \ + --min-strand-bias 1 \ + --min-alt-count-tumor 3 \ + --min-vaf-tumor 0.04 \ + --num-threads ${task.cpus} > ${sampleID}_${index}_lancet.vcf + """ +} \ No newline at end of file diff --git a/modules/nygenome/lancet_confirm.nf b/modules/nygenome/lancet_confirm.nf new file mode 100644 index 00000000..e84f7378 --- /dev/null +++ b/modules/nygenome/lancet_confirm.nf @@ -0,0 +1,36 @@ +process LANCET_CONFIRM { + tag "$sampleID" + + cpus = 8 + memory = 15.GB + time = '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/lancet:v1.1.0' + // publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'lancet' }", pattern:".vcf", mode:'copy' + + input: + tuple val(sampleID), path(bed), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name), val(chrom) + + output: + tuple val(sampleID), path("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(chrom), emit: vcf + + script: + """ + lancet \ + --tumor ${tumor_bam} \ + --normal ${normal_bam} \ + --bed ${bed} \ + --ref ${params.ref_fa} \ + --min-k 11 \ + --low-cov 1 \ + --min-phred-fisher 5 \ + --min-strand-bias 1 \ + --min-alt-count-tumor 3 \ + --min-vaf-tumor 0.04 \ + --padding 250 \ + --window-size 2000 \ + --num-threads ${task.cpus} \ + > ${sampleID}_lancet_merged_${chrom}.vcf + """ +} diff --git a/modules/phantompeakqualtools/phantompeakqualtools.nf b/modules/phantompeakqualtools/phantompeakqualtools.nf new file mode 100644 index 00000000..92fb4277 --- /dev/null +++ b/modules/phantompeakqualtools/phantompeakqualtools.nf @@ -0,0 +1,22 @@ +process PHANTOMPEAKQUALTOOLS { + tag "$sampleID" + cpus 8 + memory 10.GB + time '04:00:00' + + container 'quay.io/biocontainers/phantompeakqualtools:1.2.2--0' + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.out") , emit: spp + tuple val(sampleID), file("*.pdf") , emit: pdf + tuple val(sampleID), file("*.Rdata"), emit: rdata + + script: + """ + RUN_SPP=`which run_spp.R` + Rscript -e "library(caTools); source(\\"\$RUN_SPP\\")" -c="$bam" -savp="${sampleID}.spp.pdf" -savd="${sampleID}.spp.Rdata" -out="${sampleID}.spp.out" -p=$task.cpus + """ +} diff --git a/modules/picard/picard_addorreplacereadgroups.nf b/modules/picard/picard_addorreplacereadgroups.nf index 20429376..e7c2cfa8 100644 --- a/modules/picard/picard_addorreplacereadgroups.nf +++ b/modules/picard/picard_addorreplacereadgroups.nf @@ -4,6 +4,7 @@ process PICARD_ADDORREPLACEREADGROUPS { cpus 1 memory 8.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'picard' }", pattern: "*.bam", mode:'copy', enabled: params.keep_intermediate @@ -16,7 +17,6 @@ process PICARD_ADDORREPLACEREADGROUPS { tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- Picard Add or Replace Read Groups Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] diff --git a/modules/picard/picard_cleansam.nf b/modules/picard/picard_cleansam.nf new file mode 100644 index 00000000..0ebd610a --- /dev/null +++ b/modules/picard/picard_cleansam.nf @@ -0,0 +1,30 @@ +process PICARD_CLEANSAM { + tag "$sampleID" + + cpus = 1 + memory { bam.size() < 60.GB ? 8.GB : 24.GB } + time { bam.size() < 60.GB ? '06:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'picard' }", pattern: "*_cleaned.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*_cleaned.bam"), emit: cleaned_bam + + script: + + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + picard -Xmx${my_mem}G CleanSam \ + I=${bam} \ + TMP_DIR=${workDir}/temp \ + O=${sampleID}_cleaned.bam + """ +} diff --git a/modules/picard/picard_collectalignmentsummarymetrics.nf b/modules/picard/picard_collectalignmentsummarymetrics.nf index 086474c0..f9be1a83 100644 --- a/modules/picard/picard_collectalignmentsummarymetrics.nf +++ b/modules/picard/picard_collectalignmentsummarymetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTALIGNMENTSUMMARYMETRICS{ cpus = 1 memory = 5.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' @@ -16,7 +17,6 @@ process PICARD_COLLECTALIGNMENTSUMMARYMETRICS{ tuple val(sampleID), file("*.txt"), emit: txt script: - log.info "----- Collect Alignment Sumary Metrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -28,4 +28,4 @@ process PICARD_COLLECTALIGNMENTSUMMARYMETRICS{ --METRIC_ACCUMULATION_LEVEL ALL_READS \ --VALIDATION_STRINGENCY LENIENT """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_collecthsmetrics.nf b/modules/picard/picard_collecthsmetrics.nf index 7582efdf..f40e8460 100644 --- a/modules/picard/picard_collecthsmetrics.nf +++ b/modules/picard/picard_collecthsmetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTHSMETRICS { cpus = 1 memory = 6.GB time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -16,7 +17,6 @@ process PICARD_COLLECTHSMETRICS { tuple val(sampleID), file("*Metrics.txt"), emit: hsmetrics script: - log.info "----- Picard CollectHsMetrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -29,4 +29,4 @@ process PICARD_COLLECTHSMETRICS { REFERENCE_SEQUENCE=${params.ref_fa} \ VALIDATION_STRINGENCY=SILENT """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_collectmultiplemetrics.nf b/modules/picard/picard_collectmultiplemetrics.nf new file mode 100644 index 00000000..692ad39f --- /dev/null +++ b/modules/picard/picard_collectmultiplemetrics.nf @@ -0,0 +1,37 @@ +process PICARD_COLLECTMULTIPLEMETRICS { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/stats' : 'picard'}" + }, pattern: "*.CollectMultipleMetrics.*", mode: 'copy' + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*_metrics"), emit : metrics + tuple val(sampleID), file("*.pdf"), emit : pdf + + + script: + prefix = "${sampleID}.mLb.clN" + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + picard -Xmx${my_mem}G CollectMultipleMetrics \ + INPUT=${bam[0]} \ + OUTPUT=${prefix}.CollectMultipleMetrics \ + REFERENCE_SEQUENCE=${params.ref_fa} \ + VALIDATION_STRINGENCY=LENIENT + TMP_DIR=${params.tmpdir} + """ +} diff --git a/modules/picard/picard_collectrnaseqmetrics.nf b/modules/picard/picard_collectrnaseqmetrics.nf index bd125891..5c71b5ee 100644 --- a/modules/picard/picard_collectrnaseqmetrics.nf +++ b/modules/picard/picard_collectrnaseqmetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTRNASEQMETRICS { cpus 1 memory 8.GB time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -11,23 +12,25 @@ process PICARD_COLLECTRNASEQMETRICS { publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'picard' }", pattern: "*.pdf", mode:'copy' input: - tuple val(sampleID), file(bam) + tuple val(sampleID), file(bam), val(strand_setting) + val(ref_flat) + val(ribo_intervals) + output: tuple val(sampleID), file("*metrics.txt"), emit: picard_metrics script: - log.info "----- Collect RNA Sequence Metrics on: ${sampleID} -----" - if (params.read_prep == "reverse_stranded") { + if (strand_setting == "reverse_stranded") { strand_setting = "SECOND_READ_TRANSCRIPTION_STRAND" } - if (params.read_prep == "forward_stranded") { + if (strand_setting == "forward_stranded") { strand_setting = "FIRST_READ_TRANSCRIPTION_STRAND" } - if (params.read_prep == "non_stranded") { + if (strand_setting == "non_stranded") { strand_setting = "NONE" } @@ -35,9 +38,9 @@ process PICARD_COLLECTRNASEQMETRICS { picard CollectRnaSeqMetrics \ I=${bam} \ O=${sampleID}_picard_aln_metrics.txt \ - REF_FLAT=${params.ref_flat} \ - RIBOSOMAL_INTERVALS=${params.ribo_intervals} \ + REF_FLAT=${ref_flat} \ + RIBOSOMAL_INTERVALS=${ribo_intervals} \ STRAND=${strand_setting} \ CHART_OUTPUT=${sampleID}_coverage_vs_transcript_plot.pdf """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_collecttargetpcrmetrics.nf b/modules/picard/picard_collecttargetpcrmetrics.nf new file mode 100644 index 00000000..7a150eef --- /dev/null +++ b/modules/picard/picard_collecttargetpcrmetrics.nf @@ -0,0 +1,35 @@ +process PICARD_COLLECTTARGETPCRMETRICS { + tag "$sampleID" + + cpus = 1 + memory = 5.GB + time = '08:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'broadinstitute/gatk:4.2.4.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'picard' }", pattern: "*.txt", mode:'copy' + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.txt"), emit: txt + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + gatk --java-options "-Xmx${my_mem}G" CollectTargetedPcrMetrics \ + --INPUT ${bam} \ + --OUTPUT ${sampleID}_CollectTargetedPcrMetrics.txt \ + --REFERENCE_SEQUENCE ${params.ref_fa} \ + --AMPLICON_INTERVALS ${params.amplicon_primer_intervals} \ + --TARGET_INTERVALS ${params.amplicon_target_intervals} \ + --COVERAGE_CAP 1500 \ + --NEAR_DISTANCE 50 \ + --VALIDATION_STRINGENCY LENIENT + """ + +} diff --git a/modules/picard/picard_collectwgsmetrics.nf b/modules/picard/picard_collectwgsmetrics.nf index cbd1dc75..b380a5a1 100644 --- a/modules/picard/picard_collectwgsmetrics.nf +++ b/modules/picard/picard_collectwgsmetrics.nf @@ -4,6 +4,7 @@ process PICARD_COLLECTWGSMETRICS { cpus = 1 memory = 5.GB time = '08:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'broadinstitute/gatk:4.2.4.1' @@ -16,7 +17,6 @@ process PICARD_COLLECTWGSMETRICS { tuple val(sampleID), file("*.txt"), emit: txt script: - log.info "----- Collect Alignment Sumary Metrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -27,5 +27,4 @@ process PICARD_COLLECTWGSMETRICS { --REFERENCE_SEQUENCE ${params.ref_fa} \ --VALIDATION_STRINGENCY LENIENT """ - -} \ No newline at end of file +} diff --git a/modules/picard/picard_fix_mate_information.nf b/modules/picard/picard_fix_mate_information.nf new file mode 100644 index 00000000..d3443c47 --- /dev/null +++ b/modules/picard/picard_fix_mate_information.nf @@ -0,0 +1,31 @@ +process PICARD_FIX_MATE_INFORMATION { + tag "$sampleID" + + cpus = 1 + memory { bam.size() < 30.GB ? 6.GB : 48.GB } + time { bam.size() < 30.GB ? '03:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'picard' }", pattern: "*fixed_mate.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*_fixed_mate.bam"), emit: fixed_mate_bam + + script: + + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + """ + picard -Xmx${my_mem}G FixMateInformation \ + I=${bam} \ + O=${sampleID}_fixed_mate.bam \ + TMP_DIR=${workDir}/temp \ + ADD_MATE_CIGAR=true + """ +} diff --git a/modules/picard/picard_markduplicates.nf b/modules/picard/picard_markduplicates.nf index 23cf338b..df5ee7fa 100644 --- a/modules/picard/picard_markduplicates.nf +++ b/modules/picard/picard_markduplicates.nf @@ -2,14 +2,23 @@ process PICARD_MARKDUPLICATES { tag "$sampleID" cpus 1 - memory 16.GB - time '12:00:00' + memory { bam.size() < 60.GB ? 16.GB : 32.GB } + time { bam.size() < 60.GB ? '12:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' - // save if mouse and wes or save if keep intermediate - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'picard' }", pattern: "*.bam", mode:'copy', enabled: params.gen_org=='mouse' ? true : params.keep_intermediate - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'picard' }", pattern: "*.txt", mode:'copy' + // save if mouse or save if keep intermediate + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/bam' : 'picard'}" + }, pattern: "*.{bam,bai}", mode: 'copy', enabled: params.gen_org=='mouse' || params.workflow=='chipseq' ? true : params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/stats' : 'picard'}" + }, pattern: "*.txt", mode: 'copy' + input: tuple val(sampleID), file(bam) @@ -20,30 +29,17 @@ process PICARD_MARKDUPLICATES { tuple val(sampleID), file("*.txt"), emit: dedup_metrics script: - log.info "----- Picard SortSam Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] - if (params.workflow != "atac") """ picard -Xmx${my_mem}G MarkDuplicates \ - I=${bam} \ + I=${bam[0]} \ O=${sampleID}_dedup.bam \ M=${sampleID}_dup_metrics.txt \ - REMOVE_DUPLICATES=true \ - CREATE_INDEX=true \ - VALIDATION_STRINGENCY=SILENT - """ - else - """ - picard -Xmx${my_mem}G MarkDuplicates \ - I=${bam[0]} \ - O=${sampleID}.sorted.marked4_dedup.bam \ - M=${sampleID}.sorted.metrics.txt \ REMOVE_DUPLICATES=false \ CREATE_INDEX=true \ - VALIDATION_STRINGENCY=LENIENT \ - TMP_DIR=${params.tmpdir} \ - > ${sampleID}.picard.log 2>&1 + TMP_DIR=${workDir}/temp \ + VALIDATION_STRINGENCY=SILENT """ } diff --git a/modules/picard/picard_mergesamfiles.nf b/modules/picard/picard_mergesamfiles.nf new file mode 100644 index 00000000..785e5b3b --- /dev/null +++ b/modules/picard/picard_mergesamfiles.nf @@ -0,0 +1,43 @@ +process PICARD_MERGESAMFILES { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + + container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/bam' : 'picard'}" + }, pattern: "*.bam", mode: 'copy', enabled: params.keep_intermediate + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + String my_mem = (task.memory-1.GB).toString() + my_mem = my_mem[0..-4] + + prefix = "${sampleID}.mLb.mkD" + bam_files = bam.findAll { it.toString().endsWith('.bam') }.sort() + if (bam_files.size() > 1) { + """ + picard -Xmx${my_mem}G MergeSamFiles \ + ${'INPUT='+bam_files.join(' INPUT=')} \ + OUTPUT=${sampleID}.sorted.bam \ + SORT_ORDER=coordinate \ + VALIDATION_STRINGENCY=LENIENT \ + TMP_DIR=tmp + """ + }else { + """ + ln -s ${bam_files[0]} ${prefix}.bam + """ + } + +} diff --git a/modules/picard/picard_reordersam.nf b/modules/picard/picard_reordersam.nf index 0d1873b2..38334949 100644 --- a/modules/picard/picard_reordersam.nf +++ b/modules/picard/picard_reordersam.nf @@ -4,6 +4,7 @@ process PICARD_REORDERSAM { cpus 1 memory 8.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -11,13 +12,13 @@ process PICARD_REORDERSAM { input: tuple val(sampleID), file(bam) + val(picard_dict) output: tuple val(sampleID), file("*.bam"), emit: bam tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- Picard Alignment Metrics Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -25,7 +26,7 @@ process PICARD_REORDERSAM { picard -Xmx${my_mem}G ReorderSam \ INPUT=${bam} \ OUTPUT=${sampleID}_genome_bam_with_read_group_reorder.bam \ - SEQUENCE_DICTIONARY=${params.picard_dict} \ + SEQUENCE_DICTIONARY=${picard_dict} \ CREATE_INDEX=true """ -} \ No newline at end of file +} diff --git a/modules/picard/picard_sortsam.nf b/modules/picard/picard_sortsam.nf index bc2ad583..07b9a894 100644 --- a/modules/picard/picard_sortsam.nf +++ b/modules/picard/picard_sortsam.nf @@ -2,8 +2,9 @@ process PICARD_SORTSAM { tag "$sampleID" cpus 1 - memory 8.GB - time '06:00:00' + memory { sam.size() < 60.GB ? 6.GB : 24.GB } + time { sam.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/picard:2.26.10--hdfd78af_0' @@ -17,16 +18,16 @@ process PICARD_SORTSAM { tuple val(sampleID), file("*_sortsam.bai"), emit: bai script: - log.info "----- Picard SortSam Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] """ - picard -Xmx${my_mem}G SortSam \ + picard -Xmx${my_mem}G -Djava.io.tmpdir=`pwd`/tmp SortSam \ SO=coordinate \ INPUT=${sam} \ OUTPUT=${sampleID}_sortsam.bam \ + TMP_DIR=`pwd`/tmp \ VALIDATION_STRINGENCY=SILENT \ CREATE_INDEX=true """ -} \ No newline at end of file +} diff --git a/modules/pizzly/pizzly.nf b/modules/pizzly/pizzly.nf new file mode 100644 index 00000000..9cd40b53 --- /dev/null +++ b/modules/pizzly/pizzly.nf @@ -0,0 +1,38 @@ +process PIZZLY { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/pizzly:0.37.3--h470a237_3' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID + '/fusions': 'pizzly' }", pattern: "*_pizzly_fusions.txt", mode:'copy' + + input: + tuple val(sampleID), path(kallisto_fusions), path(kallisto_insert_size) + path(gtf) + + output: + tuple val(sampleID), path("*_pizzly_fusions.txt"), emit: pizzly_fusions + + script: + """ + + insert_size="\$(cat ${kallisto_insert_size})" + + pizzly \ + -k 31 \ + --align-score 2 \ + --insert-size "\${insert_size}" \ + --cache index.cache.txt \ + --gtf ${gtf} \ + --fasta ${params.transcript_fasta} \ + --output ${sampleID}.pizzly ${kallisto_fusions} + + pizzly_flatten_json.py ${sampleID}.pizzly.json ${sampleID}_pizzly_fusions.txt + + """ +} diff --git a/modules/preseq/preseq.nf b/modules/preseq/preseq.nf new file mode 100644 index 00000000..950f5b84 --- /dev/null +++ b/modules/preseq/preseq.nf @@ -0,0 +1,35 @@ +process PRESEQ { + tag "$sampleID" + + cpus 4 + memory 20.GB + time '20:00:00' + errorStrategy 'ignore' + + + container 'quay.io/biocontainers/preseq:3.1.2--h445547b_2' + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), path("*.ccurve.txt"), emit: txt + tuple val(sampleID), path("*.log") , emit: log + + when: + !params.skip_preseq + + script: + pe = params.read_type == 'SE' ? '' : '-pe' + """ + preseq lc_extrap \\ + -output ${sampleID}.ccurve.txt \\ + -verbose \\ + -bam \\ + $pe \\ + -seed 1 \\ + $bam + cp .command.err ${sampleID}.command.log + """ +} diff --git a/modules/primerclip/primerclip.nf b/modules/primerclip/primerclip.nf new file mode 100644 index 00000000..c21f2f50 --- /dev/null +++ b/modules/primerclip/primerclip.nf @@ -0,0 +1,26 @@ +process PRIMERCLIP { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/primerclip:0.3.8--h9ee0642_1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'primerclip' }", pattern:"*.sam", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/stats' : 'primerclip' }", pattern:"*primerclip_runstats.log", mode:'copy' + + input: + tuple val(sampleID), file(sam) + + output: + tuple val(sampleID), file("*.sam"), emit: sam + tuple val(sampleID), file("*primerclip_runstats.log"), emit: log + + script: + + """ + primerclip ${params.masterfile} ${sam} ${sam.baseName}_primerclip.sam + """ +} diff --git a/modules/python/python_add_final_allele_counts.nf b/modules/python/python_add_final_allele_counts.nf new file mode 100644 index 00000000..265f6799 --- /dev/null +++ b/modules/python/python_add_final_allele_counts.nf @@ -0,0 +1,24 @@ +process ADD_FINAL_ALLELE_COUNTS { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_final_allele_counts_to_vcf.py \ + -v ${vcf} \ + -o ${sampleID}_final_${chrom}.vcf \ + """ +} diff --git a/modules/python/python_add_nygc_allele_counts.nf b/modules/python/python_add_nygc_allele_counts.nf new file mode 100644 index 00000000..5786ef0e --- /dev/null +++ b/modules/python/python_add_nygc_allele_counts.nf @@ -0,0 +1,28 @@ +process ADD_NYGC_ALLELE_COUNTS { + tag "$sampleID" + + cpus 1 + memory 120.GB + time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), path(normal_bam), path(normal_bai), path(tumor_bam), path(tumor_bai), val(chrom) + + output: + tuple val(sampleID), path("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/add_nygc_allele_counts_to_vcf.py \ + -t ${tumor_bam} \ + -n ${normal_bam} \ + -v ${vcf} \ + -b 10 \ + -m 10 \ + -o ${sampleID}_pre_count_${chrom}.vcf + """ +} diff --git a/modules/python/python_check_strandedness.nf b/modules/python/python_check_strandedness.nf new file mode 100644 index 00000000..7215577a --- /dev/null +++ b/modules/python/python_check_strandedness.nf @@ -0,0 +1,41 @@ + +process CHECK_STRANDEDNESS { + tag "$sampleID" + + cpus 1 + memory 10.GB + time '1:00:00' + errorStrategy 'finish' + + container 'quay.io/jaxcompsci/how-are-we-stranded-here:v1.0.1-e6ce74d' + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), env(STRAND), emit: strand_setting + + script: + paired = params.read_type == 'PE' ? "-r2 ${reads[1]}" : '' + + """ + check_strandedness -g ${params.strandedness_gtf} -k ${params.strandedness_ref} -r1 ${reads[0]} ${paired} > ${sampleID}_strandedness.txt 2>&1 + + data_type=`grep "Data is likely" ${sampleID}_strandedness.txt` + + if [[ \$data_type == *RF* ]] ; then + STRAND='reverse_stranded' + elif [[ \$data_type == *FR* ]] ; then + STRAND='forward_stranded' + elif [[ \$data_type == *unstranded* ]] ; then + STRAND='non_stranded' + else + echo "RNA Seq data does not fall into a likely stranded (max percent explained > 0.9) or unstranded layout (max percent explained < 0.6). Please check your data for low quality and contaminating reads before proceeding."; exit 1; + fi + + """ +} + +// Data is likely RF/fr-firststrand +// Data is likely FR/fr-secondstrand +// Data is likely unstranded diff --git a/modules/python/python_filter_pon.nf b/modules/python/python_filter_pon.nf new file mode 100644 index 00000000..0d027af6 --- /dev/null +++ b/modules/python/python_filter_pon.nf @@ -0,0 +1,26 @@ +process FILTER_PON { + tag "$sampleID" + + cpus 1 + memory 15.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/filter_pon.py \ + --bed ${params.pon_bed} \ + --chrom ${chrom} \ + --vcf ${vcf} \ + --out ${sampleID}_pon_final_${chrom}.vcf + """ +} diff --git a/modules/python/python_filter_vcf.nf b/modules/python/python_filter_vcf.nf new file mode 100644 index 00000000..d9955997 --- /dev/null +++ b/modules/python/python_filter_vcf.nf @@ -0,0 +1,28 @@ + +process FILTER_VCF { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/vcf_filter.py \ + ${params.germline_filtering_vcf} \ + ${vcf} \ + ${sampleID}_final_filtered_${chrom}.vcf + """ +} +// NOTE: There are two similarly named scripts: vcf_filter.py and filter_vcf.py. +// The above script is used here. filter_vcf.py is used in gridss_chrom_filter.nf diff --git a/modules/python/python_germline_vcf_finalization.nf b/modules/python/python_germline_vcf_finalization.nf new file mode 100644 index 00000000..cbd9940d --- /dev/null +++ b/modules/python/python_germline_vcf_finalization.nf @@ -0,0 +1,36 @@ +process GERMLINE_VCF_FINALIZATION { + tag "$sampleID" + + cpus 1 + memory 5.GB + time 1.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vcf' }", pattern: "*final.vcf", mode:'copy' + + input: + tuple val(sampleID), file(vcf) + val(filtered) + + output: + tuple val(sampleID), file("*final.vcf"), emit: vcf + + script: + + output_suffix = filtered == 'filtered' ? 'filtered' : 'unfiltered' + + """ + python \ + ${projectDir}/bin/pta/annotate_id.py \ + ${vcf} \ + ${sampleID}_germline_vep_cosmic_cancerResitMut_annotated_id.vcf + + python \ + ${projectDir}/bin/pta/rename_csq_vcf.py \ + ${sampleID}_germline_vep_cosmic_cancerResitMut_annotated_id.vcf \ + ${sampleID}_germline_snv_indel_annotated_${output_suffix}_final.vcf + + """ +} diff --git a/modules/python/python_get_candidates.nf b/modules/python/python_get_candidates.nf new file mode 100644 index 00000000..8f113392 --- /dev/null +++ b/modules/python/python_get_candidates.nf @@ -0,0 +1,24 @@ +process GET_CANDIDATES { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? 
{log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/get_candidates.py \ + ${vcf} \ + ${sampleID}_candidate_merged_${chrom}.vcf + """ +} diff --git a/modules/python/python_log_parser.nf b/modules/python/python_log_parser.nf index 637b9fde..f8564da1 100644 --- a/modules/python/python_log_parser.nf +++ b/modules/python/python_log_parser.nf @@ -4,6 +4,7 @@ process LOG_PARSER { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'logparser' }", pattern: "*.summary_QC_metrics.txt", mode: 'copy' @@ -16,7 +17,6 @@ process LOG_PARSER { tuple val(sampleID), file("*.summary_QC_metrics.txt") script: - log.info "----- LogParser on ${sampleID} -----" """ python ${projectDir}/bin/atac/LogParser.py > ${sampleID}.summary_QC_metrics.txt """ diff --git a/modules/python/python_merge_columns.nf b/modules/python/python_merge_columns.nf new file mode 100644 index 00000000..ca955b6d --- /dev/null +++ b/modules/python/python_merge_columns.nf @@ -0,0 +1,34 @@ +process MERGE_COLUMNS { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), file(tbi), val(meta), val('empty_name'), val('empty_name'), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: mergeColumn_vcf + + script: + + normal = meta.normal_id + tumor = meta.tumor_id + + """ + python \ + ${projectDir}/bin/pta/merge_columns.py \ + ${vcf} \ + ${sampleID}_single_column_${chrom}.vcf \ + ${normal} \ + ${tumor} + """ +} + /* + NOTE: This script will take 'tumor' and 'normal' names and match string based on a simple split on '_'. + Sample names are currently _ or _. This script merged based on the index[1] of split('_'). + */ diff --git a/modules/python/python_merge_prep.nf b/modules/python/python_merge_prep.nf new file mode 100644 index 00000000..6e437422 --- /dev/null +++ b/modules/python/python_merge_prep.nf @@ -0,0 +1,43 @@ +process MERGE_PREP { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), path(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), path("*_mergePrep.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: merge_prep_vcf + + script: + String support_call = tool == 'manta' || tool == 'lancet_support' ? '--support' : '' + String tool_name = tool == 'lancet_support' ? 'lancet' : tool + + """ + python \ + ${projectDir}/bin/pta/reorder_vcf.py \ + ${vcf} \ + ${vcf.baseName}_ordered.vcf \ + ${normal_name} ${tumor_name} + + python \ + ${projectDir}/bin/pta/merge_prep.py \ + --vcf ${vcf.baseName}_ordered.vcf \ + --out ${vcf.baseName}_mergePrep.vcf \ + --tool ${tool_name} \ + ${support_call} + """ +} + +/* NOTE: PLEASE READ!!! + `reorder_vcf.py` requires the 'tumor/normal' names passed as the 3rd and 4th args to match the sample names in the VCF header. + If you pass names NOT present in the header, it will simply emit the file AS IS. + The script DOES NOT inform the user whether a change has been made to the sample order. + NOTE ALSO: if the header already contains the strings 'TUMOR' and 'NORMAL', + 'TUMOR' and 'NORMAL' are RENAMED to the strings provided in the 3rd and 4th args. +*/ diff --git a/modules/python/python_remove_contig.nf b/modules/python/python_remove_contig.nf new file mode 100644 index 00000000..52ff21eb --- /dev/null +++ b/modules/python/python_remove_contig.nf @@ -0,0 +1,24 @@ +process REMOVE_CONTIG { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: remove_contig_vcf + + script: + """ + python \ + ${projectDir}/bin/pta/remove_contig.py \ + ${vcf} \ + ${vcf.baseName}_removeContig.vcf + """ +} diff --git a/modules/python/python_rename_metadata.nf b/modules/python/python_rename_metadata.nf new file mode 100644 index 00000000..44724e03 --- /dev/null +++ b/modules/python/python_rename_metadata.nf @@ -0,0 +1,27 @@ +process RENAME_METADATA { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), path(idx), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*headerAdjust.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: rename_metadata_vcf + + script: + output_name = vcf.getBaseName().replace('.vcf', '') + """ + gunzip -c ${vcf} > temp.vcf + python \ + ${projectDir}/bin/pta/rename_metadata.py \ + temp.vcf \ + ${output_name}_headerAdjust.vcf \ + ${tool} + """ +} diff --git a/modules/python/python_rename_vcf.nf b/modules/python/python_rename_vcf.nf new file mode 100644 index 00000000..75f8f8cb --- /dev/null +++ b/modules/python/python_rename_vcf.nf @@ -0,0 +1,33 @@ +process RENAME_VCF { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*_sampleNamed.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: rename_vcf + + script: + + normal = meta.normal_id + tumor = meta.tumor_id + + tool_name = tool == 'lancet_support' ? 'lancet' : tool + + """ + python \ + ${projectDir}/bin/pta/rename_vcf.py \ + ${vcf} \ + ${vcf.baseName}_sampleNamed.vcf \ + ${normal} \ + ${tumor} \ + ${tool} + """ +} diff --git a/modules/python/python_reorder_vcf_columns.nf b/modules/python/python_reorder_vcf_columns.nf new file mode 100644 index 00000000..f4fd8f59 --- /dev/null +++ b/modules/python/python_reorder_vcf_columns.nf @@ -0,0 +1,29 @@ +process REORDER_VCF_COLUMNS { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), path(vcf), path(idx), val(meta) + + output: + tuple val(sampleID), path("*_mnv_final_filtered_merged_reordered.vcf"), val(meta), emit: vcf + + script: + + normal = meta.normal_id + tumor = meta.tumor_id + + """ + python \ + ${projectDir}/bin/pta/reorder_vcf.py \ + ${vcf} \ + ${vcf.baseName}_mnv_final_filtered_merged_reordered.vcf \ + ${normal} ${tumor} + """ +} diff --git a/modules/python/python_snv_to_mnv_final_filter.nf b/modules/python/python_snv_to_mnv_final_filter.nf new file mode 100644 index 00000000..00b9fbff --- /dev/null +++ b/modules/python/python_snv_to_mnv_final_filter.nf @@ -0,0 +1,24 @@ +process SNV_TO_MNV_FINAL_FILTER { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(chrom), emit: vcf + + script: + """ + python \ + ${projectDir}/bin/pta/SNVsToMNVs_CountsBasedFilter_AnnotateHighConf.py \ + -i ${vcf} \ + -o ${sampleID}_mnv_final_filtered_${chrom}.vcf + """ +} diff --git a/modules/python/python_somatic_vcf_finalization.nf b/modules/python/python_somatic_vcf_finalization.nf new file mode 100644 index 00000000..aa6b3b6a --- /dev/null +++ b/modules/python/python_somatic_vcf_finalization.nf @@ -0,0 +1,62 @@ +process SOMATIC_VCF_FINALIZATION { + tag "$sampleID" + + cpus 1 + memory 50.GB + time 1.hour + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vcf' }", pattern: "*final.*", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'vcf' }", pattern: "*supplemental.vcf", mode:'copy' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name) + val(filtered) + + output: + tuple val(sampleID), file("*final.vcf"), emit: vcf + tuple val(sampleID), file("*final.txt"), emit: txt + tuple val(sampleID), file("*final.maf"), emit: maf + tuple val(sampleID), file("*supplemental.vcf"), emit: supp_vcf + + script: + + output_suffix = filtered == 'filtered' ? 'filtered' : 'unfiltered' + + """ + python \ + ${projectDir}/bin/pta/annotate_id.py \ + ${vcf} \ + ${sampleID}_somatic_vep_cosmic_cancerResitMut_annotated_id.vcf + + python \ + ${projectDir}/bin/pta/rename_csq_vcf.py \ + ${sampleID}_somatic_vep_cosmic_cancerResitMut_annotated_id.vcf \ + ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_supplemental.vcf + + python \ + ${projectDir}/bin/pta/make_main_vcf.py \ + ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_supplemental.vcf \ + ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.vcf + + python \ + ${projectDir}/bin/pta/make_txt.py \ + --vcf ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.vcf \ + --txt ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.txt \ + --tumor ${tumor_name} \ + --normal ${normal_name} + + python \ + ${projectDir}/bin/pta/make_maf.py \ + --vcf ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.vcf \ + --maf ${sampleID}_somatic_snv_indel_annotated_${output_suffix}_final.maf \ + --library WGS \ + --vep-version GRCh38 \ + --tumor ${tumor_name} \ + --normal ${normal_name} \ + --ensembl-entrez ${params.ensembl_entrez} + + """ +} diff --git a/modules/python/python_split_mnv.nf b/modules/python/python_split_mnv.nf new file mode 100644 index 00000000..80286816 --- /dev/null +++ b/modules/python/python_split_mnv.nf @@ -0,0 +1,25 @@ +process SPLIT_MNV { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: split_mnv_vcf + + script: + """ + python \ + ${projectDir}/bin/pta/split_mnv.py \ + ${vcf} \ + ${vcf.baseName}_splitMNV.vcf \ + ${tool} + """ +} diff --git a/modules/python/python_vcf_to_bed.nf b/modules/python/python_vcf_to_bed.nf new file mode 100644 index 00000000..0c284320 --- /dev/null +++ b/modules/python/python_vcf_to_bed.nf @@ -0,0 +1,26 @@ +process VCF_TO_BED { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bedtools-python3:2.26.0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.bed"), val(meta), val(chrom), emit: bed + + script: + """ + python \ + ${projectDir}/bin/pta/vcf_to_bed.py \ + ${vcf} \ + | bedtools \ + merge \ + > ${sampleID}_candidate_merged_${chrom}.bed + """ +} diff --git a/modules/r/annotate_bicseq2_cnv.nf b/modules/r/annotate_bicseq2_cnv.nf new file mode 100644 index 00000000..6263c491 --- /dev/null +++ b/modules/r/annotate_bicseq2_cnv.nf @@ -0,0 +1,42 @@ +process ANNOTATE_BICSEQ2_CNV { + tag "$sampleID" + + cpus 1 + memory 10.GB + time '08:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'cnv'}", pattern: "*.bed", mode: 'copy' + + input: + //BICSEQ2_SEG.out.bicseq2_sv_calls + tuple val(sampleID), file(bicseq2_calls), val(no_idx), val(meta), val(normal_name), val(tumor_name), val(bicseq2) + val(chrom_list) + + output: + tuple val(sampleID), file("${sampleID}_cnv_annotated_final.bed"), val(normal_name), val(tumor_name), emit: bicseq_annot + tuple val(sampleID), file("${sampleID}_cnv_annotated_supplemental.bed"), val(normal_name), val(tumor_name), emit: bicseq_annot_suppl + + script: + listOfChroms = chrom_list.collect { "$it" }.join(',') + + """ + Rscript ${projectDir}/bin/pta/annotate-cnv.r \ + --cnv=${bicseq2_calls} \ + --caller="bicseq2" \ + --tumor=${tumor_name} \ + --normal=${normal_name} \ + --cytoband=${params.cytoband} \ + --db_names="DGV,1000G,COSMIC" \ + --db_files=${params.dgv},${params.thousandG},${params.cosmicUniqueBed} \ + --cancer_census=${params.cancerCensusBed} \ + --ensembl=${params.ensemblUniqueBed} \ + --allowed_chr=${listOfChroms} \ + --overlap_fraction=0.8 \ + --out_file_main=${sampleID}_cnv_annotated_final.bed \ + --out_file_supplemental=${sampleID}_cnv_annotated_supplemental.bed + + """ +} diff --git a/modules/r/annotate_genes_sv.nf b/modules/r/annotate_genes_sv.nf new file mode 100644 index 00000000..d7563fd2 --- /dev/null +++ b/modules/r/annotate_genes_sv.nf @@ -0,0 +1,38 @@ +process ANNOTATE_GENES_SV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + tuple val(sampleID), file(annot_sv_bedpe), val(normal_name), val(tumor_name) + val(suppl_switch) + + output: + tuple val(sampleID), file("*.manta_gridss_sv_annotated_genes*.bed"), val(normal_name), val(tumor_name), emit: annot_sv_genes_bedpe + + script: + + + if (suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-genes.r \ + --ensembl=${params.ensemblUniqueBed} \ + --cancer_census=${params.cancerCensusBed} \ + --bedpe=${annot_sv_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes.bed + """ + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-genes.r \ + --ensembl=${params.ensemblUniqueBed} \ + --cancer_census=${params.cancerCensusBed} \ + --bedpe=${annot_sv_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes_supplemental.bed \ + --supplemental + """ +} diff --git a/modules/r/annotate_sv.nf b/modules/r/annotate_sv.nf new file mode 100644 index 00000000..21bc27e2 --- /dev/null +++ b/modules/r/annotate_sv.nf @@ -0,0 +1,42 @@ +process ANNOTATE_SV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + // MERGE_SV.out.merged + tuple val(sampleID), file(merged_sv_bed), val(normal_name), val(tumor_name) + val(suppl_switch) + + output: + tuple val(sampleID), file("${sampleID}.manta_gridss_sv_annotated*.bed"), val(normal_name), val(tumor_name), emit: annot_sv_bedpe + + script: + + if (suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-databases.r \ + --db_names=gap,DGV,1000G,PON,COSMIC \ + --db_files=${params.gap},${params.dgvBedpe},${params.thousandGVcf},${params.svPon},${params.cosmicBedPe} \ + --slop=500 \ + --db_ignore_strand=COSMIC \ + --bedpe=${merged_sv_bed} \ + --out_file=${sampleID}.manta_gridss_sv_annotated.bed + + """ + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-databases.r \ + --db_names=gap,DGV,1000G,PON,COSMIC \ + --db_files=${params.gap},${params.dgvBedpe},${params.thousandGVcf},${params.svPon},${params.cosmicBedPe} \ + --slop=500 \ + --db_ignore_strand=COSMIC \ + --bedpe=${merged_sv_bed} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_supplemental.bed + """ +} diff --git a/modules/r/annotate_sv_with_cnv.nf b/modules/r/annotate_sv_with_cnv.nf new file mode 100644 index 00000000..c2fc0a71 --- /dev/null +++ b/modules/r/annotate_sv_with_cnv.nf @@ -0,0 +1,35 @@ +process ANNOTATE_SV_WITH_CNV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + tuple val(sampleID), val(normal_name), val(tumor_name), file(bicseq_annot), file(annot_sv_genes_bedpe) + val(suppl_switch) + + output: + tuple val(sampleID), file("${sampleID}.manta_gridss_sv_annotated_genes_cnv*.bed"), val(normal_name), val(tumor_name), emit: sv_genes_cnv_bedpe + + script: + + if (suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-cnv.r \ + --cnv=${bicseq_annot} \ + --bedpe=${annot_sv_genes_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes_cnv.bed + """ + + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/annotate-bedpe-with-cnv.r \ + --cnv=${bicseq_annot} \ + --bedpe=${annot_sv_genes_bedpe} \ + --out_file=${sampleID}.manta_gridss_sv_annotated_genes_cnv_supplemental.bed + """ +} diff --git a/modules/r/filter_bedpe.nf b/modules/r/filter_bedpe.nf new file mode 100644 index 00000000..5da2a0e6 --- /dev/null +++ b/modules/r/filter_bedpe.nf @@ -0,0 +1,43 @@ +process FILTER_BEDPE { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'bedpe'}", pattern: "*.bedpe", mode: 'copy' + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + // ANNOTATE_SV_WITH_CNV.out.sv_genes_cnv_bedpe + tuple val(sampleID), file(sv_genes_cnv_bedpe), val(normal_name), val(tumor_name) + val(suppl_switch) + output: + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_final.bedpe"), val(normal_name), val(tumor_name), optional: true + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_supplemental.bedpe"), val(normal_name), val(tumor_name), optional: true + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_high_confidence_final.bedpe"), val(normal_name), val(tumor_name), optional: true + tuple val(sampleID), file("${sampleID}_sv_annotated_somatic_high_confidence_supplemental.bedpe"), val(normal_name), val(tumor_name), optional: true + + script: + if(suppl_switch == "main") + """ + Rscript ${projectDir}/bin/pta/filter-bedpe.r \ + --max_changepoint_distance=1000 \ + --filter_databases=DGV,1000G,PON \ + --bedpe=${sv_genes_cnv_bedpe} \ + --out_file_somatic=${sampleID}_sv_annotated_somatic_final.bedpe \ + --out_file_highconf=${sampleID}_sv_annotated_somatic_high_confidence_final.bedpe + """ + + else if (suppl_switch == "supplemental") + """ + Rscript ${projectDir}/bin/pta/filter-bedpe.r \ + --max_changepoint_distance=1000 \ + --filter_databases=DGV,1000G,PON \ + --bedpe=${sv_genes_cnv_bedpe} \ + --out_file_somatic=${sampleID}_sv_annotated_somatic_supplemental.bedpe \ + --out_file_highconf=${sampleID}_sv_annotated_somatic_high_confidence_supplemental.bedpe + """ +} diff --git a/modules/rstudio/rstudio_frag_len_plot.nf b/modules/r/frag_len_plot.nf similarity index 54% rename from modules/rstudio/rstudio_frag_len_plot.nf rename to modules/r/frag_len_plot.nf index 67081f76..7a0ee302 100644 --- a/modules/rstudio/rstudio_frag_len_plot.nf +++ b/modules/r/frag_len_plot.nf @@ -4,6 +4,7 @@ process FRAG_LEN_PLOT { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'rstudio' }", pattern: "*fraglen_plot.pdf", mode: 'copy' container 'quay.io/jaxcompsci/rstudio:4.2.0' @@ -13,11 +14,11 @@ process FRAG_LEN_PLOT { output: tuple val(sampleID), file("*fraglen_plot.pdf") + tuple val(sampleID), file("*_spline_table.txt"), emit: spline_table script: - log.info "----- Fragment Length Plot on ${sampleID} -----" """ - Rscript ${projectDir}/bin/atac/fragment_length_plot.R ${frag_len_count} + Rscript ${projectDir}/bin/atac/fragment_length_plot.R ${frag_len_count} ${sampleID}_spline_table.txt mv fraglen_plot.pdf ${sampleID}_fraglen_plot.pdf """ } diff --git a/modules/r/merge_sv.nf b/modules/r/merge_sv.nf new file mode 100644 index 00000000..8f6ccf7c --- /dev/null +++ b/modules/r/merge_sv.nf @@ -0,0 +1,36 @@ +process MERGE_SV { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/r-sv_cnv_annotate:4.1.1' + + input: + tuple val(sampleID), val(normal_name), val(tumor_name), file(manta_vcf), file(manta_vcf_tbi), val(meta_manta), val(manta), file(gripss_vcf), val(gripss_idx), val(meta_gripss), val(gripss) + val(chrom_list) + + output: + tuple val(sampleID), file("${sampleID}.manta_gridss_sv.bed"), val(normal_name), val(tumor_name), emit: merged + tuple val(sampleID), file("${sampleID}.manta_gridss_sv_supplemental.bed"), val(normal_name), val(tumor_name), emit: merged_suppl + + + script: + listOfChroms = chrom_list.collect { "$it" }.join(',') + + """ + Rscript ${projectDir}/bin/pta/merge-caller-vcfs.r \ + --vcf=${manta_vcf},${gripss_vcf} \ + --caller=manta,gridss \ + --tumor=${tumor_name} \ + --normal=${normal_name} \ + --build=GRCh38 \ + --slop=300 \ + --allowed_chr=${listOfChroms} \ + --min_sv_length=500 \ + --out_file=${sampleID}.manta_gridss_sv.bed \ + --out_file_supplemental=${sampleID}.manta_gridss_sv_supplemental.bed + """ +} diff --git a/modules/rsem/rsem_alignment_expression.nf b/modules/rsem/rsem_alignment_expression.nf index 31179a5e..c98b3514 100644 --- a/modules/rsem/rsem_alignment_expression.nf +++ b/modules/rsem/rsem_alignment_expression.nf @@ -2,43 +2,46 @@ process RSEM_ALIGNMENT_EXPRESSION { tag "$sampleID" cpus 12 - memory { 60.GB * task.attempt } - time { 24.h * task.attempt } - errorStrategy 'retry' - maxRetries 1 + memory 60.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - container 'quay.io/jaxcompsci/rsem_bowtie2_star:0.1.0' + container 'quay.io/jaxcompsci/rsem_bowtie2_star:0.1.0' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'rsem' }", pattern: "*stats", mode:'copy', enabled: params.rsem_aligner == "bowtie2" publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'rsem' }", pattern: "*results*", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'rsem' }", pattern: "*genome.bam", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'rsem' }", pattern: "*transcript.bam", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'rsem' }", pattern: "*genome.sorted.ba*", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/bam' : 'rsem' }", pattern: "*transcript.sorted.ba*", mode:'copy' input: - tuple val(sampleID), file(reads) - file(rsem_ref_files) + tuple val(sampleID), path(reads), val(strand_setting), val(read_length) + val(rsem_ref_path) + val(rsem_star_prefix) + val(rsem_ref_prefix) output: - file "*stats" - file "*results*" - tuple val(sampleID), file("rsem_aln_*.stats"), emit: rsem_stats - tuple val(sampleID), file("*genes.results"), emit: rsem_genes - tuple val(sampleID), file("*isoforms.results"), emit: rsem_isoforms - tuple val(sampleID), file("*.genome.bam"), emit: bam - tuple val(sampleID), file("*.transcript.bam"), emit: transcript_bam - + path "*stats" + path "*results*" + tuple val(sampleID), path("rsem_aln_*.stats"), emit: rsem_stats + tuple val(sampleID), path("*.stat/*.cnt"), emit: rsem_cnt + tuple val(sampleID), path("*genes.results"), emit: rsem_genes + tuple val(sampleID), path("*isoforms.results"), emit: rsem_isoforms + tuple val(sampleID), path("*.genome.bam"), emit: bam + tuple val(sampleID), path("*.transcript.bam"), emit: transcript_bam + tuple val(sampleID), path("*.genome.sorted.bam"), path("*.genome.sorted.bam.bai"), emit: sorted_genomic_bam + tuple val(sampleID), path("*.transcript.sorted.bam"), path("*.transcript.sorted.bam.bai"), emit: sorted_transcript_bam + script: - log.info "----- Genome Alignment Running on: ${sampleID} -----" - if (params.read_prep == "reverse_stranded") { + if (strand_setting == "reverse_stranded") { prob="--forward-prob 0" } - if (params.read_prep == "forward_stranded") { + if (strand_setting == "forward_stranded") { prob="--forward-prob 1" } - if (params.read_prep == "non_stranded") { + if (strand_setting == "non_stranded") { prob="--forward-prob 0.5" } @@ -53,15 +56,43 @@ process RSEM_ALIGNMENT_EXPRESSION { trimmedfq="${reads[0]}" } if (params.rsem_aligner == "bowtie2"){ - outbam="--output-genome-bam" + + rsem_ref_files = file("${rsem_ref_path}/bowtie2/*").collect { "$it" }.join(' ') + + outbam="--output-genome-bam --sort-bam-by-coordinate" seed_length="--seed-length ${params.seed_length}" + sort_command='' + index_command='' } if (params.rsem_aligner == "star") { - outbam="--star-output-genome-bam" + outbam="--star-output-genome-bam --sort-bam-by-coordinate" seed_length="" + samtools_mem = task.memory.giga / task.cpus + sort_command="samtools sort -@ ${task.cpus} -m ${samtools_mem}G -o ${sampleID}.STAR.genome.sorted.bam ${sampleID}.STAR.genome.bam" + index_command="samtools index ${sampleID}.STAR.genome.sorted.bam" + + read_length = read_length.toInteger() + + if( read_length >= 65 && read_length <= 85) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_75/*").collect { "$it" }.join(' ') + } else if( read_length >= 90 && read_length <= 110 ) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_100/*").collect { "$it" }.join(' ') + } else if( read_length >= 115 && read_length <= 135 ) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_125/*").collect { "$it" }.join(' ') + } else if( read_length >= 140 && read_length <= 160 ) { + rsem_ref_files = file("${rsem_ref_path}/STAR/${rsem_star_prefix}_150/*").collect { "$it" }.join(' ') + } else { + log.info("\nUnsupported read length " + read_length + " in RSEM with STAR. RSEM will now fail gracefully.\n\n") + rsem_ref_files = 'error' + } + } """ + if [ "${rsem_ref_files}" = "error" ]; then exit 1; fi + + ln -s -f ${rsem_ref_files} . 
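+ # The symlinks above place the index files selected in the Groovy block (the bowtie2 index, or the
+ # STAR index whose read-length bucket of roughly 75/100/125/150 matched this sample's measured read length)
+ # into the task work directory, so that rsem-calculate-expression below can resolve them through the
+ # rsem_ref_prefix reference name. The guard above exits non-zero instead when no supported bucket was found.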
+ rsem-calculate-expression -p $task.cpus \ ${prob} \ ${stype} \ @@ -71,8 +102,12 @@ process RSEM_ALIGNMENT_EXPRESSION { ${seed_length} \ ${outbam} \ ${trimmedfq} \ - ${params.rsem_ref_prefix} \ + ${rsem_ref_prefix} \ ${sampleID} \ 2> rsem_aln_${sampleID}.stats + + ${sort_command} + + ${index_command} """ -} \ No newline at end of file +} diff --git a/modules/samtools/samtools_calc_mtdna_filter_chrm.nf b/modules/samtools/samtools_calc_mtdna_filter_chrm.nf index f275d8f7..45336f68 100644 --- a/modules/samtools/samtools_calc_mtdna_filter_chrm.nf +++ b/modules/samtools/samtools_calc_mtdna_filter_chrm.nf @@ -4,6 +4,7 @@ process CALC_MTDNA_FILTER_CHRM { cpus 4 memory 4.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'samtools' }", pattern: "*_mtDNA_Content.txt", mode: 'copy' container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' @@ -17,7 +18,6 @@ process CALC_MTDNA_FILTER_CHRM { tuple val(sampleID), file("*_mtDNA_Content.txt"), emit: mtdna_log shell: - log.info "----- Calculate %mtDNA and Filter Mitochondrial Reads on ${sampleID} -----" // Get Mitochondrial and total read counts, calculate %mtDNA and filter Mitochondrial Reads from bam file mt_name = params.gen_org == 'mouse' ? 'MT' : 'chrM' @@ -37,7 +37,7 @@ process CALC_MTDNA_FILTER_CHRM { fi # Calculate %mtDNA - echo 'mtDNA Content:' $(bc <<< "scale=2;100*$mtReads/$totalReads")'%' >> !{sampleID}_mtDNA_Content.txt + echo -e 'sampleID\\tPerc mtDNA\\n'!{sampleID}'\\t'$(bc <<< "scale=2;100*$mtReads/$totalReads") >> !{sampleID}_mtDNA_Content.txt # Filter Mitochondrial Reads from bam file samtools view -@ !{task.cpus} -h !{rmdup_bam_file} \ diff --git a/modules/samtools/samtools_chain_bad2uniq_reads.nf b/modules/samtools/samtools_chain_bad2uniq_reads.nf index 78d5f4c0..0011e211 100644 --- a/modules/samtools/samtools_chain_bad2uniq_reads.nf +++ b/modules/samtools/samtools_chain_bad2uniq_reads.nf @@ -4,6 +4,7 @@ process CHAIN_BAD2UNIQ_READS { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' @@ -16,7 +17,6 @@ process CHAIN_BAD2UNIQ_READS { when: params.chain != null shell: - log.info "----- Getting 'bad reads' from bam file on ${sampleID} -----" // Get unique 'bad read names' from bam file using gatk ValidateSamFile out results ''' cat !{bad_reads} \ diff --git a/modules/samtools/samtools_chain_sort_fixmate_bam.nf b/modules/samtools/samtools_chain_sort_fixmate_bam.nf index 24f4a6f3..78e1d0d9 100644 --- a/modules/samtools/samtools_chain_sort_fixmate_bam.nf +++ b/modules/samtools/samtools_chain_sort_fixmate_bam.nf @@ -4,20 +4,21 @@ process CHAIN_SORT_FIXMATE_BAM { cpus 8 memory 20.GB time '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern: "*.filtered.shifted.*", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: - tuple val(sampleID), file(bam_mm10) + tuple val(sampleID), file(bam) output: - tuple val(sampleID), file("*.filtered.shifted.*") + tuple val(sampleID), path("*.filtered.shifted.*") when: params.chain != null script: - log.info "----- Performing sort, fixmate, filter the bam on ${sampleID} -----" // This module is for Non-Reference Strain Samples. // To sort bam by read name, fix the mate information, re-sort by coordinates and filter Mitochondrial Reads from bam file. """ @@ -25,7 +26,7 @@ process CHAIN_SORT_FIXMATE_BAM { samtools sort \ -n \ -@ $task.cpus -O bam \ - -o ${sampleID}.tmp3.mm10.bam ${bam_mm10[0]} + -o ${sampleID}.tmp3.mm10.bam ${bam[0]} # fix the mate information. This is done to fix 'TLEN' which is required for MACS2 samtools fixmate \ diff --git a/modules/samtools/samtools_faidx.nf b/modules/samtools/samtools_faidx.nf new file mode 100644 index 00000000..639c2c83 --- /dev/null +++ b/modules/samtools/samtools_faidx.nf @@ -0,0 +1,23 @@ +process SAMTOOLS_FAIDX { + tag "${fasta}" + + cpus 1 + memory 8.GB + time '06:00:00' + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/genome_info", mode: 'copy' + + input: + file(fasta) + + output: + file("*.fai") + + script: + + """ + samtools faidx ${fasta} + """ +} diff --git a/modules/samtools/samtools_filter.nf b/modules/samtools/samtools_filter.nf new file mode 100644 index 00000000..77d44d80 --- /dev/null +++ b/modules/samtools/samtools_filter.nf @@ -0,0 +1,31 @@ +process SAMTOOLS_FILTER { + tag "$sampleID" + + cpus 2 + memory 4.GB + time '10:00:00' + + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' + + input: + tuple val(sampleID), file(in_file) + val(option) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + // Exclude reads based on input bit flag. + + prefix = "${sampleID}.Lb" + if (params.workflow == "chipseq"){ + output = "${prefix}.bam" + } + else{ + output = "${sampleID}.bam" + } + """ + samtools view -h -b ${option} ${in_file} > ${output} + """ + +} diff --git a/modules/samtools/samtools_filter_remove_multi_shift.nf b/modules/samtools/samtools_filter_remove_multi_shift.nf index 7ecefe36..2fff05e5 100644 --- a/modules/samtools/samtools_filter_remove_multi_shift.nf +++ b/modules/samtools/samtools_filter_remove_multi_shift.nf @@ -4,8 +4,10 @@ process FILTER_REMOVE_MULTI_SHIFT { cpus 4 memory 10.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/bam' : 'samtools' }", pattern: "*.sorted.rmDup.rmChrM.rmMulti.filtered.ba*", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -16,7 +18,6 @@ process FILTER_REMOVE_MULTI_SHIFT { tuple val(sampleID), file("*.sorted.rmDup.rmChrM.rmMulti.filtered.ba*"), emit: srf_bam script: - log.info "----- Filter Non-Unique and Include Only 'properly mapped reads' Alignments on ${sampleID} -----" // Filter reads that are unmapped, mate unmapped, not primary alignment, failing platform QC, or PCR duplicates (-F 1804), and retain properly paired reads (-f 2) in the bam file """ # filter low quality reads diff --git a/modules/samtools/samtools_filter_unique_reads.nf b/modules/samtools/samtools_filter_unique_reads.nf new file mode 100644 index 00000000..5c5510df --- /dev/null +++ b/modules/samtools/samtools_filter_unique_reads.nf @@ -0,0 +1,39 @@ +process SAMTOOLS_FILTER_UNIQUE { + tag "$sampleID" + + cpus 1 + memory 4.GB + time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/bicseq2:v2' + + input: + tuple val(sampleID), val(meta), path(bam), path(bai), val(read_ID) + val(chroms) + + output: + tuple val(sampleID), path("seq_out/*.seq"), val(meta), val(read_ID), emit: uniq_seq + + script: + chrom_list = chroms.collect { "$it" }.join(' ') + """ + /samtools-0.1.7a_getUnique-0.1.3/samtools view -U "BWA,${read_ID}_,N,N" ${bam} + + mkdir seq_out + + for chrom in ${chrom_list}; do mv *\$chrom.seq seq_out/; done + + """ +} +// Modified samtools view: +// -U STR If specified, get the unique reads. STR should be or +// e.g. means that the aligner is BWA, the prefix of the output is output, and that the chromosome names and strands will not be reported +// StrandReport should be S (separate positive and negative reads), Y, or N +// minLen and maxLen specify the length range of the reported reads + +// NOTE: The modified samtools view with -U does not work like standard samtools. It does not pass the region to the filter step. + +// NOTE: The modified samtools view reads the header and uses the chrom names in the header, no matter what is present in the mapped file. + +// NOTE: Therefore, to avoid `chrUn_*_decoy.seq` and `_HLA-*.seq` non-primary chroms, the .seq files for the primary chromosome list are moved to the seq_out output directory. diff --git a/modules/samtools/samtools_final_calc_frip.nf b/modules/samtools/samtools_final_calc_frip.nf index 8152a183..b7a6347d 100644 --- a/modules/samtools/samtools_final_calc_frip.nf +++ b/modules/samtools/samtools_final_calc_frip.nf @@ -4,8 +4,10 @@ process FINAL_CALC_FRIP { cpus 1 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID+'/stats' : 'samtools' }", pattern: "*_Fraction_reads_in_peak.txt", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -15,13 +17,12 @@ process FINAL_CALC_FRIP { tuple val(sampleID), file("*_Fraction_reads_in_peak.txt") shell: - log.info "----- Final Calculate (FRiP) on ${sampleID} -----" // Calculate fraction of reads in peak ''' total_reads=$(samtools view -c !{processed_bams[0]}) reads_in_peaks=$(samtools view -c !{reads_peaks_bams[0]}) FRiP=$(awk "BEGIN {print "${reads_in_peaks}"/"${total_reads}"}") - echo -e ${FRiP}"\\t"${total_reads} \ + echo -e 'SAMPLEID\\tFRiP\\tFiltered Reads\\n'!{sampleID}"\\t"${FRiP}"\\t"${total_reads} \ > !{sampleID}_Fraction_reads_in_peak.txt ''' } diff --git a/modules/samtools/samtools_index.nf b/modules/samtools/samtools_index.nf index 248be1a8..a864b89d 100644 --- a/modules/samtools/samtools_index.nf +++ b/modules/samtools/samtools_index.nf @@ -4,19 +4,19 @@ process SAMTOOLS_INDEX { cpus 1 memory 8.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/samtools:1.14--hb421002_0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern:"*.ba*", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern:"*.ba*", mode:'copy', enabled: params.workflow == 'rrbs' ? true : false input: - tuple val("sampleID"), file(bam) + tuple val(sampleID), file(bam) output: - tuple val("sampleID"), file("*.bai"), emit: bai + tuple val(sampleID), file("*.bai"), emit: bai script: - log.info "----- Samtools Index Running on: ${sampleID} -----" """ samtools index ${bam} diff --git a/modules/samtools/samtools_mergebam_filter.nf b/modules/samtools/samtools_mergebam_filter.nf new file mode 100644 index 00000000..23c2bb10 --- /dev/null +++ b/modules/samtools/samtools_mergebam_filter.nf @@ -0,0 +1,35 @@ +process SAMTOOLS_MERGEBAM_FILTER { + tag "$sampleID" + + cpus 2 + memory 4.GB + time '10:00:00' + + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' + + input: + tuple val(sampleID), file(in_file) + file(bed) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + // Setup for chipseq pipeline + + prefix = params.read_type == 'SE' ? "${sampleID}.mLb.clN" : "${sampleID}.mLb.flT" + filter_params = params.read_type == 'SE' ? '-F 0x004' : '-F 0x004 -F 0x0008 -f 0x001' + dup_params = params.keep_dups ? '' : '-F 0x0400' + multimap_params = params.keep_multi_map ? '' : '-q 1' + blacklist_params = params.blacklist ? "-L $bed" : '' + + """ + samtools view \\ + $filter_params \\ + $dup_params \\ + $multimap_params \\ + $blacklist_params \\ + -b ${in_file} > ${prefix}.bam + """ + +} diff --git a/modules/samtools/samtools_non_chain_reindex.nf b/modules/samtools/samtools_non_chain_reindex.nf index 058bc9f7..40418f78 100644 --- a/modules/samtools/samtools_non_chain_reindex.nf +++ b/modules/samtools/samtools_non_chain_reindex.nf @@ -4,8 +4,10 @@ process NON_CHAIN_REINDEX { cpus 1 memory 8.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern: "*.filtered.shifted.*", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -17,7 +19,6 @@ process NON_CHAIN_REINDEX { when: params.chain == null script: - log.info "----- Filtering Mitochondrial, Unplaced/Unlocalized Reads and reindex on ${sampleID} -----" // This module is for Reference Strain Samples. // To filter Mitochondrial, Unplaced/Unlocalized Reads from bam file. """ diff --git a/modules/samtools/samtools_quality_checks.nf b/modules/samtools/samtools_quality_checks.nf index 851d60f2..7bf02938 100644 --- a/modules/samtools/samtools_quality_checks.nf +++ b/modules/samtools/samtools_quality_checks.nf @@ -4,8 +4,10 @@ process QUALITY_CHECKS { cpus 2 memory 4.GB time '04:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'samtools' }", pattern: "*.fragment_length_count.txt", mode: 'copy' + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: @@ -15,13 +17,11 @@ process QUALITY_CHECKS { tuple val(sampleID), file("*.fragment_length_count.txt") script: - log.info "----- Quality checks on ${sampleID} -----" - log.info "----- Fragment/Insert size on ${sampleID} -----" // Get the fragment length count from bam file for Quality Checks. """ samtools view \ -@ $task.cpus ${sort_rm_filter_bam[0]} \ | awk '\$9>0' | cut -f 9 | sort | uniq -c | sort -b -k2,2n \ - | sed -e 's/^[ \\t]*//' > ${sampleID}.fragment_length_count.txt + | sed -e 's/^[ \\t]*//' | awk -v sample="${sampleID}" -F' ' '{print sample,\$1,\$2}' OFS="\\t" > ${sampleID}.fragment_length_count.txt """ } diff --git a/modules/samtools/samtools_remove_duplicate_reads.nf b/modules/samtools/samtools_remove_duplicate_reads.nf index a54bc3f3..b40267f6 100644 --- a/modules/samtools/samtools_remove_duplicate_reads.nf +++ b/modules/samtools/samtools_remove_duplicate_reads.nf @@ -4,6 +4,7 @@ process REMOVE_DUPLICATE_READS { cpus 2 memory 4.GB time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' @@ -15,7 +16,6 @@ process REMOVE_DUPLICATE_READS { tuple val(sampleID), file("*.sorted.rmDup.bam.bai"), emit: rmDup_bai script: - log.info "----- Samtools Removing PCR Duplicates on: ${sampleID} -----" // Exclude reads flagged as pcr or optical duplicates (0x400), marked with bit flag 1024 in the BAM. 
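// (-F 1024 in the command below excludes any alignment with the 0x400 duplicate bit set; duplicates must
// already have been flagged by an upstream marking step for this filter to remove anything.)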
""" samtools view -h -b -F 1024 ${marked_bam_file} > ${sampleID}.sorted.rmDup.bam diff --git a/modules/samtools/samtools_sort.nf b/modules/samtools/samtools_sort.nf index 96e38933..6f08983e 100644 --- a/modules/samtools/samtools_sort.nf +++ b/modules/samtools/samtools_sort.nf @@ -1,42 +1,29 @@ -process SORT { +process SAMTOOLS_SORT { tag "$sampleID" cpus 4 memory 20.GB time '20:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/bam' : 'samtools' }", pattern: "*.bam", mode:'copy', enabled: params.workflow == 'rrbs' ? true : false + container 'quay.io/jaxcompsci/samtools_with_bc:1.3.1' input: tuple val(sampleID), file(sam_file) val(options) + val(suffix) output: - tuple val(sampleID), file("*.sorted.bam*") + tuple val(sampleID), file("*.sorted.*"), emit: sorted_file script: - log.info "----- Samtools sort Running on: ${sampleID} -----" - - // check if not sorting by name - if(options != "-n ") - """ - samtools sort \ - ${options} \ - -@ $task.cpus \ - -O bam \ - -o ${sampleID}.sorted.bam \ - ${sam_file[0]} - - samtools index \ - ${sampleID}.sorted.bam - """ - else """ samtools sort \ ${options} \ - -@ $task.cpus \ - -O bam \ - -o ${sampleID}.sorted.bam \ - ${sam_file[0]} + -@ ${task.cpus} \ + -o ${sam_file.baseName}.sorted.${suffix} \ + ${sam_file} """ } diff --git a/modules/samtools/samtools_stats.nf b/modules/samtools/samtools_stats.nf new file mode 100644 index 00000000..42741699 --- /dev/null +++ b/modules/samtools/samtools_stats.nf @@ -0,0 +1,41 @@ +process SAMTOOLS_STATS { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/samtools' : 'samtools'}" + }, pattern: "*.flagstat", mode: 'copy', enabled: params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/samtools' : 'samtools'}" + }, pattern: "*.idxstats", mode: 'copy', enabled: params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? 
type+sampleID+'/samtools' : 'samtools'}" + }, pattern: "*.stats", mode: 'copy', enabled: params.keep_intermediate + + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.flagstat"), emit: flagstat + tuple val(sampleID), file("*.idxstats"), emit: idxstat + tuple val(sampleID), file("*.stats"), emit: stats + + script: + + """ + samtools flagstat ${bam[0]} > ${bam[0]}.flagstat + samtools idxstats ${bam[0]} > ${bam[0]}.idxstats + samtools stats ${bam[0]} > ${bam[0]}.stats + """ +} diff --git a/modules/samtools/samtools_stats_insertsize.nf b/modules/samtools/samtools_stats_insertsize.nf new file mode 100644 index 00000000..5b0b7299 --- /dev/null +++ b/modules/samtools/samtools_stats_insertsize.nf @@ -0,0 +1,26 @@ +process SAMTOOLS_STATS_INSERTSIZE { + tag "$sampleID" + + cpus 8 + memory 1.GB + time '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'samtools' }", pattern: "*insert_size.txt", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(bam), path(bai), val(read_ID) + + output: + tuple val(sampleID), env(read_length), env(insert_size), emit: read_length_insert_size + file("*insert_size.txt") + + script: + """ + samtools stats --insert-size 8000 ${bam} --threads ${task.cpus} | grep ^SN | cut -f 2- > ${sampleID}_insert_size.txt + read_length=`grep "maximum length" ${sampleID}_insert_size.txt | cut -d ':' -f2 | tr -d " \\t\\n\\r"` + insert_size=`grep "insert size average" ${sampleID}_insert_size.txt | cut -d ':' -f2 | tr -d " \\t\\n\\r"` + """ +} diff --git a/modules/samtools/samtools_view.nf b/modules/samtools/samtools_view.nf new file mode 100644 index 00000000..fbd94378 --- /dev/null +++ b/modules/samtools/samtools_view.nf @@ -0,0 +1,26 @@ +process SAMTOOLS_VIEW { + tag "$sampleID" + + cpus 1 + memory 8.GB + time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/samtools:1.14--hb421002_0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'samtools_view' }", pattern:"*.bam", mode:'copy', enabled: params.keep_intermediate + + input: + tuple val(sampleID), file(sam) + val(view_string) + val(filename) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: + + """ + samtools view ${view_string} ${sam} > ${sampleID}_${filename}.bam + """ +} diff --git a/modules/snpeff_snpsift/snpeff_oneperline.nf b/modules/snpeff_snpsift/snpeff_oneperline.nf index 99cabfca..65920879 100644 --- a/modules/snpeff_snpsift/snpeff_oneperline.nf +++ b/modules/snpeff_snpsift/snpeff_oneperline.nf @@ -4,6 +4,7 @@ process SNPEFF_ONEPERLINE { cpus 1 memory 2.GB time '00:10:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} input: tuple val(sampleID), file(vcf) @@ -25,4 +26,4 @@ process SNPEFF_ONEPERLINE { """ cat ${vcf} | perl ${projectDir}/bin/shared/vcfEffOnePerLine.pl > ${sampleID}_oneperline_${output_suffix} """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpeff_snpeff.nf b/modules/snpeff_snpsift/snpeff_snpeff.nf index 8bfe3c64..52cc872a 100644 --- a/modules/snpeff_snpsift/snpeff_snpeff.nf +++ b/modules/snpeff_snpsift/snpeff_snpeff.nf @@ -4,11 +4,11 @@ process SNPEFF{ cpus = 1 memory = 8.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.*", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.*", mode:'copy', enabled: params.gen_org=='mouse' ? true : params.keep_intermediate input: tuple val(sampleID),file(vcf) @@ -19,11 +19,10 @@ process SNPEFF{ tuple val(sampleID),file("*.vcf"), emit:vcf //tuple val(sampleID),file("*.html") // If adding back in ^ this command should be added to the java block below - // -s ${sampleID}_snpeff.html \ + // -s ${sampleID}_snpeff.html \ // tuple val(sampleID),file("*") script: - log.info "----- snpEff Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -35,7 +34,7 @@ process SNPEFF{ output_suffix = 'SNP_snpeff.vcf' } if (indel_snp == 'BOTH'){ - output_suffix = 'snp_indel_snpeff.vcf' + output_suffix = 'SNP_INDEL_filtered_annotated_final.vcf' } """ @@ -46,4 +45,4 @@ process SNPEFF{ -noStats \ ${vcf} > ${sampleID}_${output_suffix} """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpsift_annotate.nf b/modules/snpeff_snpsift/snpsift_annotate.nf index 2f72e326..020b5a85 100644 --- a/modules/snpeff_snpsift/snpsift_annotate.nf +++ b/modules/snpeff_snpsift/snpsift_annotate.nf @@ -4,9 +4,12 @@ process SNPSIFT_ANNOTATE { cpus = 1 memory = 6.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpsift' }", pattern:"*dbsnpID.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.vcf", mode:'copy', enabled: params.workflow == 'amplicon' ? 
true : false input: tuple val(sampleID), file(vcf) @@ -26,4 +29,4 @@ process SNPSIFT_ANNOTATE { java -Xmx${my_mem}G -jar /opt/snpEff/SnpSift.jar \ annotate -noDownload -id ${annot_source} ${vcf} > ${vcf.baseName}_${output_suffix}.vcf """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpsift_dbnsfp.nf b/modules/snpeff_snpsift/snpsift_dbnsfp.nf index cabff374..510a61ed 100644 --- a/modules/snpeff_snpsift/snpsift_dbnsfp.nf +++ b/modules/snpeff_snpsift/snpsift_dbnsfp.nf @@ -4,11 +4,11 @@ process SNPSIFT_DBNSFP{ cpus = 1 memory = 6.GB time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.vcf", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'snpeff' }", pattern:"*.vcf", mode:'copy', enabled: params.keep_intermediate input: tuple val(sampleID), file(vcf) @@ -18,7 +18,6 @@ process SNPSIFT_DBNSFP{ tuple val(sampleID), file("*.vcf"), emit: vcf script: - log.info "----- snpSift DBNSFP Running on: ${sampleID} -----" String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] @@ -39,4 +38,4 @@ process SNPSIFT_DBNSFP{ -f SIFT_score,SIFT_pred,Polyphen2_HDIV_score,MutationAssessor_score,phyloP100way_vertebrate,1000Gp3_AF,1000Gp3_AFR_AF,1000Gp3_EUR_AF,1000Gp3_AMR_AF,1000Gp3_EAS_AF,ESP6500_AA_AF,ESP6500_EA_AF \ ${vcf} > ${sampleID}_${output_suffix} """ -} \ No newline at end of file +} diff --git a/modules/snpeff_snpsift/snpsift_extractfields.nf b/modules/snpeff_snpsift/snpsift_extractfields.nf index 9ae9b2bb..e53449f1 100644 --- a/modules/snpeff_snpsift/snpsift_extractfields.nf +++ b/modules/snpeff_snpsift/snpsift_extractfields.nf @@ -4,9 +4,9 @@ process SNPSIFT_EXTRACTFIELDS { cpus = 1 memory = 6.GB time = '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - // SNPEFF and SNPSIFT need updating - container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1' + container 'quay.io/jaxcompsci/snpeff_snpsift_5.1:v5.1d' publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'snpeff' }", pattern:"*.txt", mode:'copy' @@ -14,26 +14,29 @@ process SNPSIFT_EXTRACTFIELDS { tuple val(sampleID), file(vcf) output: - tuple val(sampleID), file("*.txt"), emit: txt - + tuple val(sampleID), file("*.txt"), emit: txt, optional: true + tuple val(sampleID), file("*.temp"), emit: temp, optional: true + script: - log.info "----- snpSift DBNSFP Running on: ${sampleID} -----" // add suffix for snp indel both for output name String my_mem = (task.memory-1.GB).toString() my_mem = my_mem[0..-4] if (params.gen_org=='human'){ - fields = 'CHROM POS ID REF ALT QUAL FILTER "ANN[*].ALLELE" "ANN[*].EFFECT" "ANN[*].IMPACT" "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" "ANN[*].AA_LEN" "ANN[*].DISTANCE" "LOF[*].GENE" "LOF[*].GENEID" "LOF[*].NUMTR" "LOF[*].PERC" "NMD[*].GENE" "NMD[*].GENEID" "NMD[*].NUMTR" "NMD[*].PERC" "dbNSFP_SIFT_score" "dbNSFP_SIFT_pred" "dbNSFP_Polyphen2_HDIV_score" "dbNSFP_MutationAssessor_score" "dbNSFP_phyloP100way_vertebrate" "dbNSFP_1000Gp3_AF" "dbNSFP_1000Gp3_AFR_AF" "dbNSFP_1000Gp3_EUR_AF" "dbNSFP_1000Gp3_AMR_AF" "dbNSFP_1000Gp3_EAS_AF" "dbNSFP_ESP6500_AA_AF" "dbNSFP_ESP6500_EA_AF"' + fields = 'CHROM POS ID REF ALT QUAL FILTER AF "ANN[*].ALLELE" "ANN[*].EFFECT" "ANN[*].IMPACT" "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" "ANN[*].AA_LEN" "ANN[*].DISTANCE" "LOF[*].GENE" "LOF[*].GENEID" "LOF[*].NUMTR" "LOF[*].PERC" "NMD[*].GENE" "NMD[*].GENEID" "NMD[*].NUMTR" "NMD[*].PERC" "dbNSFP_SIFT_score" "dbNSFP_SIFT_pred" "dbNSFP_Polyphen2_HDIV_score" "dbNSFP_MutationAssessor_score" "dbNSFP_phyloP100way_vertebrate" "dbNSFP_1000Gp3_AF" "dbNSFP_1000Gp3_AFR_AF" "dbNSFP_1000Gp3_EUR_AF" "dbNSFP_1000Gp3_AMR_AF" "dbNSFP_1000Gp3_EAS_AF" "dbNSFP_ESP6500_AA_AF" "dbNSFP_ESP6500_EA_AF"' + suffix = 'txt' } if (params.gen_org=='mouse'){ - fields = 'CHROM POS REF ALT ID FILTER QUAL FILTER AF SNPEFF_FUNCTIONAL_CLASS SNPEFF_GENE_NAME SNPEFF_AMINO_ACID_CHANGE SNPEFF_EFFECT SNPEFF_TRANSCRIPT_ID' + fields = 'CHROM POS ID REF ALT QUAL FILTER AF "ANN[*].ALLELE" "ANN[*].EFFECT" "ANN[*].IMPACT" "ANN[*].GENE" "ANN[*].GENEID" "ANN[*].FEATURE" "ANN[*].FEATUREID" "ANN[*].BIOTYPE" "ANN[*].RANK" "ANN[*].HGVS_C" "ANN[*].HGVS_P" "ANN[*].CDNA_POS" "ANN[*].CDNA_LEN" "ANN[*].CDS_POS" "ANN[*].CDS_LEN" "ANN[*].AA_POS" "ANN[*].AA_LEN" "ANN[*].DISTANCE"' + + suffix = 'txt' } """ java -Xmx${my_mem}G -jar /opt/snpEff/SnpSift.jar \ extractFields ${vcf} ${fields} \ - > ${sampleID}_snpsift_finalTable.txt + > ${sampleID}_snpsift_finalTable.${suffix} """ -} \ No newline at end of file +} diff --git a/modules/squid/squid_annotate.nf b/modules/squid/squid_annotate.nf new file mode 100644 index 00000000..ce8c0bc8 --- /dev/null +++ b/modules/squid/squid_annotate.nf @@ -0,0 +1,25 @@ +process SQUID_ANNOTATE { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 5.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'docker.io/nfcore/rnafusion:squid_1.5-star2.7.1a' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/fusions': 'squid' }", pattern: "*.{tsv,txt}", mode:'copy' + + input: + tuple val(sampleID), path(txt) + path(gtf) + + output: + tuple val(sampleID), path("*annotated.txt"), emit: squid_fusions_annotated + + script: + """ + AnnotateSQUIDOutput.py ${gtf} ${txt} ${sampleID}_squid_fusions_annotated.txt + """ +} diff --git a/modules/squid/squid_call.nf b/modules/squid/squid_call.nf new file mode 100644 index 00000000..818f18e6 --- /dev/null +++ b/modules/squid/squid_call.nf @@ -0,0 +1,23 @@ +process SQUID { + + tag "$sampleID" + + cpus 1 + memory 10.GB + time 5.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + + container 'docker.io/nfcore/rnafusion:squid_1.5-star2.7.1a' + + input: + tuple val(sampleID), path(bam), path(chimeric_bam) + + output: + tuple val(sampleID), path("*sv.txt"), emit: squid_fusions + + script: + """ + squid -b ${bam} -c ${chimeric_bam} -o ${sampleID}.squid.fusions + """ +} diff --git a/modules/star-fusion/star-fusion.nf b/modules/star-fusion/star-fusion.nf new file mode 100644 index 00000000..fee06e1e --- /dev/null +++ b/modules/star-fusion/star-fusion.nf @@ -0,0 +1,95 @@ +process STAR_FUSION { + + tag "$sampleID" + + cpus 12 + memory 42.GB + time 5.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'trinityctat/starfusion:1.12.0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/fusions': 'star-fusion' }", pattern: "*.{tsv,txt}", mode:'copy' + + input: + tuple val(sampleID), file(reads) + + output: + tuple val(sampleID), file("*_star-fusion_fusions.tsv"), emit: star_fusion_fusions + tuple val(sampleID), file("*_abridged.tsv"), emit: star_fusion_fusions_abridge + tuple val(sampleID), file("*_abridged.coding_effect.tsv"), optional: true, emit: star_fusion_abridge_coding + + script: + def avail_mem = task.memory ? "--limitBAMsortRAM ${task.memory.toBytes() - 100000000}" : '' + option = params.read_type == 'PE' ? "--left_fq ${reads[0]} --right_fq ${reads[1]}" : "--left_fq ${reads[0]}" + def extra_params = params.star_fusion_opt ? 
params.star_fusion_opt : '' + + """ + STAR \\ + --genomeDir ${params.star_index} \\ + --readFilesIn ${reads} \\ + --twopassMode Basic \\ + --outReadsUnmapped None \\ + --chimSegmentMin 12 \\ + --chimJunctionOverhangMin 12 \\ + --alignSJDBoverhangMin 10 \\ + --alignMatesGapMax 100000 \\ + --alignIntronMax 6000 \\ + --chimSegmentReadGapMax 3 \\ + --alignSJstitchMismatchNmax 5 -1 5 5 \\ + --runThreadN ${task.cpus} \\ + --outSAMstrandField intronMotif ${avail_mem} \\ + --outSAMunmapped Within \\ + --outSAMtype BAM Unsorted \\ + --outSAMattrRGline ID:GRPundef \\ + --chimMultimapScoreRange 10 \\ + --chimMultimapNmax 10 \\ + --chimNonchimScoreDropMin 10 \\ + --peOverlapNbasesMin 12 \\ + --peOverlapMMp 0.1 \\ + --sjdbOverhang ${params.read_length - 1} \\ + --chimOutJunctionFormat 1 + + STAR-Fusion \\ + --genome_lib_dir ${params.star_fusion_ref} \\ + -J Chimeric.out.junction \\ + ${option} \\ + --CPU ${task.cpus} \\ + --examine_coding_effect \\ + --output_dir . ${extra_params} + + mv star-fusion.fusion_predictions.tsv ${sampleID}_star-fusion_fusions.tsv + mv star-fusion.fusion_predictions.abridged.tsv ${sampleID}_star-fusion_abridged.tsv + mv star-fusion.fusion_predictions.abridged.coding_effect.tsv ${sampleID}_star-fusion_abridged.coding_effect.tsv + """ +} + +//`--readFilesCommand zcat` this option is included in STAR if files are compressed. + +/* + +To build a new reference set: + + export TMPDIR=/fastscratch/lloydm/tmp + + wget http://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam34.0/Pfam-A.hmm.gz --no-check-certificate + wget https://github.com/FusionAnnotator/CTAT_HumanFusionLib/releases/download/v0.3.0/fusion_lib.Mar2021.dat.gz -O CTAT_HumanFusionLib_Mar2021.dat.gz --no-check-certificate + wget https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/AnnotFilterRule.pm -O AnnotFilterRule.pm --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3f --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3i --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3m --no-check-certificate + wget https://www.dfam.org/releases/Dfam_3.4/infrastructure/dfamscan/homo_sapiens_dfam.hmm.h3p --no-check-certificate + gunzip Pfam-A.hmm.gz && hmmpress Pfam-A.hmm + + singularity exec /projects/omics_share/meta/containers/trinityctat-starfusion-1.12.0.img \ + /usr/local/src/STAR-Fusion/ctat-genome-lib-builder/prep_genome_lib.pl \ + --genome_fa /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/Homo_sapiens.GRCh38.102.all.fa \ + --gtf /projects/compsci/omics_share/human/GRCh38/transcriptome/indices/ensembl/Homo_sapiens.GRCh38.102.chr.gtf \ + --annot_filter_rule AnnotFilterRule.pm \ + --fusion_annot_lib CTAT_HumanFusionLib_Mar2021.dat.gz \ + --pfam_db Pfam-A.hmm \ + --dfam_db homo_sapiens_dfam.hmm \ + --max_readlength 150 \ + --CPU 8 +*/ diff --git a/modules/star/star_align.nf b/modules/star/star_align.nf new file mode 100644 index 00000000..36fd4c80 --- /dev/null +++ b/modules/star/star_align.nf @@ -0,0 +1,39 @@ +process STAR_ALIGN { + + tag "$sampleID" + + cpus 12 + memory 84.GB + time 24.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/star:2.7.8a--h9ee0642_1' + + input: + tuple val(sampleID), path(reads) + val(args) + path(gtf) + + output: + tuple val(sampleID), path('*d.out.bam'), emit: bam + tuple val(sampleID), path('*Log.final.out'), emit: log_final + tuple val(sampleID), path('*Log.out'), emit: log_out + + tuple val(sampleID), path('*sortedByCoord.out.bam'), optional:true, emit: bam_sorted + tuple val(sampleID), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript + tuple val(sampleID), path('*Aligned.unsort.out.bam'), optional:true, emit: bam_unsorted + tuple val(sampleID), path('*.tab'), optional:true, emit: tab + tuple val(sampleID), path('*.out.junction'), optional:true, emit: junction + tuple val(sampleID), path('*.out.sam'), optional:true, emit: sam + + script: + """ + STAR \\ + --genomeDir ${params.star_index} \\ + --readFilesIn ${reads} \\ + --runThreadN ${task.cpus} \\ + --outFileNamePrefix ${sampleID}_ \\ + --sjdbGTFfile ${gtf} \\ + ${args} + """ +} diff --git a/modules/subread/subread_feature_counts.nf b/modules/subread/subread_feature_counts.nf index 1c253886..7f84b371 100644 --- a/modules/subread/subread_feature_counts.nf +++ b/modules/subread/subread_feature_counts.nf @@ -16,7 +16,6 @@ process FEATURE_COUNTS { tuple val(sampleID), file("*_peaks_countMatrix.txt") script: - log.info "----- Feature Counts on ${sampleID} -----" """ featureCounts \ -a ${peak_cvg_saf} \ diff --git a/modules/subread/subread_feature_counts_chipseq.nf b/modules/subread/subread_feature_counts_chipseq.nf new file mode 100644 index 00000000..29c4d722 --- /dev/null +++ b/modules/subread/subread_feature_counts_chipseq.nf @@ -0,0 +1,34 @@ +process SUBREAD_FEATURECOUNTS { + tag "${antibody}" + + cpus 4 + memory 4.GB + time '10:00:00' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'consensusCalling_'+antibody+'/subread' : 'subread' }", pattern: "*.txt*", mode: 'copy' + + container 'quay.io/biocontainers/subread:2.0.1--hed695b0_0' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), path(bams), path(saf) + + output: + tuple val(antibody), file("*featureCounts.txt") , emit: counts + tuple val(antibody), file("*featureCounts.txt.summary"), emit: summary + + script: + prefix = "${antibody}.consensus_peaks" + bam_files = bams.findAll { it.toString().endsWith('.bam') }.sort() + pe_params = params.read_type == 'SE' ? '' : '-p --donotsort' + """ + featureCounts \\ + -F SAF \\ + -O \\ + --fracOverlap 0.2 \\ + -T $task.cpus \\ + $pe_params \\ + -a $saf \\ + -o ${prefix}.featureCounts.txt \\ + ${bam_files.join(' ')} + """ +} diff --git a/modules/svaba/svaba.nf b/modules/svaba/svaba.nf new file mode 100644 index 00000000..693747a4 --- /dev/null +++ b/modules/svaba/svaba.nf @@ -0,0 +1,101 @@ +process SVABA { + tag "$sampleID" + + cpus = 8 + memory { normal_bam.size() < 60.GB ? 15.GB : 48.GB } + time { normal_bam.size() < 60.GB ? '10:00:00' : '24:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/svaba:v0.2.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? "$sampleID" + '/callers' : 'svaba' }", pattern: "*.vcf.gz", mode:'copy' + + input: + tuple val(sampleID), val(meta), path(normal_bam), path(normal_bai), val(normal_name), path(tumor_bam), path(tumor_bai), val(tumor_name) + + output: + tuple val(sampleID), path("*svaba.germline.indel.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_germline_indel_vcf + tuple val(sampleID), path("*svaba.germline.sv.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_germline_sv_vcf + tuple val(sampleID), path("*svaba.somatic.indel.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_somatic_indel_vcf + tuple val(sampleID), path("*svaba.somatic.sv.vcf.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_somatic_sv_vcf + tuple val(sampleID), path("*svaba.bps.txt.gz"), val(meta), val(normal_name), val(tumor_name), val('svaba'), emit: svaba_unfiltered_variants + tuple val(sampleID), path("*svaba.contigs.bam"), emit: svaba_contigs_bam + tuple val(sampleID), path("*svaba.discordant.txt.gz"), emit: svaba_discordants + tuple val(sampleID), path("*svaba.log"), emit: svaba_log + tuple val(sampleID), path("*svaba.alignments.txt.gz"), emit: svaba_alignments + + script: + """ + svaba run \ + -t ${tumor_bam} \ + -n ${normal_bam} \ + -p ${task.cpus} \ + -a ${sampleID}_svaba \ + -G ${params.combined_reference_set} \ + --region ${params.callRegions} \ + -D ${params.dbSNP} \ + -z on + """ +} +// NOTE: VCF Output header has the BAM file names as 'sampleID' e.g.,: +// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test-test_realigned_BQSR.bam test-test2_realigned_BQSR.bam + +// Usage: svaba run -t -G -a myid [OPTIONS] + +// Description: SV and indel detection using rolling SGA assembly and BWA-MEM realignment + +// General options +// -v, --verbose Select verbosity level (0-4). Default: 0 +// -h, --help Display this help and exit +// -p, --threads Use NUM threads to run svaba. Default: 1 +// -a, --id-string String specifying the analysis ID to be used as part of ID common. +// Main input +// -G, --reference-genome Path to indexed reference genome to be used by BWA-MEM. +// -t, --case-bam Case BAM/CRAM/SAM file (eg tumor). Can input multiple. +// -n, --control-bam (optional) Control BAM/CRAM/SAM file (eg normal). Can input multiple. +// -k, --region Run on targeted intervals. Accepts BED file or Samtools-style string +// --germline Sets recommended settings for case-only analysis (eg germline). (-I, -L5, assembles NM >= 3 reads) +// Variant filtering and classification +// --lod LOD cutoff to classify indel as non-REF (tests AF=0 vs AF=MaxLikelihood(AF)) [8] +// --lod-dbsnp LOD cutoff to classify indel as non-REF (tests AF=0 vs AF=MaxLikelihood(AF)) at DBSnp indel site [5] +// --lod-somatic LOD cutoff to classify indel as somatic (tests AF=0 in normal vs AF=ML(0.5)) [2.5] +// --lod-somatic-dbsnp LOD cutoff to classify indel as somatic (tests AF=0 in normal vs AF=ML(0.5)) at DBSnp indel site [4] +// --scale-errors Scale the priors that a site is artifact at given repeat count. 
0 means assume low (const) error rate [1] +// Additional options +// -L, --mate-lookup-min Minimum number of somatic reads required to attempt mate-region lookup [3] +// -s, --disc-sd-cutoff Number of standard deviations of calculated insert-size distribution to consider discordant. [3.92] +// -c, --chunk-size Size of a local assembly window (in bp). Set 0 for whole-BAM in one assembly. [25000] +// -x, --max-reads Max total read count to read in from assembly region. Set 0 to turn off. [50000] +// -C, --max-coverage Max read coverage to send to assembler (per BAM). Subsample reads if exceeded. [500] +// --no-interchrom-lookup Skip mate lookup for inter-chr candidate events. Reduces power for translocations but less I/O. +// --discordant-only Only run the discordant read clustering module, skip assembly. +// --num-assembly-rounds Run assembler multiple times. > 1 will bootstrap the assembly. [2] +// --num-to-sample When learning about inputs, number of reads to sample. [2,000,000] +// --hp Highly parallel. Don't write output until completely done. More memory, but avoids all thread-locks. +// Output options +// -z, --g-zip Gzip and tabix the output VCF files. [off] +// -A, --all-contigs Output all contigs that were assembled, regardless of mapping or length. [off] +// --read-tracking Track supporting reads by qname. Increases file sizes. [off] +// --write-extracted-reads For the case BAM, write reads sent to assembly to a BAM file. [off] +// Optional external database +// -D, --dbsnp-vcf DBsnp database (VCF) to compare indels against +// -B, --blacklist BED-file with blacklisted regions to not extract any reads from. +// -Y, --microbial-genome Path to indexed reference genome of microbial sequences to be used by BWA-MEM to filter reads. +// -V, --germline-sv-database BED file containing sites of known germline SVs. Used as additional filter for somatic SV detection +// -R, --simple-seq-database BED file containing sites of simple DNA that can confuse the contig re-alignment. +// Assembly and EC params +// -m, --min-overlap Minimum read overlap, an SGA parameter. Default: 0.4* readlength +// -e, --error-rate Fractional difference two reads can have to overlap. See SGA. 0 is fast, but requires error correcting. [0] +// -K, --ec-correct-type (f) Fermi-kit BFC correction, (s) Kmer-correction from SGA, (0) no correction (then suggest non-zero -e) [f] +// -E, --ec-subsample Learn from fraction of non-weird reads during error-correction. Lower number = faster compute [0.5] +// --write-asqg Output an ASQG graph file for each assembly window. +// BWA-MEM alignment params +// --bwa-match-score Set the BWA-MEM match score. BWA-MEM -A [2] +// --gap-open-penalty Set the BWA-MEM gap open penalty for contig to genome alignments. BWA-MEM -O [32] +// --gap-extension-penalty Set the BWA-MEM gap extension penalty for contig to genome alignments. BWA-MEM -E [1] +// --mismatch-penalty Set the BWA-MEM mismatch penalty for contig to genome alignments. BWA-MEM -b [18] +// --bandwidth Set the BWA-MEM SW alignment bandwidth for contig to genome alignments. BWA-MEM -w [1000] +// --z-dropoff Set the BWA-MEM SW alignment Z-dropoff for contig to genome alignments. BWA-MEM -d [100] +// --reseed-trigger Set the BWA-MEM reseed trigger for reseeding mems for contig to genome alignments. BWA-MEM -r [1.5] +// --penalty-clip-3 Set the BWA-MEM penalty for 3' clipping. [5] +// --penalty-clip-5 Set the BWA-MEM penalty for 5' clipping. 
[5] diff --git a/modules/tabix/compress_merged_vcf.nf b/modules/tabix/compress_merged_vcf.nf new file mode 100644 index 00000000..36159f9f --- /dev/null +++ b/modules/tabix/compress_merged_vcf.nf @@ -0,0 +1,29 @@ +process COMPRESS_INDEX_MERGED_VCF { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' + + input: + tuple val(sampleID), file(vcf), val(meta) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), emit: compressed_vcf_tbi + + script: + normal_name = meta.normal_id + tumor_name = meta.tumor_id + + """ + bgzip \ + -c \ + ${vcf} \ + > ${vcf}.gz + + tabix ${vcf}.gz + """ +} diff --git a/modules/tabix/compress_vcf.nf b/modules/tabix/compress_vcf.nf new file mode 100644 index 00000000..79a476b6 --- /dev/null +++ b/modules/tabix/compress_vcf.nf @@ -0,0 +1,25 @@ +process COMPRESS_INDEX_VCF { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(normal_name), val(tumor_name), val(tool) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), val(meta), val(normal_name), val(tumor_name), val(tool), emit: compressed_vcf_tbi + + """ + bgzip \ + -c \ + ${vcf} \ + > ${vcf}.gz + + tabix ${vcf}.gz + """ +} diff --git a/modules/tabix/compress_vcf_region.nf b/modules/tabix/compress_vcf_region.nf new file mode 100644 index 00000000..9b6e2366 --- /dev/null +++ b/modules/tabix/compress_vcf_region.nf @@ -0,0 +1,25 @@ +process COMPRESS_INDEX_VCF_REGION { + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' + + input: + tuple val(sampleID), file(vcf), val(meta), val(chrom) + + output: + tuple val(sampleID), file("*.vcf.gz"), file("*.vcf.gz.tbi"), val(meta), val('empty_name'), val('empty_name'), val(chrom), emit: compressed_vcf_tbi + + """ + bgzip \ + -c \ + ${vcf} \ + > ${vcf}.gz + + tabix ${vcf}.gz + """ +} diff --git a/modules/trim_galore/trim_galore.nf b/modules/trim_galore/trim_galore.nf index f8c4b8bb..b9968dfd 100644 --- a/modules/trim_galore/trim_galore.nf +++ b/modules/trim_galore/trim_galore.nf @@ -4,11 +4,25 @@ process TRIM_GALORE { cpus 8 memory 16.GB time '06:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/trim-galore:0.6.7--hdfd78af_0' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/trimmed_fastq' : 'trim_galore' }", pattern: "*.fq.gz", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'fastqc' }", pattern: "*_fastqc.{zip,html}", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'trim_report' }", pattern: "*trimming_report.txt", mode:'copy' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/trimmed_fastq' : 'trim_galore'}" + }, pattern: "*.fq.gz", mode: 'copy', enabled: params.keep_intermediate + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? 'fastqc/' : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/stats' : 'fastqc'}" + }, pattern: "*_fastqc.{zip,html}", mode: 'copy' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? 'fastqc/' : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/trimmed_fastq' : 'trim_galore'}" + }, pattern: "*trimming_report.txt", mode: 'copy' + input: tuple val(sampleID), file(fq_reads) @@ -19,7 +33,6 @@ process TRIM_GALORE { tuple val(sampleID), file("*trimming_report.txt"), emit: trim_stats script: - log.info "----- Trim Galore Running on: ${sampleID} -----" paired_end = params.read_type == 'PE' ? '--paired' : '' rrbs_flag = params.workflow == "rrbs" ? '--rrbs' : '' @@ -39,8 +52,22 @@ process TRIM_GALORE { refer to the RRBS guide for the meaning of CTOT and CTOB strands). */ + if (params.workflow == "chipseq" && params.read_type == 'SE') + """ + [ ! -f ${sampleID}.fastq.gz ] && ln -s ${fq_reads} ${sampleID}.fastq.gz + + trim_galore --cores ${task.cpus} ${paired_end} ${rrbs_flag} ${directionality} --gzip --length ${params.trimLength} -q ${params.qualThreshold} --stringency ${params.adapOverlap} -a ${params.adaptorSeq} --fastqc ${sampleID}.fastq.gz """ + else if (params.workflow == "chipseq" && params.read_type == 'PE') + """ + [ ! -f ${sampleID}_1.fastq.gz ] && ln -s ${fq_reads[0]} ${sampleID}_1.fastq.gz + [ ! -f ${sampleID}_2.fastq.gz ] && ln -s ${fq_reads[1]} ${sampleID}_2.fastq.gz + + trim_galore --cores ${task.cpus} ${paired_end} ${rrbs_flag} ${directionality} --gzip --length ${params.trimLength} -q ${params.qualThreshold} --stringency ${params.adapOverlap} -a ${params.adaptorSeq} --fastqc ${sampleID}_1.fastq.gz ${sampleID}_2.fastq.gz + """ + else + """ trim_galore --basename ${sampleID} --cores ${task.cpus} ${paired_end} ${rrbs_flag} ${directionality} --gzip --length ${params.trimLength} -q ${params.qualThreshold} --stringency ${params.adapOverlap} -a ${params.adaptorSeq} --fastqc ${fq_reads} """ -} +} diff --git a/modules/ucsc/ucsc_bedgraphtobigwig.nf b/modules/ucsc/ucsc_bedgraphtobigwig.nf new file mode 100644 index 00000000..439e69d6 --- /dev/null +++ b/modules/ucsc/ucsc_bedgraphtobigwig.nf @@ -0,0 +1,40 @@ +process UCSC_BEDGRAPHTOBIGWIG { + tag "$sampleID" + + cpus 8 + memory 10.GB + time '04:00:00' + + publishDir { + def type = "${params.workflow}" == 'chipseq' ? ( sampleID =~ /INPUT/ ? 
'control_samples/' : 'immuno_precip_samples/') : '' + "${params.pubdir}/${ params.organize_by=='sample' ? type+sampleID+'/bigwig' : 'ucsc'}" + }, pattern: "*.bigWig", mode: 'copy' + + container 'quay.io/biocontainers/ucsc-bedgraphtobigwig:377--h446ed27_1' + + input: + tuple val(sampleID), file(bedgraph) + file(sizes) + + output: + tuple val(sampleID), file("*.bigWig"), emit: bigwig + + script: + """ + bedGraphToBigWig \\ + $bedgraph \\ + $sizes \\ + ${sampleID}.bigWig + + + """ +} + +/* +IGV steps removed, re-add if IGV is needed: + + OUTPUT: tuple val(sampleID), file("*.igv.txt"), emit: igv_txt + + SCRIPT: find * -type f -name "*.bigWig" -exec echo -e "bigwig/"{}"\\t0,0,178" \\; > ${sampleID}.bigWig.igv.txt + +*/ \ No newline at end of file diff --git a/modules/utility_modules/aggregate_stats_rna.nf b/modules/utility_modules/aggregate_stats_rna.nf index 707705bd..7f8f0675 100644 --- a/modules/utility_modules/aggregate_stats_rna.nf +++ b/modules/utility_modules/aggregate_stats_rna.nf @@ -3,6 +3,7 @@ process RNA_SUMMARY_STATS { cpus = 1 time = '00:15:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/perl:0.1.0' @@ -15,7 +16,6 @@ process RNA_SUMMARY_STATS { tuple val(sampleID), file("*.txt") script: - log.info "----- Summary Metrics running on ${sampleID} -----" if (params.read_type == "PE") diff --git a/modules/utility_modules/aggregate_stats_wes.nf b/modules/utility_modules/aggregate_stats_wes.nf index 7c6304a3..79052719 100644 --- a/modules/utility_modules/aggregate_stats_wes.nf +++ b/modules/utility_modules/aggregate_stats_wes.nf @@ -3,6 +3,7 @@ process AGGREGATE_STATS { cpus = 1 time = '00:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' @@ -15,7 +16,6 @@ process AGGREGATE_STATS { tuple val(sampleID), file("*summary_stats.txt"), emit: txt script: - log.info "----- Generating Summary Stats for: ${sampleID} -----" """ python ${projectDir}/bin/wes/aggregate_stats_wes.py ${sampleID}_summary_stats.txt ${filter_stats} ${picard_met} ${algn_met} diff --git a/modules/utility_modules/aggregate_stats_wgs.nf b/modules/utility_modules/aggregate_stats_wgs.nf index 23c08334..03d4b0c5 100644 --- a/modules/utility_modules/aggregate_stats_wgs.nf +++ b/modules/utility_modules/aggregate_stats_wgs.nf @@ -3,6 +3,7 @@ process AGGREGATE_STATS { cpus = 1 time = '00:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' @@ -15,7 +16,6 @@ process AGGREGATE_STATS { tuple val(sampleID), file("*summary_stats.txt"), emit: txt script: - log.info "----- Generating Summary Stats for: ${sampleID} -----" """ python ${projectDir}/bin/wgs/aggregate_stats_wgs.py ${sampleID}_summary_stats.txt ${filter_stats} ${picard_met} ${algn_met} ${cov_met} diff --git a/modules/utility_modules/aria_download.nf b/modules/utility_modules/aria_download.nf new file mode 100644 index 00000000..7bb02477 --- /dev/null +++ b/modules/utility_modules/aria_download.nf @@ -0,0 +1,23 @@ +process ARIA_DOWNLOAD { + + tag "$sampleID" + + cpus 1 + memory 15.GB + time '10:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/aria2:1.36.0' + + input: + tuple val(sampleID), val(meta), val(read_num), val(link) + + output: + tuple val(sampleID), val(meta), val(read_num), path("*"), emit: file + + script: + + """ + aria2c --connect-timeout=180 --retry-wait=60 --timeout=180 ${link} + """ +} diff --git a/modules/utility_modules/chipseq_bampe_rm_orphan.nf b/modules/utility_modules/chipseq_bampe_rm_orphan.nf new file mode 100644 index 00000000..6b4d3c84 --- /dev/null +++ b/modules/utility_modules/chipseq_bampe_rm_orphan.nf @@ -0,0 +1,17 @@ +process BAMPE_RM_ORPHAN { + tag "$sampleID" + + container 'quay.io/biocontainers/mulled-v2-57736af1eb98c01010848572c9fec9fff6ffaafd:402e865b8f6af2f3e58c6fc8d57127ff0144b2c7-0' + + input: + tuple val(sampleID), file(bam) + + output: + tuple val(sampleID), file("*.bam"), emit: bam + + script: // This script was bundled withing the nf-core/chipseq/bin/ directory + prefix = "${sampleID}.mLb.clN" + """ + python ${projectDir}/bin/chipseq/bampe_rm_orphan.py ${bam[0]} ${prefix}.bam --only_fr_pairs + """ +} diff --git a/modules/utility_modules/chipseq_check_design.nf b/modules/utility_modules/chipseq_check_design.nf new file mode 100644 index 00000000..f98d40e6 --- /dev/null +++ b/modules/utility_modules/chipseq_check_design.nf @@ -0,0 +1,17 @@ +process CHECK_DESIGN { + tag "$design" + publishDir "${params.pubdir}/parsed_samplesheets", mode: 'copy' + + input: + path(design) + + output: + path('design_reads.csv'), emit: sample_reads + path('design_controls.csv'), emit: study_design + + script: + """ + python ${projectDir}/bin/chipseq/check_design.py $design design_reads.csv design_controls.csv + """ +} + diff --git a/modules/utility_modules/chipseq_make_genome_filter.nf b/modules/utility_modules/chipseq_make_genome_filter.nf new file mode 100644 index 00000000..87f0a063 --- /dev/null +++ b/modules/utility_modules/chipseq_make_genome_filter.nf @@ -0,0 +1,20 @@ +process MAKE_GENOME_FILTER { + tag "$fai" + publishDir "${params.pubdir}/genome_info", mode: 'copy' + + input: + file(fai) + file(blacklist) + + output: + path('*.bed'), emit: bed + path('*.sizes'), emit: sizes + + script: + fasta="\$(echo ${fai} | sed 's/.fai//g')" + blacklist_filter = params.blacklist ? 
"sortBed -i $blacklist -g ${fasta}.sizes | complementBed -i stdin -g ${fasta}.sizes" : "awk '{print \$1, '0' , \$2}' OFS='\t' ${fasta}.sizes" + """ + cut -f 1,2 ${fai} > ${fasta}.sizes + $blacklist_filter > ${fasta}.include_regions.bed + """ +} diff --git a/modules/utility_modules/concatenate_reads_PE.nf b/modules/utility_modules/concatenate_reads_PE.nf index d7e705fe..cc5786a0 100644 --- a/modules/utility_modules/concatenate_reads_PE.nf +++ b/modules/utility_modules/concatenate_reads_PE.nf @@ -5,20 +5,22 @@ process CONCATENATE_READS_PE { cpus 1 memory 15.GB time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*fastq.gz", mode:'copy' + container 'ubuntu:20.04' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*", mode:'copy' input: tuple val(sampleID), file(R1), file(R2) output: - tuple val(sampleID), file("*fastq.gz"), emit: concat_fastq + tuple val(sampleID), file("*"), emit: concat_fastq script: - log.info "----- Concatenate Reads Running on: ${sampleID} -----" """ - cat $R1 > ${sampleID}_R1.fastq.gz - cat $R2 > ${sampleID}_R2.fastq.gz + cat $R1 > ${sampleID}_R1${params.extension} + cat $R2 > ${sampleID}_R2${params.extension} """ } diff --git a/modules/utility_modules/concatenate_reads_SE.nf b/modules/utility_modules/concatenate_reads_SE.nf index f110e575..c9eb5b12 100644 --- a/modules/utility_modules/concatenate_reads_SE.nf +++ b/modules/utility_modules/concatenate_reads_SE.nf @@ -5,19 +5,21 @@ process CONCATENATE_READS_SE { cpus 1 memory 15.GB time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*fastq.gz", mode:'copy' + container 'ubuntu:20.04' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*", mode:'copy' input: tuple val(sampleID), file(R1) output: - tuple val(sampleID), file("*fastq.gz"), emit: concat_fastq + tuple val(sampleID), file("*"), emit: concat_fastq script: - log.info "----- Concatenate Reads Running on: ${sampleID} -----" """ - cat $R1 > ${sampleID}_R1.fastq.gz + cat $R1 > ${sampleID}_R1${params.extension} """ } diff --git a/modules/utility_modules/concatenate_reads_sampleSheet.nf b/modules/utility_modules/concatenate_reads_sampleSheet.nf new file mode 100644 index 00000000..d22695e9 --- /dev/null +++ b/modules/utility_modules/concatenate_reads_sampleSheet.nf @@ -0,0 +1,26 @@ +process CONCATENATE_READS_SAMPLESHEET { + + tag "$sampleID" + + cpus 1 + memory 15.GB + time '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/concatenated_reads' : 'concatenated_reads' }", pattern: "*fastq.gz", mode:'copy' + + input: + tuple val(sampleID), val(num_lanes), val(meta), val(read_num), path(reads) + + output: + tuple val(sampleID), val(num_lanes), val(meta), val(read_num), path("*fastq.gz"), emit: concat_fastq + + when: + num_lanes > 1 + + script: + + """ + cat $reads > ${sampleID}_${read_num}.fastq.gz + """ +} diff --git a/modules/utility_modules/deseq2_qc.nf b/modules/utility_modules/deseq2_qc.nf new file mode 100644 index 00000000..e9b75b2f --- /dev/null +++ b/modules/utility_modules/deseq2_qc.nf @@ -0,0 +1,52 @@ +process DESEQ2_QC { + tag "${antibody}" + + cpus 1 + memory 15.GB + time '10:00:00' + + container 'quay.io/biocontainers/mulled-v2-8849acf39a43cdd6c839a369a74c0adc823e2f91:ab110436faf952a33575c64dd74615a84011450b-0' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'consensusCalling_'+antibody+'/deseq2' : 'deseq2' }", mode: 'copy' + + input: + tuple val(antibody), path(counts) + file(deseq2_pca_header) + file(deseq2_clustering_header) + + output: + path "*.pdf" , optional:true, emit: pdf + path "*.RData" , optional:true, emit: rdata + path "*.rds" , optional:true, emit: rds + path "*pca.vals.txt" , optional:true, emit: pca_txt + path "*pca.vals_mqc.tsv" , optional:true, emit: pca_multiqc + path "*sample.dists.txt" , optional:true, emit: dists_txt + path "*sample.dists_mqc.tsv", optional:true, emit: dists_multiqc + path "*.log" , optional:true, emit: log + path "size_factors" , optional:true, emit: size_factors + + + script: + prefix = "${antibody}.consensus_peaks" + bam_ext = params.read_type == 'SE' ? '.mLb.clN.sorted.bam' : '.mLb.clN.bam' + vst = params.deseq2_vst ? '--vst TRUE' : '' + peak_type = params.narrow_peak ? 'narrowPeak' : 'broadPeak' + """ + ${projectDir}/bin/chipseq/deseq2_qc.r \\ + --count_file $counts \\ + --sample_suffix '$bam_ext' \\ + --outdir ./ \\ + --outprefix $prefix \\ + --cores $task.cpus \\ + --id_col 1 --count_col 7 --vst TRUE + + sed 's/deseq2_pca/deseq2_pca_${task.index}/g' <$deseq2_pca_header >tmp.txt + sed -i -e 's/DESeq2 /${antibody} DESeq2 /g' tmp.txt + cat tmp.txt ${prefix}.pca.vals.txt > ${prefix}.pca.vals_mqc.tsv + + sed 's/deseq2_clustering/deseq2_clustering_${task.index}/g' <$deseq2_clustering_header >tmp.txt + sed -i -e 's/DESeq2 /${antibody} DESeq2 /g' tmp.txt + cat tmp.txt ${prefix}.sample.dists.txt > ${prefix}.sample.dists_mqc.tsv + + """ +} diff --git a/modules/utility_modules/frip_score.nf b/modules/utility_modules/frip_score.nf new file mode 100644 index 00000000..375e1274 --- /dev/null +++ b/modules/utility_modules/frip_score.nf @@ -0,0 +1,40 @@ +process FRIP_SCORE { + tag "${ip} vs ${control}" + + cpus 1 + memory 10.GB + time '10:00:00' + + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
'immuno_precip_samples/'+ip+'_vs_'+control+'/macs2' : 'macs2' }", pattern: "*.tsv", mode: 'copy' + + container 'quay.io/biocontainers/mulled-v2-8186960447c5cb2faa697666dc1e6d919ad23f3e:3127fcae6b6bdaf8181e21a26ae61231030a9fcb-0' + + input: + tuple val(antibody), val(replicatesExist), val(multipleGroups), val(ip), path(ipbam), val(control), path(controlbam), path(ipflagstat), path(peak) + path(peak_count_header) + path(frip_score_header) + + output: + tuple val(ip), path("*.tsv"), emit : tsv + + script: + def PEAK_TYPE = params.narrow_peak ? 'narrowPeak' : 'broadPeak' + """ + cat $peak | wc -l | awk -v OFS='\t' '{ print "${ip}", \$1 }' | cat $peak_count_header - > ${ip}_peaks.count_mqc.tsv + READS_IN_PEAKS=\$(intersectBed -a ${ipbam[0]} -b $peak -bed -c -f 0.20 | awk -F '\t' '{sum += \$NF} END {print sum}')i + grep 'mapped (' $ipflagstat | awk -v a="\$READS_IN_PEAKS" -v OFS='\t' '{print "${ip}", a/\$1}' | cat $frip_score_header - > ${ip}_peaks.FRiP_mqc.tsv + + + """ +} + +/* +IGV steps removed, re-add if IGV is needed: + + PUBDIR: publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 'comparison/'+ip+'_vs_'+control+'/macs2' : 'macs2' }", pattern: "*.txt", mode: 'copy' + + OUTPUT: tuple val(ip), path("*.txt"), emit : txt + + SCRIPT: find * -type l -name "*.${PEAK_TYPE}" -exec echo -e "macs2/"{}"\\t0,0,178" \\; > ${ip}_peaks.igv.txt +*/ \ No newline at end of file diff --git a/modules/utility_modules/get_read_length.nf b/modules/utility_modules/get_read_length.nf new file mode 100644 index 00000000..e32c4ddd --- /dev/null +++ b/modules/utility_modules/get_read_length.nf @@ -0,0 +1,19 @@ +process GET_READ_LENGTH { + tag "$sampleID" + + cpus = 1 + time = '00:05:00' + + container 'ubuntu:20.04' + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), env(READ_LENGTH), emit: read_length + + script: + """ + READ_LENGTH=`zcat ${reads[0]} | head -n 400 | awk 'NR%4==2{m=length(\$0)}{print m}' | sort -n | tail -1` + """ +} \ No newline at end of file diff --git a/modules/utility_modules/gunzip.nf b/modules/utility_modules/gunzip.nf new file mode 100644 index 00000000..4e6bb93a --- /dev/null +++ b/modules/utility_modules/gunzip.nf @@ -0,0 +1,34 @@ +process GUNZIP { + + tag "$sampleID" + + cpus 1 + memory 5.GB + time 2.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + + container "quay.io/jaxcompsci/py3_perl_pylibs:v2" + + input: + tuple val(sampleID), path(reads) + + output: + tuple val(sampleID), path("*.{fastq,fq}"), emit: gunzip_fastq + shell: + + ''' + if [[ !{reads[0]} =~ ".gz" ]]; + then + gunzip -c !{reads[0]} > !{reads[0].baseName} + else + mv !{reads[0]} input_!{reads[0]} + fi + if [[ !{reads[1]} =~ ".gz" ]]; + then + gunzip -c !{reads[1]} > !{reads[1].baseName} + else + mv !{reads[1]} input_!{reads[1]} + fi + ''' +} diff --git a/modules/utility_modules/jax_trimmer.nf b/modules/utility_modules/jax_trimmer.nf new file mode 100644 index 00000000..ba6edcfd --- /dev/null +++ b/modules/utility_modules/jax_trimmer.nf @@ -0,0 +1,35 @@ +process JAX_TRIMMER { + + tag "$sampleID" + + cpus 1 + memory 30.GB + time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'quality_stats' }", pattern: "*_stat", mode:'copy' + + input: + tuple val(sampleID), path(fq_reads) + + output: + tuple val(sampleID), file("*_stat"), emit: quality_stats + tuple val(sampleID), file("*filtered_trimmed"), emit: trimmed_fastq + + script: + + if (params.read_type == "SE"){ + mode_HQ="-S -M" + inputfq="${fq_reads[0]}" + } + if (params.read_type == "PE"){ + mode_HQ="-M" + inputfq="${fq_reads[0]} ${fq_reads[1]}" + } + + """ + python ${projectDir}/bin/shared/filter_trim.py $mode_HQ ${params.min_pct_hq_reads} -p ${params.hq_pct} $inputfq + """ +} diff --git a/modules/utility_modules/make_vcf_list.nf b/modules/utility_modules/make_vcf_list.nf index a92cff18..0f08dc51 100644 --- a/modules/utility_modules/make_vcf_list.nf +++ b/modules/utility_modules/make_vcf_list.nf @@ -1,5 +1,6 @@ process MAKE_VCF_LIST { tag "$sampleID" + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} input: tuple val(sampleID), val(chroms) @@ -9,7 +10,6 @@ process MAKE_VCF_LIST { tuple val(sampleID), file("*.list"), emit: list script: - log.info "----- Make VCF List from Chromosomes: ${sampleID} -------" // Puts Individual Chromosome Files In Order and Then Into List for MergeVCFs // convert paths to strings diff --git a/modules/utility_modules/parse_extracted_sv_table.nf b/modules/utility_modules/parse_extracted_sv_table.nf new file mode 100644 index 00000000..b8c0e24c --- /dev/null +++ b/modules/utility_modules/parse_extracted_sv_table.nf @@ -0,0 +1,28 @@ +process SNPSIFT_EXTRACT_AND_PARSE { + + // NOTE: This script is for the parsing of the 'SV' pipeline germline annotationed table from snpeff extractfields. + // It is hard coded to the annotations used. + + tag "$sampleID" + + cpus = 1 + memory = 6.GB + time = '03:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/py3_perl_pylibs:v2' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? 
sampleID : 'snpeff' }", pattern:"*.txt", mode:'copy' + + input: + tuple val(sampleID), file(table) + + output: + tuple val(sampleID), file("*.txt"), emit: txt + + script: + + """ + python ${projectDir}/bin/pta/split_annotations.py ${table} ${sampleID}_annotated_filtered_final_table.txt + """ +} diff --git a/modules/utility_modules/quality_stats.nf b/modules/utility_modules/quality_stats.nf deleted file mode 100644 index 8ac6848e..00000000 --- a/modules/utility_modules/quality_stats.nf +++ /dev/null @@ -1,35 +0,0 @@ -process QUALITY_STATISTICS { - - tag "$sampleID" - - cpus 1 - memory 30.GB - time '24:00:00' - - container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' - - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'quality_stats' }", pattern: "*fastq.gz_stat", mode:'copy' - - input: - tuple val(sampleID), file(fq_reads) - - output: - tuple val(sampleID), file("*.fastq.gz_stat"), emit: quality_stats - tuple val(sampleID), file("*filtered_trimmed"), emit: trimmed_fastq - - script: - log.info "----- Quality Stats Running on: ${sampleID} -----" - - if (params.read_type == "SE"){ - mode_HQ="-S -M" - inputfq="${fq_reads[0]}" - } - if (params.read_type == "PE"){ - mode_HQ="-M" - inputfq="${fq_reads[0]} ${fq_reads[1]}" - } - - """ - python ${projectDir}/bin/shared/filter_trim.py $mode_HQ ${params.min_pct_hq_reads} $inputfq - """ -} diff --git a/modules/utility_modules/read_groups.nf b/modules/utility_modules/read_groups.nf index 9d41d75b..a27d951d 100644 --- a/modules/utility_modules/read_groups.nf +++ b/modules/utility_modules/read_groups.nf @@ -4,10 +4,11 @@ process READ_GROUPS { cpus 1 memory 5.GB time '01:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'read_groups' }", pattern: "*read_group.txt", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'read_groups' }", pattern: "*read_group.txt", mode:'copy', enabled: params.workflow == 'rnaseq' || params.keep_intermediate input: tuple val(sampleID), file(fq_reads) @@ -17,7 +18,6 @@ process READ_GROUPS { tuple val(sampleID), file("*.txt"), emit: read_groups script: - log.info "----- Read Group Information Determination Running on: ${sampleID} -----" if (picard=="picard"){ p='-p' } @@ -25,6 +25,6 @@ process READ_GROUPS { p='' } """ - python ${projectDir}/bin/shared/read_group_from_fastq.py $p -o ${sampleID}_read_group.txt ${fq_reads[0]} + python ${projectDir}/bin/shared/read_group_from_fastq.py $p -s ${sampleID} -o ${sampleID}_read_group.txt ${fq_reads[0]} """ } diff --git a/modules/utility_modules/rna_covcalc_gatk.nf b/modules/utility_modules/rna_covcalc_gatk.nf index 7abbf82e..33b4078f 100644 --- a/modules/utility_modules/rna_covcalc_gatk.nf +++ b/modules/utility_modules/rna_covcalc_gatk.nf @@ -4,10 +4,10 @@ process COVCALC_GATK { cpus 1 memory 15.GB time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/python-bz2file:np_2.7.18' - // store in /stats publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'gatk' }", pattern: "*.bed", mode:'copy' input: @@ -18,7 +18,6 @@ process COVCALC_GATK { tuple val(sampleID), file("*.bed"), emit: bed script: - log.info "----- GATK COVCALC Running on: ${sampleID} -----" """ python ${projectDir}/bin/rnaseq/coveragecalculator.py ${txt} ${sampleID}_${filename}_avg_median_coverage.bed diff --git a/modules/utility_modules/rna_format_gatk.nf b/modules/utility_modules/rna_format_gatk.nf index 0b1e3cc2..529bec33 100644 --- a/modules/utility_modules/rna_format_gatk.nf +++ b/modules/utility_modules/rna_format_gatk.nf @@ -4,10 +4,10 @@ process FORMAT_GATK { cpus 1 memory 15.GB time '24:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/biocontainers/bedtools:2.23.0--h5b5514e_6' - file(params.ref_fai) - + input: tuple val(sampleID), file(txt) val(L) @@ -16,10 +16,9 @@ process FORMAT_GATK { tuple val(sampleID), file("*_gatk_formatter.txt"), emit: txt script: - log.info "----- GATK Formatter Running on: ${sampleID} -----" """ chmod +x ${projectDir}/bin/rnaseq/gatk_formatter.sh ${projectDir}/bin/rnaseq/gatk_formatter.sh ${txt} ${sampleID}_gatk_temp2.txt ${sampleID}_gatk_formatter.txt ${L} """ // This is a script to format gatk coverage file for subsequent use in log aggregation -} \ No newline at end of file +} diff --git a/modules/vcftools/vcf_annotate.nf b/modules/vcftools/vcf_annotate.nf index a403bf05..135aa3a4 100644 --- a/modules/vcftools/vcf_annotate.nf +++ b/modules/vcftools/vcf_annotate.nf @@ -4,6 +4,7 @@ process VCF_ANNOTATE { cpus = 1 memory = 10.GB time = '23:00:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} input: tuple val(sampleID), file(snp_vcf) @@ -12,14 +13,11 @@ process VCF_ANNOTATE { output: tuple val(sampleID), file("*.vcf"), emit: vcf - // vcftools container needed container 'quay.io/biocontainers/perl-vcftools-vcf:0.1.16--pl5321hdfd78af_4' script: - log.info "----- CAT VCF-ANNOTATE Running on: ${sampleID} -----" if (params.gen_org=='mouse'){ - // make sure it does not break delta="CHROM,POS,ID,REF,ALT" } else if (params.gen_org=='human'){ diff --git a/modules/xenome/xenome.nf b/modules/xenome/xenome.nf new file mode 100644 index 00000000..7e2b1211 --- /dev/null +++ b/modules/xenome/xenome.nf @@ -0,0 +1,29 @@ +process XENOME_CLASSIFY { + tag "$sampleID" + + cpus 8 + memory 50.GB + time 8.h + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. 
Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/xenome:1.0.1' + + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID + '/stats': 'xenome' }", pattern: "*.txt", mode:'copy' + + input: + tuple val(sampleID), path(trimmed) + + output: + tuple val(sampleID), path("human*.fastq"), emit: xenome_fastq + tuple val(sampleID), path("mouse*.fastq"), emit: xenome_mouse_fastq + tuple val(sampleID), path("*.txt"), emit: xenome_stats + + script: + + read_input = params.read_type == 'PE' ? "-i ${trimmed[0]} -i ${trimmed[1]}" : "-i ${trimmed[0]}" + pairs = params.read_type == 'PE' ? "--pairs" : "" + + """ + /xenome-1.0.1-r/xenome classify -T 8 -P ${params.xenome_prefix} ${pairs} --host-name mouse --graft-name human ${read_input} > ${sampleID}_xenome_stats.txt + """ +} diff --git a/nextflow.config b/nextflow.config index b1018417..f6b06c5a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,7 +2,7 @@ Nextflow DSL2 Main Config - Author(s): Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson, Sai Lek + Authors: Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson, Sai Lek, Harshpreet Chandok, Peter Fields Copyright of Jackson Laboratories 2022 _____________________________________________________*/ @@ -32,17 +32,25 @@ params { } // specific config for the pipeline -includeConfig params.config + + +try { + includeConfig params.config +} catch (Exception e) { + System.err.println("ERROR: Could not load ${params.config} check that you are using a valid pipeline name") +} + + // work directory is important as it will be large, plan accordingly -workDir = "/fastscratch/nextflow/${params.workflow}" +workDir = "/fastscratch/${USER}/${params.workflow}" manifest { name = "The Jackson Laboratory Computational Sciences Nextflow based analysis pipelines" homePage = "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" mainScript = "main.nf" nextflowVersion = "!>=20.10.0" - version = "0.2.0" + version = "0.3.0" } profiles { diff --git a/run.sh b/run.sh index ff81e06f..f05fcff4 100644 --- a/run.sh +++ b/run.sh @@ -19,5 +19,5 @@ nextflow main.nf \ --workflow rnaseq \ --gen_org mouse \ --sample_folder 'test/rna/mouse' \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ No newline at end of file +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" diff --git a/run_scripts/README.md b/run_scripts/README.md index 3da98f9d..08a34c6d 100644 --- a/run_scripts/README.md +++ b/run_scripts/README.md @@ -44,4 +44,6 @@ There are several things a user must change before running these scripts: **NOTE:** -These scripts assume they are being run from within `cs-nf-pipelines/run_scripts`. If they are moved to other locations, specify the absolute path to `main.nf` (e.g., `/home/USERNAME/cs-nf-pipelines/main.nf`) \ No newline at end of file +1. These scripts assume they are being run from within `cs-nf-pipelines/run_scripts`. If they are moved to other locations, specify the absolute path to `main.nf` (e.g., `/home/USERNAME/cs-nf-pipelines/main.nf`) + +2. 
Sample data for each workflow and species are provided in `cs-nf-pipelines/test/` \ No newline at end of file diff --git a/run_scripts/atac_human.sh b/run_scripts/atac_human.sh index fbe06b4b..12e7edc4 100644 --- a/run_scripts/atac_human.sh +++ b/run_scripts/atac_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run atac sequencing on human samples using default hg38" diff --git a/run_scripts/atac_mouse.sh b/run_scripts/atac_mouse.sh index 23e85d38..4df5fa56 100644 --- a/run_scripts/atac_mouse.sh +++ b/run_scripts/atac_mouse.sh @@ -23,6 +23,6 @@ nextflow ../main.nf \ --effective_genome_size 2652783500 \ --bowtie2Index '/projects/omics_share/mouse/GRCm38/genome/indices/ensembl/v102/bowtie2/Mus_musculus.GRCm38.dna.primary_assembly.fa' \ --chain '' \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run atac sequencing on mouse samples using default mm10" \ No newline at end of file diff --git a/run_scripts/pta_human.sh b/run_scripts/pta_human.sh new file mode 100644 index 00000000..533aefa6 --- /dev/null +++ b/run_scripts/pta_human.sh @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH --mail-user=first.last@jax.org +#SBATCH --job-name=pta_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow pta \ +-profile sumner \ +--csv_input ../test/csv_samplesheets/pta_test.csv \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run paired tumor analysis on test data" diff --git a/run_scripts/rnafusion_human.sh b/run_scripts/rnafusion_human.sh new file mode 100644 index 00000000..bce1d0fc --- /dev/null +++ b/run_scripts/rnafusion_human.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH --mail-user=first.last@jax.org +#SBATCH --job-name=rna_fusion_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow rna_fusion \ +-profile sumner \ +--sample_folder \ +--gen_org human \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run rna_fusion on human samples using default hg38" diff --git a/run_scripts/rnaseq_human.sh b/run_scripts/rnaseq_human.sh index 577b308c..a0f97345 100644 --- a/run_scripts/rnaseq_human.sh +++ b/run_scripts/rnaseq_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run rnaseq on human samples using default hg38" diff --git a/run_scripts/rnaseq_mouse.sh b/run_scripts/rnaseq_mouse.sh index e72da816..a21562d3 100644 --- a/run_scripts/rnaseq_mouse.sh +++ 
b/run_scripts/rnaseq_mouse.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run rnaseq on mouse samples using default mm10" diff --git a/run_scripts/rnaseq_pdx.sh b/run_scripts/rnaseq_pdx.sh new file mode 100644 index 00000000..b186eeae --- /dev/null +++ b/run_scripts/rnaseq_pdx.sh @@ -0,0 +1,26 @@ +#!/bin/bash +#SBATCH --mail-user=first.last@jax.org +#SBATCH --job-name=rnaseq_pdx_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow rnaseq \ +--pdx \ +-profile sumner \ +--sample_folder \ +--gen_org human \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run rnaseq on pdx samples using default hg38 and mm10" diff --git a/run_scripts/rrbs_human.sh b/run_scripts/rrbs_human.sh index b1434a1f..968f5705 100644 --- a/run_scripts/rrbs_human.sh +++ b/run_scripts/rrbs_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run the reduced-representation bisulfite sequencing analysis pipeline on human samples using default hg38" \ No newline at end of file diff --git a/run_scripts/rrbs_mouse.sh b/run_scripts/rrbs_mouse.sh index 6fb963fb..a6bd872b 100644 --- a/run_scripts/rrbs_mouse.sh +++ b/run_scripts/rrbs_mouse.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run the reduced-representation bisulfite sequencing analysis pipeline on mouse samples using default mm10" \ No newline at end of file diff --git a/run_scripts/wes_human.sh b/run_scripts/wes_human.sh index 9c68260c..01e57be8 100644 --- a/run_scripts/wes_human.sh +++ b/run_scripts/wes_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole exome sequencing on human samples using default hg38" diff --git a/run_scripts/wes_mouse.sh b/run_scripts/wes_mouse.sh index 7a4bd973..1e30bba7 100644 --- a/run_scripts/wes_mouse.sh +++ b/run_scripts/wes_mouse.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole exome sequencing on mouse samples using default mm10" diff --git a/run_scripts/wes_pdx.sh b/run_scripts/wes_pdx.sh new file mode 100644 index 00000000..18e3c1d0 --- /dev/null +++ b/run_scripts/wes_pdx.sh @@ -0,0 +1,25 @@ +#!/bin/bash +#SBATCH 
--mail-user=first.last@jax.org +#SBATCH --job-name=wes_pdx_human +#SBATCH --mail-type=END,FAIL +#SBATCH -p compute +#SBATCH -q batch +#SBATCH -t 72:00:00 +#SBATCH --mem=1G +#SBATCH --ntasks=1 + +cd $SLURM_SUBMIT_DIR + +# LOAD NEXTFLOW +module use --append /projects/omics_share/meta/modules +module load nextflow + +# RUN PIPELINE +nextflow ../main.nf \ +--workflow pdx_wes \ +-profile sumner \ +--sample_folder \ +--gen_org human \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ +--comment "This script will run whole exome sequencing on pdx samples using default hg38" diff --git a/run_scripts/wgs_human.sh b/run_scripts/wgs_human.sh index d598cd79..7741b70a 100644 --- a/run_scripts/wgs_human.sh +++ b/run_scripts/wgs_human.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org human \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole genome sequencing on human samples using default hg38" diff --git a/run_scripts/wgs_mouse.sh b/run_scripts/wgs_mouse.sh index 25f433c6..5b4f95b2 100644 --- a/run_scripts/wgs_mouse.sh +++ b/run_scripts/wgs_mouse.sh @@ -20,6 +20,6 @@ nextflow ../main.nf \ -profile sumner \ --sample_folder \ --gen_org mouse \ ---pubdir '/fastscratch/outputDir' \ --w '/fastscratch/outputDir/work' \ +--pubdir "/fastscratch/${USER}/outputDir" \ +-w "/fastscratch/${USER}/outputDir/work" \ --comment "This script will run whole genome sequencing on mouse samples using default mm10" diff --git a/subworkflows/aria_download_parse.nf b/subworkflows/aria_download_parse.nf new file mode 100644 index 00000000..d756c6af --- /dev/null +++ b/subworkflows/aria_download_parse.nf @@ -0,0 +1,105 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {ARIA_DOWNLOAD} from "${projectDir}/modules/utility_modules/aria_download" +include {CONCATENATE_READS_SAMPLESHEET} from "${projectDir}/modules/utility_modules/concatenate_reads_sampleSheet" + +workflow FILE_DOWNLOAD { + + take: + ch_input_sample + + main: + + /* + General note: + + Input tuple expected from the CSV sheet: + it[0] is sample ID. + it[1] is metadata information. + it[2] and it[3] are R1 and R2 if PE. it[3] is empty if SE. + + All steps expect that sampleID is in position [0] of tuples. + + */ + + + if (params.read_type == 'PE') { + aria_download_input = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1], 'R1', it[2]) + R2: tuple(it[0], it[1], 'R2', it[3]) + } + .mix() + group_size = 2 + } else { + aria_download_input = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1], 'R1', it[2]) + } + .mix() + group_size = 1 + } + /* + Remap the data to individual R1 / R2 tuples. + These individual tuples are then mixed to pass individual files to the downloader. + R1 vs. R2 is maintained in the mix. Order is irrelevant here as data are grouped + by sampleID downstream. + */ + + // Download files. 
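    /*
    A minimal standalone sketch of the remap above (sample and file names are hypothetical),
    assuming a PE sample-sheet row shaped [sampleID, meta, fastq_1, fastq_2] where meta carries
    sampleID, lane, and size. It shows the per-read-file tuple shape handed to the downloader:

        Channel.of( ['s1', [sampleID: 's1', lane: 'L1', size: 1], 's1_R1.fastq.gz', 's1_R2.fastq.gz'] )
            .multiMap { it ->
                R1: tuple(it[0], it[1], 'R1', it[2])
                R2: tuple(it[0], it[1], 'R2', it[3])
            }
            .mix()
            .view()
        // [s1, [sampleID:s1, lane:L1, size:1], R1, s1_R1.fastq.gz]
        // [s1, [sampleID:s1, lane:L1, size:1], R2, s1_R2.fastq.gz]
    */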
+ ARIA_DOWNLOAD(aria_download_input) + + concat_input = ARIA_DOWNLOAD.out.file + .map { it -> + def meta = [:] + meta.sampleID = it[1].sampleID + [it[0], it[1].lane, meta, it[2], it[3], it[1].size] // sampleID, laneID, meta, read_ID:[R1|R2], file, number_of_lanes + } + .map { sampleID, laneID, meta, readID, file, size -> tuple( groupKey([sampleID, meta, readID], size), laneID, file ) } + .groupTuple() // controlled by group key: [sampleID, meta, read_ID] + .map{ it -> tuple(it[0][0], it[1].size(), it[0][1], it[0][2], it[2])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + .branch{ + concat: it[1] > 1 + pass: it[1] == 1 + } + /* + Remap the downloaded files to exclude lane from meta, and group on sampleID, meta, and read_ID: R1|R2. + The number of lanes in the grouped data is used to determine if concatenation is needed. + The branch statement makes a 'concat' set for concatenation and a 'pass' set that isn't concatenated. + The branch is using it[1].size() from the preceding step, i.e., the list size of lanes for the sample. + + Metadata inclusion here is for future expansion. As implemented above, metadata is redundant to sampleID in `it[0]`. + However, if additional metadata are added to sample sheets, those metadata can be added and tracked above. + + groupTuple size is dynamically defined by the metadata field 'size', i.e., the number of lanes per sample. + + See: https://www.nextflow.io/docs/latest/operator.html#grouptuple and the note about dynamic group size. + + */ + + no_concat_samples = concat_input.pass + .map{it -> tuple(it[0], it[1], it[2], it[3], it[4][0])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + /* + this delists the file in `it[4]`, as it is a single fastq sample (i.e., non-concat samples). + */ + + // Concatenate samples as needed. + CONCATENATE_READS_SAMPLESHEET(concat_input.concat) + + read_meta_ch = CONCATENATE_READS_SAMPLESHEET.out.concat_fastq + .mix(no_concat_samples) + .groupTuple(by: [0,2], size: group_size) // sampleID, meta + .map{it -> tuple(it[0], it[2], it[4].toSorted( { a, b -> a.getName() <=> b.getName() } ) ) } + + /* + Mix concatenated files with non-concatenated files. 'mix' allows for all, some, or no files to have + gone through concatenation. + + Reads are remapped to read_ch and meta is placed in meta_ch. Input tuples for existing modules + do not expect 'meta' in the tuple. 
Example expected input tuple: [sampleID, [reads]] + */ + emit: + read_meta_ch +} \ No newline at end of file diff --git a/subworkflows/concatenate_local_files.nf b/subworkflows/concatenate_local_files.nf new file mode 100644 index 00000000..f567b8f0 --- /dev/null +++ b/subworkflows/concatenate_local_files.nf @@ -0,0 +1,78 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {CONCATENATE_READS_SAMPLESHEET} from "${projectDir}/modules/utility_modules/concatenate_reads_sampleSheet" + +workflow CONCATENATE_LOCAL_FILES { + + take: + ch_input_sample + + main: + + if (params.read_type == 'PE') { + temp_map = ch_input_sample + .multiMap { it -> + def meta = [:] + meta.sampleID = it[1].sampleID + R1: tuple(it[0], it[1].lane, meta, 'R1', it[2]) + R2: tuple(it[0], it[1].lane, meta, 'R2', it[3]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + group_size = 2 + } else { + + temp_map = ch_input_sample + .multiMap { it -> + def meta = [:] + meta.sampleID = it[1].sampleID + R1: tuple(it[0], it[1].lane, meta, 'R1', it[2]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + group_size = 1 + } + + no_concat_samples = concat_input.pass + .map{it -> tuple(it[0], it[1], it[2], it[3], it[4][0])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + /* + this delists the file in `it[4]`, as it is a single fastq sample (i.e., non-concat samples). + + */ + + CONCATENATE_READS_SAMPLESHEET(concat_input.concat) + + read_meta_ch = CONCATENATE_READS_SAMPLESHEET.out.concat_fastq + .mix(no_concat_samples) + .groupTuple(by: [0,2], size: group_size) // sampleID, meta + .map{it -> tuple(it[0], it[2], it[4].toSorted( { a, b -> file(a).getName() <=> file(b).getName() } ) ) } + + /* + Mix concatenated files with non-concatenated files. 'mix' allows for all, some, or no files to have + gone through concatenation. + + Reads are remapped to read_ch and meta is placed in meta_ch. Input tuples for existing modules + do not expect 'meta' in the tuple. 
Example expected input tuple: [sampleID, [reads]] + */ + + emit: + read_meta_ch + +} \ No newline at end of file diff --git a/subworkflows/concatenate_pta_fastq.nf b/subworkflows/concatenate_pta_fastq.nf new file mode 100644 index 00000000..14899d71 --- /dev/null +++ b/subworkflows/concatenate_pta_fastq.nf @@ -0,0 +1,72 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {CONCATENATE_READS_SAMPLESHEET} from "${projectDir}/modules/utility_modules/concatenate_reads_sampleSheet" + +workflow CONCATENATE_PTA_FASTQ { + + take: + ch_input_sample + + main: + + if (params.read_type == 'PE') { + temp_map = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1].lane, it[1], 'R1', it[2][0]) + R2: tuple(it[0], it[1].lane, it[1], 'R2', it[2][1]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + } else { + + temp_map = ch_input_sample + .multiMap { it -> + R1: tuple(it[0], it[1].lane, it[1], 'R1', it[2][0]) + } + .mix() + .groupTuple(by: [0,2,3]) + .map{ it -> tuple(it[0], it[1].size(), it[2], it[3], it[4]) } // sampleID, num_lanes, meta, read_ID:[R1], file + + concat_input = temp_map + .branch { + concat: it[1] > 1 + pass: it[1] == 1 + } + } + + no_concat_samples = concat_input.pass + .map{it -> tuple(it[0], it[1], it[2], it[3], it[4][0])} // sampleID, num_lanes, meta, read_ID:[R1|R2], file + + /* + this delists the file in `it[4]`, as it is a single fastq sample (i.e., non-concat samples). + + */ + + CONCATENATE_READS_SAMPLESHEET(concat_input.concat) + + read_meta_ch = CONCATENATE_READS_SAMPLESHEET.out.concat_fastq + .mix(no_concat_samples) + .groupTuple(by: [0,2]) // sampleID, meta + .map{it -> tuple(it[0], it[2], it[4].toSorted( { a, b -> file(a).getName() <=> file(b).getName() } ) ) } + + /* + Mix concatenated files with non-concatenated files. 'mix' allows for all, some, or no files to have + gone through concatenation. + + Reads are remapped to read_ch and meta is placed in meta_ch. Input tuples for existing modules + do not expect 'meta' in the tuple. 
Example expected input tuple: [sampleID, [reads]] + */ + + emit: + read_meta_ch + +} \ No newline at end of file diff --git a/subworkflows/pdx_rnaseq.nf b/subworkflows/pdx_rnaseq.nf new file mode 100644 index 00000000..6f96df4c --- /dev/null +++ b/subworkflows/pdx_rnaseq.nf @@ -0,0 +1,120 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {READ_GROUPS as READ_GROUPS_HUMAN; + READ_GROUPS as READ_GROUPS_MOUSE} from "${projectDir}/modules/utility_modules/read_groups" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {GET_READ_LENGTH} from "${projectDir}/modules/utility_modules/get_read_length" +include {CHECK_STRANDEDNESS} from "${projectDir}/modules/python/python_check_strandedness" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT as FASTQ_SORT_HUMAN; + FASTQ_SORT as FASTQ_SORT_MOUSE} from "${projectDir}/modules/fastq-tools/fastq-sort" +include {RSEM_ALIGNMENT_EXPRESSION as RSEM_ALIGNMENT_EXPRESSION_HUMAN; + RSEM_ALIGNMENT_EXPRESSION as RSEM_ALIGNMENT_EXPRESSION_MOUSE} from "${projectDir}/modules/rsem/rsem_alignment_expression" +include {PICARD_ADDORREPLACEREADGROUPS as PICARD_ADDORREPLACEREADGROUPS_HUMAN; + PICARD_ADDORREPLACEREADGROUPS as PICARD_ADDORREPLACEREADGROUPS_MOUSE} from "${projectDir}/modules/picard/picard_addorreplacereadgroups" +include {PICARD_REORDERSAM as PICARD_REORDERSAM_HUMAN; + PICARD_REORDERSAM as PICARD_REORDERSAM_MOUSE} from "${projectDir}/modules/picard/picard_reordersam" +include {PICARD_SORTSAM as PICARD_SORTSAM_HUMAN; + PICARD_SORTSAM as PICARD_SORTSAM_MOUSE} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_COLLECTRNASEQMETRICS as PICARD_COLLECTRNASEQMETRICS_HUMAN; + PICARD_COLLECTRNASEQMETRICS as PICARD_COLLECTRNASEQMETRICS_MOUSE} from "${projectDir}/modules/picard/picard_collectrnaseqmetrics" + +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +workflow PDX_RNASEQ { + + take: + read_ch + + main: + // Step 1: Qual_Stat, Get read group information, Run Xenome + JAX_TRIMMER(read_ch) + + GET_READ_LENGTH(read_ch) + + if (params.read_type == 'PE') { + xenome_input = JAX_TRIMMER.out.trimmed_fastq + } else { + xenome_input = JAX_TRIMMER.out.trimmed_fastq + } + + // QC is assess on all reads. Mouse/human is irrelevant here. 
+ FASTQC(JAX_TRIMMER.out.trimmed_fastq) + + CHECK_STRANDEDNESS(JAX_TRIMMER.out.trimmed_fastq) + + // Xenome Classification + XENOME_CLASSIFY(xenome_input) + + // Xenome Read Sort + FASTQ_SORT_HUMAN(XENOME_CLASSIFY.out.xenome_fastq, 'human') + FASTQ_SORT_MOUSE(XENOME_CLASSIFY.out.xenome_mouse_fastq, 'mouse') + + human_reads = FASTQ_SORT_HUMAN.out.sorted_fastq + .join(CHECK_STRANDEDNESS.out.strand_setting) + .join(GET_READ_LENGTH.out.read_length) + .map{it -> tuple(it[0]+'_human', it[1], it[2], it[3])} + + mouse_reads = FASTQ_SORT_MOUSE.out.sorted_fastq + .join(CHECK_STRANDEDNESS.out.strand_setting) + .join(GET_READ_LENGTH.out.read_length) + .map{it -> tuple(it[0]+'_mouse', it[1], it[2], it[3])} + + // Step 2: RSEM Human and Stats: + + RSEM_ALIGNMENT_EXPRESSION_HUMAN(human_reads, params.rsem_ref_files_human, params.rsem_star_prefix_human, params.rsem_ref_prefix_human) + + // Picard Alignment Metrics + READ_GROUPS_HUMAN(human_reads.map{it -> tuple(it[0], it[1])}, "picard") + + add_replace_groups_human = READ_GROUPS_HUMAN.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.bam) + PICARD_ADDORREPLACEREADGROUPS_HUMAN(add_replace_groups_human) + + PICARD_REORDERSAM_HUMAN(PICARD_ADDORREPLACEREADGROUPS_HUMAN.out.bam, params.picard_dict_human) + + // Picard Alignment Metrics + PICARD_SORTSAM_HUMAN(PICARD_REORDERSAM_HUMAN.out.bam) + + human_qc_input = PICARD_SORTSAM_HUMAN.out.bam.join(human_reads) + .map{it -> [it[0], it[1], it[3]]} + + PICARD_COLLECTRNASEQMETRICS_HUMAN(human_qc_input, params.ref_flat_human, params.ribo_intervals_human) + + // Step 3 RSEM Mouse and Stats: + + RSEM_ALIGNMENT_EXPRESSION_MOUSE(mouse_reads, params.rsem_ref_files_mouse, params.rsem_star_prefix_mouse, params.rsem_ref_prefix_mouse) + + // Step 4: Picard Alignment Metrics + READ_GROUPS_MOUSE(mouse_reads.map{it -> tuple(it[0], it[1])}, "picard") + + add_replace_groups_mouse = READ_GROUPS_MOUSE.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION_MOUSE.out.bam) + PICARD_ADDORREPLACEREADGROUPS_MOUSE(add_replace_groups_mouse) + + PICARD_REORDERSAM_MOUSE(PICARD_ADDORREPLACEREADGROUPS_MOUSE.out.bam, params.picard_dict_mouse) + + // Step 5: Picard Alignment Metrics + PICARD_SORTSAM_MOUSE(PICARD_REORDERSAM_MOUSE.out.bam) + + mouse_qc_input = PICARD_SORTSAM_MOUSE.out.bam.join(mouse_reads) + .map{it -> [it[0], it[1], it[3]]} + + PICARD_COLLECTRNASEQMETRICS_MOUSE(mouse_qc_input, params.ref_flat_mouse, params.ribo_intervals_mouse) + + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(XENOME_CLASSIFY.out.xenome_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION_HUMAN.out.rsem_cnt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTRNASEQMETRICS_HUMAN.out.picard_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION_MOUSE.out.rsem_cnt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTRNASEQMETRICS_MOUSE.out.picard_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + +} \ No newline at end of file diff --git a/test/README.md b/test/README.md index e68c2ea1..b9ea606e 100644 --- a/test/README.md +++ b/test/README.md @@ -1,5 +1,11 @@ # Test Data -This directory contains 10,000 simulated RNA, whole exome, and whole genome paired 
end reads based on GRCm38 (mm10) and GRCh38 (hg38). +This directory contains 10,000 simulated RNA, whole exome, whole genome, and ATAC-seq paired end reads based on GRCm38 (mm10) and GRCh38 (hg38). -These files can be used in testing pipeline functionality. \ No newline at end of file +These files can be used in testing pipeline functionality. + +Sample chip-seq data from NF-core are provided. These data can be staged by the workflow from the provided URLs. + +A sample CSV datasheet are provided for PTA. + +A sample WES datasheet for both remote and local files is provided for testing download and/or local samplesheet input to workflows. \ No newline at end of file diff --git a/test/csv_samplesheets/pdx_wes_test.csv b/test/csv_samplesheets/pdx_wes_test.csv new file mode 100644 index 00000000..5ee3b514 --- /dev/null +++ b/test/csv_samplesheets/pdx_wes_test.csv @@ -0,0 +1,9 @@ +sampleID,lane,fastq_1,fastq_2 +112475_105-R_G2UN84PK7,baz_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_HJJLGCCXX_S1_L001_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_HJJLGCCXX_S1_L001_R2_001.fastq.gz +112475_105-R_G2U,foo_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L001_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L001_R2_001.fastq.gz +112475_105-R_G2U,foo_L2,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L002_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L002_R2_001.fastq.gz +112475_105-R_G2U,foo_L3,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L003_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678T/CPCT12345678T_AHHKYHDSXX_S12_L003_R2_001.fastq.gz +2475_105-R_G2U,bar_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L001_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L001_R2_001.fastq.gz +2475_105-R_G2U,bar_L2,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L002_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L002_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L1,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L003_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L003_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L2,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L004_R1_001.fastq.gz,https://github.com/hartwigmedical/testdata/raw/master/cancerPanel/CPCT12345678R/CPCT12345678R_AHHKYHDSXX_S13_L004_R2_001.fastq.gz diff --git a/test/csv_samplesheets/pdx_wes_test_local.csv b/test/csv_samplesheets/pdx_wes_test_local.csv new file mode 100644 index 00000000..850cc3b1 --- /dev/null +++ b/test/csv_samplesheets/pdx_wes_test_local.csv @@ -0,0 +1,9 @@ +sampleID,lane,fastq_1,fastq_2 
+112475_105-R_G2U,foo_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L001_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L001_R2_001.fastq.gz +112475_105-R_G2U,foo_L2,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L002_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L002_R2_001.fastq.gz +112475_105-R_G2U,foo_L3,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L003_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_AHHKYHDSXX_S12_L003_R2_001.fastq.gz +2475_105-R_G2U,bar_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L001_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L001_R2_001.fastq.gz +2475_105-R_G2U,bar_L2,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L002_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L002_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L003_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L003_R2_001.fastq.gz +2475_105-R_G2UN84PK7,bar_L2,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L004_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678R_AHHKYHDSXX_S13_L004_R2_001.fastq.gz +112475_105-R_G2UN84PK7,baz_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_HJJLGCCXX_S1_L001_R1_001.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PDX/test_reads/CPCT12345678T_HJJLGCCXX_S1_L001_R2_001.fastq.gz \ No newline at end of file diff --git a/test/csv_samplesheets/pta_test.csv b/test/csv_samplesheets/pta_test.csv new file mode 100644 index 00000000..528f02f5 --- /dev/null +++ b/test/csv_samplesheets/pta_test.csv @@ -0,0 +1,4 @@ +patient,sex,status,sampleID,lane,fastq_1,fastq_2 +fizzbang,XX,0,n_fizz,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R2.fastq.gz +fizzbang,XX,1,t_bang,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R2.fastq.gz 
+foobar,XX,1,t_bar,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R2.fastq.gz diff --git a/test/fusion/human/hg38_FUSION_sample_R1.fastq.gz b/test/fusion/human/hg38_FUSION_sample_R1.fastq.gz new file mode 100644 index 00000000..c4950c24 Binary files /dev/null and b/test/fusion/human/hg38_FUSION_sample_R1.fastq.gz differ diff --git a/test/fusion/human/hg38_FUSION_sample_R2.fastq.gz b/test/fusion/human/hg38_FUSION_sample_R2.fastq.gz new file mode 100644 index 00000000..5efb0fff Binary files /dev/null and b/test/fusion/human/hg38_FUSION_sample_R2.fastq.gz differ diff --git a/test/pta/README.md b/test/pta/README.md new file mode 100644 index 00000000..1faa890c --- /dev/null +++ b/test/pta/README.md @@ -0,0 +1,3 @@ +# NOTE: + +Due to size restrictions within the repository, this CSV file points to files on Sumner. When running this test dataset, use the option `--bicseq2_no_scaling` \ No newline at end of file diff --git a/test/pta/test_input.csv b/test/pta/test_input.csv new file mode 100644 index 00000000..528f02f5 --- /dev/null +++ b/test/pta/test_input.csv @@ -0,0 +1,4 @@ +patient,sex,status,sampleID,lane,fastq_1,fastq_2 +fizzbang,XX,0,n_fizz,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/normal_withSV_hg38_WGS_sample_R2.fastq.gz +fizzbang,XX,1,t_bang,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor_withSV_hg38_WGS_sample_R2.fastq.gz +foobar,XX,1,t_bar,test_L1,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R1.fastq.gz,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/PTA/test_reads/tumor2_withSV_hg38_WGS_sample_R2.fastq.gz diff --git a/test/rna/human/pdx/pdx_RNA_sample_R1.fastq.gz b/test/rna/human/pdx/pdx_RNA_sample_R1.fastq.gz new file mode 100644 index 00000000..2746ec82 Binary files /dev/null and b/test/rna/human/pdx/pdx_RNA_sample_R1.fastq.gz differ diff --git a/test/rna/human/pdx/pdx_RNA_sample_R2.fastq.gz b/test/rna/human/pdx/pdx_RNA_sample_R2.fastq.gz new file mode 100644 index 00000000..1d320fb1 Binary files /dev/null and b/test/rna/human/pdx/pdx_RNA_sample_R2.fastq.gz differ diff --git a/test/wes/human/hg38_WES_sample_R1.fastq.gz b/test/wes/human/hg38_WES_sample_R1.fastq.gz index 4c5aafbe..5ec973ba 100644 Binary files a/test/wes/human/hg38_WES_sample_R1.fastq.gz and b/test/wes/human/hg38_WES_sample_R1.fastq.gz differ diff --git a/test/wes/human/hg38_WES_sample_R2.fastq.gz b/test/wes/human/hg38_WES_sample_R2.fastq.gz index f4a8efb3..36046400 100644 Binary files a/test/wes/human/hg38_WES_sample_R2.fastq.gz and b/test/wes/human/hg38_WES_sample_R2.fastq.gz differ diff --git a/test/wes/human/pdx/pdx_WES_sample_R1.fastq.gz b/test/wes/human/pdx/pdx_WES_sample_R1.fastq.gz new file mode 100644 index 00000000..6ba04ec2 Binary files /dev/null and b/test/wes/human/pdx/pdx_WES_sample_R1.fastq.gz differ diff --git a/test/wes/human/pdx/pdx_WES_sample_R2.fastq.gz 
b/test/wes/human/pdx/pdx_WES_sample_R2.fastq.gz new file mode 100644 index 00000000..38506851 Binary files /dev/null and b/test/wes/human/pdx/pdx_WES_sample_R2.fastq.gz differ diff --git a/test/wes/mouse/mm10_WES_sample_R1.fastq.gz b/test/wes/mouse/mm10_WES_sample_R1.fastq.gz index b1f86d2a..f53e3a4f 100644 Binary files a/test/wes/mouse/mm10_WES_sample_R1.fastq.gz and b/test/wes/mouse/mm10_WES_sample_R1.fastq.gz differ diff --git a/test/wes/mouse/mm10_WES_sample_R2.fastq.gz b/test/wes/mouse/mm10_WES_sample_R2.fastq.gz index 058cbc1a..10eb4f15 100644 Binary files a/test/wes/mouse/mm10_WES_sample_R2.fastq.gz and b/test/wes/mouse/mm10_WES_sample_R2.fastq.gz differ diff --git a/test/wgs/human/hg38_WGS_sample_R1.fastq.gz b/test/wgs/human/hg38_WGS_sample_R1.fastq.gz index 315aadf3..dc7e001e 100644 Binary files a/test/wgs/human/hg38_WGS_sample_R1.fastq.gz and b/test/wgs/human/hg38_WGS_sample_R1.fastq.gz differ diff --git a/test/wgs/human/hg38_WGS_sample_R2.fastq.gz b/test/wgs/human/hg38_WGS_sample_R2.fastq.gz index 6f3c0a91..88cd8939 100644 Binary files a/test/wgs/human/hg38_WGS_sample_R2.fastq.gz and b/test/wgs/human/hg38_WGS_sample_R2.fastq.gz differ diff --git a/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz b/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz index 7e74bfd4..2a85e51e 100644 Binary files a/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz and b/test/wgs/mouse/mm10_WGS_sample_R1.fastq.gz differ diff --git a/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz b/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz index 37e17a1c..266dd096 100644 Binary files a/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz and b/test/wgs/mouse/mm10_WGS_sample_R2.fastq.gz differ diff --git a/workflows/amplicon.nf b/workflows/amplicon.nf new file mode 100644 index 00000000..2118abd0 --- /dev/null +++ b/workflows/amplicon.nf @@ -0,0 +1,163 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/amplicon.nf" +include {param_log} from "${projectDir}/bin/log/amplicon.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" +include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" +include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {TRIM_FASTQ as CUTADAPT} from "${projectDir}/modules/cutadapt/cutadapt_trim_fastq" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {SAMTOOLS_SORT as SAMTOOLS_SORT_PRIMERCLIP; + SAMTOOLS_SORT as SAMTOOLS_SORT_CALLING} from "${projectDir}/modules/samtools/samtools_sort" +include {PRIMERCLIP} from "${projectDir}/modules/primerclip/primerclip" +include {TARGET_COVERAGE_METRICS} from "${projectDir}/modules/bedtools/bedtools_amplicon_metrics" +include {SNPSIFT_ANNOTATE} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {PICARD_COLLECTTARGETPCRMETRICS} from "${projectDir}/modules/picard/picard_collecttargetpcrmetrics" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include {GATK_HAPLOTYPECALLER} from 
"${projectDir}/modules/gatk/gatk_haplotypecaller" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log params +param_log() + +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +// prepare reads channel +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + + if (params.read_type == 'PE'){ + read_ch = Channel + .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) + .map { file, file1, file2 -> tuple(getLibraryId(file), file1, file2) } + .groupTuple() + } + else if (params.read_type == 'SE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}", checkExists:true, size:1 ) + .map { file, file1 -> tuple(getLibraryId(file), file1) } + .groupTuple() + .map{t-> [t[0], t[1].flatten()]} + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} else { + + if (params.read_type == 'PE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) + } + else if (params.read_type == 'SE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} +} + +workflow AMPLICON { + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. 
+ if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + + CUTADAPT(read_ch) + + FASTQC(CUTADAPT.out.paired_trimmed_fastq) + + // Step 2: Get Read Group Information + READ_GROUPS(CUTADAPT.out.paired_trimmed_fastq, "gatk") + + // Step 3: BWA-MEM Alignment + bwa_mem_mapping = CUTADAPT.out.paired_trimmed_fastq.join(READ_GROUPS.out.read_groups) + BWA_MEM(bwa_mem_mapping) + + SAMTOOLS_SORT_PRIMERCLIP(BWA_MEM.out.sam, '-O sam -n', 'sam') + + PRIMERCLIP(SAMTOOLS_SORT_PRIMERCLIP.out.sorted_file) + + SAMTOOLS_SORT_CALLING(PRIMERCLIP.out.sam, '-O bam', 'bam') + + PICARD_COLLECTTARGETPCRMETRICS(SAMTOOLS_SORT_CALLING.out.sorted_file) + + TARGET_COVERAGE_METRICS(SAMTOOLS_SORT_CALLING.out.sorted_file) + + /* + Important: While the use of the Picard tool, MarkDuplicates, is a common quality control step to identify + low-complexity libraries, MarkDuplicates cannot be used on data derived from PCR-based target enrichment + methods such as the xGen Amplicon Panels. Since these targeted panels contain high numbers of identical + library fragments (particularly regarding alignment start position), MarkDuplicates cannot appropriately + analyze Amplicon libraries. + https://sfvideo.blob.core.windows.net/sitefinity/docs/default-source/application-note/primerclip-a-tool-for-trimming-primer-sequences-application-note.pdf?sfvrsn=cf83e107_14 + */ + + GATK_BASERECALIBRATOR(SAMTOOLS_SORT_CALLING.out.sorted_file) + + GATK_APPLYBQSR(SAMTOOLS_SORT_CALLING.out.sorted_file.join(GATK_BASERECALIBRATOR.out.table)) + + GATK_HAPLOTYPECALLER(GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai), '') + + SNPSIFT_ANNOTATE(GATK_HAPLOTYPECALLER.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + + // MultiQC + // coverage metrics? 
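    /*
    A minimal standalone sketch of the MultiQC aggregation pattern used below (file names are
    hypothetical), assuming each module emits tuples shaped [sampleID, reportFile]:
    `collect{ it[1] }` keeps only the report files, and `ifEmpty([])` keeps the final mix alive
    when a given tool produced no reports:

        Channel.of( ['s1', 's1_cutadapt.log'], ['s2', 's2_cutadapt.log'] )
            .collect{ it[1] }
            .ifEmpty([])
            .view()   // [s1_cutadapt.log, s2_cutadapt.log]
    */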
+ ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(CUTADAPT.out.cutadapt_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTTARGETPCRMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PRIMERCLIP.out.log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TARGET_COVERAGE_METRICS.out.qc_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) +} \ No newline at end of file diff --git a/workflows/atac.nf b/workflows/atac.nf index 5ed9e5ad..d696bb5c 100755 --- a/workflows/atac.nf +++ b/workflows/atac.nf @@ -5,15 +5,18 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/atac.nf" include {param_log} from "${projectDir}/bin/log/atac.nf" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" include {TRIM_FASTQ} from "${projectDir}/modules/cutadapt/cutadapt_trim_fastq" include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {ALIGN_TRIMMED_FASTQ} from "${projectDir}/modules/bowtie2/bowtie2_align_trimmed_fastq" -include {SORT as SORT_ALIGN_TRIM; - SORT as SORT_SHIFTED_BAM; - SORT as SORT_MARK_DUP_BAM; - SORT as SORT_LIFTOVER_BAM } from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_SORT as SORT_ALIGN_TRIM; + SAMTOOLS_SORT as SORT_SHIFTED_BAM; + SAMTOOLS_SORT as SORT_MARK_DUP_BAM; + SAMTOOLS_SORT as SORT_LIFTOVER_BAM } from "${projectDir}/modules/samtools/samtools_sort" include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" include {REMOVE_DUPLICATE_READS} from "${projectDir}/modules/samtools/samtools_remove_duplicate_reads" include {CALC_MTDNA_FILTER_CHRM} from "${projectDir}/modules/samtools/samtools_calc_mtdna_filter_chrm" @@ -33,9 +36,10 @@ include {PEAK_COVERAGE} from "${projectDir}/modules/macs2/macs2_peak_coverage" include {FEATURE_COUNTS} from "${projectDir}/modules/subread/subread_feature_counts" include {FEATURE_COUNT2BED} from "${projectDir}/modules/bedtools/bedtools_feature_count2bed" include {QUALITY_CHECKS} from "${projectDir}/modules/samtools/samtools_quality_checks" -include {FRAG_LEN_PLOT} from "${projectDir}/modules/rstudio/rstudio_frag_len_plot" +include {FRAG_LEN_PLOT} from "${projectDir}/modules/r/frag_len_plot" include {CALC_PBC_METRICS} from "${projectDir}/modules/bedtools/bedtools_calc_pbc_metrics" include {LOG_PARSER} from "${projectDir}/modules/python/python_log_parser" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" // help if needed if (params.help){ @@ -46,8 +50,25 @@ if (params.help){ // log params param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." 
+} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -60,33 +81,52 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - - // main workflow workflow ATAC { - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} } + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. 
+ if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + // Step 1: Trim_Fastq TRIM_FASTQ(read_ch) @@ -97,7 +137,7 @@ workflow ATAC { ALIGN_TRIMMED_FASTQ(TRIM_FASTQ.out.paired_trimmed_fastq) // Step 4: Sort alignment file - SORT_ALIGN_TRIM(ALIGN_TRIMMED_FASTQ.out.sam, '') + SORT_ALIGN_TRIM(ALIGN_TRIMMED_FASTQ.out.sam, '-O bam', 'bam') // Step 5: Flag pcr duplicates PICARD_MARKDUPLICATES(SORT_ALIGN_TRIM.out) @@ -118,34 +158,35 @@ workflow ATAC { FILTER_REMOVE_MULTI_SIEVE(FILTER_REMOVE_MULTI_SHIFT.out[0]) // Step 10: Re-sort shifted bam - SORT_SHIFTED_BAM(FILTER_REMOVE_MULTI_SIEVE.out[0], '') + SORT_SHIFTED_BAM(FILTER_REMOVE_MULTI_SIEVE.out[0], '-O bam', 'bam') // If Mouse if (params.gen_org=='mouse'){ // Step 11: Convert peak coordinates // Step occurs when chain != null || chain != false - CHAIN_CONVERT(SORT_SHIFTED_BAM.out[0]) + CHAIN_CONVERT(SORT_SHIFTED_BAM.out.sorted_file) // Step 12: Sort bam by coordinates - SORT_LIFTOVER_BAM(CHAIN_CONVERT.out[0], '') + SORT_LIFTOVER_BAM(CHAIN_CONVERT.out.converted_bam, '-O bam', 'bam') // Step 13: Extract a list of 'bad reads' - CHAIN_EXTRACT_BADREADS(SORT_LIFTOVER_BAM.out[0]) + CHAIN_EXTRACT_BADREADS(SORT_LIFTOVER_BAM.out.sorted_file) // Step 14: Remove 'bad reads' from bam file CHAIN_BAD2UNIQ_READS(CHAIN_EXTRACT_BADREADS.out.bad_reads) // Step 15: Filter list to unique names - filter_chain_reads = SORT_LIFTOVER_BAM.out[0].join(CHAIN_BAD2UNIQ_READS.out.uniq_reads) + filter_chain_reads = SORT_LIFTOVER_BAM.out.sorted_file.join(CHAIN_BAD2UNIQ_READS.out.uniq_reads) CHAIN_FILTER_READS(filter_chain_reads) // Step 16: Sort fixmate bam and filter mitochondrial reads - CHAIN_SORT_FIXMATE_BAM(CHAIN_FILTER_READS.out[0]) + CHAIN_SORT_FIXMATE_BAM(CHAIN_FILTER_READS.out.bam) // Step 17: Reference strain samples, filter mitochondrial, unplaced/unlocalized reads and reindex // Step occurs when chain == null || chain == false - NON_CHAIN_REINDEX(SORT_SHIFTED_BAM.out[0]) + + NON_CHAIN_REINDEX(SORT_SHIFTED_BAM.out.sorted_file) // Step 18 : Mix chain and non-chain @@ -155,10 +196,10 @@ workflow ATAC { // Step 17 will only run when `--chain` is not used (controlled via modules). // A bam file is required in the next step. `mix` ensures that one OR the other output is used. // When '--gen_org == human' data_ch is set to the tuple output in step 10. 
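    /*
    A minimal standalone sketch of the `mix` behaviour described above (channel and file names
    are hypothetical), assuming two mutually exclusive upstream channels where only one emits,
    depending on whether `--chain` was supplied; the mixed channel carries whichever output
    exists, so downstream steps always receive exactly one BAM per sample:

        chain_ch     = Channel.of( ['sample1', 'sample1.shifted.liftover.bam'] )
        non_chain_ch = Channel.empty()
        chain_ch.mix(non_chain_ch).view()   // [sample1, sample1.shifted.liftover.bam]
    */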
- + } else if (params.gen_org=='human'){ - data_ch = SORT_SHIFTED_BAM.out[0] + data_ch = SORT_SHIFTED_BAM.out.sorted_file } // Step 19: Peak calling @@ -197,14 +238,27 @@ workflow ATAC { FRAG_LEN_PLOT(QUALITY_CHECKS.out) // Step 28: Sort markduplicates bam by read names - SORT_MARK_DUP_BAM(PICARD_MARKDUPLICATES.out.dedup_bam, '-n ') + SORT_MARK_DUP_BAM(PICARD_MARKDUPLICATES.out.dedup_bam, '-n -O bam', 'bam') // Step 29: Calculating PBC Metrics - CALC_PBC_METRICS(SORT_MARK_DUP_BAM.out[0]) + CALC_PBC_METRICS(SORT_MARK_DUP_BAM.out.sorted_file) // Step 30: Log Parser log_agg = TRIM_FASTQ.out.cutadapt_log.join(ALIGN_TRIMMED_FASTQ.out.bowtie_log).join(PICARD_MARKDUPLICATES.out.dedup_metrics).join(CALC_MTDNA_FILTER_CHRM.out.mtdna_log).join(CALC_PBC_METRICS.out).join(FINAL_CALC_FRIP.out) - LOG_PARSER(log_agg) + LOG_PARSER(log_agg) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TRIM_FASTQ.out.cutadapt_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ALIGN_TRIMMED_FASTQ.out.bowtie_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CALC_MTDNA_FILTER_CHRM.out.mtdna_log.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CALC_PBC_METRICS.out.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FINAL_CALC_FRIP.out.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FRAG_LEN_PLOT.out.spline_table.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) } - diff --git a/workflows/chipseq.nf b/workflows/chipseq.nf new file mode 100755 index 00000000..08abf5ac --- /dev/null +++ b/workflows/chipseq.nf @@ -0,0 +1,370 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/chipseq.nf" +include {param_log} from "${projectDir}/bin/log/chipseq.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {CHECK_DESIGN} from "${projectDir}/modules/utility_modules/chipseq_check_design" +include {SAMTOOLS_FAIDX} from "${projectDir}/modules/samtools/samtools_faidx" +include {MAKE_GENOME_FILTER} from "${projectDir}/modules/utility_modules/chipseq_make_genome_filter" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {TRIM_GALORE} from "${projectDir}/modules/trim_galore/trim_galore" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {SAMTOOLS_FILTER} from "${projectDir}/modules/samtools/samtools_filter" +include {SAMTOOLS_SORT; + SAMTOOLS_SORT as PAIR_SORT; + SAMTOOLS_SORT as NAME_SORT} from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_INDEX} from "${projectDir}/modules/samtools/samtools_index" +include {SAMTOOLS_STATS; + SAMTOOLS_STATS as SAMTOOLS_STATS_MD; + SAMTOOLS_STATS as SAMTOOLS_STATS_FILTERED; + SAMTOOLS_STATS as SAMTOOLS_STATS_BF} from "${projectDir}/modules/samtools/samtools_stats" +include {PICARD_MERGESAMFILES} from "${projectDir}/modules/picard/picard_mergesamfiles" +include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {SAMTOOLS_MERGEBAM_FILTER} from "${projectDir}/modules/samtools/samtools_mergebam_filter" +include {BAMTOOLS_FILTER} from "${projectDir}/modules/bamtools/bamtools_filter" 
+include {BAMPE_RM_ORPHAN} from "${projectDir}/modules/utility_modules/chipseq_bampe_rm_orphan" +include {PRESEQ} from "${projectDir}/modules/preseq/preseq" +include {PICARD_COLLECTMULTIPLEMETRICS} from "${projectDir}/modules/picard/picard_collectmultiplemetrics" +include {BEDTOOLS_GENOMECOV} from "${projectDir}/modules/bedtools/bedtools_genomecov" +include {UCSC_BEDGRAPHTOBIGWIG} from "${projectDir}/modules/ucsc/ucsc_bedgraphtobigwig" +include {DEEPTOOLS_COMPUTEMATRIX} from "${projectDir}/modules/deeptools/deeptools_computematrix" +include {DEEPTOOLS_PLOTPROFILE} from "${projectDir}/modules/deeptools/deeptools_plotprofile" +include {DEEPTOOLS_PLOTHEATMAP} from "${projectDir}/modules/deeptools/deeptools_plotheatmap" +include {PHANTOMPEAKQUALTOOLS} from "${projectDir}/modules/phantompeakqualtools/phantompeakqualtools" +include {MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS} from "${projectDir}/modules/multiqc/multiqc_custom_phantompeakqualtools" +include {DEEPTOOLS_PLOTFINGERPRINT} from "${projectDir}/modules/deeptools/deeptools_plotfingerprint" +include {PEAK_CALLING_CHIPSEQ} from "${projectDir}/modules/macs2/macs2_peak_calling_chipseq" +include {FRIP_SCORE} from "${projectDir}/modules/utility_modules/frip_score" +include {HOMER_ANNOTATEPEAKS; + HOMER_ANNOTATEPEAKS as CONSENSUS_PEAKS_ANNOTATE} from "${projectDir}/modules/homer/homer_annotatepeaks" +include {PLOT_MACS2_QC} from "${projectDir}/modules/macs2/plot_macs2_qc" +include {PLOT_HOMER_ANNOTATEPEAKS} from "${projectDir}/modules/homer/plot_homer_annotatepeaks" +include {MACS2_CONSENSUS} from "${projectDir}/modules/macs2/macs2_consensus" +include {ANNOTATE_BOOLEAN_PEAKS} from "${projectDir}/modules/homer/annotate_boolean_peaks" +include {SUBREAD_FEATURECOUNTS} from "${projectDir}/modules/subread/subread_feature_counts_chipseq" +include {DESEQ2_QC} from "${projectDir}/modules/utility_modules/deseq2_qc" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log params +param_log() + +// main workflow +workflow CHIPSEQ { + + if (params.input) { ch_input = file(params.input, checkIfExists: true) } else { exit 1, 'Samples design file not specified!' 
} + + // Step 1: CHECK_DESIGN + CHECK_DESIGN(ch_input) + + /* + * Create channels for input fastq files + */ + + if (params.read_type == 'SE'){ + read_ch = CHECK_DESIGN.out.sample_reads + .splitCsv(header:true, sep:',') + .map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true) ] ] } + } else { + read_ch = CHECK_DESIGN.out.sample_reads + .splitCsv(header:true, sep:',') + .map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true), file(row.fastq_2, checkIfExists: true) ] ] } + } + + /* + * Create a channel with [sample_id, control id, antibody, replicatesExist, multipleGroups] + */ + control_ch = CHECK_DESIGN.out.study_design + .splitCsv(header:true, sep:',') + .map { row -> [ row.sample_id, row.control_id, row.antibody, row.replicatesExist.toBoolean(), row.multipleGroups.toBoolean() ] } + + + // Header files for MultiQC + ch_spp_nsc_header = file("${projectDir}/bin/shared/multiqc/chipseq/spp_nsc_header.txt", checkIfExists: true) + ch_spp_rsc_header = file("${projectDir}/bin/shared/multiqc/chipseq/spp_rsc_header.txt", checkIfExists: true) + ch_spp_correlation_header = file("${projectDir}/bin/shared/multiqc/chipseq/spp_correlation_header.txt", checkIfExists: true) + ch_peak_count_header = file("${projectDir}/bin/shared/multiqc/chipseq/peak_count_header.txt", checkIfExists: true) + ch_frip_score_header = file("${projectDir}/bin/shared/multiqc/chipseq/frip_score_header.txt", checkIfExists: true) + ch_peak_annotation_header = file("${projectDir}/bin/shared/multiqc/chipseq/peak_annotation_header.txt", checkIfExists: true) + ch_deseq2_pca_header = file("${projectDir}/bin/shared/multiqc/chipseq/deseq2_pca_header.txt", checkIfExists: true) + ch_deseq2_clustering_header = file("${projectDir}/bin/shared/multiqc/chipseq/deseq2_clustering_header.txt", checkIfExists: true) + + // Reference genome + ch_fasta = file(params.ref_fa, checkIfExists: true) + ch_gtf = file(params.gtf, checkIfExists: true) + + // genes.bed + if (params.gene_bed) { ch_gene_bed = file(params.gene_bed, checkIfExists: true) } + + // Step 2: Make genome filter + SAMTOOLS_FAIDX(ch_fasta) + MAKE_GENOME_FILTER(SAMTOOLS_FAIDX.out, params.blacklist) + + // Step 3: Fastqc + FASTQC(read_ch) + + // Step 4: Trim Galore + TRIM_GALORE(read_ch) + + // Step 5: Get Read Group Information + READ_GROUPS(TRIM_GALORE.out.trimmed_fastq, "gatk") + + // Step 6: BWA-MEM + bwa_mem_mapping = TRIM_GALORE.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + BWA_MEM(bwa_mem_mapping) + + // Step 7: Samtools Removing Unmapped + SAMTOOLS_FILTER(BWA_MEM.out, '-F 0x0100') + + // Step 8: Samtools Sort + SAMTOOLS_SORT(SAMTOOLS_FILTER.out.bam, '-O bam', 'bam') + + // Step 9: Samtools Stats + SAMTOOLS_STATS(SAMTOOLS_SORT.out.sorted_file) + + // Step 10: Merge BAM files + // Merge technical replicates of sample replicates (if tech reps exist). + // BAM files for all libraries from same technical sample replicate. + // i.e., merge multiple lanes per sample or resequencing of 1 sample. + // see: https://github.com/nf-core/chipseq/blob/1.2.2/docs/usage.md#multiple-runs-of-the-same-library + + ch_sort_bam_merge = SAMTOOLS_SORT.out.sorted_file + .map { it -> [ it[0].split('_')[0..-2].join('_'), it[1] ] } + .groupTuple(by: [0]) + .map { it -> [ it[0], it[1].flatten() ] } + // The design script adds 2 fields to sample names: R (replicate), and T (treatment), which are delimited by '_'. + // The first step splits off the T identifier, and groups all remaining samples by the new ID. 
+ // This allows all samples that are technical replicates, i.e., share the same + // sampleID and replicate ID, to be joined and then merged in the next step. + + PICARD_MERGESAMFILES(ch_sort_bam_merge) + + // Step 11: Mark Duplicates + PICARD_MARKDUPLICATES(PICARD_MERGESAMFILES.out.bam) + + // Step 12: Samtools Stats + SAMTOOLS_STATS_MD(PICARD_MARKDUPLICATES.out.dedup_bam) + + // Step 13: Samtools Mergebam Filter + SAMTOOLS_MERGEBAM_FILTER(PICARD_MARKDUPLICATES.out.dedup_bam, MAKE_GENOME_FILTER.out.bed) + // Note: genome filter file is generic and used by all samples. + + // JSON files required by BAMTools for alignment filtering + if (params.read_type == 'SE'){ + ch_bamtools_filter_config = file(params.bamtools_filter_se_config, checkIfExists: true) + } else { + ch_bamtools_filter_config = file(params.bamtools_filter_pe_config, checkIfExists: true) + } + + // Step 14: Bamtools Filter + BAMTOOLS_FILTER(SAMTOOLS_MERGEBAM_FILTER.out.bam, ch_bamtools_filter_config) + + // Step 15: Samtools Stats + SAMTOOLS_STATS_BF(BAMTOOLS_FILTER.out.bam) + + if (params.read_type == 'SE'){ + + filtered_sorted_bam = BAMTOOLS_FILTER.out.bam + // Note: the output BAM from the preceding step was coordinate sorted in step 8, + // and the sort was maintained in the optional merge step and in the markduplicates step. + + } else { + // Step 16: Samtools Name Sort + NAME_SORT(BAMTOOLS_FILTER.out.bam, '-n -O bam', 'bam') + // Name sorting is required to remove orphaned singletons in PE data. + + // Step 17: Remove singleton reads from paired-end BAM file + BAMPE_RM_ORPHAN(NAME_SORT.out.sorted_file) + + // Step 18 : Samtools Pair Sort + PAIR_SORT(BAMPE_RM_ORPHAN.out.bam, '-O bam', 'bam') + // Coordinate sorting must be used for next steps. + + filtered_sorted_bam = PAIR_SORT.out.sorted_file + + } + + // Step 19 : Samtools Stats + SAMTOOLS_STATS_FILTERED(filtered_sorted_bam) + + // Step 20 : Preseq + PRESEQ(PICARD_MARKDUPLICATES.out.dedup_bam) + // Note: the preseq package is aimed at predicting and estimating the complexity of a genomic sequencing library + + // Step 21 : Collect Multiple Metrics + + SAMTOOLS_INDEX(filtered_sorted_bam) + + PICARD_COLLECTMULTIPLEMETRICS(filtered_sorted_bam) + + // Step 22 : Bedtools Genome Coverage + BEDTOOLS_GENOMECOV(filtered_sorted_bam.join(SAMTOOLS_STATS_FILTERED.out.flagstat)) + + // Step 23 : UCSC Bedgraph to bigwig + UCSC_BEDGRAPHTOBIGWIG(BEDTOOLS_GENOMECOV.out.bedgraph, MAKE_GENOME_FILTER.out.sizes) + // Note: genome filter is a generic file used for all samples. + + // Step 24 : Deeptools Compute matrix + DEEPTOOLS_COMPUTEMATRIX(UCSC_BEDGRAPHTOBIGWIG.out.bigwig, ch_gene_bed) + // Note: ch_gene_bed is a generic file used for all samples. 
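    /*
    A minimal standalone sketch of the "generic file" pattern noted in several steps above
    (process and parameter names are hypothetical): a bare file() value supplied to a process
    path input behaves as a value channel, so the single shared file is re-used for every
    sample instead of being consumed by the first task:

        process COMPUTE_MATRIX_SKETCH {
            input:
            tuple val(sampleID), path(bigwig)
            path(gene_bed)

            output:
            tuple val(sampleID), path("${sampleID}.matrix.gz")

            script:
            """
            touch ${sampleID}.matrix.gz
            """
        }

        // COMPUTE_MATRIX_SKETCH(per_sample_bigwig_ch, file(params.gene_bed))
    */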
+
+ // Step 25 : Deeptools Plot Profile
+ DEEPTOOLS_PLOTPROFILE(DEEPTOOLS_COMPUTEMATRIX.out.matrix)
+
+ // Step 26 : Deeptools Plot Heatmap
+ DEEPTOOLS_PLOTHEATMAP(DEEPTOOLS_COMPUTEMATRIX.out.matrix)
+
+ // Step 27 : Phantompeakqualtools
+ PHANTOMPEAKQUALTOOLS(filtered_sorted_bam)
+
+ // Step 28 : Multiqc Custom Phantompeakqualtools
+ mcp_ch = PHANTOMPEAKQUALTOOLS.out.spp.join(PHANTOMPEAKQUALTOOLS.out.rdata, by: [0])
+ MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS(mcp_ch, ch_spp_nsc_header, ch_spp_rsc_header, ch_spp_correlation_header)
+
+ // Create channel linking IP bams with control bams
+ ch_genome_bam_bai = filtered_sorted_bam.join(SAMTOOLS_INDEX.out.bai)
+ .map{it -> [it[0], [it[1], it[2]]]}
+ // the next step requires a tuple with [sampleID, [bam, bai]]
+
+ ch_genome_bam_bai = ch_genome_bam_bai
+ .combine(ch_genome_bam_bai)
+ // this combine step generates pairs of samples, which are then refined in the next step.
+
+ ch_group_bam = control_ch
+ .combine(ch_genome_bam_bai)
+ .filter { it[0] == it[5] && it[1] == it[7] }
+ .join(SAMTOOLS_STATS_FILTERED.out.flagstat)
+ .map { it -> it[2..-1] }
+ // Generate combinations of all study design objects:
+ // [SPT5_T0_R1, SPT5_INPUT_R1, SPT5, true, true]
+ // with all combined bams from the 'combine' above:
+ // [SPT5_T0_R1, [/.../SPT5_T0_R1.mLb.clN.sorted.bam, /.../SPT5_T0_R1.mLb.clN.sorted.bam.bai], SPT5_INPUT_R1, [/.../SPT5_INPUT_R2.mLb.clN.sorted.bam, /.../SPT5_INPUT_R2.mLb.clN.sorted.bam.bai]]
+ // The combination of the design entries with all BAM pairs contains combinations that are not relevant to the study design. Therefore, the combinations are filtered to cases where:
+ // it[0] == it[5] (e.g., SPT5_T0_R1 == SPT5_T0_R1) AND it[1] == it[7] (e.g., SPT5_INPUT_R1 == SPT5_INPUT_R1)
+ // The map then adjusts the output tuple to remove the extra sample IDs:
+ // [SPT5, true, true, SPT5_T0_R2, [/../SPT5_T0_R2.mLb.clN.sorted.bam, /../SPT5_T0_R2.mLb.clN.sorted.bam.bai], SPT5_INPUT_R2, [/../SPT5_INPUT_R2.mLb.clN.sorted.bam, /../SPT5_INPUT_R2.mLb.clN.sorted.bam.bai], /../SPT5_T0_R2.mLb.clN.sorted.bam.flagstat]
+
+ // Step 29 : Deeptools plotFingerprint
+ DEEPTOOLS_PLOTFINGERPRINT(ch_group_bam)
+
+ // Step 30 : Call peaks with MACS2
+ PEAK_CALLING_CHIPSEQ(ch_group_bam, ch_peak_count_header, ch_frip_score_header)
+ // Note: ch_peak_count_header and ch_frip_score_header are generic files used for all samples.
+
+ // Step 31 : Calculate FRiP score
+ frip_input = ch_group_bam
+ .map{it -> [it[3], it[0], it[1], it[2], it[3], it[4], it[5], it[6], it[7]]}
+ .join(PEAK_CALLING_CHIPSEQ.out.peak)
+ .map{it -> it[1..-1]}
+ // 'ch_group_bam' is indexed on antibody; peak calling is indexed on the IP sample.
+ // The first map adjusts the tuple to put the IP sample in the index position,
+ // the join attaches the peak file by IP sample ID,
+ // and the final map readjusts the tuple to place the antibody back in the index position.
+
+ FRIP_SCORE(frip_input, ch_peak_count_header, ch_frip_score_header)
+ // Note: ch_peak_count_header and ch_frip_score_header are generic files used for all samples.
+
+ // Step 32 : Homer Annotate Peaks
+ HOMER_ANNOTATEPEAKS(PEAK_CALLING_CHIPSEQ.out.ip_control_peak, ch_fasta, ch_gtf)
+
+ // Step 33 : Plot Macs2 QC
+ PLOT_MACS2_QC(PEAK_CALLING_CHIPSEQ.out.peak.collect{ it[-1] })
+ // Note: collect{ it[-1] } collects all peak files, and passes those to the module.
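+ // A rough sketch of the collect{ it[-1] } idiom used above (illustrative only, with
+ // hypothetical tuples): given channel emissions like
+ //   [ sample_id, ..., /path/sampleA_peaks.broadPeak ]
+ //   [ sample_id, ..., /path/sampleB_peaks.broadPeak ]
+ // collect{ it[-1] } takes the last element of each emitted tuple (the peak file) and gathers
+ // them into a single list value, e.g. [/path/sampleA_peaks.broadPeak, /path/sampleB_peaks.broadPeak],
+ // so the plotting module receives all peak files in one invocation.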
+
+ // Step 34 : Plot Homer Annotate Peaks
+ PLOT_HOMER_ANNOTATEPEAKS(HOMER_ANNOTATEPEAKS.out.txt.collect{ it[-1] }, ch_peak_annotation_header, '_peaks.annotatePeaks.txt')
+ // Note: collect{ it[-1] } collects all annotated peak files, and passes those to the module.
+
+ // Step 35 : Consensus peaks across samples, create boolean filtering file, SAF file
+
+ // Create channel for CONSENSUS PEAKS ANALYSIS
+ // Group by antibody from this point and carry forward boolean variables
+
+ ch_macs_consensus = PEAK_CALLING_CHIPSEQ.out.ip_control_peak
+ .map { it -> [ it[0], it[1], it[2], it[-1] ] }
+ .groupTuple()
+ .map { it -> [ it[0], it[1][0], it[2][0], it[3].toSorted( { a, b -> a.getName() <=> b.getName() } ) ] }
+ // Note: re-order the output tuple from PEAK_CALLING_CHIPSEQ:
+ // [SPT5, true, true, SPT5_T15_R2, SPT5_INPUT_R2, /.../SPT5_T15_R2_peaks.broadPeak]
+ // to remove the case and control sample IDs:
+ // [SPT5, true, true, /.../SPT5_T15_R1_peaks.broadPeak]
+ // Then group by antibody. The map keeps only the first element of replicatesExist and multipleGroups,
+ // as the remaining array entries are duplicate values, and sorts the broadPeak file array by file name.
+
+ MACS2_CONSENSUS(ch_macs_consensus)
+ // Note: this step will not run when replicatesExist || multipleGroups is false.
+ // Subsequently, all steps beyond this point will not run, as they rely on output from this step.
+
+ // Step 36 : Consensus peaks annotation
+ CONSENSUS_PEAKS_ANNOTATE(MACS2_CONSENSUS.out.bed, ch_fasta, ch_gtf)
+ // Note: ch_fasta and ch_gtf are generic files shared by all samples.
+
+ // Step 37 : Annotate boolean peaks
+ ANNOTATE_BOOLEAN_PEAKS(MACS2_CONSENSUS.out.boolean_txt.join(CONSENSUS_PEAKS_ANNOTATE.out.txt))
+
+ // Get BAM and SAF files for each antibody
+
+ ch_group_bam // [antibody, replicatesExist, multipleGroups, sample_id, [bam, bai], control_id, [bam, bai], sample_id bam.flagstat]
+ .map { it -> [ it[3], [ it[0], it[1], it[2] ] ] } // [sample_id, [antibody, replicatesExist, multipleGroups]]
+ .join(filtered_sorted_bam) // [sample_id, [antibody, replicatesExist, multipleGroups], final filtered sample_id indexed bam]
+ .map { it -> [ it[1][0], it[1][1], it[1][2], it[2] ] } // [antibody, replicatesExist, multipleGroups, OR sample_id bam]
+ .groupTuple()
+ .map { it -> [ it[0], it[1][0], it[2][0], it[3].flatten().sort() ] } // [antibody, replicatesExist, multipleGroups, [OR sample_id1 R1 bam, OR sample_id1 R2 bam, OR sample_id2 R1 bam, OR sample_id2 R2 bam]]
+ .join(MACS2_CONSENSUS.out.saf) // [antibody, replicatesExist, multipleGroups, [OR sample_id1 R1 bam, OR sample_id1 R2 bam, OR sample_id2 R1 bam, OR sample_id2 R2 bam], SAF]
+ .set { ch_group_bam }
+
+ // Step 38 : Count reads in consensus peaks with featureCounts
+ SUBREAD_FEATURECOUNTS(ch_group_bam)
+
+ // Step 39 : Differential analysis with DESeq2
+ DESEQ2_QC(SUBREAD_FEATURECOUNTS.out.counts, ch_deseq2_pca_header, ch_deseq2_clustering_header)
+ // Note: ch_deseq2_pca_header and ch_deseq2_clustering_header are generic files used for all samples.
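+ // The MultiQC aggregation below relies on a common idiom (sketched here with a hypothetical
+ // channel name; illustrative only): most QC output channels emit [sampleID, file] tuples, so
+ //   SOME_TOOL.out.stats.collect{ it[1] }   // gather just the files from every sample into one list
+ //            .ifEmpty([])                  // substitute an empty list if the tool produced nothing
+ // and the resulting per-tool lists are combined with mix() into a single channel for MULTIQC.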
+ + // Create channels for multi input files + ch_multiqc_files = Channel.empty() + + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TRIM_GALORE.out.trim_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(TRIM_GALORE.out.trimmed_fastqc.collect{it[1]}.ifEmpty([])) + + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS.out.flagstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS.out.idxstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_MD.out.flagstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_MD.out.idxstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_MD.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_FILTERED.out.flagstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_FILTERED.out.idxstat.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(SAMTOOLS_STATS_FILTERED.out.stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTMULTIPLEMETRICS.out.metrics.collect{it[1]}.ifEmpty([])) + + ch_multiqc_files = ch_multiqc_files.mix(FRIP_SCORE.out.tsv.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PLOT_HOMER_ANNOTATEPEAKS.out.tsv.collect()) + ch_multiqc_files = ch_multiqc_files.mix(SUBREAD_FEATURECOUNTS.out.summary.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(DESEQ2_QC.out.pca_multiqc.collect()) + ch_multiqc_files = ch_multiqc_files.mix(DESEQ2_QC.out.dists_multiqc.collect()) + + ch_multiqc_files = ch_multiqc_files.mix(PRESEQ.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(DEEPTOOLS_PLOTFINGERPRINT.out.raw.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(DEEPTOOLS_PLOTPROFILE.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PHANTOMPEAKQUALTOOLS.out.spp.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS.out.nsc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS.out.rsc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(MULTIQC_CUSTOM_PHANTOMPEAKQUALTOOLS.out.correlation.collect{it[1]}.ifEmpty([])) + + + // Step 41 : MultiQC + MULTIQC ( + ch_multiqc_files.collect() + ) + +} diff --git a/workflows/pdx_wes.nf b/workflows/pdx_wes.nf new file mode 100755 index 00000000..bed77d59 --- /dev/null +++ b/workflows/pdx_wes.nf @@ -0,0 +1,246 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/pdx_wes.nf" +include {param_log} from "${projectDir}/bin/log/pdx_wes.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" +include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" +include 
{CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT as FASTQ_SORT_HUMAN; + FASTQ_SORT as FASTQ_SORT_MOUSE} from "${projectDir}/modules/fastq-tools/fastq-sort" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include {GATK_GETSAMPLENAME} from "${projectDir}/modules/gatk/gatk_getsamplename_noMeta" +include {GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration_mutect2" +include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; + GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" +include {GATK_MUTECT2} from "${projectDir}/modules/gatk/gatk_mutect2_tumorOnly" +include {GATK_FILTERMUECTCALLS} from "${projectDir}/modules/gatk/gatk_filtermutectcalls_tumorOnly" +include {MSISENSOR2_MSI} from "${projectDir}/modules/msisensor2/msisensor2_tumorOnly" +include {GATK_MERGEVCF as GATK_MERGEVCF_UNANNOTATED; + GATK_MERGEVCF as GATK_MERGEVCF_ANNOTATED} from "${projectDir}/modules/gatk/gatk_mergevcf" +include {COSMIC_ANNOTATION as COSMIC_ANNOTATION_SNP; + COSMIC_ANNOTATION as COSMIC_ANNOTATION_INDEL} from "${projectDir}/modules/cosmic/cosmic_annotation" +include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {SNPEFF as SNPEFF_SNP; + SNPEFF as SNPEFF_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_snpeff" +include {SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; + SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_oneperline" +include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" +include {SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_SNP; + SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpsift_dbnsfp" +include {PICARD_COLLECTHSMETRICS} from "${projectDir}/modules/picard/picard_collecthsmetrics" +include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wes" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log params +param_log() + +// prepare reads channel + +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +if (params.gen_org == 'mouse') { + exit 1, "PDX workflow was called; however, `--gen_org` was set to: ${params.gen_org}. This is an invalid parameter combination. 
`--gen_org` must == 'human' for PDX analysis." +} + +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + + if (params.read_type == 'PE'){ + read_ch = Channel + .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) + .map { file, file1, file2 -> tuple(getLibraryId(file), file1, file2) } + .groupTuple() + } + else if (params.read_type == 'SE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}", checkExists:true, size:1 ) + .map { file, file1 -> tuple(getLibraryId(file), file1) } + .groupTuple() + .map{t-> [t[0], t[1].flatten()]} + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} else { + + if (params.read_type == 'PE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) + } + else if (params.read_type == 'SE'){ + read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) + } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} + +workflow PDX_WES { + + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from directory if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + + // Step 1: Qual_Stat + JAX_TRIMMER(read_ch) + + xenome_input = JAX_TRIMMER.out.trimmed_fastq + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) + + // Step 2: Xenome classify and sort. 
+ XENOME_CLASSIFY(xenome_input) + + // Xenome Read Sort + FASTQ_SORT_HUMAN(XENOME_CLASSIFY.out.xenome_fastq, 'human') + FASTQ_SORT_MOUSE(XENOME_CLASSIFY.out.xenome_mouse_fastq, 'mouse') + + // Step 3: Get Read Group Information + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") + + // Step 4: BWA-MEM Alignment + bwa_mem_mapping = FASTQ_SORT_HUMAN.out.sorted_fastq.join(READ_GROUPS.out.read_groups) + + BWA_MEM(bwa_mem_mapping) + + // Step 5: Variant Preprocessing - Part 1 + PICARD_SORTSAM(BWA_MEM.out.sam) + PICARD_MARKDUPLICATES(PICARD_SORTSAM.out.bam) + + // Step 6: Variant Pre-Processing - Part 2 + GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) + + apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) + GATK_APPLYBQSR(apply_bqsr) + + // Step 7: Variant Pre-Processing - Part 3 + collect_metrics = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) + PICARD_COLLECTHSMETRICS(collect_metrics) + + // Step 8: MSI + MSISENSOR2_MSI(GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai)) + + // Step 9: Get sample names + GATK_GETSAMPLENAME(collect_metrics) + + // ** Variant Calling + mutect2_caller_input = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai).join(GATK_GETSAMPLENAME.out.sample_name) + + // Step 10: Mutect2 + GATK_MUTECT2(mutect2_caller_input) + GATK_FILTERMUECTCALLS(GATK_MUTECT2.out.vcf_tbi_stats) + + // Step 8: Variant Filtration + // SNP + GATK_SELECTVARIANTS_SNP(GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi, 'SNP', 'selected_SNP') + + var_filter_snp = GATK_SELECTVARIANTS_SNP.out.vcf.join(GATK_SELECTVARIANTS_SNP.out.idx) + GATK_VARIANTFILTRATION_SNP(var_filter_snp, 'SNP') + + // INDEL + GATK_SELECTVARIANTS_INDEL(GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi, 'INDEL', 'selected_INDEL') + + var_filter_indel = GATK_SELECTVARIANTS_INDEL.out.vcf.join(GATK_SELECTVARIANTS_INDEL.out.idx) + GATK_VARIANTFILTRATION_INDEL(var_filter_indel, 'INDEL') + + // Step 9: Post Variant Calling Processing - Part 1 + // + SNPSIFT_ANNOTATE_SNP_DBSNP(GATK_VARIANTFILTRATION_SNP.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + SNPSIFT_ANNOTATE_SNP_COSMIC(SNPSIFT_ANNOTATE_SNP_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') + SNPEFF_SNP(SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf, 'SNP', 'vcf') + SNPSIFT_DBNSFP_SNP(SNPEFF_SNP.out.vcf, 'SNP') + SNPEFF_ONEPERLINE_SNP(SNPSIFT_DBNSFP_SNP.out.vcf, 'SNP') + + // INDEL + SNPSIFT_ANNOTATE_INDEL_DBSNP(GATK_VARIANTFILTRATION_INDEL.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + SNPSIFT_ANNOTATE_INDEL_COSMIC(SNPSIFT_ANNOTATE_INDEL_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') + SNPEFF_INDEL(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf, 'INDEL', 'vcf') + SNPSIFT_DBNSFP_INDEL(SNPEFF_INDEL.out.vcf, 'INDEL') + SNPEFF_ONEPERLINE_INDEL(SNPSIFT_DBNSFP_INDEL.out.vcf, 'INDEL') + + // Step 10: Post Variant Calling Processing - Part 2 + vcf_files_unannotated = SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf) + GATK_MERGEVCF_UNANNOTATED (vcf_files_unannotated, 'SNP_INDEL_filtered_unannotated_final') + + vcf_files_annotated = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) + GATK_MERGEVCF_ANNOTATED(vcf_files_annotated, 'SNP_INDEL_filtered_annotated_final') + + SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF_ANNOTATED.out.vcf) + + agg_stats = JAX_TRIMMER.out.quality_stats.join(PICARD_COLLECTHSMETRICS.out.hsmetrics).join(PICARD_MARKDUPLICATES.out.dedup_metrics) + + // Step 11: Aggregate Stats + AGGREGATE_STATS(agg_stats) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = 
ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTHSMETRICS.out.hsmetrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(XENOME_CLASSIFY.out.xenome_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_FILTERMUECTCALLS.out.stats.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + +} diff --git a/workflows/pta.nf b/workflows/pta.nf new file mode 100644 index 00000000..fa7e77df --- /dev/null +++ b/workflows/pta.nf @@ -0,0 +1,909 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/pta.nf" +include {param_log} from "${projectDir}/bin/log/pta.nf" +include {CONCATENATE_PTA_FASTQ} from "${projectDir}/subworkflows/concatenate_pta_fastq" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT} from "${projectDir}/modules/fastq-tools/fastq-sort" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {SHORT_ALIGNMENT_MARKING} from "${projectDir}/modules/nygc-short-alignment-marking/short_alignment_marking" +include {PICARD_CLEANSAM} from "${projectDir}/modules/picard/picard_cleansam" +include {PICARD_FIX_MATE_INFORMATION} from "${projectDir}/modules/picard/picard_fix_mate_information" +include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" + +include {PICARD_COLLECTALIGNMENTSUMMARYMETRICS} from "${projectDir}/modules/picard/picard_collectalignmentsummarymetrics" +include {PICARD_COLLECTWGSMETRICS} from "${projectDir}/modules/picard/picard_collectwgsmetrics" + +include {CONPAIR_PILEUP as CONPAIR_TUMOR_PILEUP; + CONPAIR_PILEUP as CONPAIR_NORMAL_PILEUP} from "${projectDir}/modules/conpair/conpair_pileup" +include {CONPAIR} from "${projectDir}/modules/conpair/conpair" + +include {GATK_HAPLOTYPECALLER_SV_GERMLINE} from "${projectDir}/modules/gatk/gatk_haplotypecaller_sv_germline" +include {GATK_SORTVCF_GERMLINE as GATK_SORTVCF_GERMLINE; + GATK_SORTVCF_GERMLINE as GATK_SORTVCF_GENOTYPE} from "${projectDir}/modules/gatk/gatk_sortvcf_germline" +include {GATK_GENOTYPE_GVCF} from "${projectDir}/modules/gatk/gatk_genotype_gvcf" +include {GATK_CNNSCORE_VARIANTS} from "${projectDir}/modules/gatk/gatk_cnnscorevariants" +include {GATK_FILTER_VARIANT_TRANCHES} from "${projectDir}/modules/gatk/gatk_filtervarianttranches" +include {GATK_VARIANTFILTRATION_AF} from "${projectDir}/modules/gatk/gatk_variantfiltration_af" +include {BCFTOOLS_GERMLINE_FILTER} from "${projectDir}/modules/bcftools/bcftools_germline_filter" +include {BCFTOOLS_SPLITMULTIALLELIC_REGIONS} from "${projectDir}/modules/bcftools/bcftools_split_multiallelic_regions" +include {VEP_GERMLINE} from 
"${projectDir}/modules/ensembl/varianteffectpredictor_germline" +include {BCFTOOLS_REMOVESPANNING} from "${projectDir}/modules/bcftools/bcftools_remove_spanning" +include {COSMIC_ANNOTATION} from "${projectDir}/modules/cosmic/cosmic_annotation" +include {COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE} from "${projectDir}/modules/cosmic/cosmic_add_cancer_resistance_mutations_germline" +include {GERMLINE_VCF_FINALIZATION} from "${projectDir}/modules/python/python_germline_vcf_finalization" +include {GATK_GETSAMPLENAME as GATK_GETSAMPLENAME_NORMAL; + GATK_GETSAMPLENAME as GATK_GETSAMPLENAME_TUMOR} from "${projectDir}/modules/gatk/gatk_getsamplename" + +include {GATK_MUTECT2} from "${projectDir}/modules/gatk/gatk_mutect2" +include {GATK_MERGEMUTECTSTATS} from "${projectDir}/modules/gatk/gatk_mergemutectstats" +include {GATK_FILTERMUECTCALLS} from "${projectDir}/modules/gatk/gatk_filtermutectcalls" +include {MANTA} from "${projectDir}/modules/illumina/manta" +include {STRELKA2} from "${projectDir}/modules/illumina/strelka2" +include {LANCET} from "${projectDir}/modules/nygenome/lancet" +include {GATK_SORTVCF as GATK_SORTVCF_MUTECT; + GATK_SORTVCF as GATK_SORTVCF_LANCET; + GATK_SORTVCF as GATK_SORTVCF_TOOLS; + GATK_SORTVCF as GATK_SORTVCF_TOOLS_LANCET} from "${projectDir}/modules/gatk/gatk_sortvcf_somatic_tools" +include {GRIDSS_PREPROCESS} from "${projectDir}/modules/gridss/gridss_preprocess" +include {GRIDSS_ASSEMBLE} from "${projectDir}/modules/gridss/gridss_assemble" +include {GRIDSS_CALLING} from "${projectDir}/modules/gridss/gridss_calling" +include {GRIDSS_CHROM_FILTER} from "${projectDir}/modules/gridss/gridss_chrom_filter" +include {GRIPSS_SOMATIC_FILTER} from "${projectDir}/modules/gridss/gripss_somatic_filter" +include {SAMTOOLS_STATS_INSERTSIZE as SAMTOOLS_STATS_INSERTSIZE_NORMAL; + SAMTOOLS_STATS_INSERTSIZE as SAMTOOLS_STATS_INSERTSIZE_TUMOR} from "${projectDir}/modules/samtools/samtools_stats_insertsize" +include {SAMTOOLS_FILTER_UNIQUE as SAMTOOLS_FILTER_UNIQUE_NORMAL; + SAMTOOLS_FILTER_UNIQUE as SAMTOOLS_FILTER_UNIQUE_TUMOR} from "${projectDir}/modules/samtools/samtools_filter_unique_reads" +include {BICSEQ2_NORMALIZE as BICSEQ2_NORMALIZE_NORMAL; + BICSEQ2_NORMALIZE as BICSEQ2_NORMALIZE_TUMOR} from "${projectDir}/modules/biqseq2/bicseq2_normalize" +include {BICSEQ2_SEG} from "${projectDir}/modules/biqseq2/bicseq2_seg" +include {BICSEQ2_SEG_UNPAIRED} from "${projectDir}/modules/biqseq2/bicseq2_seg_unpaired" +include {MSISENSOR2_MSI} from "${projectDir}/modules/msisensor2/msisensor2" + +include {RENAME_METADATA; + RENAME_METADATA as RENAME_METADATA_LANCET} from "${projectDir}/modules/python/python_rename_metadata" +include {MERGE_PREP; + MERGE_PREP as MERGE_PREP_LANCET} from "${projectDir}/modules/python/python_merge_prep" +include {RENAME_VCF; + RENAME_VCF as RENAME_VCF_LANCET;} from "${projectDir}/modules/python/python_rename_vcf" +include {COMPRESS_INDEX_VCF; + COMPRESS_INDEX_VCF as COMPRESS_INDEX_VCF_LANCET; + COMPRESS_INDEX_VCF as COMPRESS_INDEX_VCF_REGION_LANCET} from "${projectDir}/modules/tabix/compress_vcf" +include {BCFTOOLS_SPLITMULTIALLELIC; + BCFTOOLS_SPLITMULTIALLELIC as BCFTOOLS_SPLITMULTIALLELIC_LANCET} from "${projectDir}/modules/bcftools/bcftools_split_multiallelic" +include {SPLIT_MNV; + SPLIT_MNV as SPLIT_MNV_LANCET} from "${projectDir}/modules/python/python_split_mnv" +include {REMOVE_CONTIG} from "${projectDir}/modules/python/python_remove_contig" + +include {BCFTOOLS_MERGECALLERS; + BCFTOOLS_MERGECALLERS as BCFTOOLS_MERGECALLERS_FINAL} from 
"${projectDir}/modules/bcftools/bcftools_merge_callers" +include {BEDTOOLS_STARTCANDIDATES} from "${projectDir}/modules/bedtools/bedtools_start_candidates" +include {GET_CANDIDATES} from "${projectDir}/modules/python/python_get_candidates" +include {VCF_TO_BED} from "${projectDir}/modules/python/python_vcf_to_bed" +include {LANCET_CONFIRM} from "${projectDir}/modules/nygenome/lancet_confirm" +include {COMPRESS_INDEX_VCF_REGION; + COMPRESS_INDEX_VCF_REGION as COMPRESS_INDEX_VCF_ALL_CALLERS; + COMPRESS_INDEX_VCF_REGION as COMPRESS_INDEX_VCF_MERGED} from "${projectDir}/modules/tabix/compress_vcf_region" +include {BCFTOOLS_INTERSECTVCFS} from "${projectDir}/modules/bcftools/bcftools_intersect_lancet_candidates" + +include {MERGE_COLUMNS} from "${projectDir}/modules/python/python_merge_columns" +include {ADD_NYGC_ALLELE_COUNTS} from "${projectDir}/modules/python/python_add_nygc_allele_counts" +include {ADD_FINAL_ALLELE_COUNTS} from "${projectDir}/modules/python/python_add_final_allele_counts" +include {FILTER_PON} from "${projectDir}/modules/python/python_filter_pon" +include {FILTER_VCF} from "${projectDir}/modules/python/python_filter_vcf" +include {SNV_TO_MNV_FINAL_FILTER} from "${projectDir}/modules/python/python_snv_to_mnv_final_filter" + +include {GATK_SORTVCF_SOMATIC} from "${projectDir}/modules/gatk/gatk_sortvcf_somatic_merge" +include {REORDER_VCF_COLUMNS} from "${projectDir}/modules/python/python_reorder_vcf_columns" +include {COMPRESS_INDEX_MERGED_VCF} from "${projectDir}/modules/tabix/compress_merged_vcf" +include {VEP_SOMATIC} from "${projectDir}/modules/ensembl/varianteffectpredictor_somatic" +include {COSMIC_ANNOTATION_SOMATIC} from "${projectDir}/modules/cosmic/cosmic_annotation_somatic" +include {COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC} from "${projectDir}/modules/cosmic/cosmic_add_cancer_resistance_mutations_somatic" +include {SOMATIC_VCF_FINALIZATION} from "${projectDir}/modules/python/python_somatic_vcf_finalization" +include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_DBSNP_GERMLINE; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_DBSNP_SOMATIC} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {ANNOTATE_BICSEQ2_CNV} from "${projectDir}/modules/r/annotate_bicseq2_cnv" +include {MERGE_SV} from "${projectDir}/modules/r/merge_sv" +include {ANNOTATE_SV; + ANNOTATE_SV as ANNOTATE_SV_SUPPLEMENTAL} from "${projectDir}/modules/r/annotate_sv" +include {ANNOTATE_GENES_SV; + ANNOTATE_GENES_SV as ANNOTATE_GENES_SV_SUPPLEMENTAL} from "${projectDir}/modules/r/annotate_genes_sv" +include {ANNOTATE_SV_WITH_CNV; + ANNOTATE_SV_WITH_CNV as ANNOTATE_SV_WITH_CNV_SUPPLEMENTAL} from "${projectDir}/modules/r/annotate_sv_with_cnv" +include {FILTER_BEDPE; + FILTER_BEDPE as FILTER_BEDPE_SUPPLEMENTAL} from "${projectDir}/modules/r/filter_bedpe" + +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + + +// help if needed +if (params.help){ + help() + exit 0 +} + +// log paramiter info +param_log() + +// main workflow +workflow PTA { + + if (params.csv_input) { + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + // Concat local Fastq files from CSV input if required. 
+ CONCATENATE_PTA_FASTQ(ch_input_sample) + CONCATENATE_PTA_FASTQ.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_PTA_FASTQ.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // ** Step 1: Qual_Stat + JAX_TRIMMER(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) + + // ** Step 2: Get Read Group Information + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") + + // PDX CASES TO ADD AND VALIDATE: + // Normal samples should PASS the PDX step. + + // ** Step 2a: Xenome if PDX data used. + ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + if (params.pdx){ + // Xenome Classification + XENOME_CLASSIFY(JAX_TRIMMER.out.trimmed_fastq) + ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats // set log file for multiqc + + // Xenome Read Sort + FASTQ_SORT(XENOME_CLASSIFY.out.xenome_fastq, 'human') + bwa_mem_mapping = FASTQ_SORT.out.sorted_fastq.join(READ_GROUPS.out.read_groups) + + } else { + bwa_mem_mapping = JAX_TRIMMER.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + } + + // ** Step 3: BWA-MEM Alignment + BWA_MEM(bwa_mem_mapping) + + // ** Step 4: Sort mapped reads + PICARD_SORTSAM(BWA_MEM.out.sam) + + // ** Step 5: Remove short mapping 'artifacts': https://github.com/nygenome/nygc-short-alignment-marking + SHORT_ALIGNMENT_MARKING(PICARD_SORTSAM.out.bam) + + // ** Step 6: Clean BAM to set MAPQ = 0 when read is unmapped (issue introduced in step 5) + PICARD_CLEANSAM(PICARD_SORTSAM.out.bam) + + // ** Step 7: Fix mate information (fix pair flags due to mapping adjustment in step 5) + PICARD_FIX_MATE_INFORMATION(PICARD_CLEANSAM.out.cleaned_bam) + + // ** Step 8: Markduplicates + PICARD_MARKDUPLICATES(PICARD_FIX_MATE_INFORMATION.out.fixed_mate_bam) + + // ** Step 9: Calculate BQSR + GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) + + // ** Step 10: Apply BQSR + apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) + GATK_APPLYBQSR(apply_bqsr) + + // Step 12: Nextflow channel processing + // https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L854 + + GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai).join(meta_ch).branch{ + normal: it[3].status == 0 + tumor: it[3].status == 1 + }.set{ch_final_bam} + // re-join the sampleID to metadata information. Split normal and tumor samples into 2 different paths. + // Process tumor and normal BAMs seperately for conpair. For calling, use mapped and crossed data. + + // ** Get alignment and WGS metrics + PICARD_COLLECTALIGNMENTSUMMARYMETRICS(GATK_APPLYBQSR.out.bam) + PICARD_COLLECTWGSMETRICS(GATK_APPLYBQSR.out.bam) + + // ** NEXTFLOW OPERATORS::: Establish channels with sample pairs and individual input objects for downstream calling + + // get sample names, and join to bams. + GATK_GETSAMPLENAME_NORMAL(ch_final_bam.normal.map{ id, bam, bai, meta -> [id, meta, bam, bai] }) + GATK_GETSAMPLENAME_TUMOR(ch_final_bam.tumor.map{ id, bam, bai, meta -> [id, meta, bam, bai] }) + + ch_normal_to_cross = ch_final_bam.normal.join(GATK_GETSAMPLENAME_NORMAL.out.sample_name).map{ id, bam, bai, meta, readID -> [meta.patient, meta, bam, bai, readID] } + ch_tumor_to_cross = ch_final_bam.tumor.join(GATK_GETSAMPLENAME_TUMOR.out.sample_name).map{ id, bam, bai, meta, readID -> [meta.patient, meta, bam, bai, readID] } + + /* + The above map statements adjusts channels for normal, tumor samples to organize them by patient IDs. + A common key ID is needed to cross tumor by normal samples when multiples of each patient are run together. + + NOTE!!!! 
that if a common patient key is then used as 'sampleID' across samples,
+ downstream results will have name collisions, and results will be overwritten in multiple tumor-to-normal mappings.
+
+ e.g., patient: foo; tumor1 = bar, tumor2 = baz; normal = fizz.
+
+ Common key: sampleID == patient, results = foo.calls.vcf for both bar--fizz and baz--fizz, and results are mangled.
+
+ Unique key: sampleID == patient--tumor--normal, results == foo--bar--fizz.calls.vcf & foo--baz--fizz.calls.vcf. Results OK.
+
+ Therefore, the above ch_*_to_cross channels should ONLY be used for crossing samples.
+ A different channel is made below for cases where the data is needed in callers.
+ */
+
+ // Cross all normal and tumor samples by the common patient ID.
+ ch_paired_samples = ch_normal_to_cross.cross(ch_tumor_to_cross)
+ .map { normal, tumor ->
+ def meta = [:]
+ meta.patient = normal[0]
+ meta.normal_id = normal[1].sampleID
+ meta.tumor_id = tumor[1].sampleID
+ meta.sex = normal[1].sex
+ meta.id = "${meta.patient}--${meta.tumor_id}--${meta.normal_id}".toString()
+
+ [meta.id, meta, normal[2], normal[3], normal[4], tumor[2], tumor[3], tumor[4]]
+ }
+ /*
+ normal[0] is the patient ID,
+ normal[1] and tumor[1] are meta info,
+ normal[2] is the normal bam, normal[3] is its bai, normal[4] is the read group ID.
+ tumor[2] is the tumor bam, tumor[3] is its bai, tumor[4] is the read group ID.
+ */
+
+ // Restore un-paired tumor samples, and add NA12878 as the pairing in those cases
+ ch_paired_samples = ch_tumor_to_cross
+ .mix(ch_paired_samples)
+ .map{it -> [it[1].patient, it[1], it[2], it[3], it[4]]}.groupTuple().filter{it[2].size() == 1}
+ // it[0] = sampleID, it[1] = meta, it[2] = bam, it[3] = bai, it[4] = sampleReadID.
+ // unknown group size, so no 'size' statement can be used in groupTuple
+ .map{tumor ->
+ def meta = [:]
+ meta.patient = tumor[1][0].patient
+ meta.normal_id = 'NA12878'
+ meta.tumor_id = tumor[1][0].sampleID
+ meta.sex = tumor[1][0].sex
+ meta.id = "${meta.patient}--${meta.tumor_id}--${meta.normal_id}".toString()
+
+ [meta.id, meta, params.na12878_bam, params.na12878_bai, params.na12878_sampleName, tumor[2][0], tumor[3][0], tumor[4][0]]
+ }
+ .mix(ch_paired_samples)
+
+ /* SAMPLE PAIRING CASES AND NOTES:
+ 1. Pairs provided for all samples: Managed by the initial cross statement.
+ 2. Tumor-only provided for all samples: Managed by the initial cross and the subsequent remapping of non-crossed samples.
+ 3. Some samples have a pair and others do not: Managed by the initial cross and the subsequent remapping of non-crossed samples.
+ 4. Multiple tumors per normal, multiple normals per tumor, or a mixture of these: See note below.
+
+ Notes:
+ The cross statement manages one normal to many tumors, many normals to one tumor, and many normals to many tumors.
+ E.g.,:
+ [foo, [patient:foo, normal_id:n_baz, tumor_id:t_bar, sex:XX, id:t_bar_vs_n_baz], ....bam, ....bai, , ....bam, ....bai, ]
+ [foo, [patient:foo, normal_id:n_baz, tumor_id:t_qux, sex:XX, id:t_qux_vs_n_baz], ....bam, ....bai, , ....bam, ....bai, ]
+ ...
+
+ When samples are provided without a pair, they are not paired in the cross statement and are dropped from the first 'ch_paired_samples' instantiation.
+ To recover un-paired tumors (pair them with NA12878 and pass them downstream with the paired samples),
+ the group of all tumor samples ('ch_tumor_to_cross') is mixed with the paired samples ('ch_paired_samples').
+ Cases where tumor samples were already paired are then filtered out: tumors that were paired in the cross appear more than once in the mix results, and are removed via the 'filter{it[2].size() == 1}' statement.
+ The resulting tumor-only samples are mapped into the format seen in the cross statement, with NA12878 added via parameters as the 'normal' sample.
+ */
+
+
+ ch_ind_samples = ch_paired_samples
+ .filter{it[4] != params.na12878_sampleName}
+ .multiMap{it ->
+ normal: ["${it[1].patient}--${it[1].normal_id}".toString(), it[1], it[2], it[3], it[4]]
+ tumor: ["${it[1].patient}--${it[1].tumor_id}".toString(), it[1], it[5], it[6], it[7]]
+ }
+ ch_normal_samples = ch_ind_samples.normal.unique{it[0]}
+ ch_tumor_samples = ch_ind_samples.tumor.unique{it[0]}
+
+ ch_tumor_only = ch_paired_samples
+ .filter{it[4] == params.na12878_sampleName}
+ .map{it -> ["${it[1].patient}--${it[1].tumor_id}".toString(), it[1], it[5], it[6], it[7]]}
+ .unique{it[0]}
+
+ ch_msisensor2_input = ch_paired_samples
+ .map{["${it[1].patient}--${it[1].tumor_id}".toString(), it[1], it[5], it[6], it[7]]}
+ .unique{it[0]}
+
+ /*
+ The above establishes the channels needed for germline calling, bicseq2 and MSIsensor2.
+ Those steps require BAM, index and readID.
+ Here sampleID is reset to the original ID from the CSV parser, which is: 'patient--sample'.
+ Note that NA12878 pairings are filtered out for germline calling and can be filtered for bicseq2 / conpair,
+ as CNA and sample comparison analysis may not make sense for that pairing.
+ All tumor samples are passed to MSIsensor2, as it runs in tumor-only mode.
+ */
+
+
+
+ // ** Step 13: Conpair pileup for T/N true pairs.
+ // This step is not run on tumor-only samples, as contamination analysis is not biologically relevant there.
+
+ conpair_input = ch_paired_samples
+ .filter{it[4] != params.na12878_sampleName}
+ .multiMap{it ->
+ normal: [it[1].patient, "${it[1].normal_id}".toString(), it[2], it[3]]
+ tumor: [it[1].patient, "${it[1].tumor_id}".toString(), it[5], it[6]]
+ }
+ /*
+ Remap the paired samples to the required normal/tumor inputs for conpair, and filter NA12878-paired samples.
+ it[1] = metadata, it[2] = normal BAM, it[3] = normal BAI.
+ it[5] = tumor BAM, it[6] = tumor BAI.
+ The patient ID is used here because samples must be re-crossed after the pileup to match all tumors and normals.
+ */
+
+ CONPAIR_NORMAL_PILEUP(conpair_input.normal.unique{it[2]}, 'normal')
+ CONPAIR_TUMOR_PILEUP(conpair_input.tumor.unique{it[2]}, 'tumor')
+
+ conpair_input = CONPAIR_NORMAL_PILEUP.out.pileup.cross(CONPAIR_TUMOR_PILEUP.out.pileup)
+ .map { normal, tumor -> [normal[0], "${normal[0]}--${tumor[1]}--${normal[1]}".toString(), normal[2], tumor[2]]
+ }
+ // normal[0] is the patientID (or 'sampleID'), normal[2] is the normal pileup, tumor[2] is the tumor pileup.
+
+ CONPAIR(conpair_input)
+
+
+ // ** Step 14: Germline Calling and Annotation
+
+ // Find the paths of all `scattered.interval_list` files, and make tuples with an index value.
+ // This is used for HaplotypeCaller variant regions and GenotypeGVCF.
+
+ // Loads paths from a directory, collects them into a list, sorts,
+ // maps to a set with indices, flattens the map [file, index, file, index ...],
+ // collates the flattened map into pairs, then remaps the pairs to tuples.
+
+ intervals = Channel.fromPath( params.chrom_intervals+'/*/scattered.interval_list' )
+ .collect()
+ .sort()
+ .map { items -> items.withIndex() }
+ .flatten()
+ .collate(2)
+ .map { item, idx -> tuple( item, idx + 1 ) }
+ // https://stackoverflow.com/a/67084467/18557826
+ interval_count = files( params.chrom_intervals+'/*/scattered.interval_list' ).size()
+ // interval count is used in groupTuple size statements.
+
+ // Applies the scatter intervals from above to the BAM file channel prior to variant calling.
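+ // Illustrative sketch only (hypothetical paths and tuple contents): after the withIndex/flatten/collate
+ // step above, `intervals` emits tuples such as
+ //   [ .../some_interval_dir_1/scattered.interval_list, 1 ]
+ //   [ .../some_interval_dir_2/scattered.interval_list, 2 ]
+ // and combining a per-sample tuple [sampleID, meta, bam, bai, readID] with `intervals` yields
+ //   [sampleID, meta, bam, bai, readID, interval_list, interval_index]
+ // i.e., one emission per sample per interval, which is what the scattered callers below consume.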
+ chrom_channel = ch_normal_samples.combine(intervals).filter{it[4] != params.na12878_sampleName} + + // Read a list of chromosome names from a parameter. These are provided to several tools. + chroms = Channel + .fromPath("${params.chrom_contigs}") + .splitText() + .map{it -> it.trim()} + + // Get a list of primary chromosomes and exclude chrM (dropRight(1)) + chrom_list = chroms.collect().dropRight(1) + chrom_list_noY = chrom_list.dropRight(1) + + + + // Variant calling. + GATK_HAPLOTYPECALLER_SV_GERMLINE(chrom_channel) + + // Applies gather to scattered haplotype calls. + GATK_SORTVCF_GERMLINE(GATK_HAPLOTYPECALLER_SV_GERMLINE.out.vcf.groupTuple(size: interval_count), 'gvcf') + + // Applies scatter intervals from above to the merged file, and genotype. + genotype_channel = GATK_SORTVCF_GERMLINE.out.vcf_idx.combine(intervals) + + GATK_GENOTYPE_GVCF(genotype_channel) + GATK_CNNSCORE_VARIANTS(GATK_GENOTYPE_GVCF.out.vcf_idx) + + // Applies gather to genotyped/cnn called vcfs prior to tranche filtering. + GATK_SORTVCF_GENOTYPE(GATK_CNNSCORE_VARIANTS.out.vcf.groupTuple(size: interval_count), 'vcf') + + // Variant tranche filtering. + GATK_FILTER_VARIANT_TRANCHES(GATK_SORTVCF_GENOTYPE.out.vcf_idx) + + // Allele frequency and call refinement filtering. + GATK_VARIANTFILTRATION_AF(GATK_FILTER_VARIANT_TRANCHES.out.vcf_idx) + BCFTOOLS_GERMLINE_FILTER(GATK_VARIANTFILTRATION_AF.out.vcf) + + // Germline annotation - Filtered + // 1. SplitMultiAllelicRegions & compress & index + BCFTOOLS_SPLITMULTIALLELIC_REGIONS(BCFTOOLS_GERMLINE_FILTER.out.vcf_idx, chrom_list_noY) + // 2. vepPublicSvnIndel + VEP_GERMLINE(BCFTOOLS_SPLITMULTIALLELIC_REGIONS.out.vcf_idx) + // 3. RemoveSpanning + BCFTOOLS_REMOVESPANNING(VEP_GERMLINE.out.vcf) + // 4. AddCosmic + COSMIC_ANNOTATION(BCFTOOLS_REMOVESPANNING.out.vcf) + // 5. AddCancerResistanceMutations and dbsnpIDs + COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE(COSMIC_ANNOTATION.out.vcf) + SNPSIFT_ANNOTATE_DBSNP_GERMLINE(COSMIC_CANCER_RESISTANCE_MUTATION_GERMLINE.out.vcf, params.dbSNP, params.dbSNP_index, 'intermediate') + // 6. AnnotateId & RenameCsqVcf + GERMLINE_VCF_FINALIZATION(SNPSIFT_ANNOTATE_DBSNP_GERMLINE.out.vcf, 'filtered') + + + + // ** Step 15: Somatic Calling + + // Applies scatter intervals from above to the BQSR bam file + somatic_calling_channel = ch_paired_samples.combine(intervals) + + /* Applies scatter intervals from above to the BQSR bam file + somatic_calling_channel = ch_paired_samples.combine(chroms) + NOTE: The above code line will split by Mutect2 calling by individual 'chroms'. + Entire chromosomes are scattered. For WGS, this is computationally intensive. + We changed to calling to be done based on the same intervals passed to the germline caller. + These intervals are based on the 'NoN' file made by BROAD/GATK. + If complete chromosomes are requried, the above line of code can be uncommented. + */ + + // ** Mutect2 - SNP/InDEL Calling + // STEPS: Call on each chromosome / interval. + // Prior to 'filtermutectcalls' vcfs must be merged (GATK best practice). + // NOTE: The group and map statement ensures that VCFs are organzied by sampleID, and carry and toolID is maintained through the process. + // Prior to 'filtermutectcalls' "stats" files from mutect2 must be merged (GATK best practice). + // Merge vcfs and stats must be Nextflow joined prior to 'filtermutectcalls' to avoid samples being confounded. 
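+ // Rough sketch of the gather pattern described above (illustrative only, hypothetical values):
+ // after scattered calling, a per-sample/per-interval channel such as
+ //   [sampleID, chr_part1.vcf, meta, normal_name, tumor_name, 'mutect2']
+ //   [sampleID, chr_part2.vcf, meta, normal_name, tumor_name, 'mutect2']
+ //   groupTuple(size: interval_count)  -->  [sampleID, [chr_part1.vcf, chr_part2.vcf], [meta, meta], ...]
+ //   .map { ... meta.unique()[0] ... }  -->  [sampleID, [vcfs], meta, normal_name, tumor_name, 'mutect2']
+ // groupTuple waits for exactly `interval_count` emissions per sampleID, and the map collapses the
+ // repeated metadata fields back to single values before the merged VCF is sorted and filtered.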
+ + GATK_MUTECT2(somatic_calling_channel) + + sort_merge_input_mutect2VCF = GATK_MUTECT2.out.vcf + .groupTuple(size: interval_count) + .map { sampleID, vcf, meta, normal, tumor, tool -> tuple( sampleID, vcf, meta.unique()[0], normal.unique()[0], tumor.unique()[0], tool.unique()[0] ) } + + GATK_SORTVCF_MUTECT(sort_merge_input_mutect2VCF) + GATK_MERGEMUTECTSTATS(GATK_MUTECT2.out.stats.groupTuple(size: interval_count)) + + filter_mutect_input = GATK_SORTVCF_MUTECT.out.vcf_tbi.join(GATK_MERGEMUTECTSTATS.out.stats) + + GATK_FILTERMUECTCALLS(filter_mutect_input) + + // ** Lancet - SNP/InDEL Calling + // Generate a list of chromosome beds. This is generated in the same manner as the calling `intervals` variable above. + lancet_beds = Channel.fromPath( params.lancet_beds_directory+'/*.bed' ) + .collect() + .sort() + .map { items -> items.withIndex() } + .flatten() + .collate(2) + .map { item, idx -> tuple( item, idx + 1 ) } + // https://stackoverflow.com/a/67084467/18557826 + lancet_beds_count = files( params.lancet_beds_directory+'/*.bed' ).size() + // bed file count is used in groupTuple size statements. + + // Applies scatter intervals from above to the BQSR bam file + lancet_calling_channel = ch_paired_samples.combine(lancet_beds) + LANCET(lancet_calling_channel) + + sort_merge_input_lancetVCF = LANCET.out.vcf + .groupTuple(size: lancet_beds_count) + .map { sampleID, vcf, meta, normal, tumor, tool -> tuple( sampleID, vcf, meta.unique()[0], normal.unique()[0], tumor.unique()[0], tool.unique()[0] ) } + + GATK_SORTVCF_LANCET(sort_merge_input_lancetVCF) + + // ** Manta - SV Calling + MANTA(ch_paired_samples) + // FilterNonpass can be used with `SelectVariants` and `--exclude-filtered`. However, hard filtering excluded for now. + + // ** Strelka2 - SNP/InDEL Calling + strekla2_input = ch_paired_samples.join(MANTA.out.manta_smallindel_vcf_tbi) + STRELKA2(strekla2_input) + + // ** Gridss - SV Calling + GRIDSS_PREPROCESS(ch_paired_samples) + gridss_assemble_input = ch_paired_samples.join(GRIDSS_PREPROCESS.out.gridss_preproc) + GRIDSS_ASSEMBLE(gridss_assemble_input) + gridss_call_input = ch_paired_samples.join(GRIDSS_ASSEMBLE.out.gridss_assembly) + GRIDSS_CALLING(gridss_call_input) + GRIDSS_CHROM_FILTER(GRIDSS_CALLING.out.gridss_vcf, chrom_list) + GRIPSS_SOMATIC_FILTER(GRIDSS_CHROM_FILTER.out.gridss_chrom_vcf) + // NOTE: this filtering tool is hard coded for GRCh38 based on PON naming. + + // ** BicSeq2 - CNV Calling + /* This step does not run on unpaired samples. + CNV of tumor samples against an unrelated normal will produce spurious results. + NA12878 paired samples can be filtered from ch_normal_samples and ch_tumor_samples channels at their creation. + */ + SAMTOOLS_STATS_INSERTSIZE_NORMAL(ch_normal_samples) + SAMTOOLS_STATS_INSERTSIZE_TUMOR(ch_tumor_samples.mix(ch_tumor_only)) + + SAMTOOLS_FILTER_UNIQUE_NORMAL(ch_normal_samples, chrom_list) + SAMTOOLS_FILTER_UNIQUE_TUMOR(ch_tumor_samples.mix(ch_tumor_only), chrom_list) + + biqseq_norm_input_normal = SAMTOOLS_FILTER_UNIQUE_NORMAL.out.uniq_seq.join(SAMTOOLS_STATS_INSERTSIZE_NORMAL.out.read_length_insert_size) + // sampleID, individual_chr_seq_files, meta, read_ID, read_length, insert_size. + biqseq_norm_input_tumor = SAMTOOLS_FILTER_UNIQUE_TUMOR.out.uniq_seq.join(SAMTOOLS_STATS_INSERTSIZE_TUMOR.out.read_length_insert_size) + // sampleID, individual_chr_seq_files, meta, read_ID, read_length, insert_size. + + fasta_files = Channel.fromPath( file(params.ref_fa).parent + '/*_chr*' ) + .collect() + // collect individual chr fasta files. 
These are located in the same directory as the main reference.
+ // If the extension of `name_chr#.fa` changes, this match will break.
+
+ BICSEQ2_NORMALIZE_NORMAL(biqseq_norm_input_normal, fasta_files)
+ BICSEQ2_NORMALIZE_TUMOR(biqseq_norm_input_tumor, fasta_files)
+ // note: this cannot be split by chromosome; even though bicseq2 norm acts on chromosomes in turn,
+ // it needs all chromosomes to parameterize the normalization.
+ // The reported error in such cases is: "Error in bin_read: bin file is in incorrect format."
+
+ bicseq_normal = BICSEQ2_NORMALIZE_NORMAL.out.normalized_output
+ .map{it -> [it[2].patient, it[1], it[2], it[3]]}
+
+ bicseq_tumor = BICSEQ2_NORMALIZE_TUMOR.out.normalized_output
+ .map{it -> [it[2].patient, it[1], it[2], it[3]]}
+
+ bicseq2_seg_input = bicseq_normal.cross(bicseq_tumor)
+ .map{normal, tumor ->
+ def meta = [:]
+ meta.patient = normal[2].patient
+ meta.normal_id = normal[2].sampleID
+ meta.tumor_id = tumor[2].sampleID
+ meta.sex = normal[2].sex
+ meta.id = "${tumor[2].patient}--${tumor[2].tumor_id}--${tumor[2].normal_id}".toString()
+
+ ["${tumor[2].patient}--${tumor[2].tumor_id}--${tumor[2].normal_id}".toString(), normal[1], tumor[1], normal[2], normal[3], tumor[3]]}
+ // sampleID, individual_normal_norm_bin_files, individual_tumor_norm_bin_files, metadata, norm_readID, tumor_readID.
+ // The metadata object here is reset following the cross, so that the ID matches up again.
+ // It is possible that in many-to-many or one-to-many crosses, the ID field will not reflect the crossed samples.
+
+ BICSEQ2_SEG(bicseq2_seg_input)
+ // NOTE: with insufficient coverage, the segmentation will fail because the 'lambda' factor cannot be properly optimized.
+
+ bicseq2_tumoronly_input = BICSEQ2_NORMALIZE_TUMOR.out.normalized_output
+ .filter{it[2].normal_id == 'NA12878'}
+
+ BICSEQ2_SEG_UNPAIRED(bicseq2_tumoronly_input)
+
+ bicseq2_calls = BICSEQ2_SEG_UNPAIRED.out.bicseq2_sv_calls
+ .map{it -> [it[3].id, it[1], it[2], it[3], it[4], it[5], it[6]]}
+ .mix(BICSEQ2_SEG.out.bicseq2_sv_calls)
+ // Remap the output from unpaired bicseq2 to the standard format of paired bicseq2, and mix both channel outputs. This is passed to annotation.
+
+ // Step 15: MSI
+ MSISENSOR2_MSI(ch_msisensor2_input)
+
+ /*
+ The following are the harmonized output channels for each tool:
+
+ Manta
+ MANTA.out.manta_somaticsv_tbi
+
+ Strelka_SNV
+ STRELKA2.out.strelka_snv_vcf_tbi
+
+ Strelka_INDEL
+ STRELKA2.out.strelka_indel_vcf_tbi
+
+ Mutect2
+ GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi
+
+ Lancet
+ GATK_SORTVCF_LANCET.out.lancet_vcf
+
+ Gridss
+ GRIPSS_SOMATIC_FILTER.out.gripss_filtered_bgz
+
+ Bicseq2
+ BICSEQ2_SEG.out.bicseq2_sv_calls
+ */
+
+ /*
+ NOTE:
+ The call merging and annotation sections of this workflow become highly complex.
+ Files from each caller are passed through a set of 'merge prep' steps.
+ These steps apply various functions to manipulate the VCF header, and also the calls within the VCFs.
+ Once the VCFs are prepared, a merge occurs. Following the merge, non-exonic regions are parsed out,
+ and calls in those regions are passed to Lancet for confirmation/rescue.
+ Following this, confirmed calls are used as 'support' and merged back into the full caller call set.
+ Additional manipulations are done on the VCF, and then the 'final' VCF
+ is passed through to the annotation steps. Additional and different annotations are done on SV and CNV
+ calls. The steps are commented to facilitate understanding of what is being done.
+ */ + + somatic_caller_concat = MANTA.out.manta_somaticsv_tbi.concat( STRELKA2.out.strelka_snv_vcf_tbi, + STRELKA2.out.strelka_indel_vcf_tbi, + GATK_FILTERMUECTCALLS.out.mutect2_vcf_tbi, + GATK_SORTVCF_LANCET.out.vcf_tbi ) + + // Merge prep: + // 1. Rename VCF header to include tool name: + RENAME_METADATA(somatic_caller_concat) + + // 2. Order samples in VCF to 'normal', 'tumor' and prep for merge. + // See script for list of changes applied to the VCF: + MERGE_PREP(RENAME_METADATA.out.rename_metadata_vcf) + + // 3. Rename VCF header to specfied 'normal' and 'tumor' names, add tool prefix to sampleIDs. + RENAME_VCF(MERGE_PREP.out.merge_prep_vcf) + + // 4. Compress and Index VCF: + COMPRESS_INDEX_VCF(RENAME_VCF.out.rename_vcf) + + // 5. Split out multi-allelic calls: + BCFTOOLS_SPLITMULTIALLELIC(COMPRESS_INDEX_VCF.out.compressed_vcf_tbi) + + // 6. Split MNV calls: + SPLIT_MNV(BCFTOOLS_SPLITMULTIALLELIC.out.vcf) + + // 7. Sort VCF: + GATK_SORTVCF_TOOLS(SPLIT_MNV.out.split_mnv_vcf) + + callers_for_merge = GATK_SORTVCF_TOOLS.out.vcf_tbi + .groupTuple(size: 5) + .map{sampleID, vcf, idx, meta, normal_sample, tumor_sample, tool_list -> tuple( sampleID, vcf, idx, meta.unique()[0] ) } + .combine(chrom_list_noY.flatten()) + // The above collects all callers on sampleID, then maps to avoid duplication of data and to drop the tool list, which is not needed anymore. + // Note that this could be done using 'by' in the groupTuple statement. However, the map is still required to remove the tool list. + // 'size: 5' corresponds to the 5 callers used in the workflow. If additional callers are added, this must be changed. + + // Merge Callers, Extract non-exonic calls and try to confirm those with Lancet, + // then prep confirmed calls for merged back to full merge set: + + // ** Make all caller merge set, and compress and index: + BCFTOOLS_MERGECALLERS(callers_for_merge) + COMPRESS_INDEX_VCF_ALL_CALLERS(BCFTOOLS_MERGECALLERS.out.vcf) + + // ** Extract non-exonic, and try to confirm with Lancet. + // 1. Intersect with '-v' against a list of exonic regions. This step subsets calls to non-exonic regions. + BEDTOOLS_STARTCANDIDATES(BCFTOOLS_MERGECALLERS.out.vcf) + + // 2. Get candidates from intersected, using rules outlined in get_candidates.py (script docs provided by original dev). + // Compress and index the resulting VCF. + GET_CANDIDATES(BEDTOOLS_STARTCANDIDATES.out.vcf) + COMPRESS_INDEX_VCF_REGION(GET_CANDIDATES.out.vcf) + + // 3. VCF to BED + VCF_TO_BED(GET_CANDIDATES.out.vcf) + + // 4. Confirm extracted calls with Lancet: + // Compress and index the resulting VCF. + lancet_confirm_input = VCF_TO_BED.out.bed + .combine(ch_paired_samples, by: 0) + .map{sampleID, bed, meta, chrom, meta2, normal_bam, normal_bai, normal_name, tumor_bam, tumor_bai, tumor_name -> tuple( sampleID, bed, meta, normal_bam, normal_bai, normal_name, tumor_bam, tumor_bai, tumor_name, chrom ) } + // The above combines output by sampleID with BAM files. Then maps to avoid duplication of data, and set input tuples for the steps that follow. + // Note that "combine" here, combines each output stream from VCF_TO_BED with ch_paired_samples, keeping the scattered chrom seperate. + + LANCET_CONFIRM(lancet_confirm_input) + COMPRESS_INDEX_VCF_REGION_LANCET(LANCET_CONFIRM.out.vcf) + + // 5. Intersect Lancet Confirm with candidate extractions. 
+ candidate_lancet_intersect_input = COMPRESS_INDEX_VCF_REGION.out.compressed_vcf_tbi + .join(COMPRESS_INDEX_VCF_REGION_LANCET.out.compressed_vcf_tbi, by: [0,6]) + .map{sampleID, chrom, vcf, tbi, meta, empty_name, empty_name2, vcf2, tbi2, meta2, normal_name, tumor_name -> tuple( sampleID, vcf, tbi, vcf2, tbi2, meta, normal_name, tumor_name, chrom )} + // The above joins candidate VCF with Lancet Confirm VCF by sampleID and chrom. Then maps to avoid duplication of data, and set input tuples for the steps that follow. + // Note: A. The 'by' statement here, joins on sampleID and chrom, which correspond to index values 0 and 6 in the output tuples. + // B. 'empty_name' is used here because 'normal_name' and 'tumor_name' are not required/used in the candidate steps. + // C. 'normal_name' and 'tumor_name' are needed to match input tuple expectations for teh steps that follow. + + BCFTOOLS_INTERSECTVCFS(candidate_lancet_intersect_input) + + lancet_confirm_mergePrep_input = BCFTOOLS_INTERSECTVCFS.out.vcf.map{sampleID, vcf, index, meta, normal_name, tumor_name -> tuple(sampleID, vcf, index, meta, normal_name, tumor_name, 'lancet_support')} + // The above remaps the output tuple from BCFTOOLS_INTERSECTVCF to include the tool name 'lancet', which is needed for the steps that follow. + // 'lancet_support' is used to trigger `--support` in the MERGE_PREP_LANCET statement. Logic is present in RENAME_VCF_LANCET to set the header to 'lancet' rather than 'lancet_support' + + // ** Prep calls for merge back to all caller merge set. + // 1. Rename VCF header to include tool name: + RENAME_METADATA_LANCET(lancet_confirm_mergePrep_input) + + // 2. Order samples in VCF to 'normal', 'tumor' and prep for merge. + // See script for list of changes applied to the VCF: + // This step is done as `--support` + MERGE_PREP_LANCET(RENAME_METADATA_LANCET.out.rename_metadata_vcf) + + // 3. Rename VCF header to specfied 'normal' and 'tumor' names, add tool prefix to sampleIDs. + RENAME_VCF_LANCET(MERGE_PREP_LANCET.out.merge_prep_vcf) + + // 4. Compress and Index VCF: + COMPRESS_INDEX_VCF_LANCET(RENAME_VCF_LANCET.out.rename_vcf) + + // 5. Split out multi-allelic calls: + BCFTOOLS_SPLITMULTIALLELIC_LANCET(COMPRESS_INDEX_VCF_LANCET.out.compressed_vcf_tbi) + + // 6. Split MNV calls: + SPLIT_MNV_LANCET(BCFTOOLS_SPLITMULTIALLELIC_LANCET.out.vcf) + + // 7. Remove contig descriptions: + REMOVE_CONTIG(SPLIT_MNV_LANCET.out.split_mnv_vcf) + + // 8. Sort VCF. + GATK_SORTVCF_TOOLS_LANCET(REMOVE_CONTIG.out.remove_contig_vcf) + + // ** Merge lancet confirmed back to all merged callers. Compress and index merged calls. + allCalls_lancetConfirm_merge_input = COMPRESS_INDEX_VCF_ALL_CALLERS.out.compressed_vcf_tbi + .join(GATK_SORTVCF_TOOLS_LANCET.out.vcf_tbi, by: [0,6]) + .map{sampleID, chrom, vcf, tbi, meta, empty_name, empty_name2, vcf2, tbi2, meta2, normal_name, tumor_name -> tuple( sampleID, [vcf, vcf2], [tbi, tbi2], meta, chrom )} + // BCFTOOLS_MERGE Requires an input tuple as follows: [val(sampleID), file(vcf), file(idx), val(meta), val(chrom)] + // Join the output streams on sampleID and chrom, and then map to the require tuple structure. Note that [vcf, vcf2] makes a list that is understoon by the module. + + BCFTOOLS_MERGECALLERS_FINAL(allCalls_lancetConfirm_merge_input) + COMPRESS_INDEX_VCF_MERGED(BCFTOOLS_MERGECALLERS_FINAL.out.vcf) + + // ** Manipulation of VCF into final file to be passed to annotation modules. + // 1. Merge Columns. 
+ // See script merge_columns.py for the three features used in merge (script docs provided by original dev). + MERGE_COLUMNS(COMPRESS_INDEX_VCF_MERGED.out.compressed_vcf_tbi) + + // 2. Add Allele Count to VCF. + // "Runs pileup on tumor and normal bam files to compute allele counts for bi-allelic SNV and Indel variants in VCF file and adds pileup format columns to the VCF file."" + addAlleleCounts_confirm_input = MERGE_COLUMNS.out.mergeColumn_vcf + .combine(ch_paired_samples, by: 0) + .map{sampleID, vcf, meta, chrom, meta2, normal_bam, normal_bai, normal_name, tumor_bam, tumor_bai, tumor_name -> tuple( sampleID, vcf, meta, normal_bam, normal_bai, tumor_bam, tumor_bai, chrom ) } + ADD_NYGC_ALLELE_COUNTS(addAlleleCounts_confirm_input) + + // 3. Add Final Allele Counts to VCF + ADD_FINAL_ALLELE_COUNTS(ADD_NYGC_ALLELE_COUNTS.out.vcf) + + // 4. Filter VCF based on PON + FILTER_PON(ADD_FINAL_ALLELE_COUNTS.out.vcf) + + // 5. Filter VCF based on gnomad and "ALL_GRCh38_sites" + FILTER_VCF(FILTER_PON.out.vcf) + + // 6. "SnvstomnvsCountsbasedfilterAnnotatehighconf" + // Parses file and converts adjacent SNVs to MNVs if they have they match the MNV_ID and called_by fields. + SNV_TO_MNV_FINAL_FILTER(FILTER_VCF.out.vcf) + + // ** Collect and Merge Chroms. + num_intervals = file(params.chrom_contigs).countLines().toInteger() - 2 + // number of chrom intervals split on during the above steps. A 'value' variable used in groupTuple size statement. MT and Y are removed, hence '- 2' + chrom_merge_input = SNV_TO_MNV_FINAL_FILTER.out.vcf + .groupTuple(size: num_intervals) + .map{sampleID, vcf, meta, chrom -> tuple( sampleID, vcf, meta.unique()[0] ) } + // Collect scattered chroms, remap to tuple without chrom names. + + GATK_SORTVCF_SOMATIC(chrom_merge_input) + REORDER_VCF_COLUMNS(GATK_SORTVCF_SOMATIC.out.vcf_idx) + // output tuple = val(sampleID), path("*_mergePrep.vcf"), val(meta). + // meta = [patient:test, normal_id:test, tumor_id:test2, sex:XX, id:test2_vs_test] + // This named list can be accessed in the script section prior to """ via calls like: meta.patient + + // Compress and index the merged vcf + COMPRESS_INDEX_MERGED_VCF(REORDER_VCF_COLUMNS.out.vcf) + + // ** Annotation of somatic indels and snps + + VEP_SOMATIC(COMPRESS_INDEX_MERGED_VCF.out.compressed_vcf_tbi) + COSMIC_ANNOTATION_SOMATIC(VEP_SOMATIC.out.vcf) + COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC(COSMIC_ANNOTATION_SOMATIC.out.vcf) + + SNPSIFT_ANNOTATE_DBSNP_SOMATIC(COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC.out.vcf.map{it -> [it[0], it[1]]}, params.dbSNP, params.dbSNP_index, 'intermediate') + // note: existing module requires only sampleID and VCF. input remapped to required tuple. + + somatic_finalization_input = SNPSIFT_ANNOTATE_DBSNP_SOMATIC.out.vcf.join(COSMIC_CANCER_RESISTANCE_MUTATION_SOMATIC.out.vcf).map{it -> [it[0], it[1], it[3], it[4], it[5]]} + // re-join dbSNP ID annotated VCF output with [meta], normalID, tumorID. + + SOMATIC_VCF_FINALIZATION(somatic_finalization_input, 'filtered') + + // ** Annotation of somatic CNV and SV + + ANNOTATE_BICSEQ2_CNV(bicseq2_calls, chrom_list_noY) + + // note: joining on the sampleID, metadata, tumor_name, and normal_name for + // safety. 
This re-arranges the values in the channel to: + // tuple val(sampleID), val(normal_name), val(tumor_name), file(manta_vcf), file(manta_vcf_tbi), val(meta_manta), val(manta), file(gridss_bgz), val(no_idx), val(meta_gripss), val(gridss) + // Downstream, just including sampleID, normal_name, and tumor_name to simplify a similar join that is necessary + + merge_sv_input = MANTA.out.manta_somaticsv_tbi.join(GRIPSS_SOMATIC_FILTER.out.gripss_filtered_bgz, by : [0,4,5]) + MERGE_SV(merge_sv_input, chrom_list) + + ANNOTATE_SV(MERGE_SV.out.merged, "main") + ANNOTATE_SV_SUPPLEMENTAL(MERGE_SV.out.merged_suppl, "supplemental") + ANNOTATE_GENES_SV(ANNOTATE_SV.out.annot_sv_bedpe, "main") + ANNOTATE_GENES_SV_SUPPLEMENTAL(ANNOTATE_SV_SUPPLEMENTAL.out.annot_sv_bedpe, "supplemental") + + // note: joining on the sampleID, normal_name, and tumor_name for + // safety. This re-arranges the values in the channel to: + // tuple val(sampleID), val(normal_name), val(tumor_name), file(bicseq_annot), file(annot_sv_genes_bedpe) + + annot_sv_cnv_input = ANNOTATE_BICSEQ2_CNV.out.bicseq_annot.join(ANNOTATE_GENES_SV.out.annot_sv_genes_bedpe, by: [0,2,3]) + ANNOTATE_SV_WITH_CNV(annot_sv_cnv_input, "main") + + // See notes on previous step + annot_sv_cnv_suppl_input = ANNOTATE_BICSEQ2_CNV.out.bicseq_annot.join(ANNOTATE_GENES_SV_SUPPLEMENTAL.out.annot_sv_genes_bedpe, by: [0,2,3]) + ANNOTATE_SV_WITH_CNV_SUPPLEMENTAL(annot_sv_cnv_suppl_input, "supplemental") + + FILTER_BEDPE(ANNOTATE_SV_WITH_CNV.out.sv_genes_cnv_bedpe, "main") + FILTER_BEDPE_SUPPLEMENTAL(ANNOTATE_SV_WITH_CNV_SUPPLEMENTAL.out.sv_genes_cnv_bedpe, "supplemental") + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(GATK_BASERECALIBRATOR.out.table.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTWGSMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CONPAIR.out.concordance.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CONPAIR.out.contamination.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + +} + +// Function to extract information (meta data + file(s)) from csv file(s) +// https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L1084 +def extract_csv(csv_file) { + + // check that the sample sheet is not 1 line or less, because it'll skip all subsequent checks if so. + file(csv_file).withReader('UTF-8') { reader -> + def line, numberOfLinesInSampleSheet = 0; + while ((line = reader.readLine()) != null) {numberOfLinesInSampleSheet++} + if (numberOfLinesInSampleSheet < 2) { + log.error "Samplesheet had less than two lines. The sample sheet must be a csv file with a header, so at least two lines." + System.exit(1) + } + } + + // Additional check of sample sheet: + // 1. Each row should specify a lane and the same combination of patient, sample and lane shouldn't be present in different rows. + // 2. The same sample shouldn't be listed for different patients. 
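// ---------------------------------------------------------------------------
// [Editor's aside -- assumed example, not part of this diff] A minimal sample
// sheet that would satisfy the checks described above could look like the lines
// below. The column names are inferred from the parsing code that follows
// (patient, sampleID, status, sex, fastq_1, fastq_2); the 'lane' column is shown
// only because the comment above mentions lanes. Treat the whole layout as an
// illustrative example rather than a documented template.
//
//   patient,sampleID,lane,status,sex,fastq_1,fastq_2
//   pt1,pt1_normal,L001,0,XX,/data/pt1_normal_L001_R1.fastq.gz,/data/pt1_normal_L001_R2.fastq.gz
//   pt1,pt1_tumor,L001,1,XX,/data/pt1_tumor_L001_R1.fastq.gz,/data/pt1_tumor_L001_R2.fastq.gz
//
// Here status 0/1 marks normal/tumor, no sampleID maps to two different patients,
// and no (patient, sampleID, lane) combination repeats.
// ---------------------------------------------------------------------------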
+ def patient_sample_lane_combinations_in_samplesheet = [] + def sample2patient = [:] + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!sample2patient.containsKey(row.sampleID.toString())) { + sample2patient[row.sampleID.toString()] = row.patient.toString() + } else if (sample2patient[row.sampleID.toString()] != row.patient.toString()) { + log.error('The sample "' + row.sampleID.toString() + '" is registered for both patient "' + row.patient.toString() + '" and "' + sample2patient[row.sampleID.toString()] + '" in the sample sheet.') + System.exit(1) + } + } + + sample_count_all = 0 + sample_count_normal = 0 + sample_count_tumor = 0 + + Channel.from(csv_file).splitCsv(header: true) + //Retrieves number of lanes by grouping together by patient and sample and counting how many entries there are for this combination + .map{ row -> + sample_count_all++ + if (!(row.patient && row.sampleID)){ + log.error "Missing field in csv file header. The csv file must have fields named 'patient' and 'sampleID'." + System.exit(1) + } + [[row.patient.toString(), row.sampleID.toString()], row] + }.groupTuple() + .map{ meta, rows -> + size = rows.size() + [rows, size] + }.transpose() + .map{ row, numLanes -> //from here do the usual thing for csv parsing + + def meta = [:] + + // Meta data to identify samplesheet + // Both patient and sample are mandatory + // Several sample can belong to the same patient + // Sample should be unique for the patient + if (row.patient) meta.patient = row.patient.toString() + if (row.sampleID) meta.sampleID = row.sampleID.toString() + + // If no sex specified, sex is not considered + // sex is only mandatory for somatic CNV + if (row.sex) meta.sex = row.sex.toString() + else meta.sex = 'NA' + + // If no status specified, sample is assumed normal + if (row.status) meta.status = row.status.toInteger() + else meta.status = 0 + + if (meta.status == 0) sample_count_normal++ + else sample_count_tumor++ + + // join meta to fastq + if (row.fastq_2) { + meta.id = "${row.patient}--${row.sampleID}".toString() + def fastq_1 = file(row.fastq_1, checkIfExists: true) + def fastq_2 = file(row.fastq_2, checkIfExists: true) + + meta.size = 1 // default number of splitted fastq + + return [meta.id, meta, [fastq_1, fastq_2]] + + } else { + log.error "Missing or unknown field in csv file header. 
Please check your samplesheet" + System.exit(1) + } + } +} \ No newline at end of file diff --git a/workflows/rna_fusion.nf b/workflows/rna_fusion.nf new file mode 100644 index 00000000..776f2361 --- /dev/null +++ b/workflows/rna_fusion.nf @@ -0,0 +1,170 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// import modules +include {help} from "${projectDir}/bin/help/rna_fusion.nf" +include {param_log} from "${projectDir}/bin/log/rna_fusion.nf" +include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" +include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" +include {GUNZIP} from "${projectDir}/modules/utility_modules/gunzip" +include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome" +include {FASTQ_SORT as FASTQ_SORT_HUMAN; + FASTQ_SORT as FASTQ_SORT_MOUSE} from "${projectDir}/modules/fastq-tools/fastq-sort" +include {STAR_ALIGN as STAR_ARRIBA; + STAR_ALIGN as STAR_SQUID; + STAR_ALIGN as STAR_STARFUSION} from "${projectDir}/modules/star/star_align" +include {SAMTOOLS_SORT as SORT_ARRIBA; + SAMTOOLS_SORT as SORT_SQUID} from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_INDEX as INDEX_ARRIBA} from "${projectDir}/modules/samtools/samtools_index" +include {ARRIBA} from "${projectDir}/modules/arriba/arriba" +include {FUSIONCATCHER} from "${projectDir}/modules/fusioncatcher/fusioncatcher" +include {JAFFA} from "${projectDir}/modules/jaffa/jaffa" +include {KALLISTO_QUANT} from "${projectDir}/modules/kallisto/kallisto_quant" +include {KALLISTO_INSERT_SIZE} from "${projectDir}/modules/kallisto/kallisto_insert_size" +include {PIZZLY} from "${projectDir}/modules/pizzly/pizzly" +include {SQUID} from "${projectDir}/modules/squid/squid_call" +include {SQUID_ANNOTATE} from "${projectDir}/modules/squid/squid_annotate" +include {SAMTOOLS_VIEW as SAMTOOLS_VIEW_SQUID} from "${projectDir}/modules/samtools/samtools_view" +include {STAR_FUSION as STAR_FUSION} from "${projectDir}/modules/star-fusion/star-fusion" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {FUSION_REPORT} from "${projectDir}/modules/fusion_report/fusion_report" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + +// log params +param_log() + +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +if (params.pdx && params.gen_org == 'mouse') { + exit 1, "PDX analysis was specified with `--pdx`. `--gen_org` was set to: ${params.gen_org}. This is an invalid parameter combination. `--gen_org` must == 'human' for PDX analysis." +} + +if (params.gen_org == 'mouse') { + exit 1, "This pipeline currently only supports human data analysis." +} + +if (params.read_type == 'SE') { + exit 1, "This pipeline supports only paired end data." 
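// ---------------------------------------------------------------------------
// [Editor's aside -- assumed example, not part of this diff] Taken together, the
// checks above mean an rna_fusion run must be human, paired-end, and must pair
// '--download_data' with a '--csv_input' manifest. A hypothetical invocation using
// only flags referenced in this file (the entry script, profile, and any
// pipeline-selection option are placeholders -- see '--help' for the real usage):
//
//   nextflow run <entry.nf> -profile <profile> \
//       --gen_org human --read_type PE \
//       --csv_input samples.csv --download_data --pdx
//
// ---------------------------------------------------------------------------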
+} + +// prepare reads channel +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + +} else if (params.concat_lanes){ + + read_ch = Channel + .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) + .map { file, file1, file2 -> tuple(getLibraryId(file), file1, file2) } + .groupTuple() + + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} else { + + read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) + + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + +} + +// main workflow +workflow RNA_FUSION { + + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 0: Concat local Fastq files if required. + if (params.concat_lanes && !params.csv_input){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } + + GUNZIP(read_ch) + + FASTQC(GUNZIP.out.gunzip_fastq) + + // Step 1a: Xenome if PDX data used. + ch_XENOME_CLASSIFY_multiqc = Channel.empty() //optional log file. + if (params.pdx){ + // Xenome Classification + XENOME_CLASSIFY(GUNZIP.out.gunzip_fastq) + ch_XENOME_CLASSIFY_multiqc = XENOME_CLASSIFY.out.xenome_stats //set log file for multiqc + + // Xenome Read Sort + FASTQ_SORT_HUMAN(XENOME_CLASSIFY.out.xenome_fastq, 'human') + FASTQ_SORT_MOUSE(XENOME_CLASSIFY.out.xenome_mouse_fastq, 'mouse') + fusion_tool_input = FASTQ_SORT_HUMAN.out.sorted_fastq + + } else { + fusion_tool_input = GUNZIP.out.gunzip_fastq + } + + // Step 3: Callers: + // arriba + STAR_ARRIBA(fusion_tool_input, params.arriba_star_args, params.gencode_gtf) + SORT_ARRIBA(STAR_ARRIBA.out.bam, '-O bam', 'bam') + INDEX_ARRIBA(SORT_ARRIBA.out.sorted_file) + arriba_input = SORT_ARRIBA.out.sorted_file.join(INDEX_ARRIBA.out.bai) + ARRIBA(arriba_input, params.gencode_gtf) + + // fusioncatcher + FUSIONCATCHER(fusion_tool_input) + + // jaffa + JAFFA(fusion_tool_input) + + // pizzly + KALLISTO_QUANT(fusion_tool_input) + KALLISTO_INSERT_SIZE(KALLISTO_QUANT.out.kallisto_abundance) + pizzly_input = KALLISTO_QUANT.out.kallisto_fusions.join(KALLISTO_INSERT_SIZE.out.kallisto_insert_size) + PIZZLY(pizzly_input, params.ensembl_gtf) + + // squid + STAR_SQUID(fusion_tool_input, params.squid_star_args, params.gencode_gtf) + SAMTOOLS_VIEW_SQUID(STAR_SQUID.out.sam, '-Sb', '_chimeric') // NOTE: The sam file from STAR_SQUID contains chimeric reads. Per STAR passed arguments. 
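// ---------------------------------------------------------------------------
// [Editor's aside -- illustrative sketch, not part of this diff] The 'concat_lanes'
// branch near the top of this workflow relies on getLibraryId()
// (bin/shared/getLibraryId.nf, not shown in this diff) to collapse per-lane FASTQ
// pairs onto one library ID before concatenation. Assuming getLibraryId strips a
// lane suffix such as '_L001', the pattern behaves roughly like this:
def toy_lanes = Channel.of(
    ['libA_L001', '/data/libA_L001_R1.fastq.gz', '/data/libA_L001_R2.fastq.gz'],
    ['libA_L002', '/data/libA_L002_R1.fastq.gz', '/data/libA_L002_R2.fastq.gz'] )

toy_lanes
    .map{ id, r1, r2 -> tuple( id.replaceAll(/_L\d+$/, ''), r1, r2 ) } // stand-in for getLibraryId(id)
    .groupTuple()                                                      // -> [libA, [both R1s], [both R2s]]
    .view()
// ---------------------------------------------------------------------------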
+ SORT_SQUID(SAMTOOLS_VIEW_SQUID.out.bam, '-O bam', 'bam') + squid_input = STAR_SQUID.out.bam_sorted.join(SORT_SQUID.out.sorted_file ) + SQUID(squid_input) + SQUID_ANNOTATE(SQUID.out.squid_fusions, params.gencode_gtf) + + // star-fusion + STAR_FUSION(fusion_tool_input) + + // Step 4: Fusion Reporter + fusion_report_input = ARRIBA.out.arriba_fusions.join(FUSIONCATCHER.out.fusioncatcher_fusions).join(JAFFA.out.jaffa_fusions).join(PIZZLY.out.pizzly_fusions).join(SQUID_ANNOTATE.out.squid_fusions_annotated).join(STAR_FUSION.out.star_fusion_fusions) + FUSION_REPORT(fusion_report_input) + + // Step 5: MultiQC + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(FUSION_REPORT.out.summary_fusions_mq.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_XENOME_CLASSIFY_multiqc.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) +} diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf index a782f292..da0a6543 100644 --- a/workflows/rnaseq.nf +++ b/workflows/rnaseq.nf @@ -5,17 +5,24 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/rnaseq" include {param_log} from "${projectDir}/bin/log/rnaseq" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {GET_READ_LENGTH} from "${projectDir}/modules/utility_modules/get_read_length" +include {PDX_RNASEQ} from "${projectDir}/subworkflows/pdx_rnaseq" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {CHECK_STRANDEDNESS} from "${projectDir}/modules/python/python_check_strandedness" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" -include {RNA_SUMMARY_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_rna" -include {BAMTOOLS_STATS} from "${projectDir}/modules/bamtools/bamtools_stats" include {RSEM_ALIGNMENT_EXPRESSION} from "${projectDir}/modules/rsem/rsem_alignment_expression" -include {QUALITY_STATISTICS} from "${projectDir}/modules/utility_modules/quality_stats" include {PICARD_ADDORREPLACEREADGROUPS} from "${projectDir}/modules/picard/picard_addorreplacereadgroups" include {PICARD_REORDERSAM} from "${projectDir}/modules/picard/picard_reordersam" -include {PICARD_COLLECTRNASEQMETRICS} from "${projectDir}/modules/picard/picard_collectrnaseqmetrics" include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_COLLECTRNASEQMETRICS} from "${projectDir}/modules/picard/picard_collectrnaseqmetrics" +include {RNA_SUMMARY_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_rna" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" // help if needed if (params.help){ @@ -26,8 +33,29 @@ if (params.help){ // log paramiter info param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. 
This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + +if (params.pdx && params.gen_org == 'mouse') { + exit 1, "PDX analysis was specified with `--pdx`. `--gen_org` was set to: ${params.gen_org}. This is an invalid parameter combination. `--gen_org` must == 'human' for PDX analysis." +} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -40,65 +68,104 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - -// downstream resources (only load once so do it here) -if (params.rsem_aligner == "bowtie2") { - rsem_ref_files = file("${params.rsem_ref_files}/bowtie2/*") -} -else if (params.rsem_aligner == "star") { - rsem_ref_files = file("${params.rsem_ref_files}/STAR/${params.rsem_star_prefix}/*") -} -else error "${params.rsem_aligner} is not valid, use 'bowtie2' or 'star'" - // main workflow workflow RNASEQ { - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. 
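// ---------------------------------------------------------------------------
// [Editor's aside -- illustrative sketch, not part of this diff] The csv_input
// branch above indexes into whatever the shared extract_csv()
// (bin/shared/extract_csv.nf, not shown in this diff) emits; the it[2]/it[3]
// indexing implies flat tuples of the assumed shape [sampleID, meta, fastq_1, fastq_2]
// (with no fastq_2 element for SE data). A toy equivalent of the PE remapping:
def toy_rows = Channel.of( ['s1', [patient:'p1', sampleID:'s1'], 's1_R1.fastq.gz', 's1_R2.fastq.gz'] )

toy_rows.map{ it -> [ it[0], [ it[2], it[3] ] ] }.set{ toy_read_ch }   // PE: [sampleID, [R1, R2]]
toy_rows.map{ it -> [ it[0], it[1] ] }.set{ toy_meta_ch }              // [sampleID, meta]
// ---------------------------------------------------------------------------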
+ if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } } + + // ** MAIN workflow starts: - // Step 1: Qual_Stat - QUALITY_STATISTICS(read_ch) + // If samples are PDX, run the PDX RNAseq workflow. + // Otherwise, run the standard workflow. - // Step 2: RSEM - RSEM_ALIGNMENT_EXPRESSION(QUALITY_STATISTICS.out.trimmed_fastq, rsem_ref_files) + if (params.pdx){ + + PDX_RNASEQ(read_ch) - //Step 3: Get Read Group Information - READ_GROUPS(QUALITY_STATISTICS.out.trimmed_fastq, "picard") + } else { - // Step 4: Picard Alignment Metrics - add_replace_groups = READ_GROUPS.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION.out.bam) - PICARD_ADDORREPLACEREADGROUPS(add_replace_groups) + // Step 1: Qual_Stat + JAX_TRIMMER(read_ch) + + GET_READ_LENGTH(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) - PICARD_REORDERSAM(PICARD_ADDORREPLACEREADGROUPS.out.bam) + // Check strand setting + CHECK_STRANDEDNESS(JAX_TRIMMER.out.trimmed_fastq) - // Step 5: Picard Alignment Metrics - PICARD_SORTSAM(PICARD_REORDERSAM.out.bam) - // need to sort out ref_flat and ribo_intervals (may break mouse now) - PICARD_COLLECTRNASEQMETRICS(PICARD_SORTSAM.out.bam) + rsem_input = JAX_TRIMMER.out.trimmed_fastq.join(CHECK_STRANDEDNESS.out.strand_setting).join(GET_READ_LENGTH.out.read_length) - // Step 6: Summary Stats + // Step 2: RSEM + RSEM_ALIGNMENT_EXPRESSION(rsem_input, params.rsem_ref_files, params.rsem_star_prefix, params.rsem_ref_prefix) - agg_stats = RSEM_ALIGNMENT_EXPRESSION.out.rsem_stats.join(QUALITY_STATISTICS.out.quality_stats).join(PICARD_COLLECTRNASEQMETRICS.out.picard_metrics) + //Step 3: Get Read Group Information + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "picard") - RNA_SUMMARY_STATS(agg_stats) + // Step 4: Picard Alignment Metrics + add_replace_groups = READ_GROUPS.out.read_groups.join(RSEM_ALIGNMENT_EXPRESSION.out.bam) + PICARD_ADDORREPLACEREADGROUPS(add_replace_groups) + PICARD_REORDERSAM(PICARD_ADDORREPLACEREADGROUPS.out.bam, params.picard_dict) + + // Step 5: Picard Alignment Metrics + PICARD_SORTSAM(PICARD_REORDERSAM.out.bam) + + PICARD_COLLECTRNASEQMETRICS(PICARD_SORTSAM.out.bam.join(CHECK_STRANDEDNESS.out.strand_setting), params.ref_flat, params.ribo_intervals) + + // Step 6: Summary Stats + + agg_stats = RSEM_ALIGNMENT_EXPRESSION.out.rsem_stats.join(JAX_TRIMMER.out.quality_stats).join(PICARD_COLLECTRNASEQMETRICS.out.picard_metrics) + + RNA_SUMMARY_STATS(agg_stats) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(RSEM_ALIGNMENT_EXPRESSION.out.rsem_cnt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTRNASEQMETRICS.out.picard_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + } } diff --git a/workflows/rrbs.nf b/workflows/rrbs.nf index 13e391bb..849b4b17 100644 --- a/workflows/rrbs.nf +++ b/workflows/rrbs.nf @@ -4,11 +4,16 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/rrbs" include {param_log} from "${projectDir}/bin/log/rrbs" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" 
+include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {TRIM_GALORE} from "${projectDir}/modules/trim_galore/trim_galore" include {BISMARK_ALIGNMENT} from "${projectDir}/modules/bismark/bismark_alignment" +include {SAMTOOLS_SORT} from "${projectDir}/modules/samtools/samtools_sort" +include {SAMTOOLS_INDEX} from "${projectDir}/modules/samtools/samtools_index" include {BISMARK_DEDUPLICATION} from "${projectDir}/modules/bismark/bismark_deduplication" include {BISMARK_METHYLATION_EXTRACTION} from "${projectDir}/modules/bismark/bismark_methylation_extraction" include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" @@ -22,8 +27,25 @@ if (params.help){ // log paramiter info param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -36,38 +58,61 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - // main workflow workflow RRBS { + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) - // Step 0: Concatenate Fastq files if required. 
- if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} } + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + + // ** MAIN workflow starts: + FASTQC(read_ch) + // Note: fastqc is run prior to trimming, as trim galor outputs fastqc level data. TRIM_GALORE(read_ch) BISMARK_ALIGNMENT(TRIM_GALORE.out.trimmed_fastq) + SAMTOOLS_SORT(BISMARK_ALIGNMENT.out.bam, '-O bam', 'bam') + SAMTOOLS_INDEX(SAMTOOLS_SORT.out.sorted_file) + ch_BISMARK_DEDUPLICATION_multiqc = Channel.empty() if (params.skip_deduplication) { diff --git a/workflows/wes.nf b/workflows/wes.nf index 1644f84c..1be5c199 100755 --- a/workflows/wes.nf +++ b/workflows/wes.nf @@ -5,41 +5,47 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/wes.nf" include {param_log} from "${projectDir}/bin/log/wes.nf" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" -include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" -include {SAMTOOLS_INDEX} from "${projectDir}/modules/samtools/samtools_index" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" -include {QUALITY_STATISTICS} from "${projectDir}/modules/utility_modules/quality_stats" -include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wes" -include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; - SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; - SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; - SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" +include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" include {PICARD_COLLECTHSMETRICS} from 
"${projectDir}/modules/picard/picard_collecthsmetrics" +include {GATK_HAPLOTYPECALLER; + GATK_HAPLOTYPECALLER as GATK_HAPLOTYPECALLER_GVCF} from "${projectDir}/modules/gatk/gatk_haplotypecaller" +include {GATK_VARIANTFILTRATION; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" +include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; + GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" +include {GATK_MERGEVCF as GATK_MERGEVCF_UNANNOTATED; + GATK_MERGEVCF as GATK_MERGEVCF_ANNOTATED} from "${projectDir}/modules/gatk/gatk_mergevcf" +include {GATK_INDEXFEATUREFILE} from "${projectDir}/modules/gatk/gatk_indexfeaturefile" +include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_DBSNP; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; + SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" include {SNPEFF; SNPEFF as SNPEFF_SNP; SNPEFF as SNPEFF_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_snpeff" -include {SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; +include {SNPEFF_ONEPERLINE; + SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_oneperline" include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" include {SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_SNP; SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpsift_dbnsfp" -include {GATK_HAPLOTYPECALLER; - GATK_HAPLOTYPECALLER as GATK_HAPLOTYPECALLER_GVCF} from "${projectDir}/modules/gatk/gatk_haplotypecaller" -include {GATK_INDEXFEATUREFILE} from "${projectDir}/modules/gatk/gatk_indexfeaturefile" -include {GATK_VARIANTFILTRATION; - GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; - GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" -include {GATK_VARIANTANNOTATOR} from "${projectDir}/modules/gatk/gatk_variantannotator" -include {GATK_MERGEVCF} from "${projectDir}/modules/gatk/gatk_mergevcf" -include {GATK_SELECTVARIANTS; - GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; - GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" -include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" -include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wes" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" + // help if needed if (params.help){ @@ -50,8 +56,25 @@ if (params.help){ // log params param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." 
+} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -64,39 +87,60 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - // main workflow workflow WES { - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files from CSV input if required. + if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. 
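// ---------------------------------------------------------------------------
// [Editor's aside -- explanatory note, not part of this diff] The imports at the
// top of this file make heavy use of 'include { X as Y }' aliases. Each alias
// registers the same process under a different name, so one module can be invoked
// at several points with different arguments, as in the SNP/INDEL selection below:
//
//   include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP;
//            GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants"
//   ...
//   GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP', 'selected_SNP')
//   GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL', 'selected_INDEL')
//
// DSL2 allows a given process name to be called only once per workflow, which is
// why every reuse gets its own alias rather than a second call to the same name.
// ---------------------------------------------------------------------------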
+ if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } } // Step 1: Qual_Stat - QUALITY_STATISTICS(read_ch) + JAX_TRIMMER(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) // Step 2: Get Read Group Information - READ_GROUPS(QUALITY_STATISTICS.out.trimmed_fastq, "gatk") + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") // Step 3: BWA-MEM Alignment - bwa_mem_mapping = QUALITY_STATISTICS.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + bwa_mem_mapping = JAX_TRIMMER.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + BWA_MEM(bwa_mem_mapping) // Step 4: Variant Preprocessing - Part 1 @@ -104,10 +148,12 @@ workflow WES { PICARD_MARKDUPLICATES(PICARD_SORTSAM.out.bam) // If Human: Step 5-10 + ch_GATK_BASERECALIBRATOR_multiqc = Channel.empty() //optional log file for human only. if (params.gen_org=='human'){ // Step 5: Variant Pre-Processing - Part 2 GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) + ch_GATK_BASERECALIBRATOR_multiqc = GATK_BASERECALIBRATOR.out.table // set log file for multiqc apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) GATK_APPLYBQSR(apply_bqsr) @@ -120,20 +166,22 @@ workflow WES { haplotype_caller = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) GATK_HAPLOTYPECALLER(haplotype_caller, 'variant') - haplotype_caller_gvcf = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) - GATK_HAPLOTYPECALLER_GVCF(haplotype_caller_gvcf, 'gvcf') + if (params.run_gvcf) { + haplotype_caller_gvcf = GATK_APPLYBQSR.out.bam.join(GATK_APPLYBQSR.out.bai) + GATK_HAPLOTYPECALLER_GVCF(haplotype_caller_gvcf, 'gvcf') + } // Step 8: Variant Filtration // SNP select_var_snp = GATK_HAPLOTYPECALLER.out.vcf.join(GATK_HAPLOTYPECALLER.out.idx) - GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP') + GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP', 'selected_SNP') var_filter_snp = GATK_SELECTVARIANTS_SNP.out.vcf.join(GATK_SELECTVARIANTS_SNP.out.idx) GATK_VARIANTFILTRATION_SNP(var_filter_snp, 'SNP') // INDEL select_var_indel = GATK_HAPLOTYPECALLER.out.vcf.join(GATK_HAPLOTYPECALLER.out.idx) - GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL') + GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL', 'selected_INDEL') var_filter_indel = GATK_SELECTVARIANTS_INDEL.out.vcf.join(GATK_SELECTVARIANTS_INDEL.out.idx) GATK_VARIANTFILTRATION_INDEL(var_filter_indel, 'INDEL') @@ -154,10 +202,13 @@ workflow WES { SNPEFF_ONEPERLINE_INDEL(SNPSIFT_DBNSFP_INDEL.out.vcf, 'INDEL') // Step 10: Post Variant Calling Processing - Part 2 - vcf_files = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) - GATK_MERGEVCF(vcf_files) + vcf_files_unannotated = SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf) + GATK_MERGEVCF_UNANNOTATED (vcf_files_unannotated, 'SNP_INDEL_filtered_unannotated_final') + + vcf_files_annotated = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) + GATK_MERGEVCF_ANNOTATED(vcf_files_annotated, 'SNP_INDEL_filtered_annotated_final') - SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF.out.vcf) + SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF_ANNOTATED.out.vcf) } else if (params.gen_org=='mouse'){ @@ -165,28 +216,56 @@ workflow WES { collecths_metric = PICARD_MARKDUPLICATES.out.dedup_bam.join(PICARD_MARKDUPLICATES.out.dedup_bai) PICARD_COLLECTHSMETRICS(collecths_metric) - // Step 7: Variant Calling 
haplotype_caller = PICARD_MARKDUPLICATES.out.dedup_bam.join(PICARD_MARKDUPLICATES.out.dedup_bai) GATK_HAPLOTYPECALLER(haplotype_caller, 'variant') - + + if (params.run_gvcf) { + haplotype_caller_gvcf = PICARD_MARKDUPLICATES.out.dedup_bam.join(PICARD_MARKDUPLICATES.out.dedup_bai) + GATK_HAPLOTYPECALLER_GVCF(haplotype_caller_gvcf, 'gvcf') + } + // Step 8: Variant Filtration - var_filter = GATK_HAPLOTYPECALLER.out.vcf.join(GATK_HAPLOTYPECALLER.out.idx) + + SNPSIFT_ANNOTATE_DBSNP(GATK_HAPLOTYPECALLER.out.vcf, params.dbSNP, params.dbSNP_index, 'intermediate') + + GATK_INDEXFEATUREFILE(SNPSIFT_ANNOTATE_DBSNP.out.vcf) + + var_filter = SNPSIFT_ANNOTATE_DBSNP.out.vcf.join(GATK_INDEXFEATUREFILE.out.idx) + GATK_VARIANTFILTRATION(var_filter, 'BOTH') + // SNP for final save + select_var_snp = GATK_VARIANTFILTRATION.out.vcf.join(GATK_VARIANTFILTRATION.out.idx) + GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP', 'SNP_filtered_dbsnpID') + + // INDEL for final save + select_var_indel = GATK_VARIANTFILTRATION.out.vcf.join(GATK_VARIANTFILTRATION.out.idx) + GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL', 'INDEL_filtered_dbsnpID') + // Step 9: Post Variant Calling Processing - SNPEFF(GATK_VARIANTFILTRATION.out.vcf, 'BOTH', 'gatk') + SNPEFF(GATK_VARIANTFILTRATION.out.vcf, 'BOTH', 'vcf') - merged_vcf_files = GATK_VARIANTFILTRATION.out.vcf.join(SNPEFF.out.vcf) - GATK_VARIANTANNOTATOR(merged_vcf_files) + SNPEFF_ONEPERLINE(SNPEFF.out.vcf, 'BOTH') - SNPSIFT_EXTRACTFIELDS(GATK_VARIANTANNOTATOR.out.vcf) + SNPSIFT_EXTRACTFIELDS(SNPEFF_ONEPERLINE.out.vcf) } - agg_stats = QUALITY_STATISTICS.out.quality_stats.join(PICARD_COLLECTHSMETRICS.out.hsmetrics).join(PICARD_MARKDUPLICATES.out.dedup_metrics) + agg_stats = JAX_TRIMMER.out.quality_stats.join(PICARD_COLLECTHSMETRICS.out.hsmetrics).join(PICARD_MARKDUPLICATES.out.dedup_metrics) // Step 11: Aggregate Stats AGGREGATE_STATS(agg_stats) + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_GATK_BASERECALIBRATOR_multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTHSMETRICS.out.hsmetrics.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + } diff --git a/workflows/wgs.nf b/workflows/wgs.nf index b23a93b7..c6030269 100644 --- a/workflows/wgs.nf +++ b/workflows/wgs.nf @@ -5,42 +5,49 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/wgs.nf" include {param_log} from "${projectDir}/bin/log/wgs.nf" include {getLibraryId} from "${projectDir}/bin/shared/getLibraryId.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_csv.nf" +include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse" +include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files" include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE" include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE" +include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer" +include {FASTQC} from "${projectDir}/modules/fastqc/fastqc" +include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" include {BWA_MEM} from "${projectDir}/modules/bwa/bwa_mem" 
include {BWA_MEM_HLA} from "${projectDir}/modules/bwa/bwa_mem_hla" +include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" +include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" +include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" +include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" +include {PICARD_COLLECTALIGNMENTSUMMARYMETRICS} from "${projectDir}/modules/picard/picard_collectalignmentsummarymetrics" +include {PICARD_COLLECTWGSMETRICS} from "${projectDir}/modules/picard/picard_collectwgsmetrics" +include {GATK_HAPLOTYPECALLER_INTERVAL; + GATK_HAPLOTYPECALLER_INTERVAL as GATK_HAPLOTYPECALLER_INTERVAL_GVCF} from "${projectDir}/modules/gatk/gatk_haplotypecaller_interval" +include {MAKE_VCF_LIST} from "${projectDir}/modules/utility_modules/make_vcf_list" +include {GATK_MERGEVCF_LIST} from "${projectDir}/modules/gatk/gatk_mergevcf_list" +include {GATK_COMBINEGVCFS} from "${projectDir}/modules/gatk/gatk_combinegvcfs" +include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; + GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" +include {GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; + GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" +include {GATK_MERGEVCF; + GATK_MERGEVCF as GATK_MERGEVCF_UNANNOTATED; + GATK_MERGEVCF as GATK_MERGEVCF_ANNOTATED} from "${projectDir}/modules/gatk/gatk_mergevcf" include {SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_COSMIC; SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_COSMIC; SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_SNP_DBSNP; SNPSIFT_ANNOTATE as SNPSIFT_ANNOTATE_INDEL_DBSNP} from "${projectDir}/modules/snpeff_snpsift/snpsift_annotate" -include {VCF_ANNOTATE as VCF_ANNOTATE_SNP; - VCF_ANNOTATE as VCF_ANNOTATE_INDEL} from "${projectDir}/modules/vcftools/vcf_annotate" include {SNPEFF; SNPEFF as SNPEFF_SNP; SNPEFF as SNPEFF_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_snpeff" -include {SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; +include {SNPEFF_ONEPERLINE; + SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_SNP; SNPEFF_ONEPERLINE as SNPEFF_ONEPERLINE_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpeff_oneperline" -include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" include {SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_SNP; SNPSIFT_DBNSFP as SNPSIFT_DBNSFP_INDEL} from "${projectDir}/modules/snpeff_snpsift/snpsift_dbnsfp" +include {SNPSIFT_EXTRACTFIELDS} from "${projectDir}/modules/snpeff_snpsift/snpsift_extractfields" include {AGGREGATE_STATS} from "${projectDir}/modules/utility_modules/aggregate_stats_wgs" -include {READ_GROUPS} from "${projectDir}/modules/utility_modules/read_groups" -include {QUALITY_STATISTICS} from "${projectDir}/modules/utility_modules/quality_stats" -include {PICARD_SORTSAM} from "${projectDir}/modules/picard/picard_sortsam" -include {PICARD_MARKDUPLICATES} from "${projectDir}/modules/picard/picard_markduplicates" -include {PICARD_COLLECTALIGNMENTSUMMARYMETRICS} from "${projectDir}/modules/picard/picard_collectalignmentsummarymetrics" -include {PICARD_COLLECTWGSMETRICS} from "${projectDir}/modules/picard/picard_collectwgsmetrics" -include {GATK_BASERECALIBRATOR} from "${projectDir}/modules/gatk/gatk_baserecalibrator" -include {GATK_APPLYBQSR} from "${projectDir}/modules/gatk/gatk_applybqsr" -include {GATK_MERGEVCF} from "${projectDir}/modules/gatk/gatk_mergevcf" -include 
{GATK_MERGEVCF_LIST} from "${projectDir}/modules/gatk/gatk_mergevcf_list" -include {GATK_VARIANTANNOTATOR} from "${projectDir}/modules/gatk/gatk_variantannotator" -include {GATK_HAPLOTYPECALLER_INTERVAL} from "${projectDir}/modules/gatk/gatk_haplotypecaller_interval" -include {GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_SNP; - GATK_SELECTVARIANTS as GATK_SELECTVARIANTS_INDEL} from "${projectDir}/modules/gatk/gatk_selectvariants" -include {GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_SNP; - GATK_VARIANTFILTRATION as GATK_VARIANTFILTRATION_INDEL} from "${projectDir}/modules/gatk/gatk_variantfiltration" -include {MAKE_VCF_LIST} from "${projectDir}/modules/utility_modules/make_vcf_list" +include {MULTIQC} from "${projectDir}/modules/multiqc/multiqc" // help if needed if (params.help){ @@ -51,8 +58,25 @@ if (params.help){ // log params param_log() +if (params.download_data && !params.csv_input) { + exit 1, "Data download was specified with `--download_data`. However, no input CSV file was specified with `--csv_input`. This is an invalid parameter combination. `--download_data` requires a CSV manifest. See `--help` for information." +} + // prepare reads channel -if (params.concat_lanes){ +if (params.csv_input) { + + ch_input_sample = extract_csv(file(params.csv_input, checkIfExists: true)) + + if (params.read_type == 'PE'){ + ch_input_sample.map{it -> [it[0], [it[2], it[3]]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } else if (params.read_type == 'SE') { + ch_input_sample.map{it -> [it[0], it[2]]}.set{read_ch} + ch_input_sample.map{it -> [it[0], it[1]]}.set{meta_ch} + } + +} else if (params.concat_lanes){ + if (params.read_type == 'PE'){ read_ch = Channel .fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true, flat:true ) @@ -65,37 +89,58 @@ if (params.concat_lanes){ .groupTuple() .map{t-> [t[0], t[1].flatten()]} } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} + } else { + if (params.read_type == 'PE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/${params.pattern}${params.extension}",checkExists:true ) } else if (params.read_type == 'SE'){ read_ch = Channel.fromFilePairs("${params.sample_folder}/*${params.extension}",checkExists:true, size:1 ) } + // if channel is empty give error message and exit + read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern} and file extension: ${params.extension}"} } -// if channel is empty give error message and exit -read_ch.ifEmpty{ exit 1, "ERROR: No Files Found in Path: ${params.sample_folder} Matching Pattern: ${params.pattern}"} - // main workflow workflow WGS { - // Step 0: Concatenate Fastq files if required. - if (params.concat_lanes){ - if (params.read_type == 'PE'){ - CONCATENATE_READS_PE(read_ch) - read_ch = CONCATENATE_READS_PE.out.concat_fastq - } else if (params.read_type == 'SE'){ - CONCATENATE_READS_SE(read_ch) - read_ch = CONCATENATE_READS_SE.out.concat_fastq - } + // Step 0: Download data and concat Fastq files if needed. + if (params.download_data){ + FILE_DOWNLOAD(ch_input_sample) + + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + FILE_DOWNLOAD.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} } + + // Step 00: Concat local Fastq files from CSV input if required. 
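// ---------------------------------------------------------------------------
// [Editor's aside -- illustrative sketch, not part of this diff] Later in this
// workflow an optional MultiQC input (ch_GATK_BASERECALIBRATOR_multiqc) is
// initialised as an empty channel and only overwritten on the human branch, so the
// downstream 'collect{it[1]}.ifEmpty([])' still yields something for mouse runs.
// The idiom in miniature (file names are placeholders):
def toy_bqsr_log = Channel.empty()
if (params.gen_org == 'human') {
    toy_bqsr_log = Channel.of( ['sampleA', 'sampleA_recal.table'] ) // stand-in for GATK_BASERECALIBRATOR.out.table
}
// toy_mqc = toy_bqsr_log.collect{ it[1] }.ifEmpty([])   // human: [table file]; mouse: []
// ---------------------------------------------------------------------------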
+ if (!params.download_data && params.csv_input){ + CONCATENATE_LOCAL_FILES(ch_input_sample) + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[2]]}.set{read_ch} + CONCATENATE_LOCAL_FILES.out.read_meta_ch.map{it -> [it[0], it[1]]}.set{meta_ch} + } + + // Step 00: Concat local Fastq files if required. + if (params.concat_lanes && !params.csv_input){ + if (params.read_type == 'PE'){ + CONCATENATE_READS_PE(read_ch) + read_ch = CONCATENATE_READS_PE.out.concat_fastq + } else if (params.read_type == 'SE'){ + CONCATENATE_READS_SE(read_ch) + read_ch = CONCATENATE_READS_SE.out.concat_fastq + } + } + // Step 1: Qual_Stat - QUALITY_STATISTICS(read_ch) + JAX_TRIMMER(read_ch) + + FASTQC(JAX_TRIMMER.out.trimmed_fastq) // Step 2: Get Read Group Information - READ_GROUPS(QUALITY_STATISTICS.out.trimmed_fastq, "gatk") + READ_GROUPS(JAX_TRIMMER.out.trimmed_fastq, "gatk") - bwa_mem_mapping = QUALITY_STATISTICS.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) + bwa_mem_mapping = JAX_TRIMMER.out.trimmed_fastq.join(READ_GROUPS.out.read_groups) // Step 3: BWA-MEM Alignment if (params.gen_org=='mouse'){ @@ -111,9 +156,11 @@ workflow WGS { PICARD_MARKDUPLICATES(PICARD_SORTSAM.out.bam) // If Human + ch_GATK_BASERECALIBRATOR_multiqc = Channel.empty() //optional log file for human only. if (params.gen_org=='human'){ GATK_BASERECALIBRATOR(PICARD_MARKDUPLICATES.out.dedup_bam) - + ch_GATK_BASERECALIBRATOR_multiqc = GATK_BASERECALIBRATOR.out.table // set log file for multiqc + apply_bqsr = PICARD_MARKDUPLICATES.out.dedup_bam.join(GATK_BASERECALIBRATOR.out.table) GATK_APPLYBQSR(apply_bqsr) @@ -131,15 +178,25 @@ workflow WGS { .splitText() .map{it -> it.trim()} + num_chroms = file(params.chrom_contigs).countLines().toInteger() + // number of intervals split on during calling. A 'value' variable used in groupTuple size statement. + // Applies scatter intervals from above to the BQSR bam file chrom_channel = data.combine(chroms) // Use the Channel in HaplotypeCaller - GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel) + GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel, '') // Gather intervals from scattered HaplotypeCaller operations into one // common stream for output - MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(),chroms.toList()) + + MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(size: num_chroms),chroms.toList()) GATK_MERGEVCF_LIST(MAKE_VCF_LIST.out.list) + + if (params.run_gvcf) { + // Use the Channel in HaplotypeCaller_GVCF + GATK_HAPLOTYPECALLER_INTERVAL_GVCF(chrom_channel,'gvcf') + GATK_COMBINEGVCFS(GATK_HAPLOTYPECALLER_INTERVAL_GVCF.out.vcf.groupTuple(size: num_chroms)) + } } // If Mouse @@ -157,81 +214,98 @@ workflow WGS { .splitText() .map{it -> it.trim()} + num_chroms = file(params.chrom_contigs).countLines().toInteger() + // number of intervals split on during calling. A 'value' variable used in groupTuple size statement. 
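// ---------------------------------------------------------------------------
// [Editor's aside -- illustrative sketch, not part of this diff] Deriving the
// groupTuple size from params.chrom_contigs keeps the scatter width and the gather
// width in sync: without an explicit 'size', groupTuple could only emit after the
// whole upstream channel closes. (The somatic workflow earlier in this diff uses
// the same trick but subtracts the contigs it excludes from calling.) A toy
// version, where 'toy_contigs.txt' is an assumed file with one contig per line:
def toy_num    = file('toy_contigs.txt').countLines().toInteger()
def toy_chroms = Channel.fromPath('toy_contigs.txt').splitText().map{ it -> it.trim() }
def toy_bams   = Channel.of( ['sampleA', 'sampleA.bam'] )

toy_scatter = toy_bams.combine(toy_chroms)   // one work unit per (sample, contig)
// after per-contig calling:
// toy_gather = TOY_CALLER.out.vcf.groupTuple(size: toy_num)   // emits once all contigs for a sample are back
// ---------------------------------------------------------------------------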
+ // Applies scatter intervals from above to the BQSR bam file chrom_channel = data.combine(chroms) // Use the Channel in HaplotypeCaller - GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel) + GATK_HAPLOTYPECALLER_INTERVAL(chrom_channel, '') // Gather intervals from scattered HaplotypeCaller operations into one // common stream for output - MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(), chroms.toList()) + + + MAKE_VCF_LIST(GATK_HAPLOTYPECALLER_INTERVAL.out.vcf.groupTuple(size: num_chroms), chroms.toList()) // Sort VCF within MAKE_VCF_LIST GATK_MERGEVCF_LIST(MAKE_VCF_LIST.out.list) - } + if (params.run_gvcf) { + // Use the Channel in HaplotypeCaller_GVCF + GATK_HAPLOTYPECALLER_INTERVAL_GVCF(chrom_channel,'gvcf') + GATK_COMBINEGVCFS(GATK_HAPLOTYPECALLER_INTERVAL_GVCF.out.vcf.groupTuple(size: num_chroms)) + } + } // SNP select_var_snp = GATK_MERGEVCF_LIST.out.vcf.join(GATK_MERGEVCF_LIST.out.idx) - GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP') + GATK_SELECTVARIANTS_SNP(select_var_snp, 'SNP', 'selected_SNP') var_filter_snp = GATK_SELECTVARIANTS_SNP.out.vcf.join(GATK_SELECTVARIANTS_SNP.out.idx) GATK_VARIANTFILTRATION_SNP(var_filter_snp, 'SNP') // INDEL select_var_indel = GATK_MERGEVCF_LIST.out.vcf.join(GATK_MERGEVCF_LIST.out.idx) - GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL') + GATK_SELECTVARIANTS_INDEL(select_var_indel, 'INDEL', 'selected_INDEL') var_filter_indel = GATK_SELECTVARIANTS_INDEL.out.vcf.join(GATK_SELECTVARIANTS_INDEL.out.idx) GATK_VARIANTFILTRATION_INDEL(var_filter_indel, 'INDEL') - // Cat Output to vcf-annotate* and add dbSNP annotations. - VCF_ANNOTATE_SNP(GATK_VARIANTFILTRATION_SNP.out.vcf, 'SNP') - VCF_ANNOTATE_INDEL(GATK_VARIANTFILTRATION_INDEL.out.vcf, 'INDEL') - -// Final Post-Processing Steps Differ for Human and Mouse + SNPSIFT_ANNOTATE_SNP_DBSNP(GATK_VARIANTFILTRATION_SNP.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') + SNPSIFT_ANNOTATE_INDEL_DBSNP(GATK_VARIANTFILTRATION_INDEL.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') // If Human if (params.gen_org=='human'){ // SNP - SNPSIFT_ANNOTATE_SNP_DBSNP(VCF_ANNOTATE_SNP.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') SNPSIFT_ANNOTATE_SNP_COSMIC(SNPSIFT_ANNOTATE_SNP_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') SNPEFF_SNP(SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf, 'SNP', 'vcf') SNPSIFT_DBNSFP_SNP(SNPEFF_SNP.out.vcf, 'SNP') SNPEFF_ONEPERLINE_SNP(SNPSIFT_DBNSFP_SNP.out.vcf, 'SNP') // INDEL - SNPSIFT_ANNOTATE_INDEL_DBSNP(VCF_ANNOTATE_INDEL.out.vcf, params.dbSNP, params.dbSNP_index, 'dbsnpID') SNPSIFT_ANNOTATE_INDEL_COSMIC(SNPSIFT_ANNOTATE_INDEL_DBSNP.out.vcf, params.cosmic, params.cosmic_index, 'cosmicID') SNPEFF_INDEL(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf, 'INDEL', 'vcf') SNPSIFT_DBNSFP_INDEL(SNPEFF_INDEL.out.vcf, 'INDEL') SNPEFF_ONEPERLINE_INDEL(SNPSIFT_DBNSFP_INDEL.out.vcf, 'INDEL') // Merge SNP and INDEL and Aggregate Stats - vcf_files = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) - GATK_MERGEVCF(vcf_files) + vcf_files_unannotated = SNPSIFT_ANNOTATE_SNP_COSMIC.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_COSMIC.out.vcf) + GATK_MERGEVCF_UNANNOTATED(vcf_files_unannotated, 'SNP_INDEL_filtered_unannotated_final') - SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF.out.vcf) + vcf_files_annotated = SNPEFF_ONEPERLINE_SNP.out.vcf.join(SNPEFF_ONEPERLINE_INDEL.out.vcf) + GATK_MERGEVCF_ANNOTATED(vcf_files_annotated, 'SNP_INDEL_filtered_annotated_final') + + SNPSIFT_EXTRACTFIELDS(GATK_MERGEVCF_ANNOTATED.out.vcf) } // If Mouse if (params.gen_org=='mouse'){ // Merge SNP and INDEL - 
vcf_files = VCF_ANNOTATE_SNP.out.vcf.join(VCF_ANNOTATE_INDEL.out.vcf) - - GATK_MERGEVCF(vcf_files) + vcf_files = SNPSIFT_ANNOTATE_SNP_DBSNP.out.vcf.join(SNPSIFT_ANNOTATE_INDEL_DBSNP.out.vcf) - SNPEFF(GATK_MERGEVCF.out.vcf, 'BOTH', 'gatk') + GATK_MERGEVCF(vcf_files, 'SNP_INDEL_filtered_unannotated_final') - merged_vcf_files = GATK_MERGEVCF.out.vcf.join(SNPEFF.out.vcf) + SNPEFF(GATK_MERGEVCF.out.vcf, 'BOTH', 'vcf') - GATK_VARIANTANNOTATOR(merged_vcf_files) - - SNPSIFT_EXTRACTFIELDS(GATK_VARIANTANNOTATOR.out.vcf) + SNPEFF_ONEPERLINE(SNPEFF.out.vcf, 'BOTH') + SNPSIFT_EXTRACTFIELDS(SNPEFF_ONEPERLINE.out.vcf) } - agg_stats = QUALITY_STATISTICS.out.quality_stats.join(PICARD_MARKDUPLICATES.out.dedup_metrics).join(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt).join(PICARD_COLLECTWGSMETRICS.out.txt) + agg_stats = JAX_TRIMMER.out.quality_stats.join(PICARD_MARKDUPLICATES.out.dedup_metrics).join(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt).join(PICARD_COLLECTWGSMETRICS.out.txt) - // may replace with multiqc AGGREGATE_STATS(agg_stats) + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(JAX_TRIMMER.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.quality_stats.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(ch_GATK_BASERECALIBRATOR_multiqc.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTALIGNMENTSUMMARYMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_COLLECTWGSMETRICS.out.txt.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(PICARD_MARKDUPLICATES.out.dedup_metrics.collect{it[1]}.ifEmpty([])) + + MULTIQC ( + ch_multiqc_files.collect() + ) + + }
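// ---------------------------------------------------------------------------
// [Editor's aside -- illustrative sketch, not part of this diff] Every updated
// workflow feeds MULTIQC through the same recipe: mix per-tool channels, keep only
// the file part of each [sampleID, file] tuple, and fall back to an empty list when
// a tool produced nothing, so optional steps never stall the aggregation.
// In miniature (file names are placeholders):
def toy_fastqc   = Channel.of( ['s1', 's1_fastqc.zip'], ['s2', 's2_fastqc.zip'] )
def toy_optional = Channel.empty()                       // e.g. a human-only or PDX-only log

def toy_mq_files = Channel.empty()
toy_mq_files = toy_mq_files.mix( toy_fastqc.collect{ it[1] }.ifEmpty([]) )   // -> [s1_fastqc.zip, s2_fastqc.zip]
toy_mq_files = toy_mq_files.mix( toy_optional.collect{ it[1] }.ifEmpty([]) ) // -> []
// MULTIQC( toy_mq_files.collect() )   // a single combined list, as in the workflows above
// ---------------------------------------------------------------------------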