Merged in PIVOT_wes_updates (pull request #202)
pivot wes updates
MikeWLloyd committed Aug 16, 2024
2 parents f6152b9 + 584ce44 commit 259d10a
Showing 5 changed files with 52 additions and 2 deletions.
2 changes: 2 additions & 0 deletions bin/help/pdx_wes.nf
@@ -15,6 +15,8 @@ Parameter | Default | Description
--csv_input | null | Provide a CSV manifest file with the header: "sampleID,lane,fastq_1,fastq_2". See the repository wiki for an example file. `fastq_2` is optional and used only for PE data. FASTQ entries can be either absolute paths to local files or URLs to remote files. If remote URLs are provided, `--download_data` must be specified.
--download_data | null | Requires `--csv_input`. When specified, read data in the CSV manifest will be downloaded from provided URLs.
--deduplicate_reads | false | Options: false, true. If specified, run BBMap Clumpify on the input reads. Clumpify deduplicates reads prior to trimming, which can help mapping and downstream steps when analyzing high-coverage WES data.
--gen_org | human | Options: human only.
--ref_fa | '/projects/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' | The reference fasta to be used throughout the process for alignment as well as any downstream analysis. JAX users should not change this parameter.
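For orientation, a manifest matching the `--csv_input` header above might look like the following; the sample names, lanes, and URLs are illustrative placeholders, not the wiki example. Because these entries are remote URLs, the run would also need `--download_data`:

sampleID,lane,fastq_1,fastq_2
tumor01,L001,https://example.com/tumor01_L001_R1.fastq.gz,https://example.com/tumor01_L001_R2.fastq.gz
tumor01,L002,https://example.com/tumor01_L002_R1.fastq.gz,https://example.com/tumor01_L002_R2.fastq.gz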
1 change: 1 addition & 0 deletions bin/log/pdx_wes.nf
@@ -26,6 +26,7 @@ ______________________________________________________
-c ${params.config}
--pubdir ${params.pubdir}
--organize_by ${params.organize_by}
--deduplicate_reads ${params.deduplicate_reads}
--xenome_index ${params.xenome_prefix}
--ref_fa ${params.ref_fa}
--ref_fa_indices ${params.ref_fa_indices}
3 changes: 2 additions & 1 deletion config/pdx_wes.config
@@ -19,7 +19,8 @@ params {
    concat_lanes = false
    download_data = false
    csv_input = null

    deduplicate_reads = false

    multiqc_config = "${projectDir}/bin/shared/multiqc/pdx_wes_multiqc.yaml"

    // Reference fasta
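As a sketch of how this new default would be toggled in practice (the file name user.config is hypothetical, not part of the repository): Nextflow treats a command-line option given without a value as boolean true, so `nextflow run <pipeline> --deduplicate_reads` enables the step, as does an override config supplied with `-c`:

// user.config -- hypothetical override file
params {
    deduplicate_reads = true
}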
37 changes: 37 additions & 0 deletions modules/bbmap/bbmap_clumpify.nf
@@ -0,0 +1,37 @@
process CLUMPIFY {
    tag "$sampleID"

    cpus 6
    memory 200.GB
    time '48:00:00'
    errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'}

    container 'quay.io/biocontainers/bbmap:39.06--h92535d8_0'

    publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID+'/stats' : 'stats'}", pattern: "*.txt", mode:'copy'

    input:
    tuple val(sampleID), path(fq_reads)

    output:
    tuple val(sampleID), path("${sampleID}.clumpy.R*.fastq.gz"), emit: clumpy_fastq
    tuple val(sampleID), path("*log.txt"), emit: clumpy_log

    script:
    // testformat.sh (BBTools) reports the FASTQ quality encoding: legacy Illumina
    // 1.3/1.5 data uses a Phred+64 offset, everything else Phred+33. The result
    // sets Clumpify's qin accordingly.
    if (params.read_type == "SE")
        """
        testformat.sh ${fq_reads[0]} > fastq_format.txt
        if grep -q 'illumina' fastq_format.txt ; then qual=64; else qual=33; fi
        clumpify.sh in=${fq_reads[0]} out=${sampleID}.clumpy.R1.fastq.gz tmpdir=./ usetmpdir=t dedupe=t qin=\${qual} -Xmx199g &> ${sampleID}_clumpy_log.txt
        """
    else
        """
        testformat.sh ${fq_reads[0]} > fastq_format.txt
        if grep -q 'illumina' fastq_format.txt ; then qual=64; else qual=33; fi
        clumpify.sh in=${fq_reads[0]} in2=${fq_reads[1]} out=${sampleID}.clumpy.R1.fastq.gz out2=${sampleID}.clumpy.R2.fastq.gz tmpdir=./ usetmpdir=t dedupe=t qin=\${qual} -Xmx199g &> ${sampleID}_clumpy_log.txt
        """
}
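As a quick sanity check on the module's output (a sketch, not part of the pipeline; the input file name sample1_R1.fastq.gz is a placeholder), read counts before and after Clumpify can be compared. Deduplication should only ever shrink the count:

# FASTQ records are 4 lines each
before=$(zcat sample1_R1.fastq.gz | wc -l)
after=$(zcat sample1.clumpy.R1.fastq.gz | wc -l)
echo "reads in:  $((before / 4))"
echo "reads out: $((after / 4))"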
11 changes: 10 additions & 1 deletion workflows/pdx_wes.nf
@@ -10,6 +10,7 @@ include {FILE_DOWNLOAD} from "${projectDir}/subworkflows/aria_download_parse"
include {CONCATENATE_LOCAL_FILES} from "${projectDir}/subworkflows/concatenate_local_files"
include {CONCATENATE_READS_PE} from "${projectDir}/modules/utility_modules/concatenate_reads_PE"
include {CONCATENATE_READS_SE} from "${projectDir}/modules/utility_modules/concatenate_reads_SE"
include {CLUMPIFY} from "${projectDir}/modules/bbmap/bbmap_clumpify"
include {JAX_TRIMMER} from "${projectDir}/modules/utility_modules/jax_trimmer"
include {FASTQC} from "${projectDir}/modules/fastqc/fastqc"
include {XENOME_CLASSIFY} from "${projectDir}/modules/xenome/xenome"
@@ -138,8 +139,16 @@ workflow PDX_WES {

    // ** MAIN workflow starts:

    // Optional Step -- Clumpify
    if (params.deduplicate_reads) {
        CLUMPIFY(read_ch)
        trimmer_input = CLUMPIFY.out.clumpy_fastq
    } else {
        trimmer_input = read_ch
    }

    // Step 1: Qual_Stat
-   JAX_TRIMMER(read_ch)
+   JAX_TRIMMER(trimmer_input)

    xenome_input = JAX_TRIMMER.out.trimmed_fastq

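The optional-step pattern above, calling a process only when a flag is set and reassigning the downstream channel either way, can be exercised in isolation. Below is a minimal sketch with a placeholder process body, not the real Clumpify call; the process name DEDUPE and the input file s1.fastq.gz are hypothetical:

nextflow.enable.dsl = 2

params.deduplicate_reads = false

// Stand-in for CLUMPIFY: copies the reads so the channel shape is preserved.
process DEDUPE {
    input:
    tuple val(sampleID), path(reads)

    output:
    tuple val(sampleID), path("deduped_*")

    script:
    "cp ${reads} deduped_${reads}"
}

workflow {
    read_ch = Channel.of(['s1', file('s1.fastq.gz')])   // assumes the file exists

    if (params.deduplicate_reads) {
        DEDUPE(read_ch)
        trimmer_input = DEDUPE.out
    } else {
        trimmer_input = read_ch
    }

    trimmer_input.view()
}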
