From 0985352b0c43392e9723c5957661c060d9bc16a0 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 24 Sep 2024 11:06:58 +0700 Subject: [PATCH 01/29] apply chunking and parallelisation for align_pacbio and align_ont --- conf/base.config | 5 +++ conf/modules.config | 24 +++++++++++ subworkflows/local/align_ont.nf | 55 +++++++++++++++++++++--- subworkflows/local/align_pacbio.nf | 55 ++++++++++++++++++++---- subworkflows/local/minimap2_mapreduce.nf | 3 +- 5 files changed, 126 insertions(+), 16 deletions(-) diff --git a/conf/base.config b/conf/base.config index cd54c75..0fc6a4e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -109,6 +109,11 @@ process { memory = { check_max( 1.GB * Math.ceil( 30 * fasta.size() / 1e+9 ) * task.attempt, 'memory' ) } } + withName: GENERATE_CRAM_CSV { + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + } + withName: CRUMBLE { // No correlation between memory usage and the number of reads or the genome size. // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB. 
diff --git a/conf/modules.config b/conf/modules.config index ebf3fb0..ec1c74d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -103,6 +103,30 @@ process { ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } } + withName: ".*:ALIGN_HIFI:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt" } + ext.args2 = { "-ax map-hifi --cs=short -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + + withName: ".*:ALIGN_CLR:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt" } + ext.args2 = { "-ax map-pb -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + + withName: ".*:ALIGN_ONT:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { + ext.args = "" + ext.args1 = { "-F 0x200 -nt" } + ext.args2 = { "-ax map-ont -I" + Math.ceil(meta.genome_size/1e9) + 'G' } + ext.args3 = "-mpu" + ext.args4 = { "--write-index -l1" } + } + withName: '.*:CONVERT_STATS:SAMTOOLS_CRAM' { ext.prefix = { "${fasta.baseName}.${meta.datatype}.${meta.id}" } ext.args = '--output-fmt cram --write-index' diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index ef1a021..f1e3465 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -2,7 +2,11 @@ // Align Nanopore read files against the genome // -include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' +include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce' +include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' include { SAMTOOLS_MERGE } 
from '../../modules/nf-core/samtools/merge/main' @@ -14,17 +18,54 @@ workflow ALIGN_ONT { main: ch_versions = Channel.empty() + ch_merged_bam = Channel.empty() + // Convert FASTQ to CRAM + CONVERT_CRAM ( reads, fasta ) + ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) - // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( reads, fasta, true, "bai", false, false ) - ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) + SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions ) + SAMTOOLS_ADDREPLACERG.out.cram + | set { ch_reads_cram } - // Collect all alignment output by sample name - MINIMAP2_ALIGN.out.bam + // Index the CRAM file + SAMTOOLS_INDEX ( ch_reads_cram ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + + ch_reads_cram + | join ( SAMTOOLS_INDEX.out.crai ) + | set { ch_reads_cram_crai } + + + // + // MODULE: generate a CRAM CSV file containing the required parameters for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT + // + GENERATE_CRAM_CSV( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + // + // SUBWORKFLOW: mapping ONT reads using minimap2 + // + MINIMAP2_MAPREDUCE ( + fasta, + GENERATE_CRAM_CSV.out.csv + ) + ch_versions = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions ) + ch_merged_bam = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam) + + + ch_merged_bam + | combine( ch_reads_cram_crai ) + | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + | set { ch_merged_bam } + + + // Collect all BAM output by sample name + ch_merged_bam | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple ( by: [0] ) + | groupTuple( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } | branch { meta, bams -> diff --git 
a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index f472a6c..7474d45 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -3,7 +3,11 @@ // include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' -include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' +include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce' +include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' @@ -16,21 +20,56 @@ workflow ALIGN_PACBIO { main: ch_versions = Channel.empty() - + ch_merged_bam = Channel.empty() // Filter BAM and output as FASTQ FILTER_PACBIO ( reads, db ) ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) + // Convert FASTQ to CRAM + CONVERT_CRAM ( FILTER_PACBIO.out.fastq, fasta ) + ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) + + SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam ) + ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions ) + + SAMTOOLS_ADDREPLACERG.out.cram + | set { ch_reads_cram } + + // Index the CRAM file + SAMTOOLS_INDEX ( ch_reads_cram ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) + + ch_reads_cram + | join ( SAMTOOLS_INDEX.out.crai ) + | set { ch_reads_cram_crai } + + + // + // MODULE: generate a CRAM CSV file containing the required parameters for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT + // + GENERATE_CRAM_CSV( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + // + // SUBWORKFLOW: mapping pacbio reads using minimap2 + // + MINIMAP2_MAPREDUCE ( + fasta, + GENERATE_CRAM_CSV.out.csv + ) + 
ch_versions = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions ) + ch_merged_bam = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam) - // Align Fastq to Genome with minimap2. bam_format is set to true, making the output a *sorted* BAM - MINIMAP2_ALIGN ( FILTER_PACBIO.out.fastq, fasta, true, "bai", false, false ) - ch_versions = ch_versions.mix ( MINIMAP2_ALIGN.out.versions.first() ) + ch_merged_bam + | combine( ch_reads_cram_crai ) + | map { meta_bam, bam, meta_cram, cram, crai -> [ meta_cram, bam ] } + | set { ch_merged_bam } - // Collect all alignment output by sample name - MINIMAP2_ALIGN.out.bam + // Collect all BAM output by sample name + ch_merged_bam | map { meta, bam -> [['id': meta.id.split('_')[0..-2].join('_'), 'datatype': meta.datatype], meta.read_count, bam] } - | groupTuple ( by: [0] ) + | groupTuple( by: [0] ) | map { meta, read_counts, bams -> [meta + [read_count: read_counts.sum()], bams] } | branch { meta, bams -> diff --git a/subworkflows/local/minimap2_mapreduce.nf b/subworkflows/local/minimap2_mapreduce.nf index 35b5aae..7503e02 100644 --- a/subworkflows/local/minimap2_mapreduce.nf +++ b/subworkflows/local/minimap2_mapreduce.nf @@ -38,7 +38,8 @@ workflow MINIMAP2_MAPREDUCE { .map{ cram_id, cram_info, ref_id, ref_dir, mmi_id, mmi_path-> tuple([ id: cram_id.id, - chunk_id: cram_id.id + "_" + cram_info[5] + chunk_id: cram_id.id + "_" + cram_info[5], + genome_size: ref_id.genome_size ], file(cram_info[0]), cram_info[1], From 2d108e9edcea53d2bcc1cce2c057572dcb6d81b5 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 24 Sep 2024 12:29:04 +0700 Subject: [PATCH 02/29] fix cannot allocate resource samtools_addreplcerg --- conf/base.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf/base.config b/conf/base.config index 0fc6a4e..d3b72be 100644 --- a/conf/base.config +++ b/conf/base.config @@ -58,6 +58,12 @@ process { time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 
6*task.attempt, 1, 2), 'time' ) } } + withName: SAMTOOLS_ADDREPLACERG { + cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) } + memory = { check_max( 4.GB + 850.MB * log_increase_cpus(2, 6*task.attempt, 1, 2) * task.attempt + 0.6.GB * Math.ceil( meta.read_count / 100000000 ), 'memory' ) } + time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) } + } + withName: BLAST_BLASTN { time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) } memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } From 65145ce68e567d364c7ac1b617a61d216ba1c4ef Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 10:43:10 +0100 Subject: [PATCH 03/29] patch 1.3.1 --- conf/base.config | 5 + conf/modules.config | 6 +- modules.json | 10 +- .../bbmap/filterbyname/environment.yml | 5 + modules/nf-core/bbmap/filterbyname/main.nf | 71 ++++++ modules/nf-core/bbmap/filterbyname/meta.yml | 70 ++++++ .../bbmap/filterbyname/tests/main.nf.test | 218 ++++++++++++++++++ .../filterbyname/tests/main.nf.test.snap | 145 ++++++++++++ .../nf-core/bbmap/filterbyname/tests/tags.yml | 2 + modules/nf-core/seqtk/subseq/environment.yml | 7 - modules/nf-core/seqtk/subseq/main.nf | 56 ----- modules/nf-core/seqtk/subseq/meta.yml | 40 ---- .../nf-core/seqtk/subseq/tests/main.nf.test | 59 ----- .../seqtk/subseq/tests/main.nf.test.snap | 60 ----- .../seqtk/subseq/tests/standard.config | 5 - modules/nf-core/seqtk/subseq/tests/tags.yml | 2 - subworkflows/local/align_short.nf | 6 +- subworkflows/local/filter_pacbio.nf | 12 +- 18 files changed, 535 insertions(+), 244 deletions(-) create mode 100644 modules/nf-core/bbmap/filterbyname/environment.yml create mode 100644 modules/nf-core/bbmap/filterbyname/main.nf create mode 100644 modules/nf-core/bbmap/filterbyname/meta.yml create mode 100644 modules/nf-core/bbmap/filterbyname/tests/main.nf.test 
create mode 100644 modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap create mode 100644 modules/nf-core/bbmap/filterbyname/tests/tags.yml delete mode 100644 modules/nf-core/seqtk/subseq/environment.yml delete mode 100644 modules/nf-core/seqtk/subseq/main.nf delete mode 100644 modules/nf-core/seqtk/subseq/meta.yml delete mode 100644 modules/nf-core/seqtk/subseq/tests/main.nf.test delete mode 100644 modules/nf-core/seqtk/subseq/tests/main.nf.test.snap delete mode 100644 modules/nf-core/seqtk/subseq/tests/standard.config delete mode 100644 modules/nf-core/seqtk/subseq/tests/tags.yml diff --git a/conf/base.config b/conf/base.config index ca753f3..3827aae 100644 --- a/conf/base.config +++ b/conf/base.config @@ -41,6 +41,11 @@ process { memory = { check_max( ((meta.datatype == "pacbio_clr" || meta.datatype == "ont") ? 2.GB : 1.GB) * task.attempt, 'memory' ) } } + // minimum 1GB memory + withName: 'BBMAP_FILTERBYNAME' { + memory = { check_max( 1.GB, 'memory' ) } + } + withName: 'SAMTOOLS_COLLATETOFASTA' { cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } diff --git a/conf/modules.config b/conf/modules.config index a2d4464..d05fe4b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -15,12 +15,16 @@ process { ext.args = '-F 0x200 -nt' } + withName: BBMAP_FILTERBYNAME { + ext.args = 'include=f' + } + withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' { ext.args = { "-5SPCp -R ${meta.read_group}" } } withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' { - ext.args = { "-R ${meta.read_group}" } + ext.args = { "-p -R ${meta.read_group}" } } withName: SAMTOOLS_MERGE { diff --git a/modules.json b/modules.json index b7bb93d..522bb30 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bbmap/filterbyname": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + 
"installed_by": ["modules"] + }, "blast/blastn": { "branch": "master", "git_sha": "583edaf97c9373a20df05a3b7be5a6677f9cd719", @@ -81,11 +86,6 @@ "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5", "installed_by": ["modules"] }, - "seqtk/subseq": { - "branch": "master", - "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006", - "installed_by": ["modules"] - }, "untar": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", diff --git a/modules/nf-core/bbmap/filterbyname/environment.yml b/modules/nf-core/bbmap/filterbyname/environment.yml new file mode 100644 index 0000000..dfd8936 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bbmap=39.01 diff --git a/modules/nf-core/bbmap/filterbyname/main.nf b/modules/nf-core/bbmap/filterbyname/main.nf new file mode 100644 index 0000000..7267908 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/main.nf @@ -0,0 +1,71 @@ +process BBMAP_FILTERBYNAME { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0': + 'biocontainers/bbmap:39.01--h5c4e2a8_0' }" + + input: + tuple val(meta), path(reads) + val(names_to_filter) + val(output_format) + val(interleaved_output) + + output: + tuple val(meta), path("*.${output_format}"), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "in=${reads}" : "in=${reads[0]} in2=${reads[1]}" + def output = (meta.single_end || interleaved_output) ? 
+ "out=${prefix}.${output_format}" : + "out1=${prefix}_1.${output_format} out2=${prefix}_2.${output_format}" + def names_command = names_to_filter ? "names=${names_to_filter}": "" + + def avail_mem = 3 + if (!task.memory) { + log.info '[filterbyname] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + + """ + filterbyname.sh \\ + -Xmx${avail_mem}g \\ + $input \\ + $output \\ + $names_command \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def filtered = (meta.single_end || interleaved_output) ? + "echo '' | gzip > ${prefix}.${output_format}" : + "echo '' | gzip >${prefix}_1.${output_format} ; echo '' | gzip >${prefix}_2.${output_format}" + + """ + $filtered + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ + +} diff --git a/modules/nf-core/bbmap/filterbyname/meta.yml b/modules/nf-core/bbmap/filterbyname/meta.yml new file mode 100644 index 0000000..b7b8641 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/meta.yml @@ -0,0 +1,70 @@ +name: bbmap_filterbyname +description: Filter out sequences by sequence header name(s) +keywords: + - fastq + - fasta + - filter +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic + tools. + homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/clumpify-guide/ + documentation: https://www.biostars.org/p/225338/ + licence: ["UC-LBL license (see package)"] + identifier: biotools:bbmap + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and + paired-end data, respectively. + - - names_to_filter: + type: string + description: | + String containing names of reads to filter out of the fastq files. + - - output_format: + type: string + description: | + String with the format of the output file, e.g. fastq.gz, fasta, fasta.bz2 + - - interleaved_output: + type: boolean + description: | + Whether to produce an interleaved fastq output file +output: + - reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.${output_format}": + type: file + description: The trimmed/modified fastq reads + pattern: "*${output_format}" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.log": + type: file + description: filterbyname.sh log file + pattern: "*.filterbyname.log" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@tokarevvasily" + - "@sppearce" + +maintainers: + - "@sppearce" diff --git a/modules/nf-core/bbmap/filterbyname/tests/main.nf.test b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test new file mode 100644 index 0000000..17c7ea5 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test @@ -0,0 +1,218 @@ +nextflow_process { + + name "Test Process BBMAP_FILTERBYNAME" + script "../main.nf" + process "BBMAP_FILTERBYNAME" + + tag "modules" + tag "modules_nfcore" + tag "bbmap" + tag "bbmap/filterbyname" + + test("paired end fastq.bz2") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fastq.bz2" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("paired end fastq.bz2 - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fastq.bz2" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single end fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fasta" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("single end fasta - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fasta" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("single end fastq.gz filter") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = 
"ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fasta" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("single end fastq.gz - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = "ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fastq.gz" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("paired end fastq.gz filter interleaved") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fastq.gz" + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("paired end fastq.gz filter interleaved - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fastq.gz" + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + +} diff 
--git a/modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap new file mode 100644 index 0000000..e06845a --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap @@ -0,0 +1,145 @@ +{ + "single end fasta": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:10:54.50002639" + }, + "paired end fastq.bz2": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:10:31.368676493" + }, + "paired end fastq.bz2 - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc", + "test_2.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc", + "test_2.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc" + ] + ] + ], + "versions": [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:10:42.854788269" + }, + "single end fastq.gz filter": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:24.280900344" + }, + "single end fastq.gz - stub": { + "content": [ + [ + 
"versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:43.274477064" + }, + "paired end fastq.gz filter interleaved - stub": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:12:05.324554457" + }, + "single end fasta - stub": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:13.161430777" + }, + "paired end fastq.gz filter interleaved": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:54.599067108" + } +} diff --git a/modules/nf-core/bbmap/filterbyname/tests/tags.yml b/modules/nf-core/bbmap/filterbyname/tests/tags.yml new file mode 100644 index 0000000..707f910 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/tests/tags.yml @@ -0,0 +1,2 @@ +bbmap/filterbyname: + - "modules/nf-core/bbmap/filterbyname/**" diff --git a/modules/nf-core/seqtk/subseq/environment.yml b/modules/nf-core/seqtk/subseq/environment.yml deleted file mode 100644 index 7abe364..0000000 --- a/modules/nf-core/seqtk/subseq/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: seqtk_subseq -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/subseq/main.nf b/modules/nf-core/seqtk/subseq/main.nf deleted file mode 100644 index d5caebc..0000000 --- a/modules/nf-core/seqtk/subseq/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process SEQTK_SUBSEQ { - tag "$sequences" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : - 'biocontainers/seqtk:1.4--he4a0461_1' }" - - input: - tuple val(meta), path(sequences) - path filter_list - - output: - tuple val(meta), path("*.gz"), emit: sequences - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def ext = "fa" - if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { - ext = "fq" - } - """ - seqtk \\ - subseq \\ - $args \\ - $sequences \\ - $filter_list | \\ - gzip --no-name > ${sequences}${prefix}.${ext}.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def ext = "fa" - if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { - ext = "fq" - } - """ - echo "" | gzip > ${sequences}${prefix}.${ext}.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/seqtk/subseq/meta.yml b/modules/nf-core/seqtk/subseq/meta.yml deleted file mode 100644 index de4a841..0000000 --- a/modules/nf-core/seqtk/subseq/meta.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: seqtk_subseq -description: Select only sequences that match the filtering condition -keywords: - - filtering - - selection - - fastx -tools: - - seqtk: - description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format - homepage: https://github.com/lh3/seqtk - documentation: https://docs.csc.fi/apps/seqtk/ - tool_dev_url: https://github.com/lh3/seqtk - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test' ] - - sequences: - type: file - description: FASTQ/FASTA file - pattern: "*.{fq,fq.gz,fa,fa.gz}" - - filter_list: - type: file - description: BED file or a text file with a list of sequence names - pattern: "*.{bed,lst}" -output: - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - sequences: - type: file - description: FASTQ/FASTA file - pattern: "*.{fq.gz,fa.gz}" -authors: - - "@sidorov-si" -maintainers: - - "@sidorov-si" diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test b/modules/nf-core/seqtk/subseq/tests/main.nf.test deleted file mode 100644 index fa8fad6..0000000 --- a/modules/nf-core/seqtk/subseq/tests/main.nf.test +++ /dev/null @@ -1,59 +0,0 @@ -nextflow_process { - - name "Test Process SEQTK_SUBSEQ" - script "modules/nf-core/seqtk/subseq/main.nf" - process "SEQTK_SUBSEQ" - config "./standard.config" - - tag "modules" - tag "modules_nfcore" - tag "seqtk" - tag "seqtk/subseq" - - test("sarscov2_subseq_fa") { - - when { - process { - """ - input[0] = [ - [ id:'test' ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) - ] - input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2_subseq_fa_stub") { - options "-stub" - when { - process { - """ - input[0] = [ - [ id:'test' ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) - ] - input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - -} diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap 
b/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap deleted file mode 100644 index 75b3793..0000000 --- a/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap +++ /dev/null @@ -1,60 +0,0 @@ -{ - "sarscov2_subseq_fa": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2" - ] - ], - "1": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ], - "sequences": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2" - ] - ], - "versions": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ] - } - ], - "timestamp": "2024-02-22T15:56:36.155954" - }, - "sarscov2_subseq_fa_stub": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ], - "sequences": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ] - } - ], - "timestamp": "2024-02-22T15:56:44.222329" - } -} \ No newline at end of file diff --git a/modules/nf-core/seqtk/subseq/tests/standard.config b/modules/nf-core/seqtk/subseq/tests/standard.config deleted file mode 100644 index e8d7dc3..0000000 --- a/modules/nf-core/seqtk/subseq/tests/standard.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: SEQTK_SUBSEQ { - ext.prefix = { ".filtered" } - } -} \ No newline at end of file diff --git a/modules/nf-core/seqtk/subseq/tests/tags.yml b/modules/nf-core/seqtk/subseq/tests/tags.yml deleted file mode 100644 index 74056ba..0000000 --- a/modules/nf-core/seqtk/subseq/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -seqtk/subseq: - - "modules/nf-core/seqtk/subseq/**" diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index e74b480..f189135 100644 --- a/subworkflows/local/align_short.nf +++ 
b/subworkflows/local/align_short.nf @@ -29,10 +29,10 @@ workflow ALIGN_SHORT { // Convert from CRAM to FASTQ only if CRAM files were provided as input - SAMTOOLS_FASTQ ( ch_reads.cram, false ) + SAMTOOLS_FASTQ ( ch_reads.cram, true ) ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - - + + SAMTOOLS_FASTQ.out.fastq | mix ( ch_reads.fastq ) | set { ch_reads_fastq } diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index acb21fa..5edb338 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -9,7 +9,7 @@ include { BLAST_BLASTN } from '../../modules/nf-core/blast/ include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq' include { SEQKIT_FQ2FA } from '../../modules/nf-core/seqkit/fq2fa' -include { SEQTK_SUBSEQ } from '../../modules/nf-core/seqtk/subseq' +include { BBMAP_FILTERBYNAME } from '../../modules/nf-core/bbmap/filterbyname' workflow FILTER_PACBIO { @@ -67,7 +67,7 @@ workflow FILTER_PACBIO { ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() ) - // Filter the BAM files and convert to FASTQ + // Filter the input BAM and output as interleaved FASTA SAMTOOLS_CONVERT.out.bam | join ( SAMTOOLS_CONVERT.out.csi ) | join ( PACBIO_FILTER.out.list ) @@ -81,7 +81,7 @@ workflow FILTER_PACBIO { ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() ) - // Filter inputs provided as FASTQ + // Filter inputs provided as FASTQ and output as interleaved FASTQ ch_reads.fastq | join(PACBIO_FILTER.out.list) | multiMap { meta, fastq, list -> \ @@ -90,12 +90,12 @@ workflow FILTER_PACBIO { } | set { ch_reads_fastq } - SEQTK_SUBSEQ ( ch_reads_fastq.fastqs, ch_reads_fastq.lists ) - ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() ) + BBMAP_FILTERBYNAME ( ch_reads_fastq.fastqs, ch_reads_fastq.lists , "fastq", true) + ch_versions = ch_versions.mix ( 
BBMAP_FILTERBYNAME.out.versions.first() ) // Merge filtered outputs as ch_output_fastq - SEQTK_SUBSEQ.out.sequences + BBMAP_FILTERBYNAME.out.reads | concat ( SAMTOOLS_FILTERTOFASTQ.out.fastq ) | set { ch_filtered_fastq } From 35b26b72561e76fd41223af0571c84d4c91c56cd Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 10:49:01 +0100 Subject: [PATCH 04/29] pass interleaved fastq after cram conversion --- subworkflows/local/align_short.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index f189135..3fdf330 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -33,7 +33,7 @@ workflow ALIGN_SHORT { ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - SAMTOOLS_FASTQ.out.fastq + SAMTOOLS_FASTQ.out.interleaved_fastq | mix ( ch_reads.fastq ) | set { ch_reads_fastq } From 414ac2a808b5a803bd5fc44ab4197e2f7e920534 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 10:50:30 +0100 Subject: [PATCH 05/29] Update align_short.nf --- subworkflows/local/align_short.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/align_short.nf b/subworkflows/local/align_short.nf index 3fdf330..6b58e4e 100644 --- a/subworkflows/local/align_short.nf +++ b/subworkflows/local/align_short.nf @@ -33,7 +33,7 @@ workflow ALIGN_SHORT { ch_versions = ch_versions.mix ( SAMTOOLS_FASTQ.out.versions.first() ) - SAMTOOLS_FASTQ.out.interleaved_fastq + SAMTOOLS_FASTQ.out.interleaved | mix ( ch_reads.fastq ) | set { ch_reads_fastq } From 0c96567c22e6fb54420bf9152e21d6378507a72a Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 10:57:02 +0100 Subject: [PATCH 06/29] Update CHANGELOG.md --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca9294f..3854131 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ The format is based on 
[Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.3.1](https://github.com/sanger-tol/readmapping/releases/tag/1.3.1)] - Antipodean Opaleye - [2024-09-24] + +### Enhancements & fixes + +- Fixed bug in handling CRAM HiC inputs introduced in 1.1.0 +- Fixed bug in handling PacBio FASTQ inputs introduced in 1.3.0 + ## [[1.3.0](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye - [2024-08-23] ### Enhancements & fixes From 5b4f685f21d24ad33bb350bc895100e629b7a4ee Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 10:57:34 +0100 Subject: [PATCH 07/29] Update nextflow.config --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 437ff82..35f7911 100644 --- a/nextflow.config +++ b/nextflow.config @@ -232,7 +232,7 @@ manifest { description = 'Pipeline to map reads generated using different sequencing technologies against a genome assembly.' mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.3.0' + version = '1.3.1' doi = '10.5281/zenodo.6563577' } From e4d43981299aa024bbbfaf4923c2d3174a03bd27 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 11:20:12 +0100 Subject: [PATCH 08/29] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index e238724..a9bcd4d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) @priyanka-surana +Copyright (c) 2022-2024 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 86a38626c2854c1253cde00c973413e17c4d63fa Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 11:29:24 +0100 Subject: [PATCH 09/29] Update download_pipeline.yml --- .github/workflows/download_pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index bd9f7bf..36fbc9d 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -10,7 +10,7 @@ on: testbranch: description: "The specific branch you wish to utilize for the test execution of nf-core download." required: true - default: "dev" + default: "main" pull_request: types: - opened From d882db60fc0d9a268aec14c4645e4947eac94783 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 12:57:47 +0100 Subject: [PATCH 10/29] Update linting.yml --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 177172b..19ddb83 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -19,7 +19,7 @@ jobs: - uses: actions/setup-node@v3 - name: Install editorconfig-checker - run: npm install -g editorconfig-checker + run: npm install -g editorconfig-checker@3.0.2 - name: Run ECLint check run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') From e846b1d215c6aeb76c2831799e5b15b5ffea6e79 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 13:13:42 +0100 Subject: [PATCH 11/29] Update download_pipeline.yml --- .github/workflows/download_pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index 36fbc9d..9ea6412 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -43,7 +43,7 @@ jobs: architecture: "x64" - uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 with: - singularity-version: 3.8.3 + singularity-version: 3.8.7 - name: Install dependencies run: | From d8b32e97aaa948e4b5f6f72b02e221505ef5937f Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 13:22:35 +0100 Subject: [PATCH 12/29] Delete .github/workflows/download_pipeline.yml --- .github/workflows/download_pipeline.yml | 88 ------------------------- 1 file changed, 88 deletions(-) delete mode 100644 .github/workflows/download_pipeline.yml diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml deleted file mode 100644 index 9ea6412..0000000 --- a/.github/workflows/download_pipeline.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: Test successful pipeline download with 'nf-core download' - -# Run the workflow when: -# - dispatched manually -# - when a PR is opened or reopened to master branch -# - the head branch of the pull request is updated, i.e. if fixes for a release are pushed last minute to dev. -on: - workflow_dispatch: - inputs: - testbranch: - description: "The specific branch you wish to utilize for the test execution of nf-core download." 
- required: true - default: "main" - pull_request: - types: - - opened - - edited - - synchronize - branches: - - main - - dev - pull_request_target: - branches: - - main - - dev - -env: - NXF_ANSI_LOG: false - -jobs: - download: - runs-on: ubuntu-latest - steps: - - name: Install Nextflow - uses: nf-core/setup-nextflow@v2 - - - name: Disk space cleanup - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 - with: - python-version: "3.12" - architecture: "x64" - - uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 - with: - singularity-version: 3.8.7 - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install git+https://github.com/nf-core/tools.git@dev - - - name: Get the repository name and current branch set as environment variable - run: | - echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} - echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} - echo "REPO_BRANCH=${{ github.event.inputs.testbranch || 'dev' }}" >> ${GITHUB_ENV} - - - name: Download the pipeline - env: - NXF_SINGULARITY_CACHEDIR: ./ - run: | - nf-core download ${{ env.REPO_LOWERCASE }} \ - --revision ${{ env.REPO_BRANCH }} \ - --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ - --compress "none" \ - --container-system 'singularity' \ - --container-library "quay.io" -l "docker.io" -l "ghcr.io" \ - --container-cache-utilisation 'amend' \ - --download-configuration - - - name: Inspect download - run: tree ./${{ env.REPOTITLE_LOWERCASE }} - - - name: Run the downloaded pipeline (stub) - id: stub_run_pipeline - continue-on-error: true - env: - NXF_SINGULARITY_CACHEDIR: ./ - NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results - - name: Run the downloaded pipeline (stub 
run not supported) - id: run_pipeline - if: ${{ job.steps.stub_run_pipeline.status == failure() }} - env: - NXF_SINGULARITY_CACHEDIR: ./ - NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -profile test,singularity --outdir ./results From 1db66a4f36ad1d3c00feba8350cce084b7cd7f19 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 24 Sep 2024 13:29:49 +0100 Subject: [PATCH 13/29] BBtools citation --- subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf b/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf index 92485e0..67a8254 100644 --- a/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf @@ -256,6 +256,7 @@ def toolCitationText() { // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", + "BBtools (Bushnell 2014),", "blastn (Camacho et al. 2009),", "bwa-mem2 (Vasimuddin et al. 2019),", "Crumble (Bonfield et al. 2019),", @@ -270,6 +271,7 @@ def toolBibliographyText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ + "
  • Bushnell, B. (2014). BBtools software package. sourceforge.net/projects/bbmap.
  • ", "
  • Camacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K., & Madden, T.L. (2009). BLAST+: architecture and applications. BMC Bioinformatics, 10, 421. doi:10.1186/1471-2105-10-421.
  • ", "
  • Vasimuddin, Md., Misra, S., Li, H., & Aluru, S. (2019). Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. IEEE Parallel and Distributed Processing Symposium (IPDPS), 2019. doi:10.1109/IPDPS.2019.00041.
  • ", "
  • Bonfield, J.K., McCarthy, S.A., & Durbin, R. (2019). Crumble: reference free lossy compression of sequence quality values. Bioinformatics, 35(2), 337-339. doi:10.1093/bioinformatics/bty608.
  • ", From 830a86164dc580accf0e8388933b3e2d6a3f7eea Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Wed, 25 Sep 2024 11:25:17 +0700 Subject: [PATCH 14/29] replace seqtk/subseq by bbmap/filterbyread to fix filtering step for PacBio FASTQ input --- conf/base.config | 5 + conf/modules.config | 4 + modules.json | 10 +- .../bbmap/filterbyname/environment.yml | 5 + modules/nf-core/bbmap/filterbyname/main.nf | 71 ++++++ modules/nf-core/bbmap/filterbyname/meta.yml | 70 ++++++ .../bbmap/filterbyname/tests/main.nf.test | 218 ++++++++++++++++++ .../filterbyname/tests/main.nf.test.snap | 145 ++++++++++++ .../nf-core/bbmap/filterbyname/tests/tags.yml | 2 + modules/nf-core/seqtk/subseq/environment.yml | 7 - modules/nf-core/seqtk/subseq/main.nf | 56 ----- modules/nf-core/seqtk/subseq/meta.yml | 40 ---- .../nf-core/seqtk/subseq/tests/main.nf.test | 59 ----- .../seqtk/subseq/tests/main.nf.test.snap | 60 ----- .../seqtk/subseq/tests/standard.config | 5 - modules/nf-core/seqtk/subseq/tests/tags.yml | 2 - subworkflows/local/filter_pacbio.nf | 8 +- 17 files changed, 529 insertions(+), 238 deletions(-) create mode 100644 modules/nf-core/bbmap/filterbyname/environment.yml create mode 100644 modules/nf-core/bbmap/filterbyname/main.nf create mode 100644 modules/nf-core/bbmap/filterbyname/meta.yml create mode 100644 modules/nf-core/bbmap/filterbyname/tests/main.nf.test create mode 100644 modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap create mode 100644 modules/nf-core/bbmap/filterbyname/tests/tags.yml delete mode 100644 modules/nf-core/seqtk/subseq/environment.yml delete mode 100644 modules/nf-core/seqtk/subseq/main.nf delete mode 100644 modules/nf-core/seqtk/subseq/meta.yml delete mode 100644 modules/nf-core/seqtk/subseq/tests/main.nf.test delete mode 100644 modules/nf-core/seqtk/subseq/tests/main.nf.test.snap delete mode 100644 modules/nf-core/seqtk/subseq/tests/standard.config delete mode 100644 modules/nf-core/seqtk/subseq/tests/tags.yml diff --git a/conf/base.config 
b/conf/base.config index d3b72be..95bf286 100644 --- a/conf/base.config +++ b/conf/base.config @@ -41,6 +41,11 @@ process { memory = { check_max( ((meta.datatype == "pacbio_clr" || meta.datatype == "ont") ? 2.GB : 1.GB) * task.attempt, 'memory' ) } } + // minimum 1GB memory + withName: 'BBMAP_FILTERBYNAME' { + memory = { check_max( 1.GB, 'memory' ) } + } + withName: 'SAMTOOLS_COLLATETOFASTA' { cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) } memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) } diff --git a/conf/modules.config b/conf/modules.config index ec1c74d..07d7d5a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -15,6 +15,10 @@ process { ext.args = '-F 0x200 -nt' } + withName: BBMAP_FILTERBYNAME { + ext.args = 'include=f' + } + withName: SAMTOOLS_MERGE { ext.args = { "-c -p" } ext.prefix = { "${meta.id}.merge" } diff --git a/modules.json b/modules.json index 8e24d3e..ea8b364 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bbmap/filterbyname": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "blast/blastn": { "branch": "master", "git_sha": "583edaf97c9373a20df05a3b7be5a6677f9cd719", @@ -91,11 +96,6 @@ "git_sha": "03fbf6c89e551bd8d77f3b751fb5c955f75b34c5", "installed_by": ["modules"] }, - "seqtk/subseq": { - "branch": "master", - "git_sha": "730f3aee80d5f8d0b5fc532202ac59361414d006", - "installed_by": ["modules"] - }, "untar": { "branch": "master", "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", diff --git a/modules/nf-core/bbmap/filterbyname/environment.yml b/modules/nf-core/bbmap/filterbyname/environment.yml new file mode 100644 index 0000000..dfd8936 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bbmap=39.01 diff --git 
a/modules/nf-core/bbmap/filterbyname/main.nf b/modules/nf-core/bbmap/filterbyname/main.nf new file mode 100644 index 0000000..7267908 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/main.nf @@ -0,0 +1,71 @@ +process BBMAP_FILTERBYNAME { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0': + 'biocontainers/bbmap:39.01--h5c4e2a8_0' }" + + input: + tuple val(meta), path(reads) + val(names_to_filter) + val(output_format) + val(interleaved_output) + + output: + tuple val(meta), path("*.${output_format}"), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "in=${reads}" : "in=${reads[0]} in2=${reads[1]}" + def output = (meta.single_end || interleaved_output) ? + "out=${prefix}.${output_format}" : + "out1=${prefix}_1.${output_format} out2=${prefix}_2.${output_format}" + def names_command = names_to_filter ? "names=${names_to_filter}": "" + + def avail_mem = 3 + if (!task.memory) { + log.info '[filterbyname] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = task.memory.giga + } + + """ + filterbyname.sh \\ + -Xmx${avail_mem}g \\ + $input \\ + $output \\ + $names_command \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def filtered = (meta.single_end || interleaved_output) ? 
+ "echo '' | gzip > ${prefix}.${output_format}" : + "echo '' | gzip >${prefix}_1.${output_format} ; echo '' | gzip >${prefix}_2.${output_format}" + + """ + $filtered + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ + +} diff --git a/modules/nf-core/bbmap/filterbyname/meta.yml b/modules/nf-core/bbmap/filterbyname/meta.yml new file mode 100644 index 0000000..b7b8641 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/meta.yml @@ -0,0 +1,70 @@ +name: bbmap_filterbyname +description: Filter out sequences by sequence header name(s) +keywords: + - fastq + - fasta + - filter +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic + tools. + homepage: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/clumpify-guide/ + documentation: https://www.biostars.org/p/225338/ + licence: ["UC-LBL license (see package)"] + identifier: biotools:bbmap + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and + paired-end data, respectively. + - - names_to_filter: + type: string + description: | + String containing names of reads to filter out of the fastq files. + - - output_format: + type: string + description: | + String with the format of the output file, e.g. fastq.gz, fasta, fasta.bz2 + - - interleaved_output: + type: boolean + description: | + Whether to produce an interleaved fastq output file +output: + - reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.${output_format}": + type: file + description: The trimmed/modified fastq reads + pattern: "*${output_format}" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.log": + type: file + description: filterbyname.sh log file + pattern: "*.filterbyname.log" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@tokarevvasily" + - "@sppearce" + +maintainers: + - "@sppearce" diff --git a/modules/nf-core/bbmap/filterbyname/tests/main.nf.test b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test new file mode 100644 index 0000000..17c7ea5 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test @@ -0,0 +1,218 @@ +nextflow_process { + + name "Test Process BBMAP_FILTERBYNAME" + script "../main.nf" + process "BBMAP_FILTERBYNAME" + + tag "modules" + tag "modules_nfcore" + tag "bbmap" + tag "bbmap/filterbyname" + + test("paired end fastq.bz2") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fastq.bz2" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("paired end fastq.bz2 - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fastq.bz2" + 
input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single end fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fasta" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("single end fasta - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = "" + input[2] = "fasta" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("single end fastq.gz filter") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = "ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fasta" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("single end fastq.gz - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = "ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fastq.gz" + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + 
test("paired end fastq.gz filter interleaved") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fastq.gz" + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("paired end fastq.gz filter interleaved - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = "ERR5069949.2151832,ERR5069949.576388,ERR5069949.501486" + input[2] = "fastq.gz" + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap new file mode 100644 index 0000000..e06845a --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/tests/main.nf.test.snap @@ -0,0 +1,145 @@ +{ + "single end fasta": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:10:54.50002639" + }, + "paired end fastq.bz2": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:10:31.368676493" + }, + "paired end fastq.bz2 
- stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc", + "test_2.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc", + "test_2.fastq.bz2:md5,1a60c330fb42841e8dcf3cd507a70bfc" + ] + ] + ], + "versions": [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:10:42.854788269" + }, + "single end fastq.gz filter": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:24.280900344" + }, + "single end fastq.gz - stub": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:43.274477064" + }, + "paired end fastq.gz filter interleaved - stub": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:12:05.324554457" + }, + "single end fasta - stub": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:13.161430777" + }, + "paired end fastq.gz filter interleaved": { + "content": [ + [ + "versions.yml:md5,aaa9e9267785f8680ba0cab91423c06d" + ] + ], + 
"meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T12:11:54.599067108" + } +} diff --git a/modules/nf-core/bbmap/filterbyname/tests/tags.yml b/modules/nf-core/bbmap/filterbyname/tests/tags.yml new file mode 100644 index 0000000..707f910 --- /dev/null +++ b/modules/nf-core/bbmap/filterbyname/tests/tags.yml @@ -0,0 +1,2 @@ +bbmap/filterbyname: + - "modules/nf-core/bbmap/filterbyname/**" diff --git a/modules/nf-core/seqtk/subseq/environment.yml b/modules/nf-core/seqtk/subseq/environment.yml deleted file mode 100644 index 7abe364..0000000 --- a/modules/nf-core/seqtk/subseq/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: seqtk_subseq -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/subseq/main.nf b/modules/nf-core/seqtk/subseq/main.nf deleted file mode 100644 index d5caebc..0000000 --- a/modules/nf-core/seqtk/subseq/main.nf +++ /dev/null @@ -1,56 +0,0 @@ -process SEQTK_SUBSEQ { - tag "$sequences" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : - 'biocontainers/seqtk:1.4--he4a0461_1' }" - - input: - tuple val(meta), path(sequences) - path filter_list - - output: - tuple val(meta), path("*.gz"), emit: sequences - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def ext = "fa" - if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { - ext = "fq" - } - """ - seqtk \\ - subseq \\ - $args \\ - $sequences \\ - $filter_list | \\ - gzip --no-name > ${sequences}${prefix}.${ext}.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def ext = "fa" - if ("$sequences" ==~ /.+\.fq|.+\.fq.gz|.+\.fastq|.+\.fastq.gz/) { - ext = "fq" - } - """ - echo "" | gzip > ${sequences}${prefix}.${ext}.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/seqtk/subseq/meta.yml b/modules/nf-core/seqtk/subseq/meta.yml deleted file mode 100644 index de4a841..0000000 --- a/modules/nf-core/seqtk/subseq/meta.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: seqtk_subseq -description: Select only sequences that match the filtering condition -keywords: - - filtering - - selection - - fastx -tools: - - seqtk: - description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format - homepage: https://github.com/lh3/seqtk - documentation: https://docs.csc.fi/apps/seqtk/ - tool_dev_url: https://github.com/lh3/seqtk - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test' ] - - sequences: - type: file - description: FASTQ/FASTA file - pattern: "*.{fq,fq.gz,fa,fa.gz}" - - filter_list: - type: file - description: BED file or a text file with a list of sequence names - pattern: "*.{bed,lst}" -output: - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - sequences: - type: file - description: FASTQ/FASTA file - pattern: "*.{fq.gz,fa.gz}" -authors: - - "@sidorov-si" -maintainers: - - "@sidorov-si" diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test b/modules/nf-core/seqtk/subseq/tests/main.nf.test deleted file mode 100644 index fa8fad6..0000000 --- a/modules/nf-core/seqtk/subseq/tests/main.nf.test +++ /dev/null @@ -1,59 +0,0 @@ -nextflow_process { - - name "Test Process SEQTK_SUBSEQ" - script "modules/nf-core/seqtk/subseq/main.nf" - process "SEQTK_SUBSEQ" - config "./standard.config" - - tag "modules" - tag "modules_nfcore" - tag "seqtk" - tag "seqtk/subseq" - - test("sarscov2_subseq_fa") { - - when { - process { - """ - input[0] = [ - [ id:'test' ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) - ] - input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("sarscov2_subseq_fa_stub") { - options "-stub" - when { - process { - """ - input[0] = [ - [ id:'test' ], - file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) - ] - input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed.gz', checkIfExists: true) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - -} diff --git a/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap 
b/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap deleted file mode 100644 index 75b3793..0000000 --- a/modules/nf-core/seqtk/subseq/tests/main.nf.test.snap +++ /dev/null @@ -1,60 +0,0 @@ -{ - "sarscov2_subseq_fa": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2" - ] - ], - "1": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ], - "sequences": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,31c95c4d686526cf002f6119bc55b2b2" - ] - ], - "versions": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ] - } - ], - "timestamp": "2024-02-22T15:56:36.155954" - }, - "sarscov2_subseq_fa_stub": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ], - "sequences": [ - [ - { - "id": "test" - }, - "genome.fasta.filtered.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "versions": [ - "versions.yml:md5,cd7682f4da748ef6d083c4a4656cc1e2" - ] - } - ], - "timestamp": "2024-02-22T15:56:44.222329" - } -} \ No newline at end of file diff --git a/modules/nf-core/seqtk/subseq/tests/standard.config b/modules/nf-core/seqtk/subseq/tests/standard.config deleted file mode 100644 index e8d7dc3..0000000 --- a/modules/nf-core/seqtk/subseq/tests/standard.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: SEQTK_SUBSEQ { - ext.prefix = { ".filtered" } - } -} \ No newline at end of file diff --git a/modules/nf-core/seqtk/subseq/tests/tags.yml b/modules/nf-core/seqtk/subseq/tests/tags.yml deleted file mode 100644 index 74056ba..0000000 --- a/modules/nf-core/seqtk/subseq/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -seqtk/subseq: - - "modules/nf-core/seqtk/subseq/**" diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index acb21fa..0f3972f 100644 --- a/subworkflows/local/filter_pacbio.nf +++ 
b/subworkflows/local/filter_pacbio.nf @@ -9,7 +9,7 @@ include { BLAST_BLASTN } from '../../modules/nf-core/blast/ include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq' include { SEQKIT_FQ2FA } from '../../modules/nf-core/seqkit/fq2fa' -include { SEQTK_SUBSEQ } from '../../modules/nf-core/seqtk/subseq' +include { BBMAP_FILTERBYNAME } from '../../modules/nf-core/bbmap/filterbyname/main' workflow FILTER_PACBIO { @@ -90,12 +90,12 @@ workflow FILTER_PACBIO { } | set { ch_reads_fastq } - SEQTK_SUBSEQ ( ch_reads_fastq.fastqs, ch_reads_fastq.lists ) - ch_versions = ch_versions.mix ( SEQTK_SUBSEQ.out.versions.first() ) + BBMAP_FILTERBYNAME ( ch_reads_fastq.fastqs, ch_reads_fastq.lists , "fastq", true) + ch_versions = ch_versions.mix ( BBMAP_FILTERBYNAME.out.versions.first() ) // Merge filtered outputs as ch_output_fastq - SEQTK_SUBSEQ.out.sequences + BBMAP_FILTERBYNAME.out.reads | concat ( SAMTOOLS_FILTERTOFASTQ.out.fastq ) | set { ch_filtered_fastq } From c687234c3afab37ae31f90210d65081dfc63895f Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 25 Sep 2024 09:39:46 +0100 Subject: [PATCH 15/29] Update CHANGELOG.md Co-authored-by: Matthieu Muffato --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3854131..3630362 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [[1.3.1](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye - [2024-09-24] +## [[1.3.1](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye (patch 1) - [2024-09-24] ### Enhancements & fixes From 168dbfc26b1dc5dd28f0f2a28d0526c80811333f Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 25 Sep 2024 09:40:07 +0100 Subject: [PATCH 16/29] Update conf/base.config Co-authored-by: Matthieu Muffato --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 3827aae..10ab224 100644 --- a/conf/base.config +++ b/conf/base.config @@ -43,7 +43,7 @@ process { // minimum 1GB memory withName: 'BBMAP_FILTERBYNAME' { - memory = { check_max( 1.GB, 'memory' ) } + memory = { check_max( 1.GB * task.attempt, 'memory' ) } } withName: 'SAMTOOLS_COLLATETOFASTA' { From 2f85112c90f5d5716496449cd8f9db3f6d481b40 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 25 Sep 2024 09:48:34 +0100 Subject: [PATCH 17/29] Update CITATIONS.md --- CITATIONS.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 4a33c7c..eb614af 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,9 @@ ## Pipeline tools +- [BBTools](http://sourceforge.net/projects/bbmap/) + > Bushnell B. BBTools software package. 2014. http://sourceforge.net/projects/bbmap/ + - [Blast](https://pubmed.ncbi.nlm.nih.gov/20003500/) > Camacho C, Coulouris G, Avagyan V, Ma N, Papadopoulos J, Bealer K, Madden TL. BLAST+: architecture and applications. BMC Bioinformatics. 2009 Dec 15;10:421. doi: 10.1186/1471-2105-10-421. PMID: 20003500; PMCID: PMC2803857. @@ -18,7 +21,7 @@ > Vasimuddin Md, Misra S, Li H, Aluru S. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. 2019 IEEE International Parallel and Distributed Processing Symposium. 2019 May;314–24. doi: 10.1109/IPDPS.2019.00041. 
-- [CRUMBLE] +- [CRUMBLE](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6330002/) > Bonfield JK, McCarthy SA, Durbin R. Crumble: reference free lossy compression of sequence quality values. Bioinformatics. 2019 Jan;35(2):337-339. doi: 10.1093/bioinformatics/bty608. PubMed PMID: 29992288; PMCID: PMC6330002. @@ -30,13 +33,10 @@ > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819. -- [SeqKit] +- [SeqKit](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5051824/) > Shen W, Le S, Li Y, Hu F. SeqKit: A cross-platform and ultrafast toolkit for FASTA/Q file manipulation. PLoS One. 2016 Oct 5;11(10):e0163962. doi: 10.1371/journal.pone.0163962. PubMed PMID: 27706213; PMCID: PMC5051824. -- [Seqtk] - - > Li H. Toolkit for processing sequences in FASTA/Q formats. GitHub Repository. 2012. https://github.com/lh3/seqtk. Accessed August 2024. ## Software packaging/containerisation tools From 485bf9deba4acb4674716ea08cca2c59f7b600b6 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 25 Sep 2024 09:54:11 +0100 Subject: [PATCH 18/29] prettier linting --- CITATIONS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATIONS.md b/CITATIONS.md index eb614af..c2313c7 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -11,6 +11,7 @@ ## Pipeline tools - [BBTools](http://sourceforge.net/projects/bbmap/) + > Bushnell B. BBTools software package. 2014. http://sourceforge.net/projects/bbmap/ - [Blast](https://pubmed.ncbi.nlm.nih.gov/20003500/) @@ -37,7 +38,6 @@ > Shen W, Le S, Li Y, Hu F. SeqKit: A cross-platform and ultrafast toolkit for FASTA/Q file manipulation. PLoS One. 2016 Oct 5;11(10):e0163962. doi: 10.1371/journal.pone.0163962. PubMed PMID: 27706213; PMCID: PMC5051824. 
- ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) From ab98d60557e124877e76ab416c4c9bc1b1c6fa09 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 25 Sep 2024 09:57:11 +0100 Subject: [PATCH 19/29] Update CHANGELOG.md --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3630362..d6c5fc0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed bug in handling CRAM HiC inputs introduced in 1.1.0 - Fixed bug in handling PacBio FASTQ inputs introduced in 1.3.0 +| Dependency | Old version | New version | +| ---------- | ------------- | ----------- | +| `bbtools` | | 39.01 | +| `seqtk` | 1.4 | | + ## [[1.3.0](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye - [2024-08-23] ### Enhancements & fixes From 54a11d2ce56c367a08fd2b0a0ba94f261671395e Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Wed, 25 Sep 2024 09:58:39 +0100 Subject: [PATCH 20/29] prettier linting --- CHANGELOG.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6c5fc0..087b1b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed bug in handling CRAM HiC inputs introduced in 1.1.0 - Fixed bug in handling PacBio FASTQ inputs introduced in 1.3.0 -| Dependency | Old version | New version | -| ---------- | ------------- | ----------- | -| `bbtools` | | 39.01 | -| `seqtk` | 1.4 | | +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| `bbtools` | | 39.01 | +| `seqtk` | 1.4 | | ## [[1.3.0](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye - [2024-08-23] From 2de5ac40f07039f9f1300f153a8c056361c6eb66 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Thu, 26 Sep 2024 
09:27:16 +0700 Subject: [PATCH 21/29] fix editorconfig --- conf/base.config | 2 +- subworkflows/local/align_pacbio.nf | 2 +- subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/conf/base.config b/conf/base.config index 95bf286..679ec14 100644 --- a/conf/base.config +++ b/conf/base.config @@ -122,7 +122,7 @@ process { withName: GENERATE_CRAM_CSV { cpus = { check_max( 4 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } } withName: CRUMBLE { diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 7474d45..cd42e63 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -52,7 +52,7 @@ workflow ALIGN_PACBIO { ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) // - // SUBWORKFLOW: mapping pacbio reads using minimap2 + // SUBWORKFLOW: mapping pacbio reads using minimap2 // MINIMAP2_MAPREDUCE ( fasta, diff --git a/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf b/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf index 92485e0..67a8254 100644 --- a/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_readmapping_pipeline/main.nf @@ -256,6 +256,7 @@ def toolCitationText() { // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ "Tools used in the workflow included:", + "BBtools (Bushnell 2014),", "blastn (Camacho et al. 2009),", "bwa-mem2 (Vasimuddin et al. 2019),", "Crumble (Bonfield et al. 2019),", @@ -270,6 +271,7 @@ def toolBibliographyText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ + "
  • Bushnell, B. (2014). BBtools software package. sourceforge.net/projects/bbmap.
  • ", "
  • Camacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K., & Madden, T.L. (2009). BLAST+: architecture and applications. BMC Bioinformatics, 10, 421. doi:10.1186/1471-2105-10-421.
  • ", "
  • Vasimuddin, Md., Misra, S., Li, H., & Aluru, S. (2019). Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. IEEE Parallel and Distributed Processing Symposium (IPDPS), 2019. doi:10.1109/IPDPS.2019.00041.
  • ", "
  • Bonfield, J.K., McCarthy, S.A., & Durbin, R. (2019). Crumble: reference free lossy compression of sequence quality values. Bioinformatics, 35(2), 337-339. doi:10.1093/bioinformatics/bty608.
  • ", From 9942a92744f65e29be75e3dcc606f7b5cc43f063 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Thu, 26 Sep 2024 09:44:18 +0700 Subject: [PATCH 22/29] update patch 1.3.1 --- .github/workflows/download_pipeline.yml | 88 ------------------------- .github/workflows/linting.yml | 2 +- CHANGELOG.md | 12 ++++ CITATIONS.md | 12 ++-- LICENSE | 2 +- conf/base.config | 2 +- nextflow.config | 2 +- 7 files changed, 22 insertions(+), 98 deletions(-) delete mode 100644 .github/workflows/download_pipeline.yml diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml deleted file mode 100644 index bd9f7bf..0000000 --- a/.github/workflows/download_pipeline.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: Test successful pipeline download with 'nf-core download' - -# Run the workflow when: -# - dispatched manually -# - when a PR is opened or reopened to master branch -# - the head branch of the pull request is updated, i.e. if fixes for a release are pushed last minute to dev. -on: - workflow_dispatch: - inputs: - testbranch: - description: "The specific branch you wish to utilize for the test execution of nf-core download." 
- required: true - default: "dev" - pull_request: - types: - - opened - - edited - - synchronize - branches: - - main - - dev - pull_request_target: - branches: - - main - - dev - -env: - NXF_ANSI_LOG: false - -jobs: - download: - runs-on: ubuntu-latest - steps: - - name: Install Nextflow - uses: nf-core/setup-nextflow@v2 - - - name: Disk space cleanup - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 - with: - python-version: "3.12" - architecture: "x64" - - uses: eWaterCycle/setup-singularity@931d4e31109e875b13309ae1d07c70ca8fbc8537 # v7 - with: - singularity-version: 3.8.3 - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install git+https://github.com/nf-core/tools.git@dev - - - name: Get the repository name and current branch set as environment variable - run: | - echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} - echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} - echo "REPO_BRANCH=${{ github.event.inputs.testbranch || 'dev' }}" >> ${GITHUB_ENV} - - - name: Download the pipeline - env: - NXF_SINGULARITY_CACHEDIR: ./ - run: | - nf-core download ${{ env.REPO_LOWERCASE }} \ - --revision ${{ env.REPO_BRANCH }} \ - --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ - --compress "none" \ - --container-system 'singularity' \ - --container-library "quay.io" -l "docker.io" -l "ghcr.io" \ - --container-cache-utilisation 'amend' \ - --download-configuration - - - name: Inspect download - run: tree ./${{ env.REPOTITLE_LOWERCASE }} - - - name: Run the downloaded pipeline (stub) - id: stub_run_pipeline - continue-on-error: true - env: - NXF_SINGULARITY_CACHEDIR: ./ - NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results - - name: Run the downloaded pipeline (stub 
run not supported) - id: run_pipeline - if: ${{ job.steps.stub_run_pipeline.status == failure() }} - env: - NXF_SINGULARITY_CACHEDIR: ./ - NXF_SINGULARITY_HOME_MOUNT: true - run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -profile test,singularity --outdir ./results diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 177172b..19ddb83 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -19,7 +19,7 @@ jobs: - uses: actions/setup-node@v3 - name: Install editorconfig-checker - run: npm install -g editorconfig-checker + run: npm install -g editorconfig-checker@3.0.2 - name: Run ECLint check run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') diff --git a/CHANGELOG.md b/CHANGELOG.md index ca9294f..087b1b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [[1.3.1](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye (patch 1) - [2024-09-24] + +### Enhancements & fixes + +- Fixed bug in handling CRAM HiC inputs introduced in 1.1.0 +- Fixed bug in handling PacBio FASTQ inputs introduced in 1.3.0 + +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| `bbtools` | | 39.01 | +| `seqtk` | 1.4 | | + ## [[1.3.0](https://github.com/sanger-tol/readmapping/releases/tag/1.3.0)] - Antipodean Opaleye - [2024-08-23] ### Enhancements & fixes diff --git a/CITATIONS.md b/CITATIONS.md index 4a33c7c..c2313c7 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,10 @@ ## Pipeline tools +- [BBTools](http://sourceforge.net/projects/bbmap/) + + > Bushnell B. BBTools software package. 2014. 
http://sourceforge.net/projects/bbmap/ + - [Blast](https://pubmed.ncbi.nlm.nih.gov/20003500/) > Camacho C, Coulouris G, Avagyan V, Ma N, Papadopoulos J, Bealer K, Madden TL. BLAST+: architecture and applications. BMC Bioinformatics. 2009 Dec 15;10:421. doi: 10.1186/1471-2105-10-421. PMID: 20003500; PMCID: PMC2803857. @@ -18,7 +22,7 @@ > Vasimuddin Md, Misra S, Li H, Aluru S. Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems. 2019 IEEE International Parallel and Distributed Processing Symposium. 2019 May;314–24. doi: 10.1109/IPDPS.2019.00041. -- [CRUMBLE] +- [CRUMBLE](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6330002/) > Bonfield JK, McCarthy SA, Durbin R. Crumble: reference free lossy compression of sequence quality values. Bioinformatics. 2019 Jan;35(2):337-339. doi: 10.1093/bioinformatics/bty608. PubMed PMID: 29992288; PMCID: PMC6330002. @@ -30,14 +34,10 @@ > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819. -- [SeqKit] +- [SeqKit](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5051824/) > Shen W, Le S, Li Y, Hu F. SeqKit: A cross-platform and ultrafast toolkit for FASTA/Q file manipulation. PLoS One. 2016 Oct 5;11(10):e0163962. doi: 10.1371/journal.pone.0163962. PubMed PMID: 27706213; PMCID: PMC5051824. -- [Seqtk] - - > Li H. Toolkit for processing sequences in FASTA/Q formats. GitHub Repository. 2012. https://github.com/lh3/seqtk. Accessed August 2024. - ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/LICENSE b/LICENSE index e238724..a9bcd4d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) @priyanka-surana +Copyright (c) 2022-2024 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/conf/base.config b/conf/base.config index 679ec14..0a733b9 100644 --- a/conf/base.config +++ b/conf/base.config @@ -43,7 +43,7 @@ process { // minimum 1GB memory withName: 'BBMAP_FILTERBYNAME' { - memory = { check_max( 1.GB, 'memory' ) } + memory = { check_max( 1.GB * task.attempt, 'memory' ) } } withName: 'SAMTOOLS_COLLATETOFASTA' { diff --git a/nextflow.config b/nextflow.config index d143247..536987e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -236,7 +236,7 @@ manifest { description = 'Pipeline to map reads generated using different sequencing technologies against a genome assembly.' mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.3.0' + version = '1.3.1' doi = '10.5281/zenodo.6563577' } From 9ab1daf918c43cf6dbee99616ebc5170628fec9a Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Thu, 26 Sep 2024 09:47:24 +0700 Subject: [PATCH 23/29] fix EC --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 0a733b9..f9d14f7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -122,7 +122,7 @@ process { withName: GENERATE_CRAM_CSV { cpus = { check_max( 4 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } } withName: CRUMBLE { From 1e76d933aa7279dc138359dba788ad49ea25fc42 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Thu, 26 Sep 2024 09:58:34 +0700 Subject: [PATCH 24/29] fix accidental commit --- seq_cache_populate.pl | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 seq_cache_populate.pl diff --git a/seq_cache_populate.pl b/seq_cache_populate.pl deleted file mode 100644 index e69de29..0000000 From 5a83851c78493b9aaed37a7efed5c14d6d41e630 Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: 
Tue, 1 Oct 2024 10:44:08 +0100 Subject: [PATCH 25/29] Change default to scale cpus --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index f9d14f7..2f1cef5 100644 --- a/conf/base.config +++ b/conf/base.config @@ -16,7 +16,7 @@ process { // pipeline to self-heal from MEMLIMIT/RUNLIMIT. // Default - cpus = 1 + cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 50.MB * task.attempt, 'memory' ) } time = { check_max( 30.min * task.attempt, 'time' ) } From 9df839130b3efdadba7e6f61e66d0ce2954e998e Mon Sep 17 00:00:00 2001 From: Tyler Chafin Date: Tue, 1 Oct 2024 10:51:36 +0100 Subject: [PATCH 26/29] Update LICENSE --- LICENSE | 2 ++ 1 file changed, 2 insertions(+) diff --git a/LICENSE b/LICENSE index a9bcd4d..257404f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,8 @@ MIT License Copyright (c) 2022-2024 Genome Research Ltd. +except `bin/filter_five_end.pl`: +Copyright (c) 2017 Arima Genomics, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 5e6f2d0898ab0a3be2a90b8051db09f27f98443a Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Wed, 2 Oct 2024 00:13:33 +0700 Subject: [PATCH 27/29] remove the warning and the config entries for the older aligner calls --- conf/modules.config | 19 ------------------- subworkflows/local/align_ont.nf | 2 +- subworkflows/local/align_pacbio.nf | 2 +- subworkflows/local/bwamem2_mapreduce.nf | 2 +- subworkflows/local/minimap2_mapreduce.nf | 2 +- 5 files changed, 4 insertions(+), 23 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 342caca..6244108 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -50,14 +50,6 @@ process { ext.args = "--output-fmt cram" } - withName: '.*:.*:ALIGN_HIC:BWAMEM2_MEM' { - ext.args = { "-5SPCp -R ${meta.read_group}" } - } - - withName: '.*:.*:ALIGN_ILLUMINA:BWAMEM2_MEM' { - 
ext.args = { "-p -R ${meta.read_group}" } - } - withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" ext.args1 = { "-F 0x200 -nt" } @@ -99,17 +91,6 @@ process { // NOTE: minimap2 uses the decimal system ! 1G = 1,000,000,000 bp // NOTE: Math.ceil returns a double, but fortunately minimap2 accepts floating point values. // NOTE: minimap2 2.25 raises the default to 8G, which means higher memory savings on smaller genomes - withName: '.*:.*:ALIGN_HIFI:MINIMAP2_ALIGN' { - ext.args = { "-ax map-hifi --cs=short -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } - } - - withName: '.*:.*:ALIGN_CLR:MINIMAP2_ALIGN' { - ext.args = { "-ax map-pb -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } - } - - withName: '.*:.*:ALIGN_ONT:MINIMAP2_ALIGN' { - ext.args = { "-ax map-ont -R ${meta.read_group} -I" + Math.ceil(meta2.genome_size/1e9) + 'G' } - } withName: ".*:ALIGN_HIFI:.*:CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT" { ext.args = "" diff --git a/subworkflows/local/align_ont.nf b/subworkflows/local/align_ont.nf index f1e3465..751fcbc 100644 --- a/subworkflows/local/align_ont.nf +++ b/subworkflows/local/align_ont.nf @@ -77,7 +77,7 @@ workflow ALIGN_ONT { // Merge, but only if there is more than 1 file SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) // Convert merged BAM to CRAM and calculate indices and statistics diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index cd42e63..59e039c 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -81,7 +81,7 @@ workflow ALIGN_PACBIO { // Merge, but only if there is more than 1 file SAMTOOLS_MERGE ( ch_bams.multi_bams, [ [], [] ], [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + ch_versions = ch_versions.mix ( 
SAMTOOLS_MERGE.out.versions ) // Convert merged BAM to CRAM and calculate indices and statistics diff --git a/subworkflows/local/bwamem2_mapreduce.nf b/subworkflows/local/bwamem2_mapreduce.nf index 13711fb..572c9d5 100644 --- a/subworkflows/local/bwamem2_mapreduce.nf +++ b/subworkflows/local/bwamem2_mapreduce.nf @@ -76,7 +76,7 @@ workflow BWAMEM2_MAPREDUCE { fasta, [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) emit: diff --git a/subworkflows/local/minimap2_mapreduce.nf b/subworkflows/local/minimap2_mapreduce.nf index 7503e02..7f6bb43 100644 --- a/subworkflows/local/minimap2_mapreduce.nf +++ b/subworkflows/local/minimap2_mapreduce.nf @@ -93,7 +93,7 @@ workflow MINIMAP2_MAPREDUCE { fasta, [ [], [] ] ) - ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions.first() ) + ch_versions = ch_versions.mix ( SAMTOOLS_MERGE.out.versions ) emit: From 83dd8cda67d5ca4924b70d7a58dd560dc83d7e66 Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 15 Oct 2024 02:54:04 -0700 Subject: [PATCH 28/29] add chunking before filtering for PacBio --- conf/modules.config | 26 +++++++- modules/local/cram_filter.nf | 46 ++++++++++++++ subworkflows/local/align_pacbio.nf | 47 +++++++++----- .../local/create_cram_filter_input.nf | 40 ++++++++++++ subworkflows/local/filter_pacbio.nf | 61 ++++--------------- 5 files changed, 154 insertions(+), 66 deletions(-) create mode 100644 modules/local/cram_filter.nf create mode 100644 subworkflows/local/create_cram_filter_input.nf diff --git a/conf/modules.config b/conf/modules.config index 6244108..ee8c2b7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,21 +35,45 @@ process { withName: SAMTOOLS_COLLATETOFASTA { beforeScript = { "export REF_PATH=spoof"} ext.args = { (params.use_work_dir_as_temp ? "-T." 
: "") } + ext.prefix = { "${meta.chunk_id}" } + } + + withName: SAMTOOLS_FILTERTOFASTQ { + ext.prefix = { "${meta.chunk_id}" } } withName: BLAST_BLASTN { ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' + ext.prefix = { "${meta.chunk_id}" } + } + + withName: PACBIO_FILTER { + ext.prefix = { "${meta.chunk_id}" } } withName: SAMTOOLS_CONVERT { beforeScript = { "export REF_PATH=spoof"} - ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" + ext.args = "--output-fmt bam --write-index" + ext.prefix = { "${meta.chunk_id}" } } withName: CONVERT_CRAM { ext.args = "--output-fmt cram" } + withName: CONVERT_FQ_CRAM { + ext.args = "--output-fmt cram" + ext.prefix = { "${meta.chunk_id}" } + } + + withName: SAMTOOLS_INDEX_FQ { + ext.prefix = { "${meta.chunk_id}" } + } + + withName: GENERATE_CRAM_CSV_FQ { + ext.prefix = { "${meta.chunk_id}" } + } + withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" ext.args1 = { "-F 0x200 -nt" } diff --git a/modules/local/cram_filter.nf b/modules/local/cram_filter.nf new file mode 100644 index 0000000..9a39fc3 --- /dev/null +++ b/modules/local/cram_filter.nf @@ -0,0 +1,46 @@ +process CRAM_FILTER { + tag "$meta.chunk_id" + label "process_high" + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' : + 'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }" + + input: + tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), path(reference) + + output: + tuple val(meta), path("*.cram"), emit: cram + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "1.15" // Staden_io versions break the pipeline + """ + cram_filter -n ${from}-${to} ${cramfile} ${prefix}_${base}_${chunkid}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + staden_io: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def base = "45022_3#2" + def chunkid = "1" + """ + touch ${prefix}_${base}_${chunkid}_filtered.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) + staden_io: $VERSION + END_VERSIONS + """ +} \ No newline at end of file diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index 59e039c..a3edcf7 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -5,11 +5,14 @@ include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg' include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_FQ } from '../../modules/nf-core/samtools/index/main' include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' +include { 
GENERATE_CRAM_CSV as GENERATE_CRAM_CSV_FQ } from '../../modules/local/generate_cram_csv' include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce' include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' +include { SAMTOOLS_SORMADUP as CONVERT_FQ_CRAM } from '../../modules/local/samtools_sormadup' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' - +include { CREATE_CRAM_FILTER_INPUT } from '../../subworkflows/local/create_cram_filter_input' workflow ALIGN_PACBIO { take: @@ -22,41 +25,55 @@ workflow ALIGN_PACBIO { ch_versions = Channel.empty() ch_merged_bam = Channel.empty() - // Filter BAM and output as FASTQ - FILTER_PACBIO ( reads, db ) - ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) - - // Convert FASTQ to CRAM - CONVERT_CRAM ( FILTER_PACBIO.out.fastq, fasta ) + // Convert input to CRAM + CONVERT_CRAM ( reads, fasta ) ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam ) ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions ) - SAMTOOLS_ADDREPLACERG.out.cram - | set { ch_reads_cram } - // Index the CRAM file - SAMTOOLS_INDEX ( ch_reads_cram ) + SAMTOOLS_INDEX ( SAMTOOLS_ADDREPLACERG.out.cram ) ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) - ch_reads_cram + SAMTOOLS_ADDREPLACERG.out.cram | join ( SAMTOOLS_INDEX.out.crai ) + | set { ch_reads_cram } + + GENERATE_CRAM_CSV( ch_reads_cram ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + + CREATE_CRAM_FILTER_INPUT ( GENERATE_CRAM_CSV.out.csv, fasta ) + ch_versions = ch_versions.mix( CREATE_CRAM_FILTER_INPUT.out.versions ) + + // Filter BAM and output as FASTQ + FILTER_PACBIO ( CREATE_CRAM_FILTER_INPUT.out.chunked_cram, db ) + ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) + + // Convert FASTQ to CRAM + CONVERT_FQ_CRAM ( FILTER_PACBIO.out.fastq, fasta ) + ch_versions = ch_versions.mix ( 
CONVERT_FQ_CRAM.out.versions ) + + SAMTOOLS_INDEX_FQ ( CONVERT_FQ_CRAM.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX_FQ.out.versions ) + + CONVERT_FQ_CRAM.out.bam + | join ( SAMTOOLS_INDEX_FQ.out.crai ) | set { ch_reads_cram_crai } // // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT // - GENERATE_CRAM_CSV( ch_reads_cram_crai ) - ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) + GENERATE_CRAM_CSV_FQ( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV_FQ.out.versions ) // // SUBWORKFLOW: mapping pacbio reads using minimap2 // MINIMAP2_MAPREDUCE ( fasta, - GENERATE_CRAM_CSV.out.csv + GENERATE_CRAM_CSV_FQ.out.csv ) ch_versions = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions ) ch_merged_bam = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam) diff --git a/subworkflows/local/create_cram_filter_input.nf b/subworkflows/local/create_cram_filter_input.nf new file mode 100644 index 0000000..f4d4d0f --- /dev/null +++ b/subworkflows/local/create_cram_filter_input.nf @@ -0,0 +1,40 @@ +include { CRAM_FILTER } from '../../modules/local/cram_filter' + +workflow CREATE_CRAM_FILTER_INPUT { + take: + csv_ch + fasta + + main: + ch_versions = Channel.empty() + + // Generate input channel for CRAM_FILTER + csv_ch + |splitCsv() + |combine(fasta) + |map { cram_id, cram_info, ref_id, ref_dir -> + tuple([ + id: cram_id.id, + chunk_id: cram_id.id + "_" + cram_info[5], + genome_size: ref_id.genome_size, + read_count: cram_id.read_count + ], + file(cram_info[0]), + cram_info[1], + cram_info[2], + cram_info[3], + cram_info[4], + cram_info[5], + cram_info[6], + ref_dir + ) + } + | set { ch_cram_filter_input } + + CRAM_FILTER(ch_cram_filter_input) + ch_versions = ch_versions.mix(CRAM_FILTER.out.versions) + + emit: + chunked_cram = CRAM_FILTER.out.cram + versions = ch_versions +} \ No newline at end of file diff --git a/subworkflows/local/filter_pacbio.nf 
b/subworkflows/local/filter_pacbio.nf index 5edb338..f8f50de 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -9,7 +9,7 @@ include { BLAST_BLASTN } from '../../modules/nf-core/blast/ include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq' include { SEQKIT_FQ2FA } from '../../modules/nf-core/seqkit/fq2fa' -include { BBMAP_FILTERBYNAME } from '../../modules/nf-core/bbmap/filterbyname' +include { BBMAP_FILTERBYNAME } from '../../modules/nf-core/bbmap/filterbyname' workflow FILTER_PACBIO { @@ -17,55 +17,32 @@ workflow FILTER_PACBIO { reads // channel: [ val(meta), /path/to/datafile ] db // channel: /path/to/vector_db - main: ch_versions = Channel.empty() - - // Check file types and branch + // Convert from PacBio CRAM to BAM reads - | branch { - meta, reads -> - fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ } - bam : true - } - | set { ch_reads } - - - // Convert from PacBio BAM to Samtools BAM - ch_reads.bam - | map { meta, bam -> [ meta, bam, [] ] } + | map { meta, cram -> [ meta, cram, [] ] } | set { ch_pacbio } SAMTOOLS_CONVERT ( ch_pacbio, [ [], [] ], [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions.first() ) - + ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions ) // Collate BAM file to create interleaved FASTA SAMTOOLS_COLLATETOFASTA ( SAMTOOLS_CONVERT.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() ) - - - // Convert FASTQ to FASTA using SEQKIT_FQ2FA - SEQKIT_FQ2FA ( ch_reads.fastq ) - ch_versions = ch_versions.mix ( SEQKIT_FQ2FA.out.versions.first() ) + ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions ) - - // Combine BAM-derived FASTA with converted FASTQ inputs + // Combine BAM-derived FASTA SAMTOOLS_COLLATETOFASTA.out.fasta - | concat( SEQKIT_FQ2FA.out.fasta ) | set { ch_fasta } - // Nucleotide BLAST 
BLAST_BLASTN ( ch_fasta, db ) - ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) - + ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions ) // Filter BLAST output PACBIO_FILTER ( BLAST_BLASTN.out.txt ) - ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() ) - + ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions ) // Filter the input BAM and output as interleaved FASTA SAMTOOLS_CONVERT.out.bam @@ -78,29 +55,13 @@ workflow FILTER_PACBIO { | set { ch_bam_reads } SAMTOOLS_FILTERTOFASTQ ( ch_bam_reads.bams, ch_bam_reads.lists ) - ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() ) - - - // Filter inputs provided as FASTQ and output as interleaved FASTQ - ch_reads.fastq - | join(PACBIO_FILTER.out.list) - | multiMap { meta, fastq, list -> \ - fastqs: [meta, fastq] - lists: list - } - | set { ch_reads_fastq } - - BBMAP_FILTERBYNAME ( ch_reads_fastq.fastqs, ch_reads_fastq.lists , "fastq", true) - ch_versions = ch_versions.mix ( BBMAP_FILTERBYNAME.out.versions.first() ) - + ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions ) // Merge filtered outputs as ch_output_fastq - BBMAP_FILTERBYNAME.out.reads - | concat ( SAMTOOLS_FILTERTOFASTQ.out.fastq ) + SAMTOOLS_FILTERTOFASTQ.out.fastq | set { ch_filtered_fastq } - emit: fastq = ch_filtered_fastq // channel: [ meta, /path/to/fastq ] versions = ch_versions // channel: [ versions.yml ] -} +} \ No newline at end of file From 3cd2c01e1e7fc8677fc9eda23b17eb9676e5afde Mon Sep 17 00:00:00 2001 From: reichan1998 Date: Tue, 15 Oct 2024 06:59:53 -0700 Subject: [PATCH 29/29] Revert "add chunking before filtering for PacBio" This reverts commit 83dd8cda67d5ca4924b70d7a58dd560dc83d7e66. 
--- conf/modules.config | 26 +------- modules/local/cram_filter.nf | 46 -------------- subworkflows/local/align_pacbio.nf | 47 +++++--------- .../local/create_cram_filter_input.nf | 40 ------------ subworkflows/local/filter_pacbio.nf | 61 +++++++++++++++---- 5 files changed, 66 insertions(+), 154 deletions(-) delete mode 100644 modules/local/cram_filter.nf delete mode 100644 subworkflows/local/create_cram_filter_input.nf diff --git a/conf/modules.config b/conf/modules.config index ee8c2b7..6244108 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,45 +35,21 @@ process { withName: SAMTOOLS_COLLATETOFASTA { beforeScript = { "export REF_PATH=spoof"} ext.args = { (params.use_work_dir_as_temp ? "-T." : "") } - ext.prefix = { "${meta.chunk_id}" } - } - - withName: SAMTOOLS_FILTERTOFASTQ { - ext.prefix = { "${meta.chunk_id}" } } withName: BLAST_BLASTN { ext.args = '-task blastn -reward 1 -penalty -5 -gapopen 3 -gapextend 3 -dust yes -soft_masking true -evalue .01 -searchsp 1750000000000 -outfmt 6' - ext.prefix = { "${meta.chunk_id}" } - } - - withName: PACBIO_FILTER { - ext.prefix = { "${meta.chunk_id}" } } withName: SAMTOOLS_CONVERT { beforeScript = { "export REF_PATH=spoof"} - ext.args = "--output-fmt bam --write-index" - ext.prefix = { "${meta.chunk_id}" } + ext.args = "-be '[rq]>=0.99' -x fi -x fp -x ri -x rp --write-index" } withName: CONVERT_CRAM { ext.args = "--output-fmt cram" } - withName: CONVERT_FQ_CRAM { - ext.args = "--output-fmt cram" - ext.prefix = { "${meta.chunk_id}" } - } - - withName: SAMTOOLS_INDEX_FQ { - ext.prefix = { "${meta.chunk_id}" } - } - - withName: GENERATE_CRAM_CSV_FQ { - ext.prefix = { "${meta.chunk_id}" } - } - withName: ".*:ALIGN_ILLUMINA:.*:CRAM_FILTER_ALIGN_BWAMEM2_FIXMATE_SORT" { ext.args = "" ext.args1 = { "-F 0x200 -nt" } diff --git a/modules/local/cram_filter.nf b/modules/local/cram_filter.nf deleted file mode 100644 index 9a39fc3..0000000 --- a/modules/local/cram_filter.nf +++ /dev/null @@ -1,46 +0,0 @@ -process 
CRAM_FILTER { - tag "$meta.chunk_id" - label "process_high" - - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' : - 'biocontainers/mulled-v2-1a6fe65bd6674daba65066aa796ed8f5e8b4687b:688e175eb0db54de17822ba7810cc9e20fa06dd5-0' }" - - input: - tuple val(meta), path(cramfile), path(cramindex), val(from), val(to), val(base), val(chunkid), val(rglines), path(reference) - - output: - tuple val(meta), path("*.cram"), emit: cram - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "1.15" // Staden_io versions break the pipeline - """ - cram_filter -n ${from}-${to} ${cramfile} ${prefix}_${base}_${chunkid}.cram - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) - staden_io: $VERSION - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def base = "45022_3#2" - def chunkid = "1" - """ - touch ${prefix}_${base}_${chunkid}_filtered.cram - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' ) - staden_io: $VERSION - END_VERSIONS - """ -} \ No newline at end of file diff --git a/subworkflows/local/align_pacbio.nf b/subworkflows/local/align_pacbio.nf index a3edcf7..59e039c 100644 --- a/subworkflows/local/align_pacbio.nf +++ b/subworkflows/local/align_pacbio.nf @@ -5,14 +5,11 @@ include { FILTER_PACBIO } from '../../subworkflows/local/filter_pacbio' include { SAMTOOLS_ADDREPLACERG } from '../../modules/local/samtools_addreplacerg' include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' -include { 
SAMTOOLS_INDEX as SAMTOOLS_INDEX_FQ } from '../../modules/nf-core/samtools/index/main' include { GENERATE_CRAM_CSV } from '../../modules/local/generate_cram_csv' -include { GENERATE_CRAM_CSV as GENERATE_CRAM_CSV_FQ } from '../../modules/local/generate_cram_csv' include { MINIMAP2_MAPREDUCE } from '../../subworkflows/local/minimap2_mapreduce' include { SAMTOOLS_SORMADUP as CONVERT_CRAM } from '../../modules/local/samtools_sormadup' -include { SAMTOOLS_SORMADUP as CONVERT_FQ_CRAM } from '../../modules/local/samtools_sormadup' include { SAMTOOLS_MERGE } from '../../modules/nf-core/samtools/merge/main' -include { CREATE_CRAM_FILTER_INPUT } from '../../subworkflows/local/create_cram_filter_input' + workflow ALIGN_PACBIO { take: @@ -25,55 +22,41 @@ workflow ALIGN_PACBIO { ch_versions = Channel.empty() ch_merged_bam = Channel.empty() - // Convert input to CRAM - CONVERT_CRAM ( reads, fasta ) + // Filter BAM and output as FASTQ + FILTER_PACBIO ( reads, db ) + ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) + + // Convert FASTQ to CRAM + CONVERT_CRAM ( FILTER_PACBIO.out.fastq, fasta ) ch_versions = ch_versions.mix ( CONVERT_CRAM.out.versions ) SAMTOOLS_ADDREPLACERG ( CONVERT_CRAM.out.bam ) ch_versions = ch_versions.mix ( SAMTOOLS_ADDREPLACERG.out.versions ) - // Index the CRAM file - SAMTOOLS_INDEX ( SAMTOOLS_ADDREPLACERG.out.cram ) - ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) - SAMTOOLS_ADDREPLACERG.out.cram - | join ( SAMTOOLS_INDEX.out.crai ) | set { ch_reads_cram } - GENERATE_CRAM_CSV( ch_reads_cram ) - ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) - - CREATE_CRAM_FILTER_INPUT ( GENERATE_CRAM_CSV.out.csv, fasta ) - ch_versions = ch_versions.mix( CREATE_CRAM_FILTER_INPUT.out.versions ) - - // Filter BAM and output as FASTQ - FILTER_PACBIO ( CREATE_CRAM_FILTER_INPUT.out.chunked_cram, db ) - ch_versions = ch_versions.mix ( FILTER_PACBIO.out.versions ) - - // Convert FASTQ to CRAM - CONVERT_FQ_CRAM ( FILTER_PACBIO.out.fastq, 
fasta ) - ch_versions = ch_versions.mix ( CONVERT_FQ_CRAM.out.versions ) - - SAMTOOLS_INDEX_FQ ( CONVERT_FQ_CRAM.out.bam ) - ch_versions = ch_versions.mix( SAMTOOLS_INDEX_FQ.out.versions ) + // Index the CRAM file + SAMTOOLS_INDEX ( ch_reads_cram ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions ) - CONVERT_FQ_CRAM.out.bam - | join ( SAMTOOLS_INDEX_FQ.out.crai ) + ch_reads_cram + | join ( SAMTOOLS_INDEX.out.crai ) | set { ch_reads_cram_crai } // // MODULE: generate a CRAM CSV file containing the required parametres for CRAM_FILTER_MINIMAP2_FILTER5END_FIXMATE_SORT // - GENERATE_CRAM_CSV_FQ( ch_reads_cram_crai ) - ch_versions = ch_versions.mix( GENERATE_CRAM_CSV_FQ.out.versions ) + GENERATE_CRAM_CSV( ch_reads_cram_crai ) + ch_versions = ch_versions.mix( GENERATE_CRAM_CSV.out.versions ) // // SUBWORKFLOW: mapping pacbio reads using minimap2 // MINIMAP2_MAPREDUCE ( fasta, - GENERATE_CRAM_CSV_FQ.out.csv + GENERATE_CRAM_CSV.out.csv ) ch_versions = ch_versions.mix( MINIMAP2_MAPREDUCE.out.versions ) ch_merged_bam = ch_merged_bam.mix(MINIMAP2_MAPREDUCE.out.mergedbam) diff --git a/subworkflows/local/create_cram_filter_input.nf b/subworkflows/local/create_cram_filter_input.nf deleted file mode 100644 index f4d4d0f..0000000 --- a/subworkflows/local/create_cram_filter_input.nf +++ /dev/null @@ -1,40 +0,0 @@ -include { CRAM_FILTER } from '../../modules/local/cram_filter' - -workflow CREATE_CRAM_FILTER_INPUT { - take: - csv_ch - fasta - - main: - ch_versions = Channel.empty() - - // Generate input channel for CRAM_FILTER - csv_ch - |splitCsv() - |combine(fasta) - |map { cram_id, cram_info, ref_id, ref_dir -> - tuple([ - id: cram_id.id, - chunk_id: cram_id.id + "_" + cram_info[5], - genome_size: ref_id.genome_size, - read_count: cram_id.read_count - ], - file(cram_info[0]), - cram_info[1], - cram_info[2], - cram_info[3], - cram_info[4], - cram_info[5], - cram_info[6], - ref_dir - ) - } - | set { ch_cram_filter_input } - - CRAM_FILTER(ch_cram_filter_input) - 
ch_versions = ch_versions.mix(CRAM_FILTER.out.versions) - - emit: - chunked_cram = CRAM_FILTER.out.cram - versions = ch_versions -} \ No newline at end of file diff --git a/subworkflows/local/filter_pacbio.nf b/subworkflows/local/filter_pacbio.nf index f8f50de..5edb338 100644 --- a/subworkflows/local/filter_pacbio.nf +++ b/subworkflows/local/filter_pacbio.nf @@ -9,7 +9,7 @@ include { BLAST_BLASTN } from '../../modules/nf-core/blast/ include { PACBIO_FILTER } from '../../modules/local/pacbio_filter' include { SAMTOOLS_FILTERTOFASTQ } from '../../modules/local/samtools_filtertofastq' include { SEQKIT_FQ2FA } from '../../modules/nf-core/seqkit/fq2fa' -include { BBMAP_FILTERBYNAME } from '../../modules/nf-core/bbmap/filterbyname' +include { BBMAP_FILTERBYNAME } from '../../modules/nf-core/bbmap/filterbyname' workflow FILTER_PACBIO { @@ -17,32 +17,55 @@ workflow FILTER_PACBIO { reads // channel: [ val(meta), /path/to/datafile ] db // channel: /path/to/vector_db + main: ch_versions = Channel.empty() - // Convert from PacBio CRAM to BAM + + // Check file types and branch reads - | map { meta, cram -> [ meta, cram, [] ] } + | branch { + meta, reads -> + fastq : reads.findAll { it.getName().toLowerCase() =~ /.*f.*\.gz/ } + bam : true + } + | set { ch_reads } + + + // Convert from PacBio BAM to Samtools BAM + ch_reads.bam + | map { meta, bam -> [ meta, bam, [] ] } | set { ch_pacbio } SAMTOOLS_CONVERT ( ch_pacbio, [ [], [] ], [] ) - ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions ) + ch_versions = ch_versions.mix ( SAMTOOLS_CONVERT.out.versions.first() ) + // Collate BAM file to create interleaved FASTA SAMTOOLS_COLLATETOFASTA ( SAMTOOLS_CONVERT.out.bam ) - ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions ) + ch_versions = ch_versions.mix ( SAMTOOLS_COLLATETOFASTA.out.versions.first() ) + + + // Convert FASTQ to FASTA using SEQKIT_FQ2FA + SEQKIT_FQ2FA ( ch_reads.fastq ) + ch_versions = ch_versions.mix ( SEQKIT_FQ2FA.out.versions.first() ) - 
// Combine BAM-derived FASTA + + // Combine BAM-derived FASTA with converted FASTQ inputs SAMTOOLS_COLLATETOFASTA.out.fasta + | concat( SEQKIT_FQ2FA.out.fasta ) | set { ch_fasta } + // Nucleotide BLAST BLAST_BLASTN ( ch_fasta, db ) - ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions ) + ch_versions = ch_versions.mix ( BLAST_BLASTN.out.versions.first() ) + // Filter BLAST output PACBIO_FILTER ( BLAST_BLASTN.out.txt ) - ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions ) + ch_versions = ch_versions.mix ( PACBIO_FILTER.out.versions.first() ) + // Filter the input BAM and output as interleaved FASTA SAMTOOLS_CONVERT.out.bam @@ -55,13 +78,29 @@ workflow FILTER_PACBIO { | set { ch_bam_reads } SAMTOOLS_FILTERTOFASTQ ( ch_bam_reads.bams, ch_bam_reads.lists ) - ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions ) + ch_versions = ch_versions.mix ( SAMTOOLS_FILTERTOFASTQ.out.versions.first() ) + + + // Filter inputs provided as FASTQ and output as interleaved FASTQ + ch_reads.fastq + | join(PACBIO_FILTER.out.list) + | multiMap { meta, fastq, list -> \ + fastqs: [meta, fastq] + lists: list + } + | set { ch_reads_fastq } + + BBMAP_FILTERBYNAME ( ch_reads_fastq.fastqs, ch_reads_fastq.lists , "fastq", true) + ch_versions = ch_versions.mix ( BBMAP_FILTERBYNAME.out.versions.first() ) + // Merge filtered outputs as ch_output_fastq - SAMTOOLS_FILTERTOFASTQ.out.fastq + BBMAP_FILTERBYNAME.out.reads + | concat ( SAMTOOLS_FILTERTOFASTQ.out.fastq ) | set { ch_filtered_fastq } + emit: fastq = ch_filtered_fastq // channel: [ meta, /path/to/fastq ] versions = ch_versions // channel: [ versions.yml ] -} \ No newline at end of file +}