diff --git a/CHANGELOG.md b/CHANGELOG.md
index c719dcf6d..9f9006d7a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,10 +3,12 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v3.0.0dev - [date]
+## v3.0.0dev - [2023-08-25]
 
 ### `Added`
 
+- [#1006](https://github.com/nf-core/eager/issues/1006) Added feature to shard fastqs before mapping, allowing more flexibility in parallelisation of mapping.
+
 ### `Fixed`
 
 ### `Dependencies`
diff --git a/CITATIONS.md b/CITATIONS.md
index c7f6bc96b..c0f3d40ca 100644
--- a/CITATIONS.md
+++ b/CITATIONS.md
@@ -34,6 +34,10 @@
 
   > Broad Institute (2019). Picard Toolkit. GitHub Repository: https://broadinstitute.github.io/picard/
 
+- [SeqKit](https://bioinf.shenwei.me/seqkit/)
+
+  > Shen, W., Le, S., Li, Y., & Hu, F. (2016). SeqKit: A Cross-Platform and Ultrafast Toolkit for FASTA/Q File Manipulation. PLOS ONE, 11(10), e0163962. doi:[10.1371/journal.pone.0163962](https://doi.org/10.1371/journal.pone.0163962)
+
 - [bwa](https://doi.org/10.1093/bioinformatics/btp324)
 
   > Li, H., & Durbin, R. (2009). Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics , 25(14), 1754–1760. doi: [10.1093/bioinformatics/btp324](https://doi.org/10.1093/bioinformatics/btp324)
diff --git a/conf/modules.config b/conf/modules.config
index 1c8237b4e..19cc03080 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -371,6 +371,18 @@ process {
         ]
     }
 
+    //
+    // SHARDING FASTQS
+    //
+    withName: SEQKIT_SPLIT2 {
+        tag = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
+        ext.prefix = "out"
+        ext.args = { "-s ${params.fastq_shard_size}" }
+        publishDir = [
+            enabled: false
+        ]
+    }
+
     //
     // READ MAPPING
     //
@@ -454,6 +466,7 @@ process {
         publishDir = [
             enabled: false
         ]
+        ext.args = { params.run_fastq_sharding ? "-c -p" : "" }
     }
 
     withName: SAMTOOLS_SORT_MERGED_LANES {
diff --git a/conf/test.config b/conf/test.config
index b3c71f28a..41c35735a 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -27,6 +27,10 @@ params {
     // Genome references
     fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Mammoth/Mammoth_MT_Krause.fasta'
 
+    // Sharding FASTQ
+    run_fastq_sharding = true
+    fastq_shard_size = 5000
+
     // BAM filtering
     run_bamfiltering = true
     bamfiltering_minreadlength = 30
diff --git a/modules.json b/modules.json
index e58d050d6..6af228714 100644
--- a/modules.json
+++ b/modules.json
@@ -204,6 +204,11 @@
                         "branch": "master",
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules", "bam_split_by_region"]
+                    },
+                    "seqkit/split2": {
+                        "branch": "master",
+                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "installed_by": ["modules"]
                     }
                 }
             },
diff --git a/modules/nf-core/seqkit/split2/main.nf b/modules/nf-core/seqkit/split2/main.nf
new file mode 100644
index 000000000..0cd092628
--- /dev/null
+++ b/modules/nf-core/seqkit/split2/main.nf
@@ -0,0 +1,53 @@
+process SEQKIT_SPLIT2 {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::seqkit=2.1.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/seqkit:2.1.0--h9ee0642_0' :
+        'biocontainers/seqkit:2.1.0--h9ee0642_0' }"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("**/*.gz"), emit: reads
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if(meta.single_end){
+        """
+        seqkit \\
+            split2 \\
+            $args \\
+            --threads $task.cpus \\
+            $reads \\
+            --out-dir ${prefix}
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+        END_VERSIONS
+        """
+    } else {
+        """
+        seqkit \\
+            split2 \\
+            $args \\
+            --threads $task.cpus \\
+            --read1 ${reads[0]} \\
+            --read2 ${reads[1]} \\
+            --out-dir ${prefix}
+
+        cat <<-END_VERSIONS > versions.yml
+        "${task.process}":
+            seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//')
+        END_VERSIONS
+        """
+    }
+}
diff --git a/modules/nf-core/seqkit/split2/meta.yml b/modules/nf-core/seqkit/split2/meta.yml
new file mode 100644
index 000000000..ac386066f
--- /dev/null
+++ b/modules/nf-core/seqkit/split2/meta.yml
@@ -0,0 +1,39 @@
+name: seqkit_split2
+description: Split single or paired-end fastq.gz files
+keywords:
+  - split
+  - fastq
+tools:
+  - seqkit:
+      description: |
+        Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, written by Wei Shen.
+      homepage: https://github.com/shenwei356/seqkit
+      documentation: https://bioinf.shenwei.me/seqkit/
+      doi: 10.1371/journal.pone.0163962
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: FastQ files
+      pattern: "*.{fq.gz/fastq.gz}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: Split fastq files
+      pattern: "*.{fq.gz/fastq.gz}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@FriederikeHanssen"
diff --git a/nextflow.config b/nextflow.config
index 9129060b1..90fa73892 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -31,6 +31,10 @@ params {
     max_multiqc_email_size = '25.MB'
     multiqc_methods_description = null
 
+    // Shard Fastq options
+    run_fastq_sharding = false
+    fastq_shard_size = 1000000
+
     // bedtools options
     run_bedtools_coverage = false
     mapstats_bedtools_featurefile = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index a54706256..db9ba0805 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -463,6 +463,19 @@
             "description": "Options for aligning reads against reference genome(s)",
             "default": "",
             "properties": {
+                "run_fastq_sharding": {
+                    "type": "boolean",
+                    "description": "Turn on FastQ sharding.",
+                    "fa_icon": "fas fa-power-off",
+                    "help_text": "Sharding will split the FastQs into smaller chunks before mapping. These chunks are then mapped in parallel. This approach can speed up the mapping process for larger FastQ files."
+                },
+                "fastq_shard_size": {
+                    "type": "integer",
+                    "default": 1000000,
+                    "description": "Specify the number of reads in each shard when splitting.",
+                    "fa_icon": "fas fa-arrows-alt-v",
+                    "help_text": "Make sure to choose a value that makes sense for your dataset. Small values can create many files, which can end up negatively affecting the overall speed of the mapping process."
+                },
                 "mapping_tool": {
                     "type": "string",
                     "default": "bowtie2",
@@ -1118,9 +1131,6 @@
         {
             "$ref": "#/definitions/mapping"
         },
-        {
-            "$ref": "#/definitions/adna_damage_analysis"
-        },
         {
             "$ref": "#/definitions/bam_filtering"
         },
@@ -1131,25 +1141,25 @@
             "$ref": "#/definitions/deduplication"
         },
         {
-            "$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
+            "$ref": "#/definitions/damage_manipulation"
         },
         {
-            "$ref": "#/definitions/mapping_statistics"
+            "$ref": "#/definitions/genotyping"
         },
         {
-            "$ref": "#/definitions/damage_manipulation"
+            "$ref": "#/definitions/mitochondrial_to_nuclear_ratio"
         },
         {
-            "$ref": "#/definitions/genotyping"
+            "$ref": "#/definitions/mapping_statistics"
         },
         {
             "$ref": "#/definitions/adna_damage_analysis"
         },
         {
-            "$ref": "#/definitions/contamination_estimation"
+            "$ref": "#/definitions/host_removal"
         },
         {
-            "$ref": "#/definitions/host_removal"
+            "$ref": "#/definitions/contamination_estimation"
         },
         {
             "$ref": "#/definitions/feature_annotation_statistics"
diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf
index dc38315ab..734e3ab5d 100644
--- a/subworkflows/local/map.nf
+++ b/subworkflows/local/map.nf
@@ -2,6 +2,7 @@
 // Prepare reference indexing for downstream
 //
 
+include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main'
 include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main'
 include { BWA_MEM } from '../../modules/nf-core/bwa/mem/main'
 include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main'
@@ -19,6 +20,50 @@ workflow MAP {
     ch_versions = Channel.empty()
     ch_multiqc_files = Channel.empty()
 
+    // Optionally split every input FASTQ into shards (SEQKIT_SPLIT2) before pairing reads with the reference index
+    if ( params.run_fastq_sharding ) {
+
+        ch_input_for_sharding = reads
+
+        SEQKIT_SPLIT2( ch_input_for_sharding )
+        ch_versions = ch_versions.mix ( SEQKIT_SPLIT2.out.versions.first() )
+
+        // Tag each shard file with the 'part_NNN' token of its file name, then regroup files that share the same meta
+        sharded_reads = SEQKIT_SPLIT2.out.reads
+            .transpose()
+            .map {
+                meta, reads ->
+                    def new_meta = meta.clone()
+                    new_meta.shard_number = reads.getName().replaceAll(/.*(part_\d+)\.(?:fastq|fq)\.gz/, '$1')
+                    [ new_meta, reads ]
+            }
+            .groupTuple()
+
+        // Combine each read set with each reference index and record the reference ID in the read meta
+        ch_input_for_mapping = sharded_reads
+            .combine(index)
+            .multiMap {
+                meta, reads, meta2, index ->
+                    def new_meta = meta.clone()
+                    new_meta.reference = meta2.id
+                    reads: [ new_meta, reads ]
+                    index: [ meta2, index ]
+            }
+
+    } else {
+
+        ch_input_for_mapping = reads
+            .combine(index)
+            .multiMap {
+                meta, reads, meta2, index ->
+                    def new_meta = meta.clone()
+                    new_meta.reference = meta2.id
+                    reads: [ new_meta, reads ]
+                    index: [ meta2, index ]
+            }
+
+    }
+
     if ( params.mapping_tool == 'bwaaln' ) {
         ch_index_for_mapping = index
         ch_reads_for_mapping = reads
@@ -76,8 +121,7 @@ workflow MAP {
     ch_input_for_lane_merge = ch_mapped_lane_bam
         .map {
             meta, bam ->
-                new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry'] }
-
+                def new_meta = meta.clone().findAll{ it.key !in ['lane', 'colour_chemistry', 'shard_number'] }
                 [ new_meta, bam ]
         }
         .groupTuple()