From d19f77aa275ae87cb8509d3d5b1c0f0eff8442c8 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Fri, 20 Sep 2024 12:43:28 -0400 Subject: [PATCH 01/28] updating GRCh38 references --- conf/igenomes.config | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/conf/igenomes.config b/conf/igenomes.config index c618acc..ec6d5b0 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -34,11 +34,14 @@ params { ensembl_version = 75 } 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - refflat = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes.gencode/refFlat.txt.gz" - starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz" - cdna = "https://ftp.ensembl.org/pub/release-86/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" + ensembl_version = 88 + fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" + gtf = "https://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.chr.gtf.gz" + //forte will generate refflat from gtf + refflat = null + starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz" + cdna = "https://ftp.ensembl.org/pub/release-88/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" + metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh38/blocklist_breakpoints.hg38.bedpe.gz" } 'smallGRCh37' { fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta" @@ -48,7 +51,6 @@ params { cdna = 
"http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz" metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe" ensembl_version = 75 - } /* 'hg38' { From f8069a297344bcf0451c1f4bdcf4d1a21be8ea7b Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 23 Sep 2024 12:51:27 -0400 Subject: [PATCH 02/28] fix gtf channel for GRCh38 mode --- subworkflows/local/align_reads.nf | 2 +- subworkflows/local/fusion.nf | 4 ++-- subworkflows/local/prepare_references.nf | 14 +++++++------- subworkflows/local/quantification.nf | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/align_reads.nf b/subworkflows/local/align_reads.nf index 8e5c410..8f50d57 100644 --- a/subworkflows/local/align_reads.nf +++ b/subworkflows/local/align_reads.nf @@ -19,7 +19,7 @@ workflow ALIGN_READS { STAR_ALIGN( reads, star_index, - gtf, + gtf.map{it[1]}, false, [], [] diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf index 0539428..6f8578c 100644 --- a/subworkflows/local/fusion.nf +++ b/subworkflows/local/fusion.nf @@ -41,7 +41,7 @@ workflow FUSION { STAR_FOR_ARRIBA( reads, star_index, - gtf, + gtf.map{it[1]}, false, [], [] @@ -51,7 +51,7 @@ workflow FUSION { ARRIBA( STAR_FOR_ARRIBA.out.bam, fasta, - gtf, + gtf.map{it[1]}, arriba_blacklist, arriba_known_fusions, [], diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index b1d06a8..a4abdd8 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -25,10 +25,10 @@ workflow PREPARE_REFERENCES { ch_versions = Channel.empty() if (params.gtf.endsWith(".gz")){ - GUNZIP_GTF([[:],params.gtf]) - gtf = GUNZIP_GTF.out.gunzip.map{ it[1] }.first() + GUNZIP_GTF([[id:params.genome],params.gtf]) + gtf = GUNZIP_GTF.out.gunzip.first() } else { - gtf = params.gtf + gtf = 
Channel.of([[id:params.genome],params.gtf]) } if (params.metafusion_blocklist.endsWith(".gz")){ @@ -38,11 +38,11 @@ workflow PREPARE_REFERENCES { metafusion_blocklist = params.metafusion_blocklist } - STAR_GENOMEGENERATE(params.fasta,gtf) + STAR_GENOMEGENERATE(params.fasta,gtf.map{it[1] }.first()) ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) star_index = STAR_GENOMEGENERATE.out.index - UCSC_GTFTOGENEPRED(Channel.value(gtf).map{[[id:params.genome],it]}) + UCSC_GTFTOGENEPRED(gtf) ch_versions = ch_versions.mix(UCSC_GTFTOGENEPRED.out.versions) UCSC_GENEPREDTOBED(UCSC_GTFTOGENEPRED.out.genepred) @@ -85,7 +85,7 @@ workflow PREPARE_REFERENCES { ARRIBA_DOWNLOAD() AGAT_SPADDINTRONS( - [[:],gtf], + gtf, [] ) @@ -94,7 +94,7 @@ workflow PREPARE_REFERENCES { ) METAFUSION_GENEINFO( - [[:],gtf], starfusion_ref, fusioncatcher_ref + gtf,starfusion_ref, fusioncatcher_ref ) AGFUSION_DOWNLOAD( diff --git a/subworkflows/local/quantification.nf b/subworkflows/local/quantification.nf index 6a1c543..43321b5 100644 --- a/subworkflows/local/quantification.nf +++ b/subworkflows/local/quantification.nf @@ -19,14 +19,14 @@ workflow QUANTIFICATION { HTSEQ_COUNT( bam.join(bai,by:[0]), - gtf + gtf.map{it[1]} ) ch_versions = ch_versions.mix(HTSEQ_COUNT.out.versions) FEATURECOUNTS_GENE( bam, - gtf + gtf.map{it[1]} ) ch_versions = ch_versions.mix(FEATURECOUNTS_GENE.out.versions) @@ -40,7 +40,7 @@ workflow QUANTIFICATION { COUNT_FEATURES( KALLISTO_QUANT.out.abundance, - gtf + gtf.map{it[1]} ) From 16dd74ebbb4a5a061bf783758a0067edb5917631 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Mon, 23 Sep 2024 13:28:14 -0400 Subject: [PATCH 03/28] update star/genomegenerate --- modules.json | 2 +- .../star/genomegenerate/environment.yml | 9 ++ modules/nf-core/star/genomegenerate/main.nf | 97 +++++++----- modules/nf-core/star/genomegenerate/meta.yml | 47 ++++-- .../star/genomegenerate/tests/main.nf.test | 114 ++++++++++++++ .../genomegenerate/tests/main.nf.test.snap | 148 
++++++++++++++++++ .../star/genomegenerate/tests/tags.yml | 2 + subworkflows/local/prepare_references.nf | 5 +- 8 files changed, 371 insertions(+), 53 deletions(-) create mode 100644 modules/nf-core/star/genomegenerate/environment.yml create mode 100644 modules/nf-core/star/genomegenerate/tests/main.nf.test create mode 100644 modules/nf-core/star/genomegenerate/tests/main.nf.test.snap create mode 100644 modules/nf-core/star/genomegenerate/tests/tags.yml diff --git a/modules.json b/modules.json index 2be0262..13b9239 100644 --- a/modules.json +++ b/modules.json @@ -154,7 +154,7 @@ }, "star/genomegenerate": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, "subread/featurecounts": { diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml new file mode 100644 index 0000000..1debc4c --- /dev/null +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::htslib=1.18 + - bioconda::samtools=1.18 + - bioconda::star=2.7.10a + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf index 9146248..b885571 100644 --- a/modules/nf-core/star/genomegenerate/main.nf +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -2,26 +2,27 @@ process STAR_GENOMEGENERATE { tag "$fasta" label 'process_high' - conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' : - 'quay.io/biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" input: - path fasta - path gtf + tuple val(meta), path(fasta) + tuple val(meta2), path(gtf) output: - path "star" , emit: index - path "versions.yml", emit: versions + tuple val(meta), path("star") , emit: index + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def args_list = args.tokenize() - def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def include_gtf = gtf ? 
"--sjdbGTFfile $gtf" : '' if (args_list.contains('--genomeSAindexNbases')) { """ mkdir star @@ -29,7 +30,7 @@ process STAR_GENOMEGENERATE { --runMode genomeGenerate \\ --genomeDir star/ \\ --genomeFastaFiles $fasta \\ - --sjdbGTFfile $gtf \\ + $include_gtf \\ --runThreadN $task.cpus \\ $memory \\ $args @@ -51,7 +52,7 @@ process STAR_GENOMEGENERATE { --runMode genomeGenerate \\ --genomeDir star/ \\ --genomeFastaFiles $fasta \\ - --sjdbGTFfile $gtf \\ + $include_gtf \\ --runThreadN $task.cpus \\ --genomeSAindexNbases \$NUM_BASES \\ $memory \\ @@ -67,30 +68,52 @@ process STAR_GENOMEGENERATE { } stub: - """ - mkdir star - touch star/Genome - touch star/Log.out - touch star/SA - touch star/SAindex - touch star/chrLength.txt - touch star/chrName.txt - touch star/chrNameLength.txt - touch star/chrStart.txt - touch star/exonGeTrInfo.tab - touch star/exonInfo.tab - touch star/geneInfo.tab - touch star/genomeParameters.txt - touch star/sjdbInfo.txt - touch star/sjdbList.fromGTF.out.tab - touch star/sjdbList.out.tab - touch star/transcriptInfo.tab + if (gtf) { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/exonGeTrInfo.tab + touch star/exonInfo.tab + touch star/geneInfo.tab + touch star/genomeParameters.txt + touch star/sjdbInfo.txt + touch star/sjdbList.fromGTF.out.tab + touch star/sjdbList.out.tab + touch star/transcriptInfo.tab - cat <<-END_VERSIONS > versions.yml - "${task.process}": - star: \$(STAR --version | sed -e "s/STAR_//g") - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; 
s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/genomeParameters.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } } diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml index 8181157..33c1f65 100644 --- a/modules/nf-core/star/genomegenerate/meta.yml +++ b/modules/nf-core/star/genomegenerate/meta.yml @@ -14,24 +14,43 @@ tools: manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf doi: 10.1093/bioinformatics/bts635 licence: ["MIT"] + identifier: biotools:star input: - - fasta: - type: file - description: Fasta file of the reference genome - - gtf: - type: file - description: GTF file of the reference genome - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file of the reference genome + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - gtf: + type: file + description: GTF file of the reference genome output: - index: - type: directory - description: Folder containing the star index files - pattern: "star" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - star: + type: directory + description: Folder containing the star index files + pattern: "star" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@kevinmenden" - "@drpatelh" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test b/modules/nf-core/star/genomegenerate/tests/main.nf.test new file mode 100644 index 0000000..4d619c4 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test @@ -0,0 +1,114 @@ +nextflow_process { + + name "Test Process STAR_GENOMEGENERATE" + script "../main.nf" + process "STAR_GENOMEGENERATE" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/genomegenerate" + + test("fasta_gtf") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString(), + process.out.versions) + .match() } + ) + } + } + + test("fasta") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString(), + process.out.versions + ).match() } + ) + } + } + + 
test("fasta_gtf_stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("fasta_stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap new file mode 100644 index 0000000..207f4b4 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap @@ -0,0 +1,148 @@ +{ + "fasta_gtf": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]", + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:55:35.478401" + }, + "fasta_gtf_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_fasta" + }, + [ + "Genome:md5,d41d8cd98f00b204e9800998ecf8427e", + "Log.out:md5,d41d8cd98f00b204e9800998ecf8427e", + "SA:md5,d41d8cd98f00b204e9800998ecf8427e", + "SAindex:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"chrLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrName.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrNameLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrStart.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "exonGeTrInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "exonInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "geneInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "genomeParameters.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sjdbInfo.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sjdbList.fromGTF.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "sjdbList.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "transcriptInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ], + "index": [ + [ + { + "id": "test_fasta" + }, + [ + "Genome:md5,d41d8cd98f00b204e9800998ecf8427e", + "Log.out:md5,d41d8cd98f00b204e9800998ecf8427e", + "SA:md5,d41d8cd98f00b204e9800998ecf8427e", + "SAindex:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrName.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrNameLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrStart.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "exonGeTrInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "exonInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "geneInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "genomeParameters.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sjdbInfo.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sjdbList.fromGTF.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "sjdbList.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "transcriptInfo.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:55:57.247585" + }, + "fasta_stub": { + "content": [ + { + "0": [ + [ + { + "id": 
"test_fasta" + }, + [ + "Genome:md5,d41d8cd98f00b204e9800998ecf8427e", + "Log.out:md5,d41d8cd98f00b204e9800998ecf8427e", + "SA:md5,d41d8cd98f00b204e9800998ecf8427e", + "SAindex:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrName.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrNameLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrStart.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "genomeParameters.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ], + "index": [ + [ + { + "id": "test_fasta" + }, + [ + "Genome:md5,d41d8cd98f00b204e9800998ecf8427e", + "Log.out:md5,d41d8cd98f00b204e9800998ecf8427e", + "SA:md5,d41d8cd98f00b204e9800998ecf8427e", + "SAindex:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrName.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrNameLength.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "chrStart.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "genomeParameters.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:56:07.01742" + }, + "fasta": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]", + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:55:45.48784" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/tests/tags.yml b/modules/nf-core/star/genomegenerate/tests/tags.yml new file mode 100644 index 0000000..79f619b --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/tags.yml @@ -0,0 +1,2 @@ +star/genomegenerate: + - modules/nf-core/star/genomegenerate/** diff 
--git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index a4abdd8..938652a 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -38,7 +38,10 @@ workflow PREPARE_REFERENCES { metafusion_blocklist = params.metafusion_blocklist } - STAR_GENOMEGENERATE(params.fasta,gtf.map{it[1] }.first()) + STAR_GENOMEGENERATE( + [[id:params.genome],params.fasta], + gtf + ) ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) star_index = STAR_GENOMEGENERATE.out.index From e30de977fb7fb6752f1ec393116bde1c15a8172a Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 25 Sep 2024 17:31:11 -0400 Subject: [PATCH 04/28] adjustments to make GRCh38 run through --- conf/igenomes.config | 5 +- conf/modules.config | 2 +- modules.json | 16 +- modules/nf-core/arriba/arriba/environment.yml | 5 + modules/nf-core/arriba/arriba/main.nf | 68 + modules/nf-core/arriba/arriba/meta.yml | 123 + .../gatk4/bedtointervallist/environment.yml | 5 + .../nf-core/gatk4/bedtointervallist/main.nf | 15 +- .../nf-core/gatk4/bedtointervallist/meta.yml | 60 +- .../bedtointervallist/tests/main.nf.test | 38 + .../bedtointervallist/tests/main.nf.test.snap | 35 + .../gatk4/bedtointervallist/tests/tags.yml | 2 + .../createsequencedictionary/environment.yml | 5 + .../gatk4/createsequencedictionary/main.nf | 19 +- .../gatk4/createsequencedictionary/meta.yml | 55 +- .../tests/main.nf.test | 56 + .../tests/main.nf.test.snap | 68 + .../createsequencedictionary/tests/tags.yml | 2 + .../nf-core/samtools/faidx/environment.yml | 7 + modules/nf-core/samtools/faidx/main.nf | 22 +- modules/nf-core/samtools/faidx/meta.yml | 79 +- .../nf-core/samtools/faidx/tests/main.nf.test | 122 + .../samtools/faidx/tests/main.nf.test.snap | 249 +++ .../samtools/faidx/tests/nextflow.config | 7 + .../samtools/faidx/tests/nextflow2.config | 6 + modules/nf-core/samtools/faidx/tests/tags.yml | 2 + modules/nf-core/star/align/environment.yml | 9 + 
modules/nf-core/star/align/main.nf | 14 +- modules/nf-core/star/align/meta.yml | 228 +- modules/nf-core/star/align/tests/main.nf.test | 609 +++++ .../star/align/tests/main.nf.test.snap | 1973 +++++++++++++++++ .../star/align/tests/nextflow.arriba.config | 14 + .../nf-core/star/align/tests/nextflow.config | 14 + .../align/tests/nextflow.starfusion.config | 14 + modules/nf-core/star/align/tests/tags.yml | 2 + modules/nf-core/star/genomegenerate/main.nf | 2 +- .../genomegenerate/star-genomegenerate.diff | 20 + subworkflows/local/align_reads.nf | 2 +- subworkflows/local/fillout.nf | 2 +- subworkflows/local/fusion.nf | 31 +- subworkflows/local/prepare_references.nf | 15 +- subworkflows/local/qc.nf | 8 +- workflows/forte.nf | 5 +- 43 files changed, 3854 insertions(+), 181 deletions(-) create mode 100644 modules/nf-core/arriba/arriba/environment.yml create mode 100644 modules/nf-core/arriba/arriba/main.nf create mode 100644 modules/nf-core/arriba/arriba/meta.yml create mode 100644 modules/nf-core/gatk4/bedtointervallist/environment.yml create mode 100644 modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test create mode 100644 modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test.snap create mode 100644 modules/nf-core/gatk4/bedtointervallist/tests/tags.yml create mode 100644 modules/nf-core/gatk4/createsequencedictionary/environment.yml create mode 100644 modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test create mode 100644 modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test.snap create mode 100644 modules/nf-core/gatk4/createsequencedictionary/tests/tags.yml create mode 100644 modules/nf-core/samtools/faidx/environment.yml create mode 100644 modules/nf-core/samtools/faidx/tests/main.nf.test create mode 100644 modules/nf-core/samtools/faidx/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/faidx/tests/nextflow.config create mode 100644 modules/nf-core/samtools/faidx/tests/nextflow2.config create mode 100644 
modules/nf-core/samtools/faidx/tests/tags.yml create mode 100644 modules/nf-core/star/align/environment.yml create mode 100644 modules/nf-core/star/align/tests/main.nf.test create mode 100644 modules/nf-core/star/align/tests/main.nf.test.snap create mode 100644 modules/nf-core/star/align/tests/nextflow.arriba.config create mode 100644 modules/nf-core/star/align/tests/nextflow.config create mode 100644 modules/nf-core/star/align/tests/nextflow.starfusion.config create mode 100644 modules/nf-core/star/align/tests/tags.yml create mode 100644 modules/nf-core/star/genomegenerate/star-genomegenerate.diff diff --git a/conf/igenomes.config b/conf/igenomes.config index ec6d5b0..a653ef7 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -35,8 +35,9 @@ params { } 'GRCh38' { ensembl_version = 88 - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" - gtf = "https://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.chr.gtf.gz" + //fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" + fasta = "https://ftp.ensembl.org/pub/release-88/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + gtf = "https://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.gtf.gz" //forte will generate refflat from gtf refflat = null starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz" diff --git a/conf/modules.config b/conf/modules.config index 9277358..5f15d4e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -475,7 +475,7 @@ process { ] } - withName: ARRIBA { + withName: ARRIBA_ARRIBA { ext.args = { "-s ${meta.single_end || meta.strandedness == "forward" ? "yes" : meta.strandedness == "reverse" ? 
"reverse" : "no" }" } diff --git a/modules.json b/modules.json index 13b9239..5a728db 100644 --- a/modules.json +++ b/modules.json @@ -26,6 +26,11 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "arriba/arriba": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "cat/cat": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -48,12 +53,12 @@ }, "gatk4/bedtointervallist": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, "gatk4/createsequencedictionary": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, "gunzip": { @@ -134,7 +139,7 @@ }, "samtools/faidx": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, "samtools/index": { @@ -149,13 +154,14 @@ }, "star/align": { "branch": "master", - "git_sha": "57d75dbac06812c59798a48585032f6e50bb1914", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, "star/genomegenerate": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", - "installed_by": ["modules"] + "installed_by": ["modules"], + "patch": "modules/nf-core/star/genomegenerate/star-genomegenerate.diff" }, "subread/featurecounts": { "branch": "master", diff --git a/modules/nf-core/arriba/arriba/environment.yml b/modules/nf-core/arriba/arriba/environment.yml new file mode 100644 index 0000000..d0883a0 --- /dev/null +++ b/modules/nf-core/arriba/arriba/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::arriba=2.4.0 diff --git a/modules/nf-core/arriba/arriba/main.nf 
b/modules/nf-core/arriba/arriba/main.nf new file mode 100644 index 0000000..761d0bf --- /dev/null +++ b/modules/nf-core/arriba/arriba/main.nf @@ -0,0 +1,68 @@ +process ARRIBA_ARRIBA { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/arriba:2.4.0--h0033a41_2' : + 'biocontainers/arriba:2.4.0--h0033a41_2' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + tuple val(meta3), path(gtf) + tuple val(meta4), path(blacklist) + tuple val(meta5), path(known_fusions) + tuple val(meta6), path(structural_variants) + tuple val(meta7), path(tags) + tuple val(meta8), path(protein_domains) + + output: + tuple val(meta), path("*.fusions.tsv") , emit: fusions + tuple val(meta), path("*.fusions.discarded.tsv"), emit: fusions_fail + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def blacklist = blacklist ? "-b $blacklist" : "-f blacklist" + def known_fusions = known_fusions ? "-k $known_fusions" : "" + def structural_variants = structural_variants ? "-d $structural_variants" : "" + def tags = tags ? "-t $tags" : "" + def protein_domains = protein_domains ? 
"-p $protein_domains" : "" + + """ + arriba \\ + -x $bam \\ + -a $fasta \\ + -g $gtf \\ + -o ${prefix}.fusions.tsv \\ + -O ${prefix}.fusions.discarded.tsv \\ + $blacklist \\ + $known_fusions \\ + $structural_variants \\ + $tags \\ + $protein_domains \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + arriba: \$(arriba -h | grep 'Version:' 2>&1 | sed 's/Version:\s//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo stub > ${prefix}.fusions.tsv + echo stub > ${prefix}.fusions.discarded.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + arriba: \$(arriba -h | grep 'Version:' 2>&1 | sed 's/Version:\s//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/arriba/arriba/meta.yml b/modules/nf-core/arriba/arriba/meta.yml new file mode 100644 index 0000000..f230dda --- /dev/null +++ b/modules/nf-core/arriba/arriba/meta.yml @@ -0,0 +1,123 @@ +name: arriba_arriba +description: Arriba is a command-line tool for the detection of gene fusions from + RNA-Seq data. +keywords: + - fusion + - arriba + - detection + - RNA-Seq +tools: + - arriba: + description: Fast and accurate gene fusion detection from RNA-Seq data + homepage: https://github.com/suhrig/arriba + documentation: https://arriba.readthedocs.io/en/latest/ + tool_dev_url: https://github.com/suhrig/arriba + doi: "10.1101/gr.257246.119" + licence: ["MIT"] + identifier: biotools:Arriba +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Assembly FASTA file + pattern: "*.{fasta}" + - - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - gtf: + type: file + description: Annotation GTF file + pattern: "*.{gtf}" + - - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - blacklist: + type: file + description: Blacklist file + pattern: "*.{tsv}" + - - meta5: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - known_fusions: + type: file + description: Known fusions file + pattern: "*.{tsv}" + - - meta6: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - structural_variants: + type: file + description: Structural variants file + pattern: "*.{tsv}" + - - meta7: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - tags: + type: file + description: Tags file + pattern: "*.{tsv}" + - - meta8: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - protein_domains: + type: file + description: Protein domains file + pattern: "*.{gff3}" +output: + - fusions: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fusions.tsv": + type: file + description: File contains fusions which pass all of Arriba's filters. + pattern: "*.{fusions.tsv}" + - fusions_fail: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fusions.discarded.tsv": + type: file + description: File contains fusions that Arriba classified as an artifact or + that are also observed in healthy tissue. 
+ pattern: "*.{fusions.discarded.tsv}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@praveenraj2018" + - "@rannick" +maintainers: + - "@praveenraj2018" + - "@rannick" diff --git a/modules/nf-core/gatk4/bedtointervallist/environment.yml b/modules/nf-core/gatk4/bedtointervallist/environment.yml new file mode 100644 index 0000000..55993f4 --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::gatk4=4.5.0.0 diff --git a/modules/nf-core/gatk4/bedtointervallist/main.nf b/modules/nf-core/gatk4/bedtointervallist/main.nf index 41fab00..68863d6 100644 --- a/modules/nf-core/gatk4/bedtointervallist/main.nf +++ b/modules/nf-core/gatk4/bedtointervallist/main.nf @@ -2,14 +2,14 @@ process GATK4_BEDTOINTERVALLIST { tag "$meta.id" label 'process_medium' - conda "bioconda::gatk4=4.3.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.3.0.0--py36hdfd78af_0': - 'quay.io/biocontainers/gatk4:4.3.0.0--py36hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gatk4:4.5.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.5.0.0--py36hdfd78af_0' }" input: tuple val(meta), path(bed) - path dict + tuple val(meta2), path(dict) output: tuple val(meta), path('*.interval_list'), emit: interval_list @@ -22,14 +22,15 @@ process GATK4_BEDTOINTERVALLIST { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def avail_mem = 3 + def avail_mem = 3072 if (!task.memory) { log.info '[GATK BedToIntervalList] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' 
} else { - avail_mem = task.memory.giga + avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}g" BedToIntervalList \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + BedToIntervalList \\ --INPUT $bed \\ --OUTPUT ${prefix}.interval_list \\ --SEQUENCE_DICTIONARY $dict \\ diff --git a/modules/nf-core/gatk4/bedtointervallist/meta.yml b/modules/nf-core/gatk4/bedtointervallist/meta.yml index 986f159..25348e1 100644 --- a/modules/nf-core/gatk4/bedtointervallist/meta.yml +++ b/modules/nf-core/gatk4/bedtointervallist/meta.yml @@ -2,6 +2,8 @@ name: gatk4_bedtointervallist description: Creates an interval list from a bed file and a reference dict keywords: - bed + - bedtointervallist + - gatk4 - interval list tools: - gatk4: @@ -13,28 +15,48 @@ tools: documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s doi: 10.1158/1538-7445.AM2017-3590 licence: ["Apache-2.0"] + identifier: "" input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test'] - - bed: - type: file - description: Input bed file - pattern: "*.bed" - - dict: - type: file - description: Sequence dictionary - pattern: "*.dict" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - bed: + type: file + description: Input bed file + pattern: "*.bed" + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - dict: + type: file + description: Sequence dictionary + pattern: "*.dict" output: - interval_list: - type: file - description: gatk interval list file - pattern: "*.interval_list" + - meta: + type: file + description: gatk interval list file + pattern: "*.interval_list" + - "*.interval_list": + type: file + description: gatk interval list file + pattern: "*.interval_list" + - _list: + type: file + description: gatk interval list file + pattern: "*.interval_list" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@kevinmenden" + - "@ramprasadn" +maintainers: + - "@kevinmenden" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test b/modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test new file mode 100644 index 0000000..2289f73 --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process GATK4_BEDTOINTERVALLIST" + script "../main.nf" + process "GATK4_BEDTOINTERVALLIST" + + tag "modules" + tag "modules_nfcore" + tag "gatk4" + tag "gatk4/bedtointervallist" + + test("test_gatk4_bedtointervallist") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + [file(params.modules_testdata_base_path + + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) ] + ] + input[1] = [ [ id:'dict' ], // meta map + [file(params.modules_testdata_base_path + + 'genomics/sarscov2/genome/genome.dict', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test.snap b/modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test.snap new file mode 100644 index 
0000000..48c322f --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/tests/main.nf.test.snap @@ -0,0 +1,35 @@ +{ + "test_gatk4_bedtointervallist": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.interval_list:md5,e51101c9357fb2d59fd30e370eefa39c" + ] + ], + "1": [ + "versions.yml:md5,29a18c36f27584eb5a5f2f5457088b3b" + ], + "interval_list": [ + [ + { + "id": "test" + }, + "test.interval_list:md5,e51101c9357fb2d59fd30e370eefa39c" + ] + ], + "versions": [ + "versions.yml:md5,29a18c36f27584eb5a5f2f5457088b3b" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T14:20:12.168775" + } +} \ No newline at end of file diff --git a/modules/nf-core/gatk4/bedtointervallist/tests/tags.yml b/modules/nf-core/gatk4/bedtointervallist/tests/tags.yml new file mode 100644 index 0000000..b4d54f1 --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/tests/tags.yml @@ -0,0 +1,2 @@ +gatk4/bedtointervallist: + - "modules/nf-core/gatk4/bedtointervallist/**" diff --git a/modules/nf-core/gatk4/createsequencedictionary/environment.yml b/modules/nf-core/gatk4/createsequencedictionary/environment.yml new file mode 100644 index 0000000..55993f4 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::gatk4=4.5.0.0 diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf index bc324ad..c7f1d75 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/main.nf +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ -2,17 +2,17 @@ process GATK4_CREATESEQUENCEDICTIONARY { tag "$fasta" label 'process_medium' - conda "bioconda::gatk4=4.3.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/gatk4:4.3.0.0--py36hdfd78af_0': - 'quay.io/biocontainers/gatk4:4.3.0.0--py36hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gatk4:4.5.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.5.0.0--py36hdfd78af_0' }" input: - path fasta + tuple val(meta), path(fasta) output: - path "*.dict" , emit: dict - path "versions.yml" , emit: versions + tuple val(meta), path('*.dict') , emit: dict + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -20,14 +20,15 @@ process GATK4_CREATESEQUENCEDICTIONARY { script: def args = task.ext.args ?: '' - def avail_mem = 6 + def avail_mem = 6144 if (!task.memory) { log.info '[GATK CreateSequenceDictionary] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' } else { - avail_mem = task.memory.giga + avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}g" CreateSequenceDictionary \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + CreateSequenceDictionary \\ --REFERENCE $fasta \\ --URI $fasta \\ --TMP_DIR . \\ diff --git a/modules/nf-core/gatk4/createsequencedictionary/meta.yml b/modules/nf-core/gatk4/createsequencedictionary/meta.yml index bd24788..7b5156b 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/meta.yml +++ b/modules/nf-core/gatk4/createsequencedictionary/meta.yml @@ -1,32 +1,49 @@ name: gatk4_createsequencedictionary description: Creates a sequence dictionary for a reference sequence keywords: + - createsequencedictionary - dictionary - fasta + - gatk4 tools: - gatk: - description: | - Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools - with a primary focus on variant discovery and genotyping. Its powerful processing engine - and high-performance computing features make it capable of taking on projects of any size. 
- homepage: https://gatk.broadinstitute.org/hc/en-us - documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s - doi: 10.1158/1538-7445.AM2017-3590 - licence: ["Apache-2.0"] - + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + identifier: "" input: - - fasta: - type: file - description: Input fasta file - pattern: "*.{fasta,fa}" + - - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Input fasta file + pattern: "*.{fasta,fa}" output: - dict: - type: file - description: gatk dictionary file - pattern: "*.{dict}" + - meta: + type: file + description: gatk dictionary file + pattern: "*.{dict}" + - "*.dict": + type: file + description: gatk dictionary file + pattern: "*.{dict}" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@maxulysse" + - "@ramprasadn" +maintainers: + - "@maxulysse" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test new file mode 100644 index 0000000..a8a9c6d --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process GATK4_CREATESEQUENCEDICTIONARY" + script "../main.nf" + process "GATK4_CREATESEQUENCEDICTIONARY" + + tag 
"modules" + tag "modules_nfcore" + tag "gatk4" + tag "gatk4/createsequencedictionary" + + test("sarscov2 - fasta") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test.snap b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test.snap new file mode 100644 index 0000000..16735f9 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "sarscov2 - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.dict:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,e60dd34a71fc2029d81dc67ccb5d6be6" + ], + "dict": [ + [ + { + "id": "test" + }, + "genome.dict:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e60dd34a71fc2029d81dc67ccb5d6be6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-16T10:16:16.34453" + }, + "sarscov2 - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.dict:md5,7362679f176e0f52add03c08f457f646" + ] + ], + "1": [ + "versions.yml:md5,e60dd34a71fc2029d81dc67ccb5d6be6" + ], + "dict": [ + [ + { + "id": "test" + }, + "genome.dict:md5,7362679f176e0f52add03c08f457f646" + ] + ], + "versions": [ + "versions.yml:md5,e60dd34a71fc2029d81dc67ccb5d6be6" + 
] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-16T13:58:25.822068" + } +} \ No newline at end of file diff --git a/modules/nf-core/gatk4/createsequencedictionary/tests/tags.yml b/modules/nf-core/gatk4/createsequencedictionary/tests/tags.yml new file mode 100644 index 0000000..035c5e4 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/tests/tags.yml @@ -0,0 +1,2 @@ +gatk4/createsequencedictionary: + - "modules/nf-core/gatk4/createsequencedictionary/**" diff --git a/modules/nf-core/samtools/faidx/environment.yml b/modules/nf-core/samtools/faidx/environment.yml new file mode 100644 index 0000000..2bcd47e --- /dev/null +++ b/modules/nf-core/samtools/faidx/environment.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::htslib=1.21 + - bioconda::samtools=1.21 diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index 4dd0e5b..28c0a81 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,18 +2,20 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.21--h50ea8bc_0' : + 'biocontainers/samtools:1.21--h50ea8bc_0' }" input: tuple val(meta), path(fasta) + tuple val(meta2), path(fai) output: - tuple val(meta), path ("*.fai"), emit: fai - tuple val(meta), path ("*.gzi"), emit: gzi, optional: true - path "versions.yml" , emit: versions + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -23,8 +25,8 @@ process SAMTOOLS_FAIDX { """ samtools \\ faidx \\ - $args \\ - $fasta + $fasta \\ + $args cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -33,8 +35,12 @@ process SAMTOOLS_FAIDX { """ stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' """ + ${fastacmd} touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index fe2fe9a..6721b2c 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -3,6 +3,7 @@ description: Index FASTA file keywords: - index - fasta + - faidx tools: - samtools: description: | @@ -13,35 +14,67 @@ tools: documentation: http://www.htslib.org/doc/samtools.html doi: 10.1093/bioinformatics/btp352 licence: ["MIT"] + identifier: biotools:samtools input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fasta: - type: file - description: FASTA file - pattern: "*.{fa,fasta}" + - - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] + - fa: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{fa,fasta}": + type: file + description: FASTA file + pattern: "*.{fa}" - fai: - type: file - description: FASTA index file - pattern: "*.{fai}" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fai": + type: file + description: FASTA index file + pattern: "*.{fai}" - gzi: - type: file - description: Optional gzip index file for compressed inputs - pattern: "*.gzi" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.gzi": + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@drpatelh" - "@ewels" - "@phue" +maintainers: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/faidx/tests/main.nf.test b/modules/nf-core/samtools/faidx/tests/main.nf.test new file mode 100644 index 0000000..17244ef --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/main.nf.test @@ -0,0 +1,122 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FAIDX" + script "../main.nf" + process "SAMTOOLS_FAIDX" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/faidx" + + test("test_samtools_faidx") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_bgzip") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true)] + + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_fasta") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [ [ id:'test', single_end:false ], // meta map + 
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_stub_fasta") { + + config "./nextflow2.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_stub_fai") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/faidx/tests/main.nf.test.snap b/modules/nf-core/samtools/faidx/tests/main.nf.test.snap new file mode 100644 index 0000000..1bbb3ec --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/main.nf.test.snap @@ -0,0 +1,249 @@ +{ + "test_samtools_faidx": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ], + "fa": [ + + ], + "fai": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ] + } + 
], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T07:57:47.450887871" + }, + "test_samtools_faidx_bgzip": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474" + ] + ], + "3": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ], + "fa": [ + + ], + "fai": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474" + ] + ], + "versions": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T07:58:04.804905659" + }, + "test_samtools_faidx_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "extract.fa:md5,6a0774a0ad937ba0bfd2ac7457d90f36" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + "extract.fa:md5,6a0774a0ad937ba0bfd2ac7457d90f36" + ] + ], + "fai": [ + + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T07:58:23.831268154" + }, + "test_samtools_faidx_stub_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "extract.fa:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + 
"extract.fa:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "fai": [ + + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T07:58:35.600243706" + }, + "test_samtools_faidx_stub_fai": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ], + "fa": [ + + ], + "fai": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,6bbe80a2e14bd61202ca63e12d66027f" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T07:58:54.705460167" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/faidx/tests/nextflow.config b/modules/nf-core/samtools/faidx/tests/nextflow.config new file mode 100644 index 0000000..f76a3ba --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_FAIDX { + ext.args = 'MT192765.1 -o extract.fa' + } + +} diff --git a/modules/nf-core/samtools/faidx/tests/nextflow2.config b/modules/nf-core/samtools/faidx/tests/nextflow2.config new file mode 100644 index 0000000..33ebbd5 --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/nextflow2.config @@ -0,0 +1,6 @@ +process { + + withName: SAMTOOLS_FAIDX { + ext.args = '-o extract.fa' + } +} diff --git a/modules/nf-core/samtools/faidx/tests/tags.yml b/modules/nf-core/samtools/faidx/tests/tags.yml new file mode 100644 index 0000000..e4a8394 --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/faidx: + - modules/nf-core/samtools/faidx/** diff --git a/modules/nf-core/star/align/environment.yml 
b/modules/nf-core/star/align/environment.yml new file mode 100644 index 0000000..1debc4c --- /dev/null +++ b/modules/nf-core/star/align/environment.yml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::htslib=1.18 + - bioconda::samtools=1.18 + - bioconda::star=2.7.10a + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/align/main.nf b/modules/nf-core/star/align/main.nf index 8cb8e9a..ae67e00 100644 --- a/modules/nf-core/star/align/main.nf +++ b/modules/nf-core/star/align/main.nf @@ -2,15 +2,15 @@ process STAR_ALIGN { tag "$meta.id" label 'process_high' - conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' : - 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" input: tuple val(meta), path(reads, stageAs: "input*/*") - path index - path gtf + tuple val(meta2), path(index) + tuple val(meta3), path(gtf) val star_ignore_sjdbgtf val seq_platform val seq_center @@ -81,6 +81,8 @@ process STAR_ALIGN { stub: def prefix = task.ext.prefix ?: "${meta.id}" """ + echo "" | gzip > ${prefix}.unmapped_1.fastq.gz + echo "" | gzip > ${prefix}.unmapped_2.fastq.gz touch ${prefix}Xd.out.bam touch ${prefix}.Log.final.out touch ${prefix}.Log.out @@ -89,8 +91,6 @@ process STAR_ALIGN { touch ${prefix}.toTranscriptome.out.bam touch ${prefix}.Aligned.unsort.out.bam touch ${prefix}.Aligned.sortedByCoord.out.bam - touch 
${prefix}.unmapped_1.fastq.gz - touch ${prefix}.unmapped_2.fastq.gz touch ${prefix}.tab touch ${prefix}.SJ.out.tab touch ${prefix}.ReadsPerGene.out.tab diff --git a/modules/nf-core/star/align/meta.yml b/modules/nf-core/star/align/meta.yml index bce16d3..d30556b 100644 --- a/modules/nf-core/star/align/meta.yml +++ b/modules/nf-core/star/align/meta.yml @@ -14,76 +14,194 @@ tools: manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf doi: 10.1093/bioinformatics/bts635 licence: ["MIT"] + identifier: biotools:star input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. - - index: - type: directory - description: STAR genome index - pattern: "star" + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - index: + type: directory + description: STAR genome index + pattern: "star" + - - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - gtf: + type: file + description: Annotation GTF file + pattern: "*.{gtf}" + - - star_ignore_sjdbgtf: + type: boolean + description: Ignore annotation GTF file + - - seq_platform: + type: string + description: Sequencing platform + - - seq_center: + type: string + description: Sequencing center output: - - bam: - type: file - description: Output BAM file containing read alignments - pattern: "*.{bam}" - log_final: - type: file - description: STAR final log file - pattern: "*Log.final.out" + - meta: + type: file + description: STAR final log file + pattern: "*Log.final.out" + - "*Log.final.out": + type: file + description: STAR final log file + pattern: "*Log.final.out" - log_out: - type: file - description: STAR lot out file - pattern: "*Log.out" + - meta: + type: file + description: STAR lot out file + pattern: "*Log.out" + - "*Log.out": + type: file + description: STAR lot out file + pattern: "*Log.out" - log_progress: - type: file - description: STAR log progress file - pattern: "*Log.progress.out" + - meta: + type: file + description: STAR log progress file + pattern: "*Log.progress.out" + - "*Log.progress.out": + type: file + description: STAR log progress file + pattern: "*Log.progress.out" - versions: - type: file - description: File containing software versions - pattern: "versions.yml" + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + - meta: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - "*d.out.bam": + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" - bam_sorted: - type: file - description: Sorted BAM file of read alignments (optional) - pattern: "*sortedByCoord.out.bam" + - meta: + type: file + description: Sorted BAM file of read alignments (optional) + pattern: "*sortedByCoord.out.bam" + - "*sortedByCoord.out.bam": + type: file + description: Sorted BAM file of read alignments 
(optional) + pattern: "*sortedByCoord.out.bam" - bam_transcript: - type: file - description: Output BAM file of transcriptome alignment (optional) - pattern: "*toTranscriptome.out.bam" + - meta: + type: file + description: Output BAM file of transcriptome alignment (optional) + pattern: "*toTranscriptome.out.bam" + - "*toTranscriptome.out.bam": + type: file + description: Output BAM file of transcriptome alignment (optional) + pattern: "*toTranscriptome.out.bam" - bam_unsorted: - type: file - description: Unsorted BAM file of read alignments (optional) - pattern: "*Aligned.unsort.out.bam" + - meta: + type: file + description: Unsorted BAM file of read alignments (optional) + pattern: "*Aligned.unsort.out.bam" + - "*Aligned.unsort.out.bam": + type: file + description: Unsorted BAM file of read alignments (optional) + pattern: "*Aligned.unsort.out.bam" - fastq: - type: file - description: Unmapped FastQ files (optional) - pattern: "*fastq.gz" + - meta: + type: file + description: Unmapped FastQ files (optional) + pattern: "*fastq.gz" + - "*fastq.gz": + type: file + description: Unmapped FastQ files (optional) + pattern: "*fastq.gz" - tab: - type: file - description: STAR output tab file(s) (optional) - pattern: "*.tab" + - meta: + type: file + description: STAR output tab file(s) (optional) + pattern: "*.tab" + - "*.tab": + type: file + description: STAR output tab file(s) (optional) + pattern: "*.tab" + - spl_junc_tab: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.SJ.out.tab": + type: file + description: STAR output splice junction tab file + pattern: "*.SJ.out.tab" + - read_per_gene_tab: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.ReadsPerGene.out.tab": + type: file + description: STAR output read per gene tab file + pattern: "*.ReadsPerGene.out.tab" - junction: - type: file - description: STAR chimeric junction output file (optional) - pattern: "*.out.junction" + - meta: + type: file + description: STAR chimeric junction output file (optional) + pattern: "*.out.junction" + - "*.out.junction": + type: file + description: STAR chimeric junction output file (optional) + pattern: "*.out.junction" + - sam: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.out.sam": + type: file + description: STAR output SAM file + pattern: "*.out.sam" - wig: - type: file - description: STAR output wiggle format file(s) (optional) - pattern: "*.wig" + - meta: + type: file + description: STAR output wiggle format file(s) (optional) + pattern: "*.wig" + - "*.wig": + type: file + description: STAR output wiggle format file(s) (optional) + pattern: "*.wig" - bedgraph: - type: file - description: STAR output bedGraph format file(s) (optional) - pattern: "*.bg" - + - meta: + type: file + description: STAR output bedGraph format file(s) (optional) + pattern: "*.bg" + - "*.bg": + type: file + description: STAR output bedGraph format file(s) (optional) + pattern: "*.bg" authors: - "@kevinmenden" - "@drpatelh" - "@praveenraj2018" +maintainers: + - "@kevinmenden" + - "@drpatelh" + - "@praveenraj2018" diff --git a/modules/nf-core/star/align/tests/main.nf.test b/modules/nf-core/star/align/tests/main.nf.test new file mode 100644 index 0000000..2d9f72d --- /dev/null +++ b/modules/nf-core/star/align/tests/main.nf.test @@ -0,0 +1,609 @@ +nextflow_process { + + name "Test Process STAR_ALIGN" + script "../main.nf" + process "STAR_ALIGN" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/align" + tag "star/genomegenerate" + + test("homo_sapiens - single_end") { + config "./nextflow.config" + + setup { + 
run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.log_final[0][1]).name, + file(process.out.log_out[0][1]).name, + file(process.out.log_progress[0][1]).name, + process.out.bam, + process.out.bam_sorted, + process.out.bam_transcript, + process.out.bam_unsorted, + process.out.bedgraph, + process.out.fastq, + process.out.junction, + process.out.read_per_gene_tab, + process.out.sam, + process.out.spl_junc_tab, + process.out.tab, + process.out.wig, + process.out.versions + ).match() } + ) + } + } + + test("homo_sapiens - paired_end") { + config "./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: 
true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.log_final[0][1]).name, + file(process.out.log_out[0][1]).name, + file(process.out.log_progress[0][1]).name, + process.out.bam, + process.out.bam_sorted, + process.out.bam_transcript, + process.out.bam_unsorted, + process.out.bedgraph, + process.out.fastq, + process.out.junction, + process.out.read_per_gene_tab, + process.out.sam, + process.out.spl_junc_tab, + process.out.tab, + process.out.wig, + process.out.versions + ).match() } + ) + } + } + + test("homo_sapiens - paired_end - arriba") { + config "./nextflow.arriba.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), 
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.log_final[0][1]).name, + file(process.out.log_out[0][1]).name, + file(process.out.log_progress[0][1]).name, + process.out.bam, + process.out.bam_sorted, + process.out.bam_transcript, + process.out.bam_unsorted, + process.out.bedgraph, + process.out.fastq, + process.out.junction, + process.out.read_per_gene_tab, + process.out.sam, + process.out.spl_junc_tab, + process.out.tab, + process.out.wig, + process.out.versions + ).match() } + ) + } + } + + test("homo_sapiens - paired_end - starfusion") { + config "./nextflow.starfusion.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ 
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.log_final[0][1]).name, + file(process.out.log_out[0][1]).name, + file(process.out.log_progress[0][1]).name, + process.out.bam, + process.out.bam_sorted, + process.out.bam_transcript, + process.out.bam_unsorted, + process.out.bedgraph, + process.out.fastq, + process.out.junction, + process.out.read_per_gene_tab, + process.out.sam, + process.out.spl_junc_tab, + process.out.tab, + process.out.wig, + process.out.versions + ).match() } + ) + } + } + + test("homo_sapiens - paired_end - multiple") { + config "./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ 
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.log_final[0][1]).name, + file(process.out.log_out[0][1]).name, + file(process.out.log_progress[0][1]).name, + process.out.bam, + process.out.bam_sorted, + process.out.bam_transcript, + process.out.bam_unsorted, + process.out.bedgraph, + process.out.fastq, + process.out.junction, + process.out.read_per_gene_tab, + process.out.sam, + process.out.spl_junc_tab, + process.out.tab, + process.out.wig, + process.out.versions + ).match() } + ) + } + } + + test("homo_sapiens - single_end - stub") { + options "-stub" + config "./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("homo_sapiens - paired_end - stub") { + options "-stub" + config 
"./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("homo_sapiens - paired_end - arriba - stub") { + options "-stub" + config "./nextflow.arriba.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("homo_sapiens - paired_end - starfusion - stub") { + options "-stub" + config "./nextflow.starfusion.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("homo_sapiens - paired_end - multiple - 
stub") { + options "-stub" + config "./nextflow.config" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/star/align/tests/main.nf.test.snap b/modules/nf-core/star/align/tests/main.nf.test.snap new file mode 100644 index 0000000..c814eb5 --- /dev/null +++ b/modules/nf-core/star/align/tests/main.nf.test.snap @@ -0,0 +1,1973 @@ +{ + "homo_sapiens - single_end - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + 
], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + [ + { + "id": "test", + "single_end": true + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "12": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "13": [ + [ + { + "id": "test", + "single_end": true + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "14": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "15": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "4": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": true + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test", + "single_end": true + }, + [ + 
"test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "9": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_sorted": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_transcript": [ + [ + { + "id": "test", + "single_end": true + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam_unsorted": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bedgraph": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastq": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "junction": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_final": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_progress": [ 
+ [ + { + "id": "test", + "single_end": true + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "read_per_gene_tab": [ + [ + { + "id": "test", + "single_end": true + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "sam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "spl_junc_tab": [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tab": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "wig": [ + [ + { + "id": "test", + "single_end": true + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:16:04.712114" + }, + "homo_sapiens - paired_end - arriba - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "12": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "13": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "14": [ + [ + { + 
"id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "15": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "9": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + 
], + "bam_sorted": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_transcript": [ + [ + { + "id": "test", + "single_end": false + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam_unsorted": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bedgraph": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastq": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "junction": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_final": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_out": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_progress": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "read_per_gene_tab": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "sam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "spl_junc_tab": [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tab": [ + [ + { + "id": "test", + "single_end": false + }, + [ + 
"test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "wig": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:16:28.874293" + }, + "homo_sapiens - single_end": { + "content": [ + "test.Log.final.out", + "test.Log.out", + "test.Log.progress.out", + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.sortedByCoord.out.bam:md5,c6cfaccaf91bc7fdabed3cfe236d4535" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.sortedByCoord.out.bam:md5,c6cfaccaf91bc7fdabed3cfe236d4535" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.Signal.Unique.str1.out.bg:md5,c56fc1472776fb927eaf62d973da5f9a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,e93373cf6f2a2a9506e2efdb260cdd4f" + ] + ] + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,75a516ab950fb958f40b29996474949c" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,75a516ab950fb958f40b29996474949c" + ] + ], + [ + + ], + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T18:02:34.35338" + }, + "homo_sapiens - paired_end": { + "content": [ + "test.Log.final.out", + "test.Log.out", + "test.Log.progress.out", + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,b9ee1c607e07323bc1652ef3babb543f" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.Aligned.sortedByCoord.out.bam:md5,b9ee1c607e07323bc1652ef3babb543f" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Signal.Unique.str1.out.bg:md5,d7bf8b70b436ca048a62513e1d0ece3a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,686d58493b9eb445b56ace4d67f76ef6" + ] + ] + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,844af19ab0fc8cd9a3f75228445aca0d" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,844af19ab0fc8cd9a3f75228445aca0d" + ] + ], + [ + + ], + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T18:03:16.701923" + }, + "homo_sapiens - paired_end - multiple - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "12": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "13": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "14": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "15": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + 
"single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "9": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_sorted": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_transcript": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam_unsorted": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bedgraph": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastq": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "junction": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_final": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_out": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_progress": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "read_per_gene_tab": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "sam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "spl_junc_tab": [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tab": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "wig": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:16:51.360287" + }, + "homo_sapiens - paired_end - multiple": { + "content": [ + "test.Log.final.out", + "test.Log.out", + "test.Log.progress.out", + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,ab07c21d63ab0a6c07d171d213c81d5a" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,ab07c21d63ab0a6c07d171d213c81d5a" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Signal.Unique.str1.out.bg:md5,d7bf8b70b436ca048a62513e1d0ece3a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,686d58493b9eb445b56ace4d67f76ef6" + ] + ] + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,069877e053714e23010fe4e1c003b4a2" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,069877e053714e23010fe4e1c003b4a2" + ] + ], + [ + + ], + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T13:13:28.987438" + }, + "homo_sapiens - paired_end - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "12": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "13": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "14": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "15": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "9": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + 
}, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_sorted": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_transcript": [ + [ + { + "id": "test", + "single_end": false + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam_unsorted": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bedgraph": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastq": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "junction": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_final": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_out": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_progress": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "read_per_gene_tab": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "sam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "spl_junc_tab": [ + [ + { + "id": "test", + 
"single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tab": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "wig": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:16:16.798018" + }, + "homo_sapiens - paired_end - starfusion": { + "content": [ + "test.Log.final.out", + "test.Log.out", + "test.Log.progress.out", + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.out.bam:md5,bcad07b838f6762fc01eea52b5cd3f84" + ] + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,c10ef219f4a30e83711b995bc5e40dba" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,19c3faa1bfa9a0cc5e4c45f17065b53a" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,19c3faa1bfa9a0cc5e4c45f17065b53a" + ] + ], + [ + + ], + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T13:10:55.371956" + }, + "homo_sapiens - paired_end - arriba": { + "content": [ + "test.Log.final.out", + "test.Log.out", + "test.Log.progress.out", + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.out.bam:md5,c1b1747f5873f2d17762725636e891d5" + ] + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.SJ.out.tab:md5,5155c9fd1f787ad6d7d80987fb06219c" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,5155c9fd1f787ad6d7d80987fb06219c" + ] + ], + [ + + ], + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T13:05:10.7534" + }, + "homo_sapiens - paired_end - starfusion - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "10": [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "11": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "12": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "13": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "14": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "15": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + 
], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "9": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "testXd.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_sorted": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Aligned.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sortedByCoord.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "bam_transcript": [ + [ + { + "id": "test", + "single_end": false + }, + "test.toTranscriptome.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam_unsorted": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.unsort.out.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bedgraph": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.bg:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "fastq": [ + [ + { + "id": "test", + "single_end": false + }, + 
[ + "test.unmapped_1.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test.unmapped_2.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "junction": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_final": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.final.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_out": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_progress": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "read_per_gene_tab": [ + [ + { + "id": "test", + "single_end": false + }, + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "sam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.out.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "spl_junc_tab": [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tab": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.ReadsPerGene.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.SJ.out.tab:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tab:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ], + "wig": [ + [ + { + "id": "test", + "single_end": false + }, + "test.Signal.UniqueMultiple.str1.out.wig:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:16:40.64399" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/align/tests/nextflow.arriba.config b/modules/nf-core/star/align/tests/nextflow.arriba.config new file mode 100644 index 0000000..2324b9e --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.arriba.config @@ 
-0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM Unsorted --outSAMunmapped Within --outBAMcompression 0 --outFilterMultimapNmax 50 --peOverlapNbasesMin 10 --alignSplicedMateMapLminOverLmate 0.5 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentMin 10 --chimOutType WithinBAM HardClip --chimJunctionOverhangMin 10 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 --chimSegmentReadGapMax 3 --chimMultimapNmax 50' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/nextflow.config b/modules/nf-core/star/align/tests/nextflow.config new file mode 100644 index 0000000..c4ac580 --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --outWigType bedGraph --outWigStrand Unstranded' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/nextflow.starfusion.config b/modules/nf-core/star/align/tests/nextflow.starfusion.config new file mode 100644 index 0000000..467b649 --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.starfusion.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM Unsorted --outReadsUnmapped None --twopassMode Basic --outSAMstrandField intronMotif --outSAMunmapped Within --chimSegmentMin 12 --chimJunctionOverhangMin 8 --chimOutJunctionFormat 1 --alignSJDBoverhangMin 10 --alignMatesGapMax 100000 
--alignIntronMax 100000 --alignSJstitchMismatchNmax 5 -1 5 5 --chimMultimapScoreRange 3 --chimScoreJunctionNonGTAG -4 --chimMultimapNmax 20 --chimNonchimScoreDropMin 10 --peOverlapNbasesMin 12 --peOverlapMMp 0.1 --alignInsertionFlush Right --alignSplicedMateMapLminOverLmate 0 --alignSplicedMateMapLmin 30' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/tags.yml b/modules/nf-core/star/align/tests/tags.yml new file mode 100644 index 0000000..8beace1 --- /dev/null +++ b/modules/nf-core/star/align/tests/tags.yml @@ -0,0 +1,2 @@ +star/align: + - modules/nf-core/star/align/** diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf index b885571..4d4055b 100644 --- a/modules/nf-core/star/genomegenerate/main.nf +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -21,7 +21,7 @@ process STAR_GENOMEGENERATE { script: def args = task.ext.args ?: '' def args_list = args.tokenize() - def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def memory = task.memory ? "--limitGenomeGenerateRAM ${(task.memory.toBytes()*task.cpus) - 100000000}" : '' def include_gtf = gtf ? 
"--sjdbGTFfile $gtf" : '' if (args_list.contains('--genomeSAindexNbases')) { """ diff --git a/modules/nf-core/star/genomegenerate/star-genomegenerate.diff b/modules/nf-core/star/genomegenerate/star-genomegenerate.diff new file mode 100644 index 0000000..247d39a --- /dev/null +++ b/modules/nf-core/star/genomegenerate/star-genomegenerate.diff @@ -0,0 +1,20 @@ +Changes in module 'nf-core/star/genomegenerate' +'modules/nf-core/star/genomegenerate/environment.yml' is unchanged +'modules/nf-core/star/genomegenerate/meta.yml' is unchanged +Changes in 'star/genomegenerate/main.nf': +--- modules/nf-core/star/genomegenerate/main.nf ++++ modules/nf-core/star/genomegenerate/main.nf +@@ -21,7 +21,7 @@ + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() +- def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' ++ def memory = task.memory ? "--limitGenomeGenerateRAM ${(task.memory.toBytes()*task.cpus) - 100000000}" : '' + def include_gtf = gtf ? 
"--sjdbGTFfile $gtf" : '' + if (args_list.contains('--genomeSAindexNbases')) { + """ + +'modules/nf-core/star/genomegenerate/tests/tags.yml' is unchanged +'modules/nf-core/star/genomegenerate/tests/main.nf.test' is unchanged +'modules/nf-core/star/genomegenerate/tests/main.nf.test.snap' is unchanged +************************************************************ diff --git a/subworkflows/local/align_reads.nf b/subworkflows/local/align_reads.nf index 8f50d57..8e5c410 100644 --- a/subworkflows/local/align_reads.nf +++ b/subworkflows/local/align_reads.nf @@ -19,7 +19,7 @@ workflow ALIGN_READS { STAR_ALIGN( reads, star_index, - gtf.map{it[1]}, + gtf, false, [], [] diff --git a/subworkflows/local/fillout.nf b/subworkflows/local/fillout.nf index 9453446..3a3703f 100755 --- a/subworkflows/local/fillout.nf +++ b/subworkflows/local/fillout.nf @@ -32,7 +32,7 @@ workflow FILLOUT { .map{ meta, bam, bai, variants -> [ meta, bam, bai, variants, "${variants.getBaseName()}.gbcms.maf"] }, - fasta, + fasta.map{it[1]}.first(), fai ) diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf index 6f8578c..fb8525e 100644 --- a/subworkflows/local/fusion.nf +++ b/subworkflows/local/fusion.nf @@ -1,5 +1,5 @@ include { STAR_ALIGN as STAR_FOR_ARRIBA } from '../../modules/nf-core/star/align/main' -include { ARRIBA } from '../../modules/nf-core/arriba/main' +include { ARRIBA_ARRIBA } from '../../modules/nf-core/arriba/arriba/main' include { STAR_ALIGN as STAR_FOR_STARFUSION } from '../../modules/nf-core/star/align/main' include { STARFUSION } from '../../modules/local/starfusion/detect/main' include { FUSIONCATCHER_DETECT } from '../../modules/local/fusioncatcher/detect/main' @@ -19,6 +19,7 @@ workflow FUSION { reads reads_untrimmed star_index + fasta gtf starfusion_ref fusioncatcher_ref @@ -33,7 +34,7 @@ workflow FUSION { main: ch_versions = Channel.empty() - fasta = params.fasta + //fasta = params.fasta //gene_bed = params.metafusion_gene_bed //gene_info = 
params.metafusion_gene_info //blocklist = params.metafusion_blocklist @@ -41,30 +42,30 @@ workflow FUSION { STAR_FOR_ARRIBA( reads, star_index, - gtf.map{it[1]}, + gtf, false, [], [] ) ch_versions = ch_versions.mix(STAR_FOR_ARRIBA.out.versions.first()) - ARRIBA( + ARRIBA_ARRIBA( STAR_FOR_ARRIBA.out.bam, fasta, - gtf.map{it[1]}, - arriba_blacklist, - arriba_known_fusions, - [], - [], - arriba_protein_domains + gtf, + arriba_blacklist.map{[[:],it]}.view(), + arriba_known_fusions.map{[[:],it]}, + [[:],[]], + [[:],[]], + arriba_protein_domains.map{[[:],it]} ) - ch_versions = ch_versions.mix(ARRIBA.out.versions.first()) + ch_versions = ch_versions.mix(ARRIBA_ARRIBA.out.versions.first()) STAR_FOR_STARFUSION( reads, // use the star index in the starfusion reference to ensure compatibility - starfusion_ref.map{ file( it + "/ref_genome.fa.star.idx")}, - starfusion_ref.map{ file( it + "/ref_annot.gtf")}, + starfusion_ref.map{ [[id:params.genome],file( it + "/ref_genome.fa.star.idx")] }, + starfusion_ref.map{ [[id:params.genome],file( it + "/ref_annot.gtf")] }, false, [], [] @@ -88,7 +89,7 @@ workflow FUSION { fc_fusions = ["GRCh37","hg19","smallGRCh37"].contains(params.genome) ? 
FUSIONCATCHER_DETECT.out.fusions_alt : FUSIONCATCHER_DETECT.out.fusions - ARRIBA_TO_CFF(ARRIBA.out.fusions + ARRIBA_TO_CFF(ARRIBA_ARRIBA.out.fusions .map{ meta, file ->[ meta, "arriba", file ] }) FUSIONCATCHER_TO_CFF(fc_fusions .map{ meta, file -> [ meta, "fusioncatcher", file ] } ) @@ -113,7 +114,7 @@ workflow FUSION { MERGE_CFF.out.file_out, gene_bed.map{ it[1] }.first(), gene_info.map{ it[1] }.first(), - fasta, + fasta.map{ it[1] }.first(), blocklist ) diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index 938652a..d5a02dc 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -6,6 +6,7 @@ include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/ include { GATK4_BEDTOINTERVALLIST } from '../../modules/nf-core/gatk4/bedtointervallist/main' include { PREPARE_RRNA } from '../../modules/local/prepare_rrna/main' include { + GUNZIP as GUNZIP_FASTA ; GUNZIP as GUNZIP_GTF ; GUNZIP as GUNZIP_METAFUSIONGENEBED ; GUNZIP as GUNZIP_METAFUSIONBLOCKLIST @@ -24,6 +25,13 @@ workflow PREPARE_REFERENCES { main: ch_versions = Channel.empty() + if (params.fasta.endsWith(".gz")){ + GUNZIP_FASTA([[id:params.genome],params.fasta]) + fasta = GUNZIP_FASTA.out.gunzip.first() + } else { + fasta = Channel.of([[id:params.genome],params.fasta]) + } + if (params.gtf.endsWith(".gz")){ GUNZIP_GTF([[id:params.genome],params.gtf]) gtf = GUNZIP_GTF.out.gunzip.first() @@ -39,7 +47,7 @@ workflow PREPARE_REFERENCES { } STAR_GENOMEGENERATE( - [[id:params.genome],params.fasta], + fasta, gtf ) ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) @@ -58,7 +66,7 @@ workflow PREPARE_REFERENCES { } PREPARE_RRNA([],refflat) - GATK4_CREATESEQUENCEDICTIONARY(params.fasta) + GATK4_CREATESEQUENCEDICTIONARY(fasta) ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) GATK4_BEDTOINTERVALLIST( @@ -67,7 +75,7 @@ workflow PREPARE_REFERENCES { ) ch_versions = 
ch_versions.mix(GATK4_BEDTOINTERVALLIST.out.versions) - SAMTOOLS_FAIDX ([[:],params.fasta]) + SAMTOOLS_FAIDX(fasta,[[:],[]]) if (params.starfusion_url) { UNTAR_STARFUSION([[id:params.starfusion_url.tokenize("/")[-1].replaceFirst(/\.tar\.gz$/, "")],params.starfusion_url]) @@ -113,6 +121,7 @@ workflow PREPARE_REFERENCES { star_index = star_index // Convert queue channel to value channel so it never gets poison pilled refflat = refflat + fasta = fasta fasta_dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict fasta_fai = SAMTOOLS_FAIDX.out.fai rrna_bed = PREPARE_RRNA.out.rRNA_bed diff --git a/subworkflows/local/qc.nf b/subworkflows/local/qc.nf index b9e3bc5..58b4e04 100644 --- a/subworkflows/local/qc.nf +++ b/subworkflows/local/qc.nf @@ -17,12 +17,12 @@ workflow QC { refflat rrna_intervals rseqc_bed + fasta fai dict baits main: - fasta = params.fasta ch_versions = Channel.empty() BAM_RSEQC( @@ -35,7 +35,7 @@ workflow QC { PICARD_COLLECTRNASEQMETRICS( bam, refflat, - fasta, + fasta.map{it[1]}.first(), rrna_intervals ) ch_versions = ch_versions.mix(PICARD_COLLECTRNASEQMETRICS.out.versions.first()) @@ -51,9 +51,9 @@ workflow QC { }.map{ meta, bam, bai, bait, bait_file, target_file -> [meta, bam, bai, bait_file, target_file] }, - [[:],fasta], + fasta, fai, - dict.map{ dict -> [[:],dict]} + dict ) multiqc_files = multiqc_files diff --git a/workflows/forte.nf b/workflows/forte.nf index ea56c0a..4ccd0d9 100644 --- a/workflows/forte.nf +++ b/workflows/forte.nf @@ -129,6 +129,7 @@ workflow FORTE { PREPROCESS_READS.out.reads_trimmed, PREPROCESS_READS.out.reads_untrimmed, PREPARE_REFERENCES.out.star_index, + PREPARE_REFERENCES.out.fasta, PREPARE_REFERENCES.out.gtf, PREPARE_REFERENCES.out.starfusion_ref, PREPARE_REFERENCES.out.fusioncatcher_ref, @@ -152,7 +153,7 @@ workflow FORTE { ALIGN_READS.out.bam, ALIGN_READS.out.bai, MAF_INPUT_CHECK.out.mafs, - params.fasta, + PREPARE_REFERENCES.out.fasta, PREPARE_REFERENCES.out.fasta_fai.map{ it[1] }.first() ) ch_versions = 
ch_versions.mix(FILLOUT.out.ch_versions) @@ -168,6 +169,7 @@ workflow FORTE { PREPARE_REFERENCES.out.refflat, PREPARE_REFERENCES.out.rrna_interval_list, PREPARE_REFERENCES.out.rseqc_bed, + PREPARE_REFERENCES.out.fasta, PREPARE_REFERENCES.out.fasta_fai, PREPARE_REFERENCES.out.fasta_dict, BAIT_INPUTS.out.baits @@ -189,6 +191,7 @@ workflow FORTE { PREPARE_REFERENCES.out.refflat, PREPARE_REFERENCES.out.rrna_interval_list, PREPARE_REFERENCES.out.rseqc_bed, + PREPARE_REFERENCES.out.fasta, PREPARE_REFERENCES.out.fasta_fai, PREPARE_REFERENCES.out.fasta_dict, BAIT_INPUTS.out.baits From e206da4aa740416f4810ad4ba2b6561ac9fc5587 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 25 Sep 2024 17:36:44 -0400 Subject: [PATCH 05/28] fix indentation --- conf/igenomes.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/igenomes.config b/conf/igenomes.config index a653ef7..e39e26d 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -36,7 +36,7 @@ params { 'GRCh38' { ensembl_version = 88 //fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" - fasta = "https://ftp.ensembl.org/pub/release-88/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + fasta = "https://ftp.ensembl.org/pub/release-88/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" gtf = "https://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.gtf.gz" //forte will generate refflat from gtf refflat = null From fa7f511494aa66438b29542d8d5faa0c9a4b3d06 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 25 Sep 2024 21:39:47 -0400 Subject: [PATCH 06/28] fix agfusion download command --- modules/local/agfusion/download/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/agfusion/download/main.nf b/modules/local/agfusion/download/main.nf index 094ca39..44b3b2b 100644 --- a/modules/local/agfusion/download/main.nf +++ 
b/modules/local/agfusion/download/main.nf @@ -31,7 +31,7 @@ process AGFUSION_DOWNLOAD { pyensembl install --species ${pyensembl_species} --release ${ensembl_release} - agfusion download -g ${agfusion_genome} + agfusion download -s ${pyensembl_species} --release ${ensembl_release} cat <<-END_VERSIONS > versions.yml "${task.process}": From 20379c791379c8d9b146c6a182118771283ed45d Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 25 Sep 2024 21:41:05 -0400 Subject: [PATCH 07/28] clean up view operator --- subworkflows/local/fusion.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf index fb8525e..28e8546 100644 --- a/subworkflows/local/fusion.nf +++ b/subworkflows/local/fusion.nf @@ -53,7 +53,7 @@ workflow FUSION { STAR_FOR_ARRIBA.out.bam, fasta, gtf, - arriba_blacklist.map{[[:],it]}.view(), + arriba_blacklist.map{[[:],it]}, arriba_known_fusions.map{[[:],it]}, [[:],[]], [[:],[]], From 39744f71f0813a5826b0ad8d8c88ca42a10aa52f Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 26 Sep 2024 12:13:41 -0400 Subject: [PATCH 08/28] fix fillout pytest --- subworkflows/local/fillout.nf | 2 +- workflows/forte.nf | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/fillout.nf b/subworkflows/local/fillout.nf index 3a3703f..9453446 100755 --- a/subworkflows/local/fillout.nf +++ b/subworkflows/local/fillout.nf @@ -32,7 +32,7 @@ workflow FILLOUT { .map{ meta, bam, bai, variants -> [ meta, bam, bai, variants, "${variants.getBaseName()}.gbcms.maf"] }, - fasta.map{it[1]}.first(), + fasta, fai ) diff --git a/workflows/forte.nf b/workflows/forte.nf index 4ccd0d9..3ce6178 100644 --- a/workflows/forte.nf +++ b/workflows/forte.nf @@ -138,9 +138,9 @@ workflow FORTE { PREPARE_REFERENCES.out.metafusion_gene_bed, PREPARE_REFERENCES.out.metafusion_gene_info, PREPARE_REFERENCES.out.metafusion_blocklist, - workflow.profile.toString().split(",").contains("test") 
? [] : PREPARE_REFERENCES.out.arriba_blacklist, - workflow.profile.toString().split(",").contains("test") ? [] : PREPARE_REFERENCES.out.arriba_known_fusions, - workflow.profile.toString().split(",").contains("test") ? [] : PREPARE_REFERENCES.out.arriba_protein_domains + workflow.profile.toString().split(",").contains("test") ? Channel.of([]) : PREPARE_REFERENCES.out.arriba_blacklist, + workflow.profile.toString().split(",").contains("test") ? Channel.of([]) : PREPARE_REFERENCES.out.arriba_known_fusions, + workflow.profile.toString().split(",").contains("test") ? Channel.of([]) : PREPARE_REFERENCES.out.arriba_protein_domains ) ch_versions = ch_versions.mix(FUSION.out.ch_versions) @@ -153,7 +153,7 @@ workflow FORTE { ALIGN_READS.out.bam, ALIGN_READS.out.bai, MAF_INPUT_CHECK.out.mafs, - PREPARE_REFERENCES.out.fasta, + PREPARE_REFERENCES.out.fasta.map{ it[1] }.first(), PREPARE_REFERENCES.out.fasta_fai.map{ it[1] }.first() ) ch_versions = ch_versions.mix(FILLOUT.out.ch_versions) From 01b2981ac91558280463edfaaf5ee85df3f964d8 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 26 Sep 2024 14:08:54 -0400 Subject: [PATCH 09/28] change pfam reference to stagnant release --- modules/local/agfusion/download/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/agfusion/download/main.nf b/modules/local/agfusion/download/main.nf index 44b3b2b..ef4b80a 100644 --- a/modules/local/agfusion/download/main.nf +++ b/modules/local/agfusion/download/main.nf @@ -44,7 +44,7 @@ process AGFUSION_DOWNLOAD { pyensembl install --species ${pyensembl_species} --release ${ensembl_release} - curl http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz > pfamA.txt.gz + curl http://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam37.0/database_files/pfamA.txt.gz > pfamA.txt.gz gunzip pfamA.txt.gz agfusion build --dir . 
--species ${agfusion_genome} --release ${ensembl_release} --pfam pfamA.txt rm pfamA.txt From 732558981fab5a4b55cc54010ed4f69063921586 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 26 Sep 2024 16:23:52 -0400 Subject: [PATCH 10/28] add poison pill to reference channels --- subworkflows/local/prepare_references.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index d5a02dc..925571b 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -29,21 +29,21 @@ workflow PREPARE_REFERENCES { GUNZIP_FASTA([[id:params.genome],params.fasta]) fasta = GUNZIP_FASTA.out.gunzip.first() } else { - fasta = Channel.of([[id:params.genome],params.fasta]) + fasta = Channel.of([[id:params.genome],params.fasta]).first() } if (params.gtf.endsWith(".gz")){ GUNZIP_GTF([[id:params.genome],params.gtf]) gtf = GUNZIP_GTF.out.gunzip.first() } else { - gtf = Channel.of([[id:params.genome],params.gtf]) + gtf = Channel.of([[id:params.genome],params.gtf]).first() } if (params.metafusion_blocklist.endsWith(".gz")){ GUNZIP_METAFUSIONBLOCKLIST([[:],params.metafusion_blocklist]) metafusion_blocklist = GUNZIP_METAFUSIONBLOCKLIST.out.gunzip.map{ it[1] }.first() } else { - metafusion_blocklist = params.metafusion_blocklist + metafusion_blocklist = Channel.of(params.metafusion_blocklist).first() } STAR_GENOMEGENERATE( @@ -51,7 +51,7 @@ workflow PREPARE_REFERENCES { gtf ) ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) - star_index = STAR_GENOMEGENERATE.out.index + star_index = STAR_GENOMEGENERATE.out.index.first() UCSC_GTFTOGENEPRED(gtf) ch_versions = ch_versions.mix(UCSC_GTFTOGENEPRED.out.versions) From c2041194528fce10d2d95eb8d5a3c08af0b0af07 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 26 Sep 2024 16:51:49 -0400 Subject: [PATCH 11/28] update md5sum in test_profile test --- tests/small_test/test.yml | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/small_test/test.yml b/tests/small_test/test.yml index 50ff7f1..508e2ff 100755 --- a/tests/small_test/test.yml +++ b/tests/small_test/test.yml @@ -4,11 +4,11 @@ - test_profile files: - path: output/analysis/SAMPLE_PAIRED_END/STAR/SAMPLE_PAIRED_END.Aligned.sortedByCoord.out.bam - md5sum: 781acdb8d313482de17a2933e18bb97a + md5sum: e46db0148604c6937a2cc7535d934292 - path: output/analysis/SAMPLE_PAIRED_END_UMI/STAR/SAMPLE_PAIRED_END_UMI.Aligned.sortedByCoord.out.bam - path: output/analysis/SAMPLE_SINGLE_END/STAR/SAMPLE_SINGLE_END.Aligned.sortedByCoord.out.bam - path: output/analysis/SAMPLE_SINGLE_END/arriba/SAMPLE_SINGLE_END.fusions.discarded.tsv - md5sum: da3e17e01697fe9990fd545e1e26b822 + md5sum: 9daf6f31ee9a90b6b263bf5ae28dbe96 - path: output/analysis/SAMPLE_SINGLE_END/arriba/SAMPLE_SINGLE_END.fusions.tsv md5sum: 7c3383f7eb6d79b84b0bd30a7ef02d70 - path: output/pipeline_info/software_versions.yml From 7c5dc285876fe78398542e47e372b60a6c6ba4b0 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 26 Sep 2024 17:40:26 -0400 Subject: [PATCH 12/28] update ensembl version to 112 --- conf/igenomes.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/igenomes.config b/conf/igenomes.config index e39e26d..83ecb5d 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -34,14 +34,14 @@ params { ensembl_version = 75 } 'GRCh38' { - ensembl_version = 88 + ensembl_version = 112 //fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" - fasta = "https://ftp.ensembl.org/pub/release-88/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" - gtf = "https://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.gtf.gz" + fasta = "https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + gtf = 
"https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz" //forte will generate refflat from gtf refflat = null starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz" - cdna = "https://ftp.ensembl.org/pub/release-88/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" + cdna = "https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh38/blocklist_breakpoints.hg38.bedpe.gz" } 'smallGRCh37' { From d8e423ff52a89ee0dffe7a370de4cb0cd99cc06b Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 26 Sep 2024 17:48:28 -0400 Subject: [PATCH 13/28] update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a59f682..95508cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#118](https://github.com/mskcc/forte/pull/118) - change the way the plug-n-play starfusion reference is downloaded. 
+- [#128](https://github.com/mskcc/forte/pull/128) - full support for GRCh38 added + ### `Fixed` - [#119](https://github.com/mskcc/forte/pull/119) - change script error behavior in METAFUSION_RUN process From 4966057c714ce963aee2a645e613d278e9a709db Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Tue, 1 Oct 2024 11:41:47 -0400 Subject: [PATCH 14/28] change ensembl version to 111 --- conf/igenomes.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/igenomes.config b/conf/igenomes.config index 83ecb5d..89826d2 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -34,14 +34,14 @@ params { ensembl_version = 75 } 'GRCh38' { - ensembl_version = 112 + ensembl_version = 111 //fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" - fasta = "https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" - gtf = "https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz" + fasta = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + gtf = "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz" //forte will generate refflat from gtf refflat = null starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz" - cdna = "https://ftp.ensembl.org/pub/release-112/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" + cdna = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh38/blocklist_breakpoints.hg38.bedpe.gz" } 'smallGRCh37' { From 3bbaecfda0ace815dbb4969ded60a2b18327aab7 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha <9613506+anoronh4@users.noreply.github.com> Date: 
Tue, 1 Oct 2024 15:12:08 -0400 Subject: [PATCH 15/28] update AGFusion to v1.4.3@mskcc.1 --- modules/local/agfusion/batch/main.nf | 4 +-- modules/local/agfusion/container/Dockerfile | 27 ++++++++++++++++----- modules/local/agfusion/download/main.nf | 8 +++--- 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/modules/local/agfusion/batch/main.nf b/modules/local/agfusion/batch/main.nf index e8de04d..398a21b 100644 --- a/modules/local/agfusion/batch/main.nf +++ b/modules/local/agfusion/batch/main.nf @@ -5,8 +5,8 @@ process AGFUSION_BATCH { // Note: 2.7X indices incompatible with AWS iGenomes. conda 'bioconda::agfusion=1.252' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://cmopipeline/agfusion:0.0.6' : - 'docker.io/cmopipeline/agfusion:0.0.6' }" + 'docker://cmopipeline/agfusion:0.0.7' : + 'docker.io/cmopipeline/agfusion:0.0.7' }" input: tuple val(meta), path(fusions) diff --git a/modules/local/agfusion/container/Dockerfile b/modules/local/agfusion/container/Dockerfile index eaca5d5..a455f44 100755 --- a/modules/local/agfusion/container/Dockerfile +++ b/modules/local/agfusion/container/Dockerfile @@ -1,14 +1,30 @@ -FROM ubuntu:bionic-20230530 +FROM ubuntu:jammy-20240911.1 LABEL maintainer="Anne Marie Noronha (noronhaa@mskcc.org)" \ - version.image="0.0.6" + version.image="0.0.7" # INSTALL DEPENDENCIES ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y -RUN apt-get install -y build-essential python3 python3-pip python3-matplotlib python3-pandas python3-future python3-biopython curl less vim libnss-sss git zip +RUN apt-get install -y \ + build-essential \ + python3 \ + python3-pip \ + python3-matplotlib \ + python3-pandas \ + python3-future \ + python3-biopython \ + python3-dev \ + default-libmysqlclient-dev \ + pkg-config \ + curl \ + less \ + vim \ + libnss-sss \ + git \ + zip RUN pip3 install --upgrade pip RUN pip3 install pyensembl @@ -18,9 +34,8 @@ RUN pip3 install mysqlclient # 
INSTALL AGFUSION & DATABASE FILES WORKDIR /usr/local/bin -RUN git clone https://github.com/mskcc/AGFusion.git --branch v1.4.1-fork1 --single-branch +RUN git clone https://github.com/mskcc/AGFusion.git --branch v1.4.3@mskcc.1 --single-branch WORKDIR /usr/local/bin/AGFusion +RUN pip3 install -r requirements.txt RUN pip3 install . -# downgrade pyensembl for compatibility -RUN pip3 install gtfparse==1.2.1 --upgrade diff --git a/modules/local/agfusion/download/main.nf b/modules/local/agfusion/download/main.nf index 094ca39..513384a 100644 --- a/modules/local/agfusion/download/main.nf +++ b/modules/local/agfusion/download/main.nf @@ -4,8 +4,8 @@ process AGFUSION_DOWNLOAD { // Note: 2.7X indices incompatible with AWS iGenomes. conda 'bioconda::agfusion=1.252' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://cmopipeline/agfusion:0.0.6' : - 'docker.io/cmopipeline/agfusion:0.0.6' }" + 'docker://cmopipeline/agfusion:0.0.7' : + 'docker.io/cmopipeline/agfusion:0.0.7' }" input: val(ensembl_release) @@ -25,13 +25,13 @@ process AGFUSION_DOWNLOAD { ['GRCh38','hg38'].contains(genome) ? 'hg38' : ['GRCm38','mm10'].contains(genome) ? 'mm10' : '' def pyensembl_species = ['GRCm38','mm10'].contains(genome) ? 
'mus_musculus' : 'homo_sapiens' - if (ensembl_release < 93) { + if (ensembl_release < 112) { """ export PYENSEMBL_CACHE_DIR=\$PWD/pyensembl_cache pyensembl install --species ${pyensembl_species} --release ${ensembl_release} - agfusion download -g ${agfusion_genome} + agfusion download -s ${pyensembl_species} -r ${ensembl_release} cat <<-END_VERSIONS > versions.yml "${task.process}": From cd06afcff6e1528ee4bdcf49267171f324a425da Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 2 Oct 2024 13:07:01 -0400 Subject: [PATCH 16/28] add cpus in AGAT_SPADDINTRONS resources --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index 5f15d4e..8d912d2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -210,6 +210,7 @@ process { ] } withName: 'AGAT_SPADDINTRONS' { + cpus = { 4 * task.attempt } storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" } publishDir = [ enabled: false, From 3bfeb91eea29c90b443a3c54611d1078955e7b31 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 2 Oct 2024 13:09:11 -0400 Subject: [PATCH 17/28] fix indentation --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 8d912d2..33b1603 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -210,7 +210,7 @@ process { ] } withName: 'AGAT_SPADDINTRONS' { - cpus = { 4 * task.attempt } + cpus = { 4 * task.attempt } storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" } publishDir = [ enabled: false, From 9147321e10c9e0c16e358869fba41932375dc8ef Mon Sep 17 00:00:00 2001 From: pintoa1-mskcc Date: Wed, 2 Oct 2024 13:11:38 -0400 Subject: [PATCH 18/28] Set gene_id as gene_name for lncRNAs, remove NF transcripts from gene bed --- bin/final_generate_v75_gene_bed.R | 7 +++++-- bin/make_gene_info_for_forte.R | 2 +- modules/local/metafusion/genebed/main.nf | 4 ++-- 
.../resources/usr/bin/final_generate_v75_gene_bed.R | 11 +++++++---- .../resources/usr/bin/make_gene_info_for_forte.R | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/bin/final_generate_v75_gene_bed.R b/bin/final_generate_v75_gene_bed.R index ffcb064..156cfbc 100755 --- a/bin/final_generate_v75_gene_bed.R +++ b/bin/final_generate_v75_gene_bed.R @@ -3,7 +3,7 @@ # __author__ = "Alexandria Dymun" # __email__ = "pintoa1@mskcc.org" # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" +# __version__ = "0.0.2" # __status__ = "Dev" @@ -35,6 +35,8 @@ if (length(args)!=2) { gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) +#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) +gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] file.to_write <- args[2] @@ -43,7 +45,8 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) + filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) #START CLOCK diff --git a/bin/make_gene_info_for_forte.R b/bin/make_gene_info_for_forte.R index 2ab3dfd..08ac644 100755 --- a/bin/make_gene_info_for_forte.R +++ b/bin/make_gene_info_for_forte.R @@ -106,7 +106,7 @@ gene_info <- rbind(gene_info,add_these_excess_gene_ids) gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F) gene_info$Synonyms <- ifelse(is.na(gene_info$gene_id_with_version),gene_info$gene_id,paste0(gene_info$gene_id,"|",gene_info$gene_id_with_version)) -gene_info$Symbol <- gene_info$gene_name +gene_info$Symbol <- ifelse(is.na(gene_info$gene_name), gene_info$gene_id, gene_info$gene_name) gene_info <- gene_info[,c("Symbol","Synonyms")] diff --git a/modules/local/metafusion/genebed/main.nf 
b/modules/local/metafusion/genebed/main.nf index 2314c40..1936846 100644 --- a/modules/local/metafusion/genebed/main.nf +++ b/modules/local/metafusion/genebed/main.nf @@ -28,7 +28,7 @@ process METAFUSION_GENEBED { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.1 + final_generate_v75_gene_bed.R: 0.0.2 END_VERSIONS """ @@ -41,7 +41,7 @@ process METAFUSION_GENEBED { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.1 + final_generate_v75_gene_bed.R: 0.0.2 END_VERSIONS """ } diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R index 46a5d15..156cfbc 100755 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R +++ b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R @@ -1,14 +1,14 @@ - #!/usr/local/bin/Rscript + # __author__ = "Alexandria Dymun" # __email__ = "pintoa1@mskcc.org" # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" +# __version__ = "0.0.2" # __status__ = "Dev" suppressPackageStartupMessages({ - library(plyr) +# library(plyr) library(dplyr) library(data.table) library(stringr) @@ -35,6 +35,8 @@ if (length(args)!=2) { gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) +#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) +gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] file.to_write <- args[2] @@ -43,7 +45,8 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) + filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) #START CLOCK diff --git 
a/modules/local/metafusion/geneinfo/resources/usr/bin/make_gene_info_for_forte.R b/modules/local/metafusion/geneinfo/resources/usr/bin/make_gene_info_for_forte.R index 2ab3dfd..08ac644 100755 --- a/modules/local/metafusion/geneinfo/resources/usr/bin/make_gene_info_for_forte.R +++ b/modules/local/metafusion/geneinfo/resources/usr/bin/make_gene_info_for_forte.R @@ -106,7 +106,7 @@ gene_info <- rbind(gene_info,add_these_excess_gene_ids) gene_info <- merge(gene_info,do.call(rbind,unique_id_to_names[versioned_gtf])[,c("gene_id","gene_id_with_version")],by = "gene_id",all.x = T, all.y = F) gene_info$Synonyms <- ifelse(is.na(gene_info$gene_id_with_version),gene_info$gene_id,paste0(gene_info$gene_id,"|",gene_info$gene_id_with_version)) -gene_info$Symbol <- gene_info$gene_name +gene_info$Symbol <- ifelse(is.na(gene_info$gene_name), gene_info$gene_id, gene_info$gene_name) gene_info <- gene_info[,c("Symbol","Synonyms")] From ad507432d783d61b98d92c0ddf8c2bb259a76ad5 Mon Sep 17 00:00:00 2001 From: pintoa1-mskcc Date: Wed, 2 Oct 2024 14:10:34 -0400 Subject: [PATCH 19/28] fix linting error, ensure no scientific notation in gene bed --- bin/final_generate_v75_gene_bed.R | 3 ++- .../genebed/resources/usr/bin/final_generate_v75_gene_bed.R | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/final_generate_v75_gene_bed.R b/bin/final_generate_v75_gene_bed.R index 156cfbc..f79c712 100755 --- a/bin/final_generate_v75_gene_bed.R +++ b/bin/final_generate_v75_gene_bed.R @@ -12,6 +12,7 @@ suppressPackageStartupMessages({ library(dplyr) library(data.table) library(stringr) + options(scipen = 999) }) usage <- function() { @@ -45,7 +46,7 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) diff --git 
a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R index 156cfbc..f79c712 100755 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R +++ b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R @@ -12,6 +12,7 @@ suppressPackageStartupMessages({ library(dplyr) library(data.table) library(stringr) + options(scipen = 999) }) usage <- function() { @@ -45,7 +46,7 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) From 95e737e24d10ed753aedba2defce835e38185644 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Wed, 9 Oct 2024 21:08:03 -0400 Subject: [PATCH 20/28] remove deprecated arriba installation --- modules.json | 5 --- modules/nf-core/arriba/main.nf | 66 ----------------------------- modules/nf-core/arriba/meta.yml | 74 --------------------------------- 3 files changed, 145 deletions(-) delete mode 100644 modules/nf-core/arriba/main.nf delete mode 100644 modules/nf-core/arriba/meta.yml diff --git a/modules.json b/modules.json index 5a728db..c89decd 100644 --- a/modules.json +++ b/modules.json @@ -21,11 +21,6 @@ "git_sha": "6898156da3604a6bdf26c36036053a970050fea0", "installed_by": ["modules"] }, - "arriba": { - "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", - "installed_by": ["modules"] - }, "arriba/arriba": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/arriba/main.nf b/modules/nf-core/arriba/main.nf deleted file mode 100644 index e4b48be..0000000 --- a/modules/nf-core/arriba/main.nf +++ /dev/null @@ -1,66 +0,0 @@ -process ARRIBA { - 
tag "$meta.id" - label 'process_medium' - - conda "bioconda::arriba=2.3.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/arriba:2.3.0--haa8aa89_0' : - 'quay.io/biocontainers/arriba:2.3.0--haa8aa89_0' }" - - input: - tuple val(meta), path(bam) - path fasta - path gtf - path blacklist - path known_fusions - path structural_variants - path tags - path protein_domains - - output: - tuple val(meta), path("*.fusions.tsv") , emit: fusions - tuple val(meta), path("*.fusions.discarded.tsv"), emit: fusions_fail - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def blacklist = blacklist ? "-b $blacklist" : "-f blacklist" - def known_fusions = known_fusions ? "-k $known_fusions" : "" - def structural_variants = structural_variants ? "-d $structual_variants" : "" - def tags = tags ? "-t $tags" : "" - def protein_domains = protein_domains ? 
"-p $protein_domains" : "" - - """ - arriba \\ - -x $bam \\ - -a $fasta \\ - -g $gtf \\ - -o ${prefix}.fusions.tsv \\ - -O ${prefix}.fusions.discarded.tsv \\ - $blacklist \\ - $known_fusions \\ - $structural_variants \\ - $tags \\ - $protein_domains \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - arriba: \$(arriba -h | grep 'Version:' 2>&1 | sed 's/Version:\s//') - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - echo stub > ${prefix}.fusions.tsv - echo stub > ${prefix}.fusions.discarded.tsv - - echo "${task.process}:" > versions.yml - echo ' arriba: 2.2.1' >> versions.yml - """ -} diff --git a/modules/nf-core/arriba/meta.yml b/modules/nf-core/arriba/meta.yml deleted file mode 100644 index 119dd91..0000000 --- a/modules/nf-core/arriba/meta.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: arriba -description: Arriba is a command-line tool for the detection of gene fusions from RNA-Seq data. -keywords: - - fusion - - arriba -tools: - - arriba: - description: Fast and accurate gene fusion detection from RNA-Seq data - homepage: https://github.com/suhrig/arriba - documentation: https://arriba.readthedocs.io/en/latest/ - tool_dev_url: https://github.com/suhrig/arriba - doi: "10.1101/gr.257246.119" - licence: ["MIT"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - - fasta: - type: file - description: Assembly FASTA file - pattern: "*.{fasta}" - - gtf: - type: file - description: Annotation GTF file - pattern: "*.{gtf}" - - blacklist: - type: file - description: Blacklist file - pattern: "*.{tsv}" - - known_fusions: - type: file - description: Known fusions file - pattern: "*.{tsv}" - - structural_variants: - type: file - description: Structural variants file - pattern: "*.{tsv}" - - tags: - type: file - description: Tags file - pattern: "*.{tsv}" - - protein_domains: - type: file - description: Protein domains file - pattern: "*.{gff3}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - fusions: - type: file - description: File contains fusions which pass all of Arriba's filters. - pattern: "*.{fusions.tsv}" - - fusions_fail: - type: file - description: File contains fusions that Arriba classified as an artifact or that are also observed in healthy tissue. 
- pattern: "*.{fusions.discarded.tsv}" - -authors: - - "@praveenraj2018,@rannick" From 3d97ab86b026669b51e2686a6585769faa869075 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 10 Oct 2024 10:15:38 -0400 Subject: [PATCH 21/28] update to decoy fasta --- conf/igenomes.config | 4 +-- .../local/fastaremoveprefix/environment.yml | 5 +++ modules/local/fastaremoveprefix/main.nf | 32 +++++++++++++++++++ modules/local/prepare_rrna/main.nf | 5 +-- subworkflows/local/prepare_references.nf | 6 ++++ 5 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 modules/local/fastaremoveprefix/environment.yml create mode 100644 modules/local/fastaremoveprefix/main.nf diff --git a/conf/igenomes.config b/conf/igenomes.config index 89826d2..69c949b 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -35,8 +35,8 @@ params { } 'GRCh38' { ensembl_version = 111 - //fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" - fasta = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" + fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38Decoy/Sequence/WholeGenomeFasta/genome.fa" + //fasta = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz" gtf = "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz" //forte will generate refflat from gtf refflat = null diff --git a/modules/local/fastaremoveprefix/environment.yml b/modules/local/fastaremoveprefix/environment.yml new file mode 100644 index 0000000..315f6dc --- /dev/null +++ b/modules/local/fastaremoveprefix/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::gawk=5.3.0 diff --git a/modules/local/fastaremoveprefix/main.nf b/modules/local/fastaremoveprefix/main.nf new file mode 100644 index 0000000..71a790f --- /dev/null +++ 
b/modules/local/fastaremoveprefix/main.nf @@ -0,0 +1,32 @@ +process FASTAREMOVEPREFIX { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.3.0' : + 'biocontainers/gawk:5.3.0' }" + + when: + task.ext.when == null || task.ext.when + + input: + tuple val(meta), path(fasta, name: 'input/*') + + output: + tuple val(meta), path("*.{fa,fasta}"), emit: fasta + path "versions.yml" , emit: versions + + script: + def modified_fasta = fasta.fileName.name + """ + cat ${fasta} | sed "s/^>chr/>/g" > ${modified_fasta} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + + +} diff --git a/modules/local/prepare_rrna/main.nf b/modules/local/prepare_rrna/main.nf index 37ec774..8d5ab0a 100644 --- a/modules/local/prepare_rrna/main.nf +++ b/modules/local/prepare_rrna/main.nf @@ -14,6 +14,7 @@ process PREPARE_RRNA { path "rna.bed", emit: rRNA_bed script: + def extra_filter_chr = params.genome == "GRCh38" ? "|^GL000220|^KI270733" : "" if (gtf) { """ (${"${gtf}".endsWith(".gz") ? "z" : ""}grep "rRNA" ${gtf} || true) | \\ @@ -23,7 +24,7 @@ process PREPARE_RRNA { /transcript_id "([^"]+)"/ or die "no transcript_id on \$."; print join "\t", (@F[0,1,2,3], \$1) ' | \\ - (grep -vP "^HG|^HSCHR" || true) | \\ + (grep -vP "^HG|^HSCHR${extra_filter_chr}" || true) | \\ sort -k1V -k2n -k3n \\ > rna.bed @@ -32,7 +33,7 @@ process PREPARE_RRNA { """ (${"${refflat}".endsWith(".gz") ? 
"z" : ""}grep -P "^RNA5|^RNA1|^RNA2" ${refflat} || true) | \\ awk -F"\\t" -v OFS="\\t" '{ print \$3,\$5,\$6,\$4,\$2 }' | \\ - (grep -vP "^HG|^HSCHR" || true) | \\ + (grep -vP "^HG|^HSCHR${extra_filter_chr}" || true) | \\ sort -k1V -k2n -k3n \\ > rna.bed """ diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index 925571b..698f376 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -19,6 +19,7 @@ include { AGFUSION_DOWNLOAD } from '../../modules/local/agfusion/do include { AGAT_SPADDINTRONS } from '../../modules/nf-core/agat/spaddintrons/main' include { METAFUSION_GENEBED } from '../../modules/local/metafusion/genebed/main' include { METAFUSION_GENEINFO } from '../../modules/local/metafusion/geneinfo/main' +include { FASTAREMOVEPREFIX } from '../../modules/local/fastaremoveprefix/main' workflow PREPARE_REFERENCES { @@ -32,6 +33,11 @@ workflow PREPARE_REFERENCES { fasta = Channel.of([[id:params.genome],params.fasta]).first() } + if (params.genome == "GRCh38" ){ + FASTAREMOVEPREFIX(fasta) + fasta = FASTAREMOVEPREFIX.out.fasta + } + if (params.gtf.endsWith(".gz")){ GUNZIP_GTF([[id:params.genome],params.gtf]) gtf = GUNZIP_GTF.out.gunzip.first() From 7f8c7c05ee22a5d4a8642ca2106ecf227d4c8431 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 10 Oct 2024 11:21:43 -0400 Subject: [PATCH 22/28] add storeDir directives for gunzip* and FASTAREMOVEPREFIX --- conf/modules.config | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 33b1603..400d3c8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -27,6 +27,14 @@ process { ] } + withName: '.*:PREPARE_REFERENCES:GUNZIP.*' { + storeDir = { "${params.reference_base}/${params.genome}/${task.process.tokenize(':')[-1].toLowerCase()}" } + } + + withName: 'FASTAREMOVEPREFIX' { + storeDir = { "${params.reference_base}/${params.genome}/fasta" } + } + withName: 
'MSKCC_FORTE:FORTE:MULTIQC' { publishDir = [ path: { "${report.folder}/report" }, From 709317a0abd709d03bb35203af5a4ec79e3db029 Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Thu, 10 Oct 2024 16:45:28 -0400 Subject: [PATCH 23/28] change chromosome M to MT in fasta --- modules/local/fastaremoveprefix/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/fastaremoveprefix/main.nf b/modules/local/fastaremoveprefix/main.nf index 71a790f..c7ebf26 100644 --- a/modules/local/fastaremoveprefix/main.nf +++ b/modules/local/fastaremoveprefix/main.nf @@ -20,7 +20,7 @@ process FASTAREMOVEPREFIX { script: def modified_fasta = fasta.fileName.name """ - cat ${fasta} | sed "s/^>chr/>/g" > ${modified_fasta} + cat ${fasta} | sed "s/^>chr/>/g" | sed "s/^>M />MT /g" > ${modified_fasta} cat <<-END_VERSIONS > versions.yml "${task.process}": From a70828dbb9e348a308f881b9c7aae1efbf3e875b Mon Sep 17 00:00:00 2001 From: Anne Marie Noronha Date: Fri, 11 Oct 2024 15:34:12 -0400 Subject: [PATCH 24/28] add idt_v2 baits for GRCh38 to reference config --- conf/igenomes.config | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conf/igenomes.config b/conf/igenomes.config index 69c949b..cfb81b7 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -43,6 +43,12 @@ params { starfusion_url = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/__genome_libs_StarFv1.10/GRCh38_gencode_v37_CTAT_lib_Mar012021.plug-n-play.tar.gz" cdna = "https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz" metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh38/blocklist_breakpoints.hg38.bedpe.gz" + baits { + 'idt_v2' { + targets = "/juno/work/ccs/cmopipeline/forte/GRCh38_probes/xgen-exome-hyb-panel-v2-targets-hg38.bed" + baits = "/juno/work/ccs/cmopipeline/forte/GRCh38_probes/xgen-exome-hyb-panel-v2-probes-hg38.bed" + } + } } 'smallGRCh37' { fasta = 
"https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta" From 612a80a4898f7117cfa95ec4ca7713cf73df5780 Mon Sep 17 00:00:00 2001 From: Alexandria Pinto Date: Thu, 7 Nov 2024 15:52:33 -0500 Subject: [PATCH 25/28] edit genebed generation to function with v111 nomenclature --- bin/final_generate_v111_gene_bed.R | 122 ++++++++++++++++++ modules/local/metafusion/genebed/main.nf | 69 ++++++---- .../usr/bin/final_generate_v111_gene_bed.R | 122 ++++++++++++++++++ subworkflows/local/prepare_references.nf | 3 +- 4 files changed, 291 insertions(+), 25 deletions(-) create mode 100755 bin/final_generate_v111_gene_bed.R create mode 100755 modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R diff --git a/bin/final_generate_v111_gene_bed.R b/bin/final_generate_v111_gene_bed.R new file mode 100755 index 0000000..28983da --- /dev/null +++ b/bin/final_generate_v111_gene_bed.R @@ -0,0 +1,122 @@ +#!/usr/local/bin/Rscript + +# __author__ = "Alexandria Dymun" +# __email__ = "pintoa1@mskcc.org" +# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +# __version__ = "0.0.1" +# __status__ = "Dev" + + +suppressPackageStartupMessages({ + library(plyr) + library(dplyr) + library(data.table) + library(stringr) + options(scipen = 999) +}) + +usage <- function() { + message("Usage:") + message("final_generate_v111_gene_bed.R ") +} + +args = commandArgs(TRUE) + +if (length(args)!=2) { + usage() + quit() +} + +gtf <- rtracklayer::import(args[1]) +gtf_df <- as.data.frame(gtf) +#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) +gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] + +file.to_write <- args[2] + +### convert start to 0-based to match metafusion expectations of gff format +gtf_df <- gtf_df %>% + rename( + chr = seqnames + ) %>% + select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% + filter(type %in% 
c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>% + mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) + + +#START CLOCK +ptm <- proc.time() +print(ptm) + +# Index each transcript feature, incrementing when an intron is passed +## metafusion expects exon count 0 to (N(exons)-1) +## Forward strand: Exon 0 == Exon 1 +### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT + +print(dim(gtf_df)) +print(length(unique(gtf_df$transcript_id))) + +modify_transcript <- function(transcript){ + + # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another + if ("CDS" %in% transcript$type){ + transcript <- transcript[!transcript$type == "exon",] + } + # Order features by increasing bp + transcript <- transcript[order(transcript$start, decreasing = FALSE),] + # Index features + idx <- 0 + for (i in 1:nrow(transcript)){ + transcript$idx[i]<- idx + if (transcript$type[i] == "intron"){ + idx <- idx + 1 + } + } + # REFORMAT TRANSCRIPT + #Change strand info (+ --> f, - --> r) + if (unique(transcript$strand) == "+"){ + transcript$strand <- 'f' + } else if (unique(transcript$strand) == "-"){ + transcript$strand <- 'r' + } else { + errorCondition("Strand info for this transcript is inconsistent") + } + #Add "chr" prefix to chromosomes + transcript$chr <- sapply("chr", paste0, transcript$chr) + #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later + transcript <- transcript %>% mutate(type = as.character(type)) + transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) + transcript$type[transcript$type == "five_prime_utr"] <- "utr5" + transcript$type[transcript$type == "three_prime_utr"] <- "utr3" + + + #### Any exon that remains after the cds change, is likely and untranslated region. change below + # Basically, subfeatures which are "exon" need to be changed (i.e. 
exon --> utr3/utr5) + #Forward strand + transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" + #Reverse strand + transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" + expected_types <- c("cds","intron","utr3","utr5") + transcript <- transcript[transcript$type %in% c(expected_types),] + return(transcript) +} + +if(file.exists(file.to_write) ) {file.remove(file.to_write)} + +gtf_df_modified <- gtf_df %>% + group_by(transcript_id,.drop = FALSE) %>% + group_modify(~ modify_transcript(.x)) %>% + select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% + arrange(chr,start,end) + +time <- proc.time() - ptm +print(time) + +write.table( + gtf_df_modified, + file.to_write, + sep="\t", + quote=F, + row.names=F, + col.names=F +) diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf index 27a1a7f..3d11e7c 100644 --- a/modules/local/metafusion/genebed/main.nf +++ b/modules/local/metafusion/genebed/main.nf @@ -9,10 +9,9 @@ process METAFUSION_GENEBED { input: tuple val(meta), path(gff) - val ensembl_version output: - tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed + tuple val(meta), path("${meta.id}.metafusion.gene.bed"), emit: metafusion_gene_bed path "versions.yml" , emit: versions when: @@ -21,28 +20,52 @@ process METAFUSION_GENEBED { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - """ - final_generate_v75_gene_bed.R \\ - $gff \\ - ${ensembl_version}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.2 - END_VERSIONS - """ + if( prefix == 'GRCh37' ) + """ + final_generate_v75_gene_bed.R \\ + $gff \\ + ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + final_generate_v75_gene_bed.R: 0.0.2 + END_VERSIONS + """ + else if( 
prefix == 'GRCh38' ) + """ + final_generate_v111_gene_bed.R \\ + $gff \\ + ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + final_generate_v111_gene_bed.R: 0.0.1 + END_VERSIONS + """ stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.2 - END_VERSIONS - """ -} + if( prefix == 'GRCh37' ) + """ + touch ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + final_generate_v75_gene_bed.R: 0.0.2 + END_VERSIONS + """ + else if( prefix == 'GRCh38' ) + """ + touch ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + final_generate_v111_gene_bed.R: 0.0.1 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R new file mode 100755 index 0000000..afbbf10 --- /dev/null +++ b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R @@ -0,0 +1,122 @@ +#!/usr/local/bin/Rscript + +# __author__ = "Alexandria Dymun" +# __email__ = "pintoa1@mskcc.org" +# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" +# __version__ = "0.0.1" +# __status__ = "Dev" + + +suppressPackageStartupMessages({ + library(plyr) + library(dplyr) + library(data.table) + library(stringr) + options(scipen = 999) +}) + +usage <- function() { + message("Usage:") + message("final_generate_v111_gene_bed.R ") +} + +args = commandArgs(TRUE) + +if (length(args)!=2) { + usage() + quit() +} + +gtf <- rtracklayer::import(args[1]) +gtf_df <- as.data.frame(gtf) +#remove incomplete transcripts mRNA_end_NF and 
mRNA_start_NF (not finished) +gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] + +file.to_write <- args[2] + +### convert start to 0-based to match metafusion expectations of gff format +gtf_df <- gtf_df %>% + rename( + chr = seqnames + ) %>% + select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% + filter(type %in% c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>% + mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) + + +#START CLOCK +ptm <- proc.time() +print(ptm) + +# Index each transcript feature, incrementing when an intron is passed +## metafusion expects exon count 0 to (N(exons)-1) +## Forward strand: Exon 0 == Exon 1 +### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT + +print(dim(gtf_df)) +print(length(unique(gtf_df$transcript_id))) + +modify_transcript <- function(transcript){ + + # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another + if ("CDS" %in% transcript$type){ + transcript <- transcript[!transcript$type == "exon",] + } + # Order features by increasing bp + transcript <- transcript[order(transcript$start, decreasing = FALSE),] + # Index features + idx <- 0 + for (i in 1:nrow(transcript)){ + transcript$idx[i]<- idx + if (transcript$type[i] == "intron"){ + idx <- idx + 1 + } + } + # REFORMAT TRANSCRIPT + #Change strand info (+ --> f, - --> r) + if (unique(transcript$strand) == "+"){ + transcript$strand <- 'f' + } else if (unique(transcript$strand) == "-"){ + transcript$strand <- 'r' + } else { + errorCondition("Strand info for this transcript is inconsistent") + } + #Add "chr" prefix to chromosomes + transcript$chr <- sapply("chr", paste0, transcript$chr) + #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later + transcript <- transcript %>% mutate(type = as.character(type)) + transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) + transcript$type[transcript$type 
== "five_prime_utr"] <- "utr5" + transcript$type[transcript$type == "three_prime_utr"] <- "utr3" + } + + #### Any exon that remains after the cds change, is likely and untranslated region. change below + # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) + #Forward strand + transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" + #Reverse strand + transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" + expected_types <- c("cds","intron","utr3","utr5") + transcript <- transcript[transcript$type %in% c(expected_types),] + return(transcript) +} + +if(file.exists(file.to_write) ) {file.remove(file.to_write)} + +gtf_df_modified <- gtf_df %>% + group_by(transcript_id,.drop = FALSE) %>% + group_modify(~ modify_transcript(.x)) %>% + select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% + arrange(chr,start,end) + +time <- proc.time() - ptm +print(time) + +write.table( + gtf_df_modified, + file.to_write, + sep="\t", + quote=F, + row.names=F, + col.names=F +) diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index ef91f9d..698f376 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -107,8 +107,7 @@ workflow PREPARE_REFERENCES { ) METAFUSION_GENEBED( - AGAT_SPADDINTRONS.out.gff, - params.ensembl_version + AGAT_SPADDINTRONS.out.gff ) METAFUSION_GENEINFO( From 1e0d3f172e656225d73c6accc557c5bf534583e4 Mon Sep 17 00:00:00 2001 From: Alexandria Pinto Date: Thu, 7 Nov 2024 15:57:24 -0500 Subject: [PATCH 26/28] linting errors --- bin/final_generate_v111_gene_bed.R | 2 -- modules/local/metafusion/genebed/main.nf | 5 ++++- .../genebed/resources/usr/bin/final_generate_v111_gene_bed.R | 2 -- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/final_generate_v111_gene_bed.R b/bin/final_generate_v111_gene_bed.R index 28983da..d114ccb 100755 --- 
a/bin/final_generate_v111_gene_bed.R +++ b/bin/final_generate_v111_gene_bed.R @@ -88,8 +88,6 @@ modify_transcript <- function(transcript){ transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) transcript$type[transcript$type == "five_prime_utr"] <- "utr5" transcript$type[transcript$type == "three_prime_utr"] <- "utr3" - - #### Any exon that remains after the cds change, is likely and untranslated region. change below # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) #Forward strand diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf index 3d11e7c..25a3c1f 100644 --- a/modules/local/metafusion/genebed/main.nf +++ b/modules/local/metafusion/genebed/main.nf @@ -32,6 +32,7 @@ process METAFUSION_GENEBED { final_generate_v75_gene_bed.R: 0.0.2 END_VERSIONS """ + else if( prefix == 'GRCh38' ) """ final_generate_v111_gene_bed.R \\ @@ -58,8 +59,9 @@ process METAFUSION_GENEBED { final_generate_v75_gene_bed.R: 0.0.2 END_VERSIONS """ + else if( prefix == 'GRCh38' ) - """ + """ touch ${prefix}.metafusion.gene.bed cat <<-END_VERSIONS > versions.yml @@ -68,4 +70,5 @@ process METAFUSION_GENEBED { final_generate_v111_gene_bed.R: 0.0.1 END_VERSIONS """ + } \ No newline at end of file diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R index afbbf10..d114ccb 100755 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R +++ b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R @@ -88,8 +88,6 @@ modify_transcript <- function(transcript){ transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) transcript$type[transcript$type == "five_prime_utr"] <- "utr5" transcript$type[transcript$type == "three_prime_utr"] <- "utr3" - } - #### Any exon that remains after the cds change, is likely 
and untranslated region. change below # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) #Forward strand From e10530535c8ec1febd1bb3bbc0879306ea08c0be Mon Sep 17 00:00:00 2001 From: Alexandria Pinto Date: Thu, 7 Nov 2024 15:58:37 -0500 Subject: [PATCH 27/28] line endings --- modules/local/metafusion/genebed/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf index 25a3c1f..9cd2769 100644 --- a/modules/local/metafusion/genebed/main.nf +++ b/modules/local/metafusion/genebed/main.nf @@ -71,4 +71,4 @@ process METAFUSION_GENEBED { END_VERSIONS """ -} \ No newline at end of file +} From fb2f514e2712ded692dd09716c556c76e8b41270 Mon Sep 17 00:00:00 2001 From: Alexandria Pinto Date: Wed, 13 Nov 2024 13:15:44 -0500 Subject: [PATCH 28/28] Modify generate gene bed to one script --- bin/final_generate_v111_gene_bed.R | 120 ------------------ ...ate_v75_gene_bed.R => generate_gene_bed.R} | 13 +- modules/local/metafusion/genebed/main.nf | 63 +++------ .../usr/bin/final_generate_v111_gene_bed.R | 120 ------------------ ...ate_v75_gene_bed.R => generate_gene_bed.R} | 14 +- 5 files changed, 27 insertions(+), 303 deletions(-) delete mode 100755 bin/final_generate_v111_gene_bed.R rename bin/{final_generate_v75_gene_bed.R => generate_gene_bed.R} (88%) delete mode 100755 modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R rename modules/local/metafusion/genebed/resources/usr/bin/{final_generate_v75_gene_bed.R => generate_gene_bed.R} (88%) diff --git a/bin/final_generate_v111_gene_bed.R b/bin/final_generate_v111_gene_bed.R deleted file mode 100755 index d114ccb..0000000 --- a/bin/final_generate_v111_gene_bed.R +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/local/bin/Rscript - -# __author__ = "Alexandria Dymun" -# __email__ = "pintoa1@mskcc.org" -# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = 
"0.0.1" -# __status__ = "Dev" - - -suppressPackageStartupMessages({ - library(plyr) - library(dplyr) - library(data.table) - library(stringr) - options(scipen = 999) -}) - -usage <- function() { - message("Usage:") - message("final_generate_v111_gene_bed.R ") -} - -args = commandArgs(TRUE) - -if (length(args)!=2) { - usage() - quit() -} - -gtf <- rtracklayer::import(args[1]) -gtf_df <- as.data.frame(gtf) -#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) -gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] - -file.to_write <- args[2] - -### convert start to 0-based to match metafusion expectations of gff format -gtf_df <- gtf_df %>% - rename( - chr = seqnames - ) %>% - select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>% - mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) - - -#START CLOCK -ptm <- proc.time() -print(ptm) - -# Index each transcript feature, incrementing when an intron is passed -## metafusion expects exon count 0 to (N(exons)-1) -## Forward strand: Exon 0 == Exon 1 -### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT - -print(dim(gtf_df)) -print(length(unique(gtf_df$transcript_id))) - -modify_transcript <- function(transcript){ - - # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another - if ("CDS" %in% transcript$type){ - transcript <- transcript[!transcript$type == "exon",] - } - # Order features by increasing bp - transcript <- transcript[order(transcript$start, decreasing = FALSE),] - # Index features - idx <- 0 - for (i in 1:nrow(transcript)){ - transcript$idx[i]<- idx - if (transcript$type[i] == "intron"){ - idx <- idx + 1 - } - } - # REFORMAT TRANSCRIPT - #Change strand info (+ --> f, - --> r) - if (unique(transcript$strand) == "+"){ - transcript$strand <- 'f' - } else if (unique(transcript$strand) == "-"){ - transcript$strand <- 'r' - } else { - 
errorCondition("Strand info for this transcript is inconsistent") - } - #Add "chr" prefix to chromosomes - transcript$chr <- sapply("chr", paste0, transcript$chr) - #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later - transcript <- transcript %>% mutate(type = as.character(type)) - transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) - transcript$type[transcript$type == "five_prime_utr"] <- "utr5" - transcript$type[transcript$type == "three_prime_utr"] <- "utr3" - #### Any exon that remains after the cds change, is likely and untranslated region. change below - # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5) - #Forward strand - transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" - #Reverse strand - transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" - expected_types <- c("cds","intron","utr3","utr5") - transcript <- transcript[transcript$type %in% c(expected_types),] - return(transcript) -} - -if(file.exists(file.to_write) ) {file.remove(file.to_write)} - -gtf_df_modified <- gtf_df %>% - group_by(transcript_id,.drop = FALSE) %>% - group_modify(~ modify_transcript(.x)) %>% - select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% - arrange(chr,start,end) - -time <- proc.time() - ptm -print(time) - -write.table( - gtf_df_modified, - file.to_write, - sep="\t", - quote=F, - row.names=F, - col.names=F -) diff --git a/bin/final_generate_v75_gene_bed.R b/bin/generate_gene_bed.R similarity index 88% rename from bin/final_generate_v75_gene_bed.R rename to bin/generate_gene_bed.R index a25b3ef..2a15149 100755 --- a/bin/final_generate_v75_gene_bed.R +++ b/bin/generate_gene_bed.R @@ -17,7 +17,7 @@ suppressPackageStartupMessages({ usage <- function() { message("Usage:") - message("final_generate_v75_gene_bed.R ") + message("generate_gene_bed.R ") } args = 
commandArgs(TRUE) @@ -27,13 +27,6 @@ if (length(args)!=2) { quit() } -# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75 -# Add introns to gtf, convert to gff3 -# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\ -# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\ -# /bin/bash -c "agat_sp_add_introns.pl -g /juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\ -# -o genes.INTRONS.gff3" - gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) #remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) @@ -47,7 +40,7 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + filter(type %in% c("exon","intron","UTR","CDS","cds","utr","five_prime_utr","three_prime_utr")) %>% mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) @@ -110,6 +103,8 @@ modify_transcript <- function(transcript){ transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5" } } + transcript$type[transcript$type == "five_prime_utr"] <- "utr5" + transcript$type[transcript$type == "three_prime_utr"] <- "utr3" #### Any exon that remains after teh cds change, is likely and untranslated region. change below # Basically, subfeatures which are "exon" need to be changed (i.e. 
exon --> utr3/utr5) diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf index 9cd2769..1fb97b5 100644 --- a/modules/local/metafusion/genebed/main.nf +++ b/modules/local/metafusion/genebed/main.nf @@ -20,55 +20,30 @@ process METAFUSION_GENEBED { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if( prefix == 'GRCh37' ) - """ - final_generate_v75_gene_bed.R \\ - $gff \\ - ${prefix}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.2 - END_VERSIONS - """ - - else if( prefix == 'GRCh38' ) - """ - final_generate_v111_gene_bed.R \\ - $gff \\ - ${prefix}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v111_gene_bed.R: 0.0.1 - END_VERSIONS - """ + """ + generate_gene_bed.R \\ + $gff \\ + ${prefix}.metafusion.gene.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + generate_gene_bed.R: 0.0.2 + END_VERSIONS + """ stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if( prefix == 'GRCh37' ) - """ - touch ${prefix}.metafusion.gene.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.2 - END_VERSIONS - """ - else if( prefix == 'GRCh38' ) - """ - touch ${prefix}.metafusion.gene.bed + """ + touch ${prefix}.metafusion.gene.bed - cat <<-END_VERSIONS > versions.yml - "${task.process}": - R: \$(R --version | head -n1) - final_generate_v111_gene_bed.R: 0.0.1 - END_VERSIONS - """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: \$(R --version | head -n1) + generate_gene_bed.R: 0.0.2 + END_VERSIONS + """ } diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R 
b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R deleted file mode 100755 index d114ccb..0000000 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v111_gene_bed.R +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/local/bin/Rscript - -# __author__ = "Alexandria Dymun" -# __email__ = "pintoa1@mskcc.org" -# __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" -# __status__ = "Dev" - - -suppressPackageStartupMessages({ - library(plyr) - library(dplyr) - library(data.table) - library(stringr) - options(scipen = 999) -}) - -usage <- function() { - message("Usage:") - message("final_generate_v111_gene_bed.R ") -} - -args = commandArgs(TRUE) - -if (length(args)!=2) { - usage() - quit() -} - -gtf <- rtracklayer::import(args[1]) -gtf_df <- as.data.frame(gtf) -#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) -gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] - -file.to_write <- args[2] - -### convert start to 0-based to match metafusion expectations of gff format -gtf_df <- gtf_df %>% - rename( - chr = seqnames - ) %>% - select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","five_prime_utr","three_prime_utr","CDS")) %>% - mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) - - -#START CLOCK -ptm <- proc.time() -print(ptm) - -# Index each transcript feature, incrementing when an intron is passed -## metafusion expects exon count 0 to (N(exons)-1) -## Forward strand: Exon 0 == Exon 1 -### Reverse strand: Exon 0 == LAST EXON IN TRANSCRIPT - -print(dim(gtf_df)) -print(length(unique(gtf_df$transcript_id))) - -modify_transcript <- function(transcript){ - - # Remove exons if coding gene, since "exon" and "CDS" are duplicates of one another - if ("CDS" %in% transcript$type){ - transcript <- transcript[!transcript$type == "exon",] - } - # Order features by increasing bp - transcript <- 
transcript[order(transcript$start, decreasing = FALSE),] - # Index features - idx <- 0 - for (i in 1:nrow(transcript)){ - transcript$idx[i]<- idx - if (transcript$type[i] == "intron"){ - idx <- idx + 1 - } - } - # REFORMAT TRANSCRIPT - #Change strand info (+ --> f, - --> r) - if (unique(transcript$strand) == "+"){ - transcript$strand <- 'f' - } else if (unique(transcript$strand) == "-"){ - transcript$strand <- 'r' - } else { - errorCondition("Strand info for this transcript is inconsistent") - } - #Add "chr" prefix to chromosomes - transcript$chr <- sapply("chr", paste0, transcript$chr) - #Change CDS --> cds ### IF A TRANSCRIPT LACKS "CDS" THIS LINE WILL DO NOTHING, Changing exon values to UTRs later - transcript <- transcript %>% mutate(type = as.character(type)) - transcript <- transcript %>% mutate(type=ifelse(type == "CDS","cds",type)) - transcript$type[transcript$type == "five_prime_utr"] <- "utr5" - transcript$type[transcript$type == "three_prime_utr"] <- "utr3" - #### Any exon that remains after the cds change, is likely and untranslated region. change below - # Basically, subfeatures which are "exon" need to be changed (i.e. 
exon --> utr3/utr5) - #Forward strand - transcript$type[transcript$strand == "f" & transcript$type == "exon" ] <- "utr5" - #Reverse strand - transcript$type[transcript$strand == "r" & transcript$type == "exon"]<- "utr3" - expected_types <- c("cds","intron","utr3","utr5") - transcript <- transcript[transcript$type %in% c(expected_types),] - return(transcript) -} - -if(file.exists(file.to_write) ) {file.remove(file.to_write)} - -gtf_df_modified <- gtf_df %>% - group_by(transcript_id,.drop = FALSE) %>% - group_modify(~ modify_transcript(.x)) %>% - select(c(chr, start, end, transcript_id, type, idx, strand, gene_name, gene_id )) %>% - arrange(chr,start,end) - -time <- proc.time() - ptm -print(time) - -write.table( - gtf_df_modified, - file.to_write, - sep="\t", - quote=F, - row.names=F, - col.names=F -) diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/generate_gene_bed.R similarity index 88% rename from modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R rename to modules/local/metafusion/genebed/resources/usr/bin/generate_gene_bed.R index 1fb3d76..2a15149 100755 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R +++ b/modules/local/metafusion/genebed/resources/usr/bin/generate_gene_bed.R @@ -17,7 +17,7 @@ suppressPackageStartupMessages({ usage <- function() { message("Usage:") - message("final_generate_v75_gene_bed.R ") + message("generate_gene_bed.R ") } args = commandArgs(TRUE) @@ -27,13 +27,6 @@ if (length(args)!=2) { quit() } -# Utilized gtf from igenomes for FORTE This corresponds to GRCh37 ensembl 75 -# Add introns to gtf, convert to gff3 -# bsub -R "rusage[mem=64]" -o add_introns_agat_%J.out singularity exec -B /juno/ \\ -# -B /tmp -B /scratch/ docker://quay.io/biocontainers/agat:0.8.0--pl5262hdfd78af_0 \\ -# /bin/bash -c "agat_sp_add_introns.pl -g 
/juno/work/taylorlab/cmopipeline/mskcc-igenomes/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf\\ -# -o genes.INTRONS.gff3" - gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) #remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) @@ -47,11 +40,10 @@ gtf_df <- gtf_df %>% chr = seqnames ) %>% select(c(chr, start, end, transcript_id, type, strand, gene_name, gene_id)) %>% - filter(type %in% c("exon","intron","UTR","CDS","cds","utr")) %>% + filter(type %in% c("exon","intron","UTR","CDS","cds","utr","five_prime_utr","three_prime_utr")) %>% mutate(gene_name = ifelse(is.na(gene_name),gene_id,gene_name)) %>% mutate(start = start-1) - #START CLOCK ptm <- proc.time() print(ptm) @@ -111,6 +103,8 @@ modify_transcript <- function(transcript){ transcript$type[transcript$start >= stop_coding & transcript$type == "UTR"] <- "utr5" } } + transcript$type[transcript$type == "five_prime_utr"] <- "utr5" + transcript$type[transcript$type == "three_prime_utr"] <- "utr3" #### Any exon that remains after teh cds change, is likely and untranslated region. change below # Basically, subfeatures which are "exon" need to be changed (i.e. exon --> utr3/utr5)