From 45c8f77110c596a711ba320c2e17a869591b6382 Mon Sep 17 00:00:00 2001 From: luisas Date: Wed, 29 May 2024 12:45:47 +0200 Subject: [PATCH 01/16] Fix linting warnings --- .nf-core.yml | 2 + modules.json | 110 +++++++++++++----- modules/local/multiqc.nf | 2 +- modules/nf-core/csvtk/concat/environment.yml | 6 +- modules/nf-core/csvtk/concat/main.nf | 16 ++- .../nf-core/csvtk/concat/tests/main.nf.test | 67 +++++++++++ .../csvtk/concat/tests/main.nf.test.snap | 60 ++++++++++ modules/nf-core/csvtk/concat/tests/tags.yml | 2 + modules/nf-core/csvtk/join/environment.yml | 2 +- modules/nf-core/csvtk/join/main.nf | 5 +- modules/nf-core/csvtk/join/tests/main.nf.test | 64 ++++++++++ .../csvtk/join/tests/main.nf.test.snap | 60 ++++++++++ .../nf-core/csvtk/join/tests/nextflow.config | 5 + modules/nf-core/csvtk/join/tests/tags.yml | 2 + .../mtmalign/align/tests/main.nf.test.snap | 20 +++- modules/nf-core/tcoffee/align/main.nf | 2 +- .../nf-core/tcoffee/align/tests/lib.config | 3 + 17 files changed, 386 insertions(+), 42 deletions(-) create mode 100644 modules/nf-core/csvtk/concat/tests/main.nf.test create mode 100644 modules/nf-core/csvtk/concat/tests/main.nf.test.snap create mode 100644 modules/nf-core/csvtk/concat/tests/tags.yml create mode 100644 modules/nf-core/csvtk/join/tests/main.nf.test create mode 100644 modules/nf-core/csvtk/join/tests/main.nf.test.snap create mode 100644 modules/nf-core/csvtk/join/tests/nextflow.config create mode 100644 modules/nf-core/csvtk/join/tests/tags.yml create mode 100644 modules/nf-core/tcoffee/align/tests/lib.config diff --git a/.nf-core.yml b/.nf-core.yml index cb2323da..9491102a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,3 +2,5 @@ repository_type: pipeline nf_core_version: "2.14.1" lint: multiqc_config: False + files_exist: + conf/igenomes.config \ No newline at end of file diff --git a/modules.json b/modules.json index 4536422e..4e5b91a4 100644 --- a/modules.json +++ b/modules.json @@ -8,112 +8,156 @@ "clustalo/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "clustalo/guidetree": { "branch": "master", "git_sha": "1f253ec05723293df7757af8769f8389b7a1884e", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "csvtk/concat": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "git_sha": "cfe2a24902bfdfe8132f11461ffda92d257f9f09", + "installed_by": [ + "modules" + ] }, "csvtk/join": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20", + "installed_by": [ + "modules" + ] }, "famsa/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "famsa/guidetree": { "branch": "master", "git_sha": "46789a4621be261f10dab0033f46f34779a5afc9", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "kalign/align": { "branch": "master", "git_sha": "7afd02d048ad0100be37fa1741816265c4aa307c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "learnmsa/align": { "branch": "master", "git_sha": "62007703c84bcfef92ce9e4a57cb1cc382917201", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, 
"mafft": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "magus/align": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "magus/guidetree": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "mtmalign/align": { "branch": "master", - "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "muscle5/super5": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pigz/uncompress": { "branch": "master", "git_sha": "d7f0de8aae7bf84b080dfdcf4e294bf11a46a51c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/align": { "branch": "master", - "git_sha": "c65917d37cdaa9d6c26fccf8f7c313aab1a51d8a", - "installed_by": ["modules"] + "git_sha": "5c82ca0a942f2793859bb2f25601eb69c50590dc", + "installed_by": [ + "modules" + ] }, "tcoffee/alncompare": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/irmsd": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/seqreformat": { "branch": "master", "git_sha": "b04c647f465bea2c5bb9871503182236cd65b246", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/tcs": { "branch": "master", "git_sha": "2d5ea4959c36da8c21e74a0c5e8ecc6b101b999e", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "untar": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -122,20 +166,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 4a069c5c..0b68d71e 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -2,7 +2,7 @@ process MULTIQC { label 'process_medium' conda 'bioconda::multiqc=1.22.1' - container "community.wave.seqera.io/library/pip_multiqc:2c2e276ad8997cc4" + container "community.wave.seqera.io/library/multiqc:1.22.1--4886de6095538010" input: path multiqc_config diff --git a/modules/nf-core/csvtk/concat/environment.yml b/modules/nf-core/csvtk/concat/environment.yml index ed1ba26b..ac58390c 100644 --- a/modules/nf-core/csvtk/concat/environment.yml +++ b/modules/nf-core/csvtk/concat/environment.yml @@ -1,7 +1,9 @@ -name: csvtk_concat 
+--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "csvtk_concat" channels: - conda-forge - bioconda - defaults dependencies: - - bioconda::csvtk=0.23.0 + - "bioconda::csvtk=0.30.0" diff --git a/modules/nf-core/csvtk/concat/main.nf b/modules/nf-core/csvtk/concat/main.nf index 16e59f64..741ed551 100644 --- a/modules/nf-core/csvtk/concat/main.nf +++ b/modules/nf-core/csvtk/concat/main.nf @@ -4,8 +4,8 @@ process CSVTK_CONCAT { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : - 'biocontainers/csvtk:0.23.0--h9ee0642_0' }" + 'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0' : + 'biocontainers/csvtk:0.30.0--h9ee0642_0' }" input: tuple val(meta), path(csv) @@ -40,4 +40,16 @@ process CSVTK_CONCAT { csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) END_VERSIONS """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + touch ${prefix}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ } diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test b/modules/nf-core/csvtk/concat/tests/main.nf.test new file mode 100644 index 00000000..13f20147 --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test @@ -0,0 +1,67 @@ +// nf-core modules test csvtk/concat +nextflow_process { + + name "Test Process CSVTK_CONCAT" + script "../main.nf" + process "CSVTK_CONCAT" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/concat" + + test("tsv - concat - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("tsv - concat - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test.snap b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap new file mode 100644 index 00000000..777114ba --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "tsv - concat - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,c04e6be6df50305cd689a92aacec947b" + ], + "csv": [ + [ + { + "id": "test" + }, + 
"test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c04e6be6df50305cd689a92aacec947b" + ] + } + ], + "timestamp": "2024-05-17T12:43:26.787254" + }, + "tsv - concat - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,917fe5d857f04b58e0f49c384d167cec" + ] + ], + "1": [ + "versions.yml:md5,c04e6be6df50305cd689a92aacec947b" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,917fe5d857f04b58e0f49c384d167cec" + ] + ], + "versions": [ + "versions.yml:md5,c04e6be6df50305cd689a92aacec947b" + ] + } + ], + "timestamp": "2024-05-17T12:43:17.930902" + } +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/concat/tests/tags.yml b/modules/nf-core/csvtk/concat/tests/tags.yml new file mode 100644 index 00000000..0d10e7c9 --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/tags.yml @@ -0,0 +1,2 @@ +csvtk/concat: + - "modules/nf-core/csvtk/concat/**" diff --git a/modules/nf-core/csvtk/join/environment.yml b/modules/nf-core/csvtk/join/environment.yml index b488c861..5b6c6468 100644 --- a/modules/nf-core/csvtk/join/environment.yml +++ b/modules/nf-core/csvtk/join/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::csvtk=0.26.0 + - bioconda::csvtk=0.30.0 diff --git a/modules/nf-core/csvtk/join/main.nf b/modules/nf-core/csvtk/join/main.nf index bf02e7f5..5f3afeea 100644 --- a/modules/nf-core/csvtk/join/main.nf +++ b/modules/nf-core/csvtk/join/main.nf @@ -4,8 +4,8 @@ process CSVTK_JOIN { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/csvtk:0.26.0--h9ee0642_0': - 'biocontainers/csvtk:0.26.0--h9ee0642_0' }" + 'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0': + 'biocontainers/csvtk:0.30.0--h9ee0642_0' }" input: tuple val(meta), path(csv) @@ -36,7 +36,6 @@ process CSVTK_JOIN { """ stub: - def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? 
"tsv" : "csv" """ diff --git a/modules/nf-core/csvtk/join/tests/main.nf.test b/modules/nf-core/csvtk/join/tests/main.nf.test new file mode 100644 index 00000000..3cf178c4 --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/main.nf.test @@ -0,0 +1,64 @@ +nextflow_process { + + name "Test Process CSVTK_JOIN" + script "../main.nf" + process "CSVTK_JOIN" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/join" + + test("join - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("join - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/csvtk/join/tests/main.nf.test.snap b/modules/nf-core/csvtk/join/tests/main.nf.test.snap new file mode 100644 index 00000000..b124788b --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "join - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d0ad82ca096c7e05eb9f9a04194c9e30" + ] + ], + "1": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d0ad82ca096c7e05eb9f9a04194c9e30" + ] + ], + "versions": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ] + } + ], + "timestamp": "2024-05-21T15:45:44.045434" + }, + "join - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ] + } + ], + "timestamp": "2024-05-21T15:45:55.59201" + } +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/join/tests/nextflow.config b/modules/nf-core/csvtk/join/tests/nextflow.config new file mode 100644 index 00000000..1b14393a --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CSVTK_JOIN { + ext.args = "--fields 'ID;ID' -p -e -d \"\t\" -D \",\"" + } +} diff --git a/modules/nf-core/csvtk/join/tests/tags.yml b/modules/nf-core/csvtk/join/tests/tags.yml new file mode 100644 index 00000000..6c3a0fa6 --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/tags.yml @@ -0,0 +1,2 @@ +csvtk/join: + - "modules/nf-core/csvtk/join/**" diff --git a/modules/nf-core/mtmalign/align/tests/main.nf.test.snap b/modules/nf-core/mtmalign/align/tests/main.nf.test.snap index eb321457..28a66506 100644 --- a/modules/nf-core/mtmalign/align/tests/main.nf.test.snap +++ b/modules/nf-core/mtmalign/align/tests/main.nf.test.snap @@ -1,10 +1,26 @@ { - "versions": { + "versions0": { "content": [ [ "versions.yml:md5,7cbacec15bb9e0c8cbb27610bde74c10" ] ], - "timestamp": "2024-01-25T18:21:22.385207003" 
+ "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-28T19:04:20.621422" + }, + "versions1": { + "content": [ + [ + "versions.yml:md5,7cbacec15bb9e0c8cbb27610bde74c10" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-28T19:04:47.062692" } } \ No newline at end of file diff --git a/modules/nf-core/tcoffee/align/main.nf b/modules/nf-core/tcoffee/align/main.nf index e159bb80..a7aa106c 100644 --- a/modules/nf-core/tcoffee/align/main.nf +++ b/modules/nf-core/tcoffee/align/main.nf @@ -16,7 +16,7 @@ process TCOFFEE_ALIGN { output: tuple val(meta), path("*.aln{.gz,}"), emit: alignment // in the args there might be the request to generate a lib file, so the following is an optional output - tuple val(meta), path("*.*lib") , emit: lib, optional : true + tuple val(meta), path("*.*lib") , emit: lib, optional : true path "versions.yml" , emit: versions when: diff --git a/modules/nf-core/tcoffee/align/tests/lib.config b/modules/nf-core/tcoffee/align/tests/lib.config new file mode 100644 index 00000000..2fc113ef --- /dev/null +++ b/modules/nf-core/tcoffee/align/tests/lib.config @@ -0,0 +1,3 @@ +process { + ext.args = { "-output fasta_aln -out_lib=sample_lib1.tc_lib" } +} \ No newline at end of file From 1db22b5bb2fd0331e1d9f684d6912d0c45150012 Mon Sep 17 00:00:00 2001 From: luisas Date: Tue, 4 Jun 2024 16:28:21 +0200 Subject: [PATCH 02/16] Improve parameter handling --- assets/schema_input.json | 3 +- assets/toolsheet.csv | 4 ++ conf/test.config | 3 +- modules.json | 5 +- modules/local/multiqc.nf | 2 +- modules/nf-core/csvtk/join/csvtk-join.diff | 24 +++++++ modules/nf-core/csvtk/join/main.nf | 7 +- modules/nf-core/mtmalign/align/main.nf | 6 +- .../nf-core/mtmalign/align/tests/main.nf.test | 21 +++--- .../mtmalign/align/tests/main.nf.test.snap | 8 +-- nextflow.config | 11 +-- nextflow_schema.json | 8 ++- subworkflows/local/align.nf | 67 ++++++++++++------- subworkflows/local/evaluate.nf | 2 +- subworkflows/local/stats.nf | 2 +- .../main.nf | 23 +++++-- workflows/multiplesequencealign.nf | 50 +++++++------- 17 files changed, 159 insertions(+), 87 deletions(-) create mode 100644 assets/toolsheet.csv create mode 100644 modules/nf-core/csvtk/join/csvtk-join.diff diff --git a/assets/schema_input.json b/assets/schema_input.json index f5f85025..1678b9d9 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -28,6 +28,7 @@ "type": "string" } }, - "required": ["id", "fasta"] + "required": ["id"], + "anyOf": [{ "required": ["fasta"] }, { "required": ["structures"] }] } } diff --git a/assets/toolsheet.csv b/assets/toolsheet.csv new file mode 100644 index 00000000..90d8ddb5 --- /dev/null +++ b/assets/toolsheet.csv @@ -0,0 +1,4 @@ +tree,args_tree,aligner,args_aligner +FAMSA,-gt upgma -parttree,FAMSA, +,,TCOFFEE,-output fasta_aln +,,MTMALIGN,, \ No newline at end of file diff --git a/conf/test.config b/conf/test.config index 544767dd..8c297066 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,8 @@ params { skip_multiqc = false // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' - tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet.csv' + //tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet.csv' + tools = "./assets/toolsheet.csv" // Output directory outdir = "./outdir/" diff --git a/modules.json b/modules.json index 4e5b91a4..fd9ac491 100644 --- 
a/modules.json
+++ b/modules.json
@@ -31,7 +31,8 @@
             "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20",
             "installed_by": [
                 "modules"
-            ]
+            ],
+            "patch": "modules/nf-core/csvtk/join/csvtk-join.diff"
         },
         "famsa/align": {
             "branch": "master",
@@ -91,7 +92,7 @@
         },
         "mtmalign/align": {
             "branch": "master",
-            "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa",
+            "git_sha": "e2ea4d9fd6f326d51e468edc4b9cb1d6d4082be5",
             "installed_by": [
                 "modules"
             ]
diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf
index 0b68d71e..ee54c43e 100644
--- a/modules/local/multiqc.nf
+++ b/modules/local/multiqc.nf
@@ -1,7 +1,7 @@
 process MULTIQC {
     label 'process_medium'
 
-    conda 'bioconda::multiqc=1.22.1'
+    conda 'bioconda::multiqc=1.22.2'
     container "community.wave.seqera.io/library/multiqc:1.22.1--4886de6095538010"
 
     input:
diff --git a/modules/nf-core/csvtk/join/csvtk-join.diff b/modules/nf-core/csvtk/join/csvtk-join.diff
new file mode 100644
index 00000000..fded83ab
--- /dev/null
+++ b/modules/nf-core/csvtk/join/csvtk-join.diff
@@ -0,0 +1,24 @@
+Changes in module 'nf-core/csvtk/join'
+--- modules/nf-core/csvtk/join/main.nf
++++ modules/nf-core/csvtk/join/main.nf
+@@ -22,12 +22,17 @@
+     prefix   = task.ext.prefix ?: "${meta.id}"
+     out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? "tsv" : "csv"
+     """
++
++    # if the input has fewer than 2 files, add an empty file as the second input
++    touch empty.csv
++
+     csvtk \\
+         join \\
+         $args \\
+         --num-cpus $task.cpus \\
+         --out-file ${prefix}.${out_extension} \\
+-        $csv
++        $csv \\
++        empty.csv
+
+     cat <<-END_VERSIONS > versions.yml
+     "${task.process}":
+
+************************************************************
diff --git a/modules/nf-core/csvtk/join/main.nf b/modules/nf-core/csvtk/join/main.nf
index 5f3afeea..47acc69b 100644
--- a/modules/nf-core/csvtk/join/main.nf
+++ b/modules/nf-core/csvtk/join/main.nf
@@ -22,12 +22,17 @@ process CSVTK_JOIN {
    prefix   = task.ext.prefix ?: "${meta.id}"
    out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? "tsv" : "csv"
    """
+
+    # if the input has fewer than 2 files, add an empty file as the second input
+    touch empty.csv
+
    csvtk \\
        join \\
        $args \\
        --num-cpus $task.cpus \\
        --out-file ${prefix}.${out_extension} \\
-        $csv
+        $csv \\
+        empty.csv
 
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
diff --git a/modules/nf-core/mtmalign/align/main.nf b/modules/nf-core/mtmalign/align/main.nf
index c6ad07b7..d8fa72c3 100644
--- a/modules/nf-core/mtmalign/align/main.nf
+++ b/modules/nf-core/mtmalign/align/main.nf
@@ -10,7 +10,7 @@ process MTMALIGN_ALIGN {
        'biocontainers/mulled-v2-5bcf71dc66dac33d8e003c5e78043b80f5c7f269:8f0e486d46f7ab38892c1a8f78d2894a549d03b5-0' }"
 
    input:
-    tuple val(meta), path('*.pdb', arity: '2..*')
+    tuple val(meta), path(pdbs)
    val(compress)
 
    output:
@@ -38,12 +38,13 @@ process MTMALIGN_ALIGN {
    mtm-align -i input_list.txt -o ${prefix}.pdb
    # -o does not affect the fasta naming, so move it to the new name
    mv ./mTM_result/result.fasta ./mTM_result/${prefix}.aln
+    # Remove ".pdb" from the ids in the alignment file
+    sed -i 's/\\.pdb//g' ./mTM_result/${prefix}.aln
 
    # compress both output files
    if ${compress}; then
        pigz -p ${task.cpus} ./mTM_result/${prefix}.aln ./mTM_result/${prefix}.pdb
    fi
-    tree
 
    # mtm-align -v prints the wrong version 20180725, so extract it from the cosmetic output in the help message
    cat <<-END_VERSIONS > versions.yml
@@ -54,7 +55,6 @@ process MTMALIGN_ALIGN {
    """
 
    stub:
-    def args = task.ext.args ?: ''
    prefix = task.ext.prefix ?: "${meta.id}"
    """
    mkdir mTM_result
diff --git a/modules/nf-core/mtmalign/align/tests/main.nf.test b/modules/nf-core/mtmalign/align/tests/main.nf.test
index cb3f3885..ada32c39 100644
--- a/modules/nf-core/mtmalign/align/tests/main.nf.test
+++ b/modules/nf-core/mtmalign/align/tests/main.nf.test
@@ -39,11 +39,11 @@ nextflow_process {
                { assert process.success },
                // mTMalign may be nondeterministic, just check if the pdbs are all in there
                //{ assert snapshot(process.out).match() }
-                { assert path(process.out.alignment[0][1]).getText().contains(">1.pdb") },
-                { assert path(process.out.alignment[0][1]).getText().contains(">2.pdb") },
-                { assert path(process.out.alignment[0][1]).getText().contains(">3.pdb") },
-                { assert path(process.out.alignment[0][1]).getText().contains(">4.pdb") },
-                { assert path(process.out.alignment[0][1]).getText().contains(">5.pdb") },
+                { assert path(process.out.alignment[0][1]).getText().contains(">1ahl") },
+                { assert path(process.out.alignment[0][1]).getText().contains(">1apf") },
+                { assert path(process.out.alignment[0][1]).getText().contains(">1atx") },
+                { assert path(process.out.alignment[0][1]).getText().contains(">1bds") },
+                { assert path(process.out.alignment[0][1]).getText().contains(">1sh1") },
                { assert snapshot(process.out.versions).match("versions0") }
            )
        }
@@ -78,12 +78,11 @@ nextflow_process {
            assertAll(
                { assert process.success },
                // mTMalign may be nondeterministic, just check if the pdbs are all in there
-                //{ assert snapshot(process.out).match() }
-                { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1.pdb") },
-                { assert path(process.out.alignment[0][1]).getTextGzip().contains(">2.pdb") },
-                { assert path(process.out.alignment[0][1]).getTextGzip().contains(">3.pdb") },
-                { assert path(process.out.alignment[0][1]).getTextGzip().contains(">4.pdb") },
-                { assert path(process.out.alignment[0][1]).getTextGzip().contains(">5.pdb") },
+                { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1ahl") },
+                { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1apf") },
+                {
assert path(process.out.alignment[0][1]).getTextGzip().contains(">1atx") }, + { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1bds") }, + { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1sh1") }, { assert snapshot(process.out.versions).match("versions1") } ) } diff --git a/modules/nf-core/mtmalign/align/tests/main.nf.test.snap b/modules/nf-core/mtmalign/align/tests/main.nf.test.snap index 28a66506..0eefb191 100644 --- a/modules/nf-core/mtmalign/align/tests/main.nf.test.snap +++ b/modules/nf-core/mtmalign/align/tests/main.nf.test.snap @@ -7,9 +7,9 @@ ], "meta": { "nf-test": "0.8.4", - "nextflow": "24.01.0" + "nextflow": "24.04.2" }, - "timestamp": "2024-02-28T19:04:20.621422" + "timestamp": "2024-06-03T11:01:13.729263689" }, "versions1": { "content": [ @@ -19,8 +19,8 @@ ], "meta": { "nf-test": "0.8.4", - "nextflow": "24.01.0" + "nextflow": "24.04.2" }, - "timestamp": "2024-02-28T19:04:47.062692" + "timestamp": "2024-06-03T11:01:37.28539854" } } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 09f6f230..ed896ae5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,14 +14,14 @@ plugins { params { // Input options - input = null - tools = null + input = null + tools = null // Stats skip_stats = false calc_sim = false calc_seq_stats = true - extract_plddt = true + extract_plddt = false // Evaluation skip_eval = false @@ -29,9 +29,10 @@ params { calc_tc = false calc_irmsd = false calc_gaps = true - calc_tcs = true + calc_tcs = false - no_compression = false + skip_compression = false + compress_after_eval = false // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 80ecdea8..4a45b03b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -151,12 +151,16 @@ "compression": { "title": "Compression", "type": "object", - "description": "Define wether to run compression", + "description": "Define wether and how to run compression", "default": "", "properties": { - "no_compression": { + "skip_compression": { "type": "boolean", "description": "Produce uncompressed alignment files" + }, + "compress_after_eval": { + "type": "boolean", + "description": "Defines that the compression should be done after the evaluation" } } }, diff --git a/subworkflows/local/align.nf b/subworkflows/local/align.nf index f0c7f616..76c273bf 100644 --- a/subworkflows/local/align.nf +++ b/subworkflows/local/align.nf @@ -21,16 +21,24 @@ include { MTMALIGN_ALIGN } from '../../modules/nf-core/mtma workflow ALIGN { take: ch_fastas // channel: meta, /path/to/file.fasta - ch_tools // string: + ch_tools // channel: meta_tree, meta_aligner + // [[tree:, args_tree:, args_tree_clean: ], [aligner:, args_aligner:, args_aligner_clean:]] + // e.g.[[tree:FAMSA, args_tree:-gt upgma -parttree, args_tree_clean:-gt_upgma_-parttree], [aligner:FAMSA, args_aligner:null, args_aligner_clean:null]] + // e.g.[[tree:null, args_tree:null, args_tree_clean:null], [aligner:TCOFFEE, args_aligner:-output fasta_aln, args_aligner_clean:-output_fasta_aln]] ch_structures // channel: meta, [/path/to/file.pdb,/path/to/file.pdb,/path/to/file.pdb] + compress + // tree + // args_tree + // args_tree_clean + // aligner + // args_aligner + // args_aligner_clean main: msa = Channel.empty() ch_versions = Channel.empty() - compress = ! 
params.no_compression - // Branch the toolsheet information into two channels // This way, it can direct the computation of guidetrees // and aligners separately @@ -77,16 +85,28 @@ workflow ALIGN { magus: it[0]["aligner"] == "MAGUS" muscle5: it[0]["aligner"] == "MUSCLE5" mtmalign: it[0]["aligner"] == "MTMALIGN" + regressive: it[0]["aligner"] == "REGRESSIVE" tcoffee: it[0]["aligner"] == "TCOFFEE" tcoffee3d: it[0]["aligner"] == "3DCOFFEE" - regressive: it[0]["aligner"] == "REGRESSIVE" } .set { ch_fasta_trees } + ch_structures.combine(ch_tools) + .map { + metastruct, template, struct, metatree, metaalign -> + [ metastruct+metatree+metaalign, template, struct ] + } + .branch{ + mtmalign: it[0]["aligner"] == "MTMALIGN" + } + .set { ch_structures_tools } + // ------------------------------------------------ // Compute the alignments // ------------------------------------------------ + // 1. SEQUENCE BASED + // ----------------- CLUSTALO ------------------ ch_fasta_trees_clustalo = ch_fasta_trees.clustalo .multiMap{ @@ -172,6 +192,19 @@ workflow ALIGN { ch_versions = ch_versions.mix(TCOFFEE_ALIGN.out.versions.first()) msa = msa.mix(TCOFFEE_ALIGN.out.alignment) + // ----------------- REGRESSIVE ------------------ + ch_fasta_trees_regressive = ch_fasta_trees.regressive + .multiMap{ + meta, fastafile, treefile -> + fasta: [ meta, fastafile ] + tree: [ meta, treefile ] + } + REGRESSIVE_ALIGN(ch_fasta_trees_regressive.fasta, ch_fasta_trees_regressive.tree, [[:],[], []], compress) + ch_versions = ch_versions.mix(REGRESSIVE_ALIGN.out.versions.first()) + msa = msa.mix(REGRESSIVE_ALIGN.out.alignment) + + // 2. SEQUENCE + STRUCTURE BASED + // ----------------- 3DCOFFEE ------------------ ch_fasta_trees_3dcoffee = ch_fasta_trees.tcoffee3d.map{ meta, fasta, tree -> [meta["id"], meta, fasta, tree] } .combine(ch_structures.map{ meta, template, structures -> [meta["id"], template, structures]}, by: 0) @@ -185,29 +218,17 @@ workflow ALIGN { ch_versions = ch_versions.mix(TCOFFEE3D_ALIGN.out.versions.first()) msa = msa.mix(TCOFFEE3D_ALIGN.out.alignment) - // ----------------- REGRESSIVE ------------------ - ch_fasta_trees_regressive = ch_fasta_trees.regressive - .multiMap{ - meta, fastafile, treefile -> - fasta: [ meta, fastafile ] - tree: [ meta, treefile ] - } - REGRESSIVE_ALIGN(ch_fasta_trees_regressive.fasta, ch_fasta_trees_regressive.tree, [[:],[], []], compress) - ch_versions = ch_versions.mix(REGRESSIVE_ALIGN.out.versions.first()) - msa = msa.mix(REGRESSIVE_ALIGN.out.alignment) + // 3. 
STRUCTURE BASED // ----------------- MTMALIGN ------------------ - // this call discards the fasta, tree and template arguments, as MTMalign only takes pdb inputs - // nonetheless, this is required by the pipeline - ch_pdb_mtmalign = ch_fasta_trees.mtmalign.map{ meta, fasta, tree -> [meta["id"], meta] } - .combine(ch_structures.map{ meta, template, structures -> [meta["id"], structures]}, by: 0) - .multiMap{ - merging_id, meta, templatefile, structuresfiles -> - pdbs: [ meta, structuresfiles ] - } + ch_structures_tools.mtmalign + .multiMap{ + meta, template, struct -> + pdbs: [ meta, struct ] + }.set{ ch_pdb_mtmalign } - MTMALIGN_ALIGN(ch_pdb_mtmalign.pdbs, false) + MTMALIGN_ALIGN(ch_pdb_mtmalign.pdbs, compress) ch_versions = ch_versions.mix(MTMALIGN_ALIGN.out.versions.first()) msa = msa.mix(MTMALIGN_ALIGN.out.alignment) diff --git a/subworkflows/local/evaluate.nf b/subworkflows/local/evaluate.nf index 69eb8566..26d8ba8b 100644 --- a/subworkflows/local/evaluate.nf +++ b/subworkflows/local/evaluate.nf @@ -33,7 +33,7 @@ workflow EVALUATE { // ---------------------- // Decompress if required // ---------------------- - if( !params.no_compression ){ + if( !params.skip_compression ){ PIGZ_UNCOMPRESS(ch_msa) ch_msa = PIGZ_UNCOMPRESS.out.file ch_versions = ch_versions.mix(PIGZ_UNCOMPRESS.out.versions) diff --git a/subworkflows/local/stats.nf b/subworkflows/local/stats.nf index 44b88b3f..ed9c3a73 100644 --- a/subworkflows/local/stats.nf +++ b/subworkflows/local/stats.nf @@ -104,5 +104,5 @@ workflow STATS { emit: stats_summary - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions } diff --git a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf index c89ae745..ae893c73 100644 --- a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf @@ -76,7 +76,7 @@ workflow PIPELINE_INITIALISATION { // // Custom validation for pipeline parameters // - validateInputParameters() + //validateInputParameters() // // Create channel from input file provided through params.input @@ -164,10 +164,11 @@ workflow PIPELINE_COMPLETION { // // Check and validate pipeline parameters // -def validateInputParameters() { - statsParamsWarning() - evalParamsWarning() -} +// def validateInputParameters() { +// statsParamsWarning() +// evalParamsWarning() +// compressionParams() +// } // // Validate channels from input samplesheet @@ -225,6 +226,8 @@ def evalParamsWarning() { } if (!params.skip_eval && !params.calc_sp && !params.calc_tc && !params.calc_irmsd ){ params.skip_eval = true + print(params.skip_eval) + print("-----------------------------") def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " WARNING: The param skip_eval has been changed from false to true'.\n" + " None of the modules withing the stats subworkflow was activated. 
\n" + @@ -234,6 +237,16 @@ def evalParamsWarning() { } } +def compressionParams(){ + if(!params.skip_compression){ + if(!params.skip_eval){ + params.compress_after_eval = true + print("Hey") + print(params.compress_after_eval) + } + } +} + // // Generate methods description for MultiQC // diff --git a/workflows/multiplesequencealign.nf b/workflows/multiplesequencealign.nf index 995b6813..521f6a6d 100644 --- a/workflows/multiplesequencealign.nf +++ b/workflows/multiplesequencealign.nf @@ -26,6 +26,8 @@ ch_multiqc_table = Channel.empty() evaluation_summary = Channel.empty() stats_summary = Channel.empty() stats_and_evaluation_summary = Channel.empty() +ch_shiny_stats = Channel.empty() +shiny_app = Channel.fromPath(params.shiny_app) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -168,21 +170,26 @@ workflow MULTIPLESEQUENCEALIGN { // // Compute summary statistics about the input sequences // + if( !params.skip_stats ){ STATS(ch_seqs, ch_structures) ch_versions = ch_versions.mix(STATS.out.versions) stats_summary = stats_summary.mix(STATS.out.stats_summary) } + // // Align // - ALIGN(ch_seqs, ch_tools, ch_structures_template) + + compress_during_align = ! params.skip_compression && !params.compress_after_eval + ALIGN(ch_seqs, ch_tools, ch_structures_template, compress_during_align) ch_versions = ch_versions.mix(ALIGN.out.versions) // // Evaluate the quality of the alignment // + if( !params.skip_eval ){ EVALUATE(ALIGN.out.msa, ch_refs, ch_structures_template) ch_versions = ch_versions.mix(EVALUATE.out.versions) @@ -192,36 +199,27 @@ workflow MULTIPLESEQUENCEALIGN { // // Combine stats and evaluation reports into a single CSV // - stats_summary_csv = stats_summary.map{ meta, csv -> csv } - eval_summary_csv = evaluation_summary.map{ meta, csv -> csv } - eval_summary_csv - .mix(stats_summary_csv) - .collect() - .map { - csvs -> - [ [ id:"summary_stats_eval" ], csvs ] - } - .set { stats_and_evaluation } - - if( !params.skip_stats && !params.skip_eval ){ - def number_of_stats = [params.calc_sim, params.calc_seq_stats].count(true) - def number_of_evals = [params.calc_sp, params.calc_tc, params.calc_irmsd].count(true) - if (number_of_evals > 0 && number_of_stats > 0 ){ - MERGE_STATS_EVAL(stats_and_evaluation) - stats_and_evaluation_summary = MERGE_STATS_EVAL.out.csv - ch_versions = ch_versions.mix(MERGE_STATS_EVAL.out.versions) - } - }else{ - stats_and_evaluation_summary = stats_and_evaluation - } + if( !params.skip_stats || !params.skip_eval ){ + stats_summary_csv = stats_summary.map{ meta, csv -> csv } + eval_summary_csv = evaluation_summary.map{ meta, csv -> csv } + stats_summary_csv.mix(eval_summary_csv) + .collect() + .map { + csvs -> + [ [ id:"summary_stats_eval" ], csvs ] + } + .set { stats_and_evaluation } + MERGE_STATS_EVAL(stats_and_evaluation) + stats_and_evaluation_summary = MERGE_STATS_EVAL.out.csv + ch_versions = ch_versions.mix(MERGE_STATS_EVAL.out.versions) + } // // MODULE: Shiny // - ch_shiny_stats = Channel.empty() if( !params.skip_shiny){ - PREPARE_SHINY ( stats_and_evaluation_summary, file(params.shiny_app) ) + PREPARE_SHINY ( stats_and_evaluation_summary, shiny_app ) ch_versions = ch_versions.mix(PREPARE_SHINY.out.versions) ch_shiny_stats = PREPARE_SHINY.out.data.toList() } @@ -238,7 +236,7 @@ workflow MULTIPLESEQUENCEALIGN { // MODULE: MultiQC // multiqc_out = Channel.empty() - if (!params.skip_multiqc){ + if (!params.skip_multiqc && (!params.skip_stats || !params.skip_eval)){ ch_multiqc_config = 
Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) : Channel.empty() From 0bda7180d2f4f4d9590edbc3a4d7a95c284a169a Mon Sep 17 00:00:00 2001 From: luisas Date: Tue, 4 Jun 2024 17:12:15 +0200 Subject: [PATCH 03/16] compression handling fixe --- modules.json | 7 ++ modules/nf-core/pigz/compress/environment.yml | 9 +++ modules/nf-core/pigz/compress/main.nf | 45 ++++++++++++ modules/nf-core/pigz/compress/meta.yml | 47 +++++++++++++ .../nf-core/pigz/compress/tests/main.nf.test | 49 +++++++++++++ .../pigz/compress/tests/main.nf.test.snap | 37 ++++++++++ modules/nf-core/pigz/compress/tests/tags.yml | 2 + nextflow.config | 3 +- nextflow_schema.json | 4 -- subworkflows/local/evaluate.nf | 22 ++---- subworkflows/local/stats.nf | 14 ++-- .../main.nf | 68 ------------------- workflows/multiplesequencealign.nf | 11 ++- 13 files changed, 220 insertions(+), 98 deletions(-) create mode 100644 modules/nf-core/pigz/compress/environment.yml create mode 100644 modules/nf-core/pigz/compress/main.nf create mode 100644 modules/nf-core/pigz/compress/meta.yml create mode 100644 modules/nf-core/pigz/compress/tests/main.nf.test create mode 100644 modules/nf-core/pigz/compress/tests/main.nf.test.snap create mode 100644 modules/nf-core/pigz/compress/tests/tags.yml diff --git a/modules.json b/modules.json index fd9ac491..8cae7016 100644 --- a/modules.json +++ b/modules.json @@ -111,6 +111,13 @@ "modules" ] }, + "pigz/compress": { + "branch": "master", + "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", + "installed_by": [ + "modules" + ] + }, "pigz/uncompress": { "branch": "master", "git_sha": "d7f0de8aae7bf84b080dfdcf4e294bf11a46a51c", diff --git a/modules/nf-core/pigz/compress/environment.yml b/modules/nf-core/pigz/compress/environment.yml new file mode 100644 index 00000000..7551d187 --- /dev/null +++ b/modules/nf-core/pigz/compress/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "pigz_compress" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "pigz=2.8" diff --git a/modules/nf-core/pigz/compress/main.nf b/modules/nf-core/pigz/compress/main.nf new file mode 100644 index 00000000..152e7006 --- /dev/null +++ b/modules/nf-core/pigz/compress/main.nf @@ -0,0 +1,45 @@ +process PIGZ_COMPRESS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pigz:2.8': + 'biocontainers/pigz:2.8' }" + + input: + tuple val(meta), path(raw_file) + + output: + tuple val(meta), path("$archive"), emit: archive + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + archive = raw_file.toString() + ".gz" + """ + # Note: needs --stdout for pigz to avoid the following issue: + # pigz: skipping: ${raw_file} is a symbolic link + pigz --processes $task.cpus --stdout --force ${args} ${raw_file} > ${archive} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + archive = raw_file.toString() + ".gz" + """ + touch ${archive} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/pigz/compress/meta.yml b/modules/nf-core/pigz/compress/meta.yml new file mode 100644 index 00000000..42efd735 --- /dev/null +++ b/modules/nf-core/pigz/compress/meta.yml @@ -0,0 +1,47 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "pigz_compress" +description: Compresses files with pigz. +keywords: + - compress + - gzip + - parallelized +tools: + - "pigz": + description: "Parallel implementation of the gzip algorithm." + homepage: "https://zlib.net/pigz/" + documentation: "https://zlib.net/pigz/pigz.pdf" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - raw_file: + type: file + description: File to be compressed + pattern: "*.*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + + - archive: + type: file + description: The compressed file + pattern: "*.gz" + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@leoisl" +maintainers: + - "@leoisl" diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test b/modules/nf-core/pigz/compress/tests/main.nf.test new file mode 100644 index 00000000..248d40fb --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/main.nf.test @@ -0,0 +1,49 @@ +nextflow_process { + name "Test Process PIGZ_COMPRESS" + script "../main.nf" + process "PIGZ_COMPRESS" + + tag "modules" + tag "modules_nfcore" + tag "pigz" + tag "pigz/compress" + + test("sarscov2 - genome - fasta") { + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 - genome - fasta - stub") { + options "-stub-run" + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.archive[0][1]).name).match() } + ) + } + } +} diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test.snap b/modules/nf-core/pigz/compress/tests/main.nf.test.snap new file mode 100644 index 00000000..6e50456f --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/main.nf.test.snap @@ -0,0 +1,37 @@ +{ + "sarscov2 - genome - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "1": [ + "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad" + ], + "archive": [ + [ + { + "id": "test" + }, + "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "versions": [ + "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad" + ] + } + ], + "timestamp": "2023-12-11T22:39:53.350546" + }, + "sarscov2 - genome - fasta - stub": { + "content": [ + "genome.fasta.gz" + ], + "timestamp": "2023-12-11T22:52:24.309192" + } +} \ No newline at end of file diff --git a/modules/nf-core/pigz/compress/tests/tags.yml b/modules/nf-core/pigz/compress/tests/tags.yml new file mode 100644 index 00000000..42c46bfa --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/tags.yml @@ -0,0 +1,2 @@ +pigz/compress: + - "modules/nf-core/pigz/compress/**" diff --git a/nextflow.config b/nextflow.config index ed896ae5..b886f2f3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -28,11 +28,10 @@ params { calc_sp = true calc_tc = false calc_irmsd = false - calc_gaps = true + calc_gaps = false calc_tcs = false skip_compression = false - compress_after_eval = false // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 4a45b03b..b770573c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -157,10 +157,6 @@ "skip_compression": { "type": "boolean", "description": "Produce uncompressed alignment files" - }, - "compress_after_eval": { - "type": "boolean", - "description": "Defines that the compression should be done after the evaluation" } } }, diff --git a/subworkflows/local/evaluate.nf b/subworkflows/local/evaluate.nf index 26d8ba8b..3c2a60df 100644 --- a/subworkflows/local/evaluate.nf +++ b/subworkflows/local/evaluate.nf @@ -28,17 +28,9 @@ workflow EVALUATE { tc_csv 
= Channel.empty() irmsd_csv = Channel.empty() tcs_csv = Channel.empty() + gaps_csv = Channel.empty() eval_summary = Channel.empty() - // ---------------------- - // Decompress if required - // ---------------------- - if( !params.skip_compression ){ - PIGZ_UNCOMPRESS(ch_msa) - ch_msa = PIGZ_UNCOMPRESS.out.file - ch_versions = ch_versions.mix(PIGZ_UNCOMPRESS.out.versions) - } - // -------------------------- // Reference based evaluation @@ -49,7 +41,7 @@ workflow EVALUATE { // Sum of pairs - if( params.calc_sp == true){ + if( params.calc_sp ){ TCOFFEE_ALNCOMPARE_SP(alignment_and_ref) sp_scores = TCOFFEE_ALNCOMPARE_SP.out.scores ch_versions = ch_versions.mix(TCOFFEE_ALNCOMPARE_SP.out.versions.first()) @@ -65,7 +57,7 @@ workflow EVALUATE { } // Total column score - if( params.calc_tc == true){ + if( params.calc_tc ){ TCOFFEE_ALNCOMPARE_TC(alignment_and_ref) tc_scores = TCOFFEE_ALNCOMPARE_TC.out.scores ch_versions = ch_versions.mix(TCOFFEE_ALNCOMPARE_TC.out.versions.first()) @@ -81,7 +73,7 @@ workflow EVALUATE { } // number of gaps - if (params.calc_gaps == true){ + if ( params.calc_gaps ){ CALC_GAPS(ch_msa) gaps_scores = CALC_GAPS.out.gaps ch_versions = ch_versions.mix(CALC_GAPS.out.versions) @@ -104,7 +96,7 @@ workflow EVALUATE { // ------------------------------------------- // iRMSD - if (params.calc_irmsd == true){ + if (params.calc_irmsd ){ msa_str = ch_structures.map { meta, template, str -> [ meta.id, template, str ] } .cross (ch_msa.map { meta, aln -> [ meta.id, meta, aln ] }) .multiMap { chstr, chaln -> @@ -136,7 +128,7 @@ workflow EVALUATE { // ------------------------------------------- // TCS - if( params.calc_tcs == true){ + if( params.calc_tcs ){ // the second argument is empty but a lib file can be fed to it TCOFFEE_TCS(ch_msa, [[:], []]) tcs_scores = TCOFFEE_TCS.out.scores @@ -179,6 +171,6 @@ workflow EVALUATE { emit: eval_summary - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/stats.nf b/subworkflows/local/stats.nf index ed9c3a73..76b3b328 100644 --- a/subworkflows/local/stats.nf +++ b/subworkflows/local/stats.nf @@ -17,16 +17,16 @@ workflow STATS { main: - ch_versions = Channel.empty() - sim_csv = Channel.empty() - seqstats_csv = Channel.empty() - plddts_csv = Channel.empty() + ch_versions = Channel.empty() + sim_csv = Channel.empty() + seqstats_csv = Channel.empty() + plddts_csv = Channel.empty() stats_summary = Channel.empty() // // ------------------------------------------- // // SEQUENCE SIMILARITY // // ------------------------------------------- - if( params.calc_sim == true){ + if( params.calc_sim){ TCOFFEE_SEQREFORMAT_SIM(ch_seqs) tcoffee_seqreformat_sim = TCOFFEE_SEQREFORMAT_SIM.out.formatted_file ch_versions = ch_versions.mix(TCOFFEE_SEQREFORMAT_SIM.out.versions.first()) @@ -48,7 +48,7 @@ workflow STATS { // SEQUENCE GENERAL STATS // Sequence length, # of sequences, etc // ------------------------------------------- - if( params.calc_seq_stats == true){ + if( params.calc_seq_stats){ CALCULATE_SEQSTATS(ch_seqs) seqstats = CALCULATE_SEQSTATS.out.seqstats seqstats_summary = CALCULATE_SEQSTATS.out.seqstats_summary @@ -69,7 +69,7 @@ workflow STATS { // ------------------------------------------- // EXTRACT PLDDT // ------------------------------------------- - if (params.extract_plddt == true){ + if (params.extract_plddt){ EXTRACT_PLDDT(ch_structures) ch_versions = ch_versions.mix(EXTRACT_PLDDT.out.versions) plddt_summary = EXTRACT_PLDDT.out.plddt_summary diff 
--git a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf index ae893c73..8e1f2339 100644 --- a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf @@ -162,13 +162,6 @@ workflow PIPELINE_COMPLETION { ======================================================================================== */ // -// Check and validate pipeline parameters -// -// def validateInputParameters() { -// statsParamsWarning() -// evalParamsWarning() -// compressionParams() -// } // // Validate channels from input samplesheet @@ -185,67 +178,6 @@ def validateInputSamplesheet(input) { return [ metas[0], fastqs ] } -// -// Warning if incorrect combination of stats parameters are used -// -def statsParamsWarning() { - if (params.skip_stats){ - if(params.calc_sim || params.calc_seq_stats) { - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_stats is set to '${params.skip_stats}'.\n" + - " The following params have values calc_sim: ${params.calc_sim} and calc_seq_stats: ${params.calc_seq_stats} \n" + - " As skip_stats is set to true, the params.calc_sim and params.calc_seq_stats will be set by default to false. \n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } - } - if (!params.skip_stats && !params.calc_sim && !params.calc_seq_stats){ - params.skip_stats = true - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_stats has been changed from false to true'.\n" + - " None of the modules withing the stats subworkflow was activated. \n" + - " To activate them you can use param.calc_sim, params.calc_seq_stats. \n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } -} - -// -// Warning if incorrect combination of eval parameters are used -// -def evalParamsWarning() { - if (params.skip_eval){ - if(params.calc_sp || params.calc_tc || params.calc_irmsd) { - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_eval is set to '${params.skip_eval}'.\n" + - " The following params have values params.calc_sp: ${params.calc_sp}, params.calc_tc: ${params.calc_tc} and params.calc_irms: ${params.calc_irmsd} \n" + - " As skip_eval is set to true, the params.calc_sp, params.calc_tc and params.calc_irmsd are set by default to false. \n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } - } - if (!params.skip_eval && !params.calc_sp && !params.calc_tc && !params.calc_irmsd ){ - params.skip_eval = true - print(params.skip_eval) - print("-----------------------------") - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_eval has been changed from false to true'.\n" + - " None of the modules withing the stats subworkflow was activated. \n" + - " To activate them you can use param.calc_sp, params.calc_tc, params.calc_irmsd. 
\n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } -} - -def compressionParams(){ - if(!params.skip_compression){ - if(!params.skip_eval){ - params.compress_after_eval = true - print("Hey") - print(params.compress_after_eval) - } - } -} // // Generate methods description for MultiQC diff --git a/workflows/multiplesequencealign.nf b/workflows/multiplesequencealign.nf index 521f6a6d..4d03ac99 100644 --- a/workflows/multiplesequencealign.nf +++ b/workflows/multiplesequencealign.nf @@ -61,6 +61,7 @@ include { PREPARE_SHINY } from '../modules/local/prepare_shiny' include { UNTAR } from '../modules/nf-core/untar/main' include { CSVTK_JOIN as MERGE_STATS_EVAL } from '../modules/nf-core/csvtk/join/main.nf' +include { PIGZ_COMPRESS } from '../modules/nf-core/pigz/compress/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -181,11 +182,16 @@ workflow MULTIPLESEQUENCEALIGN { // // Align // - - compress_during_align = ! params.skip_compression && !params.compress_after_eval + + compress_during_align = !params.skip_compression && params.skip_eval ALIGN(ch_seqs, ch_tools, ch_structures_template, compress_during_align) ch_versions = ch_versions.mix(ALIGN.out.versions) + if( !params.skip_compression && !compress_during_align){ + PIGZ_COMPRESS(ALIGN.out.msa) + ch_versions = ch_versions.mix(PIGZ_COMPRESS.out.versions) + } + // // Evaluate the quality of the alignment // @@ -196,6 +202,7 @@ workflow MULTIPLESEQUENCEALIGN { evaluation_summary = evaluation_summary.mix(EVALUATE.out.eval_summary) } + // // Combine stats and evaluation reports into a single CSV // From 832c5c0114ff84d41726bff9880d346e336beeaf Mon Sep 17 00:00:00 2001 From: luisas Date: Wed, 5 Jun 2024 12:21:40 +0200 Subject: [PATCH 04/16] Update readme and fix compression --- .github/workflows/ci.yml | 15 +- README.md | 134 ++++++++++++++---- assets/toolsheet copy.csv | 18 +++ assets/toolsheet.csv | 19 ++- conf/modules.config | 47 ++++-- conf/test.config | 7 +- conf/test_full.config | 3 + conf/test_parameters.config | 18 +-- conf/test_pdb.config | 29 ++++ modules/local/calculate_seqstats.nf | 1 + modules/local/prepare_shiny.nf | 1 + modules/nf-core/mtmalign/align/main.nf | 14 +- modules/nf-core/tcoffee/align/main.nf | 12 +- nextflow.config | 14 +- subworkflows/local/align.nf | 2 +- subworkflows/local/compute_trees.nf | 7 +- .../main.nf | 15 +- 17 files changed, 265 insertions(+), 91 deletions(-) create mode 100644 assets/toolsheet copy.csv create mode 100644 conf/test_pdb.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0a33337..506c5ac9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,9 @@ jobs: NXF_VER: - "24.04.1" - "latest-everything" + ANALYSIS: + - "test" + - "test_pdb" steps: - name: Check out pipeline code uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 @@ -40,9 +43,9 @@ jobs: - name: Run pipeline with test data run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.ANALYSIS }},docker --outdir ./results - parameters: + parameters_stub: name: Test workflow parameters if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/multiplesequencealign') }}" runs-on: ubuntu-latest @@ -51,6 +54,12 @@ jobs: NXF_VER: - "24.04.1" - "latest-everything" + PARAMS: + - "--skip_stats" + - "--skip_eval" + - "--skip_compression" + 
- "--skip_shiny" + steps: - name: Check out pipeline code uses: actions/checkout@v4 @@ -62,4 +71,4 @@ jobs: - name: Test workflow parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_parameters,docker --outdir ./results + nextflow run -stub-run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.PARAMS }} --outdir ./results diff --git a/README.md b/README.md index 44a12c2b..f4244842 100644 --- a/README.md +++ b/README.md @@ -27,37 +27,90 @@ On release, automated continuous integration tests run the pipeline on a full-si ![Alt text](docs/images/nf-core-msa_metro_map.png?raw=true "nf-core-msa metro map") -1. **Collect Input Information**: computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. -2. **Guide Tree**: (Optional, depends on alignment tools requirement) Renders a guide tree. -3. **Align**: Runs one or multiple MSA tools in parallel. -4. **Evaluate**: The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. -5. **Compress**: As MSAs can be very large, by default all tools in the pipeline produce compressed output. For most of them, the compression happens through a pipe, such that uncompressed data never hits the disk. This compression can be turned off by passing `--no_compression` as a parameter. +1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. +2. **Guide Tree**: (Optional) Renders a guide tree. +3. **Align**: aligns the sequences. +4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. +5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. Skip by passing `--skip_shiny` and `--skip_multiqc`. -Available GUIDE TREE methods: -- CLUSTALO -- FAMSA -- MAGUS +### 1. INPUT FILES SUMMARY + +Summary information about the input fasta files are calculated. Skip by `--skip_stats`. + + 1. Sequence similarity. Calculates pairwise and average sequence similarity is calculated using TCOFFEE. Activate with `--calc_sim` (default: false). + 2. General summary. Calculates the number and the average length of sequences. Activate with `--calc_seq_stats` (default: true). + 3. Extract plddt. If the structures were generated by AF2, plddt is extracted and reported. Activate with `--extract_plddt` (default: false). + +### 2. GUIDE TREES + +Guide trees define the order in which sequences and profiles are aligned and hold a crucial role in determining the final MSA accuracy. Tree rendering techniques most commonly rely on pairwise distances between sequences. + +> **Note** +> None of the below listed aligner needs an explicit definition of a guidetree: if they need one, they compute their own default guide tree. This explicit definition of a guide tree is available in case you want to test non-default combination of guide trees and aligner methods. + + +Available GUIDE TREE methods (Optional): + +- [CLUSTALO](http://clustal.org/omega/#Documentation) +- [FAMSA](https://github.com/refresh-bio/FAMSA) + + +### 3. 
ALIGN Available ALIGN methods: -- CLUSTALO -- FAMSA -- KALIGN -- LEARNMSA -- MAFFT -- MAGUS -- MUSCLE5 -- MTMALIGN -- T-COFFEE -- 3DCOFFEE +SEQUENCE-BASED (only require a fasta file as input): +- [CLUSTALO](http://clustal.org/omega/#Documentation) (accepts guide tree) +- [FAMSA](https://github.com/refresh-bio/FAMSA) (accepts guide tree) +- [KALIGN](https://github.com/TimoLassmann/kalign) +- [LEARNMSA](https://github.com/Gaius-Augustus/learnMSA) +- [MAFFT](https://mafft.cbrc.jp/alignment/server/index.html) +- [MAGUS](https://github.com/vlasmirnov/MAGUS) (accepts guide tree) +- [MUSCLE5](https://drive5.com/muscle5/manual/) +- [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) (accepts guide tree) + +SEQUENCE- and STUCTURE-BASED (require both fasta and structures as input): +- [3DCOFFEE](https://tcoffee.org/Projects/expresso/index.html) (accepts guide tree) + +STRUCTURE-BASED (only require stuctures as input): +- [MTMALIGN](https://bio.tools/mtm-align) + +### 4. EVALUATE + +Optionally, the produced MSAs can be evaluated. Skip with `--skip_eval`. + +SEQUENCE-BASED (no extra input required): +1. Calculate number of gaps and its average across sequences. Activate using `--calc_gaps` (default: true). + +REFERENCE-BASED: +The reference MSAs (see samplesheet) are be used to evaluate the quality of the produced MSA. + +2. Sum Of Pairs. Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true). +3. Total column. Calculates the TC score [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html). Activate using `--calc_tc` (default: true). + +STRUCTURE-BASED: +The provided structures (see samplesheet) are used to evaluate the quality of the alignment. +4. iRMSD. Calculates the iRMSD using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_irmsd` (default: false). + + +### 5. REPORT + +Finally, a summary table with all the computed statistics and evaluations is reported in MultiQC (Skip by `--skip_multiqc`). +Moreover, a shiny app is prepared with interactive summary plots. +> [!WARNING] +> You will need to have [shiny](https://shiny.posit.co/py/) installed to run it! See [output documentation](https://nf-co.re/multiplesequencealign/output) for more infos. + ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. -First, prepare a samplesheet with your input data that looks as follows: + +#### 1. SAMPLESHEET +The sample sheet defines the input data that the pipeline will process. +It should look like this: `samplesheet.csv`: @@ -69,32 +122,51 @@ toxin,toxin.fa,toxin-ref.fa,toxin_structures Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be processed. -`id` is the name of the set of sequences. It can correspond to the protein family name or to an internal id. +`id` is the name of the set of sequences. It can correspond to the protein family name or to an internal id. It should be unique. + +The column `fasta` contains the path to the fasta file that contains the sequencest be aligned. -The column `fasta` contains the path to the fasta file that contains the sequences. 
+The column `reference` is optional and contains the path to the reference alignment. It is used for the reference-based evaluation steps. It can be left empty. -The column `reference` is optional and contains the path to the reference alignment. It is used for certain evaluation steps. It can be left empty. +The column `structures` is also optional and contains the path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and structure-based evaluation steps. It can be left empty. -The column `structures` is also optional and contains the path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and certain evaluation steps. It can be left empty. -Then, you should prepare a toolsheet which defines which tools to run as follows: +> [!NOTE] +> You can have some samples with structures and/or references and some without. The pipeline will run the modules requiring structures/references only on the samples for which you have provided the required information and the others will be just skipped. + + +#### 2. TOOLSHEET + +The reason why we provide a toolsheet as input is that if the pipeline needs to be used as benchmarking framework, we need to test multiple arguments per module. This can be done by having multiple entries in the toolsheet per module with the multiple arguments to be tested. + +Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. + +It should look at foollows: `toolsheet.csv`: ```csv tree,args_tree,aligner,args_aligner, -FAMSA, -gt upgma -partree, FAMSA, -, ,TCOFFEE, -output fasta_aln +FAMSA, -gt upgma -medoidtree, FAMSA, +, ,TCOFFEE, +FAMSA,,REGRESSIVE, ``` +> [!NOTE] +> Each of the trees and aligners are available as standalones, so args_tree and args_aligner can be left empty if you are cool with the default settings of each method. Tree can also be left empty, and the default guide tree will be used for each aligner. + +> [!NOTE] +> use the exact spelling as listed above! + +`tree` is the tool used to build the tree. (optional) -`tree` is the tool used to build the tree. +Arguments to the tree tool can be provided using `args_tree`. Please refer to each tool's documentation. (optional) -Arguments to the tree tool can be provided using `args_tree`. +The `aligner` column contains the tool to run the alignment. (optional) -The `aligner` column contains the tool to run the alignment. +Finally, the arguments to the aligner tool can be set by using the `args_alginer` column. (optional) -Finally, the arguments to the aligner tool can be set by using the `args_alginer` column. +#### 3. 
RUN THE PIPELINE Now, you can run the pipeline using: ```bash diff --git a/assets/toolsheet copy.csv b/assets/toolsheet copy.csv new file mode 100644 index 00000000..39c228a8 --- /dev/null +++ b/assets/toolsheet copy.csv @@ -0,0 +1,18 @@ +tree,args_tree,aligner,args_aligner +,,CLUSTALO, +,,FAMSA, +,,KALIGN, +,,MAGUS, +,,MAFFT, +,,MAFFT, --dpparttree +,,MUSCLE5, +,,MTMALIGN, +,,REGRESSIVE, +,,REGRESSIVE,-reg_nseq 3 +,,TCOFFEE, +,,3DCOFFEE, +,,3DCOFFEE,-method TMalign_pair +FAMSA,-gt upgma -medoidtree,FAMSA, +FAMSA,,MAGUS, +CLUSTALO,,TCOFFEE + diff --git a/assets/toolsheet.csv b/assets/toolsheet.csv index 90d8ddb5..151eeeb2 100644 --- a/assets/toolsheet.csv +++ b/assets/toolsheet.csv @@ -1,4 +1,17 @@ tree,args_tree,aligner,args_aligner -FAMSA,-gt upgma -parttree,FAMSA, -,,TCOFFEE,-output fasta_aln -,,MTMALIGN,, \ No newline at end of file +,,CLUSTALO, +,,FAMSA, +,,KALIGN, +,,MAGUS, +,,MAFFT, +,,MAFFT, --dpparttree +,,MUSCLE5, +,,MTMALIGN, +,,REGRESSIVE, +,,REGRESSIVE,-reg_nseq 3 +,,TCOFFEE, +,,3DCOFFEE, +,,3DCOFFEE,-method TMalign_pair +FAMSA,-gt upgma -medoidtree,FAMSA, +FAMSA,,MAGUS, +CLUSTALO,,REGRESSIVE diff --git a/conf/modules.config b/conf/modules.config index ac90d289..17700e6e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,7 +18,7 @@ process { withName: "CALCULATE_SEQSTATS"{ publishDir = [ - path: { "${params.outdir}/stats/sequences/seqstats" }, + path: { "${params.outdir}/summary/stats/sequences/seqstats" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } ] @@ -30,7 +30,7 @@ process { withName: "EXTRACT_PLDDT"{ publishDir = [ - path: { "${params.outdir}/stats/structures/plddt" }, + path: { "${params.outdir}/summary/stats/structures/plddt" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } ] @@ -43,7 +43,7 @@ process { withName: TCOFFEE_SEQREFORMAT_SIM{ ext.args = "-output=sim_idscore" publishDir = [ - path: { "${params.outdir}/stats/sequences/perc_sim" }, + path: { "${params.outdir}/summary/stats/sequences/perc_sim" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -73,7 +73,7 @@ process { ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } publishDir = [ - path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/trees/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -84,7 +84,7 @@ process { ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } publishDir = [ - path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/trees/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -95,7 +95,7 @@ process { ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } ext.args = { "${meta.args_tree}" == "null" ? 
'' : "${meta.args_tree}" } publishDir = [ - path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/trees/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -110,15 +110,40 @@ process { } - withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN|MTMALIGN_ALIGN"{ + withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN"{ tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } + if(params.skip_compression){ + publishDir = [ + path: { "${params.outdir}/alignment/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + withName: "MTMALIGN_ALIGN"{ + tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } + ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } + if(params.skip_compression){ + publishDir = [ + path: { "${params.outdir}/alignment/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.aln" + ] + } + + } + + withName:"PIGZ_COMPRESS"{ publishDir = [ - path: { "${params.outdir}/alignment/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/alignment/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + ] } // ------------------------------------ @@ -142,7 +167,7 @@ process { withName: 'TCOFFEE_IRMSD'{ ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } publishDir = [ - path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, + path: { "${params.outdir}/evaluation/summary/${task.process.tokenize(':')[-1].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -175,7 +200,7 @@ process { withName: 'TCOFFEE_TCS'{ ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tcs" } publishDir = [ - path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, + path: { "${params.outdir}/evaluation/summary/${task.process.tokenize(':')[-1].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] diff --git a/conf/test.config b/conf/test.config index 8c297066..c7b2fd08 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,12 +20,11 @@ params { max_memory = '6.GB' max_time = '6.h' - skip_multiqc = false // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' - //tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet.csv' - tools = "./assets/toolsheet.csv" - + tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' + tools = "./assets/toolsheet.csv" + // Output directory outdir = "./outdir/" diff --git a/conf/test_full.config b/conf/test_full.config index 4a1aa3bf..df0bc3fe 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -17,4 +17,7 @@ params { // Input data for full size test input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_full.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' + + // Output directory + outdir = "./outdir/" } diff --git a/conf/test_parameters.config b/conf/test_parameters.config index e2aedc35..486ded00 100644 --- a/conf/test_parameters.config +++ b/conf/test_parameters.config @@ -11,23 +11,19 @@ */ params { + config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' - - skip_stats = true - skip_eval = true - skip_shiny = true - skip_multiqc = true + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/samplesheet/v1.0/samplesheet_test.csv' - tools = 'https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/toolsheet/v1.0/toolsheet.csv' - + input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' + tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' + // Output directory outdir = "./outdir/" diff --git a/conf/test_pdb.config b/conf/test_pdb.config new file mode 100644 index 00000000..0392718a --- /dev/null +++ b/conf/test_pdb.config @@ -0,0 +1,29 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/multiplesequencealign -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test.csv' + tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet.csv' + + // Output directory + outdir = "./outdir/" + +} diff --git a/modules/local/calculate_seqstats.nf b/modules/local/calculate_seqstats.nf index 42361e3e..641376a4 100644 --- a/modules/local/calculate_seqstats.nf +++ b/modules/local/calculate_seqstats.nf @@ -40,6 +40,7 @@ process CALCULATE_SEQSTATS { touch ${prefix}_seqstats_summary.csv touch ${prefix}_multiqc.tsv + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') END_VERSIONS diff --git a/modules/local/prepare_shiny.nf b/modules/local/prepare_shiny.nf index 445a06a4..9e006770 100644 --- a/modules/local/prepare_shiny.nf +++ b/modules/local/prepare_shiny.nf @@ -39,6 +39,7 @@ process PREPARE_SHINY { """ touch shiny_data.csv touch shiny_app.R + touch run.sh cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/mtmalign/align/main.nf b/modules/nf-core/mtmalign/align/main.nf index d8fa72c3..e539bb61 100644 --- a/modules/nf-core/mtmalign/align/main.nf +++ b/modules/nf-core/mtmalign/align/main.nf @@ -14,8 +14,8 @@ process MTMALIGN_ALIGN { val(compress) output: - tuple val(meta), path("./mTM_result/${prefix}.aln${compress ? '.gz' : ''}"), emit: alignment - tuple val(meta), path("./mTM_result/${prefix}.pdb${compress ? '.gz' : ''}"), emit: structure + tuple val(meta), path("${prefix}.aln${compress ? '.gz' : ''}"), emit: alignment + tuple val(meta), path("${prefix}.pdb${compress ? '.gz' : ''}"), emit: structure path "versions.yml" , emit: versions when: @@ -46,6 +46,9 @@ process MTMALIGN_ALIGN { pigz -p ${task.cpus} ./mTM_result/${prefix}.aln ./mTM_result/${prefix}.pdb fi + # move everything in mTM_result to the working directory + mv ./mTM_result/* . + # mtm-align -v prints the wrong version 20180725, so extract it from the cosmetic output in the help message cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -56,10 +59,9 @@ process MTMALIGN_ALIGN { stub: prefix = task.ext.prefix ?: "${meta.id}" - """ - mkdir mTM_result - touch mTM_result/${prefix}.aln${compress ? '.gz' : ''} - touch mTM_result/${prefix}.pdb${compress ? '.gz' : ''} + """ + touch ${prefix}.aln${compress ? '.gz' : ''} + touch ${prefix}.pdb${compress ? '.gz' : ''} # mtm-align -v prints the wrong version 20180725, so extract it from the cosmetic output in the help message cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/tcoffee/align/main.nf b/modules/nf-core/tcoffee/align/main.nf index a7aa106c..f511e8ce 100644 --- a/modules/nf-core/tcoffee/align/main.nf +++ b/modules/nf-core/tcoffee/align/main.nf @@ -27,7 +27,8 @@ process TCOFFEE_ALIGN { def prefix = task.ext.prefix ?: "${meta.id}" def tree_args = tree ? "-usetree $tree" : "" def template_args = template ? "-template_file $template" : "" - def write_output = compress ? " >(pigz -cp ${task.cpus} > ${prefix}.aln.gz)" : "> ${prefix}.aln" + def outfile = compress ? 
"stdout" : "${prefix}.aln" + def write_output = compress ? " >(pigz -cp ${task.cpus} > ${prefix}.aln.gz)" : "" // using >() is necessary to preserve the tcoffee return value, // so nextflow knows to display an error when it failed """ @@ -37,8 +38,15 @@ process TCOFFEE_ALIGN { $template_args \ $args \ -thread ${task.cpus} \ - -outfile stdout \ + -outfile $outfile \ $write_output + + # If stdout file exist and compress is true, then compress the file + # This is a patch for the current behaviour of the regressive algorithm + # that does not support the stdout redirection + if [ -f stdout ] && [ "$compress" = true ]; then + pigz -cp ${task.cpus} < stdout > ${prefix}.aln.gz + fi cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index b886f2f3..f923be95 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,17 +19,17 @@ params { // Stats skip_stats = false - calc_sim = false + calc_sim = true calc_seq_stats = true - extract_plddt = false + extract_plddt = true // Evaluation skip_eval = false calc_sp = true - calc_tc = false - calc_irmsd = false - calc_gaps = false - calc_tcs = false + calc_tc = true + calc_irmsd = true + calc_gaps = true + calc_tcs = true skip_compression = false @@ -198,7 +198,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } - test_parameters { includeConfig 'conf/test_parameters.config' } + test_pdb { includeConfig 'conf/test_pdb.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/subworkflows/local/align.nf b/subworkflows/local/align.nf index 76c273bf..94da8438 100644 --- a/subworkflows/local/align.nf +++ b/subworkflows/local/align.nf @@ -235,6 +235,6 @@ workflow ALIGN { emit: msa - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/compute_trees.nf b/subworkflows/local/compute_trees.nf index 4027680a..aa3d6c34 100644 --- a/subworkflows/local/compute_trees.nf +++ b/subworkflows/local/compute_trees.nf @@ -28,7 +28,6 @@ workflow COMPUTE_TREES { .branch { famsa: it[0]["tree"] == "FAMSA" clustalo: it[0]["tree"] == "CLUSTALO" - magus: it[0]["tree"] == "MAGUS" } .set { ch_fastas_fortrees } @@ -40,11 +39,7 @@ workflow COMPUTE_TREES { ch_trees = ch_trees.mix(CLUSTALO_GUIDETREE.out.tree) ch_versions = ch_versions.mix(CLUSTALO_GUIDETREE.out.versions.first()) - MAGUS_GUIDETREE(ch_fastas_fortrees.clustalo) - ch_trees = ch_trees.mix(MAGUS_GUIDETREE.out.tree) - ch_versions = ch_versions.mix(MAGUS_GUIDETREE.out.versions.first()) - emit: trees = ch_trees // channel: [ val(meta), path(tree) ] - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf index 8e1f2339..5e54fb53 100644 --- a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf @@ -73,10 +73,6 @@ workflow PIPELINE_INITIALISATION { UTILS_NFCORE_PIPELINE ( nextflow_cli_args ) - // - // Custom validation for pipeline parameters - // - //validateInputParameters() // // Create channel from input file provided through params.input @@ -98,7 +94,7 @@ workflow PIPELINE_INITIALISATION { align_map["args_aligner_clean"] = 
Utils.cleanArgs(align_map["args_aligner"]) [ tree_map, align_map ] - } + }.unique() emit: samplesheet = ch_input @@ -377,16 +373,23 @@ class Utils { if(args == null || args == ""|| args == "null"){ args = "" } - args = args + " " + required_flag + " " + default_value + def prefix = "" + if(args != ""){ + prefix = args + " " + } + args = prefix + required_flag + " " + default_value } } return args } + public static check_required_args(tool,args){ // 3DCOFFEE args = fix_args(tool,args,"3DCOFFEE", "-method", "TMalign_pair") + args = fix_args(tool,args,"3DCOFFEE", "-output", "fasta_aln") + // REGRESSIVE args = fix_args(tool,args,"REGRESSIVE", "-reg", "") args = fix_args(tool,args,"REGRESSIVE", "-reg_method", "famsa_msa") From b3dd1007054958ac1da4e1ca3d426dbeeb384e83 Mon Sep 17 00:00:00 2001 From: luisas Date: Wed, 5 Jun 2024 15:17:04 +0200 Subject: [PATCH 05/16] Ci tests --- .github/workflows/ci.yml | 1 + README.md | 14 ++++++++------ assets/toolsheet copy.csv | 18 ------------------ assets/toolsheet.csv | 17 ----------------- conf/test.config | 12 +++++++++++- conf/test_full.config | 11 +++++++++++ conf/test_parameters.config | 4 ++++ conf/test_pdb.config | 9 ++++++++- modules/nf-core/tcoffee/align/main.nf | 7 +++---- nextflow.config | 8 ++++---- 10 files changed, 50 insertions(+), 51 deletions(-) delete mode 100644 assets/toolsheet copy.csv delete mode 100644 assets/toolsheet.csv diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 506c5ac9..0c5c5980 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,7 @@ jobs: ANALYSIS: - "test" - "test_pdb" + - "test_parameters" steps: - name: Check out pipeline code uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 diff --git a/README.md b/README.md index f4244842..16230b40 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Available GUIDE TREE methods (Optional): Available ALIGN methods: -SEQUENCE-BASED (only require a fasta file as input): +**SEQUENCE-BASED** (only require a fasta file as input): - [CLUSTALO](http://clustal.org/omega/#Documentation) (accepts guide tree) - [FAMSA](https://github.com/refresh-bio/FAMSA) (accepts guide tree) - [KALIGN](https://github.com/TimoLassmann/kalign) @@ -70,26 +70,28 @@ SEQUENCE-BASED (only require a fasta file as input): - [MUSCLE5](https://drive5.com/muscle5/manual/) - [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) (accepts guide tree) -SEQUENCE- and STUCTURE-BASED (require both fasta and structures as input): +**SEQUENCE- and STRUCTURE-BASED** (require both fasta and structures as input): - [3DCOFFEE](https://tcoffee.org/Projects/expresso/index.html) (accepts guide tree) -STRUCTURE-BASED (only require stuctures as input): +**STRUCTURE-BASED** (only require stuctures as input): - [MTMALIGN](https://bio.tools/mtm-align) ### 4. EVALUATE Optionally, the produced MSAs can be evaluated. Skip with `--skip_eval`. -SEQUENCE-BASED (no extra input required): +**SEQUENCE-BASED** (no extra input required): 1. Calculate number of gaps and its average across sequences. Activate using `--calc_gaps` (default: true). -REFERENCE-BASED: +**REFERENCE-BASED**: + The reference MSAs (see samplesheet) are be used to evaluate the quality of the produced MSA. 2. Sum Of Pairs. Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true). 3. Total column. Calculates the TC score [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html). 
Activate using `--calc_tc` (default: true). -STRUCTURE-BASED: +**STRUCTURE-BASED**: + The provided structures (see samplesheet) are used to evaluate the quality of the alignment. 4. iRMSD. Calculates the iRMSD using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_irmsd` (default: false). diff --git a/assets/toolsheet copy.csv b/assets/toolsheet copy.csv deleted file mode 100644 index 39c228a8..00000000 --- a/assets/toolsheet copy.csv +++ /dev/null @@ -1,18 +0,0 @@ -tree,args_tree,aligner,args_aligner -,,CLUSTALO, -,,FAMSA, -,,KALIGN, -,,MAGUS, -,,MAFFT, -,,MAFFT, --dpparttree -,,MUSCLE5, -,,MTMALIGN, -,,REGRESSIVE, -,,REGRESSIVE,-reg_nseq 3 -,,TCOFFEE, -,,3DCOFFEE, -,,3DCOFFEE,-method TMalign_pair -FAMSA,-gt upgma -medoidtree,FAMSA, -FAMSA,,MAGUS, -CLUSTALO,,TCOFFEE - diff --git a/assets/toolsheet.csv b/assets/toolsheet.csv deleted file mode 100644 index 151eeeb2..00000000 --- a/assets/toolsheet.csv +++ /dev/null @@ -1,17 +0,0 @@ -tree,args_tree,aligner,args_aligner -,,CLUSTALO, -,,FAMSA, -,,KALIGN, -,,MAGUS, -,,MAFFT, -,,MAFFT, --dpparttree -,,MUSCLE5, -,,MTMALIGN, -,,REGRESSIVE, -,,REGRESSIVE,-reg_nseq 3 -,,TCOFFEE, -,,3DCOFFEE, -,,3DCOFFEE,-method TMalign_pair -FAMSA,-gt upgma -medoidtree,FAMSA, -FAMSA,,MAGUS, -CLUSTALO,,REGRESSIVE diff --git a/conf/test.config b/conf/test.config index c7b2fd08..c7787b6d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,20 @@ params { max_memory = '6.GB' max_time = '6.h' + skip_stats = false + calc_sim = true + calc_seq_stats = true + extract_plddt = true + skip_eval = false + calc_sp = true + calc_tc = true + calc_irmsd = true + calc_gaps = true + calc_tcs = true + // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - tools = "./assets/toolsheet.csv" // Output directory outdir = "./outdir/" diff --git a/conf/test_full.config b/conf/test_full.config index df0bc3fe..96eacde9 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,6 +14,17 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' + skip_stats = false + calc_sim = true + calc_seq_stats = true + extract_plddt = true + skip_eval = false + calc_sp = true + calc_tc = true + calc_irmsd = true + calc_gaps = true + calc_tcs = true + // Input data for full size test input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_full.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' diff --git a/conf/test_parameters.config b/conf/test_parameters.config index 486ded00..a5b7e514 100644 --- a/conf/test_parameters.config +++ b/conf/test_parameters.config @@ -20,6 +20,10 @@ params { max_memory = '6.GB' max_time = '6.h' + skip_stats = true + skip_eval = true + skip_compression = false + // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' diff --git a/conf/test_pdb.config b/conf/test_pdb.config index 0392718a..b451dcc0 100644 --- a/conf/test_pdb.config +++ b/conf/test_pdb.config @@ -19,9 +19,16 @@ params { max_memory = '6.GB' max_time = '6.h' + skip_stats = true + calc_irmsd = true + calc_sp = false + 
calc_tc = false + calc_gaps = false + calc_tcs = false + // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test.csv' - tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet.csv' + tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_structural.csv' // Output directory outdir = "./outdir/" diff --git a/modules/nf-core/tcoffee/align/main.nf b/modules/nf-core/tcoffee/align/main.nf index f511e8ce..77f122f3 100644 --- a/modules/nf-core/tcoffee/align/main.nf +++ b/modules/nf-core/tcoffee/align/main.nf @@ -16,7 +16,7 @@ process TCOFFEE_ALIGN { output: tuple val(meta), path("*.aln{.gz,}"), emit: alignment // in the args there might be the request to generate a lib file, so the following is an optional output - tuple val(meta), path("*.*lib") , emit: lib, optional : true + tuple val(meta), path("* lib") , emit: lib, optional : true path "versions.yml" , emit: versions when: @@ -28,9 +28,7 @@ process TCOFFEE_ALIGN { def tree_args = tree ? "-usetree $tree" : "" def template_args = template ? "-template_file $template" : "" def outfile = compress ? "stdout" : "${prefix}.aln" - def write_output = compress ? " >(pigz -cp ${task.cpus} > ${prefix}.aln.gz)" : "" - // using >() is necessary to preserve the tcoffee return value, - // so nextflow knows to display an error when it failed + def write_output = compress ? " | pigz -cp ${task.cpus} > ${prefix}.aln.gz" : "" """ export TEMP='./' t_coffee -seq ${fasta} \ @@ -46,6 +44,7 @@ process TCOFFEE_ALIGN { # that does not support the stdout redirection if [ -f stdout ] && [ "$compress" = true ]; then pigz -cp ${task.cpus} < stdout > ${prefix}.aln.gz + rm stdout fi cat <<-END_VERSIONS > versions.yml diff --git a/nextflow.config b/nextflow.config index f923be95..c6e19208 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,17 +19,17 @@ params { // Stats skip_stats = false - calc_sim = true + calc_sim = false calc_seq_stats = true - extract_plddt = true + extract_plddt = false // Evaluation skip_eval = false calc_sp = true calc_tc = true - calc_irmsd = true + calc_irmsd = false calc_gaps = true - calc_tcs = true + calc_tcs = false skip_compression = false From 97e2a9e76692160e7cf13e8556341769eb26763c Mon Sep 17 00:00:00 2001 From: luisas Date: Wed, 5 Jun 2024 15:36:01 +0200 Subject: [PATCH 06/16] Update readme --- README.md | 1 + nextflow.config | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 16230b40..bc40f2f2 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ The reference MSAs (see samplesheet) are be used to evaluate the quality of the **STRUCTURE-BASED**: The provided structures (see samplesheet) are used to evaluate the quality of the alignment. + 4. iRMSD. Calculates the iRMSD using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_irmsd` (default: false). 
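For readers following the structure-based evaluation hunks above, a minimal sketch of a run that switches the iRMSD evaluation on might look as follows. This is illustrative only: `samplesheet.csv` and `toolsheet.csv` are placeholder paths (not files shipped in this patch), the samplesheet is assumed to have its `structures` column populated, and Docker is assumed to be available.

```bash
# Sketch: enable the structure-based iRMSD evaluation, which is off by default.
# samplesheet.csv and toolsheet.csv are placeholder inputs you provide yourself.
nextflow run nf-core/multiplesequencealign \
    -profile docker \
    --input samplesheet.csv \
    --tools toolsheet.csv \
    --calc_irmsd \
    --outdir ./results
```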
diff --git a/nextflow.config b/nextflow.config index c6e19208..6a7777bd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -198,7 +198,8 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } - test_pdb { includeConfig 'conf/test_pdb.config' } + test_pdb { includeConfig 'conf/test_pdb.config' } + test_parameters { includeConfig 'conf/test_parameters.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From dd7feb561f57b2b3329d181424249d4572315a05 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 12:27:50 +0200 Subject: [PATCH 07/16] Update tcoffee and readme --- README.md | 115 +---- conf/modules.config | 420 +++++++++---------- conf/test.config | 2 +- docs/extending.md | 6 +- docs/output.md | 25 +- docs/usage.md | 154 +++++-- modules.json | 4 +- modules/nf-core/tcoffee/align/main.nf | 4 +- modules/nf-core/tcoffee/tcs/main.nf.test | 29 -- modules/nf-core/tcoffee/tcs/tests/lib.config | 4 +- 10 files changed, 370 insertions(+), 393 deletions(-) delete mode 100644 modules/nf-core/tcoffee/tcs/main.nf.test diff --git a/README.md b/README.md index bc40f2f2..192a0c05 100644 --- a/README.md +++ b/README.md @@ -21,89 +21,14 @@ **nf-core/multiplesequencealign** is a pipeline to deploy and systematically evaluate Multiple Sequence Alignment (MSA) methods. -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/proteinfold/results). - ![Alt text](docs/images/nf-core-msa_metro_map.png?raw=true "nf-core-msa metro map") -1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. +In a nutshell, the pipeline performs the following streos: +1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. 2. **Guide Tree**: (Optional) Renders a guide tree. 3. **Align**: aligns the sequences. -4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. -5. 
**Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. Skip by passing `--skip_shiny` and `--skip_multiqc`. - - -### 1. INPUT FILES SUMMARY - -Summary information about the input fasta files are calculated. Skip by `--skip_stats`. - - 1. Sequence similarity. Calculates pairwise and average sequence similarity is calculated using TCOFFEE. Activate with `--calc_sim` (default: false). - 2. General summary. Calculates the number and the average length of sequences. Activate with `--calc_seq_stats` (default: true). - 3. Extract plddt. If the structures were generated by AF2, plddt is extracted and reported. Activate with `--extract_plddt` (default: false). - -### 2. GUIDE TREES - -Guide trees define the order in which sequences and profiles are aligned and hold a crucial role in determining the final MSA accuracy. Tree rendering techniques most commonly rely on pairwise distances between sequences. - -> **Note** -> None of the below listed aligner needs an explicit definition of a guidetree: if they need one, they compute their own default guide tree. This explicit definition of a guide tree is available in case you want to test non-default combination of guide trees and aligner methods. - - -Available GUIDE TREE methods (Optional): - -- [CLUSTALO](http://clustal.org/omega/#Documentation) -- [FAMSA](https://github.com/refresh-bio/FAMSA) - - -### 3. ALIGN - -Available ALIGN methods: - -**SEQUENCE-BASED** (only require a fasta file as input): -- [CLUSTALO](http://clustal.org/omega/#Documentation) (accepts guide tree) -- [FAMSA](https://github.com/refresh-bio/FAMSA) (accepts guide tree) -- [KALIGN](https://github.com/TimoLassmann/kalign) -- [LEARNMSA](https://github.com/Gaius-Augustus/learnMSA) -- [MAFFT](https://mafft.cbrc.jp/alignment/server/index.html) -- [MAGUS](https://github.com/vlasmirnov/MAGUS) (accepts guide tree) -- [MUSCLE5](https://drive5.com/muscle5/manual/) -- [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) (accepts guide tree) - -**SEQUENCE- and STRUCTURE-BASED** (require both fasta and structures as input): -- [3DCOFFEE](https://tcoffee.org/Projects/expresso/index.html) (accepts guide tree) - -**STRUCTURE-BASED** (only require stuctures as input): -- [MTMALIGN](https://bio.tools/mtm-align) - -### 4. EVALUATE - -Optionally, the produced MSAs can be evaluated. Skip with `--skip_eval`. - -**SEQUENCE-BASED** (no extra input required): -1. Calculate number of gaps and its average across sequences. Activate using `--calc_gaps` (default: true). - -**REFERENCE-BASED**: - -The reference MSAs (see samplesheet) are be used to evaluate the quality of the produced MSA. - -2. Sum Of Pairs. Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true). -3. Total column. Calculates the TC score [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html). Activate using `--calc_tc` (default: true). - -**STRUCTURE-BASED**: - -The provided structures (see samplesheet) are used to evaluate the quality of the alignment. - -4. iRMSD. Calculates the iRMSD using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_irmsd` (default: false). - - -### 5. REPORT - -Finally, a summary table with all the computed statistics and evaluations is reported in MultiQC (Skip by `--skip_multiqc`). -Moreover, a shiny app is prepared with interactive summary plots. 
-> [!WARNING] -> You will need to have [shiny](https://shiny.posit.co/py/) installed to run it! See [output documentation](https://nf-co.re/multiplesequencealign/output) for more infos. - +4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. +5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. ## Usage @@ -123,25 +48,14 @@ seatoxin,seatoxin.fa,seatoxin-ref.fa,seatoxin_structures toxin,toxin.fa,toxin-ref.fa,toxin_structures ``` -Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be processed. - -`id` is the name of the set of sequences. It can correspond to the protein family name or to an internal id. It should be unique. - -The column `fasta` contains the path to the fasta file that contains the sequencest be aligned. - -The column `reference` is optional and contains the path to the reference alignment. It is used for the reference-based evaluation steps. It can be left empty. - -The column `structures` is also optional and contains the path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and structure-based evaluation steps. It can be left empty. - +Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be aligned. > [!NOTE] -> You can have some samples with structures and/or references and some without. The pipeline will run the modules requiring structures/references only on the samples for which you have provided the required information and the others will be just skipped. +> The only required input is the id column and either fasta or structures. #### 2. TOOLSHEET -The reason why we provide a toolsheet as input is that if the pipeline needs to be used as benchmarking framework, we need to test multiple arguments per module. This can be done by having multiple entries in the toolsheet per module with the multiple arguments to be tested. - Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. It should look at foollows: @@ -155,18 +69,7 @@ FAMSA, -gt upgma -medoidtree, FAMSA, FAMSA,,REGRESSIVE, ``` > [!NOTE] -> Each of the trees and aligners are available as standalones, so args_tree and args_aligner can be left empty if you are cool with the default settings of each method. Tree can also be left empty, and the default guide tree will be used for each aligner. - -> [!NOTE] -> use the exact spelling as listed above! - -`tree` is the tool used to build the tree. (optional) - -Arguments to the tree tool can be provided using `args_tree`. Please refer to each tool's documentation. (optional) - -The `aligner` column contains the tool to run the alignment. (optional) - -Finally, the arguments to the aligner tool can be set by using the `args_alginer` column. (optional) +> The only required input is aligner. #### 3. RUN THE PIPELINE @@ -186,6 +89,10 @@ nextflow run nf-core/multiplesequencealign \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/multiplesequencealign/usage) and the [parameter documentation](https://nf-co.re/multiplesequencealign/parameters). 
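As a quick local check of the configurations touched in this patch series, the CI-style invocations from `.github/workflows/ci.yml` can be reproduced against the released pipeline name; this is a sketch assuming Docker is available, mirroring the `test`/`test_pdb` analysis matrix and the stub-run parameter matrix.

```bash
# Full test profiles exercised by CI (sequence-based and structure-based test data).
nextflow run nf-core/multiplesequencealign -profile test,docker --outdir ./results
nextflow run nf-core/multiplesequencealign -profile test_pdb,docker --outdir ./results

# Stub-run one of the skip flags from the CI parameter matrix.
nextflow run nf-core/multiplesequencealign -stub-run -profile test,docker --skip_eval --outdir ./results
```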
+## Extending the pipeline + +For details on how to add your favourite guide tree/MSA/evaluation step in nf-core/multiplesequencealign please refer to [extending documentation](https://github.com/luisas/multiplesequencealign/blob/luisa_patch/docs/extending.md). + ## Pipeline output To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/multiplesequencealign/results) tab on the nf-core website pipeline page. diff --git a/conf/modules.config b/conf/modules.config index 17700e6e..f3b421a9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,249 +1,249 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Config file for defining DSL2 per module options and publishing paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Available keys to override module options: - ext.args = Additional arguments appended to command in module. - ext.args2 = Second set of arguments appended to command in module (multi-tool modules). - ext.args3 = Third set of arguments appended to command in module (multi-tool modules). - ext.prefix = File name prefix for output files. ----------------------------------------------------------------------------------------- -*/ - -process { - - // ------------------------------------ - // Statistics about the input sequences - // ------------------------------------ - - withName: "CALCULATE_SEQSTATS"{ - publishDir = [ - path: { "${params.outdir}/summary/stats/sequences/seqstats" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } - ] - } - - withName: "CONCAT_SEQSTATS"{ - ext.prefix = { "summary_seqstats" } - } - - withName: "EXTRACT_PLDDT"{ - publishDir = [ - path: { "${params.outdir}/summary/stats/structures/plddt" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } - ] - } - - withName: "CONCAT_PLDDT"{ - ext.prefix = { "summary_plddt" } - } - - withName: TCOFFEE_SEQREFORMAT_SIM{ - ext.args = "-output=sim_idscore" - publishDir = [ - path: { "${params.outdir}/summary/stats/sequences/perc_sim" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ---------------------------------------------------------------------------------------- + */ + + process { + + // ------------------------------------ + // Statistics about the input sequences + // ------------------------------------ + + withName: "CALCULATE_SEQSTATS"{ + publishDir = [ + path: { "${params.outdir}/summary/stats/sequences/seqstats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? 
null : filename } + ] + } - withName: "CONCAT_SIMSTATS"{ - ext.prefix = { "summary_simstats" } - } + withName: "CONCAT_SEQSTATS"{ + ext.prefix = { "summary_seqstats" } + } - withName: "MERGE_STATS"{ - ext.prefix = { "complete_summary_stats" } - ext.args = "-f 1 -O" - publishDir = [ - path: { "${params.outdir}/stats/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: "EXTRACT_PLDDT"{ + publishDir = [ + path: { "${params.outdir}/summary/stats/structures/plddt" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } + ] + } + withName: "CONCAT_PLDDT"{ + ext.prefix = { "summary_plddt" } + } - // ------------------------------------ - // Tree building - // ------------------------------------ + withName: TCOFFEE_SEQREFORMAT_SIM{ + ext.args = "-output=sim_idscore" + publishDir = [ + path: { "${params.outdir}/summary/stats/sequences/perc_sim" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: "FAMSA_GUIDETREE"{ - tag = { "${meta.id} args:${meta.args_tree}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } - publishDir = [ - path: { "${params.outdir}/trees/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: "CONCAT_SIMSTATS"{ + ext.prefix = { "summary_simstats" } + } - withName: "CLUSTALO_GUIDETREE"{ - tag = { "${meta.id} args:${meta.args_tree}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } - publishDir = [ - path: { "${params.outdir}/trees/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: "MERGE_STATS"{ + ext.prefix = { "complete_summary_stats" } + ext.args = "-f 1 -O" + publishDir = [ + path: { "${params.outdir}/stats/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: "MAGUS_GUIDETREE"{ - tag = { "${meta.id} args:${meta.args_tree}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } - publishDir = [ - path: { "${params.outdir}/trees/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - // ------------------------------------ - // Alignment - // ------------------------------------ + // ------------------------------------ + // Tree building + // ------------------------------------ - withName: "CREATE_TCOFFEETEMPLATE"{ - ext.prefix = { "${meta.id}" } - } + withName: "FAMSA_GUIDETREE"{ + tag = { "${meta.id} args:${meta.args_tree}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } + publishDir = [ + path: { "${params.outdir}/trees/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: "CLUSTALO_GUIDETREE"{ + tag = { "${meta.id} args:${meta.args_tree}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } + publishDir = [ + path: { "${params.outdir}/trees/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN"{ - tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } - ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } - if(params.skip_compression){ + withName: "MAGUS_GUIDETREE"{ + tag = { "${meta.id} args:${meta.args_tree}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } publishDir = [ - path: { "${params.outdir}/alignment/${meta.id}" }, + path: { "${params.outdir}/trees/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - } - withName: "MTMALIGN_ALIGN"{ - tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } - ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } - if(params.skip_compression){ + // ------------------------------------ + // Alignment + // ------------------------------------ + + withName: "CREATE_TCOFFEETEMPLATE"{ + ext.prefix = { "${meta.id}" } + } + + + withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN"{ + tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } + ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } + if(params.skip_compression){ + publishDir = [ + path: { "${params.outdir}/alignment/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + withName: "MTMALIGN_ALIGN"{ + tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } + ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } + if(params.skip_compression){ + publishDir = [ + path: { "${params.outdir}/alignment/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.aln" + ] + } + + } + + withName:"PIGZ_COMPRESS"{ publishDir = [ path: { "${params.outdir}/alignment/${meta.id}" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - pattern: "*.aln" - ] + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] } - } - - withName:"PIGZ_COMPRESS"{ - publishDir = [ - path: { "${params.outdir}/alignment/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + // ------------------------------------ + // Alignment evaluation + // ------------------------------------ - // ------------------------------------ - // Alignment evaluation - // ------------------------------------ + withName: 'PARSE_IRMSD'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } + } - withName: 'PARSE_IRMSD'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } - } + withName: 'TCOFFEE_ALNCOMPARE_SP'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_sp" } + ext.args = "-compare_mode sp" + } - withName: 'TCOFFEE_ALNCOMPARE_SP'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_sp" } - ext.args = "-compare_mode sp" - } + withName: 'TCOFFEE_ALNCOMPARE_TC'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tc" } + ext.args = "-compare_mode tc" + } - withName: 'TCOFFEE_ALNCOMPARE_TC'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tc" } - ext.args = "-compare_mode tc" - } + withName: 'TCOFFEE_IRMSD'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } + publishDir = [ + path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: 'TCOFFEE_IRMSD'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } - publishDir = [ - path: { "${params.outdir}/evaluation/summary/${task.process.tokenize(':')[-1].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } + withName: "CALC_GAPS"{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_gaps" } + } - withName: "CALC_GAPS"{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_gaps" } - } + withName: "CONCAT_IRMSD"{ + ext.prefix = { "summary_irmsd" } + } - withName: "CONCAT_IRMSD"{ - ext.prefix = { "summary_irmsd" } - } + withName: "CONCAT_GAPS"{ + ext.prefix = { "summary_gaps" } + } - withName: "CONCAT_GAPS"{ - ext.prefix = { "summary_gaps" } - } + withName: "CONCAT_SP"{ + ext.prefix = { "summary_sp" } + } - withName: "CONCAT_SP"{ - ext.prefix = { "summary_sp" } - } + withName: "CONCAT_TC"{ + ext.prefix = { "summary_tc" } + } - withName: "CONCAT_TC"{ - ext.prefix = { "summary_tc" } - } + withName: "CONCAT_TCS"{ + ext.prefix = { "summary_tcs" } + } - withName: "CONCAT_TCS"{ - ext.prefix = { "summary_tcs" } - } + withName: 'TCOFFEE_TCS'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tcs" } + publishDir = [ + path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: 'TCOFFEE_TCS'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tcs" } - publishDir = [ - path: { "${params.outdir}/evaluation/summary/${task.process.tokenize(':')[-1].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: "MERGE_EVAL"{ + ext.prefix = { "complete_summary_eval" } + ext.args = "-f 1,2,3,4,5,6,7 -O" + publishDir = [ + path: { "${params.outdir}/evaluation/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: "MERGE_EVAL"{ - ext.prefix = { "complete_summary_eval" } - ext.args = "-f 1,2,3,4,5,6,7 -O" - publishDir = [ - path: { "${params.outdir}/evaluation/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: "MERGE_STATS_EVAL"{ + ext.prefix = { "complete_summary_stats_eval" } + ext.args = "-f 1 -O" + publishDir = [ + path: { "${params.outdir}/summary/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: "MERGE_STATS_EVAL"{ - ext.prefix = { "complete_summary_stats_eval" } - ext.args = "-f 1 -O" - publishDir = [ - path: { "${params.outdir}/summary/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } + // ------------------------------------ + // Shiny app + // ------------------------------------ + withName: 'PREPARE_SHINY' { + publishDir = [ + path: { "${params.outdir}/shiny_app" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - // ------------------------------------ - // Shiny app - // ------------------------------------ - withName: 'PREPARE_SHINY' { - publishDir = [ - path: { "${params.outdir}/shiny_app" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] } - -} diff --git a/conf/test.config b/conf/test.config index c7787b6d..4d8bef0c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -34,7 +34,7 @@ params { // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - + // Output directory outdir = "./outdir/" diff --git a/docs/extending.md b/docs/extending.md index 54ce2885..7822ef01 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -16,8 +16,6 @@ Useful resources are: The pipeline consists of four different subworkflows, one for computing the guidetrees of guidetree-based methods, one for performing the MSAs, one for evaluating the produced MSAs and one for computing statistics about the input dataset. The subworkflows are to a significant degree isolated from each other, and not all of them may run in any given execution of the pipeline. -`subworkflows/local/evaluate.nf` handles the evaluation step. It calls the modules used for evaluation and merges their output into some summary statistics. -If it is not skipped, it is the last part of the pipeline to run. ## Adding an aligner @@ -52,14 +50,14 @@ Congratulations, your guide tree estimator is now in nf-core/multiplesequencalig Adding a new evaluation module into the pipeline is a bit more tricky, since the output of the evaluation modules gets processed and merged in different ways in the pipeline. This requires changes in the `evaluate.nf` subworkflow and the pipeline config as well as adding an option to the main pipeline. -The process of adding `ULTRAMSATRIC` evaluation to the pipeline may be a useful reference: [commit history](https://github.com/lrauschning/multiplesequencealign/commits/ultramsatric/). + In general, the process of adding another evaluation module to the pipeline can be thought of as three steps: 1. Create a local or nf-core module. - Make sure the evaluation output is returned from the module in CSV format! - For merging the correct evaluation files in reporting the final output, the pipeline uses the `meta` field containing the tools to use. This information has to be included in the CSV returned by the module so as to merge it later. - - Have a look at how `TCOFFEE_ALNCOMPARE` and `ULTRAMSATRIC` handle this. + - Have a look at how `TCOFFEE_ALNCOMPARE` handles this. 2. 
Include the evaluation module in the evaluation subworkflow diff --git a/docs/output.md b/docs/output.md index 98627e19..d24a8375 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,16 +10,20 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. +2. **Guide Tree**: (Optional) Renders a guide tree. +3. **Align**: aligns the sequences. +4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. +5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. Skip by passing `--skip_shiny` and `--skip_multiqc`. ## Summary statistics of input files +The stats.nf subworkflow collects statistics about the input files and summarizes them into a final csv file. +
Output files -- `stats/` +- `summary/stats/` - `complete_summary_stats.csv`: csv file containing the summary for all the statistics computed on the input file. - `sequences/` - `seqstats/*_seqstats.csv`: file containing the sequence input length for each sequence in the family defined by the file name. If `--calc_seq_stats` is specified. @@ -27,22 +31,24 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `structures/` - `plddt/*_full_plddt.csv`: file containing the plddt of the structures for each sequence in the input file. If `--extract_plddt` is specified.
-The stats.nf subworkflow collects statistics about the input files and summarizes them into a final csv file.

## Trees

+If you explicitly specified (via the toolsheet) to compute guidetrees to be used by the MSA tool, those are stored here.
+
Output files - `trees/` - - `*.dnd`: guide tree files. + - `*/*.dnd`: guide tree files.
-If you explicitly specifified (via the toolsheet) to compute guidetrees to be used by the MSA tool, those are stored here.

## Alignment

+All computed MSAs are stored here.
+
Output files @@ -53,10 +59,11 @@ If you explicitly specifified (via the toolsheet) to compute guidetrees to be us
-All MSA computed are stored here. ## Evaluation +Stores the files with the summary of the computed evaluation statistics. +
Output files @@ -83,7 +90,7 @@ To run the shiny app: `cd shiny_app` `./run.sh` -Be aware that you have to have shiny installed to access this feature. +Be aware that you have to have [shiny](https://shiny.posit.co/py/) installed to access this feature. ### MultiQC diff --git a/docs/usage.md b/docs/usage.md index 79b2fa13..1e928653 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,51 +6,145 @@ ## Introduction - +**nf-core/multiplesequencealign** is a pipeline to deploy and systematically evaluate Multiple Sequence Alignment (MSA) methods. + +The main steps of the pipeline are: + +1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. +2. **Guide Tree**: (Optional) Renders a guide tree. +3. **Align**: aligns the sequences. +4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. +5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. Skip by passing `--skip_shiny` and `--skip_multiqc`. + +### 1. INPUT FILES SUMMARY + +Summary information about the input fasta files are calculated. Skip by `--skip_stats`. + + 1. Sequence similarity. Calculates pairwise and average sequence similarity is calculated using TCOFFEE. Activate with `--calc_sim` (default: false). + 2. General summary. Calculates the number and the average length of sequences. Activate with `--calc_seq_stats` (default: true). + 3. Extract plddt. If the structures were generated by AF2, plddt is extracted and reported. Activate with `--extract_plddt` (default: false). + +### 2. GUIDE TREES + +Guide trees define the order in which sequences and profiles are aligned and hold a crucial role in determining the final MSA accuracy. Tree rendering techniques most commonly rely on pairwise distances between sequences. + +> **Note** +> None of the below listed aligner needs an explicit definition of a guidetree: if they need one, they compute their own default guide tree. This explicit definition of a guide tree is available in case you want to test non-default combination of guide trees and aligner methods. + + +Available GUIDE TREE methods (Optional): + +- [CLUSTALO](http://clustal.org/omega/#Documentation) +- [FAMSA](https://github.com/refresh-bio/FAMSA) + + +### 3. ALIGN + +Available ALIGN methods: + +**SEQUENCE-BASED** (only require a fasta file as input): +- [CLUSTALO](http://clustal.org/omega/#Documentation) (accepts guide tree) +- [FAMSA](https://github.com/refresh-bio/FAMSA) (accepts guide tree) +- [KALIGN](https://github.com/TimoLassmann/kalign) +- [LEARNMSA](https://github.com/Gaius-Augustus/learnMSA) +- [MAFFT](https://mafft.cbrc.jp/alignment/server/index.html) +- [MAGUS](https://github.com/vlasmirnov/MAGUS) (accepts guide tree) +- [MUSCLE5](https://drive5.com/muscle5/manual/) +- [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) (accepts guide tree) + +**SEQUENCE- and STRUCTURE-BASED** (require both fasta and structures as input): +- [3DCOFFEE](https://tcoffee.org/Projects/expresso/index.html) (accepts guide tree) + +**STRUCTURE-BASED** (only require stuctures as input): +- [MTMALIGN](https://bio.tools/mtm-align) + +### 4. EVALUATE + +Optionally, the produced MSAs can be evaluated. Skip with `--skip_eval`. 
+ +**SEQUENCE-BASED** (no extra input required): +1. Calculate number of gaps and its average across sequences. Activate using `--calc_gaps` (default: true). + +**REFERENCE-BASED**: + +The reference MSAs (see samplesheet) are be used to evaluate the quality of the produced MSA. + +2. Sum Of Pairs. Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true). +3. Total column. Calculates the TC score [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html). Activate using `--calc_tc` (default: true). + +**STRUCTURE-BASED**: + +The provided structures (see samplesheet) are used to evaluate the quality of the alignment. + +4. iRMSD. Calculates the iRMSD using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_irmsd` (default: false). + + +### 5. REPORT + +Finally, a summary table with all the computed statistics and evaluations is reported in MultiQC (Skip by `--skip_multiqc`). +Moreover, a shiny app is prepared with interactive summary plots. +> [!WARNING] +> You will need to have [shiny](https://shiny.posit.co/py/) installed to run it! See [output documentation](https://nf-co.re/multiplesequencealign/output) for more infos. + ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +The sample sheet defines the input data that the pipeline will process. +It should look like this: -```bash ---input '[path to samplesheet file]' +`samplesheet.csv`: + +```csv +id,fasta,reference,structures +seatoxin,seatoxin.fa,seatoxin-ref.fa,seatoxin_structures +toxin,toxin.fa,toxin-ref.fa,toxin_structures ``` -### Multiple runs of the same sample +Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be processed. -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` +| Column | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `id` | Required. Name of the set of sequences. It can correspond to the protein family name or to an internal id. It should be unique. | +| `fasta` | Required (At least one of fasta and structures must be provided). Full path to the fasta file that contains the sequence to be aligned. | +| `reference` | Optional. Full path to the reference alignment. It is used for the reference-based evaluation steps. It can be left empty. | +| `structures` | Required (At least one of fasta and structures must be provided). 
Full path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and structure-based evaluation steps. It can be left empty.
 |
+

+> [!NOTE]
+> You can have some samples with structures and/or references and some without. The pipeline will run the modules requiring structures/references only on the samples for which you have provided the required information; the others will just be skipped.
+

-### Full samplesheet
+## Toolsheet input

-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+The reason why we provide a toolsheet as input is that, if the pipeline is to be used as a benchmarking framework, we need to test multiple arguments per module. This can be done by having multiple entries in the toolsheet per module with the multiple arguments to be tested.

-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used.

-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+It should look as follows:
+
+`toolsheet.csv`:
+
+```csv
+tree,args_tree,aligner,args_aligner,
+FAMSA, -gt upgma -medoidtree, FAMSA,
+, ,TCOFFEE,
+FAMSA,,REGRESSIVE,
 ```

+> [!NOTE]
+> Each of the trees and aligners is available as a standalone, so args_tree and args_aligner can be left empty if you are cool with the default settings of each method. Tree can also be left empty, and the default guide tree will be used for each aligner.
+
+> [!NOTE]
+> Use the exact spelling as listed above!
+
+`tree` is the tool used to build the tree. (optional)
+
+Arguments to the tree tool can be provided using `args_tree`. Please refer to each tool's documentation. (optional)

-| Column | Description |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+The `aligner` column contains the tool to run the alignment. (optional)

-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+Finally, the arguments to the aligner tool can be set by using the `args_aligner` column. 
(optional) ## Running the pipeline diff --git a/modules.json b/modules.json index 8cae7016..0f23731d 100644 --- a/modules.json +++ b/modules.json @@ -127,7 +127,7 @@ }, "tcoffee/align": { "branch": "master", - "git_sha": "5c82ca0a942f2793859bb2f25601eb69c50590dc", + "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", "installed_by": [ "modules" ] @@ -155,7 +155,7 @@ }, "tcoffee/tcs": { "branch": "master", - "git_sha": "2d5ea4959c36da8c21e74a0c5e8ecc6b101b999e", + "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", "installed_by": [ "modules" ] diff --git a/modules/nf-core/tcoffee/align/main.nf b/modules/nf-core/tcoffee/align/main.nf index 77f122f3..a14964c9 100644 --- a/modules/nf-core/tcoffee/align/main.nf +++ b/modules/nf-core/tcoffee/align/main.nf @@ -16,7 +16,7 @@ process TCOFFEE_ALIGN { output: tuple val(meta), path("*.aln{.gz,}"), emit: alignment // in the args there might be the request to generate a lib file, so the following is an optional output - tuple val(meta), path("* lib") , emit: lib, optional : true + tuple val(meta), path("*.*lib") , emit: lib, optional : true path "versions.yml" , emit: versions when: @@ -38,7 +38,7 @@ process TCOFFEE_ALIGN { -thread ${task.cpus} \ -outfile $outfile \ $write_output - + # If stdout file exist and compress is true, then compress the file # This is a patch for the current behaviour of the regressive algorithm # that does not support the stdout redirection diff --git a/modules/nf-core/tcoffee/tcs/main.nf.test b/modules/nf-core/tcoffee/tcs/main.nf.test deleted file mode 100644 index 6bf1f81e..00000000 --- a/modules/nf-core/tcoffee/tcs/main.nf.test +++ /dev/null @@ -1,29 +0,0 @@ -nextflow_process { - - name "Test Process TCOFFEE_TCS" - script "modules/nf-core/tcoffee/tcs/main.nf" - process "TCOFFEE_TCS" - - test("Should run without failures") { - - when { - params { - // define parameters here. Example: - // outdir = "tests/results" - } - process { - """ - // define inputs of the process here. 
Example: - // input[0] = file("test-file.txt") - """ - } - } - - then { - assert process.success - assert snapshot(process.out).match() - } - - } - -} diff --git a/modules/nf-core/tcoffee/tcs/tests/lib.config b/modules/nf-core/tcoffee/tcs/tests/lib.config index 723c92d5..56712f63 100644 --- a/modules/nf-core/tcoffee/tcs/tests/lib.config +++ b/modules/nf-core/tcoffee/tcs/tests/lib.config @@ -1,5 +1,5 @@ process { withName: "TCOFFEE_ALIGN"{ - ext.args = { "-output fasta_aln -out_lib=sample_lib1.tc_lib" } + ext.args = { "-output fasta_aln -out_lib sample_lib1.tc_lib" } } -} \ No newline at end of file +} From a24de25826b77882d7a316267b4abc4bff14a354 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 12:46:25 +0200 Subject: [PATCH 08/16] Update readme --- .github/workflows/ci.yml | 4 +- .nf-core.yml | 3 +- README.md | 37 ++++----- assets/schema_input.json | 2 +- docs/extending.md | 1 - docs/output.md | 13 ++- docs/usage.md | 83 ++++++++++--------- modules.json | 108 +++++++------------------ modules/nf-core/mtmalign/align/main.nf | 4 +- subworkflows/local/compute_trees.nf | 2 - subworkflows/local/evaluate.nf | 1 - 11 files changed, 102 insertions(+), 156 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0c5c5980..7a1aeb48 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: NXF_VER: - "24.04.1" - "latest-everything" - ANALYSIS: + ANALYSIS: - "test" - "test_pdb" - "test_parameters" @@ -60,7 +60,7 @@ jobs: - "--skip_eval" - "--skip_compression" - "--skip_shiny" - + steps: - name: Check out pipeline code uses: actions/checkout@v4 diff --git a/.nf-core.yml b/.nf-core.yml index 9491102a..e164e770 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,5 +2,4 @@ repository_type: pipeline nf_core_version: "2.14.1" lint: multiqc_config: False - files_exist: - conf/igenomes.config \ No newline at end of file + files_exist: conf/igenomes.config diff --git a/README.md b/README.md index 192a0c05..9ff942f2 100644 --- a/README.md +++ b/README.md @@ -23,22 +23,23 @@ ![Alt text](docs/images/nf-core-msa_metro_map.png?raw=true "nf-core-msa metro map") -In a nutshell, the pipeline performs the following streos: -1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. -2. **Guide Tree**: (Optional) Renders a guide tree. -3. **Align**: aligns the sequences. -4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. -5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. +In a nutshell, the pipeline performs the following steps: + +1. **Input files summary**: (Optional) computation of summary statistics on the input files, such as the average sequence similarity across the input sequences, their length, plddt extraction if available, etc. +2. **Guide Tree**: (Optional) Renders a guide tree. +3. **Align**: (Required) Aligns the sequences. +4. **Evaluate**: (Optional) Evaluates the generated alignments with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. +5. **Report**: Reports the collected information of the runs in a shiny app and a summary table in MultiQC. 
## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - #### 1. SAMPLESHEET -The sample sheet defines the input data that the pipeline will process. -It should look like this: + +The sample sheet defines the input data that the pipeline will process. +It should look like this: `samplesheet.csv`: @@ -53,26 +54,26 @@ Each row represents a set of sequences (in this case the seatoxin and toxin prot > [!NOTE] > The only required input is the id column and either fasta or structures. - #### 2. TOOLSHEET -Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. +Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. -It should look at foollows: +It should look at foollows: `toolsheet.csv`: ```csv tree,args_tree,aligner,args_aligner, FAMSA, -gt upgma -medoidtree, FAMSA, -, ,TCOFFEE, +, ,TCOFFEE, FAMSA,,REGRESSIVE, ``` + > [!NOTE] > The only required input is aligner. - #### 3. RUN THE PIPELINE + Now, you can run the pipeline using: ```bash @@ -89,16 +90,16 @@ nextflow run nf-core/multiplesequencealign \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/multiplesequencealign/usage) and the [parameter documentation](https://nf-co.re/multiplesequencealign/parameters). -## Extending the pipeline - -For details on how to add your favourite guide tree/MSA/evaluation step in nf-core/multiplesequencealign please refer to [extending documentation](https://github.com/luisas/multiplesequencealign/blob/luisa_patch/docs/extending.md). - ## Pipeline output To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/multiplesequencealign/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/multiplesequencealign/output). +## Extending the pipeline + +For details on how to add your favourite guide tree/MSA/evaluation step in nf-core/multiplesequencealign please refer to [extending documentation](https://github.com/luisas/multiplesequencealign/blob/luisa_patch/docs/extending.md). + ## Credits nf-core/multiplesequencealign was originally written by Luisa Santus ([@luisas](https://github.com/luisas)) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from The Comparative Bioinformatics Group at The Centre for Genomic Regulation, Spain. diff --git a/assets/schema_input.json b/assets/schema_input.json index 1678b9d9..a2770af7 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -28,7 +28,7 @@ "type": "string" } }, - "required": ["id"], + "required": ["id"], "anyOf": [{ "required": ["fasta"] }, { "required": ["structures"] }] } } diff --git a/docs/extending.md b/docs/extending.md index 7822ef01..65d626f1 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -16,7 +16,6 @@ Useful resources are: The pipeline consists of four different subworkflows, one for computing the guidetrees of guidetree-based methods, one for performing the MSAs, one for evaluating the produced MSAs and one for computing statistics about the input dataset. 
The subworkflows are to a significant degree isolated from each other, and not all of them may run in any given execution of the pipeline. - ## Adding an aligner 1. Create a local or nf-core module and ensure the output is in FASTA format diff --git a/docs/output.md b/docs/output.md index d24a8375..4bb6a2dd 100644 --- a/docs/output.md +++ b/docs/output.md @@ -11,12 +11,12 @@ The directories listed below will be created in the results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: 1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. -2. **Guide Tree**: (Optional) Renders a guide tree. +2. **Guide Tree**: (Optional) Renders a guide tree. 3. **Align**: aligns the sequences. 4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. 5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. Skip by passing `--skip_shiny` and `--skip_multiqc`. -## Summary statistics of input files +## Input files summary The stats.nf subworkflow collects statistics about the input files and summarizes them into a final csv file. @@ -31,7 +31,6 @@ The stats.nf subworkflow collects statistics about the input files and summarize - `structures/` - `plddt/*_full_plddt.csv`: file containing the plddt of the structures for each sequence in the input file. If `--extract_plddt` is specified.
-
 ## Trees

 If you explicitly specified (via the toolsheet) to compute guidetrees to be used by the MSA tool, those are stored here.
@@ -44,7 +43,6 @@


-
 ## Alignment

 All computed MSAs are stored here.
@@ -53,16 +51,15 @@
 Output files

 - `alignment/`
-  - `*/*.fa`: each subdirectory is called as the input file. It contains all the alignments computed on it. The filename contains all the informations of the input file used and the tool.
+  - `*/*.fa`: each subdirectory is named after the sample id. It contains all the alignments computed for that sample. The filename contains all the information about the input file and the tool used.

 The file naming convention is: {Input*file}*{Tree}_args-{Tree_args}_{MSA}\_args-{MSA_args}.aln

-

 ## Evaluation

-Stores the files with the summary of the computed evaluation statistics.
+Files with the summary of the computed evaluation statistics.
Output files @@ -73,7 +70,7 @@ Stores the files with the summary of the computed evaluation statistics. - `complete_summary_eval.csv`: csv file containing the summary of all evaluation metrics for each input file.
-## shiny_app +## Shiny App
Output files diff --git a/docs/usage.md b/docs/usage.md index 1e928653..6a06ee69 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,41 +8,40 @@ **nf-core/multiplesequencealign** is a pipeline to deploy and systematically evaluate Multiple Sequence Alignment (MSA) methods. -The main steps of the pipeline are: +The main steps of the pipeline are: 1. **Input files summary**: (Optional) computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. -2. **Guide Tree**: (Optional) Renders a guide tree. +2. **Guide Tree**: (Optional) Renders a guide tree. 3. **Align**: aligns the sequences. 4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. 5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. Skip by passing `--skip_shiny` and `--skip_multiqc`. -### 1. INPUT FILES SUMMARY +### 1. INPUT FILES SUMMARY Summary information about the input fasta files are calculated. Skip by `--skip_stats`. - 1. Sequence similarity. Calculates pairwise and average sequence similarity is calculated using TCOFFEE. Activate with `--calc_sim` (default: false). - 2. General summary. Calculates the number and the average length of sequences. Activate with `--calc_seq_stats` (default: true). - 3. Extract plddt. If the structures were generated by AF2, plddt is extracted and reported. Activate with `--extract_plddt` (default: false). +1. Sequence similarity. Calculates pairwise and average sequence similarity is calculated using TCOFFEE. Activate with `--calc_sim` (default: false). +2. General summary. Calculates the number and the average length of sequences. Activate with `--calc_seq_stats` (default: true). +3. Extract plddt. If the structures were generated by AF2, plddt is extracted and reported. Activate with `--extract_plddt` (default: false). ### 2. GUIDE TREES -Guide trees define the order in which sequences and profiles are aligned and hold a crucial role in determining the final MSA accuracy. Tree rendering techniques most commonly rely on pairwise distances between sequences. +Guide trees define the order in which sequences and profiles are aligned and hold a crucial role in determining the final MSA accuracy. Tree rendering techniques most commonly rely on pairwise distances between sequences. > **Note** -> None of the below listed aligner needs an explicit definition of a guidetree: if they need one, they compute their own default guide tree. This explicit definition of a guide tree is available in case you want to test non-default combination of guide trees and aligner methods. - +> None of the below listed aligner needs an explicit definition of a guidetree: if they need one, they compute their own default guide tree. This explicit definition of a guide tree is available in case you want to test non-default combination of guide trees and aligner methods. Available GUIDE TREE methods (Optional): - [CLUSTALO](http://clustal.org/omega/#Documentation) - [FAMSA](https://github.com/refresh-bio/FAMSA) - ### 3. 
ALIGN Available ALIGN methods: **SEQUENCE-BASED** (only require a fasta file as input): + - [CLUSTALO](http://clustal.org/omega/#Documentation) (accepts guide tree) - [FAMSA](https://github.com/refresh-bio/FAMSA) (accepts guide tree) - [KALIGN](https://github.com/TimoLassmann/kalign) @@ -53,44 +52,46 @@ Available ALIGN methods: - [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) (accepts guide tree) **SEQUENCE- and STRUCTURE-BASED** (require both fasta and structures as input): + - [3DCOFFEE](https://tcoffee.org/Projects/expresso/index.html) (accepts guide tree) **STRUCTURE-BASED** (only require stuctures as input): -- [MTMALIGN](https://bio.tools/mtm-align) + +- [MTMALIGN](https://bio.tools/mtm-align) ### 4. EVALUATE Optionally, the produced MSAs can be evaluated. Skip with `--skip_eval`. -**SEQUENCE-BASED** (no extra input required): +**SEQUENCE-BASED** (no extra input required): + 1. Calculate number of gaps and its average across sequences. Activate using `--calc_gaps` (default: true). -**REFERENCE-BASED**: +**REFERENCE-BASED**: -The reference MSAs (see samplesheet) are be used to evaluate the quality of the produced MSA. +The reference MSAs (see samplesheet) are be used to evaluate the quality of the produced MSA. 2. Sum Of Pairs. Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true). 3. Total column. Calculates the TC score [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html). Activate using `--calc_tc` (default: true). **STRUCTURE-BASED**: -The provided structures (see samplesheet) are used to evaluate the quality of the alignment. +The provided structures (see samplesheet) are used to evaluate the quality of the alignment. 4. iRMSD. Calculates the iRMSD using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_irmsd` (default: false). - ### 5. REPORT Finally, a summary table with all the computed statistics and evaluations is reported in MultiQC (Skip by `--skip_multiqc`). -Moreover, a shiny app is prepared with interactive summary plots. +Moreover, a shiny app is prepared with interactive summary plots. + > [!WARNING] > You will need to have [shiny](https://shiny.posit.co/py/) installed to run it! See [output documentation](https://nf-co.re/multiplesequencealign/output) for more infos. - ## Samplesheet input -The sample sheet defines the input data that the pipeline will process. -It should look like this: +The sample sheet defines the input data that the pipeline will process. +It should look like this: `samplesheet.csv`: @@ -102,41 +103,38 @@ toxin,toxin.fa,toxin-ref.fa,toxin_structures Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be processed. - -| Column | Description | -| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `id` | Required. Name of the set of sequences. It can correspond to the protein family name or to an internal id. It should be unique. | -| `fasta` | Required (At least one of fasta and structures must be provided). Full path to the fasta file that contains the sequence to be aligned. | -| `reference` | Optional. Full path to the reference alignment. 
It is used for the reference-based evaluation steps. It can be left empty. | -| `structures` | Required (At least one of fasta and structures must be provided). Full path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and structure-based evaluation steps. It can be left empty. - | - +| Column | Description | +| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `id` | Required. Name of the set of sequences. It can correspond to the protein family name or to an internal id. It should be unique. | +| `fasta` | Required (At least one of fasta and structures must be provided). Full path to the fasta file that contains the sequence to be aligned. | +| `reference` | Optional. Full path to the reference alignment. It is used for the reference-based evaluation steps. It can be left empty. | +| `structures` | Required (At least one of fasta and structures must be provided). Full path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and structure-based evaluation steps. It can be left empty. | > [!NOTE] -> You can have some samples with structures and/or references and some without. The pipeline will run the modules requiring structures/references only on the samples for which you have provided the required information and the others will be just skipped. - +> You can have some samples with structures and/or references and some without. The pipeline will run the modules requiring structures/references only on the samples for which you have provided the required information and the others will be just skipped. ## Toolsheet input -The reason why we provide a toolsheet as input is that if the pipeline needs to be used as benchmarking framework, we need to test multiple arguments per module. This can be done by having multiple entries in the toolsheet per module with the multiple arguments to be tested. +The reason why we provide a toolsheet as input is that if the pipeline needs to be used as benchmarking framework, we need to test multiple arguments per module. This can be done by having multiple entries in the toolsheet per module with the multiple arguments to be tested. -Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. +Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. -It should look at foollows: +It should look at foollows: `toolsheet.csv`: ```csv tree,args_tree,aligner,args_aligner, FAMSA, -gt upgma -medoidtree, FAMSA, -, ,TCOFFEE, +, ,TCOFFEE, FAMSA,,REGRESSIVE, ``` + > [!NOTE] -> Each of the trees and aligners are available as standalones, so args_tree and args_aligner can be left empty if you are cool with the default settings of each method. Tree can also be left empty, and the default guide tree will be used for each aligner. +> Each of the trees and aligners are available as standalones, so args_tree and args_aligner can be left empty if you are cool with the default settings of each method. Tree can also be left empty, and the default guide tree will be used for each aligner. > [!NOTE] -> use the exact spelling as listed above! 
+> use the exact spelling as listed above! `tree` is the tool used to build the tree. (optional) @@ -146,12 +144,19 @@ The `aligner` column contains the tool to run the alignment. (optional) Finally, the arguments to the aligner tool can be set by using the `args_alginer` column. (optional) +| Column | Description | +| -------------- | -------------------------------------------------------------------------------- | +| `tree` | Optional. Tool used to build the tree. | +| `args_tree` | Optional. Arguments to the tree tool. Please refer to each tool's documentation. | +| `aligner` | Required. Tool to run the alignment. Available options listed above. | +| `args_aligner` | Optional. Arguments to the alignment tool | + ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/multiplesequencealign --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/multiplesequencealign --input ./samplesheet.csv --tools ./toolsheet.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -183,8 +188,8 @@ with `params.yaml` containing: ```yaml input: './samplesheet.csv' +tools: "./toolsheet.csv" outdir: './results/' -genome: 'GRCh37' <...> ``` diff --git a/modules.json b/modules.json index 0f23731d..855615db 100644 --- a/modules.json +++ b/modules.json @@ -8,164 +8,118 @@ "clustalo/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "clustalo/guidetree": { "branch": "master", "git_sha": "1f253ec05723293df7757af8769f8389b7a1884e", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "csvtk/concat": { "branch": "master", "git_sha": "cfe2a24902bfdfe8132f11461ffda92d257f9f09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "csvtk/join": { "branch": "master", "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/csvtk/join/csvtk-join.diff" }, "famsa/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "famsa/guidetree": { "branch": "master", "git_sha": "46789a4621be261f10dab0033f46f34779a5afc9", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastqc": { "branch": "master", "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "kalign/align": { "branch": "master", "git_sha": "7afd02d048ad0100be37fa1741816265c4aa307c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "learnmsa/align": { "branch": "master", "git_sha": "62007703c84bcfef92ce9e4a57cb1cc382917201", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "mafft": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "magus/align": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "magus/guidetree": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "mtmalign/align": { "branch": "master", - "git_sha": 
"e2ea4d9fd6f326d51e468edc4b9cb1d6d4082be5", - "installed_by": [ - "modules" - ] + "git_sha": "7bfb142c3729c1c76198c237a614215d92fe935c", + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "muscle5/super5": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pigz/compress": { "branch": "master", "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pigz/uncompress": { "branch": "master", "git_sha": "d7f0de8aae7bf84b080dfdcf4e294bf11a46a51c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/align": { "branch": "master", "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/alncompare": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/irmsd": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/seqreformat": { "branch": "master", "git_sha": "b04c647f465bea2c5bb9871503182236cd65b246", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/tcs": { "branch": "master", "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "untar": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -174,26 +128,20 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} diff --git a/modules/nf-core/mtmalign/align/main.nf b/modules/nf-core/mtmalign/align/main.nf index e539bb61..933d2c74 100644 --- a/modules/nf-core/mtmalign/align/main.nf +++ b/modules/nf-core/mtmalign/align/main.nf @@ -16,7 +16,7 @@ process MTMALIGN_ALIGN { output: tuple val(meta), path("${prefix}.aln${compress ? '.gz' : ''}"), emit: alignment tuple val(meta), path("${prefix}.pdb${compress ? '.gz' : ''}"), emit: structure - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -59,7 +59,7 @@ process MTMALIGN_ALIGN { stub: prefix = task.ext.prefix ?: "${meta.id}" - """ + """ touch ${prefix}.aln${compress ? '.gz' : ''} touch ${prefix}.pdb${compress ? 
'.gz' : ''} diff --git a/subworkflows/local/compute_trees.nf b/subworkflows/local/compute_trees.nf index aa3d6c34..57801c30 100644 --- a/subworkflows/local/compute_trees.nf +++ b/subworkflows/local/compute_trees.nf @@ -4,8 +4,6 @@ include { FAMSA_GUIDETREE } from '../../modules/nf-core/famsa/guidetree/main' include { CLUSTALO_GUIDETREE } from '../../modules/nf-core/clustalo/guidetree/main' -include { MAGUS_GUIDETREE } from '../../modules/nf-core/magus/guidetree/main' - workflow COMPUTE_TREES { diff --git a/subworkflows/local/evaluate.nf b/subworkflows/local/evaluate.nf index 3c2a60df..3c5dd6a9 100644 --- a/subworkflows/local/evaluate.nf +++ b/subworkflows/local/evaluate.nf @@ -1,6 +1,5 @@ -include { PIGZ_UNCOMPRESS } from '../../modules/nf-core/pigz/uncompress/main.nf' include { TCOFFEE_ALNCOMPARE as TCOFFEE_ALNCOMPARE_SP } from '../../modules/nf-core/tcoffee/alncompare' include { TCOFFEE_ALNCOMPARE as TCOFFEE_ALNCOMPARE_TC } from '../../modules/nf-core/tcoffee/alncompare' include { TCOFFEE_IRMSD } from '../../modules/nf-core/tcoffee/irmsd' From 599bdf0eded16312d7f5054046fab350bd922429 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 13:32:35 +0200 Subject: [PATCH 09/16] Update readme --- docs/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/README.md b/docs/README.md index 588a4040..144e29a0 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,5 +6,7 @@ The nf-core/multiplesequencealign documentation is split into the following page - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. +- [Extending](extending.md) + - Instructions on how to extend the pipeline. You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) From fd349bd80af3e981f70e5b868d7985c930ef933f Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 13:55:27 +0200 Subject: [PATCH 10/16] Update --- conf/modules.config | 6 +- conf/test_parameters.config | 2 +- modules.json | 107 +++++++++++++----- modules/nf-core/tcoffee/seqreformat/main.nf | 2 +- subworkflows/local/align.nf | 18 +-- .../main.nf | 2 +- workflows/multiplesequencealign.nf | 4 +- 7 files changed, 94 insertions(+), 47 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f3b421a9..67bcc450 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -131,9 +131,9 @@ publishDir = [ path: { "${params.outdir}/alignment/${meta.id}" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: "*.aln" - ] + ] } } @@ -143,7 +143,7 @@ path: { "${params.outdir}/alignment/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] + ] } // ------------------------------------ diff --git a/conf/test_parameters.config b/conf/test_parameters.config index a5b7e514..e936e59c 100644 --- a/conf/test_parameters.config +++ b/conf/test_parameters.config @@ -27,7 +27,7 @@ params { // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - + // Output directory outdir = "./outdir/" diff --git a/modules.json b/modules.json index 855615db..b70630f2 100644 --- a/modules.json +++ b/modules.json @@ -8,118 +8,165 @@ "clustalo/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "clustalo/guidetree": { "branch": "master", "git_sha": "1f253ec05723293df7757af8769f8389b7a1884e", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "csvtk/concat": { "branch": "master", "git_sha": "cfe2a24902bfdfe8132f11461ffda92d257f9f09", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "csvtk/join": { "branch": "master", "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20", - "installed_by": ["modules"], + "installed_by": [ + "modules" + ], "patch": "modules/nf-core/csvtk/join/csvtk-join.diff" }, "famsa/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "famsa/guidetree": { "branch": "master", "git_sha": "46789a4621be261f10dab0033f46f34779a5afc9", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "kalign/align": { "branch": "master", "git_sha": "7afd02d048ad0100be37fa1741816265c4aa307c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "learnmsa/align": { "branch": "master", "git_sha": "62007703c84bcfef92ce9e4a57cb1cc382917201", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "mafft": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "magus/align": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "magus/guidetree": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "mtmalign/align": { "branch": "master", "git_sha": "7bfb142c3729c1c76198c237a614215d92fe935c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "muscle5/super5": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pigz/compress": { "branch": "master", "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "pigz/uncompress": { "branch": "master", "git_sha": "d7f0de8aae7bf84b080dfdcf4e294bf11a46a51c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/align": { "branch": "master", "git_sha": 
"1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/alncompare": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/irmsd": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "tcoffee/seqreformat": { "branch": "master", "git_sha": "b04c647f465bea2c5bb9871503182236cd65b246", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ], + "patch": "modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff" }, "tcoffee/tcs": { "branch": "master", "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "untar": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -128,20 +175,26 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/nf-core/tcoffee/seqreformat/main.nf b/modules/nf-core/tcoffee/seqreformat/main.nf index 00a9a97d..774ae2be 100644 --- a/modules/nf-core/tcoffee/seqreformat/main.nf +++ b/modules/nf-core/tcoffee/seqreformat/main.nf @@ -38,7 +38,7 @@ process TCOFFEE_SEQREFORMAT { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" """ - touch "${prefix}_${seq_reformat_type}.txt" + touch "${prefix}.txt" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/align.nf b/subworkflows/local/align.nf index 94da8438..d9cf7079 100644 --- a/subworkflows/local/align.nf +++ b/subworkflows/local/align.nf @@ -20,19 +20,13 @@ include { MTMALIGN_ALIGN } from '../../modules/nf-core/mtma workflow ALIGN { take: - ch_fastas // channel: meta, /path/to/file.fasta - ch_tools // channel: meta_tree, meta_aligner - // [[tree:, args_tree:, args_tree_clean: ], [aligner:, args_aligner:, args_aligner_clean:]] - // e.g.[[tree:FAMSA, args_tree:-gt upgma -parttree, args_tree_clean:-gt_upgma_-parttree], [aligner:FAMSA, args_aligner:null, args_aligner_clean:null]] - // e.g.[[tree:null, args_tree:null, args_tree_clean:null], [aligner:TCOFFEE, args_aligner:-output fasta_aln, args_aligner_clean:-output_fasta_aln]] - ch_structures // channel: meta, [/path/to/file.pdb,/path/to/file.pdb,/path/to/file.pdb] + ch_fastas // channel: meta, /path/to/file.fasta + ch_tools // channel: meta_tree, meta_aligner + // [[tree:, args_tree:, args_tree_clean: ], [aligner:, args_aligner:, args_aligner_clean:]] + // e.g.[[tree:FAMSA, args_tree:-gt upgma -parttree, args_tree_clean:-gt_upgma_-parttree], [aligner:FAMSA, args_aligner:null, args_aligner_clean:null]] + // e.g.[[tree:null, args_tree:null, args_tree_clean:null], [aligner:TCOFFEE, args_aligner:-output fasta_aln, args_aligner_clean:-output_fasta_aln]] + ch_structures // channel: meta, [/path/to/file.pdb,/path/to/file.pdb,/path/to/file.pdb] compress - 
// tree - // args_tree - // args_tree_clean - // aligner - // args_aligner - // args_aligner_clean main: diff --git a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf index 5e54fb53..c9cb1811 100644 --- a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf @@ -389,7 +389,7 @@ class Utils { // 3DCOFFEE args = fix_args(tool,args,"3DCOFFEE", "-method", "TMalign_pair") args = fix_args(tool,args,"3DCOFFEE", "-output", "fasta_aln") - + // REGRESSIVE args = fix_args(tool,args,"REGRESSIVE", "-reg", "") args = fix_args(tool,args,"REGRESSIVE", "-reg_method", "famsa_msa") diff --git a/workflows/multiplesequencealign.nf b/workflows/multiplesequencealign.nf index 4d03ac99..c10a63d2 100644 --- a/workflows/multiplesequencealign.nf +++ b/workflows/multiplesequencealign.nf @@ -61,7 +61,7 @@ include { PREPARE_SHINY } from '../modules/local/prepare_shiny' include { UNTAR } from '../modules/nf-core/untar/main' include { CSVTK_JOIN as MERGE_STATS_EVAL } from '../modules/nf-core/csvtk/join/main.nf' -include { PIGZ_COMPRESS } from '../modules/nf-core/pigz/compress/main' +include { PIGZ_COMPRESS } from '../modules/nf-core/pigz/compress/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -182,7 +182,7 @@ workflow MULTIPLESEQUENCEALIGN { // // Align // - + compress_during_align = !params.skip_compression && params.skip_eval ALIGN(ch_seqs, ch_tools, ch_structures_template, compress_during_align) ch_versions = ch_versions.mix(ALIGN.out.versions) From ab06fe938a01c6c8480558763881009e6c174522 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 13:56:54 +0200 Subject: [PATCH 11/16] Update --- docs/README.md | 2 - .../seqreformat/tcoffee-seqreformat.diff | 101 ++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff diff --git a/docs/README.md b/docs/README.md index 144e29a0..588a4040 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,7 +6,5 @@ The nf-core/multiplesequencealign documentation is split into the following page - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. -- [Extending](extending.md) - - Instructions on how to extend the pipeline. 
You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) diff --git a/modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff b/modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff new file mode 100644 index 00000000..e285c980 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff @@ -0,0 +1,101 @@ +Changes in module 'nf-core/tcoffee/seqreformat' +--- modules/nf-core/tcoffee/seqreformat/main.nf ++++ modules/nf-core/tcoffee/seqreformat/main.nf +@@ -38,7 +38,7 @@ + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ +- touch "${prefix}_${seq_reformat_type}.txt" ++ touch "${prefix}.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +--- modules/nf-core/tcoffee/seqreformat/environment.yml ++++ /dev/null +@@ -1,7 +0,0 @@ +-name: tcoffee_seqreformat +-channels: +- - conda-forge +- - bioconda +- - defaults +-dependencies: +- - bioconda::t-coffee=13.46.0.919e8c6b + +--- modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap ++++ /dev/null +@@ -1,23 +0,0 @@ +-{ +- "formatted_file": { +- "content": [ +- [ +- [ +- { +- "id": "test" +- }, +- "test.txt:md5,fcd4691daf120c88ec5def7ac06fb562" +- ] +- ] +- ], +- "timestamp": "2023-11-28T11:56:22.705185493" +- }, +- "versions": { +- "content": [ +- [ +- "versions.yml:md5,68fb841e6e44274d430a1382bb0bbd14" +- ] +- ], +- "timestamp": "2023-11-28T11:56:22.717235196" +- } +-} +--- modules/nf-core/tcoffee/seqreformat/tests/tags.yml ++++ /dev/null +@@ -1,2 +0,0 @@ +-tcoffee/seqreformat: +- - "modules/nf-core/tcoffee/seqreformat/**" + +--- modules/nf-core/tcoffee/seqreformat/tests/nextflow.config ++++ /dev/null +@@ -1,3 +0,0 @@ +-process { +- ext.args = "-output=sim_idscore" +-} + +--- modules/nf-core/tcoffee/seqreformat/tests/main.nf.test ++++ /dev/null +@@ -1,33 +0,0 @@ +-nextflow_process { +- +- name "Test Process TCOFFEE_SEQREFORMAT" +- script "../main.nf" +- process "TCOFFEE_SEQREFORMAT" +- +- tag "modules" +- tag "modules_nfcore" +- tag "tcoffee" +- tag "tcoffee/seqreformat" +- +- test("sarscov2 - bam") { +- +- when { +- process { +- """ +- input[0] = [ [ id:'test' ], +- file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true) +- ] +- """ +- } +- } +- +- +- then { +- assertAll( +- { assert process.success }, +- { assert snapshot(process.out.formatted_file).match("formatted_file")}, +- { assert snapshot(process.out.versions).match("versions") } +- ) +- } +- } +-} + +************************************************************ From 919f1079af19ae8ba1d85b0f21de99dfa11b0d69 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 13:58:28 +0200 Subject: [PATCH 12/16] Update --- .../seqreformat/tcoffee-seqreformat.diff | 101 ------------------ 1 file changed, 101 deletions(-) delete mode 100644 modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff diff --git a/modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff b/modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff deleted file mode 100644 index e285c980..00000000 --- a/modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff +++ /dev/null @@ -1,101 +0,0 @@ -Changes in module 'nf-core/tcoffee/seqreformat' ---- modules/nf-core/tcoffee/seqreformat/main.nf -+++ modules/nf-core/tcoffee/seqreformat/main.nf -@@ -38,7 +38,7 @@ - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - """ -- touch 
"${prefix}_${seq_reformat_type}.txt" -+ touch "${prefix}.txt" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ---- modules/nf-core/tcoffee/seqreformat/environment.yml -+++ /dev/null -@@ -1,7 +0,0 @@ --name: tcoffee_seqreformat --channels: -- - conda-forge -- - bioconda -- - defaults --dependencies: -- - bioconda::t-coffee=13.46.0.919e8c6b - ---- modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap -+++ /dev/null -@@ -1,23 +0,0 @@ --{ -- "formatted_file": { -- "content": [ -- [ -- [ -- { -- "id": "test" -- }, -- "test.txt:md5,fcd4691daf120c88ec5def7ac06fb562" -- ] -- ] -- ], -- "timestamp": "2023-11-28T11:56:22.705185493" -- }, -- "versions": { -- "content": [ -- [ -- "versions.yml:md5,68fb841e6e44274d430a1382bb0bbd14" -- ] -- ], -- "timestamp": "2023-11-28T11:56:22.717235196" -- } --} ---- modules/nf-core/tcoffee/seqreformat/tests/tags.yml -+++ /dev/null -@@ -1,2 +0,0 @@ --tcoffee/seqreformat: -- - "modules/nf-core/tcoffee/seqreformat/**" - ---- modules/nf-core/tcoffee/seqreformat/tests/nextflow.config -+++ /dev/null -@@ -1,3 +0,0 @@ --process { -- ext.args = "-output=sim_idscore" --} - ---- modules/nf-core/tcoffee/seqreformat/tests/main.nf.test -+++ /dev/null -@@ -1,33 +0,0 @@ --nextflow_process { -- -- name "Test Process TCOFFEE_SEQREFORMAT" -- script "../main.nf" -- process "TCOFFEE_SEQREFORMAT" -- -- tag "modules" -- tag "modules_nfcore" -- tag "tcoffee" -- tag "tcoffee/seqreformat" -- -- test("sarscov2 - bam") { -- -- when { -- process { -- """ -- input[0] = [ [ id:'test' ], -- file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true) -- ] -- """ -- } -- } -- -- -- then { -- assertAll( -- { assert process.success }, -- { assert snapshot(process.out.formatted_file).match("formatted_file")}, -- { assert snapshot(process.out.versions).match("versions") } -- ) -- } -- } --} - -************************************************************ From 554ad5c034ff03c025924b61dfd10e07dc645fa8 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 14:15:41 +0200 Subject: [PATCH 13/16] Update --- modules.json | 106 ++++++++--------------------- modules/local/parse_sim.nf | 4 +- subworkflows/local/align.nf | 2 +- workflows/multiplesequencealign.nf | 2 +- 4 files changed, 31 insertions(+), 83 deletions(-) diff --git a/modules.json b/modules.json index b70630f2..07476a3e 100644 --- a/modules.json +++ b/modules.json @@ -8,165 +8,119 @@ "clustalo/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "clustalo/guidetree": { "branch": "master", "git_sha": "1f253ec05723293df7757af8769f8389b7a1884e", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "csvtk/concat": { "branch": "master", "git_sha": "cfe2a24902bfdfe8132f11461ffda92d257f9f09", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "csvtk/join": { "branch": "master", "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/csvtk/join/csvtk-join.diff" }, "famsa/align": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "famsa/guidetree": { "branch": "master", "git_sha": "46789a4621be261f10dab0033f46f34779a5afc9", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastqc": { "branch": "master", "git_sha": 
"285a50500f9e02578d90b3ce6382ea3c30216acd", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "kalign/align": { "branch": "master", "git_sha": "7afd02d048ad0100be37fa1741816265c4aa307c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "learnmsa/align": { "branch": "master", "git_sha": "62007703c84bcfef92ce9e4a57cb1cc382917201", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "mafft": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "magus/align": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "magus/guidetree": { "branch": "master", "git_sha": "dc37bcdfa78fe3e9ca56e4b85e1621333c7b4301", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "mtmalign/align": { "branch": "master", "git_sha": "7bfb142c3729c1c76198c237a614215d92fe935c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "muscle5/super5": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pigz/compress": { "branch": "master", "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "pigz/uncompress": { "branch": "master", "git_sha": "d7f0de8aae7bf84b080dfdcf4e294bf11a46a51c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/align": { "branch": "master", "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/alncompare": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/irmsd": { "branch": "master", "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "tcoffee/seqreformat": { "branch": "master", "git_sha": "b04c647f465bea2c5bb9871503182236cd65b246", - "installed_by": [ - "modules" - ], + "installed_by": ["modules"], "patch": "modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff" }, "tcoffee/tcs": { "branch": "master", "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "untar": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } }, @@ -175,26 +129,20 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": [ - "subworkflows" - ] + "installed_by": ["subworkflows"] } } } } } -} \ No newline at end of file +} diff --git a/modules/local/parse_sim.nf b/modules/local/parse_sim.nf index 66c9cd39..b89d50aa 100644 --- a/modules/local/parse_sim.nf +++ 
b/modules/local/parse_sim.nf @@ -10,7 +10,7 @@ process PARSE_SIM { tuple val(meta), path(infile) output: - tuple val (meta), path("${prefix}.sim_tot"), emit: sim_tot + tuple val (meta), path("*.sim_tot"), emit: sim_tot path "versions.yml", emit: versions when: @@ -36,7 +36,7 @@ process PARSE_SIM { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.sim_tot diff --git a/subworkflows/local/align.nf b/subworkflows/local/align.nf index d9cf7079..5dffca96 100644 --- a/subworkflows/local/align.nf +++ b/subworkflows/local/align.nf @@ -91,7 +91,7 @@ workflow ALIGN { [ metastruct+metatree+metaalign, template, struct ] } .branch{ - mtmalign: it[0]["aligner"] == "MTMALIGN" + mtmalign: it[0]["aligner"] == "MTMALIGN" } .set { ch_structures_tools } diff --git a/workflows/multiplesequencealign.nf b/workflows/multiplesequencealign.nf index c10a63d2..9aa51a89 100644 --- a/workflows/multiplesequencealign.nf +++ b/workflows/multiplesequencealign.nf @@ -61,7 +61,7 @@ include { PREPARE_SHINY } from '../modules/local/prepare_shiny' include { UNTAR } from '../modules/nf-core/untar/main' include { CSVTK_JOIN as MERGE_STATS_EVAL } from '../modules/nf-core/csvtk/join/main.nf' -include { PIGZ_COMPRESS } from '../modules/nf-core/pigz/compress/main' +include { PIGZ_COMPRESS } from '../modules/nf-core/pigz/compress/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 859ae11fafed5721c316e187ce6c7d624376f519 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 14:19:58 +0200 Subject: [PATCH 14/16] Update --- modules/local/calculate_gaps.nf | 2 +- modules/local/calculate_seqstats.nf | 2 +- modules/local/create_tcoffee_template.nf | 2 +- modules/local/extract_plddt.nf | 5 +++-- modules/local/prepare_multiqc.nf | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/modules/local/calculate_gaps.nf b/modules/local/calculate_gaps.nf index 4f324ad2..e1571b44 100644 --- a/modules/local/calculate_gaps.nf +++ b/modules/local/calculate_gaps.nf @@ -35,7 +35,7 @@ process CALC_GAPS { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_gaps.csv diff --git a/modules/local/calculate_seqstats.nf b/modules/local/calculate_seqstats.nf index 641376a4..22ead050 100644 --- a/modules/local/calculate_seqstats.nf +++ b/modules/local/calculate_seqstats.nf @@ -34,7 +34,7 @@ process CALCULATE_SEQSTATS { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_seqstats.csv touch ${prefix}_seqstats_summary.csv diff --git a/modules/local/create_tcoffee_template.nf b/modules/local/create_tcoffee_template.nf index f2181630..4238c916 100644 --- a/modules/local/create_tcoffee_template.nf +++ b/modules/local/create_tcoffee_template.nf @@ -30,7 +30,7 @@ process CREATE_TCOFFEETEMPLATE { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_template.txt diff --git a/modules/local/extract_plddt.nf b/modules/local/extract_plddt.nf index 1b43f893..9191d796 100644 --- a/modules/local/extract_plddt.nf +++ b/modules/local/extract_plddt.nf @@ -40,9 +40,10 @@ process EXTRACT_PLDDT { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}_plddt.csv + touch ${prefix}_plddt_summary.csv + touch ${prefix}_full_plddt.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff 
--git a/modules/local/prepare_multiqc.nf b/modules/local/prepare_multiqc.nf index f6d3e4be..018835ae 100644 --- a/modules/local/prepare_multiqc.nf +++ b/modules/local/prepare_multiqc.nf @@ -30,7 +30,7 @@ process PREPARE_MULTIQC { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_multiqc_table.csv From f3038bbef49846ae03bf2134ed2b36ba1eeb84a8 Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 6 Jun 2024 14:22:56 +0200 Subject: [PATCH 15/16] Update --- modules.json | 5 ++- .../tcoffee/seqreformat/environment.yml | 7 ++++ .../tcoffee/seqreformat/tests/main.nf.test | 33 +++++++++++++++++++ .../seqreformat/tests/main.nf.test.snap | 23 +++++++++++++ .../tcoffee/seqreformat/tests/nextflow.config | 3 ++ .../tcoffee/seqreformat/tests/tags.yml | 2 ++ 6 files changed, 70 insertions(+), 3 deletions(-) create mode 100644 modules/nf-core/tcoffee/seqreformat/environment.yml create mode 100644 modules/nf-core/tcoffee/seqreformat/tests/main.nf.test create mode 100644 modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap create mode 100644 modules/nf-core/tcoffee/seqreformat/tests/nextflow.config create mode 100644 modules/nf-core/tcoffee/seqreformat/tests/tags.yml diff --git a/modules.json b/modules.json index 07476a3e..f7f5635c 100644 --- a/modules.json +++ b/modules.json @@ -108,9 +108,8 @@ }, "tcoffee/seqreformat": { "branch": "master", - "git_sha": "b04c647f465bea2c5bb9871503182236cd65b246", - "installed_by": ["modules"], - "patch": "modules/nf-core/tcoffee/seqreformat/tcoffee-seqreformat.diff" + "git_sha": "32ae618a60a25a870b5fa47ea2060ddcd911ab53", + "installed_by": ["modules"] }, "tcoffee/tcs": { "branch": "master", diff --git a/modules/nf-core/tcoffee/seqreformat/environment.yml b/modules/nf-core/tcoffee/seqreformat/environment.yml new file mode 100644 index 00000000..84afe8aa --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/environment.yml @@ -0,0 +1,7 @@ +name: tcoffee_seqreformat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::t-coffee=13.46.0.919e8c6b diff --git a/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test new file mode 100644 index 00000000..7a5492c5 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process TCOFFEE_SEQREFORMAT" + script "../main.nf" + process "TCOFFEE_SEQREFORMAT" + + tag "modules" + tag "modules_nfcore" + tag "tcoffee" + tag "tcoffee/seqreformat" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = [ [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true) + ] + """ + } + } + + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.formatted_file).match("formatted_file")}, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap new file mode 100644 index 00000000..150102ee --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap @@ -0,0 +1,23 @@ +{ + "formatted_file": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.txt:md5,fcd4691daf120c88ec5def7ac06fb562" + ] + ] + ], + "timestamp": "2023-11-28T11:56:22.705185493" + }, + "versions": { + "content": [ + [ + 
"versions.yml:md5,68fb841e6e44274d430a1382bb0bbd14" + ] + ], + "timestamp": "2023-11-28T11:56:22.717235196" + } +} \ No newline at end of file diff --git a/modules/nf-core/tcoffee/seqreformat/tests/nextflow.config b/modules/nf-core/tcoffee/seqreformat/tests/nextflow.config new file mode 100644 index 00000000..910cc175 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = "-output=sim_idscore" +} diff --git a/modules/nf-core/tcoffee/seqreformat/tests/tags.yml b/modules/nf-core/tcoffee/seqreformat/tests/tags.yml new file mode 100644 index 00000000..268d8814 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/tags.yml @@ -0,0 +1,2 @@ +tcoffee/seqreformat: + - "modules/nf-core/tcoffee/seqreformat/**" From 991bc918e0cac54e4a35e4771c718eaca6d0ee4f Mon Sep 17 00:00:00 2001 From: Luisa Santus Date: Wed, 19 Jun 2024 09:38:48 +0000 Subject: [PATCH 16/16] Fixes 4 review --- README.md | 8 ++++---- conf/test.config | 2 -- conf/test_full.config | 2 -- conf/test_parameters.config | 2 -- conf/test_pdb.config | 3 --- docs/usage.md | 12 ++++++------ workflows/multiplesequencealign.nf | 1 - 7 files changed, 10 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 9ff942f2..1f9c87ea 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,8 @@ In a nutshell, the pipeline performs the following steps: 1. **Input files summary**: (Optional) computation of summary statistics on the input files, such as the average sequence similarity across the input sequences, their length, plddt extraction if available, etc. -2. **Guide Tree**: (Optional) Renders a guide tree. -3. **Align**: (Required) Aligns the sequences. +2. **Guide Tree**: (Optional) Renders a guide tree with a chosen tool (list available below). Some aligners use guide trees to define the order in which the sequences are aligned. +3. **Align**: (Required) Aligns the sequences with a chosen tool (list available below). 4. **Evaluate**: (Optional) Evaluates the generated alignments with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. 5. **Report**: Reports the collected information of the runs in a shiny app and a summary table in MultiQC. @@ -49,7 +49,7 @@ seatoxin,seatoxin.fa,seatoxin-ref.fa,seatoxin_structures toxin,toxin.fa,toxin-ref.fa,toxin_structures ``` -Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be aligned. +Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be aligned and the associated (if available) reference alignments and protein structure files. > [!NOTE] > The only required input is the id column and either fasta or structures. @@ -58,7 +58,7 @@ Each row represents a set of sequences (in this case the seatoxin and toxin prot Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. 
-It should look at foollows: +It should look at follows: `toolsheet.csv`: diff --git a/conf/test.config b/conf/test.config index 4d8bef0c..4a7b6c7c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -35,7 +35,5 @@ params { input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - // Output directory - outdir = "./outdir/" } diff --git a/conf/test_full.config b/conf/test_full.config index 96eacde9..ef47380f 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -29,6 +29,4 @@ params { input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_full.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - // Output directory - outdir = "./outdir/" } diff --git a/conf/test_parameters.config b/conf/test_parameters.config index e936e59c..039be207 100644 --- a/conf/test_parameters.config +++ b/conf/test_parameters.config @@ -28,7 +28,5 @@ params { input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - // Output directory - outdir = "./outdir/" } diff --git a/conf/test_pdb.config b/conf/test_pdb.config index b451dcc0..ad14556c 100644 --- a/conf/test_pdb.config +++ b/conf/test_pdb.config @@ -30,7 +30,4 @@ params { input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_structural.csv' - // Output directory - outdir = "./outdir/" - } diff --git a/docs/usage.md b/docs/usage.md index 6a06ee69..dbe1fc05 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -14,11 +14,11 @@ The main steps of the pipeline are: 2. **Guide Tree**: (Optional) Renders a guide tree. 3. **Align**: aligns the sequences. 4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. -5. **Report**: Reports about the collected information of the runs are reported in a shiny app and a summary table in multiqc. Skip by passing `--skip_shiny` and `--skip_multiqc`. +5. **Report**: Reports about the collected information of the runs are reported in a Shiny app and a summary table in MultiQC. Skip by passing `--skip_shiny` and `--skip_multiqc`. ### 1. INPUT FILES SUMMARY -Summary information about the input fasta files are calculated. Skip by `--skip_stats`. +Summary information about the input fasta files is calculated. Skip by `--skip_stats`. 1. Sequence similarity. Calculates pairwise and average sequence similarity is calculated using TCOFFEE. Activate with `--calc_sim` (default: false). 2. General summary. Calculates the number and the average length of sequences. Activate with `--calc_seq_stats` (default: true). @@ -69,10 +69,10 @@ Optionally, the produced MSAs can be evaluated. Skip with `--skip_eval`. **REFERENCE-BASED**: -The reference MSAs (see samplesheet) are be used to evaluate the quality of the produced MSA. +The reference MSAs (see samplesheet) are used to evaluate the quality of the produced MSA. -2. Sum Of Pairs. 
Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true). -3. Total column. Calculates the TC score [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html). Activate using `--calc_tc` (default: true). +2. Sum Of Pairs (SP). Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true). +3. Total column (TC). Calculates the TC score [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html). Activate using `--calc_tc` (default: true). **STRUCTURE-BASED**: @@ -83,7 +83,7 @@ The provided structures (see samplesheet) are used to evaluate the quality of th ### 5. REPORT Finally, a summary table with all the computed statistics and evaluations is reported in MultiQC (Skip by `--skip_multiqc`). -Moreover, a shiny app is prepared with interactive summary plots. +Moreover, a Shiny app is generated with interactive summary plots. > [!WARNING] > You will need to have [shiny](https://shiny.posit.co/py/) installed to run it! See [output documentation](https://nf-co.re/multiplesequencealign/output) for more infos. diff --git a/workflows/multiplesequencealign.nf b/workflows/multiplesequencealign.nf index 9aa51a89..47e7d492 100644 --- a/workflows/multiplesequencealign.nf +++ b/workflows/multiplesequencealign.nf @@ -178,7 +178,6 @@ workflow MULTIPLESEQUENCEALIGN { stats_summary = stats_summary.mix(STATS.out.stats_summary) } - // // Align //