diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0a33337..7a1aeb48 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,10 @@ jobs: NXF_VER: - "24.04.1" - "latest-everything" + ANALYSIS: + - "test" + - "test_pdb" + - "test_parameters" steps: - name: Check out pipeline code uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 @@ -40,9 +44,9 @@ jobs: - name: Run pipeline with test data run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.ANALYSIS }},docker --outdir ./results - parameters: + parameters_stub: name: Test workflow parameters if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/multiplesequencealign') }}" runs-on: ubuntu-latest @@ -51,6 +55,12 @@ jobs: NXF_VER: - "24.04.1" - "latest-everything" + PARAMS: + - "--skip_stats" + - "--skip_eval" + - "--skip_compression" + - "--skip_shiny" + steps: - name: Check out pipeline code uses: actions/checkout@v4 @@ -62,4 +72,4 @@ jobs: - name: Test workflow parameters run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_parameters,docker --outdir ./results + nextflow run -stub-run ${GITHUB_WORKSPACE} -profile test,docker ${{ matrix.PARAMS }} --outdir ./results diff --git a/.nf-core.yml b/.nf-core.yml index cb2323da..e164e770 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -2,3 +2,4 @@ repository_type: pipeline nf_core_version: "2.14.1" lint: multiqc_config: False + files_exist: conf/igenomes.config diff --git a/README.md b/README.md index 44a12c2b..1f9c87ea 100644 --- a/README.md +++ b/README.md @@ -21,43 +21,25 @@ **nf-core/multiplesequencealign** is a pipeline to deploy and systematically evaluate Multiple Sequence Alignment (MSA) methods. -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - -On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources.The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/proteinfold/results). - ![Alt text](docs/images/nf-core-msa_metro_map.png?raw=true "nf-core-msa metro map") -1. **Collect Input Information**: computation of summary statistics on the input fasta file, such as the average sequence similarity across the input sequences, their length, etc. Skip by `--skip_stats` as a parameter. -2. **Guide Tree**: (Optional, depends on alignment tools requirement) Renders a guide tree. -3. **Align**: Runs one or multiple MSA tools in parallel. -4. 
**Evaluate**: The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval` as a parameter. -5. **Compress**: As MSAs can be very large, by default all tools in the pipeline produce compressed output. For most of them, the compression happens through a pipe, such that uncompressed data never hits the disk. This compression can be turned off by passing `--no_compression` as a parameter. - -Available GUIDE TREE methods: - -- CLUSTALO -- FAMSA -- MAGUS +In a nutshell, the pipeline performs the following steps: -Available ALIGN methods: - -- CLUSTALO -- FAMSA -- KALIGN -- LEARNMSA -- MAFFT -- MAGUS -- MUSCLE5 -- MTMALIGN -- T-COFFEE -- 3DCOFFEE +1. **Input files summary**: (Optional) computation of summary statistics on the input files, such as the average sequence similarity across the input sequences, their length, plddt extraction if available, etc. +2. **Guide Tree**: (Optional) Renders a guide tree with a chosen tool (list available below). Some aligners use guide trees to define the order in which the sequences are aligned. +3. **Align**: (Required) Aligns the sequences with a chosen tool (list available below). +4. **Evaluate**: (Optional) Evaluates the generated alignments with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. +5. **Report**: Reports the collected information of the runs in a shiny app and a summary table in MultiQC. ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. -First, prepare a samplesheet with your input data that looks as follows: +#### 1. SAMPLESHEET + +The sample sheet defines the input data that the pipeline will process. +It should look like this: `samplesheet.csv`: @@ -67,33 +49,30 @@ seatoxin,seatoxin.fa,seatoxin-ref.fa,seatoxin_structures toxin,toxin.fa,toxin-ref.fa,toxin_structures ``` -Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be processed. - -`id` is the name of the set of sequences. It can correspond to the protein family name or to an internal id. +Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be aligned and the associated (if available) reference alignments and protein structure files. -The column `fasta` contains the path to the fasta file that contains the sequences. +> [!NOTE] +> The only required input is the id column and either fasta or structures. -The column `reference` is optional and contains the path to the reference alignment. It is used for certain evaluation steps. It can be left empty. +#### 2. TOOLSHEET -The column `structures` is also optional and contains the path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and certain evaluation steps. It can be left empty. +Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run with the respective arguments to be used. 
-Then, you should prepare a toolsheet which defines which tools to run as follows: +It should look at follows: `toolsheet.csv`: ```csv tree,args_tree,aligner,args_aligner, -FAMSA, -gt upgma -partree, FAMSA, -, ,TCOFFEE, -output fasta_aln +FAMSA, -gt upgma -medoidtree, FAMSA, +, ,TCOFFEE, +FAMSA,,REGRESSIVE, ``` -`tree` is the tool used to build the tree. - -Arguments to the tree tool can be provided using `args_tree`. - -The `aligner` column contains the tool to run the alignment. +> [!NOTE] +> The only required input is aligner. -Finally, the arguments to the aligner tool can be set by using the `args_alginer` column. +#### 3. RUN THE PIPELINE Now, you can run the pipeline using: @@ -117,6 +96,10 @@ To see the results of an example test run with a full size dataset refer to the For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/multiplesequencealign/output). +## Extending the pipeline + +For details on how to add your favourite guide tree/MSA/evaluation step in nf-core/multiplesequencealign please refer to [extending documentation](https://github.com/luisas/multiplesequencealign/blob/luisa_patch/docs/extending.md). + ## Credits nf-core/multiplesequencealign was originally written by Luisa Santus ([@luisas](https://github.com/luisas)) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from The Comparative Bioinformatics Group at The Centre for Genomic Regulation, Spain. diff --git a/assets/schema_input.json b/assets/schema_input.json index f5f85025..a2770af7 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -28,6 +28,7 @@ "type": "string" } }, - "required": ["id", "fasta"] + "required": ["id"], + "anyOf": [{ "required": ["fasta"] }, { "required": ["structures"] }] } } diff --git a/conf/modules.config b/conf/modules.config index ac90d289..67bcc450 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,224 +1,249 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Config file for defining DSL2 per module options and publishing paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Available keys to override module options: - ext.args = Additional arguments appended to command in module. - ext.args2 = Second set of arguments appended to command in module (multi-tool modules). - ext.args3 = Third set of arguments appended to command in module (multi-tool modules). - ext.prefix = File name prefix for output files. ----------------------------------------------------------------------------------------- -*/ - -process { - - // ------------------------------------ - // Statistics about the input sequences - // ------------------------------------ - - withName: "CALCULATE_SEQSTATS"{ - publishDir = [ - path: { "${params.outdir}/stats/sequences/seqstats" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } - ] - } - - withName: "CONCAT_SEQSTATS"{ - ext.prefix = { "summary_seqstats" } - } - - withName: "EXTRACT_PLDDT"{ - publishDir = [ - path: { "${params.outdir}/stats/structures/plddt" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? 
null : filename } - ] - } - - withName: "CONCAT_PLDDT"{ - ext.prefix = { "summary_plddt" } - } - - withName: TCOFFEE_SEQREFORMAT_SIM{ - ext.args = "-output=sim_idscore" - publishDir = [ - path: { "${params.outdir}/stats/sequences/perc_sim" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: "CONCAT_SIMSTATS"{ - ext.prefix = { "summary_simstats" } - } - - withName: "MERGE_STATS"{ - ext.prefix = { "complete_summary_stats" } - ext.args = "-f 1 -O" - publishDir = [ - path: { "${params.outdir}/stats/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - - // ------------------------------------ - // Tree building - // ------------------------------------ + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ---------------------------------------------------------------------------------------- + */ + + process { + + // ------------------------------------ + // Statistics about the input sequences + // ------------------------------------ + + withName: "CALCULATE_SEQSTATS"{ + publishDir = [ + path: { "${params.outdir}/summary/stats/sequences/seqstats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } + ] + } + + withName: "CONCAT_SEQSTATS"{ + ext.prefix = { "summary_seqstats" } + } + + withName: "EXTRACT_PLDDT"{ + publishDir = [ + path: { "${params.outdir}/summary/stats/structures/plddt" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml')|filename.contains('_summary.csv') ? null : filename } + ] + } + + withName: "CONCAT_PLDDT"{ + ext.prefix = { "summary_plddt" } + } + + withName: TCOFFEE_SEQREFORMAT_SIM{ + ext.args = "-output=sim_idscore" + publishDir = [ + path: { "${params.outdir}/summary/stats/sequences/perc_sim" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "CONCAT_SIMSTATS"{ + ext.prefix = { "summary_simstats" } + } + + withName: "MERGE_STATS"{ + ext.prefix = { "complete_summary_stats" } + ext.args = "-f 1 -O" + publishDir = [ + path: { "${params.outdir}/stats/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + + // ------------------------------------ + // Tree building + // ------------------------------------ + + withName: "FAMSA_GUIDETREE"{ + tag = { "${meta.id} args:${meta.args_tree}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } + publishDir = [ + path: { "${params.outdir}/trees/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: "CLUSTALO_GUIDETREE"{ + tag = { "${meta.id} args:${meta.args_tree}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } + publishDir = [ + path: { "${params.outdir}/trees/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "MAGUS_GUIDETREE"{ + tag = { "${meta.id} args:${meta.args_tree}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } + publishDir = [ + path: { "${params.outdir}/trees/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // ------------------------------------ + // Alignment + // ------------------------------------ + + withName: "CREATE_TCOFFEETEMPLATE"{ + ext.prefix = { "${meta.id}" } + } + + + withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN"{ + tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } + ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } + if(params.skip_compression){ + publishDir = [ + path: { "${params.outdir}/alignment/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + withName: "MTMALIGN_ALIGN"{ + tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } + ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } + if(params.skip_compression){ + publishDir = [ + path: { "${params.outdir}/alignment/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.aln" + ] + } + + } + + withName:"PIGZ_COMPRESS"{ + publishDir = [ + path: { "${params.outdir}/alignment/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // ------------------------------------ + // Alignment evaluation + // ------------------------------------ + + withName: 'PARSE_IRMSD'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } + } + + withName: 'TCOFFEE_ALNCOMPARE_SP'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_sp" } + ext.args = "-compare_mode sp" + } + + withName: 'TCOFFEE_ALNCOMPARE_TC'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tc" } + ext.args = "-compare_mode tc" + } + + withName: 'TCOFFEE_IRMSD'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } + publishDir = [ + path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: "CALC_GAPS"{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_gaps" } + } + + withName: "CONCAT_IRMSD"{ + ext.prefix = { "summary_irmsd" } + } + + withName: "CONCAT_GAPS"{ + ext.prefix = { "summary_gaps" } + } + + withName: "CONCAT_SP"{ + ext.prefix = { "summary_sp" } + } + + withName: "CONCAT_TC"{ + ext.prefix = { "summary_tc" } + } + + withName: "CONCAT_TCS"{ + ext.prefix = { "summary_tcs" } + } + + withName: 'TCOFFEE_TCS'{ + ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tcs" } + publishDir = [ + path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "MERGE_EVAL"{ + ext.prefix = { "complete_summary_eval" } + ext.args = "-f 1,2,3,4,5,6,7 -O" + publishDir = [ + path: { "${params.outdir}/evaluation/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "MERGE_STATS_EVAL"{ + ext.prefix = { "complete_summary_stats_eval" } + ext.args = "-f 1 -O" + publishDir = [ + path: { "${params.outdir}/summary/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // ------------------------------------ + // Shiny app + // ------------------------------------ + withName: 'PREPARE_SHINY' { + publishDir = [ + path: { "${params.outdir}/shiny_app" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } - withName: "FAMSA_GUIDETREE"{ - tag = { "${meta.id} args:${meta.args_tree}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } - publishDir = [ - path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] } - - withName: "CLUSTALO_GUIDETREE"{ - tag = { "${meta.id} args:${meta.args_tree}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } - publishDir = [ - path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: "MAGUS_GUIDETREE"{ - tag = { "${meta.id} args:${meta.args_tree}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } - publishDir = [ - path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - - // ------------------------------------ - // Alignment - // ------------------------------------ - - withName: "CREATE_TCOFFEETEMPLATE"{ - ext.prefix = { "${meta.id}" } - } - - - withName: "CLUSTALO_ALIGN|FAMSA_ALIGN|LEARNMSA_ALIGN|MAFFT|MAGUS_ALIGN|MUSCLE5_SUPER5|REGRESSIVE|TCOFFEE_ALIGN|TCOFFEE3D_ALIGN|MTMALIGN_ALIGN"{ - tag = { "${meta.id} tree:${meta.tree} argstree:${args_tree} args:${meta.args_aligner}" } - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}" } - ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } - publishDir = [ - path: { "${params.outdir}/alignment/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - // ------------------------------------ - // Alignment evaluation - // ------------------------------------ - - withName: 'PARSE_IRMSD'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } - } - - withName: 'TCOFFEE_ALNCOMPARE_SP'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_sp" } - ext.args = "-compare_mode sp" - } - - withName: 'TCOFFEE_ALNCOMPARE_TC'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tc" } - ext.args = "-compare_mode tc" - } - - withName: 'TCOFFEE_IRMSD'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_irmsd" } - publishDir = [ - path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: "CALC_GAPS"{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_gaps" } - } - - withName: "CONCAT_IRMSD"{ - ext.prefix = { "summary_irmsd" } - } - - withName: "CONCAT_GAPS"{ - ext.prefix = { "summary_gaps" } - } - - withName: "CONCAT_SP"{ - ext.prefix = { "summary_sp" } - } - - withName: "CONCAT_TC"{ - ext.prefix = { "summary_tc" } - } - - withName: "CONCAT_TCS"{ - ext.prefix = { "summary_tcs" } - } - - withName: 'TCOFFEE_TCS'{ - ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.args_aligner_clean}_tcs" } - publishDir = [ - path: { "${params.outdir}/evaluation/${task.process.tokenize(':')[-1].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: "MERGE_EVAL"{ - ext.prefix = { "complete_summary_eval" } - ext.args = "-f 1,2,3,4,5,6,7 -O" - publishDir = [ - path: { "${params.outdir}/evaluation/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: "MERGE_STATS_EVAL"{ - ext.prefix = { "complete_summary_stats_eval" } - ext.args = "-f 1 -O" - publishDir = [ - path: { "${params.outdir}/summary/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? 
"--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - - // ------------------------------------ - // Shiny app - // ------------------------------------ - withName: 'PREPARE_SHINY' { - publishDir = [ - path: { "${params.outdir}/shiny_app" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - -} diff --git a/conf/test.config b/conf/test.config index 544767dd..4a7b6c7c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,12 +20,20 @@ params { max_memory = '6.GB' max_time = '6.h' - skip_multiqc = false + skip_stats = false + calc_sim = true + calc_seq_stats = true + extract_plddt = true + skip_eval = false + calc_sp = true + calc_tc = true + calc_irmsd = true + calc_gaps = true + calc_tcs = true + // Input data input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' - tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet.csv' + tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - // Output directory - outdir = "./outdir/" } diff --git a/conf/test_full.config b/conf/test_full.config index 4a1aa3bf..ef47380f 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,7 +14,19 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' + skip_stats = false + calc_sim = true + calc_seq_stats = true + extract_plddt = true + skip_eval = false + calc_sp = true + calc_tc = true + calc_irmsd = true + calc_gaps = true + calc_tcs = true + // Input data for full size test input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_full.csv' tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' + } diff --git a/conf/test_parameters.config b/conf/test_parameters.config index e2aedc35..039be207 100644 --- a/conf/test_parameters.config +++ b/conf/test_parameters.config @@ -11,24 +11,22 @@ */ params { + config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' - skip_stats = true - skip_eval = true - skip_shiny = true - skip_multiqc = true + skip_stats = true + skip_eval = true + skip_compression = false // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/samplesheet/v1.0/samplesheet_test.csv' - tools = 'https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/toolsheet/v1.0/toolsheet.csv' + input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test_af2.csv' + tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_full.csv' - // Output directory - outdir = "./outdir/" } diff --git a/conf/test_pdb.config b/conf/test_pdb.config new file mode 100644 index 00000000..ad14556c --- /dev/null +++ b/conf/test_pdb.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/multiplesequencealign -profile test_pdb,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    skip_stats = true
+    calc_irmsd = true
+    calc_sp    = false
+    calc_tc    = false
+    calc_gaps  = false
+    calc_tcs   = false
+
+    // Input data
+    input = params.pipelines_testdata_base_path + 'multiplesequencealign/samplesheet/v1.0/samplesheet_test.csv'
+    tools = params.pipelines_testdata_base_path + 'multiplesequencealign/toolsheet/v1.0/toolsheet_structural.csv'
+
+}
diff --git a/docs/extending.md b/docs/extending.md
index 54ce2885..65d626f1 100644
--- a/docs/extending.md
+++ b/docs/extending.md
@@ -16,9 +16,6 @@ Useful resources are:

 The pipeline consists of four different subworkflows, one for computing the guidetrees of guidetree-based methods, one for performing the MSAs, one for evaluating the produced MSAs and one for computing statistics about the input dataset. The subworkflows are to a significant degree isolated from each other, and not all of them may run in any given execution of the pipeline.

-`subworkflows/local/evaluate.nf` handles the evaluation step. It calls the modules used for evaluation and merges their output into some summary statistics.
-If it is not skipped, it is the last part of the pipeline to run.
-
 ## Adding an aligner

 1. Create a local or nf-core module and ensure the output is in FASTA format
@@ -52,14 +49,14 @@

 Adding a new evaluation module into the pipeline is a bit more tricky, since the output of the evaluation modules gets processed and merged in different ways in the pipeline. This requires changes in the `evaluate.nf` subworkflow and the pipeline config as well as adding an option to the main pipeline.

-The process of adding `ULTRAMSATRIC` evaluation to the pipeline may be a useful reference: [commit history](https://github.com/lrauschning/multiplesequencealign/commits/ultramsatric/).
+

 In general, the process of adding another evaluation module to the pipeline can be thought of as three steps:

 1. Create a local or nf-core module.
   - Make sure the evaluation output is returned from the module in CSV format!
   - For merging the correct evaluation files in reporting the final output, the pipeline uses the `meta` field containing the tools to use. This information has to be included in the CSV returned by the module so as to merge it later.
-  - Have a look at how `TCOFFEE_ALNCOMPARE` and `ULTRAMSATRIC` handle this.
+  - Have a look at how `TCOFFEE_ALNCOMPARE` handles this.

2.
Include the evaluation module in the evaluation subworkflow

diff --git a/docs/output.md b/docs/output.md
index 98627e19..4bb6a2dd 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -10,16 +10,20 @@ The directories listed below will be created in the results directory after the

 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+1. **Input files summary**: (Optional) Computes summary statistics on the input fasta files, such as the average sequence similarity across the input sequences and their length. Skip by passing `--skip_stats`.
+2. **Guide Tree**: (Optional) Renders a guide tree.
+3. **Align**: (Required) Aligns the sequences.
+4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval`.
+5. **Report**: The information collected across the runs is reported in a Shiny app and a summary table in MultiQC. Skip by passing `--skip_shiny` and `--skip_multiqc`.

-## Summary statistics of input files
+## Input files summary
+
+The stats.nf subworkflow collects statistics about the input files and summarizes them into a final csv file.
Output files -- `stats/` +- `summary/stats/` - `complete_summary_stats.csv`: csv file containing the summary for all the statistics computed on the input file. - `sequences/` - `seqstats/*_seqstats.csv`: file containing the sequence input length for each sequence in the family defined by the file name. If `--calc_seq_stats` is specified. @@ -27,36 +31,36 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `structures/` - `plddt/*_full_plddt.csv`: file containing the plddt of the structures for each sequence in the input file. If `--extract_plddt` is specified.
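+For a quick look at the merged statistics, the summary CSV can be pretty-printed on the command line; a minimal sketch (the `results` output directory and the use of `csvtk` here are assumptions; any CSV viewer works):
+
+```bash
+csvtk pretty results/summary/stats/complete_summary_stats.csv | head
+```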
-The stats.nf subworkflow collects statistics about the input files and summarizes them into a final csv file.
-
 ## Trees

+If you explicitly specified (via the toolsheet) that guide trees should be computed for the MSA tools, they are stored here.
+
Output files - `trees/` - - `*.dnd`: guide tree files. + - `*/*.dnd`: guide tree files.
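+The `.dnd` files are plain-text guide trees in Newick format. A purely illustrative example for three sequences (names and branch lengths are made up):
+
+```
+((seq1:0.1,seq2:0.1):0.05,seq3:0.2);
+```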
-If you explicitly specifified (via the toolsheet) to compute guidetrees to be used by the MSA tool, those are stored here.
-
 ## Alignment

+All computed MSAs are stored here.
+
Output files

- `alignment/`
-  - `*/*.fa`: each subdirectory is called as the input file. It contains all the alignments computed on it. The filename contains all the informations of the input file used and the tool.
+  - `*/*.fa`: each subdirectory is named after the sample id. It contains all the alignments computed for that sample. The filename contains all the information about the input file and the tools used.
  The file naming convention is: `{Input_file}_{Tree}-args-{Tree_args}_{MSA}-args-{MSA_args}.aln`
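+For example, under this convention, aligning the `seatoxin` sample with a FAMSA guide tree and the FAMSA aligner could produce a file named `seatoxin_FAMSA-args-gt_upgma_FAMSA-args-default.aln` (hypothetical cleaned-argument labels; the exact strings depend on the toolsheet entries).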
-All MSA computed are stored here.
-
 ## Evaluation

+The files summarizing the computed evaluation statistics are stored here.
+
Output files @@ -66,7 +70,7 @@ All MSA computed are stored here. - `complete_summary_eval.csv`: csv file containing the summary of all evaluation metrics for each input file.
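+To rank runs by one of the metrics, the evaluation summary can be sorted directly; a sketch assuming a `tc` column is present (actual column names depend on which evaluations were enabled):
+
+```bash
+csvtk sort -k tc:nr results/evaluation/complete_summary_eval.csv | csvtk pretty | head
+```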
-## shiny_app +## Shiny App
Output files

@@ -83,7 +87,7 @@ To run the shiny app:

 `cd shiny_app`
 `./run.sh`

-Be aware that you have to have shiny installed to access this feature.
+Be aware that you have to have [shiny](https://shiny.posit.co/py/) installed to access this feature.

 ### MultiQC

diff --git a/docs/usage.md b/docs/usage.md
index 79b2fa13..dbe1fc05 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -6,58 +6,157 @@

 ## Introduction

-
+**nf-core/multiplesequencealign** is a pipeline to deploy and systematically evaluate Multiple Sequence Alignment (MSA) methods.
+
+The main steps of the pipeline are:
+
+1. **Input files summary**: (Optional) Computes summary statistics on the input fasta files, such as the average sequence similarity across the input sequences and their length. Skip by passing `--skip_stats`.
+2. **Guide Tree**: (Optional) Renders a guide tree.
+3. **Align**: (Required) Aligns the sequences.
+4. **Evaluate**: (Optional) The obtained alignments are evaluated with different metrics: Sum Of Pairs (SoP), Total Column score (TC), iRMSD, Total Consistency Score (TCS), etc. Skip by passing `--skip_eval`.
+5. **Report**: The information collected across the runs is reported in a Shiny app and a summary table in MultiQC. Skip by passing `--skip_shiny` and `--skip_multiqc`.
+
+### 1. INPUT FILES SUMMARY
+
+Summary information about the input fasta files is calculated. Skip by passing `--skip_stats`.
+
+1. Sequence similarity. Calculates pairwise and average sequence similarity using TCOFFEE. Activate with `--calc_sim` (default: false).
+2. General summary. Calculates the number and the average length of sequences. Activate with `--calc_seq_stats` (default: true).
+3. Extract pLDDT. If the structures were generated by AF2, the pLDDT is extracted and reported. Activate with `--extract_plddt` (default: false).
+
+### 2. GUIDE TREES
+
+Guide trees define the order in which sequences and profiles are aligned and play a crucial role in determining the final MSA accuracy. Tree rendering techniques most commonly rely on pairwise distances between sequences.
+
+> **Note**
+> None of the aligners listed below requires an explicit definition of a guide tree: if one is needed, the aligner computes its own default guide tree. Explicitly defining a guide tree is useful if you want to test non-default combinations of guide trees and aligner methods.
+
+Available GUIDE TREE methods (Optional):
+
+- [CLUSTALO](http://clustal.org/omega/#Documentation)
+- [FAMSA](https://github.com/refresh-bio/FAMSA)
+
+### 3. ALIGN
+
+Available ALIGN methods:
+
+**SEQUENCE-BASED** (only require a fasta file as input):
+
+- [CLUSTALO](http://clustal.org/omega/#Documentation) (accepts guide tree)
+- [FAMSA](https://github.com/refresh-bio/FAMSA) (accepts guide tree)
+- [KALIGN](https://github.com/TimoLassmann/kalign)
+- [LEARNMSA](https://github.com/Gaius-Augustus/learnMSA)
+- [MAFFT](https://mafft.cbrc.jp/alignment/server/index.html)
+- [MAGUS](https://github.com/vlasmirnov/MAGUS) (accepts guide tree)
+- [MUSCLE5](https://drive5.com/muscle5/manual/)
+- [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) (accepts guide tree)
+
+**SEQUENCE- and STRUCTURE-BASED** (require both fasta and structures as input):
+
+- [3DCOFFEE](https://tcoffee.org/Projects/expresso/index.html) (accepts guide tree)
+
+**STRUCTURE-BASED** (only require structures as input):
+
+- [MTMALIGN](https://bio.tools/mtm-align)
+
+### 4. EVALUATE
+
+Optionally, the produced MSAs can be evaluated. Skip with `--skip_eval`.
+
+**SEQUENCE-BASED** (no extra input required):
+
+1. Calculates the number of gaps and their average across sequences. Activate using `--calc_gaps` (default: true).
+
+**REFERENCE-BASED**:
+
+The reference MSAs (see samplesheet) are used to evaluate the quality of the produced MSA.
+
+2. Sum Of Pairs (SP). Calculates the SP score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_sp` (default: true).
+3. Total Column (TC). Calculates the TC score using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_tc` (default: true).
+
+**STRUCTURE-BASED**:
+
+The provided structures (see samplesheet) are used to evaluate the quality of the alignment.
+
+4. iRMSD. Calculates the iRMSD using the [TCOFFEE](https://tcoffee.readthedocs.io/en/latest/index.html) implementation. Activate using `--calc_irmsd` (default: false).
+
+### 5. REPORT
+
+Finally, a summary table with all the computed statistics and evaluations is reported in MultiQC (skip with `--skip_multiqc`).
+Moreover, a Shiny app is generated with interactive summary plots.
+
+> [!WARNING]
+> You will need to have [shiny](https://shiny.posit.co/py/) installed to run it! See the [output documentation](https://nf-co.re/multiplesequencealign/output) for more info.

## Samplesheet input

-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+The sample sheet defines the input data that the pipeline will process.
+It should look like this:

-```bash
---input '[path to samplesheet file]'
+`samplesheet.csv`:
+
+```csv
+id,fasta,reference,structures
+seatoxin,seatoxin.fa,seatoxin-ref.fa,seatoxin_structures
+toxin,toxin.fa,toxin-ref.fa,toxin_structures
 ```

-### Multiple runs of the same sample
+Each row represents a set of sequences (in this case the seatoxin and toxin protein families) to be processed.

-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+| Column       | Description |
+| ------------ | ----------- |
+| `id`         | Required. Name of the set of sequences. It can correspond to the protein family name or to an internal id. It should be unique. |
+| `fasta`      | Required (at least one of fasta and structures must be provided). Full path to the fasta file that contains the sequences to be aligned. |
+| `reference`  | Optional. Full path to the reference alignment. It is used for the reference-based evaluation steps. It can be left empty. |
+| `structures` | Required (at least one of fasta and structures must be provided). Full path to the folder that contains the protein structures for the sequences to be aligned. It is used for structural aligners and structure-based evaluation steps. It can be left empty. |
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
-```
+> [!NOTE]
+> You can have some samples with structures and/or references and some without. The pipeline will run the modules requiring structures/references only on the samples for which you have provided the required information; the others will simply be skipped.
+
+## Toolsheet input
+
+The toolsheet is provided as an input so that the pipeline can be used as a benchmarking framework: to test multiple argument settings per tool, add one toolsheet entry per combination of tool and arguments.

-### Full samplesheet
+Each line of the toolsheet defines a combination of guide tree and multiple sequence aligner to run, with the respective arguments to be used.

-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
+It should look as follows:

-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+`toolsheet.csv`:

-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+```csv
+tree,args_tree,aligner,args_aligner,
+FAMSA, -gt upgma -medoidtree, FAMSA,
+, ,TCOFFEE,
+FAMSA,,REGRESSIVE,
 ```

-| Column    | Description |
-| --------- | ----------- |
-| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+> [!NOTE]
+> Each of the trees and aligners is available standalone, so `args_tree` and `args_aligner` can be left empty if you are happy with the default settings of each method. `tree` can also be left empty, in which case the default guide tree of each aligner is used.
+
+> [!NOTE]
+> Use the exact spelling as listed above!
+
+`tree` is the tool used to build the tree. (optional)
+
+Arguments to the tree tool can be provided using `args_tree`. Please refer to each tool's documentation. (optional)
+
+The `aligner` column contains the tool to run the alignment. (required)
+
+Finally, the arguments to the aligner tool can be set by using the `args_aligner` column.
(optional) -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +| Column | Description | +| -------------- | -------------------------------------------------------------------------------- | +| `tree` | Optional. Tool used to build the tree. | +| `args_tree` | Optional. Arguments to the tree tool. Please refer to each tool's documentation. | +| `aligner` | Required. Tool to run the alignment. Available options listed above. | +| `args_aligner` | Optional. Arguments to the alignment tool | ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/multiplesequencealign --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/multiplesequencealign --input ./samplesheet.csv --tools ./toolsheet.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -89,8 +188,8 @@ with `params.yaml` containing: ```yaml input: './samplesheet.csv' +tools: "./toolsheet.csv" outdir: './results/' -genome: 'GRCh37' <...> ``` diff --git a/modules.json b/modules.json index 4536422e..f7f5635c 100644 --- a/modules.json +++ b/modules.json @@ -17,13 +17,14 @@ }, "csvtk/concat": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "git_sha": "cfe2a24902bfdfe8132f11461ffda92d257f9f09", "installed_by": ["modules"] }, "csvtk/join": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20", + "installed_by": ["modules"], + "patch": "modules/nf-core/csvtk/join/csvtk-join.diff" }, "famsa/align": { "branch": "master", @@ -67,7 +68,7 @@ }, "mtmalign/align": { "branch": "master", - "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", + "git_sha": "7bfb142c3729c1c76198c237a614215d92fe935c", "installed_by": ["modules"] }, "multiqc": { @@ -80,6 +81,11 @@ "git_sha": "faf557ba56156ac0e5de76a25c1e3df11c944f59", "installed_by": ["modules"] }, + "pigz/compress": { + "branch": "master", + "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d", + "installed_by": ["modules"] + }, "pigz/uncompress": { "branch": "master", "git_sha": "d7f0de8aae7bf84b080dfdcf4e294bf11a46a51c", @@ -87,7 +93,7 @@ }, "tcoffee/align": { "branch": "master", - "git_sha": "c65917d37cdaa9d6c26fccf8f7c313aab1a51d8a", + "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", "installed_by": ["modules"] }, "tcoffee/alncompare": { @@ -102,12 +108,12 @@ }, "tcoffee/seqreformat": { "branch": "master", - "git_sha": "b04c647f465bea2c5bb9871503182236cd65b246", + "git_sha": "32ae618a60a25a870b5fa47ea2060ddcd911ab53", "installed_by": ["modules"] }, "tcoffee/tcs": { "branch": "master", - "git_sha": "2d5ea4959c36da8c21e74a0c5e8ecc6b101b999e", + "git_sha": "1cacaceabae75b0c3bc393dee52cb6a5020fcb5c", "installed_by": ["modules"] }, "untar": { diff --git a/modules/local/calculate_gaps.nf b/modules/local/calculate_gaps.nf index 4f324ad2..e1571b44 100644 --- a/modules/local/calculate_gaps.nf +++ b/modules/local/calculate_gaps.nf @@ -35,7 +35,7 @@ process CALC_GAPS { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_gaps.csv diff --git a/modules/local/calculate_seqstats.nf b/modules/local/calculate_seqstats.nf index 42361e3e..22ead050 100644 --- a/modules/local/calculate_seqstats.nf +++ b/modules/local/calculate_seqstats.nf @@ -34,12 +34,13 @@ 
process CALCULATE_SEQSTATS { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_seqstats.csv touch ${prefix}_seqstats_summary.csv touch ${prefix}_multiqc.tsv + cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') END_VERSIONS diff --git a/modules/local/create_tcoffee_template.nf b/modules/local/create_tcoffee_template.nf index f2181630..4238c916 100644 --- a/modules/local/create_tcoffee_template.nf +++ b/modules/local/create_tcoffee_template.nf @@ -30,7 +30,7 @@ process CREATE_TCOFFEETEMPLATE { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_template.txt diff --git a/modules/local/extract_plddt.nf b/modules/local/extract_plddt.nf index 1b43f893..9191d796 100644 --- a/modules/local/extract_plddt.nf +++ b/modules/local/extract_plddt.nf @@ -40,9 +40,10 @@ process EXTRACT_PLDDT { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}_plddt.csv + touch ${prefix}_plddt_summary.csv + touch ${prefix}_full_plddt.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 4a069c5c..ee54c43e 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -1,8 +1,8 @@ process MULTIQC { label 'process_medium' - conda 'bioconda::multiqc=1.22.1' - container "community.wave.seqera.io/library/pip_multiqc:2c2e276ad8997cc4" + conda 'bioconda::multiqc=1.22.2' + container "community.wave.seqera.io/library/multiqc:1.22.1--4886de6095538010" input: path multiqc_config diff --git a/modules/local/parse_sim.nf b/modules/local/parse_sim.nf index 66c9cd39..b89d50aa 100644 --- a/modules/local/parse_sim.nf +++ b/modules/local/parse_sim.nf @@ -10,7 +10,7 @@ process PARSE_SIM { tuple val(meta), path(infile) output: - tuple val (meta), path("${prefix}.sim_tot"), emit: sim_tot + tuple val (meta), path("*.sim_tot"), emit: sim_tot path "versions.yml", emit: versions when: @@ -36,7 +36,7 @@ process PARSE_SIM { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}.sim_tot diff --git a/modules/local/prepare_multiqc.nf b/modules/local/prepare_multiqc.nf index f6d3e4be..018835ae 100644 --- a/modules/local/prepare_multiqc.nf +++ b/modules/local/prepare_multiqc.nf @@ -30,7 +30,7 @@ process PREPARE_MULTIQC { """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" + prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_multiqc_table.csv diff --git a/modules/local/prepare_shiny.nf b/modules/local/prepare_shiny.nf index 445a06a4..9e006770 100644 --- a/modules/local/prepare_shiny.nf +++ b/modules/local/prepare_shiny.nf @@ -39,6 +39,7 @@ process PREPARE_SHINY { """ touch shiny_data.csv touch shiny_app.R + touch run.sh cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/csvtk/concat/environment.yml b/modules/nf-core/csvtk/concat/environment.yml index ed1ba26b..ac58390c 100644 --- a/modules/nf-core/csvtk/concat/environment.yml +++ b/modules/nf-core/csvtk/concat/environment.yml @@ -1,7 +1,9 @@ -name: csvtk_concat +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "csvtk_concat" channels: - conda-forge - bioconda - defaults dependencies: - - bioconda::csvtk=0.23.0 + - "bioconda::csvtk=0.30.0" diff --git 
a/modules/nf-core/csvtk/concat/main.nf b/modules/nf-core/csvtk/concat/main.nf index 16e59f64..741ed551 100644 --- a/modules/nf-core/csvtk/concat/main.nf +++ b/modules/nf-core/csvtk/concat/main.nf @@ -4,8 +4,8 @@ process CSVTK_CONCAT { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : - 'biocontainers/csvtk:0.23.0--h9ee0642_0' }" + 'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0' : + 'biocontainers/csvtk:0.30.0--h9ee0642_0' }" input: tuple val(meta), path(csv) @@ -40,4 +40,16 @@ process CSVTK_CONCAT { csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) END_VERSIONS """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + touch ${prefix}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ } diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test b/modules/nf-core/csvtk/concat/tests/main.nf.test new file mode 100644 index 00000000..13f20147 --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test @@ -0,0 +1,67 @@ +// nf-core modules test csvtk/concat +nextflow_process { + + name "Test Process CSVTK_CONCAT" + script "../main.nf" + process "CSVTK_CONCAT" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/concat" + + test("tsv - concat - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("tsv - concat - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test.snap b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap new file mode 100644 index 00000000..777114ba --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "tsv - concat - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,c04e6be6df50305cd689a92aacec947b" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c04e6be6df50305cd689a92aacec947b" + ] + } + ], + "timestamp": "2024-05-17T12:43:26.787254" + }, + "tsv - concat - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,917fe5d857f04b58e0f49c384d167cec" + 
]
+                ],
+                "1": [
+                    "versions.yml:md5,c04e6be6df50305cd689a92aacec947b"
+                ],
+                "csv": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.csv:md5,917fe5d857f04b58e0f49c384d167cec"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,c04e6be6df50305cd689a92aacec947b"
+                ]
+            }
+        ],
+        "timestamp": "2024-05-17T12:43:17.930902"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/csvtk/concat/tests/tags.yml b/modules/nf-core/csvtk/concat/tests/tags.yml
new file mode 100644
index 00000000..0d10e7c9
--- /dev/null
+++ b/modules/nf-core/csvtk/concat/tests/tags.yml
@@ -0,0 +1,2 @@
+csvtk/concat:
+  - "modules/nf-core/csvtk/concat/**"
diff --git a/modules/nf-core/csvtk/join/csvtk-join.diff b/modules/nf-core/csvtk/join/csvtk-join.diff
new file mode 100644
index 00000000..fded83ab
--- /dev/null
+++ b/modules/nf-core/csvtk/join/csvtk-join.diff
@@ -0,0 +1,24 @@
+Changes in module 'nf-core/csvtk/join'
+--- modules/nf-core/csvtk/join/main.nf
++++ modules/nf-core/csvtk/join/main.nf
+@@ -22,12 +22,17 @@
+     prefix = task.ext.prefix ?: "${meta.id}"
+     out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? "tsv" : "csv"
+     """
++
++    # if fewer than two input files are provided, add an empty file as the second input
++    touch empty.csv
++
+     csvtk \\
+         join \\
+         $args \\
+         --num-cpus $task.cpus \\
+         --out-file ${prefix}.${out_extension} \\
+-        $csv
++        $csv \\
++        empty.csv
+
+     cat <<-END_VERSIONS > versions.yml
+     "${task.process}":
+
+************************************************************
diff --git a/modules/nf-core/csvtk/join/environment.yml b/modules/nf-core/csvtk/join/environment.yml
index b488c861..5b6c6468 100644
--- a/modules/nf-core/csvtk/join/environment.yml
+++ b/modules/nf-core/csvtk/join/environment.yml
@@ -4,4 +4,4 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - bioconda::csvtk=0.26.0
+  - bioconda::csvtk=0.30.0
diff --git a/modules/nf-core/csvtk/join/main.nf b/modules/nf-core/csvtk/join/main.nf
index bf02e7f5..47acc69b 100644
--- a/modules/nf-core/csvtk/join/main.nf
+++ b/modules/nf-core/csvtk/join/main.nf
@@ -4,8 +4,8 @@ process CSVTK_JOIN {
     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/csvtk:0.26.0--h9ee0642_0':
-        'biocontainers/csvtk:0.26.0--h9ee0642_0' }"
+        'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0':
+        'biocontainers/csvtk:0.30.0--h9ee0642_0' }"

     input:
     tuple val(meta), path(csv)
@@ -22,12 +22,17 @@ process CSVTK_JOIN {
     prefix = task.ext.prefix ?: "${meta.id}"
     out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? "tsv" : "csv"
     """
+
+    # if fewer than two input files are provided, add an empty file as the second input
+    touch empty.csv
+
     csvtk \\
         join \\
         $args \\
         --num-cpus $task.cpus \\
         --out-file ${prefix}.${out_extension} \\
-        $csv
+        $csv \\
+        empty.csv

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
@@ -36,7 +41,6 @@ process CSVTK_JOIN {
     """

     stub:
-    def args = task.ext.args ?: ''
     prefix = task.ext.prefix ?: "${meta.id}"
     out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ?
"tsv" : "csv" """ diff --git a/modules/nf-core/csvtk/join/tests/main.nf.test b/modules/nf-core/csvtk/join/tests/main.nf.test new file mode 100644 index 00000000..3cf178c4 --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/main.nf.test @@ -0,0 +1,64 @@ +nextflow_process { + + name "Test Process CSVTK_JOIN" + script "../main.nf" + process "CSVTK_JOIN" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/join" + + test("join - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("join - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/csvtk/join/tests/main.nf.test.snap b/modules/nf-core/csvtk/join/tests/main.nf.test.snap new file mode 100644 index 00000000..b124788b --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "join - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d0ad82ca096c7e05eb9f9a04194c9e30" + ] + ], + "1": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d0ad82ca096c7e05eb9f9a04194c9e30" + ] + ], + "versions": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ] + } + ], + "timestamp": "2024-05-21T15:45:44.045434" + }, + "join - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ] + } + ], + "timestamp": "2024-05-21T15:45:55.59201" + } +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/join/tests/nextflow.config b/modules/nf-core/csvtk/join/tests/nextflow.config new file mode 100644 index 00000000..1b14393a --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CSVTK_JOIN { + ext.args = "--fields 'ID;ID' -p -e -d \"\t\" -D \",\"" + } +} diff --git a/modules/nf-core/csvtk/join/tests/tags.yml b/modules/nf-core/csvtk/join/tests/tags.yml new file mode 100644 index 00000000..6c3a0fa6 --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/tags.yml @@ -0,0 +1,2 @@ +csvtk/join: + - "modules/nf-core/csvtk/join/**" diff --git a/modules/nf-core/mtmalign/align/main.nf b/modules/nf-core/mtmalign/align/main.nf index c6ad07b7..933d2c74 100644 --- a/modules/nf-core/mtmalign/align/main.nf +++ b/modules/nf-core/mtmalign/align/main.nf @@ -10,13 +10,13 @@ process MTMALIGN_ALIGN { 'biocontainers/mulled-v2-5bcf71dc66dac33d8e003c5e78043b80f5c7f269:8f0e486d46f7ab38892c1a8f78d2894a549d03b5-0' }" input: - tuple val(meta), path('*.pdb', arity: '2..*') + tuple val(meta), 
path(pdbs) val(compress) output: - tuple val(meta), path("./mTM_result/${prefix}.aln${compress ? '.gz' : ''}"), emit: alignment - tuple val(meta), path("./mTM_result/${prefix}.pdb${compress ? '.gz' : ''}"), emit: structure - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}.aln${compress ? '.gz' : ''}"), emit: alignment + tuple val(meta), path("${prefix}.pdb${compress ? '.gz' : ''}"), emit: structure + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -38,12 +38,16 @@ process MTMALIGN_ALIGN { mtm-align -i input_list.txt -o ${prefix}.pdb # -o does not affect the fasta naming, so move it to the new name mv ./mTM_result/result.fasta ./mTM_result/${prefix}.aln + # Remove ".pdb" from the ids in the alignment file + sed -i 's/\\.pdb//g' ./mTM_result/${prefix}.aln # compress both output files if ${compress}; then pigz -p ${task.cpus} ./mTM_result/${prefix}.aln ./mTM_result/${prefix}.pdb fi - tree + + # move everything in mTM_result to the working directory + mv ./mTM_result/* . # mtm-align -v prints the wrong version 20180725, so extract it from the cosmetic output in the help message cat <<-END_VERSIONS > versions.yml @@ -54,12 +58,10 @@ process MTMALIGN_ALIGN { """ stub: - def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" """ - mkdir mTM_result - touch mTM_result/${prefix}.aln${compress ? '.gz' : ''} - touch mTM_result/${prefix}.pdb${compress ? '.gz' : ''} + touch ${prefix}.aln${compress ? '.gz' : ''} + touch ${prefix}.pdb${compress ? '.gz' : ''} # mtm-align -v prints the wrong version 20180725, so extract it from the cosmetic output in the help message cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/mtmalign/align/tests/main.nf.test b/modules/nf-core/mtmalign/align/tests/main.nf.test index cb3f3885..ada32c39 100644 --- a/modules/nf-core/mtmalign/align/tests/main.nf.test +++ b/modules/nf-core/mtmalign/align/tests/main.nf.test @@ -39,11 +39,11 @@ nextflow_process { { assert process.success }, // mTMalign may be nondeterministic, just check if the pdbs are all in there //{ assert snapshot(process.out).match() } - { assert path(process.out.alignment[0][1]).getText().contains(">1.pdb") }, - { assert path(process.out.alignment[0][1]).getText().contains(">2.pdb") }, - { assert path(process.out.alignment[0][1]).getText().contains(">3.pdb") }, - { assert path(process.out.alignment[0][1]).getText().contains(">4.pdb") }, - { assert path(process.out.alignment[0][1]).getText().contains(">5.pdb") }, + { assert path(process.out.alignment[0][1]).getText().contains(">1ahl") }, + { assert path(process.out.alignment[0][1]).getText().contains(">1apf") }, + { assert path(process.out.alignment[0][1]).getText().contains(">1atx") }, + { assert path(process.out.alignment[0][1]).getText().contains(">1bds") }, + { assert path(process.out.alignment[0][1]).getText().contains(">1sh1") }, { assert snapshot(process.out.versions).match("versions0") } ) } @@ -78,12 +78,11 @@ nextflow_process { assertAll( { assert process.success }, // mTMalign may be nondeterministic, just check if the pdbs are all in there - //{ assert snapshot(process.out).match() } - { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1.pdb") }, - { assert path(process.out.alignment[0][1]).getTextGzip().contains(">2.pdb") }, - { assert path(process.out.alignment[0][1]).getTextGzip().contains(">3.pdb") }, - { assert path(process.out.alignment[0][1]).getTextGzip().contains(">4.pdb") }, - { assert 
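
With original names staged and the new `sed 's/\.pdb//g'` pass in the module, the alignment headers are now the bare structure ids, which is what these updated assertions verify. The five `contains()` checks could also be collapsed with Groovy's `every` (a sketch assuming the same `process.out` shape):

```groovy
// One assertion per test instead of five near-identical ones.
def aln = path(process.out.alignment[0][1]).getText()
assert ['1ahl', '1apf', '1atx', '1bds', '1sh1'].every { id ->
    aln.contains(">${id}")
}
```
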
path(process.out.alignment[0][1]).getTextGzip().contains(">5.pdb") }, + { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1ahl") }, + { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1apf") }, + { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1atx") }, + { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1bds") }, + { assert path(process.out.alignment[0][1]).getTextGzip().contains(">1sh1") }, { assert snapshot(process.out.versions).match("versions1") } ) } diff --git a/modules/nf-core/mtmalign/align/tests/main.nf.test.snap b/modules/nf-core/mtmalign/align/tests/main.nf.test.snap index eb321457..0eefb191 100644 --- a/modules/nf-core/mtmalign/align/tests/main.nf.test.snap +++ b/modules/nf-core/mtmalign/align/tests/main.nf.test.snap @@ -1,10 +1,26 @@ { - "versions": { + "versions0": { "content": [ [ "versions.yml:md5,7cbacec15bb9e0c8cbb27610bde74c10" ] ], - "timestamp": "2024-01-25T18:21:22.385207003" + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:01:13.729263689" + }, + "versions1": { + "content": [ + [ + "versions.yml:md5,7cbacec15bb9e0c8cbb27610bde74c10" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-03T11:01:37.28539854" } } \ No newline at end of file diff --git a/modules/nf-core/pigz/compress/environment.yml b/modules/nf-core/pigz/compress/environment.yml new file mode 100644 index 00000000..7551d187 --- /dev/null +++ b/modules/nf-core/pigz/compress/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "pigz_compress" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "pigz=2.8" diff --git a/modules/nf-core/pigz/compress/main.nf b/modules/nf-core/pigz/compress/main.nf new file mode 100644 index 00000000..152e7006 --- /dev/null +++ b/modules/nf-core/pigz/compress/main.nf @@ -0,0 +1,45 @@ +process PIGZ_COMPRESS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pigz:2.8': + 'biocontainers/pigz:2.8' }" + + input: + tuple val(meta), path(raw_file) + + output: + tuple val(meta), path("$archive"), emit: archive + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + archive = raw_file.toString() + ".gz" + """ + # Note: needs --stdout for pigz to avoid the following issue: + # pigz: skipping: ${raw_file} is a symbolic link + pigz --processes $task.cpus --stdout --force ${args} ${raw_file} > ${archive} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + archive = raw_file.toString() + ".gz" + """ + touch ${archive} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz:\$(echo \$(pigz --version 2>&1) | sed 's/^.*pigz\\w*//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/pigz/compress/meta.yml b/modules/nf-core/pigz/compress/meta.yml new file mode 100644 index 00000000..42efd735 --- /dev/null +++ b/modules/nf-core/pigz/compress/meta.yml @@ -0,0 +1,47 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "pigz_compress" +description: Compresses files with pigz. +keywords: + - compress + - gzip + - parallelized +tools: + - "pigz": + description: "Parallel implementation of the gzip algorithm." + homepage: "https://zlib.net/pigz/" + documentation: "https://zlib.net/pigz/pigz.pdf" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - raw_file: + type: file + description: File to be compressed + pattern: "*.*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
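
The `--stdout` redirection above matters because Nextflow stages inputs as symlinks and plain `pigz` skips symbolic links, as the in-script comment records. Later in this changeset the top-level workflow includes this module to compress alignments after evaluation; a minimal wiring sketch (hypothetical include path and file names):

```groovy
include { PIGZ_COMPRESS } from './modules/nf-core/pigz/compress/main'

workflow {
    // hypothetical [ meta, alignment ] channel
    ch_msa = Channel.of([ [id: 'seatoxin'], file('results/seatoxin.aln') ])
    PIGZ_COMPRESS(ch_msa)
    PIGZ_COMPRESS.out.archive.view()   // -> [[id:seatoxin], seatoxin.aln.gz]
}
```
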
`[ id:'sample1', single_end:false ]` + + - archive: + type: file + description: The compressed file + pattern: "*.gz" + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@leoisl" +maintainers: + - "@leoisl" diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test b/modules/nf-core/pigz/compress/tests/main.nf.test new file mode 100644 index 00000000..248d40fb --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/main.nf.test @@ -0,0 +1,49 @@ +nextflow_process { + name "Test Process PIGZ_COMPRESS" + script "../main.nf" + process "PIGZ_COMPRESS" + + tag "modules" + tag "modules_nfcore" + tag "pigz" + tag "pigz/compress" + + test("sarscov2 - genome - fasta") { + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 - genome - fasta - stub") { + options "-stub-run" + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.archive[0][1]).name).match() } + ) + } + } +} diff --git a/modules/nf-core/pigz/compress/tests/main.nf.test.snap b/modules/nf-core/pigz/compress/tests/main.nf.test.snap new file mode 100644 index 00000000..6e50456f --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/main.nf.test.snap @@ -0,0 +1,37 @@ +{ + "sarscov2 - genome - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "1": [ + "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad" + ], + "archive": [ + [ + { + "id": "test" + }, + "genome.fasta.gz:md5,6e9fe4042a72f2345f644f239272b7e6" + ] + ], + "versions": [ + "versions.yml:md5,ca30e9e1ffa1394ba7eefdac8cf3a3ad" + ] + } + ], + "timestamp": "2023-12-11T22:39:53.350546" + }, + "sarscov2 - genome - fasta - stub": { + "content": [ + "genome.fasta.gz" + ], + "timestamp": "2023-12-11T22:52:24.309192" + } +} \ No newline at end of file diff --git a/modules/nf-core/pigz/compress/tests/tags.yml b/modules/nf-core/pigz/compress/tests/tags.yml new file mode 100644 index 00000000..42c46bfa --- /dev/null +++ b/modules/nf-core/pigz/compress/tests/tags.yml @@ -0,0 +1,2 @@ +pigz/compress: + - "modules/nf-core/pigz/compress/**" diff --git a/modules/nf-core/tcoffee/align/main.nf b/modules/nf-core/tcoffee/align/main.nf index e159bb80..a14964c9 100644 --- a/modules/nf-core/tcoffee/align/main.nf +++ b/modules/nf-core/tcoffee/align/main.nf @@ -16,7 +16,7 @@ process TCOFFEE_ALIGN { output: tuple val(meta), path("*.aln{.gz,}"), emit: alignment // in the args there might be the request to generate a lib file, so the following is an optional output - tuple val(meta), path("*.*lib") , emit: lib, optional : true + tuple val(meta), path("*.*lib") , emit: lib, optional : true path "versions.yml" , emit: versions when: @@ -27,9 +27,8 @@ process TCOFFEE_ALIGN { def prefix = task.ext.prefix ?: "${meta.id}" def tree_args = tree ? "-usetree $tree" : "" def template_args = template ? "-template_file $template" : "" - def write_output = compress ? 
(pigz">
" >(pigz -cp ${task.cpus} > ${prefix}.aln.gz)" : "> ${prefix}.aln" - // using >() is necessary to preserve the tcoffee return value, - // so nextflow knows to display an error when it failed + def outfile = compress ? "stdout" : "${prefix}.aln" + def write_output = compress ? " | pigz -cp ${task.cpus} > ${prefix}.aln.gz" : "" """ export TEMP='./' t_coffee -seq ${fasta} \ @@ -37,9 +36,17 @@ $template_args \ $args \ -thread ${task.cpus} \ - -outfile stdout \ + -outfile $outfile \ $write_output + # If the stdout file exists and compress is true, then compress the file + # This is a patch for the current behaviour of the regressive algorithm + # that does not support stdout redirection + if [ -f stdout ] && [ "$compress" = true ]; then + pigz -cp ${task.cpus} < stdout > ${prefix}.aln.gz + rm stdout + fi + cat <<-END_VERSIONS > versions.yml "${task.process}": tcoffee: \$( t_coffee -version | awk '{gsub("Version_", ""); print \$3}') diff --git a/modules/nf-core/tcoffee/align/tests/lib.config b/modules/nf-core/tcoffee/align/tests/lib.config new file mode 100644 index 00000000..2fc113ef --- /dev/null +++ b/modules/nf-core/tcoffee/align/tests/lib.config @@ -0,0 +1,3 @@ +process { + ext.args = { "-output fasta_aln -out_lib=sample_lib1.tc_lib" } +} \ No newline at end of file diff --git a/modules/nf-core/tcoffee/seqreformat/environment.yml b/modules/nf-core/tcoffee/seqreformat/environment.yml new file mode 100644 index 00000000..84afe8aa --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/environment.yml @@ -0,0 +1,7 @@ +name: tcoffee_seqreformat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::t-coffee=13.46.0.919e8c6b diff --git a/modules/nf-core/tcoffee/seqreformat/main.nf b/modules/nf-core/tcoffee/seqreformat/main.nf index 00a9a97d..774ae2be 100644 --- a/modules/nf-core/tcoffee/seqreformat/main.nf +++ b/modules/nf-core/tcoffee/seqreformat/main.nf @@ -38,7 +38,7 @@ process TCOFFEE_SEQREFORMAT { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" """ - touch "${prefix}_${seq_reformat_type}.txt" + touch "${prefix}.txt" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test new file mode 100644 index 00000000..7a5492c5 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process TCOFFEE_SEQREFORMAT" + script "../main.nf" + process "TCOFFEE_SEQREFORMAT" + + tag "modules" + tag "modules_nfcore" + tag "tcoffee" + tag "tcoffee/seqreformat" + + test("setoxin - fasta") { + + when { + process { + """ + input[0] = [ [ id:'test' ], + file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true) + ] + """ + } + } + + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.formatted_file).match("formatted_file")}, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap new file mode 100644 index 00000000..150102ee --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/main.nf.test.snap @@ -0,0 +1,23 @@ +{ + "formatted_file": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.txt:md5,fcd4691daf120c88ec5def7ac06fb562" + ] + ] + ], + "timestamp": "2023-11-28T11:56:22.705185493" +
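(pigz">

Back in the `tcoffee/align` hunk above, the deleted comment explained that `>()` preserved t_coffee's exit status; the replacement pipe relies on the shell failing the whole pipeline when t_coffee fails, which holds under nf-core's default `process.shell = ['/bin/bash', '-euo', 'pipefail']` but not under a bare `bash -e`. A fragment of the compressing branch with that guard made explicit (sketch only, names as in the module):

```groovy
script:
"""
# pipefail makes a t_coffee failure fail the task even though pigz exits 0
set -o pipefail
t_coffee -seq ${fasta} -outfile stdout | pigz -cp ${task.cpus} > ${prefix}.aln.gz
"""
```
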
}, + "versions": { + "content": [ + [ + "versions.yml:md5,68fb841e6e44274d430a1382bb0bbd14" + ] + ], + "timestamp": "2023-11-28T11:56:22.717235196" + } +} \ No newline at end of file diff --git a/modules/nf-core/tcoffee/seqreformat/tests/nextflow.config b/modules/nf-core/tcoffee/seqreformat/tests/nextflow.config new file mode 100644 index 00000000..910cc175 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = "-output=sim_idscore" +} diff --git a/modules/nf-core/tcoffee/seqreformat/tests/tags.yml b/modules/nf-core/tcoffee/seqreformat/tests/tags.yml new file mode 100644 index 00000000..268d8814 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/tests/tags.yml @@ -0,0 +1,2 @@ +tcoffee/seqreformat: + - "modules/nf-core/tcoffee/seqreformat/**" diff --git a/modules/nf-core/tcoffee/tcs/main.nf.test b/modules/nf-core/tcoffee/tcs/main.nf.test deleted file mode 100644 index 6bf1f81e..00000000 --- a/modules/nf-core/tcoffee/tcs/main.nf.test +++ /dev/null @@ -1,29 +0,0 @@ -nextflow_process { - - name "Test Process TCOFFEE_TCS" - script "modules/nf-core/tcoffee/tcs/main.nf" - process "TCOFFEE_TCS" - - test("Should run without failures") { - - when { - params { - // define parameters here. Example: - // outdir = "tests/results" - } - process { - """ - // define inputs of the process here. Example: - // input[0] = file("test-file.txt") - """ - } - } - - then { - assert process.success - assert snapshot(process.out).match() - } - - } - -} diff --git a/modules/nf-core/tcoffee/tcs/tests/lib.config b/modules/nf-core/tcoffee/tcs/tests/lib.config index 723c92d5..56712f63 100644 --- a/modules/nf-core/tcoffee/tcs/tests/lib.config +++ b/modules/nf-core/tcoffee/tcs/tests/lib.config @@ -1,5 +1,5 @@ process { withName: "TCOFFEE_ALIGN"{ - ext.args = { "-output fasta_aln -out_lib=sample_lib1.tc_lib" } + ext.args = { "-output fasta_aln -out_lib sample_lib1.tc_lib" } } -} \ No newline at end of file +} diff --git a/nextflow.config b/nextflow.config index 09f6f230..6a7777bd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,24 +14,24 @@ plugins { params { // Input options - input = null - tools = null + input = null + tools = null // Stats skip_stats = false calc_sim = false calc_seq_stats = true - extract_plddt = true + extract_plddt = false // Evaluation skip_eval = false calc_sp = true - calc_tc = false + calc_tc = true calc_irmsd = false calc_gaps = true - calc_tcs = true + calc_tcs = false - no_compression = false + skip_compression = false // MultiQC options multiqc_config = null @@ -198,6 +198,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_pdb { includeConfig 'conf/test_pdb.config' } test_parameters { includeConfig 'conf/test_parameters.config' } } diff --git a/nextflow_schema.json b/nextflow_schema.json index 80ecdea8..b770573c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -151,10 +151,10 @@ "compression": { "title": "Compression", "type": "object", - "description": "Define wether to run compression", + "description": "Define whether and how to run compression", "default": "", "properties": { - "no_compression": { + "skip_compression": { "type": "boolean", "description": "Produce uncompressed alignment files" } diff --git a/subworkflows/local/align.nf b/subworkflows/local/align.nf index f0c7f616..5dffca96 100644 --- a/subworkflows/local/align.nf +++ b/subworkflows/local/align.nf @@ -20,17 +20,19 @@ include { MTMALIGN_ALIGN } from
'../../modules/nf-core/mtma workflow ALIGN { take: - ch_fastas // channel: meta, /path/to/file.fasta - ch_tools // string: - ch_structures // channel: meta, [/path/to/file.pdb,/path/to/file.pdb,/path/to/file.pdb] + ch_fastas // channel: meta, /path/to/file.fasta + ch_tools // channel: meta_tree, meta_aligner + // [[tree:, args_tree:, args_tree_clean: ], [aligner:, args_aligner:, args_aligner_clean:]] + // e.g.[[tree:FAMSA, args_tree:-gt upgma -parttree, args_tree_clean:-gt_upgma_-parttree], [aligner:FAMSA, args_aligner:null, args_aligner_clean:null]] + // e.g.[[tree:null, args_tree:null, args_tree_clean:null], [aligner:TCOFFEE, args_aligner:-output fasta_aln, args_aligner_clean:-output_fasta_aln]] + ch_structures // channel: meta, [/path/to/file.pdb,/path/to/file.pdb,/path/to/file.pdb] + compress main: msa = Channel.empty() ch_versions = Channel.empty() - compress = ! params.no_compression - // Branch the toolsheet information into two channels // This way, it can direct the computation of guidetrees // and aligners separately @@ -77,16 +79,28 @@ workflow ALIGN { magus: it[0]["aligner"] == "MAGUS" muscle5: it[0]["aligner"] == "MUSCLE5" mtmalign: it[0]["aligner"] == "MTMALIGN" + regressive: it[0]["aligner"] == "REGRESSIVE" tcoffee: it[0]["aligner"] == "TCOFFEE" tcoffee3d: it[0]["aligner"] == "3DCOFFEE" - regressive: it[0]["aligner"] == "REGRESSIVE" } .set { ch_fasta_trees } + ch_structures.combine(ch_tools) + .map { + metastruct, template, struct, metatree, metaalign -> + [ metastruct+metatree+metaalign, template, struct ] + } + .branch{ + mtmalign: it[0]["aligner"] == "MTMALIGN" + } + .set { ch_structures_tools } + // ------------------------------------------------ // Compute the alignments // ------------------------------------------------ + // 1. SEQUENCE BASED + // ----------------- CLUSTALO ------------------ ch_fasta_trees_clustalo = ch_fasta_trees.clustalo .multiMap{ @@ -172,6 +186,19 @@ workflow ALIGN { ch_versions = ch_versions.mix(TCOFFEE_ALIGN.out.versions.first()) msa = msa.mix(TCOFFEE_ALIGN.out.alignment) + // ----------------- REGRESSIVE ------------------ + ch_fasta_trees_regressive = ch_fasta_trees.regressive + .multiMap{ + meta, fastafile, treefile -> + fasta: [ meta, fastafile ] + tree: [ meta, treefile ] + } + REGRESSIVE_ALIGN(ch_fasta_trees_regressive.fasta, ch_fasta_trees_regressive.tree, [[:],[], []], compress) + ch_versions = ch_versions.mix(REGRESSIVE_ALIGN.out.versions.first()) + msa = msa.mix(REGRESSIVE_ALIGN.out.alignment) + + // 2. SEQUENCE + STRUCTURE BASED + // ----------------- 3DCOFFEE ------------------ ch_fasta_trees_3dcoffee = ch_fasta_trees.tcoffee3d.map{ meta, fasta, tree -> [meta["id"], meta, fasta, tree] } .combine(ch_structures.map{ meta, template, structures -> [meta["id"], template, structures]}, by: 0) @@ -185,35 +212,23 @@ workflow ALIGN { ch_versions = ch_versions.mix(TCOFFEE3D_ALIGN.out.versions.first()) msa = msa.mix(TCOFFEE3D_ALIGN.out.alignment) - // ----------------- REGRESSIVE ------------------ - ch_fasta_trees_regressive = ch_fasta_trees.regressive - .multiMap{ - meta, fastafile, treefile -> - fasta: [ meta, fastafile ] - tree: [ meta, treefile ] - } - REGRESSIVE_ALIGN(ch_fasta_trees_regressive.fasta, ch_fasta_trees_regressive.tree, [[:],[], []], compress) - ch_versions = ch_versions.mix(REGRESSIVE_ALIGN.out.versions.first()) - msa = msa.mix(REGRESSIVE_ALIGN.out.alignment) + // 3. 
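
The meta merge above (`metastruct+metatree+metaalign`) uses Groovy map addition, which merges left to right with later keys winning on conflict, so each branched tuple carries the family id, guide-tree and aligner information in a single map. A runnable illustration with values mirroring the comments in this subworkflow:

```groovy
def metastruct = [id: 'seatoxin']
def metatree   = [tree: 'FAMSA', args_tree: null]
def metaalign  = [aligner: 'MTMALIGN', args_aligner: null]

// the key sets are disjoint here, so nothing is overridden
assert metastruct + metatree + metaalign ==
       [id: 'seatoxin', tree: 'FAMSA', args_tree: null,
        aligner: 'MTMALIGN', args_aligner: null]
```
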
STRUCTURE BASED // ----------------- MTMALIGN ------------------ - // this call discards the fasta, tree and template arguments, as MTMalign only takes pdb inputs - // nonetheless, this is required by the pipeline - ch_pdb_mtmalign = ch_fasta_trees.mtmalign.map{ meta, fasta, tree -> [meta["id"], meta] } - .combine(ch_structures.map{ meta, template, structures -> [meta["id"], structures]}, by: 0) - .multiMap{ - merging_id, meta, templatefile, structuresfiles -> - pdbs: [ meta, structuresfiles ] - } + ch_structures_tools.mtmalign + .multiMap{ + meta, template, struct -> + pdbs: [ meta, struct ] + }.set{ ch_pdb_mtmalign } - MTMALIGN_ALIGN(ch_pdb_mtmalign.pdbs, false) + MTMALIGN_ALIGN(ch_pdb_mtmalign.pdbs, compress) ch_versions = ch_versions.mix(MTMALIGN_ALIGN.out.versions.first()) msa = msa.mix(MTMALIGN_ALIGN.out.alignment) emit: msa - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/compute_trees.nf b/subworkflows/local/compute_trees.nf index 4027680a..57801c30 100644 --- a/subworkflows/local/compute_trees.nf +++ b/subworkflows/local/compute_trees.nf @@ -4,8 +4,6 @@ include { FAMSA_GUIDETREE } from '../../modules/nf-core/famsa/guidetree/main' include { CLUSTALO_GUIDETREE } from '../../modules/nf-core/clustalo/guidetree/main' -include { MAGUS_GUIDETREE } from '../../modules/nf-core/magus/guidetree/main' - workflow COMPUTE_TREES { @@ -28,7 +26,6 @@ workflow COMPUTE_TREES { .branch { famsa: it[0]["tree"] == "FAMSA" clustalo: it[0]["tree"] == "CLUSTALO" - magus: it[0]["tree"] == "MAGUS" } .set { ch_fastas_fortrees } @@ -40,11 +37,7 @@ workflow COMPUTE_TREES { ch_trees = ch_trees.mix(CLUSTALO_GUIDETREE.out.tree) ch_versions = ch_versions.mix(CLUSTALO_GUIDETREE.out.versions.first()) - MAGUS_GUIDETREE(ch_fastas_fortrees.clustalo) - ch_trees = ch_trees.mix(MAGUS_GUIDETREE.out.tree) - ch_versions = ch_versions.mix(MAGUS_GUIDETREE.out.versions.first()) - emit: trees = ch_trees // channel: [ val(meta), path(tree) ] - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/evaluate.nf b/subworkflows/local/evaluate.nf index 69eb8566..3c5dd6a9 100644 --- a/subworkflows/local/evaluate.nf +++ b/subworkflows/local/evaluate.nf @@ -1,6 +1,5 @@ -include { PIGZ_UNCOMPRESS } from '../../modules/nf-core/pigz/uncompress/main.nf' include { TCOFFEE_ALNCOMPARE as TCOFFEE_ALNCOMPARE_SP } from '../../modules/nf-core/tcoffee/alncompare' include { TCOFFEE_ALNCOMPARE as TCOFFEE_ALNCOMPARE_TC } from '../../modules/nf-core/tcoffee/alncompare' include { TCOFFEE_IRMSD } from '../../modules/nf-core/tcoffee/irmsd' @@ -28,17 +27,9 @@ workflow EVALUATE { tc_csv = Channel.empty() irmsd_csv = Channel.empty() tcs_csv = Channel.empty() + gaps_csv = Channel.empty() eval_summary = Channel.empty() - // ---------------------- - // Decompress if required - // ---------------------- - if( !params.no_compression ){ - PIGZ_UNCOMPRESS(ch_msa) - ch_msa = PIGZ_UNCOMPRESS.out.file - ch_versions = ch_versions.mix(PIGZ_UNCOMPRESS.out.versions) - } - // -------------------------- // Reference based evaluation @@ -49,7 +40,7 @@ workflow EVALUATE { // Sum of pairs - if( params.calc_sp == true){ + if( params.calc_sp ){ TCOFFEE_ALNCOMPARE_SP(alignment_and_ref) sp_scores = TCOFFEE_ALNCOMPARE_SP.out.scores ch_versions = ch_versions.mix(TCOFFEE_ALNCOMPARE_SP.out.versions.first()) @@ -65,7 +56,7 @@ workflow EVALUATE { } // Total column score - if( 
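
Dropping `== true` throughout these conditions is more than style: Groovy truth already handles the boolean case, and the explicit comparison behaves surprisingly if a param ever arrives as a non-boolean value. A small demonstration:

```groovy
// booleans: both forms agree
def calc_sp = true
assert calc_sp && calc_sp == true

// a stray string value: truthy, yet never == true
def calc_tc = 'false'
assert calc_tc            // non-empty strings are truthy in Groovy
assert !(calc_tc == true)
```
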
params.calc_tc == true){ + if( params.calc_tc ){ TCOFFEE_ALNCOMPARE_TC(alignment_and_ref) tc_scores = TCOFFEE_ALNCOMPARE_TC.out.scores ch_versions = ch_versions.mix(TCOFFEE_ALNCOMPARE_TC.out.versions.first()) @@ -81,7 +72,7 @@ workflow EVALUATE { } // number of gaps - if (params.calc_gaps == true){ + if ( params.calc_gaps ){ CALC_GAPS(ch_msa) gaps_scores = CALC_GAPS.out.gaps ch_versions = ch_versions.mix(CALC_GAPS.out.versions) @@ -104,7 +95,7 @@ workflow EVALUATE { // ------------------------------------------- // iRMSD - if (params.calc_irmsd == true){ + if (params.calc_irmsd ){ msa_str = ch_structures.map { meta, template, str -> [ meta.id, template, str ] } .cross (ch_msa.map { meta, aln -> [ meta.id, meta, aln ] }) .multiMap { chstr, chaln -> @@ -136,7 +127,7 @@ workflow EVALUATE { // ------------------------------------------- // TCS - if( params.calc_tcs == true){ + if( params.calc_tcs ){ // the second argument is empty but a lib file can be fed to it TCOFFEE_TCS(ch_msa, [[:], []]) tcs_scores = TCOFFEE_TCS.out.scores @@ -179,6 +170,6 @@ workflow EVALUATE { emit: eval_summary - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/stats.nf b/subworkflows/local/stats.nf index 44b88b3f..76b3b328 100644 --- a/subworkflows/local/stats.nf +++ b/subworkflows/local/stats.nf @@ -17,16 +17,16 @@ workflow STATS { main: - ch_versions = Channel.empty() - sim_csv = Channel.empty() - seqstats_csv = Channel.empty() - plddts_csv = Channel.empty() + ch_versions = Channel.empty() + sim_csv = Channel.empty() + seqstats_csv = Channel.empty() + plddts_csv = Channel.empty() stats_summary = Channel.empty() // // ------------------------------------------- // // SEQUENCE SIMILARITY // // ------------------------------------------- - if( params.calc_sim == true){ + if( params.calc_sim){ TCOFFEE_SEQREFORMAT_SIM(ch_seqs) tcoffee_seqreformat_sim = TCOFFEE_SEQREFORMAT_SIM.out.formatted_file ch_versions = ch_versions.mix(TCOFFEE_SEQREFORMAT_SIM.out.versions.first()) @@ -48,7 +48,7 @@ workflow STATS { // SEQUENCE GENERAL STATS // Sequence length, # of sequences, etc // ------------------------------------------- - if( params.calc_seq_stats == true){ + if( params.calc_seq_stats){ CALCULATE_SEQSTATS(ch_seqs) seqstats = CALCULATE_SEQSTATS.out.seqstats seqstats_summary = CALCULATE_SEQSTATS.out.seqstats_summary @@ -69,7 +69,7 @@ workflow STATS { // ------------------------------------------- // EXTRACT PLDDT // ------------------------------------------- - if (params.extract_plddt == true){ + if (params.extract_plddt){ EXTRACT_PLDDT(ch_structures) ch_versions = ch_versions.mix(EXTRACT_PLDDT.out.versions) plddt_summary = EXTRACT_PLDDT.out.plddt_summary @@ -104,5 +104,5 @@ workflow STATS { emit: stats_summary - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] + versions = ch_versions } diff --git a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf index c89ae745..c9cb1811 100644 --- a/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_multiplesequencealign_pipeline/main.nf @@ -73,10 +73,6 @@ workflow PIPELINE_INITIALISATION { UTILS_NFCORE_PIPELINE ( nextflow_cli_args ) - // - // Custom validation for pipeline parameters - // - validateInputParameters() // // Create channel from input file provided through params.input @@ -98,7 +94,7 @@ workflow 
PIPELINE_INITIALISATION { align_map["args_aligner_clean"] = Utils.cleanArgs(align_map["args_aligner"]) [ tree_map, align_map ] - } + }.unique() emit: samplesheet = ch_input @@ -162,12 +158,6 @@ workflow PIPELINE_COMPLETION { ======================================================================================== */ // -// Check and validate pipeline parameters -// -def validateInputParameters() { - statsParamsWarning() - evalParamsWarning() -} // // Validate channels from input samplesheet @@ -184,55 +174,6 @@ def validateInputSamplesheet(input) { return [ metas[0], fastqs ] } -// -// Warning if incorrect combination of stats parameters are used -// -def statsParamsWarning() { - if (params.skip_stats){ - if(params.calc_sim || params.calc_seq_stats) { - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_stats is set to '${params.skip_stats}'.\n" + - " The following params have values calc_sim: ${params.calc_sim} and calc_seq_stats: ${params.calc_seq_stats} \n" + - " As skip_stats is set to true, the params.calc_sim and params.calc_seq_stats will be set by default to false. \n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } - } - if (!params.skip_stats && !params.calc_sim && !params.calc_seq_stats){ - params.skip_stats = true - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_stats has been changed from false to true'.\n" + - " None of the modules withing the stats subworkflow was activated. \n" + - " To activate them you can use param.calc_sim, params.calc_seq_stats. \n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } -} - -// -// Warning if incorrect combination of eval parameters are used -// -def evalParamsWarning() { - if (params.skip_eval){ - if(params.calc_sp || params.calc_tc || params.calc_irmsd) { - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_eval is set to '${params.skip_eval}'.\n" + - " The following params have values params.calc_sp: ${params.calc_sp}, params.calc_tc: ${params.calc_tc} and params.calc_irms: ${params.calc_irmsd} \n" + - " As skip_eval is set to true, the params.calc_sp, params.calc_tc and params.calc_irmsd are set by default to false. \n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } - } - if (!params.skip_eval && !params.calc_sp && !params.calc_tc && !params.calc_irmsd ){ - params.skip_eval = true - def warning_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " WARNING: The param skip_eval has been changed from false to true'.\n" + - " None of the modules withing the stats subworkflow was activated. \n" + - " To activate them you can use param.calc_sp, params.calc_tc, params.calc_irmsd. 
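
The `.unique()` appended to the toolsheet channel above deduplicates identical guide-tree/aligner combinations after argument cleaning; the operator compares emissions with `equals()`, and Groovy lists and maps compare structurally. A sketch of the effect (illustrative values):

```groovy
Channel.of(
        [[tree: null], [aligner: 'FAMSA']],
        [[tree: null], [aligner: 'FAMSA']],   // duplicate toolsheet row
        [[tree: null], [aligner: 'TCOFFEE']]
    )
    .unique()
    .count()
    .view { n -> "distinct tool combinations: ${n}" }   // prints 2
```
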
\n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - println(warning_string) - } -} // // Generate methods description for MultiQC @@ -432,16 +373,23 @@ class Utils { if(args == null || args == ""|| args == "null"){ args = "" } - args = args + " " + required_flag + " " + default_value + def prefix = "" + if(args != ""){ + prefix = args + " " + } + args = prefix + required_flag + " " + default_value } } return args } + public static check_required_args(tool,args){ // 3DCOFFEE args = fix_args(tool,args,"3DCOFFEE", "-method", "TMalign_pair") + args = fix_args(tool,args,"3DCOFFEE", "-output", "fasta_aln") + // REGRESSIVE args = fix_args(tool,args,"REGRESSIVE", "-reg", "") args = fix_args(tool,args,"REGRESSIVE", "-reg_method", "famsa_msa") diff --git a/workflows/multiplesequencealign.nf b/workflows/multiplesequencealign.nf index 995b6813..47e7d492 100644 --- a/workflows/multiplesequencealign.nf +++ b/workflows/multiplesequencealign.nf @@ -26,6 +26,8 @@ ch_multiqc_table = Channel.empty() evaluation_summary = Channel.empty() stats_summary = Channel.empty() stats_and_evaluation_summary = Channel.empty() +ch_shiny_stats = Channel.empty() +shiny_app = Channel.fromPath(params.shiny_app) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -59,6 +61,7 @@ include { PREPARE_SHINY } from '../modules/local/prepare_shiny' include { UNTAR } from '../modules/nf-core/untar/main' include { CSVTK_JOIN as MERGE_STATS_EVAL } from '../modules/nf-core/csvtk/join/main.nf' +include { PIGZ_COMPRESS } from '../modules/nf-core/pigz/compress/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -168,6 +171,7 @@ workflow MULTIPLESEQUENCEALIGN { // // Compute summary statistics about the input sequences // + if( !params.skip_stats ){ STATS(ch_seqs, ch_structures) ch_versions = ch_versions.mix(STATS.out.versions) @@ -177,51 +181,51 @@ workflow MULTIPLESEQUENCEALIGN { // // Align // - ALIGN(ch_seqs, ch_tools, ch_structures_template) + + compress_during_align = !params.skip_compression && params.skip_eval + ALIGN(ch_seqs, ch_tools, ch_structures_template, compress_during_align) ch_versions = ch_versions.mix(ALIGN.out.versions) + if( !params.skip_compression && !compress_during_align){ + PIGZ_COMPRESS(ALIGN.out.msa) + ch_versions = ch_versions.mix(PIGZ_COMPRESS.out.versions) + } + // // Evaluate the quality of the alignment // + if( !params.skip_eval ){ EVALUATE(ALIGN.out.msa, ch_refs, ch_structures_template) ch_versions = ch_versions.mix(EVALUATE.out.versions) evaluation_summary = evaluation_summary.mix(EVALUATE.out.eval_summary) } + // // Combine stats and evaluation reports into a single CSV // - stats_summary_csv = stats_summary.map{ meta, csv -> csv } - eval_summary_csv = evaluation_summary.map{ meta, csv -> csv } - eval_summary_csv - .mix(stats_summary_csv) - .collect() - .map { - csvs -> - [ [ id:"summary_stats_eval" ], csvs ] - } - .set { stats_and_evaluation } - - if( !params.skip_stats && !params.skip_eval ){ - def number_of_stats = [params.calc_sim, params.calc_seq_stats].count(true) - def number_of_evals = [params.calc_sp, params.calc_tc, params.calc_irmsd].count(true) - if (number_of_evals > 0 && number_of_stats > 0 ){ - MERGE_STATS_EVAL(stats_and_evaluation) - stats_and_evaluation_summary = MERGE_STATS_EVAL.out.csv - ch_versions = ch_versions.mix(MERGE_STATS_EVAL.out.versions) - } - }else{ - stats_and_evaluation_summary = stats_and_evaluation - } + if( !params.skip_stats || 
!params.skip_eval ){ + stats_summary_csv = stats_summary.map{ meta, csv -> csv } + eval_summary_csv = evaluation_summary.map{ meta, csv -> csv } + stats_summary_csv.mix(eval_summary_csv) + .collect() + .map { + csvs -> + [ [ id:"summary_stats_eval" ], csvs ] + } + .set { stats_and_evaluation } + MERGE_STATS_EVAL(stats_and_evaluation) + stats_and_evaluation_summary = MERGE_STATS_EVAL.out.csv + ch_versions = ch_versions.mix(MERGE_STATS_EVAL.out.versions) + } // // MODULE: Shiny // - ch_shiny_stats = Channel.empty() if( !params.skip_shiny){ - PREPARE_SHINY ( stats_and_evaluation_summary, file(params.shiny_app) ) + PREPARE_SHINY ( stats_and_evaluation_summary, shiny_app ) ch_versions = ch_versions.mix(PREPARE_SHINY.out.versions) ch_shiny_stats = PREPARE_SHINY.out.data.toList() } @@ -238,7 +242,7 @@ workflow MULTIPLESEQUENCEALIGN { // MODULE: MultiQC // multiqc_out = Channel.empty() - if (!params.skip_multiqc){ + if (!params.skip_multiqc && (!params.skip_stats || !params.skip_eval)){ ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) : Channel.empty()
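
One consequence of the reworked merge block above: when only one of stats and evaluation is enabled, the collected list passed to `MERGE_STATS_EVAL` contains a single CSV, which is precisely the case the `empty.csv` patch to `csvtk/join` tolerates. A minimal reproduction of that channel shape (hypothetical file names, inside a workflow block):

```groovy
// e.g. --skip_eval: the eval channel stays empty
stats_summary_csv = Channel.of(file('stats.csv'))
eval_summary_csv  = Channel.empty()

stats_summary_csv.mix(eval_summary_csv)
    .collect()
    .map { csvs -> [ [id: 'summary_stats_eval'], csvs ] }
    .view()   // [[id:summary_stats_eval], [stats.csv]], a one-file join input
```
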