diff --git a/CHANGELOG.md b/CHANGELOG.md index 5babbeb9..790022d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ Initial release of nf-core/multiplesequencealign, created with the [nf-core](htt [#66](https://github.com/nf-core/multiplesequencealign/issues/66) - README: add new metromap and available tool list. [#54](https://github.com/nf-core/multiplesequencealign/issues/54) - Update modules versions from nf-core tools. [#80](https://github.com/nf-core/multiplesequencealign/pull/80) - Update modules versions from nf-core tools with nf-test. +[#32](https://github.com/nf-core/multiplesequencealign/issues/32) - Update Stats workflow with nf-core modules for merging. ### `Dependencies` diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 163dc6e4..98791456 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,2 @@ id,fasta,reference,structures,template seatoxin-ref,test-dataset/setoxin-ref.fa,test-dataset/setoxin.ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/structures/seatoxin-ref.tar.gz, -toxin-ref,test-dataset/toxin-ref.fa,test-dataset/toxin.ref, diff --git a/assets/toolsheet.csv b/assets/toolsheet.csv index deb788c3..7af6a027 100644 --- a/assets/toolsheet.csv +++ b/assets/toolsheet.csv @@ -1,8 +1,4 @@ tree,args_tree,aligner,args_aligner FAMSA,"-gt upgma -parttree",FAMSA,"" "","",MAFFT,"--anysymbol --quiet --dpparttree" -"","","KALIGN","" -CLUSTALO,"--force",3DCOFFEE,"-method TMalign_pair" -CLUSTALO,"--force",TCOFFEE,"-reg -reg_method famsa_msa -reg_nseq 100" -"","",FAMSA,"" -FAMSA,"",TCOFFEE,"-reg -reg_method famsa_msa -reg_nseq 100" + diff --git a/bin/calc_seqstats.py b/bin/calc_seqstats.py index 098dc70b..9b06a7de 100755 --- a/bin/calc_seqstats.py +++ b/bin/calc_seqstats.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 from Bio import SeqIO import pandas as pd -import time import sys fam_name = sys.argv[1] diff --git a/conf/modules.config b/conf/modules.config index 22bbd186..e153ba6b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,31 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] + withName: TCOFFEE_SEQREFORMAT_SIM{ + ext.args = "-output=sim_idscore" + publishDir = [ + path: { "${params.outdir}/stats/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: PARSE_SIM{ + publishDir = [ + path: { "${params.outdir}/stats/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "CALCULATE_SEQSTATS"{ + publishDir = [ + path: { "${params.outdir}/stats/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: "FAMSA_GUIDETREE"{ ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } @@ -38,6 +63,15 @@ process { ] } + withName: "CREATE_TCOFFEETEMPLATE"{ + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/alignment/templates" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: ".*ALIGN"{ ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.aligner}-args-${meta.argsaligner_clean}" } ext.args = { "${meta.args_aligner}" == "null" ? '' : "${meta.args_aligner}" } @@ -70,7 +104,9 @@ process { ] } - withName: "MERGE_STATS"{ + + withName: "CONCAT_SEQSTATS"{ + ext.prefix = { "summary_seqstats" } publishDir = [ path: { "${params.outdir}/stats/" }, mode: params.publish_dir_mode, @@ -78,9 +114,19 @@ process { ] } - withName: "TCOFFEE_SEQREFORMAT_SIM|CALCULATE_SEQSTATS"{ + withName: "CONCAT_SIMSTATS"{ + ext.prefix = { "summary_simstats" } publishDir = [ - path: { "${params.outdir}/stats/${task.process.tokenize(':')[-1].toLowerCase()}" }, + path: { "${params.outdir}/stats/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: "MERGE_STATS"{ + ext.prefix = { "complete_summary_stats" } + publishDir = [ + path: { "${params.outdir}/stats/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] diff --git a/conf/test.config b/conf/test.config index 359de16c..0ea9a401 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,7 @@ params { max_time = '6.h' // Stats - skip_stats = true + skip_stats = false skip_eval = true // Input data diff --git a/modules.json b/modules.json index 32ba2d12..00ce3c5e 100644 --- a/modules.json +++ b/modules.json @@ -15,6 +15,16 @@ "git_sha": "1f253ec05723293df7757af8769f8389b7a1884e", "installed_by": ["modules"] }, + "csvtk/concat": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "csvtk/join": { + "branch": "master", + "git_sha": "b2420a8bbd8af137380aa0a0c2e9a92456e5bb21", + "installed_by": ["modules"] + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", @@ -65,6 +75,11 @@ "git_sha": "c83c78835ca6d7a55b3f200718d887cbc7149d37", "installed_by": ["modules"] }, + "tcoffee/seqreformat": { + "branch": "master", + "git_sha": "f759fd45ecabb40c761df1338a4bb3851171a7f7", + "installed_by": ["modules"] + }, "untar": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", diff --git a/modules/local/calculate_seqstats.nf b/modules/local/calculate_seqstats.nf index a47ad69c..511d8432 100644 --- a/modules/local/calculate_seqstats.nf +++ b/modules/local/calculate_seqstats.nf @@ -3,7 +3,10 @@ process CALCULATE_SEQSTATS { tag "$meta.id" label 'process_low' - container 'luisas/structural_regression:20' + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.80 conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' : + 'biocontainers/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' }" input: tuple val(meta), path(fasta) diff --git a/modules/local/merge_stats.nf b/modules/local/merge_stats.nf deleted file mode 100644 index bc9c4b70..00000000 --- a/modules/local/merge_stats.nf +++ /dev/null @@ -1,30 +0,0 @@ - - -process MERGE_STATS { - label 'process_low' - - input: - path(tcoffee_seqreformat_simtot) - path(seqstats_summary) - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - merge_stats.py \ - "stats_summary_report.csv" \ - ${tcoffee_seqreformat_simtot} \ - ${seqstats_summary} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/parse_sim.nf b/modules/local/parse_sim.nf new file mode 100644 index 00000000..01d5c4c5 --- /dev/null +++ b/modules/local/parse_sim.nf @@ -0,0 +1,30 @@ +process PARSE_SIM { + tag "$meta.id" + label 'process_low' + + input: + tuple val(meta), path(infile) + + output: + tuple val (meta), path("${prefix}.sim_tot"), emit: sim_tot + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "$prefix" > tmp + grep ^TOT $infile | cut -f4 >> tmp + + echo "id,perc_sim" > ${prefix}.sim_tot + cat tmp | tr '\\n' ',' | awk 'gsub(/,\$/,x)' >> ${prefix}.sim_tot + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.sim_tot + """ +} diff --git a/modules/local/tcoffee_seqreformat_sim.nf b/modules/local/tcoffee_seqreformat_sim.nf deleted file mode 100644 index b1e6c382..00000000 --- a/modules/local/tcoffee_seqreformat_sim.nf +++ /dev/null @@ -1,42 +0,0 @@ - - -process TCOFFEE_SEQREFORMAT_SIM { - tag "$meta.id" - label 'process_low' - - conda "bioconda::t-coffee=13.45.0.4846264" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/t-coffee:13.45.0.4846264--hc57179f_5': - 'biocontainers/t-coffee:13.45.0.4846264--hc57179f_5' }" - input: - tuple val(meta), path(fasta) - - output: - tuple val(meta), path("*.sim"), emit: perc_sim - tuple val(meta), path("*.sim_tot"), emit: perc_sim_tot - path "versions.yml" , emit: versions - - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - t_coffee -other_pg seq_reformat -in ${fasta} -output=sim_idscore > "${prefix}.sim" - - echo "$prefix" > tmp - grep ^TOT ${prefix}.sim | cut -f4 >> tmp - - echo "id,perc_sim" > ${prefix}.sim_tot - cat tmp | tr '\\n' ',' | awk 'gsub(/,\$/,x)' >> ${prefix}.sim_tot - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - t_coffee: \$( t_coffee -version | sed 's/.*(Version_\\(.*\\)).*/\\1/' ) - END_VERSIONS - """ -} - - diff --git a/modules/nf-core/csvtk/concat/main.nf b/modules/nf-core/csvtk/concat/main.nf new file mode 100644 index 00000000..ad209ce7 --- /dev/null +++ b/modules/nf-core/csvtk/concat/main.nf @@ -0,0 +1,43 @@ +process CSVTK_CONCAT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::csvtk=0.23.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : + 'biocontainers/csvtk:0.23.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv) + val in_format + val out_format + + output: + tuple val(meta), path("${prefix}.${out_extension}"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) + def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + csvtk \\ + concat \\ + $args \\ + --num-cpus $task.cpus \\ + --delimiter "${delimiter}" \\ + --out-delimiter "${out_delimiter}" \\ + --out-file ${prefix}.${out_extension} \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/concat/meta.yml b/modules/nf-core/csvtk/concat/meta.yml new file mode 100644 index 00000000..19e0e3b9 --- /dev/null +++ b/modules/nf-core/csvtk/concat/meta.yml @@ -0,0 +1,51 @@ +name: csvtk_concat +description: Concatenate two or more CSV (or TSV) tables into a single table +keywords: + - concatenate + - tsv + - csv +tools: + - csvtk: + description: A cross-platform, efficient, practical CSV/TSV toolkit + homepage: http://bioinf.shenwei.me/csvtk + documentation: http://bioinf.shenwei.me/csvtk + tool_dev_url: https://github.com/shenwei356/csvtk + + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV formatted files + pattern: "*.{csv,tsv}" + - in_format: + type: string + description: Input format (csv, tab, or a delimiting character) + pattern: "*" + - out_format: + type: string + description: Output format (csv, tab, or a delimiting character) + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "version.yml" + - csv: + type: file + description: Concatenated CSV/TSV file + pattern: "*.{csv,tsv}" + +authors: + - "@rpetit3" diff --git a/modules/nf-core/csvtk/join/main.nf b/modules/nf-core/csvtk/join/main.nf new file mode 100644 index 00000000..08833d78 --- /dev/null +++ b/modules/nf-core/csvtk/join/main.nf @@ -0,0 +1,50 @@ +process CSVTK_JOIN { + tag "$meta.id" + label 'process_single' + + conda "bioconda::csvtk=0.26.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.26.0--h9ee0642_0': + 'biocontainers/csvtk:0.26.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv) + + output: + tuple val(meta), path("${prefix}.${out_extension}"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? "tsv" : "csv" + """ + csvtk \\ + join \\ + $args \\ + --num-cpus $task.cpus \\ + --out-file ${prefix}.${out_extension} \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? "tsv" : "csv" + """ + touch ${prefix}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/join/meta.yml b/modules/nf-core/csvtk/join/meta.yml new file mode 100644 index 00000000..e817a3b5 --- /dev/null +++ b/modules/nf-core/csvtk/join/meta.yml @@ -0,0 +1,43 @@ +name: csvtk_join +description: Join two or more CSV (or TSV) tables by selected fields into a single table +keywords: + - join + - tsv + - csv +tools: + - csvtk: + description: A cross-platform, efficient, practical CSV/TSV toolkit + homepage: http://bioinf.shenwei.me/csvtk + documentation: http://bioinf.shenwei.me/csvtk + tool_dev_url: https://github.com/shenwei356/csvtk + + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV formatted files + pattern: "*.{csv,tsv}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "version.yml" + - csv: + type: file + description: Joined CSV/TSV file + pattern: "*.{csv,tsv}" + +authors: + - "@anoronh4" diff --git a/modules/nf-core/tcoffee/seqreformat/main.nf b/modules/nf-core/tcoffee/seqreformat/main.nf new file mode 100644 index 00000000..a48d6bb1 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/main.nf @@ -0,0 +1,50 @@ +process TCOFFEE_SEQREFORMAT { + tag "$meta.id" + label 'process_low' + + conda "bioconda::t-coffee=13.46.0.919e8c6b" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/t-coffee:13.46.0.919e8c6b--hfc96bf3_0': + 'biocontainers/t-coffee:13.46.0.919e8c6b--hfc96bf3_0' }" + + input: + tuple val(meta), path(infile) + + output: + tuple val(meta), path("${prefix}.txt"), emit: formatted_file + path "versions.yml" , emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + export TEMP='./' + t_coffee -other_pg seq_reformat \ + -in ${infile} \ + $args \ + > "${prefix}.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tcoffee: \$( t_coffee -version | awk '{gsub("Version_", ""); print \$3}') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch "${prefix}_${seq_reformat_type}.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tcoffee: \$( t_coffee -version | awk '{gsub("Version_", ""); print \$3}') + END_VERSIONS + """ +} + + diff --git a/modules/nf-core/tcoffee/seqreformat/main.nf.test b/modules/nf-core/tcoffee/seqreformat/main.nf.test new file mode 100644 index 00000000..3befc69a --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process TCOFFEE_SEQREFORMAT" + script "modules/nf-core/tcoffee/seqreformat/main.nf" + process "TCOFFEE_SEQREFORMAT" + tag "modules" + tag "modules_nfcore" + tag "tcoffee" + tag "tcoffee_seqreformat" + + + test("TCOFFEE_SEQREFORMAT seqidscore") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file("https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa", checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/modules/nf-core/tcoffee/seqreformat/main.nf.test.snap b/modules/nf-core/tcoffee/seqreformat/main.nf.test.snap new file mode 100644 index 00000000..5a405d49 --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "TCOFFEE_SEQREFORMAT seqidscore": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.txt:md5,fcd4691daf120c88ec5def7ac06fb562" + ] + ], + "1": [ + "versions.yml:md5,68fb841e6e44274d430a1382bb0bbd14" + ], + "formatted_file": [ + [ + { + "id": "test" + }, + "test.txt:md5,fcd4691daf120c88ec5def7ac06fb562" + ] + ], + "versions": [ + "versions.yml:md5,68fb841e6e44274d430a1382bb0bbd14" + ] + } + ], + "timestamp": "2023-10-23T10:57:25.909083" + } +} \ No newline at end of file diff --git a/modules/nf-core/tcoffee/seqreformat/meta.yml b/modules/nf-core/tcoffee/seqreformat/meta.yml new file mode 100644 index 00000000..7d92631e --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/meta.yml @@ -0,0 +1,45 @@ +name: "tcoffee_seqreformat" +description: Reformats files with t-coffee +keywords: + - reformatting + - alignment + - genomics +tools: + - "tcoffee": + description: "A collection of tools for Computing, Evaluating and Manipulating Multiple Alignments of DNA, RNA, Protein Sequences and Structures." + homepage: "http://www.tcoffee.org/Projects/tcoffee/" + documentation: "https://tcoffee.readthedocs.io/en/latest/tcoffee_main_documentation.html" + tool_dev_url: "https://github.com/cbcrg/tcoffee" + doi: "10.1006/jmbi.2000.4042" + licence: "['GPL v3']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + + - fasta: + type: file + description: Input file to be reformatted + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + + - formatted_file: + type: file + description: Formatted file + pattern: "*" + +authors: + - "@luisas" diff --git a/modules/nf-core/tcoffee/seqreformat/nextflow.config b/modules/nf-core/tcoffee/seqreformat/nextflow.config new file mode 100644 index 00000000..e185a5cf --- /dev/null +++ b/modules/nf-core/tcoffee/seqreformat/nextflow.config @@ -0,0 +1,9 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: TIDDIT_COV_WIG { + ext.args = '-w' + } + +} diff --git a/subworkflows/local/stats.nf b/subworkflows/local/stats.nf index 88076044..3eb12a1e 100644 --- a/subworkflows/local/stats.nf +++ b/subworkflows/local/stats.nf @@ -1,10 +1,12 @@ // // Compute stats about the input sequences // - -include { TCOFFEE_SEQREFORMAT_SIM } from '../../modules/local/tcoffee_seqreformat_sim.nf' -include { CALCULATE_SEQSTATS } from '../../modules/local/calculate_seqstats.nf' -include { MERGE_STATS } from '../../modules/local/merge_stats.nf' +include { CALCULATE_SEQSTATS } from '../../modules/local/calculate_seqstats.nf' +include { PARSE_SIM } from '../../modules/local/parse_sim.nf' +include { TCOFFEE_SEQREFORMAT as TCOFFEE_SEQREFORMAT_SIM } from '../../modules/nf-core/tcoffee/seqreformat/main.nf' +include { CSVTK_CONCAT as CONCAT_SEQSTATS } from '../../modules/nf-core/csvtk/concat/main.nf' +include { CSVTK_CONCAT as CONCAT_SIMSTATS } from '../../modules/nf-core/csvtk/concat/main.nf' +include { CSVTK_JOIN as MERGE_STATS } from '../../modules/nf-core/csvtk/join/main.nf' workflow STATS { @@ -14,44 +16,57 @@ workflow STATS { main: + ch_seqs.view() ch_versions = Channel.empty() + + // // ------------------------------------------- + // // SEQUENCE SIMILARITY + // // ------------------------------------------- TCOFFEE_SEQREFORMAT_SIM(ch_seqs) - tcoffee_seqreformat_sim = TCOFFEE_SEQREFORMAT_SIM.out.perc_sim - tcoffee_seqreformat_simtot = TCOFFEE_SEQREFORMAT_SIM.out.perc_sim_tot - ch_versions = ch_versions.mix(TCOFFEE_SEQREFORMAT_SIM.out.versions.first()) + tcoffee_seqreformat_sim = TCOFFEE_SEQREFORMAT_SIM.out.formatted_file + ch_versions = ch_versions.mix(TCOFFEE_SEQREFORMAT_SIM.out.versions.first()) + tcoffee_seqreformat_simtot = PARSE_SIM(tcoffee_seqreformat_sim) + + ch_sim_summary = tcoffee_seqreformat_simtot.map{ + meta, csv -> csv + }.collect().map{ + csv -> [ [id_simstats:"summary_simstats"], csv] + } + CONCAT_SIMSTATS(ch_sim_summary, "csv", "csv") + // ------------------------------------------- + // SEQUENCE GENERAL STATS + // Sequence length, # of sequences, etc + // ------------------------------------------- CALCULATE_SEQSTATS(ch_seqs) seqstats = CALCULATE_SEQSTATS.out.seqstats seqstats_summary = CALCULATE_SEQSTATS.out.seqstats_summary ch_versions = ch_versions.mix(CALCULATE_SEQSTATS.out.versions.first()) + ch_seqstats_summary = seqstats_summary.map{ + meta, csv -> csv + }.collect().map{ + csv -> [ [id_seqstats:"summary_seqstats"], csv] + } + + CONCAT_SEQSTATS(ch_seqstats_summary, "csv", "csv") - // - // Summarize stats into one summary file - // - tcoffee_seqreformat_simtot.map{ it -> "${it[1].text}" }.collectFile( name: 'tcoffee_seqreformat_simtot_summary.csv', - keepHeader : true, - skip:1, - newLine: false) - .set { tcoffee_seqreformat_simtot_summary } + // ------------------------------------------- + // MERGE ALL STATS + // ------------------------------------------- - seqstats_summary.map{ it -> "${it[1].text}" }.collectFile( name: 'seqstats.csv', - keepHeader : true, - skip:1, - newLine: false) - .set { seqstats_summary } + csv_sim = CONCAT_SIMSTATS.out.csv.map{ meta, csv -> csv } + csv_seqstats = CONCAT_SEQSTATS.out.csv.map{ meta, csv -> csv } - MERGE_STATS( tcoffee_seqreformat_simtot_summary, - seqstats_summary ) - + csvs_stats = csv_sim.mix(csv_seqstats).collect().map{ csvs -> [[id:"summary_stats"], csvs] } + csvs_stats.view() + MERGE_STATS(csvs_stats) + stats_summary = MERGE_STATS.out.csv ch_versions = ch_versions.mix(MERGE_STATS.out.versions) emit: - tcoffee_seqreformat_sim - tcoffee_seqreformat_simtot - seqstats - seqstats_summary + stats_summary versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] } \ No newline at end of file