diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d2fa6e12..4d71c8f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -130,36 +130,3 @@ jobs: - name: Run pipeline with ${{ matrix.test_name }} test profile run: | nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},docker --outdir ./results - - checkm: - name: Run single test to checkm due to database download - # Only run on push if this is the nf-core dev branch (merged PRs) - if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/mag') }} - runs-on: ubuntu-latest - - steps: - - name: Free some space - run: | - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: Download and prepare CheckM database - run: | - mkdir -p databases/checkm - wget https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz -P databases/checkm - tar xzvf databases/checkm/checkm_data_2015_01_16.tar.gz -C databases/checkm/ - - - name: Run pipeline with ${{ matrix.profile }} test profile - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --binqc_tool checkm --checkm_db databases/checkm diff --git a/CHANGELOG.md b/CHANGELOG.md index fdae9536..5e5e1c72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,12 +9,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#692](https://github.com/nf-core/mag/pull/692) - Added Nanoq as optional longread filtering tool (added by @muabnezor) - [#692](https://github.com/nf-core/mag/pull/692) - Added chopper as optional longread filtering tool and/or phage lambda 
removal tool (added by @muabnezor) +- [#707](https://github.com/nf-core/mag/pull/707) - Make Bin QC a subworkflow (added by @dialvarezs) +- [#707](https://github.com/nf-core/mag/pull/707) - Added CheckM2 as an alternative bin completeness and QC tool (added by @dialvarezs) - [#708](https://github.com/nf-core/mag/pull/708) - Added `--exclude_unbins_from_postbinning` parameter to exclude unbinned contigs from post-binning processes, speeding up Prokka in some cases (added by @dialvarezs) ### `Changed` ### `Fixed` +- [#707](https://github.com/nf-core/mag/pull/707) - Fixed channel passed as GUNC input (added by @dialvarezs) - [#724](https://github.com/nf-core/mag/pull/724) - Fix quoting in `utils_nfcore_mag_pipeline/main.nf` (added by @dialvarezs) - [#716](https://github.com/nf-core/mag/pull/692) - Make short read processing a subworkflow (added by @muabnezor) - [#708](https://github.com/nf-core/mag/pull/708) - Fixed channel passed as GUNC input (added by @dialvarezs) @@ -23,7 +26,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | Tool | Previous version | New version | | ------- | ---------------- | ----------- | +| CheckM | 1.2.1 | 1.2.3 | +| CheckM2 | | 1.0.2 | | chopper | | 0.9.0 | +| GUNC | 1.0.5 | 1.0.6 | | nanoq | | 0.10.0 | ### `Deprecated` diff --git a/CITATIONS.md b/CITATIONS.md index 74138f6c..2feb3693 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -40,6 +40,10 @@ > Parks, D. H., Imelfort, M., Skennerton, C. T., Hugenholtz, P., & Tyson, G. W. (2015). CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes. Genome Research, 25(7), 1043–1055. doi: 10.1101/gr.186072.114 +- [CheckM2](https://doi.org/10.1038/s41592-023-01940-w) + + > Chklovski, A., Parks, D. H., Woodcroft, B. J., & Tyson, G. W. (2023). CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nature Methods, 20(8), 1203-1212. 
doi: 10.1038/s41592-023-01940-w + - [Chopper](https://doi.org/10.1093/bioinformatics/bty149) > De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics. 2018 Aug 1;34(15):2666-2669. doi: 10.1093/bioinformatics/bty149 diff --git a/README.md b/README.md index 15ddc4ad..a5ae232f 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ The pipeline then: - performs assembly using [MEGAHIT](https://github.com/voutcn/megahit) and [SPAdes](http://cab.spbu.ru/software/spades/), and checks their quality using [Quast](http://quast.sourceforge.net/quast) - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) - predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://www.google.com/search?channel=fs&client=ubuntu-sn&q=MetaEuk) -- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). 
+- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), [CheckM](https://ecogenomics.github.io/CheckM/), or [CheckM2](https://github.com/chklovski/CheckM2) and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). - Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara) diff --git a/bin/combine_tables.py b/bin/combine_tables.py index a2dcf986..2b8d3767 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -3,10 +3,9 @@ ## Originally written by Daniel Straub and Sabrina Krakau and released under the MIT license. ## See git repository (https://github.com/nf-core/mag) for full license text. - -import sys import argparse -import os.path +import sys + import pandas as pd @@ -19,19 +18,14 @@ def parse_args(args=None): metavar="FILE", help="Bin depths summary file.", ) + parser.add_argument("-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file.") + parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") + parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") + parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") parser.add_argument( - "-b", "--busco_summary", metavar="FILE", help="BUSCO summary file." 
- ) - parser.add_argument( - "-c", "--checkm_summary", metavar="FILE", help="CheckM summary file." - ) - parser.add_argument( - "-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file." - ) - parser.add_argument( - "-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file." + "-t", "--binqc_tool", help="Bin QC tool used", choices=["busco", "checkm", "checkm2"] ) - parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") + parser.add_argument( "-o", "--out", @@ -81,9 +75,7 @@ def parse_cat_table(cat_table): ) # merge all rank columns into a single column df["CAT_rank"] = ( - df.filter(regex="rank_\d+") - .apply(lambda x: ";".join(x.dropna()), axis=1) - .str.lstrip() + df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip() ) # remove rank_* columns df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True) @@ -95,39 +87,36 @@ def main(args=None): args = parse_args(args) if ( - not args.busco_summary - and not args.checkm_summary + not args.binqc_summary and not args.quast_summary and not args.gtdbtk_summary ): sys.exit( - "No summary specified! Please specify at least BUSCO, CheckM or QUAST summary." + "No summary specified! " + "Please specify at least BUSCO, CheckM, CheckM2 or QUAST summary." ) - # GTDB-Tk can only be run in combination with BUSCO or CheckM - if args.gtdbtk_summary and not (args.busco_summary or args.checkm_summary): + # GTDB-Tk can only be run in combination with BUSCO, CheckM or CheckM2 + if args.gtdbtk_summary and not args.binqc_summary: sys.exit( - "Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM summary!" + "Invalid parameter combination: " + "GTDB-TK summary specified, but no BUSCO, CheckM or CheckM2 summary!" 
) # handle bin depths results = pd.read_csv(args.depths_summary, sep="\t") - results.columns = [ - "Depth " + str(col) if col != "bin" else col for col in results.columns - ] + results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns] bins = results["bin"].sort_values().reset_index(drop=True) - if args.busco_summary: - busco_results = pd.read_csv(args.busco_summary, sep="\t") - if not bins.equals( - busco_results["GenomeBin"].sort_values().reset_index(drop=True) - ): + if args.binqc_summary and args.binqc_tool == "busco": + busco_results = pd.read_csv(args.binqc_summary, sep="\t") + if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)): sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!") results = pd.merge( results, busco_results, left_on="bin", right_on="GenomeBin", how="outer" ) # assuming depths for all bins are given - if args.checkm_summary: + if args.binqc_summary and args.binqc_tool == "checkm": use_columns = [ "Bin Id", "Marker lineage", @@ -147,22 +136,37 @@ def main(args=None): "4", "5+", ] - checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t") + checkm_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" - if not bins.equals( - checkm_results["Bin Id"].sort_values().reset_index(drop=True) - ): + if not set(checkm_results["Bin Id"]).issubset(set(bins)): sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") results = pd.merge( results, checkm_results, left_on="bin", right_on="Bin Id", how="outer" ) # assuming depths for all bins are given results["Bin Id"] = results["Bin Id"].str.removesuffix(".fa") + if args.binqc_summary and args.binqc_tool == "checkm2": + use_columns = [ + "Name", + "Completeness", + "Contamination", + "Completeness_Model_Used", + "Coding_Density", + "Translation_Table_Used", + "Total_Coding_Sequences", + ] + 
checkm2_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") + checkm2_results["Name"] = checkm2_results["Name"] + ".fa" + if not set(checkm2_results["Name"]).issubset(set(bins)): + sys.exit("Bins in CheckM2 summary do not match bins in bin depths summary!") + results = pd.merge( + results, checkm2_results, left_on="bin", right_on="Name", how="outer" + ) # assuming depths for all bins are given + results["Name"] = results["Name"].str.removesuffix(".fa") + if args.quast_summary: quast_results = pd.read_csv(args.quast_summary, sep="\t") - if not bins.equals( - quast_results["Assembly"].sort_values().reset_index(drop=True) - ): + if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)): sys.exit("Bins in QUAST summary do not match bins in bin depths summary!") results = pd.merge( results, quast_results, left_on="bin", right_on="Assembly", how="outer" diff --git a/conf/base.config b/conf/base.config index 21a8ac3e..9be49eaa 100644 --- a/conf/base.config +++ b/conf/base.config @@ -160,12 +160,14 @@ process { cpus = { 8 * task.attempt } memory = { 20.GB * task.attempt } } - withName: MAXBIN2 { errorStrategy = { task.exitStatus in [1, 255] ? 'ignore' : 'retry' } } - withName: DASTOOL_DASTOOL { errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : task.exitStatus == 1 ? 'ignore' : 'finish' } } + //CheckM2 returns exit code 1 when Diamond doesn't find any hits + withName: CHECKM2_PREDICT { + errorStrategy = { task.exitStatus in (130..145) ? 'retry' : task.exitStatus == 1 ? 
'ignore' : 'finish' } + } } diff --git a/conf/modules.config b/conf/modules.config index f5af5335..bcad2756 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -405,7 +405,11 @@ process { withName: CHECKM_LINEAGEWF { tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" } - publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } withName: CHECKM_QA { @@ -418,9 +422,31 @@ process { ] } - withName: COMBINE_CHECKM_TSV { - ext.prefix = { "checkm_summary" } - publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + withName: COMBINE_BINQC_TSV { + ext.prefix = { "${params.binqc_tool}_summary" } + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CHECKM2_DATABASEDOWNLOAD { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM2/checkm2_downloads" }, + mode: params.publish_dir_mode, overwrite: false, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_checkm2_data + ] + } + + withName: CHECKM2_PREDICT { + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] } withName: GUNC_DOWNLOADDB { diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index 63f04cf6..fc433a8b 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -31,7 +31,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + binqc_tool = 'checkm' skip_gtdbtk = true gtdbtk_min_completeness = 0.01 clip_tool = 'adapterremoval' diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config index 223f99a3..a434f584 100644 --- a/conf/test_bbnorm.config +++ b/conf/test_bbnorm.config @@ -36,8 +36,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" - busco_clean = true + binqc_tool = 'checkm2' skip_gtdbtk = true gtdbtk_min_completeness = 0.01 bbnorm = true diff --git a/docs/output.md b/docs/output.md index 987be311..c50100e8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -554,7 +554,7 @@ Besides the reference files or output files created by BUSCO, the following summ #### CheckM -[CheckM](https://ecogenomics.github.io/CheckM/) CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage +[CheckM](https://ecogenomics.github.io/CheckM/) provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. 
It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage By default, nf-core/mag runs CheckM with the `check_lineage` workflow that places genome bins on a reference tree to define lineage-marker sets, to check for completeness and contamination based on lineage-specific marker genes. and then subsequently runs `qa` to generate the summary files. @@ -564,7 +564,8 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place - `GenomeBinning/QC/CheckM/` - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results. - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`). - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc. +- `GenomeBinning/QC/` - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`). @@ -580,6 +581,31 @@ If the parameter `--save_checkm_reference` is set, additionally the used the Che +#### CheckM2 + +[CheckM2](https://github.com/chklovski/CheckM2) is a tool for assessing the quality of metagenome-derived genomes. It uses a machine learning approach to predict the completeness and contamination of a genome regardless of its taxonomic lineage. + +
+Output files + +- `GenomeBinning/QC/CheckM2/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/quality_report.tsv`: Detailed statistics about bins informing completeness and contamination scores. This should normally be your main file to use to evaluate your results. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM2 results, including CheckM2 generated annotations, log, and DIAMOND alignment results. +- `GenomeBinning/QC/` + - `checkm2_summary.tsv`: A summary table of the CheckM2 results for all bins. + +
+ +If the parameter `--save_checkm2_data` is set, the CheckM2 reference datasets will be stored in the output directory. + +
+Output files + +- `GenomeBinning/QC/CheckM2/` + - `checkm2_downloads/CheckM2_database/*.dmnd`: Diamond database used by CheckM2. + +
+ #### GUNC [Genome UNClutterer (GUNC)](https://grp-bork.embl-community.io/gunc/index.html) is a tool for detection of chimerism and contamination in prokaryotic genomes resulting from mis-binning of genomic contigs from unrelated lineages. It does so by applying an entropy based score on taxonomic assignment and contig location of all genes in a genome. It is generally considered as a additional complement to CheckM results. diff --git a/modules.json b/modules.json index d56420d0..25021c0d 100644 --- a/modules.json +++ b/modules.json @@ -54,12 +54,22 @@ }, "checkm/lineagewf": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3ea318161b8788623cec477bde0f089180b2245b", "installed_by": ["modules"] }, "checkm/qa": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "867961a8ef91135475ca48c83743646038be4196", + "installed_by": ["modules"] + }, + "checkm2/databasedownload": { + "branch": "master", + "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", + "installed_by": ["modules"] + }, + "checkm2/predict": { + "branch": "master", + "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", "installed_by": ["modules"] }, "chopper": { @@ -144,12 +154,12 @@ }, "gunc/mergecheckm": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "b6515a01897b11b64b3368858c0359b4c813ad1e", "installed_by": ["modules"] }, "gunc/run": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "b6515a01897b11b64b3368858c0359b4c813ad1e", "installed_by": ["modules"] }, "gunzip": { diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf index b387174c..07784d83 100644 --- a/modules/local/bin_summary.nf +++ b/modules/local/bin_summary.nf @@ -1,36 +1,36 @@ process BIN_SUMMARY { conda "conda-forge::pandas=1.4.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : - 'biocontainers/pandas:1.4.3' }" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' + : 'biocontainers/pandas:1.4.3'}" input: - path(bin_depths) - path(busco_sum) - path(checkm_sum) - path(quast_sum) - path(gtdbtk_sum) - path(cat_sum) + path bin_depths + path binqc_sum + path quast_sum + path gtdbtk_sum + path cat_sum + val binqc_tool output: - path("bin_summary.tsv"), emit: summary - path "versions.yml" , emit: versions + path "bin_summary.tsv", emit: summary + path "versions.yml" , emit: versions script: - def busco_summary = busco_sum.sort().size() > 0 ? "--busco_summary ${busco_sum}" : "" - def checkm_summary = checkm_sum.sort().size() > 0 ? "--checkm_summary ${checkm_sum}" : "" - def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" + def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : "" + def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : "" - def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : "" + def cat_summary = cat_sum.sort().size() > 0 ? 
"--cat_summary ${cat_sum}" : "" """ - combine_tables.py --depths_summary ${bin_depths} \ - $busco_summary \ - $checkm_summary \ - $quast_summary \ - $gtdbtk_summary \ - $cat_summary \ - --out bin_summary.tsv + combine_tables.py \ + --depths_summary ${bin_depths} \ + ${binqc_summary} \ + ${quast_summary} \ + ${gtdbtk_summary} \ + ${cat_summary} \ + --binqc_tool ${binqc_tool} \ + --out bin_summary.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/busco_save_download.nf b/modules/local/busco_save_download.nf index 74bcffbf..099c4150 100644 --- a/modules/local/busco_save_download.nf +++ b/modules/local/busco_save_download.nf @@ -2,18 +2,23 @@ process BUSCO_SAVE_DOWNLOAD { // execute sequentially to avoid artefacts when saving files for multiple busco instances maxForks 1 - conda "conda-forge::sed=4.7" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'nf-core/ubuntu:20.04' }" + conda "conda-forge::bash=5.2.21" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' + : 'nf-core/ubuntu:20.04' }" input: path(busco_downloads) output: - path('busco_downloads/**', includeInputs: true) + path 'busco_downloads/**', includeInputs: true, emit: busco_files + path 'versions.yml' , emit: versions script: """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(echo \$BASH_VERSION) + END_VERSIONS """ } diff --git a/modules/local/combine_tsv.nf b/modules/local/combine_tsv.nf index 5e62be27..1fe7ec1a 100644 --- a/modules/local/combine_tsv.nf +++ b/modules/local/combine_tsv.nf @@ -7,7 +7,7 @@ process COMBINE_TSV { 'biocontainers/bioawk:1.0--hed695b0_5' }" input: - path(bin_summaries) + path(bin_summaries, stageAs: "bin_summaries/*.tsv") output: path("*.tsv") , emit: combined diff --git a/modules/nf-core/checkm/lineagewf/environment.yml b/modules/nf-core/checkm/lineagewf/environment.yml new file mode 100644 index 00000000..1b870502 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm-genome=1.2.3 diff --git a/modules/nf-core/checkm/lineagewf/main.nf b/modules/nf-core/checkm/lineagewf/main.nf index d8674ddc..67fd8f35 100644 --- a/modules/nf-core/checkm/lineagewf/main.nf +++ b/modules/nf-core/checkm/lineagewf/main.nf @@ -2,10 +2,10 @@ process CHECKM_LINEAGEWF { tag "$meta.id" label 'process_medium' - conda "bioconda::checkm-genome=1.2.1" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.3--pyhdfd78af_1' : + 'biocontainers/checkm-genome:1.2.3--pyhdfd78af_1' }" input: tuple val(meta), path(fasta, stageAs: "input_bins/*") @@ -22,22 +22,34 @@ process CHECKM_LINEAGEWF { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - checkm_db = db ? "export CHECKM_DATA_PATH=${db}" : "" + def args = task.ext.args ?: '' + def checkm_db = db ? "export CHECKM_DATA_PATH=${db}" : "" + prefix = task.ext.prefix ?: "${meta.id}" """ - $checkm_db + ${checkm_db} checkm \\ lineage_wf \\ - -t $task.cpus \\ + -t ${task.cpus} \\ -f ${prefix}.tsv \\ --tab_table \\ - --pplacer_threads $task.cpus \\ - -x $fasta_ext \\ - $args \\ + --pplacer_threads ${task.cpus} \\ + -x ${fasta_ext} \\ + ${args} \\ input_bins/ \\ - $prefix + ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir ${prefix}/ + touch ${prefix}/lineage.ms ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/checkm/lineagewf/meta.yml b/modules/nf-core/checkm/lineagewf/meta.yml index 4716a3e9..e32441d2 100644 --- a/modules/nf-core/checkm/lineagewf/meta.yml +++ b/modules/nf-core/checkm/lineagewf/meta.yml @@ -1,5 +1,6 @@ name: checkm_lineagewf -description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. +description: CheckM provides a set of tools for assessing the quality of genomes recovered + from isolates, single cells, or metagenomes. 
keywords: - checkm - mag @@ -14,52 +15,69 @@ keywords: - genome bins tools: - checkm: - description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. + description: Assess the quality of microbial genomes recovered from isolates, + single cells, and metagenomes. homepage: https://ecogenomics.github.io/CheckM/ documentation: https://github.com/Ecogenomics/CheckM/wiki tool_dev_url: https://github.com/Ecogenomics/CheckM doi: "10.1101/gr.186072.114" licence: ["GPL v3"] - + identifier: biotools:checkm input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fasta: - type: file - description: One or a list of multiple FASTA files of each bin, with extension defined with the fasta_ext value - pattern: "*.{$fasta_ext}" - - fasta_ext: - type: value - description: The file-type extension suffix of the input FASTA files (e.g., fasta, fna, fa, fas) - - db: - type: directory - description: Optional directory pointing to checkM database to prevent re-downloading - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: One or a list of multiple FASTA files of each bin, with extension + defined with the fasta_ext value + pattern: "*.{$fasta_ext}" + - - fasta_ext: + type: string + description: The file-type extension suffix of the input FASTA files (e.g., + fasta, fna, fa, fas) + - - db: + type: directory + description: Optional directory pointing to checkM database to prevent re-downloading output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'sample', bin:'1' ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - checkm_output: - type: directory - description: CheckM output directory - pattern: "*/" - checkm_output: - type: file - description: Lineage markfer file - pattern: "lineage.ms" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample', bin:'1' ] + - ${prefix}: + type: directory + description: CheckM output directory + pattern: "*/" + - marker_file: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample', bin:'1' ] + - ${prefix}/lineage.ms: + type: file + description: Lineage file + pattern: "*.ms" - checkm_tsv: - type: file - description: CheckM summary completeness statistics table - pattern: "*.tsv" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample', bin:'1' ] + - ${prefix}.tsv: + type: file + description: CheckM summary completeness statistics table + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/checkm/lineagewf/tests/main.nf.test b/modules/nf-core/checkm/lineagewf/tests/main.nf.test new file mode 100644 index 00000000..8d60100e --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + name "Test Process CHECKM_LINEAGEWF" + script "../main.nf" + process "CHECKM_LINEAGEWF" + + tag "modules" + tag "modules_nfcore" + tag "checkm" + tag "checkm/lineagewf" + + test("checkm - lineage_wf") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + 
then { + assert process.success + assert file(process.out.checkm_output[0][1]).list().find { file(it).name == "checkm.log" } + assert snapshot( + path(process.out.marker_file[0][1]).readLines().any{it.contains("PF00312.17")}, + process.out.checkm_tsv, + process.out.versions + ).match() + } + + } + + test("stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } +} diff --git a/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap b/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap new file mode 100644 index 00000000..6d6d7f75 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap @@ -0,0 +1,99 @@ +{ + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ], + "checkm_output": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "checkm_tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "marker_file": [ + [ + { + "id": "test", + "single_end": false + }, + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + 
"timestamp": "2024-11-05T04:36:45.930077242" + }, + "checkm - lineage_wf": { + "content": [ + true, + [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d5559764f563c4b55223e4e4a3dc1ec9" + ] + ], + [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ] + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:27:36.491322471" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm/lineagewf/tests/tags.yml b/modules/nf-core/checkm/lineagewf/tests/tags.yml new file mode 100644 index 00000000..04438be8 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/tags.yml @@ -0,0 +1,2 @@ +checkm/lineagewf: + - modules/nf-core/checkm/lineagewf/** diff --git a/modules/nf-core/checkm/qa/environment.yml b/modules/nf-core/checkm/qa/environment.yml new file mode 100644 index 00000000..1b870502 --- /dev/null +++ b/modules/nf-core/checkm/qa/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm-genome=1.2.3 diff --git a/modules/nf-core/checkm/qa/main.nf b/modules/nf-core/checkm/qa/main.nf index b0c0e69a..042f8b04 100644 --- a/modules/nf-core/checkm/qa/main.nf +++ b/modules/nf-core/checkm/qa/main.nf @@ -1,11 +1,11 @@ process CHECKM_QA { - tag "$meta.id" + tag "${meta.id}" label 'process_low' - conda "bioconda::checkm-genome=1.2.1" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.3--pyhdfd78af_1' : + 'biocontainers/checkm-genome:1.2.3--pyhdfd78af_1' }" input: tuple val(meta), path(analysis_dir), path(marker_file), path(coverage_file) @@ -23,18 +23,29 @@ process CHECKM_QA { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" suffix = task.ext.args?.matches(".*-o 9.*|.*--out_file 9.*") ? "fasta" : "txt" - def coverage = coverage_file ? "--coverage_file ${coverage_file}" : "" - def exclude = exclude_marker_file ? "--exclude_markers ${marker_filer}" : "" + def coverage = coverage_file && coverage_file.isFile() ? "--coverage_file ${coverage_file}" : "" + def exclude = exclude_marker_file && exclude_marker_file.isFile() ? "--exclude_markers ${exclude_marker_file}" : "" """ checkm \\ qa \\ --threads ${task.cpus} \\ --file ${prefix}.${suffix} \\ - $marker_file \\ - $analysis_dir \\ - $coverage \\ - $exclude \\ - $args + ${marker_file} \\ + ${analysis_dir} \\ + ${coverage} \\ + ${exclude} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt ${prefix}.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/checkm/qa/meta.yml b/modules/nf-core/checkm/qa/meta.yml index d0af39af..cd41eaec 100644 --- a/modules/nf-core/checkm/qa/meta.yml +++ b/modules/nf-core/checkm/qa/meta.yml @@ -1,5 +1,6 @@ name: checkm_qa -description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. +description: CheckM provides a set of tools for assessing the quality of genomes recovered + from isolates, single cells, or metagenomes. 
keywords: - checkm - mag @@ -16,52 +17,64 @@ keywords: - quality assurnce tools: - checkm: - description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. + description: Assess the quality of microbial genomes recovered from isolates, + single cells, and metagenomes. homepage: https://ecogenomics.github.io/CheckM/ documentation: https://github.com/Ecogenomics/CheckM/wiki tool_dev_url: https://github.com/Ecogenomics/CheckM doi: "10.1101/gr.186072.114" licence: ["GPL v3"] - + identifier: biotools:checkm input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - analysis_dir: - type: file - description: Directory containing output of checkm/analyze or checkm/lineage_wf etc. - pattern: "*" - - marker_file: - type: file - description: Marker file specified during checkm/analyze or produced by checkm/{lineage,taxonomy}_wf - pattern: "*.ms" - - coverage_file: - type: file - description: File containing coverage of each sequence (generated by checkm coverage) - - exclude_marker_file: - type: file - description: File specifying markers to exclude from marker sets - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - analysis_dir: + type: file + description: Directory containing output of checkm/analyze or checkm/lineage_wf + etc. + pattern: "*" + - marker_file: + type: file + description: Marker file specified during checkm/analyze or produced by checkm/{lineage,taxonomy}_wf + pattern: "*.ms" + - coverage_file: + type: file + description: File containing coverage of each sequence (generated by checkm + coverage) + - - exclude_marker_file: + type: file + description: File specifying markers to exclude from marker sets output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - output: - type: file - description: "Default completeness statistics in various formats, as specified with --out_format (excluding option: 9)" - pattern: "*.txt" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.txt: + type: file + description: "Default completeness statistics in various formats, as specified + with --out_format (excluding option: 9)" + pattern: "*.txt" - fasta: - type: file - description: Output in fasta format (only if --out_format 9) - pattern: "*.fasta" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.fasta: + type: file + description: Output in fasta format (only if --out_format 9) + pattern: "*.fasta" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/checkm/qa/tests/main.nf.test b/modules/nf-core/checkm/qa/tests/main.nf.test new file mode 100644 index 00000000..8037bbc2 --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + name "Test Process CHECKM_QA" + script "../main.nf" + process "CHECKM_QA" + + tag "modules" + tag "modules_nfcore" + tag "checkm" + tag "checkm/qa" + tag "checkm/lineagewf" + + test("checkm - qa") { + + setup { + run("CHECKM_LINEAGEWF") { + script "../../lineagewf/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + } + + when { + process { + """ + input[0] = 
CHECKM_LINEAGEWF.out.checkm_output.join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("stub") { + + options "-stub" + + setup { + run("CHECKM_LINEAGEWF") { + script "../../lineagewf/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + } + + when { + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output.join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/checkm/qa/tests/main.nf.test.snap b/modules/nf-core/checkm/qa/tests/main.nf.test.snap new file mode 100644 index 00000000..77eca77b --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/main.nf.test.snap @@ -0,0 +1,96 @@ +{ + "checkm - qa": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,645f4282569afb4b171396732b2d2582" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ], + "fasta": [ + + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,645f4282569afb4b171396732b2d2582" + ] + ], + "versions": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:44:09.849072843" + }, + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + 
"single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:14:12.680834625" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm/qa/tests/tags.yml b/modules/nf-core/checkm/qa/tests/tags.yml new file mode 100644 index 00000000..08b4747b --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/tags.yml @@ -0,0 +1,3 @@ +checkm/qa: + - modules/nf-core/checkm/lineagewf/** + - modules/nf-core/checkm/qa/** diff --git a/modules/nf-core/checkm2/databasedownload/environment.yml b/modules/nf-core/checkm2/databasedownload/environment.yml new file mode 100644 index 00000000..52d11ba9 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::aria2=1.36.0 diff --git a/modules/nf-core/checkm2/databasedownload/main.nf b/modules/nf-core/checkm2/databasedownload/main.nf new file mode 100644 index 00000000..6144067b --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/main.nf @@ -0,0 +1,55 @@ +import groovy.json.JsonSlurper + +process CHECKM2_DATABASEDOWNLOAD { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/aria2:1.36.0': + 'biocontainers/aria2:1.36.0' }" + + input: + val(db_zenodo_id) + + output: + tuple val(meta), path("checkm2_db_v${db_version}.dmnd"), emit: database + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + zenodo_id = db_zenodo_id ?: 5571251 // Default to latest version if no ID provided + api_data = (new JsonSlurper()).parseText(file("https://zenodo.org/api/records/${zenodo_id}").text) + db_version = api_data.metadata.version + checksum = api_data.files[0].checksum.replaceFirst(/^md5:/, "md5=") + meta = [id: 'checkm2_db', version: db_version] + """ + # Automatic download is broken when using singularity/apptainer (https://github.com/chklovski/CheckM2/issues/73) + # So it's necessary to download the database manually + aria2c \ + ${args} \ + --checksum ${checksum} \ + https://zenodo.org/records/${zenodo_id}/files/checkm2_database.tar.gz + + tar -xzf checkm2_database.tar.gz + db_path=\$(find -name *.dmnd) + mv \$db_path checkm2_db_v${db_version}.dmnd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + END_VERSIONS + """ + + stub: + """ + touch checkm_db.dmnd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm2/databasedownload/meta.yml b/modules/nf-core/checkm2/databasedownload/meta.yml new file mode 100644 index 00000000..632b4922 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/meta.yml @@ -0,0 +1,42 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkm2_databasedownload" +description: CheckM2 database download +keywords: + - checkm + - mag + - metagenome + - quality + - completeness + - contamination + - bins +tools: + - "checkm2": + 
description: "CheckM2 - Rapid assessment of genome bin quality using machine learning" + homepage: "https://github.com/chklovski/CheckM2" + doi: "10.1038/s41592-023-01940-w" + licence: ["GPL v3"] + identifier: "" + +input: + - - db_zenodo_id: + type: integer + description: Zenodo ID of the CheckM2 database to download + +output: + - database: + - meta: + type: map + description: | + Groovy Map containing database information + e.g. `[ id:'test', version:1 ]` + - checkm2_db_v${db_version}.dmnd: + type: file + description: CheckM2 database file + pattern: "checkm2_db_v*.dmnd" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@dialvarezs" diff --git a/modules/nf-core/checkm2/databasedownload/tests/main.nf.test b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test new file mode 100644 index 00000000..2a98f051 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test @@ -0,0 +1,30 @@ +nextflow_process { + + name "Test Process CHECKM2_DATABASEDOWNLOAD" + tag "modules_nfcore" + tag "modules" + tag "checkm2" + tag "checkm2/databasedownload" + script "modules/nf-core/checkm2/databasedownload/main.nf" + process "CHECKM2_DATABASEDOWNLOAD" + + test("Test CheckM2 Database Download") { + + when { + process { + """ + input[0] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + ) + } + + } + +} diff --git a/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap new file mode 100644 index 00000000..403d26fd --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "Test CheckM2 Database Download": { + "content": [ + [ + "versions.yml:md5,6201d5ac7aca6e32b98daf4f8656aa2a" + ] + ], + "timestamp": "2024-09-16T22:23:54.183040031" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/checkm2/databasedownload/tests/tags.yml b/modules/nf-core/checkm2/databasedownload/tests/tags.yml new file mode 100644 index 00000000..46266770 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/tags.yml @@ -0,0 +1,2 @@ +checkm2/databasedownload: + - modules/nf-core/checkm2/databasedownload/** diff --git a/modules/nf-core/checkm2/predict/environment.yml b/modules/nf-core/checkm2/predict/environment.yml new file mode 100644 index 00000000..18fd1f51 --- /dev/null +++ b/modules/nf-core/checkm2/predict/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm2=1.0.2 diff --git a/modules/nf-core/checkm2/predict/main.nf b/modules/nf-core/checkm2/predict/main.nf new file mode 100644 index 00000000..25271ba9 --- /dev/null +++ b/modules/nf-core/checkm2/predict/main.nf @@ -0,0 +1,52 @@ +process CHECKM2_PREDICT { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/checkm2:1.0.2--pyh7cba7a3_0': + 'biocontainers/checkm2:1.0.2--pyh7cba7a3_0' }" + + input: + tuple val(meta), path(fasta, stageAs: "input_bins/*") + tuple val(dbmeta), path(db) + + output: + tuple val(meta), path("${prefix}") , emit: checkm2_output + tuple val(meta), path("${prefix}/quality_report.tsv"), emit: checkm2_tsv + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + checkm2 \\ + predict \\ + --input ${fasta} \\ + --output-directory ${prefix} \\ + --threads ${task.cpus} \\ + --database_path ${db} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix}/diamond_output ${prefix}/protein_files + touch ${prefix}/quality_report.tsv ${prefix}/checkm2.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm2/predict/meta.yml b/modules/nf-core/checkm2/predict/meta.yml new file mode 100644 index 00000000..48cc9fbc --- /dev/null +++ b/modules/nf-core/checkm2/predict/meta.yml @@ -0,0 +1,65 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkm2_predict" +description: CheckM2 bin quality prediction +keywords: + - checkm + - mag + - metagenome + - quality + - completeness + - contamination + - bins +tools: + - "checkm2": + description: "CheckM2 - Rapid assessment of genome bin quality using machine learning" + homepage: "https://github.com/chklovski/CheckM2" + doi: "10.1038/s41592-023-01940-w" + licence: ["GPL v3"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test' ]` + - fasta: + type: file + description: One or multiple FASTA files of each bin + pattern: "*.{fasta,fna,fa}" + - - dbmeta: + type: map + description: | + Groovy Map containing database information + e.g. `[ id:'test', version:1 ]` + - db: + type: file + description: CheckM2 database +output: + - checkm2_output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - ${prefix}: + type: directory + description: CheckM2 output directory + pattern: "${prefix}/" + - checkm2_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - ${prefix}/quality_report.tsv: + type: file + description: CheckM2 summary completeness statistics table + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@dialvarezs" diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test b/modules/nf-core/checkm2/predict/tests/main.nf.test new file mode 100644 index 00000000..e825f74c --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test @@ -0,0 +1,46 @@ +nextflow_process { + + name "Test Process CHECKM2_PREDICT" + tag "modules_nfcore" + tag "modules" + tag "checkm2" + tag "checkm2/predict" + tag "checkm2/databasedownload" + script "modules/nf-core/checkm2/predict/main.nf" + process "CHECKM2_PREDICT" + + test("Test CheckM2 Predict") { + + setup { + run("CHECKM2_DATABASEDOWNLOAD") { + script "../../databasedownload/main.nf" + process { + """ + input[0] = [] + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = [ [id: 'test'], [file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)] ] + input[1] = CHECKM2_DATABASEDOWNLOAD.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.checkm2_tsv, 
process.out.versions).match() } + ) + } + + } + +} diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test.snap b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap new file mode 100644 index 00000000..6fd2e918 --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap @@ -0,0 +1,18 @@ +{ + "Test CheckM2 Predict": { + "content": [ + [ + [ + { + "id": "test" + }, + "quality_report.tsv:md5,7f05ff49d18697304575d1106a871501" + ] + ], + [ + "versions.yml:md5,088ec2d8a46efd530c11019328064bff" + ] + ], + "timestamp": "2024-09-16T22:43:50.787486798" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm2/predict/tests/tags.yml b/modules/nf-core/checkm2/predict/tests/tags.yml new file mode 100644 index 00000000..c31d112a --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/tags.yml @@ -0,0 +1,3 @@ +checkm2/predict: + - modules/nf-core/checkm2/predict/** + - modules/nf-core/checkm2/databasedownload/** diff --git a/modules/nf-core/gunc/mergecheckm/environment.yml b/modules/nf-core/gunc/mergecheckm/environment.yml new file mode 100644 index 00000000..3a0264f4 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::gunc=1.0.6 diff --git a/modules/nf-core/gunc/mergecheckm/main.nf b/modules/nf-core/gunc/mergecheckm/main.nf index b6399f22..611f916c 100644 --- a/modules/nf-core/gunc/mergecheckm/main.nf +++ b/modules/nf-core/gunc/mergecheckm/main.nf @@ -2,10 +2,10 @@ process GUNC_MERGECHECKM { tag "$meta.id" label 'process_single' - conda "bioconda::gunc=1.0.5" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : - 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gunc:1.0.6--pyhdfd78af_0' : + 'biocontainers/gunc:1.0.6--pyhdfd78af_0' }" input: tuple val(meta), path(gunc_file), path(checkm_file) @@ -33,4 +33,14 @@ process GUNC_MERGECHECKM { gunc: \$( gunc --version ) END_VERSIONS """ + + stub: + """ + touch gunc_merge_checkm.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ } diff --git a/modules/nf-core/gunc/mergecheckm/meta.yml b/modules/nf-core/gunc/mergecheckm/meta.yml index a88298f7..4a7a2c1c 100644 --- a/modules/nf-core/gunc/mergecheckm/meta.yml +++ b/modules/nf-core/gunc/mergecheckm/meta.yml @@ -11,42 +11,45 @@ keywords: - chimeras tools: - gunc: - description: Python package for detection of chimerism and contamination in prokaryotic genomes. + description: Python package for detection of chimerism and contamination in prokaryotic + genomes. homepage: https://grp-bork.embl-community.io/gunc/ documentation: https://grp-bork.embl-community.io/gunc/ tool_dev_url: https://github.com/grp-bork/gunc doi: "10.1186/s13059-021-02393-0" licence: ["GNU General Public v3 or later (GPL v3+)"] - + identifier: biotools:gunc input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - gunc_file: - type: file - description: Path of a gunc_scores.tsv file (mandatory) - pattern: "*.{bam,cram,sam}" - - checkm_file: - type: file - description: Output TSV from CheckM qa (ideally with -o 2 extended format) (mandatory) - pattern: "*.{bam,cram,sam}" - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - gunc_file: + type: file + description: Path of a gunc_scores.tsv file (mandatory) + pattern: "*.tsv" + - checkm_file: + type: file + description: Output TSV from CheckM qa (ideally with -o 2 extended format) (mandatory) + pattern: "*.{txt,tsv}" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - tsv: - type: file - description: Merged checkm/gunc results in TSV format - pattern: "*.tsv" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tsv": + type: file + description: Merged checkm/gunc results in TSV format + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gunc/mergecheckm/tests/main.nf.test b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test new file mode 100644 index 00000000..dbd67b90 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test @@ -0,0 +1,175 @@ +nextflow_process { + + name "Test Process GUNC_MERGECHECKM" + script "../main.nf" + process "GUNC_MERGECHECKM" + config "./nextflow.config" + + tag "modules_nfcore" + tag "modules" + tag "gunc" + tag "gunc/mergecheckm" + tag "gunc/run" + tag "gunc/downloaddb" + tag "checkm/lineagewf" + tag "checkm/qa" + + // commented out because GitHub runners are not able to run this test + // test("gunc - mergecheckm") { + + // setup { + // run("CHECKM_LINEAGEWF") { + // script "../../../checkm/lineagewf/main.nf" + // process { + // """ + // input[0] = [ + // [id: 'test'], // meta map + // file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // ) + // ] + //
input[1] = 'fasta' + // input[2] = [] // Download CheckM database + // """ + // } + // } + + // run("CHECKM_QA") { + // script "../../../checkm/qa/main.nf" + // process { + // """ + // input[0] = CHECKM_LINEAGEWF.out.checkm_output + // .join(CHECKM_LINEAGEWF.out.marker_file) + // .map { sample_data -> sample_data + [file('NO_FILE')] } + // input[1] = [] + // """ + // } + // } + + // run("GUNC_DOWNLOADDB") { + // script "../../downloaddb/main.nf" + // process { + // """ + // input[0] = 'progenomes' + // """ + // } + // } + + // run("GUNC_RUN") { + // script "../../run/main.nf" + // process { + // """ + // input[0] = [ + // [id: 'test'], + // [file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // )] + // ] + // input[1] = GUNC_DOWNLOADDB.out.db + // """ + // } + // } + // } + + // when { + // params { + // outdir = "${launchDir}/tests/results" + // } + // process { + // """ + // input[0] = GUNC_RUN.out.maxcss_level_tsv.join(CHECKM_QA.out.output) + // """ + // } + // } + + // then { + // assertAll( + // { assert process.success }, + // { assert snapshot(process.out).match() } + // ) + // } + + // } + + test("gunc - mergecheckm - stub") { + + options "-stub" + + setup { + run("CHECKM_LINEAGEWF") { + script "../../../checkm/lineagewf/main.nf" + process { + """ + input[0] = [ + [id: 'test'], // meta map + file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + ) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + run("CHECKM_QA") { + script "../../../checkm/qa/main.nf" + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + run("GUNC_DOWNLOADDB") { + script "../../downloaddb/main.nf" + process { + """ + input[0] = 'progenomes' + """ + } + } + + run("GUNC_RUN") { + script 
"../../run/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + [file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + )] + ] + input[1] = GUNC_DOWNLOADDB.out.db + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = GUNC_RUN.out.maxcss_level_tsv.join(CHECKM_QA.out.output) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap new file mode 100644 index 00000000..807c23f2 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "gunc - mergecheckm": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "GUNC_checkM.merged.tsv:md5,24cbd3c76a36cb90ac993c83525a2c1b" + ] + ], + "1": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ], + "tsv": [ + [ + { + "id": "test" + }, + "GUNC_checkM.merged.tsv:md5,24cbd3c76a36cb90ac993c83525a2c1b" + ] + ], + "versions": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-22T09:37:48.146410153" + }, + "gunc - mergecheckm - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "gunc_merge_checkm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ], + "tsv": [ + [ + { + "id": "test" + }, + "gunc_merge_checkm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-21T16:47:06.752273424" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/gunc/mergecheckm/tests/nextflow.config b/modules/nf-core/gunc/mergecheckm/tests/nextflow.config new file mode 100644 index 00000000..1e9ba3dc --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CHECKM_QA { + ext.args = '--tab_table' + } +} diff --git a/modules/nf-core/gunc/mergecheckm/tests/tags.yml b/modules/nf-core/gunc/mergecheckm/tests/tags.yml new file mode 100644 index 00000000..d05282f2 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/tags.yml @@ -0,0 +1,6 @@ +gunc/mergecheckm: + - modules/nf-core/gunc/mergecheckm/** + - modules/nf-core/gunc/run/** + - modules/nf-core/gunc/downloaddb/** + - modules/nf-core/checkm/lineagewf/** + - modules/nf-core/checkm/qa/** diff --git a/modules/nf-core/gunc/run/environment.yml b/modules/nf-core/gunc/run/environment.yml new file mode 100644 index 00000000..3a0264f4 --- /dev/null +++ b/modules/nf-core/gunc/run/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::gunc=1.0.6 diff --git a/modules/nf-core/gunc/run/main.nf b/modules/nf-core/gunc/run/main.nf index 2f1167fa..9ee614e4 100644 --- a/modules/nf-core/gunc/run/main.nf +++ b/modules/nf-core/gunc/run/main.nf @@ -2,13 +2,13 @@ process GUNC_RUN { tag "$meta.id" label 'process_medium' - conda "bioconda::gunc=1.0.5" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : - 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gunc:1.0.6--pyhdfd78af_0' : + 'biocontainers/gunc:1.0.6--pyhdfd78af_0' }" input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta_files, stageAs: 'input_files/*') path(db) output: @@ -23,9 +23,10 @@ process GUNC_RUN { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ + ls input_files/* > input_files.txt gunc \\ run \\ - --input_fasta $fasta \\ + --input_file input_files.txt \\ --db_file $db \\ --threads $task.cpus \\ $args @@ -35,4 +36,14 @@ process GUNC_RUN { gunc: \$( gunc --version ) END_VERSIONS """ + + stub: + """ + touch maxCSS_level.tsv all_levels.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ } diff --git a/modules/nf-core/gunc/run/meta.yml b/modules/nf-core/gunc/run/meta.yml index 3a85e1fb..3ecc0b74 100644 --- a/modules/nf-core/gunc/run/meta.yml +++ b/modules/nf-core/gunc/run/meta.yml @@ -8,46 +8,55 @@ keywords: - chimeras tools: - gunc: - description: Python package for detection of chimerism and contamination in prokaryotic genomes. + description: Python package for detection of chimerism and contamination in prokaryotic + genomes. homepage: https://grp-bork.embl-community.io/gunc/ documentation: https://grp-bork.embl-community.io/gunc/ tool_dev_url: https://github.com/grp-bork/gunc doi: "10.1186/s13059-021-02393-0" licence: ["GNU General Public v3 or later (GPL v3+)"] - + identifier: biotools:gunc input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fasta: - type: file - description: FASTA file containing contig (bins) - pattern: "*.fa" - - db: - type: file - description: GUNC database file - pattern: "*.dmnd" - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta_files: + type: file + description: A list of FASTA files containing contig (bins) + pattern: "*.fa" + - - db: + type: file + description: GUNC database file + pattern: "*.dmnd" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - maxcss_levels_tsv: - type: file - description: Output file with scores for a taxonomic level with the highest CSS score - pattern: "*.tsv" + - maxcss_level_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*maxCSS_level.tsv": + type: file + description: Output file with results for the maximum CSS level + pattern: "*.tsv" - all_levels_tsv: - type: file - description: Optional output file with results for each taxonomic level - pattern: "*.tsv" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*all_levels.tsv": + type: file + description: Optional output file with results for each taxonomic level + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gunc/run/tests/main.nf.test b/modules/nf-core/gunc/run/tests/main.nf.test new file mode 100644 index 00000000..c1659f0c --- /dev/null +++ b/modules/nf-core/gunc/run/tests/main.nf.test @@ -0,0 +1,96 @@ +nextflow_process { + + name "Test Process GUNC_RUN" + script "../main.nf" + process "GUNC_RUN" + + tag "modules_nfcore" + tag "modules" + tag "gunc" + tag "gunc/run" + tag "gunc/downloaddb" + + // commented out because GitHub runners are not able to run this test + // test("gunc - run") { + + // setup { + // run("GUNC_DOWNLOADDB") { + // script "../../downloaddb/main.nf" + // process { + // """ + // input[0] = 'progenomes' + // """ + // } + // } + // } + + // when { + // params { + // outdir = "${launchDir}/tests/results" + // } + // process { + // """ + // input[0] = [ + // [id: 'test'], + // [file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // )] + // ] + // input[1] = GUNC_DOWNLOADDB.out.db + // """ + // } + // } + + // then { + // assertAll( + // { assert process.success }, + // { assert snapshot(process.out).match() } + // ) + // } + + // } + + test("gunc - run - stub") { + + options "-stub" + + setup { + run("GUNC_DOWNLOADDB") { + script "../../downloaddb/main.nf" + process { + """ + input[0] = 'progenomes' + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = [ + [id: 'test'], + [file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + )] + ] + input[1] = GUNC_DOWNLOADDB.out.db + """ + } + } + + then { + assertAll( 
+ { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/gunc/run/tests/main.nf.test.snap b/modules/nf-core/gunc/run/tests/main.nf.test.snap new file mode 100644 index 00000000..516425c8 --- /dev/null +++ b/modules/nf-core/gunc/run/tests/main.nf.test.snap @@ -0,0 +1,90 @@ +{ + "gunc - run - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "maxCSS_level.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "all_levels.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ], + "all_levels_tsv": [ + [ + { + "id": "test" + }, + "all_levels.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "maxcss_level_tsv": [ + [ + { + "id": "test" + }, + "maxCSS_level.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-21T17:29:46.904708749" + }, + "gunc - run": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "GUNC.progenomes_2.1.maxCSS_level.tsv:md5,938826458a44404d0bf2e7cb4edde405" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ], + "all_levels_tsv": [ + + ], + "maxcss_level_tsv": [ + [ + { + "id": "test" + }, + "GUNC.progenomes_2.1.maxCSS_level.tsv:md5,938826458a44404d0bf2e7cb4edde405" + ] + ], + "versions": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-22T10:12:03.813571948" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/run/tests/tags.yml b/modules/nf-core/gunc/run/tests/tags.yml new file mode 100644 index 00000000..0af96444 --- /dev/null +++ b/modules/nf-core/gunc/run/tests/tags.yml @@ -0,0 +1,3 @@ +gunc/run: + - 
modules/nf-core/gunc/run/** + - modules/nf-core/gunc/downloaddb/** diff --git a/nextflow.config b/nextflow.config index 774e2ad2..cdb2d9f0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -123,7 +123,7 @@ params { refine_bins_dastool = false refine_bins_dastool_threshold = 0.5 postbinning_input = 'raw_bins_only' - exclude_unbins_from_postbinning = false + exclude_unbins_from_postbinning = false // Bin QC skip_binqc = false @@ -135,6 +135,9 @@ params { checkm_download_url = "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz" checkm_db = null save_checkm_data = false + checkm2_db = null + checkm2_db_version = 5571251 // corresponds to Zenodo record ID + save_checkm2_data = false run_gunc = false gunc_database_type = 'progenomes' gunc_db = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 32acf06f..dedb286b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -731,13 +731,13 @@ "properties": { "skip_binqc": { "type": "boolean", - "description": "Disable bin QC with BUSCO or CheckM." + "description": "Disable bin QC with BUSCO, CheckM or CheckM2." }, "binqc_tool": { "type": "string", "default": "busco", "description": "Specify which tool for bin quality-control validation to use.", - "enum": ["busco", "checkm"] + "enum": ["busco", "checkm", "checkm2"] }, "busco_db": { "type": "string", @@ -775,6 +775,21 @@ "description": "Save the used CheckM reference files downloaded when not using --checkm_db parameter.", "help_text": "If specified, the directories and files decompressed from the `tar.gz` file downloaded from the [CheckM FTP server](https://data.ace.uq.edu.au/public/CheckM_databases/) will be stored in your output directory alongside your CheckM results." 
}, + "checkm2_db": { + "type": "string", + "description": "Path to local folder containing already downloaded and uncompressed CheckM2 database (.dmnd file).", + "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`)." + }, + "checkm2_db_version": { + "type": "integer", + "default": 5571251, + "description": "CheckM2 database version number to download (Zenodo record ID, for reference check the canonical reference https://zenodo.org/records/5571251, and pick the Zenodo ID of the database version of your choice)." + }, + "save_checkm2_data": { + "type": "boolean", + "description": "Save the used CheckM2 reference files downloaded when not using --checkm2_db parameter.", + "help_text": "If specified, the directories and files decompressed from the `tar.gz` file downloaded from the [Zenodo repository](https://zenodo.org/records/5571251) will be stored in your output directory alongside your CheckM2 results." + }, "refine_bins_dastool": { "type": "boolean", "description": "Turn on bin refinement using DAS Tool." 
diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf new file mode 100644 index 00000000..5a83d140 --- /dev/null +++ b/subworkflows/local/bin_qc.nf @@ -0,0 +1,234 @@ +/* + * BUSCO/CheckM/CheckM2/GUNC: Quantitative measures for the assessment of genome assembly + */ + +include { ARIA2 as ARIA2_UNTAR } from '../../modules/nf-core/aria2/main' +include { CHECKM2_DATABASEDOWNLOAD } from '../../modules/nf-core/checkm2/databasedownload/main' +include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' +include { BUSCO } from '../../modules/local/busco' +include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' +include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' +include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' +include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' +include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' +include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv' +include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' +include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' +include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' + + +workflow BIN_QC { + take: + ch_bins // [ [ meta] , fasta ], input bins (mandatory) + + main: + qc_summary = [] + ch_input_bins_for_qc = ch_bins.transpose() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + + /* + ================================ + * Setup databases + ================================ + */ + + if (params.busco_db) { + ch_busco_db = file(params.busco_db, checkIfExists: true) + } + else { + ch_busco_db = [] + } + + if (params.checkm_db) { + ch_checkm_db = file(params.checkm_db, checkIfExists: true) + } + else if (params.binqc_tool == 'checkm') { + ARIA2_UNTAR(params.checkm_download_url) + ch_checkm_db = ARIA2_UNTAR.out.downloaded_file + } + else { + ch_checkm_db = [] + } 
+ + if (params.checkm2_db) { + ch_checkm2_db = [[:], file(params.checkm2_db, checkIfExists: true)] + } + else if (params.binqc_tool == 'checkm2') { + CHECKM2_DATABASEDOWNLOAD(params.checkm2_db_version) + ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database + } + else { + ch_checkm2_db = [] + } + + if (params.gunc_db) { + ch_gunc_db = file(params.gunc_db, checkIfExists: true) + } + else { + ch_gunc_db = Channel.empty() + } + + + /* + ================================ + * Run QC tools + ================================ + */ + + if (params.binqc_tool == "busco") { + /* + * BUSCO + */ + if (!ch_busco_db.isEmpty()) { + if (ch_busco_db.extension in ['gz', 'tgz']) { + // Expects to be tar.gz! + BUSCO_DB_PREPARATION(ch_busco_db) + ch_db_for_busco = BUSCO_DB_PREPARATION.out.db.map { meta, db -> + [[id: meta, lineage: 'Y'], db] + } + } + else if (ch_busco_db.isDirectory()) { + // Set meta to match expected channel cardinality for BUSCO + ch_db_for_busco = Channel + .of(ch_busco_db) + .collect { db -> + def basename = db.getBaseName() + def lineage = basename.contains('odb10') ? 
'Y' : 'N' + [[id: basename, lineage: lineage], db] + } + } + } + else { + // Set BUSCO database to empty to allow for --auto-lineage + ch_db_for_busco = Channel + .of([[lineage: ''], []]) + .collect() + } + + BUSCO(ch_input_bins_for_qc, ch_db_for_busco) + + if (params.save_busco_db) { + // publish files downloaded by Busco (kept after the BUSCO call: BUSCO.out is undefined until the process has been invoked) + ch_downloads = BUSCO.out.busco_downloads + .groupTuple() + .map { _lin, downloads -> downloads[0] } + .toSortedList() + .flatten() + BUSCO_SAVE_DOWNLOAD(ch_downloads) + + ch_versions = ch_versions.mix(BUSCO_SAVE_DOWNLOAD.out.versions.first()) + } + + BUSCO_SUMMARY( + BUSCO.out.summary_domain.collect { _meta, summary -> summary }.ifEmpty([]), + BUSCO.out.summary_specific.collect { _meta, summary -> summary }.ifEmpty([]), + BUSCO.out.failed_bin.collect { _meta, summary -> summary }.ifEmpty([]) + ) + + ch_multiqc_files = ch_multiqc_files.mix( + BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { _meta, summary -> summary } + ) + qc_summary = BUSCO_SUMMARY.out.summary + ch_versions = ch_versions.mix(BUSCO.out.versions.first()) + } + else if (params.binqc_tool == "checkm") { + /* + * CheckM + */ + ch_bins_for_checkmlineagewf = ch_input_bins_for_qc + .groupTuple() + .filter { meta, _bins -> + meta.domain != "eukarya" + } + .multiMap { meta, fa -> + reads: [meta, fa] + ext: fa.extension.unique().join("") + } + + CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, ch_checkm_db) + ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) + + ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map { meta, dir, marker -> + [meta, dir, marker, []] + } + + CHECKM_QA(ch_checkmqa_input, []) + + COMBINE_BINQC_TSV(CHECKM_QA.out.output.collect { summary -> summary[1] }) + + qc_summary = COMBINE_BINQC_TSV.out.combined + ch_versions = ch_versions.mix( + CHECKM_QA.out.versions.first(), + COMBINE_BINQC_TSV.out.versions + ) + } + else if (params.binqc_tool == 
"checkm2") { + /* + * CheckM2 + */ + CHECKM2_PREDICT(ch_input_bins_for_qc.groupTuple(), ch_checkm2_db) + + COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.collect { summary -> summary[1] }) + + qc_summary = COMBINE_BINQC_TSV.out.combined + ch_versions = ch_versions.mix( + CHECKM2_PREDICT.out.versions.first(), + COMBINE_BINQC_TSV.out.versions + ) + } + + if (params.run_gunc) { + /* + * GUNC (note: mix() returns a new channel, so every result is re-assigned to ch_versions) + */ + ch_input_bins_for_gunc = ch_bins + .filter { meta, _bins -> + meta.domain != "eukarya" + } + + if (params.gunc_db) { + ch_db_for_gunc = ch_gunc_db + } + else { + ch_db_for_gunc = GUNC_DOWNLOADDB(params.gunc_database_type).db + ch_versions = ch_versions.mix(GUNC_DOWNLOADDB.out.versions) + } + + GUNC_RUN(ch_input_bins_for_gunc, ch_db_for_gunc) + ch_versions = ch_versions.mix(GUNC_RUN.out.versions.first()) + + // Make sure to keep directory in sync with modules.conf + GUNC_RUN.out.maxcss_level_tsv + .map { _meta, gunc_summary -> gunc_summary } + .collectFile( + name: "gunc_summary.tsv", + keepHeader: true, + storeDir: "${params.outdir}/GenomeBinning/QC/" + ) + + if (params.binqc_tool == 'checkm') { + ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv.combine(CHECKM_QA.out.output, by: 0) + + GUNC_MERGECHECKM(ch_input_to_mergecheckm) + ch_versions = ch_versions.mix(GUNC_MERGECHECKM.out.versions.first()) + + // Make sure to keep directory in sync with modules.conf + GUNC_MERGECHECKM.out.tsv + .map { _meta, gunc_checkm_summary -> gunc_checkm_summary } + .collectFile( + name: "gunc_checkm_summary.tsv", + keepHeader: true, + storeDir: "${params.outdir}/GenomeBinning/QC/" + ) + } + } + + emit: + qc_summary = qc_summary + multiqc_files = ch_multiqc_files + versions = ch_versions +} diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf deleted file mode 100644 index a5c3be8d..00000000 --- a/subworkflows/local/busco_qc.nf +++ /dev/null @@ -1,83 +0,0 @@ -/* - * BUSCO: Quantitative measures for the assessment of genome assembly - */ - -include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' 
-include { BUSCO } from '../../modules/local/busco' -include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' -include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' - -workflow BUSCO_QC { - take: - busco_db // channel: path - bins // channel: [ val(meta), path(bin) ] - - main: - if ( !busco_db.isEmpty() ) { - if ( busco_db.extension in ['gz', 'tgz'] ) { - // Expects to be tar.gz! - ch_db_for_busco = BUSCO_DB_PREPARATION ( busco_db ).db - .map{ - meta, db -> - def meta_new = [:] - meta_new['id'] = meta - meta_new['lineage'] = 'Y' - [ meta_new, db ] - } - } else if ( busco_db.isDirectory() ) { - // Set meta to match expected channel cardinality for BUSCO - ch_db_for_busco = Channel - .of(busco_db) - .map{ - db -> - def meta = [:] - meta['id'] = db.getBaseName() - if ( meta['id'].contains('odb10') == true ) { - meta['lineage'] = 'Y' - } else { - meta['lineage'] = 'N' - } - [ meta, db ] - } - .collect() - } - } else { - // Set BUSCO database to empty to allow for --auto-lineage - ch_db_for_busco = Channel - .of([]) - .map{ - empty_db -> - def meta = [:] - meta['lineage'] = '' - [ meta, [] ] - } - .collect() - } - - BUSCO ( - bins, - ch_db_for_busco - ) - - if (params.save_busco_db){ - // publish files downloaded by Busco - ch_downloads = BUSCO.out.busco_downloads.groupTuple().map{lin,downloads -> downloads[0]}.toSortedList().flatten() - BUSCO_SAVE_DOWNLOAD ( ch_downloads ) - } - - busco_summary_domain = BUSCO.out.summary_domain.collect() - busco_summary_specific = BUSCO.out.summary_specific.collect() - busco_failed_bin = BUSCO.out.failed_bin.collect() - - BUSCO_SUMMARY ( - BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]), - BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]), - BUSCO.out.failed_bin.map{it[1]}.collect().ifEmpty([]) - ) - - emit: - summary = BUSCO_SUMMARY.out.summary - failed_bin = BUSCO.out.failed_bin.map{it[1]} - multiqc = BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{it[1]} - versions 
= BUSCO.out.versions -} diff --git a/subworkflows/local/checkm_qc.nf b/subworkflows/local/checkm_qc.nf deleted file mode 100644 index 70ed9708..00000000 --- a/subworkflows/local/checkm_qc.nf +++ /dev/null @@ -1,44 +0,0 @@ -/* - * CheckM: Quantitative measures for the assessment of genome assembly - */ - -include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' -include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' -include { COMBINE_TSV as COMBINE_CHECKM_TSV } from '../../modules/local/combine_tsv' - -workflow CHECKM_QC { - take: - bins // channel: [ val(meta), path(bin) ] - checkm_db - - main: - ch_versions = Channel.empty() - - ch_input_checkmdb = checkm_db ? checkm_db : [] - ch_bins_for_checkmlineagewf = bins - .multiMap { - meta, fa -> - reads: [ meta, fa ] - ext: fa.extension.unique().join("") // we set this in the pipeline to always `.fa` so this should be fine - } - - CHECKM_LINEAGEWF ( ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db ) - ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) - - ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output - .join(CHECKM_LINEAGEWF.out.marker_file) - .map{ - meta, dir, marker -> - [ meta, dir, marker, []] - } - - CHECKM_QA ( ch_checkmqa_input, [] ) - ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) - - COMBINE_CHECKM_TSV ( CHECKM_QA.out.output.map{it[1]}.collect() ) - - emit: - summary = COMBINE_CHECKM_TSV.out.combined - checkm_tsv = CHECKM_QA.out.output - versions = ch_versions -} diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 6da5680d..d3d66d47 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -9,8 +9,7 @@ include { GTDBTK_SUMMARY } from '../../modules/local/gtdbtk_summary' workflow GTDBTK { take: bins // channel: [ val(meta), [bins] ] - busco_summary // channel: path - checkm_summary // channel: path + bin_qc_summary // channel: path gtdb // channel: path 
gtdb_mash // channel: path @@ -19,14 +18,13 @@ workflow GTDBTK { ch_bin_metrics = Channel.empty() if ( params.binqc_tool == 'busco' ){ // Collect completeness and contamination metrics from busco summary - ch_bin_metrics = busco_summary + ch_bin_metrics = bin_qc_summary .splitCsv(header: true, sep: '\t') .map { row -> def completeness = -1 def contamination = -1 def missing, duplicated - def busco_db = file(params.busco_db) - if (busco_db.getBaseName().contains('odb10')) { + if (params.busco_db && file(params.busco_db).getBaseName().contains('odb10')) { missing = row.'%Missing (specific)' // TODO or just take '%Complete'? duplicated = row.'%Complete and duplicated (specific)' } else { @@ -38,13 +36,15 @@ workflow GTDBTK { [row.'GenomeBin', completeness, contamination] } } else { - // Collect completeness and contamination metrics from checkm summary - ch_bin_metrics = checkm_summary + // Collect completeness and contamination metrics from CheckM/CheckM2 summary + bin_name = params.binqc_tool == 'checkm' ? 
'Bin Id' : 'Name' + + ch_bin_metrics = bin_qc_summary .splitCsv(header: true, sep: '\t') .map { row -> def completeness = Double.parseDouble(row.'Completeness') def contamination = Double.parseDouble(row.'Contamination') - [row.'Bin Id' + ".fa", completeness, contamination] + [row[bin_name] + ".fa", completeness, contamination] } } diff --git a/subworkflows/local/gunc_qc.nf b/subworkflows/local/gunc_qc.nf deleted file mode 100644 index 912b9425..00000000 --- a/subworkflows/local/gunc_qc.nf +++ /dev/null @@ -1,51 +0,0 @@ -/* - * GUNC: Detection and quantification of genome chimerism based on lineage homogeneity - */ - -include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' -include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' -include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' - -workflow GUNC_QC { - take: - ch_bins // [ [ meta] , fasta ], input bins (mandatory) - ch_gunc_db // [ db ], presupplied GUNC database (optional) - ch_checkm_table // [ [ meta ], checkm_qa_table ], extended checkm table from CHECKM_QA, (optional) - - main: - ch_versions = Channel.empty() - - if ( params.gunc_db ) { - ch_db_for_gunc = ch_gunc_db - } else { - ch_db_for_gunc = GUNC_DOWNLOADDB( params.gunc_database_type ).db - ch_versions.mix( GUNC_DOWNLOADDB.out.versions ) - } - - - GUNC_RUN ( ch_bins, ch_db_for_gunc ) - ch_versions.mix( GUNC_RUN.out.versions ) - - // Make sure to keep directory in sync with modules.conf - GUNC_RUN.out.maxcss_level_tsv - .map{it[1]} - .collectFile(name: "gunc_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") - - if ( params.binqc_tool == 'checkm' ) { - - ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv - .combine(ch_checkm_table, by: 0) - - GUNC_MERGECHECKM ( ch_input_to_mergecheckm ) - ch_versions.mix( GUNC_MERGECHECKM.out.versions ) - - // Make sure to keep directory in sync with modules.conf - GUNC_MERGECHECKM.out.tsv - .map{it[1]} - .collectFile(name: 
"gunc_checkm_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") - } - - emit: - versions = ch_versions - -} diff --git a/workflows/mag.nf b/workflows/mag.nf index 7f4ae3ec..1ef0e52b 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -14,11 +14,9 @@ include { methodsDescriptionText } from '../subwo // include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' include { BINNING } from '../subworkflows/local/binning' +include { BIN_QC } from '../subworkflows/local/bin_qc' include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' -include { BUSCO_QC } from '../subworkflows/local/busco_qc' include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' -include { CHECKM_QC } from '../subworkflows/local/checkm_qc' -include { GUNC_QC } from '../subworkflows/local/gunc_qc' include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' @@ -29,7 +27,6 @@ include { SHORTREAD_PREPROCESSING } from '../subwo // // MODULE: Installed directly from nf-core/modules // -include { ARIA2 as ARIA2_UNTAR } from '../modules/nf-core/aria2/main' include { UNTAR as CENTRIFUGEDB_UNTAR } from '../modules/nf-core/untar/main' include { CENTRIFUGE_CENTRIFUGE } from '../modules/nf-core/centrifuge/centrifuge/main' include { CENTRIFUGE_KREPORT } from '../modules/nf-core/centrifuge/kreport/main' @@ -66,7 +63,7 @@ include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modul workflow MAG { take: - ch_raw_short_reads // channel: samplesheet read in from --input + ch_raw_short_reads // channel: samplesheet read in from --input ch_raw_long_reads ch_input_assemblies @@ -92,24 +89,6 @@ workflow MAG { ch_host_fasta = Channel.empty() } - if (params.busco_db) { - ch_busco_db = file(params.busco_db, checkIfExists: true) - } - else { - ch_busco_db = [] - } - 
- if (params.checkm_db) { - ch_checkm_db = file(params.checkm_db, checkIfExists: true) - } - - if (params.gunc_db) { - ch_gunc_db = file(params.gunc_db, checkIfExists: true) - } - else { - ch_gunc_db = Channel.empty() - } - if (params.kraken2_db) { ch_kraken2_db_file = file(params.kraken2_db, checkIfExists: true) } @@ -165,16 +144,6 @@ workflow MAG { ch_metaeuk_db = Channel.empty() } - // Additional info for completion email and summary - def busco_failed_bins = [:] - - // Get checkM database if not supplied - - if (!params.skip_binqc && params.binqc_tool == 'checkm' && !params.checkm_db) { - ARIA2_UNTAR(params.checkm_download_url) - ch_checkm_db = ARIA2_UNTAR.out.downloaded_file - } - // Get mmseqs db for MetaEuk if requested if (!params.skip_metaeuk && params.metaeuk_mmseqs_db) { MMSEQS_DATABASES(params.metaeuk_mmseqs_db) @@ -503,8 +472,7 @@ workflow MAG { ================================================================================ */ - ch_busco_summary = Channel.empty() - ch_checkm_summary = Channel.empty() + ch_bin_qc_summary = Channel.empty() if (!params.skip_binning || params.ancient_dna) { BINNING_PREPARATION( @@ -644,58 +612,14 @@ workflow MAG { ch_versions = ch_versions.mix(DEPTHS.out.versions) /* - * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, and/or GUNC + * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, CHECKM2, and/or GUNC */ - ch_input_bins_for_qc = ch_input_for_postbinning.transpose() + if (!params.skip_binqc) { + BIN_QC(ch_input_for_postbinning) - if (!params.skip_binqc && params.binqc_tool == 'busco') { - /* - * BUSCO subworkflow: Quantitative measures for the assessment of genome assembly - */ - - BUSCO_QC( - ch_busco_db, - ch_input_bins_for_qc - ) - ch_busco_summary = BUSCO_QC.out.summary - ch_versions = ch_versions.mix(BUSCO_QC.out.versions.first()) - // process information if BUSCO analysis failed for individual bins due to no matching genes - 
BUSCO_QC.out.failed_bin.splitCsv(sep: '\t').map { bin, error -> - if (!bin.contains(".unbinned.")) { - busco_failed_bins[bin] = error - } - } - } - - if (!params.skip_binqc && params.binqc_tool == 'checkm') { - /* - * CheckM subworkflow: Quantitative measures for the assessment of genome assembly - */ - - ch_input_bins_for_checkm = ch_input_bins_for_qc.filter { meta, bins -> - meta.domain != "eukarya" - } - - CHECKM_QC( - ch_input_bins_for_checkm.groupTuple(), - ch_checkm_db - ) - ch_checkm_summary = CHECKM_QC.out.summary - - ch_versions = ch_versions.mix(CHECKM_QC.out.versions) - } - - if (params.run_gunc && params.binqc_tool == 'checkm') { - GUNC_QC(ch_input_bins_for_checkm, ch_gunc_db, CHECKM_QC.out.checkm_tsv) - ch_versions = ch_versions.mix(GUNC_QC.out.versions) - } - else if (params.run_gunc) { - ch_input_bins_for_gunc = ch_input_for_postbinning.filter { meta, bins -> - meta.domain != "eukarya" - } - GUNC_QC(ch_input_bins_for_gunc, ch_gunc_db, []) - ch_versions = ch_versions.mix(GUNC_QC.out.versions) + ch_bin_qc_summary = BIN_QC.out.qc_summary + ch_versions = ch_versions.mix(BIN_QC.out.versions) } ch_quast_bins_summary = Channel.empty() @@ -766,8 +690,7 @@ workflow MAG { GTDBTK( ch_gtdb_bins, - ch_busco_summary, - ch_checkm_summary, + ch_bin_qc_summary, gtdb, gtdb_mash ) @@ -782,11 +705,11 @@ workflow MAG { if ((!params.skip_binqc) || !params.skip_quast || !params.skip_gtdbtk) { BIN_SUMMARY( ch_input_for_binsummary, - ch_busco_summary.ifEmpty([]), - ch_checkm_summary.ifEmpty([]), + ch_bin_qc_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), - ch_cat_global_summary.ifEmpty([]) + ch_cat_global_summary.ifEmpty([]), + params.binqc_tool ) } @@ -899,7 +822,7 @@ workflow MAG { } if (!params.skip_binning && !params.skip_binqc && params.binqc_tool == 'busco') { - ch_multiqc_files = ch_multiqc_files.mix(BUSCO_QC.out.multiqc.collect().ifEmpty([])) + ch_multiqc_files = 
ch_multiqc_files.mix(BIN_QC.out.multiqc_files.collect().ifEmpty([])) }