From 53864d2260b382a9f5fad35c7c68c0d7e487db11 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Sun, 27 Oct 2024 18:17:53 -0300 Subject: [PATCH 01/33] feat: Migrate from CheckM to CheckM2 - Update modules - Update integration in mag and with other tools (bin_summary, gtdb-tk) - Update test - Update schema --- .github/workflows/ci.yml | 18 ++-- bin/combine_tables.py | 85 ++++++------------- conf/base.config | 6 +- conf/modules.config | 29 ++++--- modules.json | 14 +-- modules/local/bin_summary.nf | 6 +- modules/local/combine_tsv.nf | 2 +- modules/nf-core/aria2/aria2.diff | 24 ------ modules/nf-core/aria2/main.nf | 41 --------- modules/nf-core/aria2/meta.yml | 30 ------- modules/nf-core/checkm/lineagewf/main.nf | 47 ---------- modules/nf-core/checkm/lineagewf/meta.yml | 65 -------------- modules/nf-core/checkm/qa/main.nf | 44 ---------- modules/nf-core/checkm/qa/meta.yml | 67 --------------- .../checkm2/databasedownload/environment.yml | 5 ++ .../nf-core/checkm2/databasedownload/main.nf | 55 ++++++++++++ .../nf-core/checkm2/databasedownload/meta.yml | 42 +++++++++ .../databasedownload/tests/main.nf.test | 30 +++++++ .../databasedownload/tests/main.nf.test.snap | 10 +++ .../checkm2/databasedownload/tests/tags.yml | 2 + .../nf-core/checkm2/predict/environment.yml | 5 ++ modules/nf-core/checkm2/predict/main.nf | 52 ++++++++++++ modules/nf-core/checkm2/predict/meta.yml | 65 ++++++++++++++ .../checkm2/predict/tests/main.nf.test | 46 ++++++++++ .../checkm2/predict/tests/main.nf.test.snap | 18 ++++ .../nf-core/checkm2/predict/tests/tags.yml | 3 + nextflow.config | 6 +- nextflow_schema.json | 26 +++--- subworkflows/local/checkm2_qc.nf | 25 ++++++ subworkflows/local/checkm_qc.nf | 44 ---------- subworkflows/local/gtdbtk.nf | 8 +- workflows/mag.nf | 47 +++++----- 32 files changed, 463 insertions(+), 504 deletions(-) delete mode 100644 modules/nf-core/aria2/aria2.diff delete mode 100644 modules/nf-core/aria2/main.nf delete mode 100644 modules/nf-core/aria2/meta.yml delete 
mode 100644 modules/nf-core/checkm/lineagewf/main.nf delete mode 100644 modules/nf-core/checkm/lineagewf/meta.yml delete mode 100644 modules/nf-core/checkm/qa/main.nf delete mode 100644 modules/nf-core/checkm/qa/meta.yml create mode 100644 modules/nf-core/checkm2/databasedownload/environment.yml create mode 100644 modules/nf-core/checkm2/databasedownload/main.nf create mode 100644 modules/nf-core/checkm2/databasedownload/meta.yml create mode 100644 modules/nf-core/checkm2/databasedownload/tests/main.nf.test create mode 100644 modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap create mode 100644 modules/nf-core/checkm2/databasedownload/tests/tags.yml create mode 100644 modules/nf-core/checkm2/predict/environment.yml create mode 100644 modules/nf-core/checkm2/predict/main.nf create mode 100644 modules/nf-core/checkm2/predict/meta.yml create mode 100644 modules/nf-core/checkm2/predict/tests/main.nf.test create mode 100644 modules/nf-core/checkm2/predict/tests/main.nf.test.snap create mode 100644 modules/nf-core/checkm2/predict/tests/tags.yml create mode 100644 subworkflows/local/checkm2_qc.nf delete mode 100644 subworkflows/local/checkm_qc.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d2fa6e12..98fb0acd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,8 +131,8 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},docker --outdir ./results - checkm: - name: Run single test to checkm due to database download + checkm2: + name: Run single test to checkm2 due to database download # Only run on push if this is the nf-core dev branch (merged PRs) if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/mag') }} runs-on: ubuntu-latest @@ -154,12 +154,16 @@ jobs: - name: Clean up Disk space uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - name: Download and prepare CheckM database + - name: Download 
and prepare CheckM2 database run: | - mkdir -p databases/checkm - wget https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz -P databases/checkm - tar xzvf databases/checkm/checkm_data_2015_01_16.tar.gz -C databases/checkm/ + mkdir -p databases/checkm2 + wget https://zenodo.org/records/5571251/files/checkm2_database.tar.gz -P databases/checkm2 + tar xzvf databases/checkm2/checkm2_database.tar.gz -C databases/checkm2/ - name: Run pipeline with ${{ matrix.profile }} test profile run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --binqc_tool checkm --checkm_db databases/checkm + nextflow run ${GITHUB_WORKSPACE} \ + -profile test,docker \ + --outdir ./results \ + --binqc_tool checkm2 \ + --checkm2_db databases/checkm2/CheckM2_database/uniref100.KO.1.dmnd diff --git a/bin/combine_tables.py b/bin/combine_tables.py index a2dcf986..769b24d4 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -3,10 +3,9 @@ ## Originally written by Daniel Straub and Sabrina Krakau and released under the MIT license. ## See git repository (https://github.com/nf-core/mag) for full license text. - -import sys import argparse -import os.path +import sys + import pandas as pd @@ -19,18 +18,10 @@ def parse_args(args=None): metavar="FILE", help="Bin depths summary file.", ) - parser.add_argument( - "-b", "--busco_summary", metavar="FILE", help="BUSCO summary file." - ) - parser.add_argument( - "-c", "--checkm_summary", metavar="FILE", help="CheckM summary file." - ) - parser.add_argument( - "-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file." - ) - parser.add_argument( - "-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file." 
- ) + parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.") + parser.add_argument("-c", "--checkm2_summary", metavar="FILE", help="CheckM2 summary file.") + parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") + parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") parser.add_argument( "-o", @@ -81,9 +72,7 @@ def parse_cat_table(cat_table): ) # merge all rank columns into a single column df["CAT_rank"] = ( - df.filter(regex="rank_\d+") - .apply(lambda x: ";".join(x.dropna()), axis=1) - .str.lstrip() + df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip() ) # remove rank_* columns df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True) @@ -96,73 +85,53 @@ def main(args=None): if ( not args.busco_summary - and not args.checkm_summary + and not args.checkm2_summary and not args.quast_summary and not args.gtdbtk_summary ): - sys.exit( - "No summary specified! Please specify at least BUSCO, CheckM or QUAST summary." - ) + sys.exit("No summary specified! Please specify at least BUSCO, CheckM2 or QUAST summary.") - # GTDB-Tk can only be run in combination with BUSCO or CheckM - if args.gtdbtk_summary and not (args.busco_summary or args.checkm_summary): + # GTDB-Tk can only be run in combination with BUSCO or CheckM2 + if args.gtdbtk_summary and not (args.busco_summary or args.checkm2_summary): sys.exit( - "Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM summary!" + "Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM2 summary!" 
) # handle bin depths results = pd.read_csv(args.depths_summary, sep="\t") - results.columns = [ - "Depth " + str(col) if col != "bin" else col for col in results.columns - ] + results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns] bins = results["bin"].sort_values().reset_index(drop=True) if args.busco_summary: busco_results = pd.read_csv(args.busco_summary, sep="\t") - if not bins.equals( - busco_results["GenomeBin"].sort_values().reset_index(drop=True) - ): + if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)): sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!") results = pd.merge( results, busco_results, left_on="bin", right_on="GenomeBin", how="outer" ) # assuming depths for all bins are given - if args.checkm_summary: + if args.checkm2_summary: use_columns = [ - "Bin Id", - "Marker lineage", - "# genomes", - "# markers", - "# marker sets", + "Name", "Completeness", "Contamination", - "Strain heterogeneity", - "Coding density", - "Translation table", - "# predicted genes", - "0", - "1", - "2", - "3", - "4", - "5+", + "Completeness_Model_Used", + "Coding_Density", + "Translation_Table_Used", + "Total_Coding_Sequences", ] - checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t") - checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" - if not bins.equals( - checkm_results["Bin Id"].sort_values().reset_index(drop=True) - ): - sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") + checkm2_results = pd.read_csv(args.checkm2_summary, usecols=use_columns, sep="\t") + checkm2_results["Name"] = checkm2_results["Name"] + ".fa" + if not set(checkm2_results["Name"]).issubset(set(bins)): + sys.exit("Bins in CheckM2 summary do not match bins in bin depths summary!") results = pd.merge( - results, checkm_results, left_on="bin", right_on="Bin Id", how="outer" + results, checkm2_results, left_on="bin", right_on="Name", how="outer" ) 
# assuming depths for all bins are given - results["Bin Id"] = results["Bin Id"].str.removesuffix(".fa") + results["Name"] = results["Name"].str.removesuffix(".fa") if args.quast_summary: quast_results = pd.read_csv(args.quast_summary, sep="\t") - if not bins.equals( - quast_results["Assembly"].sort_values().reset_index(drop=True) - ): + if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)): sys.exit("Bins in QUAST summary do not match bins in bin depths summary!") results = pd.merge( results, quast_results, left_on="bin", right_on="Assembly", how="outer" diff --git a/conf/base.config b/conf/base.config index 21a8ac3e..9be49eaa 100644 --- a/conf/base.config +++ b/conf/base.config @@ -160,12 +160,14 @@ process { cpus = { 8 * task.attempt } memory = { 20.GB * task.attempt } } - withName: MAXBIN2 { errorStrategy = { task.exitStatus in [1, 255] ? 'ignore' : 'retry' } } - withName: DASTOOL_DASTOOL { errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : task.exitStatus == 1 ? 'ignore' : 'finish' } } + //CheckM2 returns exit code 1 when Diamond doesn't find any hits + withName: CHECKM2_PREDICT { + errorStrategy = { task.exitStatus in (130..145) ? 'retry' : task.exitStatus == 1 ? 'ignore' : 'finish' } + } } diff --git a/conf/modules.config b/conf/modules.config index 0fbea292..dca9ea21 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,29 +344,30 @@ process { publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } - withName: ARIA2_UNTAR { - publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM/checkm_downloads" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, enabled: params.save_checkm_data] - } - - withName: CHECKM_LINEAGEWF { - tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } - ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" } - publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + withName: CHECKM2_DATABASEDOWNLOAD { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM2/checkm2_downloads" }, + mode: params.publish_dir_mode, overwrite: false, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_checkm2_data + ] } - withName: CHECKM_QA { - ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" } - ext.args = "-o 2 --tab_table" + withName: CHECKM2_PREDICT { + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, + path: { "${params.outdir}/GenomeBinning/QC/CheckM2" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: COMBINE_CHECKM_TSV { + withName: COMBINE_CHECKM2_TSV { ext.prefix = { "checkm_summary" } - publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] } withName: GUNC_DOWNLOADDB { diff --git a/modules.json b/modules.json index 3eea27cd..ae36414f 100644 --- a/modules.json +++ b/modules.json @@ -10,12 +10,6 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "aria2": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"], - "patch": "modules/nf-core/aria2/aria2.diff" - }, "bbmap/bbnorm": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", @@ -52,14 +46,14 @@ "installed_by": ["modules"], "patch": "modules/nf-core/centrifuge/kreport/centrifuge-kreport.diff" }, - "checkm/lineagewf": { + "checkm2/databasedownload": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", "installed_by": ["modules"] }, - "checkm/qa": { + "checkm2/predict": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", "installed_by": ["modules"] }, "concoct/concoct": { diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf index b387174c..31224442 100644 --- a/modules/local/bin_summary.nf +++ b/modules/local/bin_summary.nf @@ -8,7 +8,7 @@ process BIN_SUMMARY { input: path(bin_depths) path(busco_sum) - path(checkm_sum) + path(checkm2_sum) path(quast_sum) path(gtdbtk_sum) path(cat_sum) @@ -19,14 +19,14 @@ process BIN_SUMMARY { script: def busco_summary = busco_sum.sort().size() > 0 ? "--busco_summary ${busco_sum}" : "" - def checkm_summary = checkm_sum.sort().size() > 0 ? "--checkm_summary ${checkm_sum}" : "" + def checkm2_summary = checkm2_sum.sort().size() > 0 ? "--checkm2_summary ${checkm2_sum}" : "" def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : "" def cat_summary = cat_sum.sort().size() > 0 ? 
"--cat_summary ${cat_sum}" : "" """ combine_tables.py --depths_summary ${bin_depths} \ $busco_summary \ - $checkm_summary \ + $checkm2_summary \ $quast_summary \ $gtdbtk_summary \ $cat_summary \ diff --git a/modules/local/combine_tsv.nf b/modules/local/combine_tsv.nf index 5e62be27..1fe7ec1a 100644 --- a/modules/local/combine_tsv.nf +++ b/modules/local/combine_tsv.nf @@ -7,7 +7,7 @@ process COMBINE_TSV { 'biocontainers/bioawk:1.0--hed695b0_5' }" input: - path(bin_summaries) + path(bin_summaries, stageAs: "bin_summaries/*.tsv") output: path("*.tsv") , emit: combined diff --git a/modules/nf-core/aria2/aria2.diff b/modules/nf-core/aria2/aria2.diff deleted file mode 100644 index 789fdb44..00000000 --- a/modules/nf-core/aria2/aria2.diff +++ /dev/null @@ -1,24 +0,0 @@ -Changes in module 'nf-core/aria2' ---- modules/nf-core/aria2/main.nf -+++ modules/nf-core/aria2/main.nf -@@ -12,7 +12,7 @@ - val source_url - - output: -- path ("$downloaded_file"), emit: downloaded_file -+ path ("checkm_data_2015_01_16/"), emit: downloaded_file - path "versions.yml" , emit: versions - - when: -@@ -30,6 +30,9 @@ - $args \\ - $source_url - -+ mkdir checkm_data_2015_01_16/ -+ tar x -C checkm_data_2015_01_16 -v -z -f *.tar.gz -+ - cat <<-END_VERSIONS > versions.yml - "${task.process}": - aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') - -************************************************************ diff --git a/modules/nf-core/aria2/main.nf b/modules/nf-core/aria2/main.nf deleted file mode 100644 index b6091dad..00000000 --- a/modules/nf-core/aria2/main.nf +++ /dev/null @@ -1,41 +0,0 @@ - -process ARIA2 { - tag "$source_url" - label 'process_single' - - conda "conda-forge::aria2=1.36.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : - 'biocontainers/aria2:1.36.0' }" - - input: - val source_url - - output: - path ("checkm_data_2015_01_16/"), emit: downloaded_file - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - downloaded_file = source_url.split("/")[-1] - - """ - set -e - - aria2c \\ - --check-certificate=false \\ - $args \\ - $source_url - - mkdir checkm_data_2015_01_16/ - tar x -C checkm_data_2015_01_16 -v -z -f *.tar.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') - END_VERSIONS - """ -} diff --git a/modules/nf-core/aria2/meta.yml b/modules/nf-core/aria2/meta.yml deleted file mode 100644 index 64c2a524..00000000 --- a/modules/nf-core/aria2/meta.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: "aria2" -description: CLI Download utility -keywords: - - download -tools: - - "aria2": - description: "aria2 is a lightweight multi-protocol & multi-source, cross platform download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, BitTorrent and Metalink." 
- - tool_dev_url: "https://github.com/aria2/aria2/" - - licence: "['GPL v2']" - -input: - - source_url: - type: url - description: Source URL to be downloaded - pattern: "{http,https}*" - -output: - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - downloaded_file: - type: file - description: Downloaded files from source - pattern: "*.*" - -authors: - - "@JoseEspinosa" diff --git a/modules/nf-core/checkm/lineagewf/main.nf b/modules/nf-core/checkm/lineagewf/main.nf deleted file mode 100644 index d8674ddc..00000000 --- a/modules/nf-core/checkm/lineagewf/main.nf +++ /dev/null @@ -1,47 +0,0 @@ -process CHECKM_LINEAGEWF { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::checkm-genome=1.2.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" - - input: - tuple val(meta), path(fasta, stageAs: "input_bins/*") - val fasta_ext - path db - - output: - tuple val(meta), path("${prefix}") , emit: checkm_output - tuple val(meta), path("${prefix}/lineage.ms"), emit: marker_file - tuple val(meta), path("${prefix}.tsv") , emit: checkm_tsv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - checkm_db = db ? 
"export CHECKM_DATA_PATH=${db}" : "" - """ - $checkm_db - - checkm \\ - lineage_wf \\ - -t $task.cpus \\ - -f ${prefix}.tsv \\ - --tab_table \\ - --pplacer_threads $task.cpus \\ - -x $fasta_ext \\ - $args \\ - input_bins/ \\ - $prefix - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/checkm/lineagewf/meta.yml b/modules/nf-core/checkm/lineagewf/meta.yml deleted file mode 100644 index 4716a3e9..00000000 --- a/modules/nf-core/checkm/lineagewf/meta.yml +++ /dev/null @@ -1,65 +0,0 @@ -name: checkm_lineagewf -description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. -keywords: - - checkm - - mag - - metagenome - - quality - - isolates - - microbes - - single cells - - completeness - - contamination - - bins - - genome bins -tools: - - checkm: - description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. - homepage: https://ecogenomics.github.io/CheckM/ - documentation: https://github.com/Ecogenomics/CheckM/wiki - tool_dev_url: https://github.com/Ecogenomics/CheckM - doi: "10.1101/gr.186072.114" - licence: ["GPL v3"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fasta: - type: file - description: One or a list of multiple FASTA files of each bin, with extension defined with the fasta_ext value - pattern: "*.{$fasta_ext}" - - fasta_ext: - type: value - description: The file-type extension suffix of the input FASTA files (e.g., fasta, fna, fa, fas) - - db: - type: directory - description: Optional directory pointing to checkM database to prevent re-downloading - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'sample', bin:'1' ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - checkm_output: - type: directory - description: CheckM output directory - pattern: "*/" - - checkm_output: - type: file - description: Lineage markfer file - pattern: "lineage.ms" - - checkm_tsv: - type: file - description: CheckM summary completeness statistics table - pattern: "*.tsv" - -authors: - - "@jfy133" diff --git a/modules/nf-core/checkm/qa/main.nf b/modules/nf-core/checkm/qa/main.nf deleted file mode 100644 index b0c0e69a..00000000 --- a/modules/nf-core/checkm/qa/main.nf +++ /dev/null @@ -1,44 +0,0 @@ -process CHECKM_QA { - tag "$meta.id" - label 'process_low' - - conda "bioconda::checkm-genome=1.2.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" - - input: - tuple val(meta), path(analysis_dir), path(marker_file), path(coverage_file) - path exclude_marker_file - - output: - tuple val(meta), path("${prefix}.txt") , optional: true, emit: output - tuple val(meta), path("${prefix}.fasta"), optional: true, emit: fasta - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - suffix = task.ext.args?.matches(".*-o 9.*|.*--out_file 9.*") ? "fasta" : "txt" - def coverage = coverage_file ? "--coverage_file ${coverage_file}" : "" - def exclude = exclude_marker_file ? 
"--exclude_markers ${marker_filer}" : "" - """ - checkm \\ - qa \\ - --threads ${task.cpus} \\ - --file ${prefix}.${suffix} \\ - $marker_file \\ - $analysis_dir \\ - $coverage \\ - $exclude \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/checkm/qa/meta.yml b/modules/nf-core/checkm/qa/meta.yml deleted file mode 100644 index d0af39af..00000000 --- a/modules/nf-core/checkm/qa/meta.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: checkm_qa -description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. -keywords: - - checkm - - mag - - metagenome - - quality - - isolates - - microbes - - single cells - - completeness - - contamination - - bins - - genome bins - - qa - - quality assurnce -tools: - - checkm: - description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. - homepage: https://ecogenomics.github.io/CheckM/ - documentation: https://github.com/Ecogenomics/CheckM/wiki - tool_dev_url: https://github.com/Ecogenomics/CheckM - doi: "10.1101/gr.186072.114" - licence: ["GPL v3"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - analysis_dir: - type: file - description: Directory containing output of checkm/analyze or checkm/lineage_wf etc. - pattern: "*" - - marker_file: - type: file - description: Marker file specified during checkm/analyze or produced by checkm/{lineage,taxonomy}_wf - pattern: "*.ms" - - coverage_file: - type: file - description: File containing coverage of each sequence (generated by checkm coverage) - - exclude_marker_file: - type: file - description: File specifying markers to exclude from marker sets - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - output: - type: file - description: "Default completeness statistics in various formats, as specified with --out_format (excluding option: 9)" - pattern: "*.txt" - - fasta: - type: file - description: Output in fasta format (only if --out_format 9) - pattern: "*.fasta" - -authors: - - "@jfy133" diff --git a/modules/nf-core/checkm2/databasedownload/environment.yml b/modules/nf-core/checkm2/databasedownload/environment.yml new file mode 100644 index 00000000..52d11ba9 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::aria2=1.36.0 diff --git a/modules/nf-core/checkm2/databasedownload/main.nf b/modules/nf-core/checkm2/databasedownload/main.nf new file mode 100644 index 00000000..6144067b --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/main.nf @@ -0,0 +1,55 @@ +import groovy.json.JsonSlurper + +process CHECKM2_DATABASEDOWNLOAD { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/aria2:1.36.0': + 'biocontainers/aria2:1.36.0' }" + + input: + val(db_zenodo_id) + + output: + tuple val(meta), path("checkm2_db_v${db_version}.dmnd"), emit: database + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + zenodo_id = db_zenodo_id ?: 5571251 // Default to latest version if no ID provided + api_data = (new JsonSlurper()).parseText(file("https://zenodo.org/api/records/${zenodo_id}").text) + db_version = api_data.metadata.version + checksum = api_data.files[0].checksum.replaceFirst(/^md5:/, "md5=") + meta = [id: 'checkm2_db', version: db_version] + """ + # Automatic download is broken when using singularity/apptainer (https://github.com/chklovski/CheckM2/issues/73) + # So it's necessary to download the database manually + aria2c \ + ${args} \ + --checksum ${checksum} \ + https://zenodo.org/records/${zenodo_id}/files/checkm2_database.tar.gz + + tar -xzf checkm2_database.tar.gz + db_path=\$(find -name *.dmnd) + mv \$db_path checkm2_db_v${db_version}.dmnd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + END_VERSIONS + """ + + stub: + """ + touch checkm_db.dmnd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm2/databasedownload/meta.yml b/modules/nf-core/checkm2/databasedownload/meta.yml new file mode 100644 index 00000000..632b4922 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/meta.yml @@ -0,0 +1,42 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkm2_databasedownload" +description: CheckM2 database download +keywords: + - checkm + - mag + - metagenome + - quality + - completeness + - contamination + - bins +tools: + - "checkm2": + 
description: "CheckM2 - Rapid assessment of genome bin quality using machine learning" + homepage: "https://github.com/chklovski/CheckM2" + doi: "10.1038/s41592-023-01940-w" + licence: ["GPL v3"] + identifier: "" + +input: + - - db_zenodo_id: + type: integer + description: Zenodo ID of the CheckM2 database to download + +output: + - database: + - meta: + type: map + description: | + Groovy Map containing database information + e.g. `[ id:'test', version:1 ]` + - checkm2_db_v${db_version}.dmnd: + type: file + description: CheckM2 database file + pattern: "checkm2_db_v*.dmnd" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@dialvarezs" diff --git a/modules/nf-core/checkm2/databasedownload/tests/main.nf.test b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test new file mode 100644 index 00000000..2a98f051 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test @@ -0,0 +1,30 @@ +nextflow_process { + + name "Test Process CHECKM2_DATABASEDOWNLOAD" + tag "modules_nfcore" + tag "modules" + tag "checkm2" + tag "checkm2/databasedownload" + script "modules/nf-core/checkm2/databasedownload/main.nf" + process "CHECKM2_DATABASEDOWNLOAD" + + test("Test CheckM2 Database Download") { + + when { + process { + """ + input[0] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + ) + } + + } + +} diff --git a/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap new file mode 100644 index 00000000..403d26fd --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "Test CheckM2 Database Download": { + "content": [ + [ + "versions.yml:md5,6201d5ac7aca6e32b98daf4f8656aa2a" + ] + ], + "timestamp": "2024-09-16T22:23:54.183040031" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/checkm2/databasedownload/tests/tags.yml b/modules/nf-core/checkm2/databasedownload/tests/tags.yml new file mode 100644 index 00000000..46266770 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/tags.yml @@ -0,0 +1,2 @@ +checkm2/databasedownload: + - modules/nf-core/checkm2/databasedownload/** diff --git a/modules/nf-core/checkm2/predict/environment.yml b/modules/nf-core/checkm2/predict/environment.yml new file mode 100644 index 00000000..18fd1f51 --- /dev/null +++ b/modules/nf-core/checkm2/predict/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm2=1.0.2 diff --git a/modules/nf-core/checkm2/predict/main.nf b/modules/nf-core/checkm2/predict/main.nf new file mode 100644 index 00000000..25271ba9 --- /dev/null +++ b/modules/nf-core/checkm2/predict/main.nf @@ -0,0 +1,52 @@ +process CHECKM2_PREDICT { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/checkm2:1.0.2--pyh7cba7a3_0': + 'biocontainers/checkm2:1.0.2--pyh7cba7a3_0' }" + + input: + tuple val(meta), path(fasta, stageAs: "input_bins/*") + tuple val(dbmeta), path(db) + + output: + tuple val(meta), path("${prefix}") , emit: checkm2_output + tuple val(meta), path("${prefix}/quality_report.tsv"), emit: checkm2_tsv + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + checkm2 \\ + predict \\ + --input ${fasta} \\ + --output-directory ${prefix} \\ + --threads ${task.cpus} \\ + --database_path ${db} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix}/diamond_output ${prefix}/protein_files + touch ${prefix}/quality_report.tsv ${prefix}/checkm2.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm2/predict/meta.yml b/modules/nf-core/checkm2/predict/meta.yml new file mode 100644 index 00000000..48cc9fbc --- /dev/null +++ b/modules/nf-core/checkm2/predict/meta.yml @@ -0,0 +1,65 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkm2_predict" +description: CheckM2 bin quality prediction +keywords: + - checkm + - mag + - metagenome + - quality + - completeness + - contamination + - bins +tools: + - "checkm2": + description: "CheckM2 - Rapid assessment of genome bin quality using machine learning" + homepage: "https://github.com/chklovski/CheckM2" + doi: "10.1038/s41592-023-01940-w" + licence: ["GPL v3"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test' ]` + - fasta: + type: file + description: One or multiple FASTA files of each bin + pattern: "*.{fasta,fna,fa}" + - - dbmeta: + type: map + description: | + Groovy Map containing database information + e.g. `[ id:'test', version:1 ]` + - db: + type: file + description: CheckM2 database +output: + - checkm2_output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - ${prefix}: + type: directory + description: CheckM2 output directory + pattern: "${prefix}/" + - checkm2_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - ${prefix}/quality_report.tsv: + type: file + description: CheckM2 summary completeness statistics table + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@dialvarezs" diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test b/modules/nf-core/checkm2/predict/tests/main.nf.test new file mode 100644 index 00000000..e825f74c --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test @@ -0,0 +1,46 @@ +nextflow_process { + + name "Test Process CHECKM2_PREDICT" + tag "modules_nfcore" + tag "modules" + tag "checkm2" + tag "checkm2/predict" + tag "checkm2/databasedownload" + script "modules/nf-core/checkm2/predict/main.nf" + process "CHECKM2_PREDICT" + + test("Test CheckM2 Predict") { + + setup { + run("CHECKM2_DATABASEDOWNLOAD") { + script "../../databasedownload/main.nf" + process { + """ + input[0] = [] + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = [ [id: 'test'], [file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)] ] + input[1] = CHECKM2_DATABASEDOWNLOAD.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.checkm2_tsv, 
process.out.versions).match() } + ) + } + + } + +} diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test.snap b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap new file mode 100644 index 00000000..6fd2e918 --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap @@ -0,0 +1,18 @@ +{ + "Test CheckM2 Predict": { + "content": [ + [ + [ + { + "id": "test" + }, + "quality_report.tsv:md5,7f05ff49d18697304575d1106a871501" + ] + ], + [ + "versions.yml:md5,088ec2d8a46efd530c11019328064bff" + ] + ], + "timestamp": "2024-09-16T22:43:50.787486798" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm2/predict/tests/tags.yml b/modules/nf-core/checkm2/predict/tests/tags.yml new file mode 100644 index 00000000..c31d112a --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/tags.yml @@ -0,0 +1,3 @@ +checkm2/predict: + - modules/nf-core/checkm2/predict/** + - modules/nf-core/checkm2/databasedownload/** diff --git a/nextflow.config b/nextflow.config index 026f67d8..6a5a4d2c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -129,9 +129,9 @@ params { busco_auto_lineage_prok = false save_busco_db = false busco_clean = false - checkm_download_url = "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz" - checkm_db = null - save_checkm_data = false + checkm2_db = null + checkm2_db_version = 5571251 + save_checkm2_data = false run_gunc = false gunc_database_type = 'progenomes' gunc_db = null diff --git a/nextflow_schema.json b/nextflow_schema.json index ceb3ac08..5b5891f8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -715,13 +715,13 @@ "properties": { "skip_binqc": { "type": "boolean", - "description": "Disable bin QC with BUSCO or CheckM." + "description": "Disable bin QC with BUSCO or CheckM2." 
}, "binqc_tool": { "type": "string", "default": "busco", "description": "Specify which tool for bin quality-control validation to use.", - "enum": ["busco", "checkm"] + "enum": ["busco", "checkm2"] }, "busco_db": { "type": "string", @@ -742,22 +742,20 @@ "description": "Enable clean-up of temporary files created during BUSCO runs.", "help_text": "By default, BUSCO creates a large number of intermediate files every run. This may cause problems on some clusters which have file number limits in plate, particularly with large numbers of bins. Enabling this option cleans these files, reducing the total file count of the work directory." }, - "checkm_download_url": { + "checkm2_db": { "type": "string", - "default": "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz", - "hidden": true, - "description": "URL pointing to checkM database for auto download, if local path not supplied.", - "help_text": "You can use this parameter to point to an online copy of the checkM database TAR archive that the pipeline will use for auto download if a local path is not supplied to `--checkm_db`." + "description": "Path to local folder containing already downloaded and uncompressed CheckM2 database (.dmnd file).", + "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`." }, - "checkm_db": { - "type": "string", - "description": "Path to local folder containing already downloaded and uncompressed CheckM database.", - "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. 
You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm_db`." + "checkm2_db_version": { + "type": "integer", + "default": 5571251, + "description": "CheckM2 database version number to download (Zenodo ID, for reference check https://zenodo.org/records/5571251)." }, - "save_checkm_data": { + "save_checkm2_data": { "type": "boolean", - "description": "Save the used CheckM reference files downloaded when not using --checkm_db parameter.", - "help_text": "If specified, the directories and files decompressed from the `tar.gz` file downloaded from the [CheckM FTP server](https://data.ace.uq.edu.au/public/CheckM_databases/) will be stored in your output directory alongside your CheckM results." + "description": "Save the used CheckM2 reference files downloaded when not using --checkm2_db parameter.", + "help_text": "If specified, the directories and files decompressed from the `tar.gz` file downloaded from the [Zenodo repository](https://zenodo.org/records/5571251) will be stored in your output directory alongside your CheckM2 results." 
}, "refine_bins_dastool": { "type": "boolean", diff --git a/subworkflows/local/checkm2_qc.nf b/subworkflows/local/checkm2_qc.nf new file mode 100644 index 00000000..39264d70 --- /dev/null +++ b/subworkflows/local/checkm2_qc.nf @@ -0,0 +1,25 @@ +/* + * CheckM2: Assessing the quality of metagenome-derived genome bins using machine learning + */ + +include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' +include { COMBINE_TSV as COMBINE_CHECKM2_TSV } from '../../modules/local/combine_tsv' + + +workflow CHECKM2_QC { + take: + bins // channel: [ val(meta), path(bin) ] + checkm2_db + + main: + ch_versions = Channel.empty() + + CHECKM2_PREDICT ( bins, checkm2_db ) + ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) + + COMBINE_CHECKM2_TSV ( CHECKM2_PREDICT.out.checkm2_tsv.map{it[1]}.collect() ) + + emit: + summary = COMBINE_CHECKM2_TSV.out.combined + versions = ch_versions +} \ No newline at end of file diff --git a/subworkflows/local/checkm_qc.nf b/subworkflows/local/checkm_qc.nf deleted file mode 100644 index 70ed9708..00000000 --- a/subworkflows/local/checkm_qc.nf +++ /dev/null @@ -1,44 +0,0 @@ -/* - * CheckM: Quantitative measures for the assessment of genome assembly - */ - -include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' -include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' -include { COMBINE_TSV as COMBINE_CHECKM_TSV } from '../../modules/local/combine_tsv' - -workflow CHECKM_QC { - take: - bins // channel: [ val(meta), path(bin) ] - checkm_db - - main: - ch_versions = Channel.empty() - - ch_input_checkmdb = checkm_db ? 
checkm_db : [] - ch_bins_for_checkmlineagewf = bins - .multiMap { - meta, fa -> - reads: [ meta, fa ] - ext: fa.extension.unique().join("") // we set this in the pipeline to always `.fa` so this should be fine - } - - CHECKM_LINEAGEWF ( ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db ) - ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) - - ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output - .join(CHECKM_LINEAGEWF.out.marker_file) - .map{ - meta, dir, marker -> - [ meta, dir, marker, []] - } - - CHECKM_QA ( ch_checkmqa_input, [] ) - ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) - - COMBINE_CHECKM_TSV ( CHECKM_QA.out.output.map{it[1]}.collect() ) - - emit: - summary = COMBINE_CHECKM_TSV.out.combined - checkm_tsv = CHECKM_QA.out.output - versions = ch_versions -} diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 6da5680d..0bc254e9 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -10,7 +10,7 @@ workflow GTDBTK { take: bins // channel: [ val(meta), [bins] ] busco_summary // channel: path - checkm_summary // channel: path + checkm2_summary // channel: path gtdb // channel: path gtdb_mash // channel: path @@ -38,13 +38,13 @@ workflow GTDBTK { [row.'GenomeBin', completeness, contamination] } } else { - // Collect completeness and contamination metrics from checkm summary - ch_bin_metrics = checkm_summary + // Collect completeness and contamination metrics from CheckM2 summary + ch_bin_metrics = checkm2_summary .splitCsv(header: true, sep: '\t') .map { row -> def completeness = Double.parseDouble(row.'Completeness') def contamination = Double.parseDouble(row.'Contamination') - [row.'Bin Id' + ".fa", completeness, contamination] + [row.'Name' + ".fa", completeness, contamination] } } diff --git a/workflows/mag.nf b/workflows/mag.nf index 7afb4316..6b334f73 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -17,7 +17,7 @@ include { BINNING } 
from '../subwo include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' include { BUSCO_QC } from '../subworkflows/local/busco_qc' include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' -include { CHECKM_QC } from '../subworkflows/local/checkm_qc' +include { CHECKM2_QC } from '../subworkflows/local/checkm2_qc' include { GUNC_QC } from '../subworkflows/local/gunc_qc' include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' @@ -28,7 +28,6 @@ include { LONGREAD_PREPROCESSING } from '../subwo // // MODULE: Installed directly from nf-core/modules // -include { ARIA2 as ARIA2_UNTAR } from '../modules/nf-core/aria2/main' include { FASTQC as FASTQC_RAW } from '../modules/nf-core/fastqc/main' include { FASTQC as FASTQC_TRIMMED } from '../modules/nf-core/fastqc/main' include { SEQTK_MERGEPE } from '../modules/nf-core/seqtk/mergepe/main' @@ -52,6 +51,7 @@ include { PRODIGAL } from '../modul include { PROKKA } from '../modules/nf-core/prokka/main' include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' include { METAEUK_EASYPREDICT } from '../modules/nf-core/metaeuk/easypredict/main' +include { CHECKM2_DATABASEDOWNLOAD } from '../modules/nf-core/checkm2/databasedownload/main' // // MODULE: Local to the pipeline @@ -110,8 +110,8 @@ workflow MAG { ch_busco_db = [] } - if (params.checkm_db) { - ch_checkm_db = file(params.checkm_db, checkIfExists: true) + if(params.checkm2_db) { + ch_checkm2_db = [[:], file(params.checkm2_db, checkIfExists: true)] } if (params.gunc_db) { @@ -177,11 +177,10 @@ workflow MAG { // Additional info for completion email and summary def busco_failed_bins = [:] - // Get checkM database if not supplied - - if (!params.skip_binqc && params.binqc_tool == 'checkm' && !params.checkm_db) { - ARIA2_UNTAR(params.checkm_download_url) - ch_checkm_db = ARIA2_UNTAR.out.downloaded_file + // Get CheckM2 database if not 
supplied + if ( !params.skip_binqc && params.binqc_tool == 'checkm2' && !params.checkm2_db ) { + CHECKM2_DATABASEDOWNLOAD (params.checkm2_db_version) + ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database } // Get mmseqs db for MetaEuk if requested @@ -638,7 +637,7 @@ workflow MAG { */ ch_busco_summary = Channel.empty() - ch_checkm_summary = Channel.empty() + ch_checkm2_summary = Channel.empty() if (!params.skip_binning || params.ancient_dna) { BINNING_PREPARATION( @@ -774,7 +773,7 @@ workflow MAG { ch_versions = ch_versions.mix(DEPTHS.out.versions) /* - * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, and/or GUNC + * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM2, and/or GUNC */ ch_input_bins_for_qc = ch_input_for_postbinning_bins_unbins.transpose() @@ -798,29 +797,25 @@ workflow MAG { } } - if (!params.skip_binqc && params.binqc_tool == 'checkm') { + if (!params.skip_binqc && params.binqc_tool == 'checkm2') { /* - * CheckM subworkflow: Quantitative measures for the assessment of genome assembly + * CheckM2 subworkflow: Quantitative measures for the assessment of genome assembly */ - ch_input_bins_for_checkm = ch_input_bins_for_qc.filter { meta, bins -> + ch_input_bins_for_checkm2 = ch_input_bins_for_qc.filter { meta, bins -> meta.domain != "eukarya" } - CHECKM_QC( - ch_input_bins_for_checkm.groupTuple(), - ch_checkm_db + CHECKM2_QC ( + ch_input_bins_for_checkm2.groupTuple(), + ch_checkm2_db ) - ch_checkm_summary = CHECKM_QC.out.summary + ch_checkm2_summary = CHECKM2_QC.out.summary - ch_versions = ch_versions.mix(CHECKM_QC.out.versions) + ch_versions = ch_versions.mix(CHECKM2_QC.out.versions) } - if (params.run_gunc && params.binqc_tool == 'checkm') { - GUNC_QC(ch_input_bins_for_checkm, ch_gunc_db, CHECKM_QC.out.checkm_tsv) - ch_versions = ch_versions.mix(GUNC_QC.out.versions) - } - else if (params.run_gunc) { + if (params.run_gunc) { ch_input_bins_for_gunc = ch_input_for_postbinning_bins_unbins.filter 
{ meta, bins -> meta.domain != "eukarya" } @@ -897,7 +892,7 @@ workflow MAG { GTDBTK( ch_gtdb_bins, ch_busco_summary, - ch_checkm_summary, + ch_checkm2_summary, gtdb, gtdb_mash ) @@ -913,7 +908,7 @@ workflow MAG { BIN_SUMMARY( ch_input_for_binsummary, ch_busco_summary.ifEmpty([]), - ch_checkm_summary.ifEmpty([]), + ch_checkm2_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), ch_cat_global_summary.ifEmpty([]) From ca2f97b3b075a80d09ba9ded55af554e3eed5622 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Sun, 27 Oct 2024 19:05:04 -0300 Subject: [PATCH 02/33] fix: Bring back CheckM --- .github/workflows/ci.yml | 34 ++++++++++++ bin/combine_tables.py | 32 +++++++++++ conf/modules.config | 29 ++++++++++ modules.json | 16 ++++++ modules/local/bin_summary.nf | 3 + modules/nf-core/aria2/aria2.diff | 24 ++++++++ modules/nf-core/aria2/main.nf | 41 ++++++++++++++ modules/nf-core/aria2/meta.yml | 30 ++++++++++ modules/nf-core/checkm/lineagewf/main.nf | 47 ++++++++++++++++ modules/nf-core/checkm/lineagewf/meta.yml | 65 ++++++++++++++++++++++ modules/nf-core/checkm/qa/main.nf | 44 +++++++++++++++ modules/nf-core/checkm/qa/meta.yml | 67 +++++++++++++++++++++++ nextflow.config | 3 + nextflow_schema.json | 21 ++++++- subworkflows/local/checkm_qc.nf | 44 +++++++++++++++ subworkflows/local/gtdbtk.nf | 10 +++- workflows/mag.nf | 43 ++++++++++++++- 17 files changed, 545 insertions(+), 8 deletions(-) create mode 100644 modules/nf-core/aria2/aria2.diff create mode 100644 modules/nf-core/aria2/main.nf create mode 100644 modules/nf-core/aria2/meta.yml create mode 100644 modules/nf-core/checkm/lineagewf/main.nf create mode 100644 modules/nf-core/checkm/lineagewf/meta.yml create mode 100644 modules/nf-core/checkm/qa/main.nf create mode 100644 modules/nf-core/checkm/qa/meta.yml create mode 100644 subworkflows/local/checkm_qc.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 98fb0acd..6872104d 100644 --- 
a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,6 +131,40 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},docker --outdir ./results + checkm: + name: Run single test to checkm due to database download + # Only run on push if this is the nf-core dev branch (merged PRs) + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/mag') }} + runs-on: ubuntu-latest + + steps: + - name: Free some space + run: | + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + + - name: Check out pipeline code + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 + + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Clean up Disk space + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - name: Download and prepare CheckM database + run: | + mkdir -p databases/checkm + wget https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz -P databases/checkm + tar xzvf databases/checkm/checkm_data_2015_01_16.tar.gz -C databases/checkm/ + + - name: Run pipeline with ${{ matrix.profile }} test profile + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --binqc_tool checkm --checkm_db databases/checkm + + checkm2: name: Run single test to checkm2 due to database download # Only run on push if this is the nf-core dev branch (merged PRs) diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 769b24d4..6604748d 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -19,6 +19,7 @@ def parse_args(args=None): help="Bin depths summary file.", ) parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.") + parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.") parser.add_argument("-c", "--checkm2_summary", metavar="FILE", help="CheckM2 
summary file.") parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") @@ -110,6 +111,37 @@ def main(args=None): results, busco_results, left_on="bin", right_on="GenomeBin", how="outer" ) # assuming depths for all bins are given + if args.checkm_summary: + use_columns = [ + "Bin Id", + "Marker lineage", + "# genomes", + "# markers", + "# marker sets", + "Completeness", + "Contamination", + "Strain heterogeneity", + "Coding density", + "Translation table", + "# predicted genes", + "0", + "1", + "2", + "3", + "4", + "5+", + ] + checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t") + checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" + if not bins.equals( + checkm_results["Bin Id"].sort_values().reset_index(drop=True) + ): + sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") + results = pd.merge( + results, checkm_results, left_on="bin", right_on="Bin Id", how="outer" + ) # assuming depths for all bins are given + results["Bin Id"] = results["Bin Id"].str.removesuffix(".fa") + if args.checkm2_summary: use_columns = [ "Name", diff --git a/conf/modules.config b/conf/modules.config index dca9ea21..a12f6970 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,6 +344,35 @@ process { publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } + withName: CHECKM_LINEAGEWF { + tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" } + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: CHECKM_QA { + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" } + ext.args = "-o 2 --tab_table" + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: COMBINE_CHECKM_TSV { + ext.prefix = { "checkm_summary" } + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: CHECKM2_DATABASEDOWNLOAD { publishDir = [ path: { "${params.outdir}/GenomeBinning/QC/CheckM2/checkm2_downloads" }, diff --git a/modules.json b/modules.json index ae36414f..df496e68 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,12 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "aria2": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/aria2/aria2.diff" + }, "bbmap/bbnorm": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", @@ -46,6 +52,16 @@ "installed_by": ["modules"], "patch": "modules/nf-core/centrifuge/kreport/centrifuge-kreport.diff" }, + "checkm/lineagewf": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "checkm/qa": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "checkm2/databasedownload": { "branch": "master", "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf index 31224442..449f8962 100644 --- a/modules/local/bin_summary.nf +++ b/modules/local/bin_summary.nf @@ -8,6 +8,7 @@ process BIN_SUMMARY { input: path(bin_depths) path(busco_sum) + 
path(checkm_sum) path(checkm2_sum) path(quast_sum) path(gtdbtk_sum) @@ -19,6 +20,7 @@ process BIN_SUMMARY { script: def busco_summary = busco_sum.sort().size() > 0 ? "--busco_summary ${busco_sum}" : "" + def checkm_summary = checkm2_sum.sort().size() > 0 ? "--checkm2_summary ${checkm2_sum}" : "" def checkm2_summary = checkm2_sum.sort().size() > 0 ? "--checkm2_summary ${checkm2_sum}" : "" def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : "" @@ -26,6 +28,7 @@ process BIN_SUMMARY { """ combine_tables.py --depths_summary ${bin_depths} \ $busco_summary \ + $checkm_summary \ $checkm2_summary \ $quast_summary \ $gtdbtk_summary \ diff --git a/modules/nf-core/aria2/aria2.diff b/modules/nf-core/aria2/aria2.diff new file mode 100644 index 00000000..789fdb44 --- /dev/null +++ b/modules/nf-core/aria2/aria2.diff @@ -0,0 +1,24 @@ +Changes in module 'nf-core/aria2' +--- modules/nf-core/aria2/main.nf ++++ modules/nf-core/aria2/main.nf +@@ -12,7 +12,7 @@ + val source_url + + output: +- path ("$downloaded_file"), emit: downloaded_file ++ path ("checkm_data_2015_01_16/"), emit: downloaded_file + path "versions.yml" , emit: versions + + when: +@@ -30,6 +30,9 @@ + $args \\ + $source_url + ++ mkdir checkm_data_2015_01_16/ ++ tar x -C checkm_data_2015_01_16 -v -z -f *.tar.gz ++ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + +************************************************************ diff --git a/modules/nf-core/aria2/main.nf b/modules/nf-core/aria2/main.nf new file mode 100644 index 00000000..b6091dad --- /dev/null +++ b/modules/nf-core/aria2/main.nf @@ -0,0 +1,41 @@ + +process ARIA2 { + tag "$source_url" + label 'process_single' + + conda "conda-forge::aria2=1.36.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container 
? + 'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : + 'biocontainers/aria2:1.36.0' }" + + input: + val source_url + + output: + path ("checkm_data_2015_01_16/"), emit: downloaded_file + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + downloaded_file = source_url.split("/")[-1] + + """ + set -e + + aria2c \\ + --check-certificate=false \\ + $args \\ + $source_url + + mkdir checkm_data_2015_01_16/ + tar x -C checkm_data_2015_01_16 -v -z -f *.tar.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/aria2/meta.yml b/modules/nf-core/aria2/meta.yml new file mode 100644 index 00000000..64c2a524 --- /dev/null +++ b/modules/nf-core/aria2/meta.yml @@ -0,0 +1,30 @@ +name: "aria2" +description: CLI Download utility +keywords: + - download +tools: + - "aria2": + description: "aria2 is a lightweight multi-protocol & multi-source, cross platform download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, BitTorrent and Metalink." 
+ + tool_dev_url: "https://github.com/aria2/aria2/" + + licence: "['GPL v2']" + +input: + - source_url: + type: url + description: Source URL to be downloaded + pattern: "{http,https}*" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - downloaded_file: + type: file + description: Downloaded files from source + pattern: "*.*" + +authors: + - "@JoseEspinosa" diff --git a/modules/nf-core/checkm/lineagewf/main.nf b/modules/nf-core/checkm/lineagewf/main.nf new file mode 100644 index 00000000..d8674ddc --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/main.nf @@ -0,0 +1,47 @@ +process CHECKM_LINEAGEWF { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::checkm-genome=1.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : + 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta, stageAs: "input_bins/*") + val fasta_ext + path db + + output: + tuple val(meta), path("${prefix}") , emit: checkm_output + tuple val(meta), path("${prefix}/lineage.ms"), emit: marker_file + tuple val(meta), path("${prefix}.tsv") , emit: checkm_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + checkm_db = db ? 
"export CHECKM_DATA_PATH=${db}" : "" + """ + $checkm_db + + checkm \\ + lineage_wf \\ + -t $task.cpus \\ + -f ${prefix}.tsv \\ + --tab_table \\ + --pplacer_threads $task.cpus \\ + -x $fasta_ext \\ + $args \\ + input_bins/ \\ + $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm/lineagewf/meta.yml b/modules/nf-core/checkm/lineagewf/meta.yml new file mode 100644 index 00000000..4716a3e9 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/meta.yml @@ -0,0 +1,65 @@ +name: checkm_lineagewf +description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. +keywords: + - checkm + - mag + - metagenome + - quality + - isolates + - microbes + - single cells + - completeness + - contamination + - bins + - genome bins +tools: + - checkm: + description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. + homepage: https://ecogenomics.github.io/CheckM/ + documentation: https://github.com/Ecogenomics/CheckM/wiki + tool_dev_url: https://github.com/Ecogenomics/CheckM + doi: "10.1101/gr.186072.114" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: One or a list of multiple FASTA files of each bin, with extension defined with the fasta_ext value + pattern: "*.{$fasta_ext}" + - fasta_ext: + type: value + description: The file-type extension suffix of the input FASTA files (e.g., fasta, fna, fa, fas) + - db: + type: directory + description: Optional directory pointing to checkM database to prevent re-downloading + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'sample', bin:'1' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - checkm_output: + type: directory + description: CheckM output directory + pattern: "*/" + - checkm_output: + type: file + description: Lineage markfer file + pattern: "lineage.ms" + - checkm_tsv: + type: file + description: CheckM summary completeness statistics table + pattern: "*.tsv" + +authors: + - "@jfy133" diff --git a/modules/nf-core/checkm/qa/main.nf b/modules/nf-core/checkm/qa/main.nf new file mode 100644 index 00000000..b0c0e69a --- /dev/null +++ b/modules/nf-core/checkm/qa/main.nf @@ -0,0 +1,44 @@ +process CHECKM_QA { + tag "$meta.id" + label 'process_low' + + conda "bioconda::checkm-genome=1.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : + 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(analysis_dir), path(marker_file), path(coverage_file) + path exclude_marker_file + + output: + tuple val(meta), path("${prefix}.txt") , optional: true, emit: output + tuple val(meta), path("${prefix}.fasta"), optional: true, emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.args?.matches(".*-o 9.*|.*--out_file 9.*") ? "fasta" : "txt" + def coverage = coverage_file ? "--coverage_file ${coverage_file}" : "" + def exclude = exclude_marker_file ? 
"--exclude_markers ${marker_filer}" : "" + """ + checkm \\ + qa \\ + --threads ${task.cpus} \\ + --file ${prefix}.${suffix} \\ + $marker_file \\ + $analysis_dir \\ + $coverage \\ + $exclude \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm/qa/meta.yml b/modules/nf-core/checkm/qa/meta.yml new file mode 100644 index 00000000..d0af39af --- /dev/null +++ b/modules/nf-core/checkm/qa/meta.yml @@ -0,0 +1,67 @@ +name: checkm_qa +description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. +keywords: + - checkm + - mag + - metagenome + - quality + - isolates + - microbes + - single cells + - completeness + - contamination + - bins + - genome bins + - qa + - quality assurnce +tools: + - checkm: + description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. + homepage: https://ecogenomics.github.io/CheckM/ + documentation: https://github.com/Ecogenomics/CheckM/wiki + tool_dev_url: https://github.com/Ecogenomics/CheckM + doi: "10.1101/gr.186072.114" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - analysis_dir: + type: file + description: Directory containing output of checkm/analyze or checkm/lineage_wf etc. + pattern: "*" + - marker_file: + type: file + description: Marker file specified during checkm/analyze or produced by checkm/{lineage,taxonomy}_wf + pattern: "*.ms" + - coverage_file: + type: file + description: File containing coverage of each sequence (generated by checkm coverage) + - exclude_marker_file: + type: file + description: File specifying markers to exclude from marker sets + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: "Default completeness statistics in various formats, as specified with --out_format (excluding option: 9)" + pattern: "*.txt" + - fasta: + type: file + description: Output in fasta format (only if --out_format 9) + pattern: "*.fasta" + +authors: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 6a5a4d2c..9351235c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -129,6 +129,9 @@ params { busco_auto_lineage_prok = false save_busco_db = false busco_clean = false + checkm_download_url = "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz" + checkm_db = null + save_checkm_data = false checkm2_db = null checkm2_db_version = 5571251 save_checkm2_data = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 5b5891f8..41e6d9e6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -715,13 +715,13 @@ "properties": { "skip_binqc": { "type": "boolean", - "description": "Disable bin QC with BUSCO or CheckM2." + "description": "Disable bin QC with BUSCO, CheckM or CheckM2." }, "binqc_tool": { "type": "string", "default": "busco", "description": "Specify which tool for bin quality-control validation to use.", - "enum": ["busco", "checkm2"] + "enum": ["busco", "checkm", "checkm2"] }, "busco_db": { "type": "string", @@ -742,6 +742,23 @@ "description": "Enable clean-up of temporary files created during BUSCO runs.", "help_text": "By default, BUSCO creates a large number of intermediate files every run. This may cause problems on some clusters which have file number limits in plate, particularly with large numbers of bins. Enabling this option cleans these files, reducing the total file count of the work directory." 
}, + "checkm_download_url": { + "type": "string", + "default": "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz", + "hidden": true, + "description": "URL pointing to checkM database for auto download, if local path not supplied.", + "help_text": "You can use this parameter to point to an online copy of the checkM database TAR archive that the pipeline will use for auto download if a local path is not supplied to `--checkm_db`." + }, + "checkm_db": { + "type": "string", + "description": "Path to local folder containing already downloaded and uncompressed CheckM database.", + "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm_db`)." + }, + "save_checkm_data": { + "type": "boolean", + "description": "Save the used CheckM reference files downloaded when not using --checkm_db parameter.", + "help_text": "If specified, the directories and files decompressed from the `tar.gz` file downloaded from the [CheckM FTP server](https://data.ace.uq.edu.au/public/CheckM_databases/) will be stored in your output directory alongside your CheckM results." 
+ }, "checkm2_db": { "type": "string", "description": "Path to local folder containing already downloaded and uncompressed CheckM2 database (.dmnd file).", diff --git a/subworkflows/local/checkm_qc.nf b/subworkflows/local/checkm_qc.nf new file mode 100644 index 00000000..70ed9708 --- /dev/null +++ b/subworkflows/local/checkm_qc.nf @@ -0,0 +1,44 @@ +/* + * CheckM: Quantitative measures for the assessment of genome assembly + */ + +include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' +include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' +include { COMBINE_TSV as COMBINE_CHECKM_TSV } from '../../modules/local/combine_tsv' + +workflow CHECKM_QC { + take: + bins // channel: [ val(meta), path(bin) ] + checkm_db + + main: + ch_versions = Channel.empty() + + ch_input_checkmdb = checkm_db ? checkm_db : [] + ch_bins_for_checkmlineagewf = bins + .multiMap { + meta, fa -> + reads: [ meta, fa ] + ext: fa.extension.unique().join("") // we set this in the pipeline to always `.fa` so this should be fine + } + + CHECKM_LINEAGEWF ( ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db ) + ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) + + ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map{ + meta, dir, marker -> + [ meta, dir, marker, []] + } + + CHECKM_QA ( ch_checkmqa_input, [] ) + ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) + + COMBINE_CHECKM_TSV ( CHECKM_QA.out.output.map{it[1]}.collect() ) + + emit: + summary = COMBINE_CHECKM_TSV.out.combined + checkm_tsv = CHECKM_QA.out.output + versions = ch_versions +} diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 0bc254e9..07ba91b3 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -10,6 +10,7 @@ workflow GTDBTK { take: bins // channel: [ val(meta), [bins] ] busco_summary // channel: path + checkm_summary // channel: path 
checkm2_summary // channel: path gtdb // channel: path gtdb_mash // channel: path @@ -38,13 +39,16 @@ workflow GTDBTK { [row.'GenomeBin', completeness, contamination] } } else { - // Collect completeness and contamination metrics from CheckM2 summary - ch_bin_metrics = checkm2_summary + // Collect completeness and contamination metrics from CheckM/CheckM2 summary + summary = params.binqc_tool == 'checkm' ? checkm_summary : checkm2_summary + bin_name = params.binqc_tool == 'checkm' ? 'Bin Id' : 'Name' + + ch_bin_metrics = summary .splitCsv(header: true, sep: '\t') .map { row -> def completeness = Double.parseDouble(row.'Completeness') def contamination = Double.parseDouble(row.'Contamination') - [row.'Name' + ".fa", completeness, contamination] + [row[bin_name] + ".fa", completeness, contamination] } } diff --git a/workflows/mag.nf b/workflows/mag.nf index 6b334f73..541cb049 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -17,6 +17,7 @@ include { BINNING } from '../subwo include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' include { BUSCO_QC } from '../subworkflows/local/busco_qc' include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' +include { CHECKM_QC } from '../subworkflows/local/checkm_qc' include { CHECKM2_QC } from '../subworkflows/local/checkm2_qc' include { GUNC_QC } from '../subworkflows/local/gunc_qc' include { GTDBTK } from '../subworkflows/local/gtdbtk' @@ -110,6 +111,10 @@ workflow MAG { ch_busco_db = [] } + if (params.checkm_db) { + ch_checkm_db = file(params.checkm_db, checkIfExists: true) + } + if(params.checkm2_db) { ch_checkm2_db = [[:], file(params.checkm2_db, checkIfExists: true)] } @@ -177,8 +182,15 @@ workflow MAG { // Additional info for completion email and summary def busco_failed_bins = [:] + // Get checkM database if not supplied + + if (!params.skip_binqc && params.binqc_tool == 'checkm' && !params.checkm_db) { + ARIA2_UNTAR(params.checkm_download_url) + ch_checkm_db = 
ARIA2_UNTAR.out.downloaded_file + } + // Get CheckM2 database if not supplied - if ( !params.skip_binqc && params.binqc_tool == 'checkm2' && !params.checkm2_db ) { + if (!params.skip_binqc && params.binqc_tool == 'checkm2' && !params.checkm2_db) { CHECKM2_DATABASEDOWNLOAD (params.checkm2_db_version) ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database } @@ -637,6 +649,7 @@ workflow MAG { */ ch_busco_summary = Channel.empty() + ch_checkm_summary = Channel.empty() ch_checkm2_summary = Channel.empty() if (!params.skip_binning || params.ancient_dna) { @@ -773,7 +786,7 @@ workflow MAG { ch_versions = ch_versions.mix(DEPTHS.out.versions) /* - * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM2, and/or GUNC + * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, CHECKM2, and/or GUNC */ ch_input_bins_for_qc = ch_input_for_postbinning_bins_unbins.transpose() @@ -797,6 +810,24 @@ workflow MAG { } } + if (!params.skip_binqc && params.binqc_tool == 'checkm') { + /* + * CheckM subworkflow: Quantitative measures for the assessment of genome assembly + */ + + ch_input_bins_for_checkm = ch_input_bins_for_qc.filter { meta, bins -> + meta.domain != "eukarya" + } + + CHECKM_QC( + ch_input_bins_for_checkm.groupTuple(), + ch_checkm_db + ) + ch_checkm_summary = CHECKM_QC.out.summary + + ch_versions = ch_versions.mix(CHECKM_QC.out.versions) + } + if (!params.skip_binqc && params.binqc_tool == 'checkm2') { /* * CheckM2 subworkflow: Quantitative measures for the assessment of genome assembly @@ -815,7 +846,11 @@ workflow MAG { ch_versions = ch_versions.mix(CHECKM2_QC.out.versions) } - if (params.run_gunc) { + if (params.run_gunc && params.binqc_tool == 'checkm') { + GUNC_QC(ch_input_bins_for_checkm, ch_gunc_db, CHECKM_QC.out.checkm_tsv) + ch_versions = ch_versions.mix(GUNC_QC.out.versions) + } + else if (params.run_gunc) { ch_input_bins_for_gunc = ch_input_for_postbinning_bins_unbins.filter { meta, bins -> meta.domain != "eukarya" } @@ 
-892,6 +927,7 @@ workflow MAG { GTDBTK( ch_gtdb_bins, ch_busco_summary, + ch_checkm_summary, ch_checkm2_summary, gtdb, gtdb_mash @@ -908,6 +944,7 @@ workflow MAG { BIN_SUMMARY( ch_input_for_binsummary, ch_busco_summary.ifEmpty([]), + ch_checkm_summary.ifEmpty([]), ch_checkm2_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), From 05e3393fbe531fe3416c9e4d50b0a51cb9a53ef6 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Sun, 27 Oct 2024 19:07:38 -0300 Subject: [PATCH 03/33] fix: One more thing --- conf/modules.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index a12f6970..d4934718 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,6 +344,10 @@ process { publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } + withName: ARIA2_UNTAR { + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM/checkm_downloads" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_checkm_data] + } + withName: CHECKM_LINEAGEWF { tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" } From 3e49dbf953cd631847ace51b2eb692b077f754cf Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Sun, 27 Oct 2024 19:09:24 -0300 Subject: [PATCH 04/33] fix: Linting --- .github/workflows/ci.yml | 1 - subworkflows/local/checkm2_qc.nf | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6872104d..ccc072c0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -164,7 +164,6 @@ jobs: run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --binqc_tool checkm --checkm_db databases/checkm - checkm2: name: Run single test to checkm2 due to database download # Only run on push if this is the nf-core dev branch (merged PRs) diff --git a/subworkflows/local/checkm2_qc.nf b/subworkflows/local/checkm2_qc.nf index 39264d70..444f8c56 100644 --- a/subworkflows/local/checkm2_qc.nf +++ b/subworkflows/local/checkm2_qc.nf @@ -22,4 +22,4 @@ workflow CHECKM2_QC { emit: summary = COMBINE_CHECKM2_TSV.out.combined versions = ch_versions -} \ No newline at end of file +} From b1b6518da2854f5005fd27b60fddb3128420ca0d Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Sun, 27 Oct 2024 19:40:06 -0300 Subject: [PATCH 05/33] fix: Option and checks in bin summary script --- bin/combine_tables.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 6604748d..2e95b438 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -20,7 +20,7 @@ def parse_args(args=None): ) parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.") parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.") - parser.add_argument("-c", "--checkm2_summary", metavar="FILE", help="CheckM2 summary file.") + parser.add_argument("-C", "--checkm2_summary", metavar="FILE", help="CheckM2 summary file.") parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") @@ -86,16 +86,23 @@ def main(args=None): if ( not args.busco_summary + and not args.checkm_summary and not args.checkm2_summary and not args.quast_summary and not args.gtdbtk_summary ): - sys.exit("No summary specified! Please specify at least BUSCO, CheckM2 or QUAST summary.") + sys.exit( + "No summary specified! " + "Please specify at least BUSCO, CheckM, CheckM2 or QUAST summary." + ) - # GTDB-Tk can only be run in combination with BUSCO or CheckM2 - if args.gtdbtk_summary and not (args.busco_summary or args.checkm2_summary): + # GTDB-Tk can only be run in combination with BUSCO, CheckM or CheckM2 + if args.gtdbtk_summary and not ( + args.busco_summary or args.checkm_summary or args.checkm2_summary + ): sys.exit( - "Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM2 summary!" + "Invalid parameter combination: " + "GTDB-TK summary specified, but no BUSCO, CheckM or CheckM2 summary!" 
) # handle bin depths @@ -133,9 +140,7 @@ def main(args=None): ] checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t") checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" - if not bins.equals( - checkm_results["Bin Id"].sort_values().reset_index(drop=True) - ): + if not bins.equals(checkm_results["Bin Id"].sort_values().reset_index(drop=True)): sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") results = pd.merge( results, checkm_results, left_on="bin", right_on="Bin Id", how="outer" From 73b5794b06939400ba0f79a57806705f02a890e2 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Sun, 27 Oct 2024 19:43:35 -0300 Subject: [PATCH 06/33] Fix: missing import --- workflows/mag.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/mag.nf b/workflows/mag.nf index 541cb049..07e462f1 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -29,6 +29,7 @@ include { LONGREAD_PREPROCESSING } from '../subwo // // MODULE: Installed directly from nf-core/modules // +include { ARIA2 as ARIA2_UNTAR } from '../modules/nf-core/aria2/main' include { FASTQC as FASTQC_RAW } from '../modules/nf-core/fastqc/main' include { FASTQC as FASTQC_TRIMMED } from '../modules/nf-core/fastqc/main' include { SEQTK_MERGEPE } from '../modules/nf-core/seqtk/mergepe/main' From 325310bc034b74b36c4d088307abb45f31e6cc9a Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Mon, 28 Oct 2024 07:30:26 -0300 Subject: [PATCH 07/33] docs: Output, changelog and citation for CheckM2 --- CHANGELOG.md | 12 ++++++++++++ CITATIONS.md | 4 ++++ conf/modules.config | 2 +- docs/output.md | 30 ++++++++++++++++++++++++++++-- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 43c68211..d0030836 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased + +### `Added` + +- [#707](https://github.com/nf-core/mag/pull/707) - Added CheckM2 as an alternative bin completeness and QC tool (added by @dialvarezs) + +### `Dependencies` + +| Tool | Previous version | New version | +| ------- | ---------------- | ----------- | +| CheckM2 | | 1.0.2 | + ## 3.2.0 [2024-10-27] ### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index 52caa1e6..1eee90dc 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -40,6 +40,10 @@ > Parks, D. H., Imelfort, M., Skennerton, C. T., Hugenholtz, P., & Tyson, G. W. (2015). CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes. Genome Research, 25(7), 1043–1055. doi: 10.1101/gr.186072.114 +- [CheckM2](https://doi.org/10.1038/s41592-023-01940-w) + + > Chklovski, A., Parks, D. H., Woodcroft, B. J., & Tyson, G. W. (2023). CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nature Methods, 20(8), 1203-1212. + - [CONCOCT](https://doi.org/10.1038/nmeth.3103) > Alneberg, J., Bjarnason, B. S., de Bruijn, I., Schirmer, M., Quick, J., Ijaz, U. Z., Lahti, L., Loman, N. J., Andersson, A. F., & Quince, C. (2014). Binning metagenomic contigs by coverage and composition. Nature Methods, 11(11), 1144–1146. 
doi: 10.1038/nmeth.3103 diff --git a/conf/modules.config b/conf/modules.config index d4934718..5b108751 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -395,7 +395,7 @@ process { } withName: COMBINE_CHECKM2_TSV { - ext.prefix = { "checkm_summary" } + ext.prefix = { "checkm2_summary" } publishDir = [ path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, diff --git a/docs/output.md b/docs/output.md index 4e43ffb6..b2ca2aea 100644 --- a/docs/output.md +++ b/docs/output.md @@ -540,7 +540,7 @@ Besides the reference files or output files created by BUSCO, the following summ #### CheckM -[CheckM](https://ecogenomics.github.io/CheckM/) CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage +[CheckM](https://ecogenomics.github.io/CheckM/) provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage By default, nf-core/mag runs CheckM with the `check_lineage` workflow that places genome bins on a reference tree to define lineage-marker sets, to check for completeness and contamination based on lineage-specific marker genes. and then subsequently runs `qa` to generate the summary files. @@ -550,7 +550,8 @@ By default, nf-core/mag runs CheckM with the `check_lineage` workflow that place - `GenomeBinning/QC/CheckM/` - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results. 
- `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`). - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc. +- `GenomeBinning/QC/` - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`). @@ -566,6 +567,31 @@ If the parameter `--save_checkm_reference` is set, additionally the used the CheckM reference datasets are stored in the output directory. +#### CheckM2 + +[CheckM2](https://github.com/chklovski/CheckM2) is a tool for assessing the quality of metagenome-derived genomes. It uses a machine learning approach to predict the completeness and contamination of a genome regardless of its taxonomic lineage. + +
+Output files + +- `GenomeBinning/QC/CheckM2/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/quality_report.tsv`: Detailed statistics about bins informing completeness and contamination scores. This should normally be your main file to use to evaluate your results. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM2 results, including CheckM2 generated annotations, log, and Diamond alignment results. +- `GenomeBinning/QC/` + - `checkm2_summary.tsv`: A summary table of the CheckM2 results for all bins. + +
+ +If the parameter `--save_checkm2_data` is set, the CheckM2 reference datasets will be stored in the output directory. + +
+Output files + +- `GenomeBinning/QC/CheckM2/` + - `checkm2_downloads/CheckM2_database/*.dmnd`: Diamond database used by CheckM2. + +
+ #### GUNC [Genome UNClutterer (GUNC)](https://grp-bork.embl-community.io/gunc/index.html) is a tool for detection of chimerism and contamination in prokaryotic genomes resulting from mis-binning of genomic contigs from unrelated lineages. It does so by applying an entropy based score on taxonomic assignment and contig location of all genes in a genome. It is generally considered as a additional complement to CheckM results. From b88302a81ddbcbeb413a90a54296dda688f8e9ca Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Mon, 28 Oct 2024 07:32:20 -0300 Subject: [PATCH 08/33] docs: readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d82f04a9..aaad70ca 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ The pipeline then: - performs assembly using [MEGAHIT](https://github.com/voutcn/megahit) and [SPAdes](http://cab.spbu.ru/software/spades/), and checks their quality using [Quast](http://quast.sourceforge.net/quast) - (optionally) performs ancient DNA assembly validation using [PyDamage](https://github.com/maxibor/pydamage) and contig consensus sequence recalling with [Freebayes](https://github.com/freebayes/freebayes) and [BCFtools](http://samtools.github.io/bcftools/bcftools.html) - predicts protein-coding genes for the assemblies using [Prodigal](https://github.com/hyattpd/Prodigal), and bins with [Prokka](https://github.com/tseemann/prokka) and optionally [MetaEuk](https://www.google.com/search?channel=fs&client=ubuntu-sn&q=MetaEuk) -- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), or [CheckM](https://ecogenomics.github.io/CheckM/), and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). 
+- performs metagenome binning using [MetaBAT2](https://bitbucket.org/berkeleylab/metabat/src/master/), [MaxBin2](https://sourceforge.net/projects/maxbin2/), and/or with [CONCOCT](https://github.com/BinPro/CONCOCT), and checks the quality of the genome bins using [Busco](https://busco.ezlab.org/), [CheckM](https://ecogenomics.github.io/CheckM/), or [CheckM2](https://github.com/chklovski/CheckM2) and optionally [GUNC](https://grp-bork.embl-community.io/gunc/). - Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes) - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool) - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara) From 3da7441e55cd8df4339b44900c56e66751a47ded Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Mon, 28 Oct 2024 07:56:15 -0300 Subject: [PATCH 09/33] docs: Update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index aaad70ca..a5ae232f 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,7 @@ Other code contributors include: - [Phil Palmer](https://github.com/PhilPalmer) - [@willros](https://github.com/willros) - [Adam Rosenbaum](https://github.com/muabnezor) +- [Diego Alvarez](https://github.com/dialvarezs) Long read processing was inspired by [caspargross/HybridAssembly](https://github.com/caspargross/HybridAssembly) written by Caspar Gross [@caspargross](https://github.com/caspargross) From 3d98415e75e4205c465b94d7ba13a9f8e9fdfd4a Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Mon, 28 Oct 2024 09:57:24 -0300 Subject: [PATCH 10/33] refactor: Merge checkm and checkm2 subworkflows in a single one Also, simplify bin_summary regarding bin qc --- bin/combine_tables.py | 28 ++++++------- conf/modules.config | 11 +----- modules/local/bin_summary.nf | 13 ++---- subworkflows/local/checkm2_qc.nf | 25 ------------ subworkflows/local/checkm_qc.nf | 52 ++++++++++++++---------- subworkflows/local/gtdbtk.nf | 4 +- workflows/mag.nf | 68 ++++++++++---------------------- 7 files changed, 71 insertions(+), 130 deletions(-) delete mode 100644 subworkflows/local/checkm2_qc.nf diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 2e95b438..e287676a 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -18,12 +18,14 @@ def parse_args(args=None): metavar="FILE", help="Bin depths summary file.", ) - parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.") - parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.") - parser.add_argument("-C", "--checkm2_summary", metavar="FILE", help="CheckM2 summary file.") + parser.add_argument("-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file.") parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") + parser.add_argument( + "-t", "--binqc_tool", help="Bin QC tool used", choices=["busco", "checkm", "checkm2"] + ) + parser.add_argument( "-o", "--out", @@ -85,9 +87,7 @@ def main(args=None): args = parse_args(args) if ( - not args.busco_summary - and not args.checkm_summary - and not args.checkm2_summary + not args.binqc_summary and not args.quast_summary and not args.gtdbtk_summary ): @@ -97,9 +97,7 @@ def main(args=None): ) # GTDB-Tk can only be run in combination with BUSCO, CheckM or CheckM2 - if 
args.gtdbtk_summary and not ( - args.busco_summary or args.checkm_summary or args.checkm2_summary - ): + if args.gtdbtk_summary and not args.binqc_summary: sys.exit( "Invalid parameter combination: " "GTDB-TK summary specified, but no BUSCO, CheckM or CheckM2 summary!" @@ -110,15 +108,15 @@ def main(args=None): results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns] bins = results["bin"].sort_values().reset_index(drop=True) - if args.busco_summary: - busco_results = pd.read_csv(args.busco_summary, sep="\t") + if args.binqc_summary and args.binqc_tool == "busco": + busco_results = pd.read_csv(args.binqc_summary, sep="\t") if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)): sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!") results = pd.merge( results, busco_results, left_on="bin", right_on="GenomeBin", how="outer" ) # assuming depths for all bins are given - if args.checkm_summary: + if args.binqc_summary and args.binqc_tool == "checkm": use_columns = [ "Bin Id", "Marker lineage", @@ -138,7 +136,7 @@ def main(args=None): "4", "5+", ] - checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t") + checkm_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" if not bins.equals(checkm_results["Bin Id"].sort_values().reset_index(drop=True)): sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") @@ -147,7 +145,7 @@ def main(args=None): ) # assuming depths for all bins are given results["Bin Id"] = results["Bin Id"].str.removesuffix(".fa") - if args.checkm2_summary: + if args.binqc_summary and args.binqc_tool == "checkm2": use_columns = [ "Name", "Completeness", @@ -157,7 +155,7 @@ def main(args=None): "Translation_Table_Used", "Total_Coding_Sequences", ] - checkm2_results = pd.read_csv(args.checkm2_summary, usecols=use_columns, sep="\t") + checkm2_results = 
pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") checkm2_results["Name"] = checkm2_results["Name"] + ".fa" if not set(checkm2_results["Name"]).issubset(set(bins)): sys.exit("Bins in CheckM2 summary do not match bins in bin depths summary!") diff --git a/conf/modules.config b/conf/modules.config index 5b108751..1c7afa3c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -369,7 +369,7 @@ process { } withName: COMBINE_CHECKM_TSV { - ext.prefix = { "checkm_summary" } + ext.prefix = { params.binqc_tool == "checkm" ? "checkm_summary" : "checkm2_summary" } publishDir = [ path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, @@ -394,15 +394,6 @@ process { ] } - withName: COMBINE_CHECKM2_TSV { - ext.prefix = { "checkm2_summary" } - publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: GUNC_DOWNLOADDB { publishDir = [ path: { "${params.outdir}/GenomeBinning/QC/GUNC" }, diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf index 449f8962..9208822a 100644 --- a/modules/local/bin_summary.nf +++ b/modules/local/bin_summary.nf @@ -7,9 +7,7 @@ process BIN_SUMMARY { input: path(bin_depths) - path(busco_sum) - path(checkm_sum) - path(checkm2_sum) + path(binqc_sum) path(quast_sum) path(gtdbtk_sum) path(cat_sum) @@ -19,20 +17,17 @@ process BIN_SUMMARY { path "versions.yml" , emit: versions script: - def busco_summary = busco_sum.sort().size() > 0 ? "--busco_summary ${busco_sum}" : "" - def checkm_summary = checkm2_sum.sort().size() > 0 ? "--checkm2_summary ${checkm2_sum}" : "" - def checkm2_summary = checkm2_sum.sort().size() > 0 ? "--checkm2_summary ${checkm2_sum}" : "" + def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : "" def quast_summary = quast_sum.sort().size() > 0 ? 
"--quast_summary ${quast_sum}" : "" def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : "" def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : "" """ combine_tables.py --depths_summary ${bin_depths} \ - $busco_summary \ - $checkm_summary \ - $checkm2_summary \ + $binqc_summary \ $quast_summary \ $gtdbtk_summary \ $cat_summary \ + --binqc_tool ${params.binqc_tool} \ --out bin_summary.tsv cat <<-END_VERSIONS > versions.yml diff --git a/subworkflows/local/checkm2_qc.nf b/subworkflows/local/checkm2_qc.nf deleted file mode 100644 index 444f8c56..00000000 --- a/subworkflows/local/checkm2_qc.nf +++ /dev/null @@ -1,25 +0,0 @@ -/* - * CheckM2: Assessing the quality of metagenome-derived genome bins using machine learning - */ - -include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' -include { COMBINE_TSV as COMBINE_CHECKM2_TSV } from '../../modules/local/combine_tsv' - - -workflow CHECKM2_QC { - take: - bins // channel: [ val(meta), path(bin) ] - checkm2_db - - main: - ch_versions = Channel.empty() - - CHECKM2_PREDICT ( bins, checkm2_db ) - ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) - - COMBINE_CHECKM2_TSV ( CHECKM2_PREDICT.out.checkm2_tsv.map{it[1]}.collect() ) - - emit: - summary = COMBINE_CHECKM2_TSV.out.combined - versions = ch_versions -} diff --git a/subworkflows/local/checkm_qc.nf b/subworkflows/local/checkm_qc.nf index 70ed9708..73183d2d 100644 --- a/subworkflows/local/checkm_qc.nf +++ b/subworkflows/local/checkm_qc.nf @@ -1,44 +1,54 @@ /* - * CheckM: Quantitative measures for the assessment of genome assembly + * CheckM/CheckM2: Quantitative measures for the assessment of genome assembly */ include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' +include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' include { COMBINE_TSV as COMBINE_CHECKM_TSV } 
from '../../modules/local/combine_tsv' workflow CHECKM_QC { take: bins // channel: [ val(meta), path(bin) ] checkm_db + checkm2_db main: ch_versions = Channel.empty() - ch_input_checkmdb = checkm_db ? checkm_db : [] - ch_bins_for_checkmlineagewf = bins - .multiMap { - meta, fa -> - reads: [ meta, fa ] - ext: fa.extension.unique().join("") // we set this in the pipeline to always `.fa` so this should be fine - } - - CHECKM_LINEAGEWF ( ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db ) - ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) - - ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output - .join(CHECKM_LINEAGEWF.out.marker_file) - .map{ - meta, dir, marker -> - [ meta, dir, marker, []] + if (params.binqc_tool == "checkm") { + ch_bins_for_checkmlineagewf = bins.multiMap { + meta, fa -> + reads: [ meta, fa ] + ext: fa.extension.unique().join("") // we set this in the pipeline to always `.fa` so this should be fine } - CHECKM_QA ( ch_checkmqa_input, [] ) - ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) + CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db) + ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) - COMBINE_CHECKM_TSV ( CHECKM_QA.out.output.map{it[1]}.collect() ) + ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map{ + meta, dir, marker -> + [ meta, dir, marker, []] + } + + CHECKM_QA ( ch_checkmqa_input, [] ) + + ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) + + COMBINE_CHECKM_TSV(CHECKM_QA.out.output.map{it[1]}.collect()) + } + if (params.binqc_tool == "checkm2") { + CHECKM2_PREDICT(bins, checkm2_db) + + ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) + + COMBINE_CHECKM_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map{it[1]}.collect()) + } emit: summary = COMBINE_CHECKM_TSV.out.combined - checkm_tsv = CHECKM_QA.out.output + checkm_tsv = 
params.binqc_tool == "checkm" ? CHECKM_QA.out.output : [] versions = ch_versions } diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 07ba91b3..83af9e24 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -11,7 +11,6 @@ workflow GTDBTK { bins // channel: [ val(meta), [bins] ] busco_summary // channel: path checkm_summary // channel: path - checkm2_summary // channel: path gtdb // channel: path gtdb_mash // channel: path @@ -40,10 +39,9 @@ workflow GTDBTK { } } else { // Collect completeness and contamination metrics from CheckM/CheckM2 summary - summary = params.binqc_tool == 'checkm' ? checkm_summary : checkm2_summary bin_name = params.binqc_tool == 'checkm' ? 'Bin Id' : 'Name' - ch_bin_metrics = summary + ch_bin_metrics = checkm_summary .splitCsv(header: true, sep: '\t') .map { row -> def completeness = Double.parseDouble(row.'Completeness') diff --git a/workflows/mag.nf b/workflows/mag.nf index 07e462f1..ab05e654 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -18,7 +18,6 @@ include { BINNING_REFINEMENT } from '../subwo include { BUSCO_QC } from '../subworkflows/local/busco_qc' include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' include { CHECKM_QC } from '../subworkflows/local/checkm_qc' -include { CHECKM2_QC } from '../subworkflows/local/checkm2_qc' include { GUNC_QC } from '../subworkflows/local/gunc_qc' include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' @@ -79,7 +78,7 @@ include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modul workflow MAG { take: - ch_raw_short_reads // channel: samplesheet read in from --input + ch_raw_short_reads // channel: samplesheet read in from --input ch_raw_long_reads ch_input_assemblies @@ -115,10 +114,16 @@ workflow MAG { if (params.checkm_db) { ch_checkm_db = file(params.checkm_db, checkIfExists: true) } + else { + ch_checkm_db = [] + } - 
if(params.checkm2_db) { + if (params.checkm2_db) { ch_checkm2_db = [[:], file(params.checkm2_db, checkIfExists: true)] } + else { + ch_checkm2_db = [] + } if (params.gunc_db) { ch_gunc_db = file(params.gunc_db, checkIfExists: true) @@ -192,7 +197,7 @@ workflow MAG { // Get CheckM2 database if not supplied if (!params.skip_binqc && params.binqc_tool == 'checkm2' && !params.checkm2_db) { - CHECKM2_DATABASEDOWNLOAD (params.checkm2_db_version) + CHECKM2_DATABASEDOWNLOAD(params.checkm2_db_version) ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database } @@ -232,20 +237,7 @@ workflow MAG { // due to strange output file scheme in AR2, have to manually separate // SE/PE to allow correct pulling of reads after. ch_adapterremoval_in = ch_raw_short_reads.branch { - single: it[0]['single_end'] - paired: !it[0]['single_end'] - } - - ADAPTERREMOVAL_PE(ch_adapterremoval_in.paired, []) - ADAPTERREMOVAL_SE(ch_adapterremoval_in.single, []) - - ch_short_reads_prepped = Channel.empty() - ch_short_reads_prepped = ch_short_reads_prepped.mix(ADAPTERREMOVAL_SE.out.singles_truncated, ADAPTERREMOVAL_PE.out.paired_truncated) - - ch_versions = ch_versions.mix(ADAPTERREMOVAL_PE.out.versions.first(), ADAPTERREMOVAL_SE.out.versions.first()) - } - } - else { + siMerge ch_short_reads_prepped = ch_raw_short_reads } @@ -804,16 +796,18 @@ workflow MAG { ch_busco_summary = BUSCO_QC.out.summary ch_versions = ch_versions.mix(BUSCO_QC.out.versions.first()) // process information if BUSCO analysis failed for individual bins due to no matching genes - BUSCO_QC.out.failed_bin.splitCsv(sep: '\t').map { bin, error -> - if (!bin.contains(".unbinned.")) { - busco_failed_bins[bin] = error + BUSCO_QC.out.failed_bin + .splitCsv(sep: '\t') + .map { bin, error -> + if (!bin.contains(".unbinned.")) { + busco_failed_bins[bin] = error + } } - } } - if (!params.skip_binqc && params.binqc_tool == 'checkm') { + if (!params.skip_binqc && params.binqc_tool in ['checkm', 'checkm2']) { /* - * CheckM subworkflow: Quantitative 
measures for the assessment of genome assembly + * CheckM/CheckM2 subworkflow: Quantitative measures for the assessment of genome assembly */ ch_input_bins_for_checkm = ch_input_bins_for_qc.filter { meta, bins -> @@ -822,31 +816,14 @@ workflow MAG { CHECKM_QC( ch_input_bins_for_checkm.groupTuple(), - ch_checkm_db + ch_checkm_db, + ch_checkm2_db ) ch_checkm_summary = CHECKM_QC.out.summary ch_versions = ch_versions.mix(CHECKM_QC.out.versions) } - if (!params.skip_binqc && params.binqc_tool == 'checkm2') { - /* - * CheckM2 subworkflow: Quantitative measures for the assessment of genome assembly - */ - - ch_input_bins_for_checkm2 = ch_input_bins_for_qc.filter { meta, bins -> - meta.domain != "eukarya" - } - - CHECKM2_QC ( - ch_input_bins_for_checkm2.groupTuple(), - ch_checkm2_db - ) - ch_checkm2_summary = CHECKM2_QC.out.summary - - ch_versions = ch_versions.mix(CHECKM2_QC.out.versions) - } - if (params.run_gunc && params.binqc_tool == 'checkm') { GUNC_QC(ch_input_bins_for_checkm, ch_gunc_db, CHECKM_QC.out.checkm_tsv) ch_versions = ch_versions.mix(GUNC_QC.out.versions) @@ -929,7 +906,6 @@ workflow MAG { ch_gtdb_bins, ch_busco_summary, ch_checkm_summary, - ch_checkm2_summary, gtdb, gtdb_mash ) @@ -944,9 +920,7 @@ workflow MAG { if ((!params.skip_binqc) || !params.skip_quast || !params.skip_gtdbtk) { BIN_SUMMARY( ch_input_for_binsummary, - ch_busco_summary.ifEmpty([]), - ch_checkm_summary.ifEmpty([]), - ch_checkm2_summary.ifEmpty([]), + params.binqc_tool == "busco" ? ch_busco_summary.ifEmpty([]) : ch_checkm_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), ch_cat_global_summary.ifEmpty([]) From 8b4fcc7339f071aba0754bfa718466baabf8f200 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Mon, 28 Oct 2024 10:02:31 -0300 Subject: [PATCH 11/33] fix: Restore mistakenly deleted code --- workflows/mag.nf | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/workflows/mag.nf b/workflows/mag.nf index ab05e654..8d993340 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -237,7 +237,20 @@ workflow MAG { // due to strange output file scheme in AR2, have to manually separate // SE/PE to allow correct pulling of reads after. ch_adapterremoval_in = ch_raw_short_reads.branch { - siMerge + single: it[0]['single_end'] + paired: !it[0]['single_end'] + } + + ADAPTERREMOVAL_PE(ch_adapterremoval_in.paired, []) + ADAPTERREMOVAL_SE(ch_adapterremoval_in.single, []) + + ch_short_reads_prepped = Channel.empty() + ch_short_reads_prepped = ch_short_reads_prepped.mix(ADAPTERREMOVAL_SE.out.singles_truncated, ADAPTERREMOVAL_PE.out.paired_truncated) + + ch_versions = ch_versions.mix(ADAPTERREMOVAL_PE.out.versions.first(), ADAPTERREMOVAL_SE.out.versions.first()) + } + } + else { ch_short_reads_prepped = ch_raw_short_reads } From 76dac5cecb8c17d779a6e7a28ec08399440b7a58 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Thu, 31 Oct 2024 17:26:39 -0300 Subject: [PATCH 12/33] Bin QC workflow --- subworkflows/local/bin_qc.nf | 122 ++++++++++++++++++++++++++++++++ subworkflows/local/checkm_qc.nf | 54 -------------- workflows/mag.nf | 64 ++++------------- 3 files changed, 137 insertions(+), 103 deletions(-) create mode 100644 subworkflows/local/bin_qc.nf delete mode 100644 subworkflows/local/checkm_qc.nf diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf new file mode 100644 index 00000000..25387af9 --- /dev/null +++ b/subworkflows/local/bin_qc.nf @@ -0,0 +1,122 @@ +/* + * BUSCO/CheckM/CheckM2: Quantitative measures for the assessment of genome assembly + */ + +include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' +include { BUSCO } from '../../modules/local/busco' +include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' +include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' +include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' +include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' +include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' +include { COMBINE_TSV } from '../../modules/local/combine_tsv' + +workflow BIN_QC { + take: + bins // channel: [ val(meta), path(bin) ] + checkm_db + checkm2_db + busco_db + + main: + ch_versions = Channel.empty() + + if (params.binqc_tool == "busco") { + // BUSCO workflow + if (!busco_db.isEmpty()) { + if (busco_db.extension in ['gz', 'tgz']) { + // Expects to be tar.gz! + BUSCO_DB_PREPARATION(busco_db) + ch_db_for_busco = BUSCO_DB_PREPARATION.out.db.map { meta, db -> + [[id: meta, lineage: 'Y'], db] + } + } + else if (busco_db.isDirectory()) { + // Set meta to match expected channel cardinality for BUSCO + ch_db_for_busco = Channel + .of(busco_db) + .map { db -> + def basename = db.getBaseName() + def lineage = basename.contains('odb10') ? 
'Y' : 'N' + [[id: basename, lineage: lineage], db] + } + .collect() + } + } + else { + // Set BUSCO database to empty to allow for --auto-lineage + ch_db_for_busco = Channel + .of([]) + .map { empty_db -> [[lineage: ''], []] } + .collect() + } + + if (params.save_busco_db) { + // publish files downloaded by Busco + ch_downloads = BUSCO.out.busco_downloads + .groupTuple() + .map { lin, downloads -> downloads[0] } + .toSortedList() + .flatten() + BUSCO_SAVE_DOWNLOAD(ch_downloads) + } + + BUSCO(bins, ch_db_for_busco) + + // busco_summary_domain = BUSCO.out.summary_domain.collect() + // busco_summary_specific = BUSCO.out.summary_specific.collect() + // busco_failed_bin = BUSCO.out.failed_bin.collect() + + BUSCO_SUMMARY( + BUSCO.out.summary_domain.map { it[1] }.collect().ifEmpty([]), + BUSCO.out.summary_specific.map { it[1] }.collect().ifEmpty([]), + BUSCO.out.failed_bin.map { it[1] }.collect().ifEmpty([]) + ) + + multiqc_reports = BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{ it[1] } + summary = BUSCO_SUMMARY.out.summary + ch_versions = ch_versions.mix(BUSCO.out.versions.first()) + } + else if (params.binqc_tool == "checkm") { + // CheckM workflow + ch_bins_for_checkmlineagewf = bins + .filter { meta, bin -> + meta.domain != "eukarya" + } + .multiMap { meta, fa -> + reads: [meta, fa] + ext: fa.extension.unique().join("") + } + + CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db) + ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) + + ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map { meta, dir, marker -> + [meta, dir, marker, []] + } + + CHECKM_QA(ch_checkmqa_input, []) + + COMBINE_TSV(CHECKM_QA.out.output.map { it[1] }.collect()) + + summary = COMBINE_TSV.out.combined + ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) + } + else if (params.binqc_tool == "checkm2") { + // CheckM2 workflow + CHECKM2_PREDICT(bins, checkm2_db) 
+ + COMBINE_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map { it[1] }.collect()) + + summary = COMBINE_TSV.out.combined + ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) + } + + emit: + summary = summary + checkm_tsv = params.binqc_tool == "checkm" ? CHECKM_QA.out.output : [] + multiqc = params.binqc_tool == "busco" ? multiqc_reports : [] + versions = ch_versions +} diff --git a/subworkflows/local/checkm_qc.nf b/subworkflows/local/checkm_qc.nf deleted file mode 100644 index 73183d2d..00000000 --- a/subworkflows/local/checkm_qc.nf +++ /dev/null @@ -1,54 +0,0 @@ -/* - * CheckM/CheckM2: Quantitative measures for the assessment of genome assembly - */ - -include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' -include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' -include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' -include { COMBINE_TSV as COMBINE_CHECKM_TSV } from '../../modules/local/combine_tsv' - -workflow CHECKM_QC { - take: - bins // channel: [ val(meta), path(bin) ] - checkm_db - checkm2_db - - main: - ch_versions = Channel.empty() - - if (params.binqc_tool == "checkm") { - ch_bins_for_checkmlineagewf = bins.multiMap { - meta, fa -> - reads: [ meta, fa ] - ext: fa.extension.unique().join("") // we set this in the pipeline to always `.fa` so this should be fine - } - - CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db) - ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) - - ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output - .join(CHECKM_LINEAGEWF.out.marker_file) - .map{ - meta, dir, marker -> - [ meta, dir, marker, []] - } - - CHECKM_QA ( ch_checkmqa_input, [] ) - - ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) - - COMBINE_CHECKM_TSV(CHECKM_QA.out.output.map{it[1]}.collect()) - } - if (params.binqc_tool == "checkm2") { - CHECKM2_PREDICT(bins, checkm2_db) - - ch_versions = 
ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) - - COMBINE_CHECKM_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map{it[1]}.collect()) - } - - emit: - summary = COMBINE_CHECKM_TSV.out.combined - checkm_tsv = params.binqc_tool == "checkm" ? CHECKM_QA.out.output : [] - versions = ch_versions -} diff --git a/workflows/mag.nf b/workflows/mag.nf index 8d993340..0b0b1936 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -14,10 +14,9 @@ include { methodsDescriptionText } from '../subwo // include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' include { BINNING } from '../subworkflows/local/binning' +include { BIN_QC } from '../subworkflows/local/bin_qc' include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' -include { BUSCO_QC } from '../subworkflows/local/busco_qc' include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' -include { CHECKM_QC } from '../subworkflows/local/checkm_qc' include { GUNC_QC } from '../subworkflows/local/gunc_qc' include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' @@ -185,11 +184,7 @@ workflow MAG { ch_metaeuk_db = Channel.empty() } - // Additional info for completion email and summary - def busco_failed_bins = [:] - // Get checkM database if not supplied - if (!params.skip_binqc && params.binqc_tool == 'checkm' && !params.checkm_db) { ARIA2_UNTAR(params.checkm_download_url) ch_checkm_db = ARIA2_UNTAR.out.downloaded_file @@ -797,55 +792,26 @@ workflow MAG { ch_input_bins_for_qc = ch_input_for_postbinning_bins_unbins.transpose() - if (!params.skip_binqc && params.binqc_tool == 'busco') { - /* - * BUSCO subworkflow: Quantitative measures for the assessment of genome assembly - */ - - BUSCO_QC( - ch_busco_db, - ch_input_bins_for_qc - ) - ch_busco_summary = BUSCO_QC.out.summary - ch_versions = ch_versions.mix(BUSCO_QC.out.versions.first()) - // process information if BUSCO analysis 
failed for individual bins due to no matching genes - BUSCO_QC.out.failed_bin - .splitCsv(sep: '\t') - .map { bin, error -> - if (!bin.contains(".unbinned.")) { - busco_failed_bins[bin] = error - } - } - } + BIN_QC( + ch_input_bins_for_qc, + ch_checkm_db, + ch_checkm2_db, + ch_busco_db + ) - if (!params.skip_binqc && params.binqc_tool in ['checkm', 'checkm2']) { - /* - * CheckM/CheckM2 subworkflow: Quantitative measures for the assessment of genome assembly - */ + ch_versions = ch_versions.mix(BIN_QC.out.versions) - ch_input_bins_for_checkm = ch_input_bins_for_qc.filter { meta, bins -> + if (params.run_gunc) { + ch_input_bins_for_gunc = ch_input_for_postbinning_bins_unbins.filter { meta, bins -> meta.domain != "eukarya" } - CHECKM_QC( - ch_input_bins_for_checkm.groupTuple(), - ch_checkm_db, - ch_checkm2_db + GUNC_QC( + ch_input_bins_for_gunc, + ch_gunc_db, + params.binqc_tool == 'checkm' ? BIN_QC.out.checkm_tsv : [] ) - ch_checkm_summary = CHECKM_QC.out.summary - - ch_versions = ch_versions.mix(CHECKM_QC.out.versions) - } - if (params.run_gunc && params.binqc_tool == 'checkm') { - GUNC_QC(ch_input_bins_for_checkm, ch_gunc_db, CHECKM_QC.out.checkm_tsv) - ch_versions = ch_versions.mix(GUNC_QC.out.versions) - } - else if (params.run_gunc) { - ch_input_bins_for_gunc = ch_input_for_postbinning_bins_unbins.filter { meta, bins -> - meta.domain != "eukarya" - } - GUNC_QC(ch_input_bins_for_qc, ch_gunc_db, []) ch_versions = ch_versions.mix(GUNC_QC.out.versions) } @@ -1075,7 +1041,7 @@ workflow MAG { } if (!params.skip_binning && !params.skip_binqc && params.binqc_tool == 'busco') { - ch_multiqc_files = ch_multiqc_files.mix(BUSCO_QC.out.multiqc.collect().ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(BIN_QC.out.multiqc.collect().ifEmpty([])) } From 38ce756067fabc1cc9a1e7b32ca6a81a06e1dec3 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Thu, 31 Oct 2024 17:40:04 -0300 Subject: [PATCH 13/33] Cleanup --- subworkflows/local/bin_qc.nf | 10 ++-- subworkflows/local/busco_qc.nf | 83 ---------------------------------- subworkflows/local/gtdbtk.nf | 7 ++- workflows/mag.nf | 12 ++--- 4 files changed, 11 insertions(+), 101 deletions(-) delete mode 100644 subworkflows/local/busco_qc.nf diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index 25387af9..4aa9b50a 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -47,7 +47,7 @@ workflow BIN_QC { // Set BUSCO database to empty to allow for --auto-lineage ch_db_for_busco = Channel .of([]) - .map { empty_db -> [[lineage: ''], []] } + .map { _empty_db -> [[lineage: ''], []] } .collect() } @@ -55,7 +55,7 @@ workflow BIN_QC { // publish files downloaded by Busco ch_downloads = BUSCO.out.busco_downloads .groupTuple() - .map { lin, downloads -> downloads[0] } + .map { _lin, downloads -> downloads[0] } .toSortedList() .flatten() BUSCO_SAVE_DOWNLOAD(ch_downloads) @@ -63,10 +63,6 @@ workflow BIN_QC { BUSCO(bins, ch_db_for_busco) - // busco_summary_domain = BUSCO.out.summary_domain.collect() - // busco_summary_specific = BUSCO.out.summary_specific.collect() - // busco_failed_bin = BUSCO.out.failed_bin.collect() - BUSCO_SUMMARY( BUSCO.out.summary_domain.map { it[1] }.collect().ifEmpty([]), BUSCO.out.summary_specific.map { it[1] }.collect().ifEmpty([]), @@ -80,7 +76,7 @@ workflow BIN_QC { else if (params.binqc_tool == "checkm") { // CheckM workflow ch_bins_for_checkmlineagewf = bins - .filter { meta, bin -> + .filter { meta, _bins -> meta.domain != "eukarya" } .multiMap { meta, fa -> diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf deleted file mode 100644 index a5c3be8d..00000000 --- a/subworkflows/local/busco_qc.nf +++ /dev/null @@ -1,83 +0,0 @@ -/* - * BUSCO: Quantitative measures for the assessment of genome assembly - */ - -include { BUSCO_DB_PREPARATION } from 
'../../modules/local/busco_db_preparation' -include { BUSCO } from '../../modules/local/busco' -include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' -include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' - -workflow BUSCO_QC { - take: - busco_db // channel: path - bins // channel: [ val(meta), path(bin) ] - - main: - if ( !busco_db.isEmpty() ) { - if ( busco_db.extension in ['gz', 'tgz'] ) { - // Expects to be tar.gz! - ch_db_for_busco = BUSCO_DB_PREPARATION ( busco_db ).db - .map{ - meta, db -> - def meta_new = [:] - meta_new['id'] = meta - meta_new['lineage'] = 'Y' - [ meta_new, db ] - } - } else if ( busco_db.isDirectory() ) { - // Set meta to match expected channel cardinality for BUSCO - ch_db_for_busco = Channel - .of(busco_db) - .map{ - db -> - def meta = [:] - meta['id'] = db.getBaseName() - if ( meta['id'].contains('odb10') == true ) { - meta['lineage'] = 'Y' - } else { - meta['lineage'] = 'N' - } - [ meta, db ] - } - .collect() - } - } else { - // Set BUSCO database to empty to allow for --auto-lineage - ch_db_for_busco = Channel - .of([]) - .map{ - empty_db -> - def meta = [:] - meta['lineage'] = '' - [ meta, [] ] - } - .collect() - } - - BUSCO ( - bins, - ch_db_for_busco - ) - - if (params.save_busco_db){ - // publish files downloaded by Busco - ch_downloads = BUSCO.out.busco_downloads.groupTuple().map{lin,downloads -> downloads[0]}.toSortedList().flatten() - BUSCO_SAVE_DOWNLOAD ( ch_downloads ) - } - - busco_summary_domain = BUSCO.out.summary_domain.collect() - busco_summary_specific = BUSCO.out.summary_specific.collect() - busco_failed_bin = BUSCO.out.failed_bin.collect() - - BUSCO_SUMMARY ( - BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]), - BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]), - BUSCO.out.failed_bin.map{it[1]}.collect().ifEmpty([]) - ) - - emit: - summary = BUSCO_SUMMARY.out.summary - failed_bin = BUSCO.out.failed_bin.map{it[1]} - multiqc = 
BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{it[1]} - versions = BUSCO.out.versions -} diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 83af9e24..e2ee580b 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -9,8 +9,7 @@ include { GTDBTK_SUMMARY } from '../../modules/local/gtdbtk_summary' workflow GTDBTK { take: bins // channel: [ val(meta), [bins] ] - busco_summary // channel: path - checkm_summary // channel: path + bin_qc_summary // channel: path gtdb // channel: path gtdb_mash // channel: path @@ -19,7 +18,7 @@ workflow GTDBTK { ch_bin_metrics = Channel.empty() if ( params.binqc_tool == 'busco' ){ // Collect completeness and contamination metrics from busco summary - ch_bin_metrics = busco_summary + ch_bin_metrics = bin_qc_summary .splitCsv(header: true, sep: '\t') .map { row -> def completeness = -1 @@ -41,7 +40,7 @@ workflow GTDBTK { // Collect completeness and contamination metrics from CheckM/CheckM2 summary bin_name = params.binqc_tool == 'checkm' ? 
'Bin Id' : 'Name' - ch_bin_metrics = checkm_summary + ch_bin_metrics = bin_qc_summary .splitCsv(header: true, sep: '\t') .map { row -> def completeness = Double.parseDouble(row.'Completeness') diff --git a/workflows/mag.nf b/workflows/mag.nf index 04b9af8f..3def8207 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -649,9 +649,7 @@ workflow MAG { ================================================================================ */ - ch_busco_summary = Channel.empty() - ch_checkm_summary = Channel.empty() - ch_checkm2_summary = Channel.empty() + bin_qc_summary = Channel.empty() if (!params.skip_binning || params.ancient_dna) { BINNING_PREPARATION( @@ -803,10 +801,11 @@ workflow MAG { ch_busco_db ) + bin_qc_summary = BIN_QC.out.summary ch_versions = ch_versions.mix(BIN_QC.out.versions) if (params.run_gunc) { - ch_input_bins_for_gunc = ch_input_for_postbinning.filter { meta, bins -> + ch_input_bins_for_gunc = ch_input_for_postbinning.filter { meta, _bins -> meta.domain != "eukarya" } @@ -887,8 +886,7 @@ workflow MAG { GTDBTK( ch_gtdb_bins, - ch_busco_summary, - ch_checkm_summary, + bin_qc_summary, gtdb, gtdb_mash ) @@ -903,7 +901,7 @@ workflow MAG { if ((!params.skip_binqc) || !params.skip_quast || !params.skip_gtdbtk) { BIN_SUMMARY( ch_input_for_binsummary, - params.binqc_tool == "busco" ? ch_busco_summary.ifEmpty([]) : ch_checkm_summary.ifEmpty([]), + bin_qc_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), ch_cat_global_summary.ifEmpty([]) From 3702329954625e8beafb2a9de9c25545feac0554 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Thu, 31 Oct 2024 18:22:22 -0300 Subject: [PATCH 14/33] Final touches --- conf/modules.config | 4 ++-- subworkflows/local/bin_qc.nf | 36 ++++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index b0a58c42..63e66adf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -368,8 +368,8 @@ process { ] } - withName: COMBINE_CHECKM_TSV { - ext.prefix = { params.binqc_tool == "checkm" ? "checkm_summary" : "checkm2_summary" } + withName: COMBINE_BINQC_TSV { + ext.prefix = { "${params.binqc_tool}_summary" } publishDir = [ path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index 4aa9b50a..e7f7d3fb 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -2,14 +2,14 @@ * BUSCO/CheckM/CheckM2: Quantitative measures for the assessment of genome assembly */ -include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' -include { BUSCO } from '../../modules/local/busco' -include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' -include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' -include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' -include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' -include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' -include { COMBINE_TSV } from '../../modules/local/combine_tsv' +include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' +include { BUSCO } from '../../modules/local/busco' +include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' +include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' +include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' +include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' +include { 
CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' +include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv' workflow BIN_QC { take: @@ -20,6 +20,8 @@ workflow BIN_QC { main: ch_versions = Channel.empty() + multiqc_reports = [] + checkm_tsv = [] if (params.binqc_tool == "busco") { // BUSCO workflow @@ -76,12 +78,13 @@ workflow BIN_QC { else if (params.binqc_tool == "checkm") { // CheckM workflow ch_bins_for_checkmlineagewf = bins + .groupTuple() .filter { meta, _bins -> meta.domain != "eukarya" } .multiMap { meta, fa -> reads: [meta, fa] - ext: fa.extension.unique().join("") + ext: fa.extension.unique().join("") // the pipeline ensures that all bins will have the same extension } CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db) @@ -95,24 +98,25 @@ workflow BIN_QC { CHECKM_QA(ch_checkmqa_input, []) - COMBINE_TSV(CHECKM_QA.out.output.map { it[1] }.collect()) + COMBINE_BINQC_TSV(CHECKM_QA.out.output.map { it[1] }.collect()) - summary = COMBINE_TSV.out.combined + summary = COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) + checkm_tsv = CHECKM_QA.out.output } else if (params.binqc_tool == "checkm2") { // CheckM2 workflow - CHECKM2_PREDICT(bins, checkm2_db) + CHECKM2_PREDICT(bins.groupTuple(), checkm2_db) - COMBINE_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map { it[1] }.collect()) + COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map { it[1] }.collect()) - summary = COMBINE_TSV.out.combined + summary = COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) } emit: summary = summary - checkm_tsv = params.binqc_tool == "checkm" ? CHECKM_QA.out.output : [] - multiqc = params.binqc_tool == "busco" ? multiqc_reports : [] + checkm_tsv = checkm_tsv + multiqc = multiqc_reports versions = ch_versions } From 0eb167a29d2a4dcb50ff297e7f5f3eb8e7b98265 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Fri, 1 Nov 2024 01:13:19 -0300 Subject: [PATCH 15/33] Integrate GUNC in BIN_QC subworkflow --- CHANGELOG.md | 11 ++-- nextflow.config | 2 +- subworkflows/local/bin_qc.nf | 114 +++++++++++++++++++++++++--------- subworkflows/local/gunc_qc.nf | 51 --------------- workflows/mag.nf | 32 +++------- 5 files changed, 99 insertions(+), 111 deletions(-) delete mode 100644 subworkflows/local/gunc_qc.nf diff --git a/CHANGELOG.md b/CHANGELOG.md index 75f0332b..cdbf60d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#707](https://github.com/nf-core/mag/pull/707) - Make Bin QC a subworkflow (added by @dialvarezs) - [#707](https://github.com/nf-core/mag/pull/707) - Added CheckM2 as an alternative bin completeness and QC tool (added by @dialvarezs) - [#708](https://github.com/nf-core/mag/pull/708) - Added `--exclude_unbins_from_postbinning` parameter to exclude unbinned contigs from post-binning processes, speeding up Prokka in some cases (added by @dialvarezs) @@ -14,10 +15,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` -- [#708](https://github.com/nf-core/mag/pull/708) - Fixed channel passed as GUNC input (added by @dialvarezs) +- [#707](https://github.com/nf-core/mag/pull/708) - Fixed channel passed as GUNC input (added by @dialvarezs) ### `Dependencies` +| Tool | Previous version | New version | +| ------- | ---------------- | ----------- | +| CheckM2 | | 1.0.2 | + ### `Deprecated` ## 3.2.1 [2024-10-30] @@ -32,10 +37,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Dependencies` -| Tool | Previous version | New version | -| ------- | ---------------- | ----------- | -| CheckM2 | | 1.0.2 | - ### `Deprecated` ## 3.2.0 [2024-10-27] diff --git a/nextflow.config b/nextflow.config index ad7448fb..e35d2914 100644 --- a/nextflow.config +++ b/nextflow.config @@ -121,7 +121,7 @@ 
params { refine_bins_dastool = false refine_bins_dastool_threshold = 0.5 postbinning_input = 'raw_bins_only' - exclude_unbins_from_postbinning = false + exclude_unbins_from_postbinning = false // Bin QC skip_binqc = false diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index e7f7d3fb..b2f78e7b 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -1,5 +1,5 @@ /* - * BUSCO/CheckM/CheckM2: Quantitative measures for the assessment of genome assembly + * BUSCO/CheckM/CheckM2/GUNC: Quantitative measures for the assessment of genome assembly */ include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' @@ -10,33 +10,42 @@ include { CHECKM_QA } from '../../modules/nf-core/checkm include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv' +include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' +include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' +include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' + workflow BIN_QC { take: - bins // channel: [ val(meta), path(bin) ] - checkm_db - checkm2_db - busco_db + ch_bins // [ [ meta] , fasta ], input bins (mandatory) + ch_checkm_db // [ db ], presupplied CheckM database (optional) + ch_checkm2_db // [ [meta] , db ], presupplied CheckM2 database (optional) + ch_busco_db // [ [meta] , db ], presupplied BUSCO database (optional) + ch_gunc_db // [ db ], presupplied GUNC database (optional) main: + qc_summary = [] + ch_input_bins_for_qc = ch_bins.transpose() ch_versions = Channel.empty() - multiqc_reports = [] - checkm_tsv = [] + ch_multiqc_files = Channel.empty() + if (params.binqc_tool == "busco") { - // BUSCO workflow - if (!busco_db.isEmpty()) { - if (busco_db.extension in ['gz', 'tgz']) { + /* + * BUSCO + */ + if 
(!ch_busco_db.isEmpty()) { + if (ch_busco_db.extension in ['gz', 'tgz']) { // Expects to be tar.gz! - BUSCO_DB_PREPARATION(busco_db) + BUSCO_DB_PREPARATION(ch_busco_db) ch_db_for_busco = BUSCO_DB_PREPARATION.out.db.map { meta, db -> [[id: meta, lineage: 'Y'], db] } } - else if (busco_db.isDirectory()) { + else if (ch_busco_db.isDirectory()) { // Set meta to match expected channel cardinality for BUSCO ch_db_for_busco = Channel - .of(busco_db) + .of(ch_busco_db) .map { db -> def basename = db.getBaseName() def lineage = basename.contains('odb10') ? 'Y' : 'N' @@ -48,8 +57,7 @@ workflow BIN_QC { else { // Set BUSCO database to empty to allow for --auto-lineage ch_db_for_busco = Channel - .of([]) - .map { _empty_db -> [[lineage: ''], []] } + .of([[lineage: ''], []]) .collect() } @@ -63,7 +71,7 @@ workflow BIN_QC { BUSCO_SAVE_DOWNLOAD(ch_downloads) } - BUSCO(bins, ch_db_for_busco) + BUSCO(ch_input_bins_for_qc, ch_db_for_busco) BUSCO_SUMMARY( BUSCO.out.summary_domain.map { it[1] }.collect().ifEmpty([]), @@ -71,13 +79,17 @@ workflow BIN_QC { BUSCO.out.failed_bin.map { it[1] }.collect().ifEmpty([]) ) - multiqc_reports = BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{ it[1] } - summary = BUSCO_SUMMARY.out.summary + ch_multiqc_files = ch_multiqc_files.mix( + BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{ it[1] } + ) + qc_summary = BUSCO_SUMMARY.out.summary ch_versions = ch_versions.mix(BUSCO.out.versions.first()) } else if (params.binqc_tool == "checkm") { - // CheckM workflow - ch_bins_for_checkmlineagewf = bins + /* + * CheckM + */ + ch_bins_for_checkmlineagewf = ch_input_bins_for_qc .groupTuple() .filter { meta, _bins -> meta.domain != "eukarya" @@ -87,7 +99,7 @@ workflow BIN_QC { ext: fa.extension.unique().join("") // the pipeline ensures that all bins will have the same extension } - CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, checkm_db) + CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, 
ch_bins_for_checkmlineagewf.ext, ch_checkm_db) ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output @@ -100,23 +112,65 @@ workflow BIN_QC { COMBINE_BINQC_TSV(CHECKM_QA.out.output.map { it[1] }.collect()) - summary = COMBINE_BINQC_TSV.out.combined + qc_summary = COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) - checkm_tsv = CHECKM_QA.out.output } else if (params.binqc_tool == "checkm2") { - // CheckM2 workflow - CHECKM2_PREDICT(bins.groupTuple(), checkm2_db) + /* + * CheckM2 + */ + CHECKM2_PREDICT(ch_input_bins_for_qc.groupTuple(), ch_checkm2_db) COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map { it[1] }.collect()) - summary = COMBINE_BINQC_TSV.out.combined + qc_summary = COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) } + if (params.run_gunc) { + /* + * GUNC + */ + ch_input_bins_for_gunc = ch_bins + .filter { meta, _bins -> + meta.domain != "eukarya" + } + .flatMap { meta, bins -> + bins.collect { bin -> [meta, bin] } + } + + if ( params.gunc_db ) { + ch_db_for_gunc = ch_gunc_db + } + else { + ch_db_for_gunc = GUNC_DOWNLOADDB(params.gunc_database_type).db + ch_versions.mix(GUNC_DOWNLOADDB.out.versions) + } + + GUNC_RUN(ch_input_bins_for_gunc, ch_db_for_gunc) + ch_versions.mix(GUNC_RUN.out.versions) + + // Make sure to keep directory in sync with modules.conf + GUNC_RUN.out.maxcss_level_tsv + .map{it[1]} + .collectFile(name: "gunc_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") + + if ( params.binqc_tool == 'checkm' ) { + ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv.combine(CHECKM_QA.out.output, by: 0) + + GUNC_MERGECHECKM(ch_input_to_mergecheckm) + ch_versions.mix(GUNC_MERGECHECKM.out.versions) + + // Make sure to keep directory in sync with modules.conf + GUNC_MERGECHECKM.out.tsv + .map{it[1]} + .collectFile(name: "gunc_checkm_summary.tsv", keepHeader: 
true, storeDir: "${params.outdir}/GenomeBinning/QC/") + } + } + + emit: - summary = summary - checkm_tsv = checkm_tsv - multiqc = multiqc_reports - versions = ch_versions + qc_summary = qc_summary + multiqc_files = ch_multiqc_files + versions = ch_versions } diff --git a/subworkflows/local/gunc_qc.nf b/subworkflows/local/gunc_qc.nf deleted file mode 100644 index 912b9425..00000000 --- a/subworkflows/local/gunc_qc.nf +++ /dev/null @@ -1,51 +0,0 @@ -/* - * GUNC: Detection and quantification of genome chimerism based on lineage homogeneity - */ - -include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' -include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' -include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' - -workflow GUNC_QC { - take: - ch_bins // [ [ meta] , fasta ], input bins (mandatory) - ch_gunc_db // [ db ], presupplied GUNC database (optional) - ch_checkm_table // [ [ meta ], checkm_qa_table ], extended checkm table from CHECKM_QA, (optional) - - main: - ch_versions = Channel.empty() - - if ( params.gunc_db ) { - ch_db_for_gunc = ch_gunc_db - } else { - ch_db_for_gunc = GUNC_DOWNLOADDB( params.gunc_database_type ).db - ch_versions.mix( GUNC_DOWNLOADDB.out.versions ) - } - - - GUNC_RUN ( ch_bins, ch_db_for_gunc ) - ch_versions.mix( GUNC_RUN.out.versions ) - - // Make sure to keep directory in sync with modules.conf - GUNC_RUN.out.maxcss_level_tsv - .map{it[1]} - .collectFile(name: "gunc_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") - - if ( params.binqc_tool == 'checkm' ) { - - ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv - .combine(ch_checkm_table, by: 0) - - GUNC_MERGECHECKM ( ch_input_to_mergecheckm ) - ch_versions.mix( GUNC_MERGECHECKM.out.versions ) - - // Make sure to keep directory in sync with modules.conf - GUNC_MERGECHECKM.out.tsv - .map{it[1]} - .collectFile(name: "gunc_checkm_summary.tsv", keepHeader: true, storeDir: 
"${params.outdir}/GenomeBinning/QC/") - } - - emit: - versions = ch_versions - -} diff --git a/workflows/mag.nf b/workflows/mag.nf index 3def8207..760f993f 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -17,7 +17,6 @@ include { BINNING } from '../subwo include { BIN_QC } from '../subworkflows/local/bin_qc' include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' -include { GUNC_QC } from '../subworkflows/local/gunc_qc' include { GTDBTK } from '../subworkflows/local/gtdbtk' include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' @@ -649,7 +648,7 @@ workflow MAG { ================================================================================ */ - bin_qc_summary = Channel.empty() + ch_bin_qc_summary = Channel.empty() if (!params.skip_binning || params.ancient_dna) { BINNING_PREPARATION( @@ -792,32 +791,17 @@ workflow MAG { * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, CHECKM2, and/or GUNC */ - ch_input_bins_for_qc = ch_input_for_postbinning.transpose() - BIN_QC( - ch_input_bins_for_qc, + ch_input_for_postbinning, ch_checkm_db, ch_checkm2_db, - ch_busco_db + ch_busco_db, + ch_gunc_db ) - bin_qc_summary = BIN_QC.out.summary + ch_bin_qc_summary = BIN_QC.out.qc_summary ch_versions = ch_versions.mix(BIN_QC.out.versions) - if (params.run_gunc) { - ch_input_bins_for_gunc = ch_input_for_postbinning.filter { meta, _bins -> - meta.domain != "eukarya" - } - - GUNC_QC( - ch_input_bins_for_gunc, - ch_gunc_db, - params.binqc_tool == 'checkm' ? 
BIN_QC.out.checkm_tsv : [] - ) - - ch_versions = ch_versions.mix(GUNC_QC.out.versions) - } - ch_quast_bins_summary = Channel.empty() if (!params.skip_quast) { ch_input_for_quast_bins = ch_input_for_postbinning @@ -886,7 +870,7 @@ workflow MAG { GTDBTK( ch_gtdb_bins, - bin_qc_summary, + ch_bin_qc_summary, gtdb, gtdb_mash ) @@ -901,7 +885,7 @@ workflow MAG { if ((!params.skip_binqc) || !params.skip_quast || !params.skip_gtdbtk) { BIN_SUMMARY( ch_input_for_binsummary, - bin_qc_summary.ifEmpty([]), + ch_bin_qc_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), ch_cat_global_summary.ifEmpty([]) @@ -1043,7 +1027,7 @@ workflow MAG { } if (!params.skip_binning && !params.skip_binqc && params.binqc_tool == 'busco') { - ch_multiqc_files = ch_multiqc_files.mix(BIN_QC.out.multiqc.collect().ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(BIN_QC.out.multiqc_files.collect().ifEmpty([])) } From f0a6999bcd2eadfb26a5a38eb4de2438026ec083 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Fri, 1 Nov 2024 02:52:20 -0300 Subject: [PATCH 16/33] Code style improvements --- subworkflows/local/bin_qc.nf | 44 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index b2f78e7b..c1b5ace6 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -46,12 +46,11 @@ workflow BIN_QC { // Set meta to match expected channel cardinality for BUSCO ch_db_for_busco = Channel .of(ch_busco_db) - .map { db -> + .collect { db -> def basename = db.getBaseName() def lineage = basename.contains('odb10') ? 
'Y' : 'N' [[id: basename, lineage: lineage], db] } - .collect() } } else { @@ -74,13 +73,13 @@ workflow BIN_QC { BUSCO(ch_input_bins_for_qc, ch_db_for_busco) BUSCO_SUMMARY( - BUSCO.out.summary_domain.map { it[1] }.collect().ifEmpty([]), - BUSCO.out.summary_specific.map { it[1] }.collect().ifEmpty([]), - BUSCO.out.failed_bin.map { it[1] }.collect().ifEmpty([]) + BUSCO.out.summary_domain.collect { v -> v[1] }.ifEmpty([]), + BUSCO.out.summary_specific.collect { v -> v[1] }.ifEmpty([]), + BUSCO.out.failed_bin.collect { v -> v[1] }.ifEmpty([]) ) ch_multiqc_files = ch_multiqc_files.mix( - BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map{ it[1] } + BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { it[1] } ) qc_summary = BUSCO_SUMMARY.out.summary ch_versions = ch_versions.mix(BUSCO.out.versions.first()) @@ -92,11 +91,11 @@ workflow BIN_QC { ch_bins_for_checkmlineagewf = ch_input_bins_for_qc .groupTuple() .filter { meta, _bins -> - meta.domain != "eukarya" - } + meta.domain != "eukarya" + } .multiMap { meta, fa -> reads: [meta, fa] - ext: fa.extension.unique().join("") // the pipeline ensures that all bins will have the same extension + ext: fa.extension.unique().join("") } CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, ch_checkm_db) @@ -110,7 +109,7 @@ workflow BIN_QC { CHECKM_QA(ch_checkmqa_input, []) - COMBINE_BINQC_TSV(CHECKM_QA.out.output.map { it[1] }.collect()) + COMBINE_BINQC_TSV(CHECKM_QA.out.output.collect { v -> v[1] }) qc_summary = COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) @@ -121,7 +120,7 @@ workflow BIN_QC { */ CHECKM2_PREDICT(ch_input_bins_for_qc.groupTuple(), ch_checkm2_db) - COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.map { it[1] }.collect()) + COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.collect { v -> v[1] }) qc_summary = COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) @@ -139,7 
+138,7 @@ workflow BIN_QC { bins.collect { bin -> [meta, bin] } } - if ( params.gunc_db ) { + if (params.gunc_db) { ch_db_for_gunc = ch_gunc_db } else { @@ -152,10 +151,14 @@ workflow BIN_QC { // Make sure to keep directory in sync with modules.conf GUNC_RUN.out.maxcss_level_tsv - .map{it[1]} - .collectFile(name: "gunc_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") - - if ( params.binqc_tool == 'checkm' ) { + .map { v -> v[1] } + .collectFile( + name: "gunc_summary.tsv", + keepHeader: true, + storeDir: "${params.outdir}/GenomeBinning/QC/" + ) + + if (params.binqc_tool == 'checkm') { ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv.combine(CHECKM_QA.out.output, by: 0) GUNC_MERGECHECKM(ch_input_to_mergecheckm) @@ -163,12 +166,15 @@ workflow BIN_QC { // Make sure to keep directory in sync with modules.conf GUNC_MERGECHECKM.out.tsv - .map{it[1]} - .collectFile(name: "gunc_checkm_summary.tsv", keepHeader: true, storeDir: "${params.outdir}/GenomeBinning/QC/") + .map { v -> v[1] } + .collectFile( + name: "gunc_checkm_summary.tsv", + keepHeader: true, + storeDir: "${params.outdir}/GenomeBinning/QC/" + ) } } - emit: qc_summary = qc_summary multiqc_files = ch_multiqc_files From 3158c5243e3eb4ca924e5e3df8c8e19aa3f3c866 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Tue, 26 Nov 2024 23:32:44 -0300 Subject: [PATCH 17/33] Address several review comments --- conf/modules.config | 3 +- docs/output.md | 4 +- .../nf-core/checkm2/databasedownload/meta.yml | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- subworkflows/local/bin_qc.nf | 52 +++++++++---------- 6 files changed, 33 insertions(+), 32 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 34d67480..bcad2756 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -435,7 +435,8 @@ process { publishDir = [ path: { "${params.outdir}/GenomeBinning/QC/CheckM2/checkm2_downloads" }, mode: params.publish_dir_mode, overwrite: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_checkm2_data + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.save_checkm2_data ] } diff --git a/docs/output.md b/docs/output.md index eea1ba38..b4d487e4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -583,14 +583,14 @@ If the parameter `--save_checkm_reference` is set, additionally the used the Che #### CheckM2 -[CheckM2](https://github.com/chklovski/CheckM2) is atool for assessing the quality of metagenome-derived genomes. It uses a machine learning approach to predict the completeness and contamination of a genome regardless of its taxonomic lineage. +[CheckM2](https://github.com/chklovski/CheckM2) is a tool for assessing the quality of metagenome-derived genomes. It uses a machine learning approach to predict the completeness and contamination of a genome regardless of its taxonomic lineage.
Output files - `GenomeBinning/QC/CheckM2/` - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/quality_report.tsv`: Detailed statistics about bins informing completeness and contamamination scores. This should normally be your main file to use to evaluate your results. - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM2 results, including CheckM2 generated annotations, log, and Diamond alignment results. + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM2 results, including CheckM2 generated annotations, log, and DIAMOND alignment results. - `GenomeBinning/QC/` - `checkm2_summary.tsv`: A summary table of the CheckM2 results for all bins. diff --git a/modules/nf-core/checkm2/databasedownload/meta.yml b/modules/nf-core/checkm2/databasedownload/meta.yml index 632b4922..b79ccfc6 100644 --- a/modules/nf-core/checkm2/databasedownload/meta.yml +++ b/modules/nf-core/checkm2/databasedownload/meta.yml @@ -20,7 +20,7 @@ tools: input: - - db_zenodo_id: type: integer - description: Zenodo ID of the CheckM2 database to download + description: Zenodo record ID of the CheckM2 database to download output: - database: diff --git a/nextflow.config b/nextflow.config index c1d8d896..cdb2d9f0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -136,7 +136,7 @@ params { checkm_db = null save_checkm_data = false checkm2_db = null - checkm2_db_version = 5571251 + checkm2_db_version = 5571251 // corresponds to Zenodo record ID save_checkm2_data = false run_gunc = false gunc_database_type = 'progenomes' diff --git a/nextflow_schema.json b/nextflow_schema.json index ae2be3ed..0616173d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -783,7 +783,7 @@ "checkm2_db_version": { "type": "integer", "default": 5571251, - "description": "CheckM2 database version number to download (Zenodo ID, for reference check https://zenodo.org/records/5571251)." 
+ "description": "CheckM2 database version number to download (Zenodo record ID, for reference check https://zenodo.org/records/5571251)." }, "save_checkm2_data": { "type": "boolean", diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index c1b5ace6..1211c36c 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -2,26 +2,26 @@ * BUSCO/CheckM/CheckM2/GUNC: Quantitative measures for the assessment of genome assembly */ -include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' -include { BUSCO } from '../../modules/local/busco' -include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' -include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' -include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' -include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' -include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' -include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv' -include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' -include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' -include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' +include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' +include { BUSCO } from '../../modules/local/busco' +include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' +include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' +include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' +include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' +include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' +include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv' +include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' +include { GUNC_RUN } from 
'../../modules/nf-core/gunc/run/main' +include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' workflow BIN_QC { take: - ch_bins // [ [ meta] , fasta ], input bins (mandatory) - ch_checkm_db // [ db ], presupplied CheckM database (optional) - ch_checkm2_db // [ [meta] , db ], presupplied CheckM2 database (optional) - ch_busco_db // [ [meta] , db ], presupplied BUSCO database (optional) - ch_gunc_db // [ db ], presupplied GUNC database (optional) + ch_bins // [ [ meta] , fasta ], input bins (mandatory) + ch_checkm_db // [ db ], presupplied CheckM database (optional) + ch_checkm2_db // [ [meta] , db ], presupplied CheckM2 database (optional) + ch_busco_db // [ [meta] , db ], presupplied BUSCO database (optional) + ch_gunc_db // [ db ], presupplied GUNC database (optional) main: qc_summary = [] @@ -73,9 +73,9 @@ workflow BIN_QC { BUSCO(ch_input_bins_for_qc, ch_db_for_busco) BUSCO_SUMMARY( - BUSCO.out.summary_domain.collect { v -> v[1] }.ifEmpty([]), - BUSCO.out.summary_specific.collect { v -> v[1] }.ifEmpty([]), - BUSCO.out.failed_bin.collect { v -> v[1] }.ifEmpty([]) + BUSCO.out.summary_domain.collect { _meta, summary -> summary }.ifEmpty([]), + BUSCO.out.summary_specific.collect { _meta, summary -> summary }.ifEmpty([]), + BUSCO.out.failed_bin.collect { _meta, summary -> summary }.ifEmpty([]) ) ch_multiqc_files = ch_multiqc_files.mix( @@ -109,7 +109,7 @@ workflow BIN_QC { CHECKM_QA(ch_checkmqa_input, []) - COMBINE_BINQC_TSV(CHECKM_QA.out.output.collect { v -> v[1] }) + COMBINE_BINQC_TSV(CHECKM_QA.out.output.collect { summary -> summary[1] }) qc_summary = COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) @@ -120,7 +120,7 @@ workflow BIN_QC { */ CHECKM2_PREDICT(ch_input_bins_for_qc.groupTuple(), ch_checkm2_db) - COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.collect { v -> v[1] }) + COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.collect { summary -> summary[1] }) qc_summary = 
COMBINE_BINQC_TSV.out.combined ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) @@ -151,7 +151,7 @@ workflow BIN_QC { // Make sure to keep directory in sync with modules.conf GUNC_RUN.out.maxcss_level_tsv - .map { v -> v[1] } + .map { _meta, gunc_summary -> gunc_summary } .collectFile( name: "gunc_summary.tsv", keepHeader: true, @@ -166,7 +166,7 @@ workflow BIN_QC { // Make sure to keep directory in sync with modules.conf GUNC_MERGECHECKM.out.tsv - .map { v -> v[1] } + .map { _meta, gunc_checkm_summary -> gunc_checkm_summary } .collectFile( name: "gunc_checkm_summary.tsv", keepHeader: true, @@ -176,7 +176,7 @@ workflow BIN_QC { } emit: - qc_summary = qc_summary - multiqc_files = ch_multiqc_files - versions = ch_versions + qc_summary = qc_summary + multiqc_files = ch_multiqc_files + versions = ch_versions } From 6152696abd8cccdccf265890cd75e77aece30cad Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Tue, 26 Nov 2024 23:34:31 -0300 Subject: [PATCH 18/33] Update modules --- modules.json | 8 +- .../nf-core/checkm/lineagewf/environment.yml | 5 + modules/nf-core/checkm/lineagewf/main.nf | 36 ++-- modules/nf-core/checkm/lineagewf/meta.yml | 96 ++++++---- .../checkm/lineagewf/tests/main.nf.test | 61 ++++++ .../checkm/lineagewf/tests/main.nf.test.snap | 99 ++++++++++ .../nf-core/checkm/lineagewf/tests/tags.yml | 2 + modules/nf-core/checkm/qa/environment.yml | 5 + modules/nf-core/checkm/qa/main.nf | 33 ++-- modules/nf-core/checkm/qa/meta.yml | 91 +++++---- modules/nf-core/checkm/qa/tests/main.nf.test | 88 +++++++++ .../nf-core/checkm/qa/tests/main.nf.test.snap | 96 ++++++++++ modules/nf-core/checkm/qa/tests/tags.yml | 3 + .../nf-core/gunc/mergecheckm/environment.yml | 6 + modules/nf-core/gunc/mergecheckm/main.nf | 16 +- modules/nf-core/gunc/mergecheckm/meta.yml | 61 +++--- .../gunc/mergecheckm/tests/main.nf.test | 175 ++++++++++++++++++ .../gunc/mergecheckm/tests/main.nf.test.snap | 68 +++++++ .../gunc/mergecheckm/tests/nextflow.config | 5 + 
.../nf-core/gunc/mergecheckm/tests/tags.yml | 6 + modules/nf-core/gunc/run/environment.yml | 6 + modules/nf-core/gunc/run/main.nf | 21 ++- modules/nf-core/gunc/run/meta.yml | 75 ++++---- modules/nf-core/gunc/run/tests/main.nf.test | 96 ++++++++++ .../nf-core/gunc/run/tests/main.nf.test.snap | 90 +++++++++ modules/nf-core/gunc/run/tests/tags.yml | 3 + 26 files changed, 1076 insertions(+), 175 deletions(-) create mode 100644 modules/nf-core/checkm/lineagewf/environment.yml create mode 100644 modules/nf-core/checkm/lineagewf/tests/main.nf.test create mode 100644 modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap create mode 100644 modules/nf-core/checkm/lineagewf/tests/tags.yml create mode 100644 modules/nf-core/checkm/qa/environment.yml create mode 100644 modules/nf-core/checkm/qa/tests/main.nf.test create mode 100644 modules/nf-core/checkm/qa/tests/main.nf.test.snap create mode 100644 modules/nf-core/checkm/qa/tests/tags.yml create mode 100644 modules/nf-core/gunc/mergecheckm/environment.yml create mode 100644 modules/nf-core/gunc/mergecheckm/tests/main.nf.test create mode 100644 modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap create mode 100644 modules/nf-core/gunc/mergecheckm/tests/nextflow.config create mode 100644 modules/nf-core/gunc/mergecheckm/tests/tags.yml create mode 100644 modules/nf-core/gunc/run/environment.yml create mode 100644 modules/nf-core/gunc/run/tests/main.nf.test create mode 100644 modules/nf-core/gunc/run/tests/main.nf.test.snap create mode 100644 modules/nf-core/gunc/run/tests/tags.yml diff --git a/modules.json b/modules.json index 8a7a3ef3..64f8d479 100644 --- a/modules.json +++ b/modules.json @@ -54,12 +54,12 @@ }, "checkm/lineagewf": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3ea318161b8788623cec477bde0f089180b2245b", "installed_by": ["modules"] }, "checkm/qa": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": 
"3ea318161b8788623cec477bde0f089180b2245b", "installed_by": ["modules"] }, "checkm2/databasedownload": { @@ -154,12 +154,12 @@ }, "gunc/mergecheckm": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "b6515a01897b11b64b3368858c0359b4c813ad1e", "installed_by": ["modules"] }, "gunc/run": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "b6515a01897b11b64b3368858c0359b4c813ad1e", "installed_by": ["modules"] }, "gunzip": { diff --git a/modules/nf-core/checkm/lineagewf/environment.yml b/modules/nf-core/checkm/lineagewf/environment.yml new file mode 100644 index 00000000..1b870502 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm-genome=1.2.3 diff --git a/modules/nf-core/checkm/lineagewf/main.nf b/modules/nf-core/checkm/lineagewf/main.nf index d8674ddc..67fd8f35 100644 --- a/modules/nf-core/checkm/lineagewf/main.nf +++ b/modules/nf-core/checkm/lineagewf/main.nf @@ -2,10 +2,10 @@ process CHECKM_LINEAGEWF { tag "$meta.id" label 'process_medium' - conda "bioconda::checkm-genome=1.2.1" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.3--pyhdfd78af_1' : + 'biocontainers/checkm-genome:1.2.3--pyhdfd78af_1' }" input: tuple val(meta), path(fasta, stageAs: "input_bins/*") @@ -22,22 +22,34 @@ process CHECKM_LINEAGEWF { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - prefix = task.ext.prefix ?: "${meta.id}" - checkm_db = db ? "export CHECKM_DATA_PATH=${db}" : "" + def args = task.ext.args ?: '' + def checkm_db = db ? 
"export CHECKM_DATA_PATH=${db}" : "" + prefix = task.ext.prefix ?: "${meta.id}" """ - $checkm_db + ${checkm_db} checkm \\ lineage_wf \\ - -t $task.cpus \\ + -t ${task.cpus} \\ -f ${prefix}.tsv \\ --tab_table \\ - --pplacer_threads $task.cpus \\ - -x $fasta_ext \\ - $args \\ + --pplacer_threads ${task.cpus} \\ + -x ${fasta_ext} \\ + ${args} \\ input_bins/ \\ - $prefix + ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir ${prefix}/ + touch ${prefix}/lineage.ms ${prefix}.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/checkm/lineagewf/meta.yml b/modules/nf-core/checkm/lineagewf/meta.yml index 4716a3e9..e32441d2 100644 --- a/modules/nf-core/checkm/lineagewf/meta.yml +++ b/modules/nf-core/checkm/lineagewf/meta.yml @@ -1,5 +1,6 @@ name: checkm_lineagewf -description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. +description: CheckM provides a set of tools for assessing the quality of genomes recovered + from isolates, single cells, or metagenomes. keywords: - checkm - mag @@ -14,52 +15,69 @@ keywords: - genome bins tools: - checkm: - description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. + description: Assess the quality of microbial genomes recovered from isolates, + single cells, and metagenomes. homepage: https://ecogenomics.github.io/CheckM/ documentation: https://github.com/Ecogenomics/CheckM/wiki tool_dev_url: https://github.com/Ecogenomics/CheckM doi: "10.1101/gr.186072.114" licence: ["GPL v3"] - + identifier: biotools:checkm input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - fasta: - type: file - description: One or a list of multiple FASTA files of each bin, with extension defined with the fasta_ext value - pattern: "*.{$fasta_ext}" - - fasta_ext: - type: value - description: The file-type extension suffix of the input FASTA files (e.g., fasta, fna, fa, fas) - - db: - type: directory - description: Optional directory pointing to checkM database to prevent re-downloading - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: One or a list of multiple FASTA files of each bin, with extension + defined with the fasta_ext value + pattern: "*.{$fasta_ext}" + - - fasta_ext: + type: string + description: The file-type extension suffix of the input FASTA files (e.g., + fasta, fna, fa, fas) + - - db: + type: directory + description: Optional directory pointing to checkM database to prevent re-downloading output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'sample', bin:'1' ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - checkm_output: - type: directory - description: CheckM output directory - pattern: "*/" - checkm_output: - type: file - description: Lineage markfer file - pattern: "lineage.ms" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample', bin:'1' ] + - ${prefix}: + type: directory + description: CheckM output directory + pattern: "*/" + - marker_file: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'sample', bin:'1' ] + - ${prefix}/lineage.ms: + type: file + description: Lineage file + pattern: "*.ms" - checkm_tsv: - type: file - description: CheckM summary completeness statistics table - pattern: "*.tsv" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample', bin:'1' ] + - ${prefix}.tsv: + type: file + description: CheckM summary completeness statistics table + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/checkm/lineagewf/tests/main.nf.test b/modules/nf-core/checkm/lineagewf/tests/main.nf.test new file mode 100644 index 00000000..8d60100e --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + name "Test Process CHECKM_LINEAGEWF" + script "../main.nf" + process "CHECKM_LINEAGEWF" + + tag "modules" + tag "modules_nfcore" + tag "checkm" + tag "checkm/lineagewf" + + test("checkm - lineage_wf") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + then { + assert process.success + assert file(process.out.checkm_output[0][1]).list().find { file(it).name == "checkm.log" } + assert snapshot( + path(process.out.marker_file[0][1]).readLines().any{it.contains("PF00312.17")}, + process.out.checkm_tsv, + process.out.versions + ).match() + } + + } + + test("stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ 
+ } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } +} diff --git a/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap b/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap new file mode 100644 index 00000000..6d6d7f75 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap @@ -0,0 +1,99 @@ +{ + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ], + "checkm_output": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "checkm_tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "marker_file": [ + [ + { + "id": "test", + "single_end": false + }, + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:36:45.930077242" + }, + "checkm - lineage_wf": { + "content": [ + true, + [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d5559764f563c4b55223e4e4a3dc1ec9" + ] + ], + [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ] + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:27:36.491322471" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm/lineagewf/tests/tags.yml b/modules/nf-core/checkm/lineagewf/tests/tags.yml new file mode 100644 index 00000000..04438be8 --- /dev/null +++ 
b/modules/nf-core/checkm/lineagewf/tests/tags.yml @@ -0,0 +1,2 @@ +checkm/lineagewf: + - modules/nf-core/checkm/lineagewf/** diff --git a/modules/nf-core/checkm/qa/environment.yml b/modules/nf-core/checkm/qa/environment.yml new file mode 100644 index 00000000..1b870502 --- /dev/null +++ b/modules/nf-core/checkm/qa/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm-genome=1.2.3 diff --git a/modules/nf-core/checkm/qa/main.nf b/modules/nf-core/checkm/qa/main.nf index b0c0e69a..0255d95c 100644 --- a/modules/nf-core/checkm/qa/main.nf +++ b/modules/nf-core/checkm/qa/main.nf @@ -1,11 +1,11 @@ process CHECKM_QA { - tag "$meta.id" + tag "${meta.id}" label 'process_low' - conda "bioconda::checkm-genome=1.2.1" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.1--pyhdfd78af_0' : - 'biocontainers/checkm-genome:1.2.1--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.3--pyhdfd78af_1' : + 'biocontainers/checkm-genome:1.2.3--pyhdfd78af_1' }" input: tuple val(meta), path(analysis_dir), path(marker_file), path(coverage_file) @@ -23,18 +23,29 @@ process CHECKM_QA { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" suffix = task.ext.args?.matches(".*-o 9.*|.*--out_file 9.*") ? "fasta" : "txt" - def coverage = coverage_file ? "--coverage_file ${coverage_file}" : "" - def exclude = exclude_marker_file ? "--exclude_markers ${marker_filer}" : "" + def coverage = coverage_file.isFile() ? "--coverage_file ${coverage_file}" : "" + def exclude = exclude_marker_file && exclude_marker_file.isFile() ? 
"--exclude_markers ${exclude_marker_file}" : "" """ checkm \\ qa \\ --threads ${task.cpus} \\ --file ${prefix}.${suffix} \\ - $marker_file \\ - $analysis_dir \\ - $coverage \\ - $exclude \\ - $args + ${marker_file} \\ + ${analysis_dir} \\ + ${coverage} \\ + ${exclude} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt ${prefix}.fasta cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/checkm/qa/meta.yml b/modules/nf-core/checkm/qa/meta.yml index d0af39af..cd41eaec 100644 --- a/modules/nf-core/checkm/qa/meta.yml +++ b/modules/nf-core/checkm/qa/meta.yml @@ -1,5 +1,6 @@ name: checkm_qa -description: CheckM provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. +description: CheckM provides a set of tools for assessing the quality of genomes recovered + from isolates, single cells, or metagenomes. keywords: - checkm - mag @@ -16,52 +17,64 @@ keywords: - quality assurnce tools: - checkm: - description: Assess the quality of microbial genomes recovered from isolates, single cells, and metagenomes. + description: Assess the quality of microbial genomes recovered from isolates, + single cells, and metagenomes. homepage: https://ecogenomics.github.io/CheckM/ documentation: https://github.com/Ecogenomics/CheckM/wiki tool_dev_url: https://github.com/Ecogenomics/CheckM doi: "10.1101/gr.186072.114" licence: ["GPL v3"] - + identifier: biotools:checkm input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - analysis_dir: - type: file - description: Directory containing output of checkm/analyze or checkm/lineage_wf etc. 
- pattern: "*" - - marker_file: - type: file - description: Marker file specified during checkm/analyze or produced by checkm/{lineage,taxonomy}_wf - pattern: "*.ms" - - coverage_file: - type: file - description: File containing coverage of each sequence (generated by checkm coverage) - - exclude_marker_file: - type: file - description: File specifying markers to exclude from marker sets - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - analysis_dir: + type: file + description: Directory containing output of checkm/analyze or checkm/lineage_wf + etc. + pattern: "*" + - marker_file: + type: file + description: Marker file specified during checkm/analyze or produced by checkm/{lineage,taxonomy}_wf + pattern: "*.ms" + - coverage_file: + type: file + description: File containing coverage of each sequence (generated by checkm + coverage) + - - exclude_marker_file: + type: file + description: File specifying markers to exclude from marker sets output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - output: - type: file - description: "Default completeness statistics in various formats, as specified with --out_format (excluding option: 9)" - pattern: "*.txt" + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.txt: + type: file + description: "Default completeness statistics in various formats, as specified + with --out_format (excluding option: 9)" + pattern: "*.txt" - fasta: - type: file - description: Output in fasta format (only if --out_format 9) - pattern: "*.fasta" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - ${prefix}.fasta: + type: file + description: Output in fasta format (only if --out_format 9) + pattern: "*.fasta" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/checkm/qa/tests/main.nf.test b/modules/nf-core/checkm/qa/tests/main.nf.test new file mode 100644 index 00000000..8037bbc2 --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + name "Test Process CHECKM_QA" + script "../main.nf" + process "CHECKM_QA" + + tag "modules" + tag "modules_nfcore" + tag "checkm" + tag "checkm/qa" + tag "checkm/lineagewf" + + test("checkm - qa") { + + setup { + run("CHECKM_LINEAGEWF") { + script "../../lineagewf/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + } + + when { + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output.join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("stub") { + + options "-stub" + + setup { + run("CHECKM_LINEAGEWF") { + script "../../lineagewf/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + } + + when { + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output.join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } 
+ } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/checkm/qa/tests/main.nf.test.snap b/modules/nf-core/checkm/qa/tests/main.nf.test.snap new file mode 100644 index 00000000..77eca77b --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/main.nf.test.snap @@ -0,0 +1,96 @@ +{ + "checkm - qa": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,645f4282569afb4b171396732b2d2582" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ], + "fasta": [ + + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,645f4282569afb4b171396732b2d2582" + ] + ], + "versions": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:44:09.849072843" + }, + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:14:12.680834625" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm/qa/tests/tags.yml b/modules/nf-core/checkm/qa/tests/tags.yml new file mode 100644 index 00000000..08b4747b --- /dev/null +++ 
b/modules/nf-core/checkm/qa/tests/tags.yml @@ -0,0 +1,3 @@ +checkm/qa: + - modules/nf-core/checkm/lineagewf/** + - modules/nf-core/checkm/qa/** diff --git a/modules/nf-core/gunc/mergecheckm/environment.yml b/modules/nf-core/gunc/mergecheckm/environment.yml new file mode 100644 index 00000000..3a0264f4 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::gunc=1.0.6 diff --git a/modules/nf-core/gunc/mergecheckm/main.nf b/modules/nf-core/gunc/mergecheckm/main.nf index b6399f22..611f916c 100644 --- a/modules/nf-core/gunc/mergecheckm/main.nf +++ b/modules/nf-core/gunc/mergecheckm/main.nf @@ -2,10 +2,10 @@ process GUNC_MERGECHECKM { tag "$meta.id" label 'process_single' - conda "bioconda::gunc=1.0.5" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : - 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gunc:1.0.6--pyhdfd78af_0' : + 'biocontainers/gunc:1.0.6--pyhdfd78af_0' }" input: tuple val(meta), path(gunc_file), path(checkm_file) @@ -33,4 +33,14 @@ process GUNC_MERGECHECKM { gunc: \$( gunc --version ) END_VERSIONS """ + + stub: + """ + touch gunc_merge_checkm.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ } diff --git a/modules/nf-core/gunc/mergecheckm/meta.yml b/modules/nf-core/gunc/mergecheckm/meta.yml index a88298f7..4a7a2c1c 100644 --- a/modules/nf-core/gunc/mergecheckm/meta.yml +++ b/modules/nf-core/gunc/mergecheckm/meta.yml @@ -11,42 +11,45 @@ keywords: - chimeras tools: - gunc: - description: Python package for detection of chimerism and contamination in prokaryotic genomes. + description: Python package for detection of chimerism and contamination in prokaryotic + genomes. 
homepage: https://grp-bork.embl-community.io/gunc/ documentation: https://grp-bork.embl-community.io/gunc/ tool_dev_url: https://github.com/grp-bork/gunc doi: "10.1186/s13059-021-02393-0" licence: ["GNU General Public v3 or later (GPL v3+)"] - + identifier: biotools:gunc input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - gunc_file: - type: file - description: Path of a gunc_scores.tsv file (mandatory) - pattern: "*.{bam,cram,sam}" - - checkm_file: - type: file - description: Output TSV from CheckM qa (ideally with -o 2 extended format) (mandatory) - pattern: "*.{bam,cram,sam}" - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gunc_file: + type: file + description: Path of a gunc_scores.tsv file (mandatory) + pattern: "*.{bam,cram,sam}" + - checkm_file: + type: file + description: Output TSV from CheckM qa (ideally with -o 2 extended format) (mandatory) + pattern: "*.{bam,cram,sam}" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - tsv: - type: file - description: Merged checkm/gunc results in TSV format - pattern: "*.tsv" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.tsv": + type: file + description: Merged checkm/gunc results in TSV format + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gunc/mergecheckm/tests/main.nf.test b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test new file mode 100644 index 00000000..dbd67b90 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test @@ -0,0 +1,175 @@ +nextflow_process { + + name "Test Process GUNC_MERGECHECKM" + script "../main.nf" + process "GUNC_MERGECHECKM" + config "./nextflow.config" + + tag "modules_nfcore" + tag "modules" + tag "gunc" + tag "gunc/mergecheckm" + tag "gunc/run" + tag "gunc/downloaddb" + tag "checkm/lineagewf" + tag "checkm/qa" + + // commented out because GitHub runners are not able to run this test + // test("gunc - mergecheckm") { + + // setup { + // run("CHECKM_LINEAGEWF") { + // script "../../../checkm/lineagewf/main.nf" + // process { + // """ + // input[0] = [ + // [id: 'test'], // meta map + // file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // ) + // ] + // input[1] = 'fasta' + // input[2] = [] // Download CheckM database + // """ + // } + // } + + // run("CHECKM_QA") { + // script "../../../checkm/qa/main.nf" + // process { + // """ + // input[0] = CHECKM_LINEAGEWF.out.checkm_output + // .join(CHECKM_LINEAGEWF.out.marker_file) + // .map { sample_data -> sample_data + [file('NO_FILE')] } + // input[1] = [] + // """ + // } + // } + + // run("GUNC_DOWNLOADDB") { + // script "../../downloaddb/main.nf" + // process { + // """ + // input[0] = 'progenomes' + // """ + // } + // } + + // run("GUNC_RUN") { + // script "../../run/main.nf" + // process { + // """ + // input[0] = [ + // [id: 'test'], + // [file( + // params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // )] + // ] + // input[1] = GUNC_DOWNLOADDB.out.db + // """ + // } + // } + // } + + // when { + // params { + // outdir = "${launchDir}/tests/results" + // } + // process { + // """ + // input[0] = GUNC_RUN.out.maxcss_level_tsv.join(CHECKM_QA.out.output) + // """ + // } + // } + + // then { + // assertAll( + // { assert process.success }, + // { assert snapshot(process.out).match() } + // ) + // } + + // } + + test("gunc - mergecheckm - stub") { + + options "-stub" + + setup { + run("CHECKM_LINEAGEWF") { + script "../../../checkm/lineagewf/main.nf" + process { + """ + input[0] = [ + [id: 'test'], // meta map + file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + ) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + run("CHECKM_QA") { + script "../../../checkm/qa/main.nf" + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + run("GUNC_DOWNLOADDB") { + script "../../downloaddb/main.nf" + process { + """ + input[0] = 'progenomes' + """ + } + } + + run("GUNC_RUN") { + script "../../run/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + [file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + )] + ] + input[1] = GUNC_DOWNLOADDB.out.db + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = GUNC_RUN.out.maxcss_level_tsv.join(CHECKM_QA.out.output) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap new file mode 
100644 index 00000000..807c23f2 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "gunc - mergecheckm": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "GUNC_checkM.merged.tsv:md5,24cbd3c76a36cb90ac993c83525a2c1b" + ] + ], + "1": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ], + "tsv": [ + [ + { + "id": "test" + }, + "GUNC_checkM.merged.tsv:md5,24cbd3c76a36cb90ac993c83525a2c1b" + ] + ], + "versions": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-22T09:37:48.146410153" + }, + "gunc - mergecheckm - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "gunc_merge_checkm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ], + "tsv": [ + [ + { + "id": "test" + }, + "gunc_merge_checkm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-21T16:47:06.752273424" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/mergecheckm/tests/nextflow.config b/modules/nf-core/gunc/mergecheckm/tests/nextflow.config new file mode 100644 index 00000000..1e9ba3dc --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CHECKM_QA { + ext.args = '--tab_table' + } +} diff --git a/modules/nf-core/gunc/mergecheckm/tests/tags.yml b/modules/nf-core/gunc/mergecheckm/tests/tags.yml new file mode 100644 index 00000000..d05282f2 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/tags.yml @@ -0,0 +1,6 @@ +gunc/run: + - modules/nf-core/gunc/mergecheckm/** + - modules/nf-core/gunc/run/** + - modules/nf-core/gunc/downloaddb/** + - modules/nf-core/checkm/lineagewf/** + - modules/nf-core/checkm/qa/** 
diff --git a/modules/nf-core/gunc/run/environment.yml b/modules/nf-core/gunc/run/environment.yml new file mode 100644 index 00000000..3a0264f4 --- /dev/null +++ b/modules/nf-core/gunc/run/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::gunc=1.0.6 diff --git a/modules/nf-core/gunc/run/main.nf b/modules/nf-core/gunc/run/main.nf index 2f1167fa..9ee614e4 100644 --- a/modules/nf-core/gunc/run/main.nf +++ b/modules/nf-core/gunc/run/main.nf @@ -2,13 +2,13 @@ process GUNC_RUN { tag "$meta.id" label 'process_medium' - conda "bioconda::gunc=1.0.5" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : - 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gunc:1.0.6--pyhdfd78af_0' : + 'biocontainers/gunc:1.0.6--pyhdfd78af_0' }" input: - tuple val(meta), path(fasta) + tuple val(meta), path(fasta_files, stageAs: 'input_files/*') path(db) output: @@ -23,9 +23,10 @@ process GUNC_RUN { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ + ls input_files/* > input_files.txt gunc \\ run \\ - --input_fasta $fasta \\ + --input_file input_files.txt \\ --db_file $db \\ --threads $task.cpus \\ $args @@ -35,4 +36,14 @@ process GUNC_RUN { gunc: \$( gunc --version ) END_VERSIONS """ + + stub: + """ + touch maxCSS_level.tsv all_levels.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ } diff --git a/modules/nf-core/gunc/run/meta.yml b/modules/nf-core/gunc/run/meta.yml index 3a85e1fb..3ecc0b74 100644 --- a/modules/nf-core/gunc/run/meta.yml +++ b/modules/nf-core/gunc/run/meta.yml @@ -8,46 +8,55 @@ keywords: - chimeras tools: - gunc: - description: Python package for detection of chimerism and contamination in prokaryotic genomes. 
+ description: Python package for detection of chimerism and contamination in prokaryotic + genomes. homepage: https://grp-bork.embl-community.io/gunc/ documentation: https://grp-bork.embl-community.io/gunc/ tool_dev_url: https://github.com/grp-bork/gunc doi: "10.1186/s13059-021-02393-0" licence: ["GNU General Public v3 or later (GPL v3+)"] - + identifier: biotools:gunc input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fasta: - type: file - description: FASTA file containing contig (bins) - pattern: "*.fa" - - db: - type: file - description: GUNC database file - pattern: "*.dmnd" - + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta_files: + type: file + description: A list of FASTA files containing contig (bins) + pattern: "*.fa" + - - db: + type: file + description: GUNC database file + pattern: "*.dmnd" output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - - maxcss_levels_tsv: - type: file - description: Output file with scores for a taxonomic level with the highest CSS score - pattern: "*.tsv" + - maxcss_level_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*maxCSS_level.tsv": + type: file + description: Output file with results for the maximum CSS level + pattern: "*.tsv" - all_levels_tsv: - type: file - description: Optional output file with results for each taxonomic level - pattern: "*.tsv" - + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*all_levels.tsv": + type: file + description: Optional output file with results for each taxonomic level + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gunc/run/tests/main.nf.test b/modules/nf-core/gunc/run/tests/main.nf.test new file mode 100644 index 00000000..c1659f0c --- /dev/null +++ b/modules/nf-core/gunc/run/tests/main.nf.test @@ -0,0 +1,96 @@ +nextflow_process { + + name "Test Process GUNC_RUN" + script "../main.nf" + process "GUNC_RUN" + + tag "modules_nfcore" + tag "modules" + tag "gunc" + tag "gunc/run" + tag "gunc/downloaddb" + + // commented out because GitHub runners are not able to run this test + // test("gunc - run") { + + // setup { + // run("GUNC_DOWNLOADDB") { + // script "../../downloaddb/main.nf" + // process { + // """ + // input[0] = 'progenomes' + // """ + // } + // } + // } + + // when { + // params { + // outdir = "${launchDir}/tests/results" + // } + // process { + // """ + // input[0] = [ + // [id: 'test'], + // [file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // )] + // ] + // input[1] = GUNC_DOWNLOADDB.out.db + // """ + // } + // } + + // then { + // assertAll( + // { assert process.success }, + // { assert snapshot(process.out).match() } + // ) + // } + + // } + + test("gunc - run - stub") { + + options "-stub" + + setup { + run("GUNC_DOWNLOADDB") { + script "../../downloaddb/main.nf" + process { + """ + input[0] = 'progenomes' + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = [ + [id: 'test'], + [file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + )] + ] + input[1] = GUNC_DOWNLOADDB.out.db + """ + } + } + + then { + assertAll( 
+ { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/gunc/run/tests/main.nf.test.snap b/modules/nf-core/gunc/run/tests/main.nf.test.snap new file mode 100644 index 00000000..516425c8 --- /dev/null +++ b/modules/nf-core/gunc/run/tests/main.nf.test.snap @@ -0,0 +1,90 @@ +{ + "gunc - run - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "maxCSS_level.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "all_levels.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ], + "all_levels_tsv": [ + [ + { + "id": "test" + }, + "all_levels.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "maxcss_level_tsv": [ + [ + { + "id": "test" + }, + "maxCSS_level.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-21T17:29:46.904708749" + }, + "gunc - run": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "GUNC.progenomes_2.1.maxCSS_level.tsv:md5,938826458a44404d0bf2e7cb4edde405" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ], + "all_levels_tsv": [ + + ], + "maxcss_level_tsv": [ + [ + { + "id": "test" + }, + "GUNC.progenomes_2.1.maxCSS_level.tsv:md5,938826458a44404d0bf2e7cb4edde405" + ] + ], + "versions": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-22T10:12:03.813571948" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/run/tests/tags.yml b/modules/nf-core/gunc/run/tests/tags.yml new file mode 100644 index 00000000..0af96444 --- /dev/null +++ b/modules/nf-core/gunc/run/tests/tags.yml @@ -0,0 +1,3 @@ +gunc/run: + - 
modules/nf-core/gunc/run/** + - modules/nf-core/gunc/downloaddb/** From ef1828c6924437067026393458c46b5bbfa98c1c Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Tue, 26 Nov 2024 23:43:17 -0300 Subject: [PATCH 19/33] Make binqc_tool an input for BIN_SUMMARY --- modules/local/bin_summary.nf | 42 +++++++++++++++++++----------------- workflows/mag.nf | 3 ++- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf index 9208822a..07784d83 100644 --- a/modules/local/bin_summary.nf +++ b/modules/local/bin_summary.nf @@ -1,34 +1,36 @@ process BIN_SUMMARY { conda "conda-forge::pandas=1.4.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : - 'biocontainers/pandas:1.4.3' }" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' + : 'biocontainers/pandas:1.4.3'}" input: - path(bin_depths) - path(binqc_sum) - path(quast_sum) - path(gtdbtk_sum) - path(cat_sum) + path bin_depths + path binqc_sum + path quast_sum + path gtdbtk_sum + path cat_sum + val binqc_tool output: - path("bin_summary.tsv"), emit: summary - path "versions.yml" , emit: versions + path "bin_summary.tsv", emit: summary + path "versions.yml" , emit: versions script: - def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : "" - def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" + def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : "" + def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : "" - def cat_summary = cat_sum.sort().size() > 0 ? 
"--cat_summary ${cat_sum}" : "" + def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : "" """ - combine_tables.py --depths_summary ${bin_depths} \ - $binqc_summary \ - $quast_summary \ - $gtdbtk_summary \ - $cat_summary \ - --binqc_tool ${params.binqc_tool} \ - --out bin_summary.tsv + combine_tables.py \ + --depths_summary ${bin_depths} \ + ${binqc_summary} \ + ${quast_summary} \ + ${gtdbtk_summary} \ + ${cat_summary} \ + --binqc_tool ${binqc_tool} \ + --out bin_summary.tsv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/mag.nf b/workflows/mag.nf index 8b343c8c..d000db0f 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -890,7 +890,8 @@ workflow MAG { ch_bin_qc_summary.ifEmpty([]), ch_quast_bins_summary.ifEmpty([]), ch_gtdbtk_summary.ifEmpty([]), - ch_cat_global_summary.ifEmpty([]) + ch_cat_global_summary.ifEmpty([]), + params.binqc_tool ) } From bcae7f9952cd80b66c86c0542a478c41d0d52c44 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Wed, 27 Nov 2024 00:01:12 -0300 Subject: [PATCH 20/33] Move bin qc database setup to the subworkflow --- subworkflows/local/bin_qc.nf | 57 ++++++++++++++++++++++++++++++++---- workflows/mag.nf | 50 +------------------------------ 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index 1211c36c..15ead257 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -2,6 +2,8 @@ * BUSCO/CheckM/CheckM2/GUNC: Quantitative measures for the assessment of genome assembly */ +include { ARIA2 as ARIA2_UNTAR } from '../../modules/nf-core/aria2/main' +include { CHECKM2_DATABASEDOWNLOAD } from '../../modules/nf-core/checkm2/databasedownload/main' include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' include { BUSCO } from '../../modules/local/busco' include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' @@ -17,11 +19,7 @@ include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/me workflow BIN_QC { take: - ch_bins // [ [ meta] , fasta ], input bins (mandatory) - ch_checkm_db // [ db ], presupplied CheckM database (optional) - ch_checkm2_db // [ [meta] , db ], presupplied CheckM2 database (optional) - ch_busco_db // [ [meta] , db ], presupplied BUSCO database (optional) - ch_gunc_db // [ db ], presupplied GUNC database (optional) + ch_bins // [ [ meta] , fasta ], input bins (mandatory) main: qc_summary = [] @@ -30,6 +28,55 @@ workflow BIN_QC { ch_multiqc_files = Channel.empty() + /* + ================================ + * Setup databases + ================================ + */ + + if (params.busco_db) { + ch_busco_db = file(params.busco_db, checkIfExists: true) + } + else { + ch_busco_db = [] + } + + if (params.checkm_db) { + ch_checkm_db = file(params.checkm_db, checkIfExists: true) + } + else if (!params.skip_binqc && params.binqc_tool == 'checkm') { + ARIA2_UNTAR(params.checkm_download_url) + ch_checkm_db = 
ARIA2_UNTAR.out.downloaded_file + } + else { + ch_checkm_db = [] + } + + if (params.checkm2_db) { + ch_checkm2_db = [[:], file(params.checkm2_db, checkIfExists: true)] + } + else if (!params.skip_binqc && params.binqc_tool == 'checkm2') { + CHECKM2_DATABASEDOWNLOAD(params.checkm2_db_version) + ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database + } + else { + ch_checkm2_db = [] + } + + if (params.gunc_db) { + ch_gunc_db = file(params.gunc_db, checkIfExists: true) + } + else { + ch_gunc_db = Channel.empty() + } + + + /* + ================================ + * Run QC tools + ================================ + */ + if (params.binqc_tool == "busco") { /* * BUSCO diff --git a/workflows/mag.nf b/workflows/mag.nf index d000db0f..049cf26f 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -26,7 +26,6 @@ include { LONGREAD_PREPROCESSING } from '../subwo // // MODULE: Installed directly from nf-core/modules // -include { ARIA2 as ARIA2_UNTAR } from '../modules/nf-core/aria2/main' include { FASTQC as FASTQC_RAW } from '../modules/nf-core/fastqc/main' include { FASTQC as FASTQC_TRIMMED } from '../modules/nf-core/fastqc/main' include { SEQTK_MERGEPE } from '../modules/nf-core/seqtk/mergepe/main' @@ -50,7 +49,6 @@ include { PRODIGAL } from '../modul include { PROKKA } from '../modules/nf-core/prokka/main' include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' include { METAEUK_EASYPREDICT } from '../modules/nf-core/metaeuk/easypredict/main' -include { CHECKM2_DATABASEDOWNLOAD } from '../modules/nf-core/checkm2/databasedownload/main' // // MODULE: Local to the pipeline @@ -102,34 +100,6 @@ workflow MAG { ch_host_fasta = Channel.empty() } - if (params.busco_db) { - ch_busco_db = file(params.busco_db, checkIfExists: true) - } - else { - ch_busco_db = [] - } - - if (params.checkm_db) { - ch_checkm_db = file(params.checkm_db, checkIfExists: true) - } - else { - ch_checkm_db = [] - } - - if (params.checkm2_db) { - ch_checkm2_db = [[:], 
file(params.checkm2_db, checkIfExists: true)] - } - else { - ch_checkm2_db = [] - } - - if (params.gunc_db) { - ch_gunc_db = file(params.gunc_db, checkIfExists: true) - } - else { - ch_gunc_db = Channel.empty() - } - if (params.kraken2_db) { ch_kraken2_db_file = file(params.kraken2_db, checkIfExists: true) } @@ -185,18 +155,6 @@ workflow MAG { ch_metaeuk_db = Channel.empty() } - // Get checkM database if not supplied - if (!params.skip_binqc && params.binqc_tool == 'checkm' && !params.checkm_db) { - ARIA2_UNTAR(params.checkm_download_url) - ch_checkm_db = ARIA2_UNTAR.out.downloaded_file - } - - // Get CheckM2 database if not supplied - if (!params.skip_binqc && params.binqc_tool == 'checkm2' && !params.checkm2_db) { - CHECKM2_DATABASEDOWNLOAD(params.checkm2_db_version) - ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database - } - // Get mmseqs db for MetaEuk if requested if (!params.skip_metaeuk && params.metaeuk_mmseqs_db) { MMSEQS_DATABASES(params.metaeuk_mmseqs_db) @@ -793,13 +751,7 @@ workflow MAG { * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, CHECKM2, and/or GUNC */ - BIN_QC( - ch_input_for_postbinning, - ch_checkm_db, - ch_checkm2_db, - ch_busco_db, - ch_gunc_db - ) + BIN_QC(ch_input_for_postbinning) ch_bin_qc_summary = BIN_QC.out.qc_summary ch_versions = ch_versions.mix(BIN_QC.out.versions) From 158ee12d01607851f3766e156491f5c91fcb22e9 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Wed, 27 Nov 2024 00:06:10 -0300 Subject: [PATCH 21/33] Fix db_zenodo_id description in checkm2/databasedownload meta.yml --- modules/nf-core/checkm2/databasedownload/meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/checkm2/databasedownload/meta.yml b/modules/nf-core/checkm2/databasedownload/meta.yml index b79ccfc6..632b4922 100644 --- a/modules/nf-core/checkm2/databasedownload/meta.yml +++ b/modules/nf-core/checkm2/databasedownload/meta.yml @@ -20,7 +20,7 @@ tools: input: - - db_zenodo_id: type: integer - description: Zenodo record ID of the CheckM2 database to download + description: Zenodo ID of the CheckM2 database to download output: - database: From 5738215ece2efa872efb16c8b430e50290adb5db Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Wed, 27 Nov 2024 00:19:30 -0300 Subject: [PATCH 22/33] Remove another implicit `it` closure parameter in bin_qc --- subworkflows/local/bin_qc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index 15ead257..887990ed 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -126,7 +126,7 @@ workflow BIN_QC { ) ch_multiqc_files = ch_multiqc_files.mix( - BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { it[1] } + BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { _meta, summary -> summary } ) qc_summary = BUSCO_SUMMARY.out.summary ch_versions = ch_versions.mix(BUSCO.out.versions.first()) From 49ebb712d80e9808afa00b454912af6a93c24293 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Wed, 27 Nov 2024 00:30:31 -0300 Subject: [PATCH 23/33] Remove checkm/checkm2 ci tests --- .github/workflows/ci.yml | 70 ---------------------------------------- 1 file changed, 70 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ccc072c0..4d71c8f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -130,73 +130,3 @@ jobs: - name: Run pipeline with ${{ matrix.test_name }} test profile run: | nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},docker --outdir ./results - - checkm: - name: Run single test to checkm due to database download - # Only run on push if this is the nf-core dev branch (merged PRs) - if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/mag') }} - runs-on: ubuntu-latest - - steps: - - name: Free some space - run: | - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: Download and prepare CheckM database - run: | - mkdir -p databases/checkm - wget https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz -P databases/checkm - tar xzvf databases/checkm/checkm_data_2015_01_16.tar.gz -C databases/checkm/ - - - name: Run pipeline with ${{ matrix.profile }} test profile - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --binqc_tool checkm --checkm_db databases/checkm - - checkm2: - name: Run single test to checkm2 due to database download - # Only run on push if this is the nf-core dev branch (merged PRs) - if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/mag') }} - 
runs-on: ubuntu-latest - - steps: - - name: Free some space - run: | - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - - - name: Check out pipeline code - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: Download and prepare CheckM2 database - run: | - mkdir -p databases/checkm2 - wget https://zenodo.org/records/5571251/files/checkm2_database.tar.gz -P databases/checkm2 - tar xzvf databases/checkm2/checkm2_database.tar.gz -C databases/checkm2/ - - - name: Run pipeline with ${{ matrix.profile }} test profile - run: | - nextflow run ${GITHUB_WORKSPACE} \ - -profile test,docker \ - --outdir ./results \ - --binqc_tool checkm2 \ - --checkm2_db databases/checkm2/CheckM2_database/uniref100.KO.1.dmnd From 4ecd36f3960e90f91030fcf0f50b3a356471542b Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Thu, 28 Nov 2024 11:36:10 -0300 Subject: [PATCH 24/33] gtdbtk: add check when params.busco_db is not defined --- subworkflows/local/gtdbtk.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index e2ee580b..d3d66d47 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -24,8 +24,7 @@ workflow GTDBTK { def completeness = -1 def contamination = -1 def missing, duplicated - def busco_db = file(params.busco_db) - if (busco_db.getBaseName().contains('odb10')) { + if (params.busco_db && file(params.busco_db).getBaseName().contains('odb10')) { missing = row.'%Missing (specific)' // TODO or just take '%Complete'? duplicated = row.'%Complete and duplicated (specific)' } else { From 1cf1da7c04ef623a6cff29a4f736853910097984 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Thu, 28 Nov 2024 14:11:16 -0300 Subject: [PATCH 25/33] Don't flatten bins for GUNC --- subworkflows/local/bin_qc.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index 887990ed..7348d855 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -181,9 +181,6 @@ workflow BIN_QC { .filter { meta, _bins -> meta.domain != "eukarya" } - .flatMap { meta, bins -> - bins.collect { bin -> [meta, bin] } - } if (params.gunc_db) { ch_db_for_gunc = ch_gunc_db From ea9075f700aecc612fdd2320dd85dfec17fb2440 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." Date: Thu, 28 Nov 2024 14:27:28 -0300 Subject: [PATCH 26/33] Update changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e5bef99..e899940a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,9 +23,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | Tool | Previous version | New version | | ------- | ---------------- | ----------- | +| CheckM | 1.2.1 | 1.2.3 | +| CheckM2 | | 1.0.2 | | chopper | | 0.9.0 | +| GUNC | 1.0.5 | 1.0.6 | | nanoq | | 0.10.0 | -| CheckM2 | | 1.0.2 | ### `Deprecated` From a079596d8632d00f86e2a6f6f89d43363d49b798 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Thu, 28 Nov 2024 14:45:28 -0300 Subject: [PATCH 27/33] Emit bash version on busco_save_download --- modules/local/busco_save_download.nf | 9 +++++++-- subworkflows/local/bin_qc.nf | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/modules/local/busco_save_download.nf b/modules/local/busco_save_download.nf index 74bcffbf..0962d38f 100644 --- a/modules/local/busco_save_download.nf +++ b/modules/local/busco_save_download.nf @@ -2,7 +2,7 @@ process BUSCO_SAVE_DOWNLOAD { // execute sequentially to avoid artefacts when saving files for multiple busco instances maxForks 1 - conda "conda-forge::sed=4.7" + conda "conda-forge::bash=5.2.21" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" @@ -11,9 +11,14 @@ process BUSCO_SAVE_DOWNLOAD { path(busco_downloads) output: - path('busco_downloads/**', includeInputs: true) + path('busco_downloads/**', includeInputs: true), emit: busco_files + path "versions.yml" , emit: versions script: """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(echo \$BASH_VERSION) + END_VERSIONS """ } diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index 7348d855..e78b7b3b 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -115,6 +115,8 @@ workflow BIN_QC { .toSortedList() .flatten() BUSCO_SAVE_DOWNLOAD(ch_downloads) + + ch_versions = ch_versions.mix(BUSCO_SAVE_DOWNLOAD.out.versions.first()) } BUSCO(ch_input_bins_for_qc, ch_db_for_busco) From 80641b05c6526a538f3811213a25a200938ec119 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Thu, 28 Nov 2024 15:58:23 -0300 Subject: [PATCH 28/33] Fix declaration --- modules/local/busco_save_download.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/local/busco_save_download.nf b/modules/local/busco_save_download.nf index 0962d38f..099c4150 100644 --- a/modules/local/busco_save_download.nf +++ b/modules/local/busco_save_download.nf @@ -3,16 +3,16 @@ process BUSCO_SAVE_DOWNLOAD { maxForks 1 conda "conda-forge::bash=5.2.21" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'nf-core/ubuntu:20.04' }" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' + : 'nf-core/ubuntu:20.04' }" input: path(busco_downloads) output: - path('busco_downloads/**', includeInputs: true), emit: busco_files - path "versions.yml" , emit: versions + path 'busco_downloads/**', includeInputs: true, emit: busco_files + path 'versions.yml' , emit: versions script: """ From 4b084b11dc94727dfcda7cf964cd297d28695f2e Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Sun, 1 Dec 2024 12:29:00 -0300 Subject: [PATCH 29/33] Update checkm/qa module --- modules.json | 2 +- modules/nf-core/checkm/qa/main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules.json b/modules.json index 64f8d479..25021c0d 100644 --- a/modules.json +++ b/modules.json @@ -59,7 +59,7 @@ }, "checkm/qa": { "branch": "master", - "git_sha": "3ea318161b8788623cec477bde0f089180b2245b", + "git_sha": "867961a8ef91135475ca48c83743646038be4196", "installed_by": ["modules"] }, "checkm2/databasedownload": { diff --git a/modules/nf-core/checkm/qa/main.nf b/modules/nf-core/checkm/qa/main.nf index 0255d95c..042f8b04 100644 --- a/modules/nf-core/checkm/qa/main.nf +++ b/modules/nf-core/checkm/qa/main.nf @@ -23,7 +23,7 @@ process CHECKM_QA { def args = task.ext.args ?: '' prefix = task.ext.prefix ?: "${meta.id}" suffix = task.ext.args?.matches(".*-o 9.*|.*--out_file 9.*") ? "fasta" : "txt" - def coverage = coverage_file.isFile() ? "--coverage_file ${coverage_file}" : "" + def coverage = coverage_file && coverage_file.isFile() ? "--coverage_file ${coverage_file}" : "" def exclude = exclude_marker_file && exclude_marker_file.isFile() ? "--exclude_markers ${exclude_marker_file}" : "" """ checkm \\ From d176fcee3f4833d5c1b7bd888a3288f79f4afc73 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Thu, 5 Dec 2024 10:37:11 -0300 Subject: [PATCH 30/33] Handle skip_binqc properly --- subworkflows/local/bin_qc.nf | 4 ++-- workflows/mag.nf | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index e78b7b3b..ada34149 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -44,7 +44,7 @@ workflow BIN_QC { if (params.checkm_db) { ch_checkm_db = file(params.checkm_db, checkIfExists: true) } - else if (!params.skip_binqc && params.binqc_tool == 'checkm') { + else if (params.binqc_tool == 'checkm') { ARIA2_UNTAR(params.checkm_download_url) ch_checkm_db = ARIA2_UNTAR.out.downloaded_file } @@ -55,7 +55,7 @@ workflow BIN_QC { if (params.checkm2_db) { ch_checkm2_db = [[:], file(params.checkm2_db, checkIfExists: true)] } - else if (!params.skip_binqc && params.binqc_tool == 'checkm2') { + else if (params.binqc_tool == 'checkm2') { CHECKM2_DATABASEDOWNLOAD(params.checkm2_db_version) ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database } diff --git a/workflows/mag.nf b/workflows/mag.nf index b269a19f..1ef0e52b 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -615,10 +615,12 @@ workflow MAG { * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, CHECKM2, and/or GUNC */ - BIN_QC(ch_input_for_postbinning) + if (!params.skip_binqc) { + BIN_QC(ch_input_for_postbinning) - ch_bin_qc_summary = BIN_QC.out.qc_summary - ch_versions = ch_versions.mix(BIN_QC.out.versions) + ch_bin_qc_summary = BIN_QC.out.qc_summary + ch_versions = ch_versions.mix(BIN_QC.out.versions) + } ch_quast_bins_summary = Channel.empty() if (!params.skip_quast) { From 3a38a3744c5382e12b36909631b4d27749572e3b Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Wed, 11 Dec 2024 15:57:49 -0300 Subject: [PATCH 31/33] Address review comments --- CITATIONS.md | 2 +- docs/output.md | 2 +- nextflow_schema.json | 4 ++-- subworkflows/local/bin_qc.nf | 10 ++++++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 1d7dd986..2feb3693 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -42,7 +42,7 @@ - [CheckM2](https://doi.org/10.1038/s41592-023-01940-w) - > Chklovski, A., Parks, D. H., Woodcroft, B. J., & Tyson, G. W. (2023). CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nature Methods, 20(8), 1203-1212. + > Chklovski, A., Parks, D. H., Woodcroft, B. J., & Tyson, G. W. (2023). CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nature Methods, 20(8), 1203-1212. doi: https://doi.org/10.1038/s41592-023-01940-w - [Chopper](https://doi.org/10.1093/bioinformatics/bty149) diff --git a/docs/output.md b/docs/output.md index b4d487e4..c50100e8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -596,7 +596,7 @@ If the parameter `--save_checkm_reference` is set, additionally the used the Che
-If the parameter `--save_checkm2_reference` is set, the CheckM2 reference datasets will be stored in the output directory. +If the parameter `--save_checkm2_data` is set, the CheckM2 reference datasets will be stored in the output directory.
Output files diff --git a/nextflow_schema.json b/nextflow_schema.json index 0616173d..dedb286b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -778,12 +778,12 @@ "checkm2_db": { "type": "string", "description": "Path to local folder containing already downloaded and uncompressed CheckM2 database (.dmnd file).", - "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`." + "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`)." }, "checkm2_db_version": { "type": "integer", "default": 5571251, - "description": "CheckM2 database version number to download (Zenodo record ID, for reference check https://zenodo.org/records/5571251)." + "description": "CheckM2 database version number to download, given as a Zenodo record ID (see the canonical record https://zenodo.org/records/5571251 and pick the Zenodo ID of the database version of your choice)." 
}, "save_checkm2_data": { "type": "boolean", diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf index ada34149..5a83d140 100644 --- a/subworkflows/local/bin_qc.nf +++ b/subworkflows/local/bin_qc.nf @@ -161,7 +161,10 @@ workflow BIN_QC { COMBINE_BINQC_TSV(CHECKM_QA.out.output.collect { summary -> summary[1] }) qc_summary = COMBINE_BINQC_TSV.out.combined - ch_versions = ch_versions.mix(CHECKM_QA.out.versions.first()) + ch_versions = ch_versions.mix( + CHECKM_QA.out.versions.first(), + COMBINE_BINQC_TSV.out.versions + ) } else if (params.binqc_tool == "checkm2") { /* @@ -172,7 +175,10 @@ workflow BIN_QC { COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.collect { summary -> summary[1] }) qc_summary = COMBINE_BINQC_TSV.out.combined - ch_versions = ch_versions.mix(CHECKM2_PREDICT.out.versions.first()) + ch_versions = ch_versions.mix( + CHECKM2_PREDICT.out.versions.first(), + COMBINE_BINQC_TSV.out.versions + ) } if (params.run_gunc) { From a2afa5e1ff755166cc1bf73ecf2612e8b22d5b17 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Fri, 13 Dec 2024 12:09:17 -0300 Subject: [PATCH 32/33] Update tests to integrate CheckM and CheckM2 --- conf/test_adapterremoval.config | 2 +- conf/test_bbnorm.config | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index 63f04cf6..fc433a8b 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -31,7 +31,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + binqc_tool = 'checkm' skip_gtdbtk = true gtdbtk_min_completeness = 0.01 clip_tool = 'adapterremoval' diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config index 223f99a3..a434f584 100644 --- a/conf/test_bbnorm.config +++ b/conf/test_bbnorm.config @@ -36,8 +36,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" - busco_clean = true + binqc_tool = 'checkm2' skip_gtdbtk = true gtdbtk_min_completeness = 0.01 bbnorm = true From 61c23fe8914503acabf537bef3682f5b84dc7bc0 Mon Sep 17 00:00:00 2001 From: "Diego Alvarez S." 
Date: Fri, 13 Dec 2024 13:23:34 -0300 Subject: [PATCH 33/33] Fix for when checkm is not run for some bins --- bin/combine_tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/combine_tables.py b/bin/combine_tables.py index e287676a..2b8d3767 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -138,7 +138,7 @@ def main(args=None): ] checkm_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" - if not bins.equals(checkm_results["Bin Id"].sort_values().reset_index(drop=True)): + if not set(checkm_results["Bin Id"]).issubset(set(bins)): sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") results = pd.merge( results, checkm_results, left_on="bin", right_on="Bin Id", how="outer"