From 5408656706c8747f4af31f7b7fdaf62a4483f12a Mon Sep 17 00:00:00 2001 From: luisas Date: Thu, 14 Sep 2023 15:45:40 +0200 Subject: [PATCH] Integrate nf-validation plugin --- assets/samplesheet.csv | 2 +- assets/schema_input.json | 1 + assets/schema_tools.json | 10 +- bin/check_samplesheet.py | 231 -------------------------- bin/check_toolsheet.py | 251 ----------------------------- conf/modules.config | 14 +- main.nf | 2 +- modules.json | 42 ++--- modules/local/samplesheet_check.nf | 34 ---- modules/local/toolsheet_check.nf | 35 ---- nextflow.config | 2 +- subworkflows/local/align.nf | 5 +- subworkflows/local/input_check.nf | 117 -------------- workflows/multiplesequencealign.nf | 54 ++++--- 14 files changed, 63 insertions(+), 737 deletions(-) delete mode 100755 bin/check_samplesheet.py delete mode 100755 bin/check_toolsheet.py delete mode 100644 modules/local/samplesheet_check.nf delete mode 100644 modules/local/toolsheet_check.nf delete mode 100644 subworkflows/local/input_check.nf diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 1b7a2dd9..206caf6e 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ id,fasta,reference,structures -seatoxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin.ref, +seatoxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/setoxin.ref,test-dataset/structures/setoxin-ref/ toxin-ref,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin-ref.fa,https://raw.githubusercontent.com/nf-core/test-datasets/multiplesequencealign/testdata/toxin.ref, diff --git a/assets/schema_input.json b/assets/schema_input.json index a17eca13..3ebb3c25 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -10,6 +10,7 @@ "id": { "type": "string", "pattern": "^\\S+$", + "meta": ["id"], "errorMessage": "id/sample name must be provided and cannot contain spaces" }, "fasta": { diff --git a/assets/schema_tools.json b/assets/schema_tools.json index 6c85028f..cd33e7fa 100644 --- a/assets/schema_tools.json +++ b/assets/schema_tools.json @@ -9,17 +9,21 @@ "properties": { "tree": { "type": "string", - "errorMessage": "tree name cannot contain spaces" + "errorMessage": "tree name cannot contain spaces", + "meta": ["tree"] }, "args_tree": { - "type": "string" + "type": "string", + "meta": ["args_tree"] }, "align": { "type": "string", + "meta": ["align"], "errorMessage": "align name must be provided and cannot contain spaces" }, "args_align": { - "type": "string" + "type": "string", + "meta": ["args_align"] } }, "required": ["align"] diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index ddea8011..00000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,231 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. 
The order of rows is maintained. - - """ - - VALID_FORMATS = (".fa", ".fasta") - - def __init__( - self, - id_col="id", - fasta_col="fasta", - reference_col="reference", - structures_col="structures", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - id_col (str): The name of the column that contains the id name - (default "id"). - fasta_col (str): The name of the column that contains the fasta file - path (default "fasta"). - - """ - super().__init__(**kwargs) - self._id_col = id_col - self._fasta_col = fasta_col - self._reference_col = reference_col - self._structures_col = structures_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_id(row) - self._validate_fasta(row) - self._validate_structures(row) - self._seen.add((row[self._id_col], row[self._fasta_col])) - self.modified.append(row) - - def _validate_id(self, row): - """Assert that the id name exists and convert spaces to underscores.""" - if len(row[self._id_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._id_col] = row[self._id_col].replace(" ", "_") - - def _validate_fasta(self, row): - """Assert that the fasta entry is non-empty and has the right format.""" - if len(row[self._fasta_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fasta_format(row[self._fasta_col]) - - def _validate_fasta_format(self, filename): - """Assert that a given filename has one of the expected fasta extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The fasta file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def _validate_structures(self, row): - """Assert that the structures entry is non-empty and has the right format.""" - if len(row[self._structures_col]) <= 0: - row[self._structures_col] = "none" - - def validate_unique_samples(self): - """ - Assert that the combination of id name and fasta filename is unique. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and fasta must be unique.") - seen = Counter() - for row in self.modified: - id = row[self._id_col] - seen[id] += 1 - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. 
- - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - id,fasta - seatoxin-ref,./testdata/seatoxin-ref.fa - toxin-ref,./testdata/toxin-ref.fa - - - """ - required_columns = {"id", "fasta"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/check_toolsheet.py b/bin/check_toolsheet.py deleted file mode 100755 index 8336e253..00000000 --- a/bin/check_toolsheet.py +++ /dev/null @@ -1,251 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -def cleanargs(argstring): - cleanargs = argstring.strip().replace("-", "").replace(" ", "_").replace("==", "_").replace("\s+", "") - - return cleanargs - - -class RowChecker: - """ - Define a service that can validate and transform each given row. 
- - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - def __init__( - self, - tree_col="tree", - argstree_col="args_tree", - argstree_clean_col="argstree_clean", - align_col="align", - argsalign_col="args_align", - argsalign_clean_col="argsalign_clean", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - family_col (str): The name of the column that contains the family name - (default "family"). - fasta_col (str): The name of the column that contains the fasta file - path (default "fasta"). - - """ - super().__init__(**kwargs) - self._tree_col = tree_col - self._argstree_col = argstree_col - self._argstree_clean_col = argstree_clean_col - self._align_col = align_col - self._argsalign_col = argsalign_col - self._argsalign_clean_col = argsalign_clean_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_align(row) - self._validate_tree(row) - self._validate_argstree(row) - self._validate_argsalign(row) - self._seen.add( - ( - row[self._tree_col], - row[self._argstree_col], - row[self._align_col], - row[self._argsalign_col], - row[self._argstree_clean_col], - ) - ) - print(row) - self.modified.append(row) - - def _validate_tree(self, row): - """Assert that the family name exists and convert spaces to underscores.""" - if len(row[self._tree_col]) <= 0: - row[self._tree_col] = "none" - # Sanitize samples slightly. - row[self._tree_col] = row[self._tree_col] - - def _validate_argstree(self, row): - if len(row[self._argstree_col]) <= 0: - row[self._argstree_col] = "none" - row[self._argstree_clean_col] = "none" - # Sanitize samples slightly. - row[self._argstree_col] = row[self._argstree_col] - row[self._argstree_clean_col] = cleanargs(row[self._argstree_col]) - - def _validate_align(self, row): - if len(row[self._align_col]) <= 0: - raise AssertionError("alignment tool is required.") - # Sanitize samples slightly. - row[self._align_col] = row[self._align_col] - row[self._argsalign_clean_col] = cleanargs(row[self._argsalign_col]) - - def _validate_argsalign(self, row): - if len(row[self._argsalign_col]) <= 0: - row[self._argsalign_col] = "none" - row[self._argsalign_clean_col] = "none" - # Sanitize samples slightly. - row[self._argsalign_col] = row[self._argsalign_col] - - def validate_unique_samples(self): - """ - Assert that the combination of family name and fasta filename is unique. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and fasta must be unique.") - seen = Counter() - for row in self.modified: - entry = row[self._tree_col] + row[self._argstree_col] + row[self._align_col] + row[self._argsalign_col] - seen[entry] += 1 - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. 
_text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - - - - """ - required_columns = {"align"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Remove white spaces from whole file, even after commas. - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.append("argstree_clean") - header.append("argsalign_clean") - - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/conf/modules.config b/conf/modules.config index d3b935ce..1a8c5ec3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,17 +18,9 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] - withName: SAMPLESHEET_CHECK { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - } - withName: "FAMSA_GUIDETREE"{ ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == 'none' ? '' : "${meta.args_tree}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } publishDir = [ path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, @@ -38,7 +30,7 @@ process { withName: "CLUSTALO_GUIDETREE"{ ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}" } - ext.args = { "${meta.args_tree}" == 'none' ? '' : "${meta.args_tree}" } + ext.args = { "${meta.args_tree}" == "null" ? '' : "${meta.args_tree}" } publishDir = [ path: { "${params.outdir}/trees/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, @@ -48,7 +40,7 @@ process { withName: ".*ALIGN"{ ext.prefix = { "${meta.id}_${meta.tree}-args-${meta.argstree_clean}_${meta.align}-args-${meta.argsalign_clean}" } - ext.args = { "${meta.args_align}" == 'none' ? '' : "${meta.args_align}" } + ext.args = { "${meta.args_align}" == "null" ? '' : "${meta.args_align}" } publishDir = [ path: { "${params.outdir}/alignment/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, diff --git a/main.nf b/main.nf index 65949d47..3bdf7b6d 100644 --- a/main.nf +++ b/main.nf @@ -17,7 +17,7 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { validateParameters; paramsHelp } from 'plugin/nf-validation' +include { validateParameters; paramsHelp; paramsSummaryLog; fromSamplesheet } from 'plugin/nf-validation' // Print help message if needed if (params.help) { diff --git a/modules.json b/modules.json index 0be4793d..51ce8069 100644 --- a/modules.json +++ b/modules.json @@ -8,75 +8,55 @@ "clustalo/align": { "branch": "master", "git_sha": "5c73153097b0f906fa3fe91eb94faaee394d5704", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "clustalo/guidetree": { "branch": "master", "git_sha": "9a884757b561688e0b3ff8b55ff7eb4da25eef33", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "famsa/align": { "branch": "master", "git_sha": "db6245923c85e43df3fbc3a3a6c5150c9f374136", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "famsa/guidetree": { "branch": "master", "git_sha": "2fe424b685150dbcfae708ea42f521aa137ea21e", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "fastqc": { "branch": "master", "git_sha": "bd8092b67b5103bdd52e300f75889442275c3117", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "kalign/align": { "branch": "master", "git_sha": "c4328fea9d972088482f163052be0f51950eb91d", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "mafft": { "branch": "master", "git_sha": "feb29be775d9e41750180539e9a3bdce801d0609", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] }, "multiqc": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": [ - "modules" - ] + "installed_by": 
["modules"] }, "untar": { "branch": "master", "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", - "installed_by": [ - "modules" - ] + "installed_by": ["modules"] } } } } } -} \ No newline at end of file +} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index 9b55bec8..00000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,34 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in nf-core/multiplesequencealign/bin/ - def args = task.ext.args ?: '' - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} - - diff --git a/modules/local/toolsheet_check.nf b/modules/local/toolsheet_check.nf deleted file mode 100644 index f7a625a6..00000000 --- a/modules/local/toolsheet_check.nf +++ /dev/null @@ -1,35 +0,0 @@ - - - -process TOOLSHEET_CHECK { - tag "$toolsheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path toolsheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in nf-core/msa/bin/ - def args = task.ext.args ?: '' - """ - check_toolsheet.py \\ - $toolsheet \\ - toolsheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/nextflow.config b/nextflow.config index bdac53e8..daebc3eb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,7 +7,7 @@ */ plugins { - id 'nf-validation@0.2.1' + id 'nf-validation@0.3.1' } // Global default params, used in configs diff --git a/subworkflows/local/align.nf b/subworkflows/local/align.nf index 4bd4d7f2..9d0c46aa 100644 --- a/subworkflows/local/align.nf +++ b/subworkflows/local/align.nf @@ -41,12 +41,11 @@ workflow ALIGN { ch_fastas.combine(ch_tools) .map{ it -> [it[0] + it[2] , it[3], it[1]] } .branch { - with_tree: it[0]["tree"] != "none" - without_tree: it[0]["tree"] == "none" + with_tree: it[0]["tree"] != null + without_tree: it[0]["tree"] == null } .set { ch_fasta_tools } - // Here is all the combinations we need to compute ch_fasta_tools .with_tree diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index d59515b0..00000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,117 +0,0 @@ -// -// Check input samplesheet and get read channels -// -import java.util.zip.ZipFile -import java.util.zip.ZipEntry - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' -include { TOOLSHEET_CHECK } from '../../modules/local/toolsheet_check' -include { UNTAR as UNTAR_STRUCTURES } from 
'../../modules/nf-core/untar/main' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - toolsheet // file: /path/to/toolsheet.csv - - main: - - ch_versions = Channel.empty() - - samplesheet_ch = SAMPLESHEET_CHECK ( samplesheet) - .csv - .splitCsv ( header:true, sep:',' ) - - fasta = samplesheet_ch.map { create_fasta_channel(it) } - references = samplesheet_ch.map { create_references_channel(it) } - structures = samplesheet_ch.map { create_structures_channel(it) }.unique() - ch_versions = ch_versions.mix(SAMPLESHEET_CHECK.out.versions) - - - TOOLSHEET_CHECK ( toolsheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_tools_channel(it) } - .set { tools } - ch_versions = ch_versions.mix(TOOLSHEET_CHECK.out.versions) - - emit: - fasta - references - structures - tools // channel: [ val(meta), [ fasta ] ] - versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ] -} - - -// Function to get list of [ meta, [ fasta ] ] -def create_fasta_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.id - - // add path(s) of the fastq file(s) to the meta map - def fasta_meta = [] - - if (!file(row.fasta).exists()) { - exit 1, "ERROR: Please check input samplesheet -> fasta file does not exist!\n${row.fasta}" - } - fasta_meta = [ meta, [ file(row.fasta) ] ] - - return fasta_meta -} - - -// Function to get list of [ meta, [ fasta ] ] -def create_references_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.id - - // add path(s) of the fastq file(s) to the meta map - def ref_meta = [] - ref_meta = [ meta, [ file(row.reference) ] ] - - return ref_meta -} - -import groovy.io.FileType - -// Function to get list of [ meta, [ fasta ] ] -def create_structures_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.id - - // add path(s) of the fastq file(s) to the meta map - if (row.structures != "none") { - def list = [] - def dir = new File(row.structures) - dir.eachFileRecurse (FileType.FILES) { it -> - list << file(it) - } - structures = [ meta, list ] - return structures - } else { - return [ meta, [:] ] - } - -} - -def create_tools_channel(LinkedHashMap row) { - // create meta map - def meta_tree = [:] - def meta_align = [:] - - meta_tree.tree = row.tree - meta_tree.args_tree = row.args_tree - meta_tree.argstree_clean = row.argstree_clean - meta_align.align = row.align - meta_align.args_align = row.args_align - meta_align.argsalign_clean = row.argsalign_clean - - // add path(s) of the fastq file(s) to the meta map - def tools_meta = [] - tools_meta = [ meta_tree, meta_align ] - - return tools_meta -} diff --git a/workflows/multiplesequencealign.nf b/workflows/multiplesequencealign.nf index 8d1a23f3..3fe7fdca 100644 --- a/workflows/multiplesequencealign.nf +++ b/workflows/multiplesequencealign.nf @@ -10,6 +10,15 @@ def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' def summary_params = paramsSummaryMap(workflow) +def cleanArgs(argString) { + def cleanArgs = argString.toString().trim().replace("-", "").replace(" ", "_").replaceAll("==", "_").replaceAll("\\s+", "") + // if clearnArgs is empty, return "default" + if (cleanArgs == null || cleanArgs == "") { + return "" + }else{ + return cleanArgs + } +} // Print parameter summary log to screen log.info logo + paramsSummaryLog(workflow) + citation @@ -32,10 +41,9 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
-//
-// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
-//
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
+include { STATS } from '../subworkflows/local/stats'
+include { ALIGN } from '../subworkflows/local/align'
+include { EVALUATE } from '../subworkflows/local/evaluate'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -49,9 +57,6 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check'
 include { FASTQC } from '../modules/nf-core/fastqc/main'
 include { MULTIQC } from '../modules/nf-core/multiqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
-include { STATS } from '../subworkflows/local/stats'
-include { ALIGN } from '../subworkflows/local/align'
-include { EVALUATE } from '../subworkflows/local/evaluate'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -67,21 +72,32 @@ workflow MULTIPLESEQUENCEALIGN {
 
     ch_versions = Channel.empty()
 
     //
-    // SUBWORKFLOW: Read in samplesheet, validate and stage input files
+    // Prepare input and metadata
     //
-    INPUT_CHECK (
-        file(params.input),
-        file(params.tools)
-    )
-    ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
+    ch_input = Channel.fromSamplesheet('input')
+    ch_tools = Channel.fromSamplesheet('tools').map {
+        meta ->
+            def meta_clone = meta[0].clone()
+            def treeMap = [:]
+            def alignMap = [:]
 
-    ch_seqs = INPUT_CHECK.out.fasta
-    ch_tools = INPUT_CHECK.out.tools
-    ch_refs = INPUT_CHECK.out.references
-    ch_structures = INPUT_CHECK.out.structures
+            treeMap["tree"] = meta_clone["tree"]
+            treeMap["args_tree"] = meta_clone["args_tree"]
+            treeMap["args_tree_clean"] = cleanArgs(meta_clone.args_tree)
+
+            alignMap["align"] = meta_clone["align"]
+            alignMap["args_align"] = meta_clone["args_align"]
+            alignMap["args_align_clean"] = cleanArgs(meta_clone.args_align)
+
+            [ treeMap, alignMap ]
+    }
+
+
+    ch_seqs = ch_input.map{ sample -> [ sample[0], file(sample[1]) ]}
+    ch_refs = ch_input.map{ sample -> [ sample[0], file(sample[2]) ]}
+    ch_structures = ch_input.map{ sample -> [ sample[0], sample[3] ]}
 
-    //
     // Compute summary statistics about the input sequences
     //
    if( !params.skip_stats ){
@@ -148,6 +164,8 @@ workflow.onComplete {
     }
 }
 
+
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     THE END
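
For reference, a minimal sketch (not part of the patch) of the channel shape that Channel.fromSamplesheet('input') emits under assets/schema_input.json as modified above: columns annotated with "meta" are collected into a meta map, while the remaining columns are emitted positionally. It assumes nf-validation 0.3.1 is enabled in nextflow.config, as done in this patch; the samplesheet values and the view() output are illustrative only.

    // Illustrative sketch, not pipeline code. Assumes params.input points at a
    // CSV matching assets/schema_input.json and that nf-validation 0.3.1 is enabled.
    include { fromSamplesheet } from 'plugin/nf-validation'

    workflow {
        // Each samplesheet row becomes [ [id: <id>], fasta, reference, structures ]:
        // only "id" carries a meta: ["id"] annotation in the schema, so it is the
        // sole key in the meta map; the other columns stay positional.
        Channel
            .fromSamplesheet('input')
            .map { meta, fasta, reference, structures ->
                [ meta, file(fasta) ]   // keep just the fields a given step needs
            }
            .view()   // e.g. [[id:seatoxin-ref], /path/to/setoxin-ref.fa]
    }

The toolsheet channel works the same way, except that every column in assets/schema_tools.json is meta-annotated, which is why the workflow above receives a single meta map per row (accessed as meta[0]) and splits it into treeMap and alignMap.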