diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f5f0db..ec6ae79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v1.1.0 "Nattou maki" - [September 27th, 2024] + +Added a new `softmask` parameter, to optionally keep original softmasking. + ## v1.0.0 "Sweet potato" - [August 27th, 2024] Initial release of nf-core/pairgenomealign, created with the [nf-core](https://nf-co.re/) template. diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 646ff21..1ed94d1 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/pairgenomealign + This report has been generated by the nf-core/pairgenomealign analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-pairgenomealign-methods-description": order: -1000 diff --git a/conf/modules.config b/conf/modules.config index 3c1b725..883a68e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -28,10 +28,11 @@ process { withName: 'ALIGNMENT_LASTDB' { // See https://gitlab.com/mcfrith/last/-/blob/main/doc/lastdb.rst for details - // -R01: uppercase all sequences and then lowercase simple repeats + // -R01: uppercase all sequences and then lowercase simple repeats with tantan + // -R11: keep original lowercase masking and additionally lowercase simple repeats with tantan // -c: soft-mask lowercase letters // -S2: index both strands - ext.args = { "-R01 -c -u${params.seed} -S2" } + ext.args = { "${params.softmask=="tantan" ? 
'-R01' : '-R11'} -c -u${params.seed} -S2" } } withName: 'ALIGNMENT_SPLIT_O2M' { diff --git a/nextflow.config b/nextflow.config index c75282a..e8c4bd3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,10 @@ params { max_cpus = 16 max_time = '240.h' + // Indexing options seed = 'YASS' + softmask = 'tantan' + targetName = 'target' m2m = false @@ -258,7 +261,7 @@ manifest { description = """Pairwise alignment pipeline (genome to genome or reads to genome)""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0.0' + version = '1.1.0' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 312b1bd..d52e963 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -56,6 +56,31 @@ } } }, + "indexing_options": { + "title": "Indexing options", + "type": "object", + "description": "Options controlling how the target genome is soft-masked and indexed with `lastdb`.", + "default": "", + "fa_icon": "fas fa-database", + "properties": { + "seed": { + "type": "string", + "enum": ["YASS", "NEAR", "MAM8", "RY128"], + "help_text": "LAST creates a database of seed sequences in the _target_ genome, and provides different ways to generate these seeds. The default (`YASS`) searches for long-and-weak similarities that allow for mismatches but not gaps. Among alternatives, there are `NEAR` for short-and-strong (near-identical) similarities with many gaps (insertions and deletions), `MAM8` to find weak similarities with high sensitivity, but low speed and high memory usage, or `RY128` that reduces run time and memory use, by only seeking seeds at ~1/128 of positions in each sequence, which is useful when the purpose of running this pipeline is only to generate whole-genome dotplots, or when sensitivity for tiny fragments may be unnecessary or undesirable. 
See the [LAST seeds documentation](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-seeds.rst) for details.", + "description": "Select the LAST seed to index the _target_ genome.", + "default": "YASS", + "fa_icon": "fas fa-seedling" + }, + "softmask": { + "type": "string", + "enum": ["tantan", "original"], + "help_text": "In this pipeline, letters soft-masked in lowercase are excluded from indexing (`lastdb -c`). By default, the original mask is removed and a new one is made with an internal version of the “tantan” tool. Set this option to `original` to keep the original soft-masking. See the [lastdb documentation](https://gitlab.com/mcfrith/last/-/blob/main/doc/lastdb.rst) for details.", + "description": "Customise the way to mask the _target_ genome.", + "default": "tantan", + "fa_icon": "fas fa-theater-masks" + } + } + }, "alignment_options": { "title": "Alignment options", "type": "object", @@ -68,14 +93,6 @@ "help_text": "This adds time and can comsume considerable amount of space; use only if you need that data, for instance in the case of a self-alignment", "fa_icon": "fas fa-arrows-alt" }, - "seed": { - "type": "string", - "enum": ["YASS", "NEAR", "MAM8", "RY128"], - "help_text": "LAST creates a database of seed sequences in the _target_ genome, and provides different ways to generate these seeds. The default (`YASS`) searches for long-and-weak similarities that allow for mismatches but not gaps. Among alternatives, there are `NEAR` for short-and-strong (near-identical) similarities with many gaps (insertions and deletions), `MAM8` to find weak similarities with high sensitivity, but low speed and high memory usage, or `RY128` that reduces run time and memory use, by only seeking seeds at ~1/128 of positions in each sequence, which is useful when the purpose of running this pipeline is only to generate whole-genome dotplots, or when sensitivity for tiny fragments may be unnecessary or undesirable. 
See for details.", - "description": "Select the the LAST seed to index the _target_ genome.", - "default": "YASS", - "fa_icon": "fas fa-seedling" - }, "lastal_params": { "type": "string", "description": "Path to a file containing alignment parameters or a scoring matrix. If this option is used, `last-train` will be skipped and alignment parameters will be the same for each query.", @@ -372,6 +389,9 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/indexing_options" + }, { "$ref": "#/definitions/alignment_options" },