diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index c55aa4c..006dda9 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -15,7 +15,6 @@ jobs: steps: - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters with: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ebe60da..4ba71f3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,7 +39,6 @@ jobs: uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | diff --git a/.nf-core.yml b/.nf-core.yml index e0b85a7..21e42f7 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,2 +1,7 @@ repository_type: pipeline +lint: + files_unchanged: + - assets/nf-core-pairgenomealign_logo_light.png + - docs/images/nf-core-pairgenomealign_logo_light.png + - docs/images/nf-core-pairgenomealign_logo_dark.png nf_core_version: "2.14.1" diff --git a/CHANGELOG.md b/CHANGELOG.md index e5e3cb3..7f5f0db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0dev - [date] +## v1.0.0 "Sweet potato" - [August 27th, 2024] Initial release of nf-core/pairgenomealign, created with the [nf-core](https://nf-co.re/) template. 
- -### `Added` - -### `Fixed` - -### `Dependencies` - -### `Deprecated` diff --git a/CITATIONS.md b/CITATIONS.md index 4867b6f..1d06505 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -8,11 +8,23 @@ > Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311. +## Pipeline design + +> Charles Plessy, Michael J. Mansfield, Aleksandra Bliznina, Aki Masunaga, Charlotte West, Yongkai Tan, Andrew W. Liu, Jan Grašič, María Sara del Río Pisula, Gaspar Sánchez-Serna, Marc Fabrega-Torrus, Alfonso Ferrández-Roldán, Vittoria Roncalli, Pavla Navratilova, Eric M. Thompson, Takeshi Onuma, Hiroki Nishida, Cristian Cañestro, Nicholas M. Luscombe. Extreme genome scrambling in marine planktonic Oikopleura dioica cryptic species. Genome Res. 2024. 34: 426-440; doi: [10.1101/gr.278295.123](https://doi.org/10.1101/gr.278295.123). PubMed ID: [38621828](https://pubmed.ncbi.nlm.nih.gov/38621828/) + ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [LAST](https://gitlab.com/mcfrith/last/) + + > Kiełbasa SM, Wan R, Sato K, Horton P, Frith MC. Adaptive seeds tame genomic sequence comparison. Genome Res. 2011 21(3):487-93. doi: 10.1101/gr.113985.110. PubMed PMID: 21209072 (This describes the main algorithms used by LAST.) + + > Frith MC, Noé L. Improved search heuristics find 20,000 new alignments between human and mouse genomes. doi: 10.1093/nar/gku104 PubMed PMID: 24493737 (This describes sensitive DNA seeding (MAM8 and MAM4).) + + > Frith MC, Kawaguchi R. Split-alignment of genomes finds orthologies more accurately. Genome Biology. 2015 16:106. doi: 10.1186/s13059-015-0670-9 PubMed PMID: 25994148 (Describes the split alignment algorithm, and its application to whole genome alignment.) + + > Hamada M, Ono Y, Asai K, Frith MC. Training alignment parameters for arbitrary sequencers with LAST-TRAIN.
Bioinformatics. 2017 33(6):926-928. doi: 10.1093/bioinformatics/btw742 PubMed PMID: 28039163 (Describes last-train.) - > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + > Frith MC, Shaw J, Spouge JL. How to optimally sample a sequence for rapid analysis. doi: 10.1093/bioinformatics/btad057 PubMed PMID: 36702468 (Describes the lastdb -u RY sparsity options.) - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) diff --git a/README.md b/README.md index 521e9ef..f31d667 100644 --- a/README.md +++ b/README.md @@ -19,49 +19,37 @@ ## Introduction -**nf-core/pairgenomealign** is a bioinformatics pipeline that ... +**nf-core/pairgenomealign** is a bioinformatics pipeline that aligns one or more _query_ genomes to a _target_ genome, and plots pairwise representations. - +![Tubemap workflow summary](docs/images/pairgenomealign-tubemap.png "Tubemap workflow summary") - - +The pipeline can generate four kinds of outputs, called _many-to-many_, _many-to-one_, _one-to-many_ and _one-to-one_, depending on whether sequences of one genome are allowed to match the other genome multiple times or not. -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +These alignments are output in [MAF](https://genome.ucsc.edu/FAQ/FAQformat.html#format5) format, and optional line plot representations are output in PNG format. ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - +Each row represents a fasta file, this can also contain multiple rows to accommodate multiple query genomes in fasta format.
Now, you can run the pipeline using: - - ```bash nextflow run nf-core/pairgenomealign \ -profile \ + --target sequencefile.fa \ --input samplesheet.csv \ --outdir ``` @@ -80,11 +68,15 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/pairgenomealign was originally written by charles-plessy. +`nf-core/pairgenomealign` was originally written by [charles-plessy](https://github.com/charles-plessy); the original versions are available at . We thank the following people for their extensive assistance in the development of this pipeline: - +- [Mahdi Mohammed](https://github.com/U13bs1125) ported the original pipeline to _nf-core_ template 2.14.x. +- [Martin Frith](https://github.com/mcfrith/), the author of LAST, gave us extensive feedback and advice. +- [Michael Mansfield](https://github.com/mjmansfi) tested the pipeline and provided critical comments. +- [Aleksandra Bliznina](https://github.com/aleksandrabliznina) contributed to the creation of the initial `last/*` modules. +- [Jiashun Miao](https://github.com/miaojiashun) and [Huyen Pham](https://github.com/ngochuyenpham) tested the pipeline on vertebrate genomes. ## Contributions and Support @@ -94,10 +86,15 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - +If you use this pipeline, please cite: + +> **Extreme genome scrambling in marine planktonic Oikopleura dioica cryptic species.** +> Charles Plessy, Michael J. Mansfield, Aleksandra Bliznina, Aki Masunaga, Charlotte West, Yongkai Tan, Andrew W. Liu, Jan Grašič, María Sara del Río Pisula, Gaspar Sánchez-Serna, Marc Fabrega-Torrus, Alfonso Ferrández-Roldán, Vittoria Roncalli, Pavla Navratilova, Eric M. Thompson, Takeshi Onuma, Hiroki Nishida, Cristian Cañestro, Nicholas M. Luscombe. +> _Genome Res._ 2024. 34: 426-440; doi: [10.1101/gr.278295.123](https://doi.org/10.1101/gr.278295.123).
PubMed ID: [38621828](https://pubmed.ncbi.nlm.nih.gov/38621828/) + +[OIST research news article](https://www.oist.jp/news-center/news/2024/4/25/oikopleura-who-species-identity-crisis-genome-community) - +And also please cite the [LAST papers](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-papers.rst). An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 5a1a396..cb250d2 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,7 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/pairgenomealign Methods Description" section_href: "https://github.com/nf-core/pairgenomealign" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 52f81cc..646ff21 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/pairgenomealign + This report has been generated by the nf-core/pairgenomealign analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-pairgenomealign-methods-description": order: -1000 @@ -13,3 +13,48 @@ report_section_order: export_plots: true disable_version_detection: true + +custom_data: + train: + file_format: "tsv" + section_name: "Training parameter statistics" + plot_type: "table" + headers: + id: + title: "ID" + description: "target___query" + substitution_percent_identity: + title: "Substitution Percent Identity" + "last -t": + title: "Temperature" + description: "Parameter for converting between scores and probability ratios. This affects the column ambiguity estimates. A score is converted to a probability ratio by this formula: exp(score / TEMPERATURE). The default value is 1/lambda, where lambda is the scale factor of the scoring matrix, which is calculated by the method of Yu and Altschul (YK Yu et al. 2003, PNAS 100(26):15688-93)." 
+ "last -a": + title: "Gap existence" + description: "Gap existence cost (lastal -a)" + "last -b": + title: "Gap extension" + description: "Gap extension cost (lastal -b)" + "last -A": + title: "Insertion existence" + description: "Insertion existence cost (lastal -A)" + "last -B": + title: "Insertion extension" + description: "Insertion extension cost (lastal -B)" + last_o2o: + file_format: "tsv" + section_name: "Alignment statistics" + plot_type: "table" + headers: + id: + title: "ID" + description: "target___query" + TotalAlignmentLength: + title: "Total alignment length" + PercentSimilarity: + title: "Percent similarity" + +sp: + last_o2o: + fn: "*o2o_aln.tsv" + train: + fn: "*train.tsv" diff --git a/assets/samplesheet_full.csv b/assets/samplesheet_full.csv new file mode 100644 index 0000000..32f2d39 --- /dev/null +++ b/assets/samplesheet_full.csv @@ -0,0 +1,35 @@ +sample,fasta +Homo_sapiens_GCA_000001405.29,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz +Callithrix_jacchus_GCA_000004665.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/004/665/GCA_000004665.1_Callithrix_jacchus-3.2/GCA_000004665.1_Callithrix_jacchus-3.2_genomic.fna.gz +Cercopithecus_mitis_GCA_028627265.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/627/265/GCA_028627265.1_Cercopithecus_mitis_HiC/GCA_028627265.1_Cercopithecus_mitis_HiC_genomic.fna.gz +Chlorocebus_sabaeus_GCA_000409795.2,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/409/795/GCA_000409795.2_Chlorocebus_sabeus_1.1/GCA_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz +Colobus_guereza_GCA_030247045.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/030/247/045/GCA_030247045.1_ASM3024704v1/GCA_030247045.1_ASM3024704v1_genomic.fna.gz +Eulemur_mongoz_GCA_028534055.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/534/055/GCA_028534055.1_Eulemur_mongoz_HiC/GCA_028534055.1_Eulemur_mongoz_HiC_genomic.fna.gz
+Gorilla_gorilla_gorilla_GCA_000151905.3,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/151/905/GCA_000151905.3_gorGor4/GCA_000151905.3_gorGor4_genomic.fna.gz +Hylobates_pileatus_GCA_021498465.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/498/465/GCA_021498465.1_ASM2149846v1/GCA_021498465.1_ASM2149846v1_genomic.fna.gz +Lemur_catta_GCA_020740605.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/020/740/605/GCA_020740605.1_mLemCat1.pri/GCA_020740605.1_mLemCat1.pri_genomic.fna.gz +Leontopithecus_rosalia_GCA_028533165.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/533/165/GCA_028533165.1_Leontopithecus_rosalia_HiC/GCA_028533165.1_Leontopithecus_rosalia_HiC_genomic.fna.gz +Macaca_cyclopis_GCA_026956025.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/026/956/025/GCA_026956025.1_MCyc01/GCA_026956025.1_MCyc01_genomic.fna.gz +Macaca_fascicularis_GCA_011100615.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/011/100/615/GCA_011100615.1_Macaca_fascicularis_6.0/GCA_011100615.1_Macaca_fascicularis_6.0_genomic.fna.gz +Macaca_mulatta_GCA_003339765.3,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/339/765/GCA_003339765.3_Mmul_10/GCA_003339765.3_Mmul_10_genomic.fna.gz +Macaca_thibetana_thibetana_GCA_024542745.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/024/542/745/GCA_024542745.1_ASM2454274v1/GCA_024542745.1_ASM2454274v1_genomic.fna.gz +Microcebus_murinus_GCA_000165445.3,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/165/445/GCA_000165445.3_Mmur_3.0/GCA_000165445.3_Mmur_3.0_genomic.fna.gz +Miopithecus_talapoin_GCA_028551445.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/551/445/GCA_028551445.1_Miopithecus_talapoin_HiC/GCA_028551445.1_Miopithecus_talapoin_HiC_genomic.fna.gz +Nasalis_larvatus_GCA_000772465.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/772/465/GCA_000772465.1_Charlie1.0/GCA_000772465.1_Charlie1.0_genomic.fna.gz 
+Nomascus_leucogenys_GCA_000146795.3,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/146/795/GCA_000146795.3_Nleu_3.0/GCA_000146795.3_Nleu_3.0_genomic.fna.gz +Nycticebus_bengalensis_GCA_023898255.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/023/898/255/GCA_023898255.1_ASM2389825v1/GCA_023898255.1_ASM2389825v1_genomic.fna.gz +Nycticebus_coucang_GCA_027406575.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/027/406/575/GCA_027406575.1_mNycCou1.pri/GCA_027406575.1_mNycCou1.pri_genomic.fna.gz +Pan_paniscus_GCA_000258655.2,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/258/655/GCA_000258655.2_panpan1.1/GCA_000258655.2_panpan1.1_genomic.fna.gz +Pan_troglodytes_GCA_000001515.5,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/515/GCA_000001515.5_Pan_tro_3.0/GCA_000001515.5_Pan_tro_3.0_genomic.fna.gz +Papio_anubis_GCA_000264685.2,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/264/685/GCA_000264685.2_Panu_3.0/GCA_000264685.2_Panu_3.0_genomic.fna.gz +Papio_papio_GCA_028645565.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/645/565/GCA_028645565.1_Papio_papio_HiC/GCA_028645565.1_Papio_papio_HiC_genomic.fna.gz +Piliocolobus_tephrosceles_GCA_002776525.5,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/002/776/525/GCA_002776525.5_ASM277652v5/GCA_002776525.5_ASM277652v5_genomic.fna.gz +Pithecia_pithecia_GCA_028551515.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/551/515/GCA_028551515.1_Pithecia_pithecia_HiC/GCA_028551515.1_Pithecia_pithecia_HiC_genomic.fna.gz +Pongo_abelii_GCA_028885655.2,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/885/655/GCA_028885655.2_NHGRI_mPonAbe1-v2.0_pri/GCA_028885655.2_NHGRI_mPonAbe1-v2.0_pri_genomic.fna.gz +Pongo_pygmaeus_GCA_028885625.2,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/885/625/GCA_028885625.2_NHGRI_mPonPyg2-v2.0_pri/GCA_028885625.2_NHGRI_mPonPyg2-v2.0_pri_genomic.fna.gz 
+Rhinopithecus_roxellana_GCA_007565055.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/007/565/055/GCA_007565055.1_ASM756505v1/GCA_007565055.1_ASM756505v1_genomic.fna.gz +Saguinus_midas_GCA_021498475.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/021/498/475/GCA_021498475.1_ASM2149847v1/GCA_021498475.1_ASM2149847v1_genomic.fna.gz +Saguinus_oedipus_GCA_031835075.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/031/835/075/GCA_031835075.1_ASM3183507v1/GCA_031835075.1_ASM3183507v1_genomic.fna.gz +Symphalangus_syndactylus_GCA_028878055.3,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/878/055/GCA_028878055.3_NHGRI_mSymSyn1-v2.1_pri/GCA_028878055.3_NHGRI_mSymSyn1-v2.1_pri_genomic.fna.gz +Theropithecus_gelada_GCA_003255815.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/255/815/GCA_003255815.1_Tgel_1.0/GCA_003255815.1_Tgel_1.0_genomic.fna.gz +Varecia_variegata_GCA_028533085.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/028/533/085/GCA_028533085.1_Varecia_variegata_HiC/GCA_028533085.1_Varecia_variegata_HiC_genomic.fna.gz diff --git a/assets/samplesheet_small.csv b/assets/samplesheet_small.csv new file mode 100644 index 0000000..6ab077b --- /dev/null +++ b/assets/samplesheet_small.csv @@ -0,0 +1,3 @@ +sample,fasta +Fusarium_asiaticum_GCA_025258505.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/025/258/505/GCA_025258505.1_ASM2525850v1/GCA_025258505.1_ASM2525850v1_genomic.fna.gz +Fusarium_oxysporum_GCA_014857085.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/857/085/GCA_014857085.1_ASM1485708v1/GCA_014857085.1_ASM1485708v1_genomic.fna.gz diff --git a/assets/schema_input.json b/assets/schema_input.json index 1f7cd7b..f28d8f4 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,21 +13,14 @@ "errorMessage": "Sample name must be provided and cannot contain spaces", "meta": ["id"] }, - "fastq_1": { + "fasta": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must 
be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" - }, - "fastq_2": { - "type": "string", - "format": "file-path", - "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.f(ast|n)?a(\\.gz)?$", + "errorMessage": "Fasta file for genomes must be provided, cannot contain spaces and must have extension `.fa`, `.fa.gz`, `.fna`, `.fna.gz`, `.fasta` or `.fasta.gz`" } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "fasta"] } } diff --git a/conf/base.config b/conf/base.config index 639fda2..9c2232e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -24,7 +23,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } diff --git a/conf/modules.config b/conf/modules.config index d203d2b..3c1b725 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,8 +18,69 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] - withName: FASTQC { - ext.args = '--quiet' + withName: CUTN_TARGET { + ext.args = { "-n 10" } + } + + withName: CUTN_QUERY { + ext.args = { "-n 10" } + } + + withName: 'ALIGNMENT_LASTDB' { + // See https://gitlab.com/mcfrith/last/-/blob/main/doc/lastdb.rst for details + // -R01: uppercase all sequences and then lowercase simple repeats + // -c: soft-mask lowercase letters + // -S2: index both strands + ext.args = { "-R01 -c -u${params.seed} -S2" } + } + + withName: 'ALIGNMENT_SPLIT_O2M' { + ext.prefix = { "${meta.id}.o2m_aln" } + ext.args = { "--reverse -m${params.last_split_mismap}" } + } + + withName: 'ALIGNMENT_DOTPLOT_O2M' { + ext.prefix = { "${meta.id}.o2m_plt" } + ext.args = { "--rot2=h --sort2=3 --strands2=1 ${params.dotplot_options}" } + } + + withName: 'ALIGNMENT_SPLIT_M2O' { + ext.prefix = { "${meta.id}.m2o_aln" } + ext.args = { "-m${params.last_split_mismap}" } + } + + withName: 'ALIGNMENT_SPLIT_O2O' { + ext.prefix = { "${meta.id}.o2o_aln" } + ext.args = { "--reverse -m${params.last_split_mismap}" } + } + + withName: 'ALIGNMENT_TRAIN' { + ext.args = { "--revsym ${params.lastal_args}" } + } + + withName: 'ALIGNMENT_LASTAL_M2O' { + ext.prefix = { "${meta.id}.m2o_aln" } + ext.args = { "--split-f=MAF+ ${params.lastal_args} ${params.lastal_extr_args}" } + } + + withName: 'ALIGNMENT_LASTAL_M2M' { + ext.prefix = { "${meta.id}.m2m_aln" } + ext.args = { "${params.lastal_args} ${params.lastal_extr_args}" } + } + + withName: 'ALIGNMENT_DOTPLOT_O2O' { + ext.prefix = { "${meta.id}.o2o_plt" } + ext.args = { "--rot2=h --sort2=3 --strands2=1 ${params.dotplot_options}" } + } + + withName: 'ALIGNMENT_DOTPLOT_M2O' { + ext.prefix = { "${meta.id}.m2o_plt" } + ext.args = { "--rot2=h --sort2=3 --strands2=1 ${params.dotplot_options}" } + } + + withName: 'ALIGNMENT_DOTPLOT_M2M' { + ext.prefix = { "${meta.id}.m2m_plt" } + ext.args = { "--rot2=h --sort2=3 --strands2=1 ${params.dotplot_options}" } } withName: 'MULTIQC' { @@ -31,4 +92,12 @@ process { ] } + 
withName: 'MULTIQC_ASSEMBLYSCAN_PLOT_DATA' { + publishDir = [ + path: { "${params.outdir}/multiqc/assemblyscan_plot_data" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } diff --git a/conf/test.config b/conf/test.config index 9f770ac..ae05f53 100644 --- a/conf/test.config +++ b/conf/test.config @@ -17,12 +17,14 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 max_memory = '6.GB' - max_time = '6.h' + max_time = '1.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/pairgenomealign/tests/testsamplesheet.csv' + + // Target + target = params.pipelines_testdata_base_path + 'modules/data/genomics/sarscov2/genome/genome.fasta' + lastal_args = '-C2' // Remove -D1e9 because the virus genomes are so small. // Genome references genome = 'R64-1-1' diff --git a/conf/test_full.config b/conf/test_full.config index 388fe86..0ee9264 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,10 +14,16 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + // Input data + input = 'assets/samplesheet_full.csv' + + // Target + // Took 1h 32m 14s (257.8 CPU hours) to compute on OIST's HPC cluster. 
+ target = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/GCA_000001405.29_GRCh38.p14_genomic.fna.gz' + targetName = 'Homo_sapiens_GRCh38.p14' + + // Other parameters + seed = 'RY128' // Genome references genome = 'R64-1-1' diff --git a/conf/test_small.config b/conf/test_small.config new file mode 100644 index 0000000..7950ab6 --- /dev/null +++ b/conf/test_small.config @@ -0,0 +1,34 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/pairgenomealign -profile test_small, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile (small-scale)' + config_profile_description = 'Small test dataset using fungal genomes to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '1.h' + + // Input data + input = 'assets/samplesheet_small.csv' + + // Target + target = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/025/258/505/GCA_025258505.1_ASM2525850v1/GCA_025258505.1_ASM2525850v1_genomic.fna.gz' + targetName = 'Fusarium_asiaticum_GCA_025258505.1' + + // Do a many-to-many alignment because first query is same as target. 
+ m2m = true + + // Genome references + genome = 'R64-1-1' +} diff --git a/docs/images/Homo_sapiens_GCA_000001405.29_GRCh38.p14___Macaca_mulatta_GCA_003339765.3.o2o_plt.png b/docs/images/Homo_sapiens_GCA_000001405.29_GRCh38.p14___Macaca_mulatta_GCA_003339765.3.o2o_plt.png new file mode 100644 index 0000000..96dc49c Binary files /dev/null and b/docs/images/Homo_sapiens_GCA_000001405.29_GRCh38.p14___Macaca_mulatta_GCA_003339765.3.o2o_plt.png differ diff --git a/docs/images/mqc_base_content_summary-pct.png b/docs/images/mqc_base_content_summary-pct.png new file mode 100644 index 0000000..96538f6 Binary files /dev/null and b/docs/images/mqc_base_content_summary-pct.png differ diff --git a/docs/images/mqc_contigs_length_statistics.png b/docs/images/mqc_contigs_length_statistics.png new file mode 100644 index 0000000..34c01ba Binary files /dev/null and b/docs/images/mqc_contigs_length_statistics.png differ diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png deleted file mode 100755 index 361d0e4..0000000 Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png deleted file mode 100755 index cb39ebb..0000000 Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png deleted file mode 100755 index a4b89bf..0000000 Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ diff --git a/docs/images/mqc_last_o2o-stats.png b/docs/images/mqc_last_o2o-stats.png new file mode 100644 index 0000000..7530c07 Binary files /dev/null and b/docs/images/mqc_last_o2o-stats.png differ diff --git a/docs/images/mqc_train-stats.png b/docs/images/mqc_train-stats.png new file mode 100644 index 0000000..0c76a3e Binary files /dev/null and b/docs/images/mqc_train-stats.png differ diff --git a/docs/images/pairgenomealign-tubemap.png 
b/docs/images/pairgenomealign-tubemap.png new file mode 100644 index 0000000..6c12352 Binary files /dev/null and b/docs/images/pairgenomealign-tubemap.png differ diff --git a/docs/images/pairgenomealign-tubemap.svg b/docs/images/pairgenomealign-tubemap.svg new file mode 100644 index 0000000..d467777 --- /dev/null +++ b/docs/images/pairgenomealign-tubemap.svg @@ -0,0 +1,1559 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Indexseeds + seqtk cutN + assemblyscan + Target genome + TrainParameters + Alignquery(ies)on target + Query genome(s) + + + + fasta + + + + + + + + + + fasta + + + + + + png + + + + + png + + + + + + + + + + + + + Postprocessing + Inputs QC + Dotplots + + Alignment + + + One-to-one + + + + Many-to-one + + + + Outputs + + + + modules + + + + + v1.0 + + + + + + + + + + + + + + + + + + + nf- + core/ + + + + + + One-to-many + + + + Target genome + + + + + Query genome(s) + + + + pairgenome + align + + + + Inputs + + + + + Many-to-many + + + + + + + + + maf + + + + + maf + + + + + + + + One-tomany + + + png + + + + + + + + + + + + maf + + + + + maf + + + + + + + + Many-tomany + + + png + + + + + + + + + maf + + + + + + + maf + + + + + + Many-toone + + + + png + + + + + png + + + + + + + + maf + + + + + + + + + + + + + + + + + + + + maf + + + + + One-toone + + + + + png + + + + + png + + + + + + diff --git a/docs/output.md b/docs/output.md index cd1a4b0..f303490 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,42 +2,79 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
 - - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC +- [Alignments](#alignments) - Alignment of the _query_ genomes to the _target_ genome +- [Dot plots](#dot-plots) - Visualisation of the alignment of the _query_ genomes to the _target_ genome +- [`N` regions](#n-regions) - Coordinate of the `N` regions on the _query_ and _target_ genomes - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +Each _query_ genome is aligned to the _target_ genome, and each alignment is visualised with dot plots. The output file names are constructed by concatenating the _target_ and _query_ sample identifiers with a `___` separator (three underscores), to facilitate re-extraction of the IDs from file names. + +### Assembly statistics + +
+Output files + +- `assemblyscan/` + - `*.json` contains the statistics collected with the [`assembly-scan`](https://github.com/rpetit3/assembly-scan) software. + +
 + +Basic statistics on nucleotide content and contig length are collected for each aligned genome for later plotting with MultiQC. + +### Alignments + +
+Output files + +- `alignment/` + - `*.train` is the alignment parameters computed by `last-train` (optional) + - `*.m2m_aln.maf.gz` is the _**many-to-many**_ alignment between _target_ and _query_ genomes. (optional through the `--m2m` option) + - `*.m2o_aln.maf.gz` is the _**many-to-one**_ alignment, in which regions of the _target_ genome are matched at most once by the _query_ genome. + - `*.o2o_aln.maf.gz` is the _**one-to-one**_ alignment between the _target_ and _query_ genomes. + - `*.o2m_aln.maf.gz` is the _**one-to-many**_ alignment between the _target_ and _query_ genomes (optional). + +
 + +Genomes are aligned with [`lastal`](https://gitlab.com/mcfrith/last/-/blob/main/doc/lastal.rst) after alignment parameters have been determined with [`last-train`](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-train.rst). _**Many-to-many**_ alignments are progressively converted to _**one-to-one**_ with [`last-split`](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-split.rst). + +### Dot plots
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `alignment/` + - `*.m2m_plot` (optional) + - `*.m2o_plot` (optional) + - `*.o2o_plot` (optional) + - `*.o2m_plot` (optional)
 -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +Dot plots representing the pairwise genome alignments, produced with the [`last-dotplot`](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-dotplot.rst) tool. In the one-to-one alignment example below, the `hg38` human genome (_target_) is represented on the horizontal axis and a monkey genome (_Macaca mulatta_ accession number `GCA_003339765.3`) on the vertical axis (_query_). Regions containing unknown (`N`) sequences are on pink background. Forward (+/+) alignments are plotted in red and reverse (+/– or –/+) in blue. _Target_ (human) contigs are displayed in their original order. _Query_ contigs (monkey) are reordered and possibly reverse-complemented to diagonalise the plot as much as possible. The names of reverse-complemented contigs are printed in blue. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +![Example of a dot-plot produced by the pipeline after aligning human and macaque genomes](images/Homo_sapiens_GCA_000001405.29_GRCh38.p14___Macaca_mulatta_GCA_003339765.3.o2o_plt.png "Human–Monkey comparison") -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +### `N` regions -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +
+Output files + +- `cutn/` + - `targetGenome.bed` + - `.bed` + +
-:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +The poly-N regions longer than 9 bases in each genome sequence often indicate contig boundaries in scaffolds. Therefore, we marked them in pale red in the dot-plots. They are detected with the `seqtk cutN` command and its output (in 3-column BED format) is provided in the `cutn` directory. Sample IDs are constructed to generate file names, except for the _target_ genome which is always called `targetGenome` to avoid filename collisions. ### MultiQC @@ -48,12 +85,39 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - `multiqc_plots/`: directory containing static images from the report in various formats. + - `assemblyscan_plot_data`: GC content and contig length statistics parsed from `assemblyscan` for MultiQC with a local module. [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>. +Results generated by MultiQC collate pipeline QC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+ +The example MultiQC plots below were generated on this pipeline's full test dataset, which aligns the `hg38` human genome to other primate genomes. + +#### Base content + +The pipeline reports the base content of every query genome, like in the example below: + +![Example of a base content report for primate genomes](images/mqc_base_content_summary-pct.png "Primate genome base content") + +#### Contig length statistics + +Contig length statistics can be displayed by MultiQC as violin plots. + +![Example of a contig length report for primate genomes](images/mqc_contigs_length_statistics.png "Contig length statistics") + +#### Training parameters + +Alignment parameters computed by `last-train` can be displayed by MultiQC as violin plots. + +![Example of alignment parameters for primate genomes aligned to the human genome](images/mqc_train-stats.png "Alignment parameters") + +#### Alignment + +Alignment statistics can be displayed by MultiQC as violin plots. + +![Example of alignment statistics for primate genomes aligned to the human genome](images/mqc_last_o2o-stats.png "Alignment statistics") ### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index c8344d3..cf91932 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,58 +6,53 @@ ## Introduction - +You need at least two genomes, a _target_, which will be indexed, and one or more _queries_, which will be aligned to the _target_. Paths to the genome files for the _queries_ are passed as samplesheets through the `--input` parameter and the path to the genome file of the _target_ is passed with the `--target` parameter. Note that the computation is not symmetric: inverting _target_ and _query_ does not lead to strictly identical results. -## Samplesheet input +## Input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location.
It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +### Target genome -```bash ---input '[path to samplesheet file]' -``` +The target genome sequence is taken from a FASTA-formatted file passed by the `--target` parameter. Its masking information (sequences in lower-case letters) is first discarded, and then simple repeats (like `cacacacacacacacac`) are converted to lower-case (`lastdb -R01`). The lowercased letters in the _target_ **and** in the _query_ will be excluded for initial matches (`lastdb -c`). Both strands of the genome are indexed (`lastdb -S2`). -### Multiple runs of the same sample +### Samplesheet for query genome(s) -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the `--input` parameter to specify its location. It has to be a comma-separated file with 2 columns, a header row and single or multiple sample rows (genome samples) as shown in the examples below. + +First, prepare a samplesheet with your input data that looks as follows: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +sample,fasta +Query_1,query1_assembly.fasta +Query_2,query2_assembly.fasta +… ``` -### Full samplesheet +Each row represents a fasta file. Use multiple rows as in the example above to accommodate multiple query genomes.
-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +| Column | Description | +| -------- | -------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. Spaces in sample names are automatically converted to underscores (`_`). | +| `fasta` | Full path to Fasta/fa/gz file | -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +An [example samplesheet](../assets/samplesheet_full.csv) has been provided with the pipeline. -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` +## Options + +Please see the [parameter documentation](https://nf-co.re/pairgenomealign/parameters) for details. + +## Fixed arguments (taken from the [LAST cookbook][] and the [LAST tuning][] manual) -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). 
| -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +[LAST cookbook]: https://gitlab.com/mcfrith/last/-/blob/main/doc/last-cookbook.rst +[LAST tuning]: https://gitlab.com/mcfrith/last/-/blob/main/doc/last-tuning.rst -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +- The `last-train` command runs with `--revsym` as the DNA strands play equivalent roles in the studied genomes. ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/pairgenomealign --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/pairgenomealign --target ./target_genome_file.fa --input ./samplesheet.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
@@ -144,6 +139,14 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `test` - A profile with a complete configuration for automated testing - Includes links to test data so needs no other parameters +- `test_small` + - A profile with a complete configuration for small-scale testing + - Includes links to two fungal genomes at NCBI so needs no other parameters + - Should take less than 5 min to run and produce meaningful plots +- `test_full` + - A profile with a complete configuration for automated testing at large-scale + - Includes links to primate genomes at NCBI so needs no other parameters + - Requires larger computational power, useful for stress-testing and real-scale example results - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/) - `singularity` diff --git a/main.nf b/main.nf index 330eb0d..e220d45 100644 --- a/main.nf +++ b/main.nf @@ -29,7 +29,6 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_pair ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file // This is an example of how to use getGenomeAttribute() to fetch parameters // from igenomes.config using `--genome` params.fasta = getGenomeAttribute('fasta') @@ -47,6 +46,7 @@ workflow NFCORE_PAIRGENOMEALIGN { take: samplesheet // channel: samplesheet read in from --input + target_genome // channel: genome file read in from --target main: @@ -54,7 +54,8 @@ workflow NFCORE_PAIRGENOMEALIGN { // WORKFLOW: Run pipeline // PAIRGENOMEALIGN ( - samplesheet + samplesheet, + target_genome ) emit: @@ -84,11 +85,18 @@ workflow { params.input ) + channel + .value( params.target ) + .map { filename -> file(filename, checkIfExists: true) } + .map { file_obj -> [ [id:params.targetName], file_obj] } + .set { ch_target } + // // WORKFLOW: Run main workflow // NFCORE_PAIRGENOMEALIGN ( -
PIPELINE_INITIALISATION.out.samplesheet + PIPELINE_INITIALISATION.out.samplesheet, + ch_target ) // diff --git a/modules.json b/modules.json index 1c35468..e5d35c7 100644 --- a/modules.json +++ b/modules.json @@ -5,14 +5,54 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { - "fastqc": { + "assemblyscan": { "branch": "master", - "git_sha": "285a50500f9e02578d90b3ce6382ea3c30216acd", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gfastats": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "last/dotplot": { + "branch": "master", + "git_sha": "23a928df77b20861eac09ca998029ad47a7155cb", + "installed_by": ["modules"] + }, + "last/lastal": { + "branch": "master", + "git_sha": "882e20c8a18270f0d391a931cef4b80d1a0eeea5", + "installed_by": ["modules"] + }, + "last/lastdb": { + "branch": "master", + "git_sha": "3fa9017b55b9c26e1c327ca189d3942b55f4d496", + "installed_by": ["modules"] + }, + "last/mafswap": { + "branch": "master", + "git_sha": "3fa9017b55b9c26e1c327ca189d3942b55f4d496", + "installed_by": ["modules"] + }, + "last/split": { + "branch": "master", + "git_sha": "882e20c8a18270f0d391a931cef4b80d1a0eeea5", + "installed_by": ["modules"] + }, + "last/train": { + "branch": "master", + "git_sha": "4e5f4687318f24ba944a13609d3ea6ebd890737d", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", + "git_sha": "b80f5fd12ff7c43938f424dd76392a2704fa2396", + "installed_by": ["modules"] + }, + "seqtk/cutn": { + "branch": "master", + "git_sha": "7f88aae93c69586c0789322b77743ee0ef469502", "installed_by": ["modules"] } } diff --git a/modules/local/multiqc_assemblyscan_plot_data.nf b/modules/local/multiqc_assemblyscan_plot_data.nf new file mode 100644 index 0000000..a3fa099 --- /dev/null +++ b/modules/local/multiqc_assemblyscan_plot_data.nf @@ -0,0 +1,56 @@ +process 
MULTIQC_ASSEMBLYSCAN_PLOT_DATA { + label 'process_single' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/jq:1.6': + 'biocontainers/jq:1.6' }" + + // This module parses the JSON output of the assemblyscan module with jq to extract + // statistics about GC content and contig length. I do not know how to contribute + // this as a proper MultiQC module but feel free to do so! + + input: + path(json) + + output: + path "*_mqc.tsv", emit: tsv + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + echo "# id: 'base_content_summary'" > gc_summary_mqc.tsv + echo "# section_name: 'Base frequency'" >> gc_summary_mqc.tsv + echo "# format: 'tsv'" >> gc_summary_mqc.tsv + echo "# plot_type: 'bargraph'" >> gc_summary_mqc.tsv + echo "# description: 'This plot shows a brief summary of each base content/percentage in the query genomes'" >> gc_summary_mqc.tsv + echo "# pconfig:" >> gc_summary_mqc.tsv + echo "# id: 'base content summary'" >> gc_summary_mqc.tsv + echo "# title: 'per_base content and percentage'" >> gc_summary_mqc.tsv + echo "# ylab: ''" >> gc_summary_mqc.tsv + echo "id\tpercent_A\tpercent_C\tpercent_G\tpercent_T\tpercent_N\tcontig_non_ACGTN" >> gc_summary_mqc.tsv + for i in $json + do + printf "\$(basename \$i .json)\t" >> gc_summary_mqc.tsv + jq -r '[.contig_percent_a, .contig_percent_c, .contig_percent_g, .contig_percent_t, .contig_percent_n, .contig_non_acgtn] | @tsv' \$i >> gc_summary_mqc.tsv + done + + echo "# id: 'contigs_length_statistics'" > contig_length_mqc.tsv + echo "# section_name: 'Contig length statistics'" >> contig_length_mqc.tsv + echo "# format: 'tsv'" >> contig_length_mqc.tsv + echo "# plot_type: 'table'" >> contig_length_mqc.tsv + echo "# description: 'This plot shows a short statistics abouth the length of contigs in the query genomes'" >> 
contig_length_mqc.tsv + echo "# pconfig:" >> contig_length_mqc.tsv + echo "# id: 'contigs length statistics'" >> contig_length_mqc.tsv + echo "# title: 'contigs length statistics'" >> contig_length_mqc.tsv + echo "# ylab: 'length'" >> contig_length_mqc.tsv + echo "id\tTOTALcontiglen\tMINcontiglen\tMAXcontiglen\ttotalcontigs\tcontigs>1k\tcontigs>10k" >> contig_length_mqc.tsv + for i in $json + do + printf "\$(basename \$i .json)\t" >> contig_length_mqc.tsv + jq -r '[.total_contig_length, .min_contig_length, .max_contig_length, .total_contig, .contigs_greater_1k, .contigs_greater_10k] | @tsv' \$i >> contig_length_mqc.tsv + done + """ +} diff --git a/modules/nf-core/assemblyscan/environment.yml b/modules/nf-core/assemblyscan/environment.yml new file mode 100644 index 0000000..34a02bc --- /dev/null +++ b/modules/nf-core/assemblyscan/environment.yml @@ -0,0 +1,7 @@ +name: assemblyscan +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::assembly-scan=0.4.1 diff --git a/modules/nf-core/assemblyscan/main.nf b/modules/nf-core/assemblyscan/main.nf new file mode 100644 index 0000000..5a3dff4 --- /dev/null +++ b/modules/nf-core/assemblyscan/main.nf @@ -0,0 +1,31 @@ +process ASSEMBLYSCAN { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/assembly-scan:0.4.1--pyhdfd78af_0' : + 'biocontainers/assembly-scan:0.4.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(assembly) + + output: + tuple val(meta), path("*.json"), emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + assembly-scan $assembly > ${prefix}.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + assemblyscan: \$( assembly-scan --version 2>&1 | sed 's/^.*assembly-scan //; s/Using.*\$//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/assemblyscan/meta.yml b/modules/nf-core/assemblyscan/meta.yml new file mode 100644 index 0000000..9ff7e3f --- /dev/null +++ b/modules/nf-core/assemblyscan/meta.yml @@ -0,0 +1,42 @@ +name: assemblyscan +description: Assembly summary statistics in JSON format +keywords: + - assembly + - statistics +tools: + - assemblyscan: + description: Assembly summary statistics in JSON format + homepage: https://github.com/rpetit3/assembly-scan + documentation: https://github.com/rpetit3/assembly-scan + tool_dev_url: https://github.com/rpetit3/assembly-scan + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - assembly: + type: file + description: FASTA file for a given assembly + pattern: "*.fasta" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - json: + type: file + description: Assembly statistics in JSON format + pattern: "*.json" +authors: + - "@sateeshperi" + - "@mjcipriano" +maintainers: + - "@sateeshperi" + - "@mjcipriano" diff --git a/modules/nf-core/assemblyscan/tests/main.nf.test b/modules/nf-core/assemblyscan/tests/main.nf.test new file mode 100644 index 0000000..7e5a7bb --- /dev/null +++ b/modules/nf-core/assemblyscan/tests/main.nf.test @@ -0,0 +1,34 @@ +nextflow_process { + + name "Test Process ASSEMBLYSCAN" + script "../main.nf" + process "ASSEMBLYSCAN" + tag "modules" + tag "modules_nfcore" + tag "assemblyscan" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/assemblyscan/tests/main.nf.test.snap b/modules/nf-core/assemblyscan/tests/main.nf.test.snap new file mode 100644 index 0000000..b459af8 --- /dev/null +++ b/modules/nf-core/assemblyscan/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.json:md5,9140e3d43f2d676f62e1325ace5dd8bd" + ] + ], + "1": [ + "versions.yml:md5,406f16dbbbf8f98709caafe49ca10e21" + ], + "json": [ + [ + { + "id": "test", + "single_end": false + }, + "test.json:md5,9140e3d43f2d676f62e1325ace5dd8bd" + ] + ], + "versions": [ + "versions.yml:md5,406f16dbbbf8f98709caafe49ca10e21" + ] + } + ], + "timestamp": "2023-10-18T15:04:38.644743625" + } +} \ No newline at end of file diff --git a/modules/nf-core/assemblyscan/tests/tags.yml 
b/modules/nf-core/assemblyscan/tests/tags.yml new file mode 100644 index 0000000..b037942 --- /dev/null +++ b/modules/nf-core/assemblyscan/tests/tags.yml @@ -0,0 +1,2 @@ +assemblyscan: + - modules/nf-core/assemblyscan/** diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf deleted file mode 100644 index d79f1c8..0000000 --- a/modules/nf-core/fastqc/main.nf +++ /dev/null @@ -1,61 +0,0 @@ -process FASTQC { - tag "$meta.id" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : - 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // Make list of old name and new name pairs to use for renaming in the bash while loop - def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } - def rename_to = old_new_pairs*.join(' ').join(' ') - def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') - - def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB') - // FastQC memory value allowed range (100 - 10000) - def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 
100 : memory_in_mb) - - """ - printf "%s %s\\n" $rename_to | while read old_name new_name; do - [ -f "\${new_name}" ] || ln -s \$old_name \$new_name - done - - fastqc \\ - $args \\ - --threads $task.cpus \\ - --memory $fastqc_memory \\ - $renamed_files - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.html - touch ${prefix}.zip - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml deleted file mode 100644 index ee5507e..0000000 --- a/modules/nf-core/fastqc/meta.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: fastqc -description: Run FastQC on sequenced reads -keywords: - - quality control - - qc - - adapters - - fastq -tools: - - fastqc: - description: | - FastQC gives general quality metrics about your reads. - It provides information about the quality score distribution - across your reads, the per base sequence content (%A/C/G/T). - You get information about adapter contamination and other - overrepresented sequences. - homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ - documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ - licence: ["GPL-2.0-only"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - html: - type: file - description: FastQC report - pattern: "*_{fastqc.html}" - - zip: - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" -maintainers: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test deleted file mode 100644 index 70edae4..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ /dev/null @@ -1,212 +0,0 @@ -nextflow_process { - - name "Test Process FASTQC" - script "../main.nf" - process "FASTQC" - - tag "modules" - tag "modules_nfcore" - tag "fastqc" - - test("sarscov2 single-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. - // looks like this:
Mon 2 Oct 2023
test.gz
- // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_single") } - ) - } - } - - test("sarscov2 paired-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_paired") } - ) - } - } - - test("sarscov2 interleaved [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert 
snapshot(process.out.versions).match("fastqc_versions_interleaved") } - ) - } - } - - test("sarscov2 paired-end [bam]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_bam") } - ) - } - } - - test("sarscov2 multiple [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, - { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, - { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, - { assert 
path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][2]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][3]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_multiple") } - ) - } - } - - test("sarscov2 custom_prefix") { - - when { - process { - """ - input[0] = Channel.of([ - [ id:'mysample', single_end:true ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_custom_prefix") } - ) - } - } - - test("sarscov2 single-end [fastq] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out.html.collect { file(it[1]).getName() } + - process.out.zip.collect { file(it[1]).getName() } + - process.out.versions ).match("fastqc_stub") } - ) - } - } - -} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap deleted file mode 100644 index 86f7c31..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ /dev/null @@ -1,88 +0,0 @@ -{ - "fastqc_versions_interleaved": { - "content": [ - [ - 
"versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:07.293713" - }, - "fastqc_stub": { - "content": [ - [ - "test.html", - "test.zip", - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:31:01.425198" - }, - "fastqc_versions_multiple": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:55.797907" - }, - "fastqc_versions_bam": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:26.795862" - }, - "fastqc_versions_single": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:39:27.043675" - }, - "fastqc_versions_paired": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:39:47.584191" - }, - "fastqc_versions_custom_prefix": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:41:14.576531" - } -} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml deleted file mode 100644 index 7834294..0000000 --- a/modules/nf-core/fastqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -fastqc: - - modules/nf-core/fastqc/** diff --git a/modules/nf-core/gfastats/environment.yml b/modules/nf-core/gfastats/environment.yml new file mode 100644 index 0000000..1c875ce --- /dev/null +++ 
b/modules/nf-core/gfastats/environment.yml @@ -0,0 +1,7 @@ +name: gfastats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gfastats=1.3.6 diff --git a/modules/nf-core/gfastats/main.nf b/modules/nf-core/gfastats/main.nf new file mode 100644 index 0000000..8db239a --- /dev/null +++ b/modules/nf-core/gfastats/main.nf @@ -0,0 +1,66 @@ +process GFASTATS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gfastats:1.3.6--hdcf5f25_3': + 'biocontainers/gfastats:1.3.6--hdcf5f25_3' }" + + input: + tuple val(meta), path(assembly) // input.[fasta|fastq|gfa][.gz] + val out_fmt // output format (fasta/fastq/gfa) + val genome_size // estimated genome size for NG* statistics (optional). + val target // target specific sequence by header, optionally with coordinates (optional). + path agpfile // -a --agp-to-path converts input agp to path and replaces existing paths. + path include_bed // -i --include-bed generates output on a subset list of headers or coordinates in 0-based bed format. + path exclude_bed // -e --exclude-bed opposite of --include-bed. They can be combined (no coordinates). + path instructions // -k --swiss-army-knife set of instructions provided as an ordered list. + + output: + tuple val(meta), path("*.assembly_summary"), emit: assembly_summary + tuple val(meta), path("*.${out_fmt}.gz") , emit: assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def agp = agpfile ? "--agp-to-path $agp" : "" + def ibed = include_bed ? "--include-bed $include_bed" : "" + def ebed = exclude_bed ? "--exclude-bed $exclude_bed" : "" + def sak = instructions ? 
"--swiss-army-knife $instructions" : "" + """ + gfastats \\ + $args \\ + --threads $task.cpus \\ + $agp \\ + $ibed \\ + $ebed \\ + $sak \\ + --out-format ${prefix}.${out_fmt}.gz \\ + $assembly \\ + $genome_size \\ + $target \\ + > ${prefix}.assembly_summary + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gfastats: \$( gfastats -v | sed '1!d;s/.*v//' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${out_fmt}.gz + touch ${prefix}.assembly_summary + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gfastats: \$( gfastats -v | sed '1!d;s/.*v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gfastats/meta.yml b/modules/nf-core/gfastats/meta.yml new file mode 100644 index 0000000..d0e97a8 --- /dev/null +++ b/modules/nf-core/gfastats/meta.yml @@ -0,0 +1,72 @@ +name: "gfastats" +description: | + A single fast and exhaustive tool for summary statistics and simultaneous *fa* + (fasta, fastq, gfa [.gz]) genome assembly file manipulation. +keywords: + - gfastats + - fasta + - genome assembly + - genome summary + - genome manipulation + - genome statistics +tools: + - "gfastats": + description: "The swiss army knife for genome assembly." + homepage: "https://github.com/vgl-hub/gfastats" + documentation: "https://github.com/vgl-hub/gfastats/tree/main/instructions" + tool_dev_url: "https://github.com/vgl-hub/gfastats" + doi: "10.1093/bioinformatics/btac460" + licence: "['MIT']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - assembly: + type: file + description: Draft assembly file + pattern: "*.{fasta,fastq,gfa}(.gz)?" + - out_fmt: + type: string + description: Output format (fasta, fastq, gfa) + - genome_size: + type: integer + description: estimated genome size (bp) for NG* statistics (optional). 
+ - target: + type: string + description: target specific sequence by header, optionally with coordinates (optional). + - agpfile: + type: file + description: converts input agp to path and replaces existing paths. + - include_bed: + type: file + description: generates output on a subset list of headers or coordinates in 0-based bed format. + - exclude_bed: + type: file + description: opposite of --include-bed. They can be combined (no coordinates). + - instructions: + type: file + description: set of instructions provided as an ordered list. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - assembly_summary: + type: file + description: Assembly summary statistics file + pattern: "*.assembly_summary" + - assembly: + type: file + description: The assembly as modified by gfastats + pattern: "*.{fasta,fastq,gfa}.gz" +authors: + - "@mahesh-panchal" +maintainers: + - "@mahesh-panchal" diff --git a/modules/nf-core/last/dotplot/environment.yml b/modules/nf-core/last/dotplot/environment.yml new file mode 100644 index 0000000..b013d4d --- /dev/null +++ b/modules/nf-core/last/dotplot/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "last_dotplot" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::last=1542" diff --git a/modules/nf-core/last/dotplot/main.nf b/modules/nf-core/last/dotplot/main.nf new file mode 100644 index 0000000..93f311d --- /dev/null +++ b/modules/nf-core/last/dotplot/main.nf @@ -0,0 +1,56 @@ +process LAST_DOTPLOT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/last:1542--h43eeafb_1' : + 'biocontainers/last:1542--h43eeafb_1' }" + + input: + tuple val(meta), path(maf), path(annot_b) + tuple val(meta2), path(annot_a) + val(format) + + output: + tuple val(meta), path("*.gif"), optional:true, emit: gif + tuple val(meta), path("*.png"), optional:true, emit: png + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def annot_a_arg = annot_a ? "-a ${annot_a}" : '' + def annot_b_arg = annot_b ? "-b ${annot_b}" : '' + """ + last-dotplot \\ + $args \\ + $annot_a_arg \\ + $annot_b_arg \\ + $maf \\ + $prefix.$format + + # last-dotplot has no --version option so let's use lastal from the same suite + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastal --version | sed 's/lastal //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch $prefix.$format + + # last-dotplot has no --version option so let's use lastal from the same suite + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastal --version | sed 's/lastal //') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/last/dotplot/meta.yml b/modules/nf-core/last/dotplot/meta.yml new file mode 100644 index 0000000..5243743 --- /dev/null +++ b/modules/nf-core/last/dotplot/meta.yml @@ -0,0 +1,60 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "last_dotplot" +description: Makes a dotplot (Oxford Grid) of pair-wise sequence alignments +keywords: + - LAST + - plot + - pair + - alignment + - MAF +tools: + - "last": + description: "LAST finds & aligns related regions of sequences." 
+ homepage: "https://gitlab.com/mcfrith/last" + documentation: "https://gitlab.com/mcfrith/last/-/blob/main/doc/last-dotplot.rst" + tool_dev_url: "https://gitlab.com/mcfrith/last" + licence: ["GPL v3-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - maf: + type: file + description: Multiple Aligment Format (MAF) file, compressed with gzip + pattern: "*.{maf.gz}" + - format: + type: string + description: Output format (PNG or GIF). + - annot_a: + type: file + description: Annotation file in BED, Repeamasker, genePred or AGP format for the first (horizontal) sequence + pattern: "*.{bed,bed.gz,out,out.gz,rmsk.txt,rmsk.txt.gz,genePred,genePred.gz,gff,gff.gz,gtf,gtf.gz,gap.txt,gap.txt.gz}" + - annot_b: + type: file + description: Annotation file in BED, Repeamasker, genePred or AGP format for the second (vertical) sequence + pattern: "*.{bed,bed.gz,out,out.gz,rmsk.txt,rmsk.txt.gz,genePred,genePred.gz,gff,gff.gz,gtf,gtf.gz,gap.txt,gap.txt.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - png: + type: file + description: Pairwise alignment dot plot image, in PNG format. + pattern: "*.png" + - gif: + type: file + description: Pairwise alignment dot plot image, in GIF format. 
+ pattern: "*.gif" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@charles-plessy" +maintainers: + - "@charles-plessy" diff --git a/modules/nf-core/last/dotplot/tests/main.nf.test b/modules/nf-core/last/dotplot/tests/main.nf.test new file mode 100644 index 0000000..fa5f767 --- /dev/null +++ b/modules/nf-core/last/dotplot/tests/main.nf.test @@ -0,0 +1,95 @@ +nextflow_process { + + name "Test Process LAST_DOTPLOT" + script "../main.nf" + process "LAST_DOTPLOT" + + tag "modules" + tag "modules_nfcore" + tag "last" + tag "last/dotplot" + + test("sarscov2 - contigs - genome - png") { + + when { + process { + """ + input[0] = channel.of('NODE_1_length_20973_cov_191.628754\t2000\t2010') + . collectFile(name: 'dummy_annot_b.bed', newLine: true) + . map { [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true), + it + ] } + input[1] = channel.of('MT192765.1\t1000\t1010') + . collectFile(name: 'dummy_annot_a.bed', newLine: true) + . 
map { [ [ id:'test'], it ] } + input[2] = channel.of("png") + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert process.out.png.get(0).get(1).endsWith("test.png") } + ) + } + + } + + test("sarscov2 - contigs - genome - gif") { + // Test a different output format and absence of annotation files + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true), + [] + ] + input[1] = [ [id: 'test'], [] ] + input[2] = "gif" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert process.out.gif.get(0).get(1).endsWith("test.gif") } + ) + } + + } + + test("sarscov2 - contigs - genome - png - stub") { + + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true), + [] + ] + input[1] = [ [id: 'test'], [] ] + input[2] = "png" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert process.out.png.get(0).get(1).endsWith("test.png") } + ) + } + + } + +} diff --git a/modules/nf-core/last/dotplot/tests/main.nf.test.snap b/modules/nf-core/last/dotplot/tests/main.nf.test.snap new file mode 100644 index 0000000..cff5b56 --- /dev/null +++ b/modules/nf-core/last/dotplot/tests/main.nf.test.snap @@ -0,0 +1,38 @@ +{ + "sarscov2 - contigs - genome - gif": { + "content": [ + [ + "versions.yml:md5,8a86fe4a0227c77ecfcc0aa21a3ece07" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-27T09:19:30.116358" + }, + "sarscov2 - contigs - genome - png - stub": { + "content": [ + [ + "versions.yml:md5,8a86fe4a0227c77ecfcc0aa21a3ece07" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + 
"timestamp": "2024-06-27T09:19:46.588825" + }, + "sarscov2 - contigs - genome - png": { + "content": [ + [ + "versions.yml:md5,8a86fe4a0227c77ecfcc0aa21a3ece07" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-27T09:19:08.959252" + } +} \ No newline at end of file diff --git a/modules/nf-core/last/dotplot/tests/tags.yml b/modules/nf-core/last/dotplot/tests/tags.yml new file mode 100644 index 0000000..e75c193 --- /dev/null +++ b/modules/nf-core/last/dotplot/tests/tags.yml @@ -0,0 +1,2 @@ +last/dotplot: + - "modules/nf-core/last/dotplot/**" diff --git a/modules/nf-core/last/lastal/environment.yml b/modules/nf-core/last/lastal/environment.yml new file mode 100644 index 0000000..c3a87c6 --- /dev/null +++ b/modules/nf-core/last/lastal/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "last_lastal" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::last=1542" diff --git a/modules/nf-core/last/lastal/main.nf b/modules/nf-core/last/lastal/main.nf new file mode 100644 index 0000000..560ada1 --- /dev/null +++ b/modules/nf-core/last/lastal/main.nf @@ -0,0 +1,77 @@ +process LAST_LASTAL { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/last:1542--h43eeafb_1' : + 'biocontainers/last:1542--h43eeafb_1' }" + + input: + tuple val(meta), path(fastx), path (param_file) + path index + + output: + tuple val(meta), path("*.maf.gz"), emit: maf + tuple val(meta), path("*.tsv") , emit: multiqc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def trained_params = param_file ? 
"-p ${param_file}" : '' + """ + INDEX_NAME=\$(basename \$(ls $index/*.des) .des) + set -o pipefail + + function calculate_psl_metrics() { + awk 'BEGIN { + FS="\t"; # Set field separator as tab + totalMatches = 0; + totalAlignmentLength = 0; + print "Sample\tTotalAlignmentLength\tPercentSimilarity"; # Header for MultiQC + } + { + totalMatches += \$1 + \$3; # Sum matches and repMatches + totalAlignmentLength += \$1 + \$2 + \$3 + \$6 + \$8; # Sum matches, misMatches, repMatches, qBaseInsert, and tBaseInsert + } + END { + percentSimilarity = (totalAlignmentLength > 0) ? (totalMatches / totalAlignmentLength * 100) : 0; + print "$meta.id" "\t" totalAlignmentLength "\t" percentSimilarity; # Data in TSV format + }' + } + + lastal \\ + -P $task.cpus \\ + $trained_params \\ + $args \\ + ${index}/\$INDEX_NAME \\ + $fastx | + tee >(gzip --no-name > ${prefix}.maf.gz) | + maf-convert psl | + calculate_psl_metrics > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastal --version 2>&1 | sed 's/lastal //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def trained_params = param_file ? 
"-p ${param_file}" : '' + """ + INDEX_NAME=STUB + echo stub | gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastal --version 2>&1 | sed 's/lastal //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/last/lastal/meta.yml b/modules/nf-core/last/lastal/meta.yml new file mode 100644 index 0000000..c14fa27 --- /dev/null +++ b/modules/nf-core/last/lastal/meta.yml @@ -0,0 +1,56 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "last_lastal" +description: Aligns query sequences to target sequences indexed with lastdb +keywords: + - LAST + - align + - fastq + - fasta +tools: + - "last": + description: "LAST finds & aligns related regions of sequences." + homepage: "https://gitlab.com/mcfrith/last" + documentation: "https://gitlab.com/mcfrith/last/-/blob/main/doc/last-train.rst" + tool_dev_url: "https://gitlab.com/mcfrith/last" + licence: ["GPL v3-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - fastx: + type: file + description: FASTA/FASTQ file + pattern: "*.{fasta,fastq}" + - param_file: + type: file + description: Trained parameter file + pattern: "*.train" + - index: + type: directory + description: Directory containing the files of the LAST index + pattern: "lastdb/" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - maf: + type: file + description: Gzipped MAF (Multiple Alignment Format) file + pattern: "*.{maf.gz}" + - multiqc: + type: file + description: Alignment summary for MultiQC + pattern: "*.tsv" +authors: + - "@charles-plessy" +maintainers: + - "@charles-plessy" diff --git a/modules/nf-core/last/lastal/tests/main.nf.test b/modules/nf-core/last/lastal/tests/main.nf.test new file mode 100644 index 0000000..b5c0730 --- /dev/null +++ b/modules/nf-core/last/lastal/tests/main.nf.test @@ -0,0 +1,130 @@ +nextflow_process { + + name "Test Process LAST_LASTAL" + script "../main.nf" + process "LAST_LASTAL" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "last" + tag "last/lastal" + tag "untar" + + test("sarscov2 - contigs - genome") { + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [id:'genome'], // meta map + file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'contigs', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true), + [] + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - contigs - genome - withparams") { + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [id:'genome'], // meta map + file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'contigs', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true), + 
file(params.test_data['sarscov2']['genome']['contigs_genome_par'], checkIfExists: true) + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - contigs - genome - stub") { + + options "-stub" + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [id:'genome'], // meta map + file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'contigs', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true), + [] + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/last/lastal/tests/main.nf.test.snap b/modules/nf-core/last/lastal/tests/main.nf.test.snap new file mode 100644 index 0000000..9245a96 --- /dev/null +++ b/modules/nf-core/last/lastal/tests/main.nf.test.snap @@ -0,0 +1,161 @@ +{ + "sarscov2 - contigs - genome": { + "content": [ + { + "0": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f" + ] + ], + "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74" + ] + ], + "2": [ + "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" + ], + "maf": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f" + ] + ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74" + ] + ], + "versions": [ + "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": 
"2024-07-02T17:57:48.589408" + }, + "sarscov2 - contigs - genome - stub": { + "content": [ + { + "0": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.STUB.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" + ] + ], + "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" + ], + "maf": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.STUB.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" + ] + ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-02T17:58:30.521811" + }, + "sarscov2 - contigs - genome - withparams": { + "content": [ + { + "0": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d" + ] + ], + "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200" + ] + ], + "2": [ + "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" + ], + "maf": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d" + ] + ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200" + ] + ], + "versions": [ + "versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-02T17:58:09.677672" + } +} \ No newline at end of file diff --git a/modules/nf-core/last/lastal/tests/nextflow.config b/modules/nf-core/last/lastal/tests/nextflow.config new file mode 100644 index 0000000..2326423 --- /dev/null +++ b/modules/nf-core/last/lastal/tests/nextflow.config @@ -0,0 +1,7 
@@ +process { + + withName: LAST_LASTAL { + ext.args = '-P1' + } + +} diff --git a/modules/nf-core/last/lastal/tests/tags.yml b/modules/nf-core/last/lastal/tests/tags.yml new file mode 100644 index 0000000..f99c637 --- /dev/null +++ b/modules/nf-core/last/lastal/tests/tags.yml @@ -0,0 +1,2 @@ +last/lastal: + - "modules/nf-core/last/lastal/**" diff --git a/modules/nf-core/last/lastdb/environment.yml b/modules/nf-core/last/lastdb/environment.yml new file mode 100644 index 0000000..9e98a10 --- /dev/null +++ b/modules/nf-core/last/lastdb/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "last_lastdb" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::last=1542" diff --git a/modules/nf-core/last/lastdb/main.nf b/modules/nf-core/last/lastdb/main.nf new file mode 100644 index 0000000..856b364 --- /dev/null +++ b/modules/nf-core/last/lastdb/main.nf @@ -0,0 +1,55 @@ +process LAST_LASTDB { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/last:1542--h43eeafb_1' : + 'biocontainers/last:1542--h43eeafb_1' }" + + input: + tuple val(meta), path(fastx) + + output: + tuple val(meta), path("lastdb"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir lastdb + lastdb \\ + $args \\ + -P $task.cpus \\ + lastdb/${prefix} \\ + $fastx + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastdb --version 2>&1 | sed 's/lastdb //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir lastdb + touch lastdb/${prefix}.bck + touch lastdb/${prefix}.des + touch lastdb/${prefix}.prj + touch lastdb/${prefix}.sds + touch lastdb/${prefix}.ssp + touch lastdb/${prefix}.suf + touch lastdb/${prefix}.tis + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastdb --version 2>&1 | sed 's/lastdb //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/last/lastdb/meta.yml b/modules/nf-core/last/lastdb/meta.yml new file mode 100644 index 0000000..c3b499e --- /dev/null +++ b/modules/nf-core/last/lastdb/meta.yml @@ -0,0 +1,45 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "last_lastdb" +description: Prepare sequences for subsequent alignment with lastal. +keywords: + - LAST + - index + - fasta + - fastq +tools: + - "last": + description: "LAST finds & aligns related regions of sequences." + homepage: "https://gitlab.com/mcfrith/last" + documentation: "https://gitlab.com/mcfrith/last/-/blob/main/doc/lastdb.rst" + tool_dev_url: "https://gitlab.com/mcfrith/last" + licence: ["GPL v3-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - fastx: + type: file + description: > + Sequence file in FASTA or FASTQ format. May be compressed with gzip. + pattern: "*.{fasta,fasta.gz,fastq,fastq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - index: + type: directory + description: directory containing the files of the LAST index + pattern: "lastdb/" +authors: + - "@charles-plessy" +maintainers: + - "@charles-plessy" diff --git a/modules/nf-core/last/lastdb/tests/main.nf.test b/modules/nf-core/last/lastdb/tests/main.nf.test new file mode 100644 index 0000000..2e3aa2b --- /dev/null +++ b/modules/nf-core/last/lastdb/tests/main.nf.test @@ -0,0 +1,80 @@ +nextflow_process { + + name "Test Process LAST_LASTDB" + script "../main.nf" + process "LAST_LASTDB" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "last" + tag "last/lastdb" + + test("sarscov2 - fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fastq gzipped") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fasta stub") { + + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/last/lastdb/tests/main.nf.test.snap b/modules/nf-core/last/lastdb/tests/main.nf.test.snap new file mode 100644 index 0000000..ee2a113 --- /dev/null +++ b/modules/nf-core/last/lastdb/tests/main.nf.test.snap @@ -0,0 +1,149 @@ +{ + "sarscov2 - fasta stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.bck:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.des:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.prj:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sds:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.ssp:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.suf:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tis:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,c7fa6cfa252a61c600a73d6341d47557" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "test.bck:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.des:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.prj:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.sds:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.ssp:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.suf:md5,d41d8cd98f00b204e9800998ecf8427e", + "test.tis:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,c7fa6cfa252a61c600a73d6341d47557" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-06T16:42:08.735561" + }, + "sarscov2 - fastq gzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.bck:md5,05b14d8ac418b3193d9cc921086cea05", + "test.des:md5,26ab49015cc572172b9efa50fc5190bc", + "test.prj:md5,b50003077b7c7357fb8bacdf6f87653d", + "test.sds:md5,d3deb4c985081c9f5ad6684d405bd20b", + "test.ssp:md5,5c17139a9022b0cb97f007146fa1c6da", + "test.suf:md5,9ac359afa86a8964d81a87a1d4f05ef0", + "test.tis:md5,d57a3a5f7e3e036807356c15bd3aad97" + ] + ] + ], + "1": [ + "versions.yml:md5,c7fa6cfa252a61c600a73d6341d47557" + ], + "index": [ + [ + { + "id": 
"test" + }, + [ + "test.bck:md5,05b14d8ac418b3193d9cc921086cea05", + "test.des:md5,26ab49015cc572172b9efa50fc5190bc", + "test.prj:md5,b50003077b7c7357fb8bacdf6f87653d", + "test.sds:md5,d3deb4c985081c9f5ad6684d405bd20b", + "test.ssp:md5,5c17139a9022b0cb97f007146fa1c6da", + "test.suf:md5,9ac359afa86a8964d81a87a1d4f05ef0", + "test.tis:md5,d57a3a5f7e3e036807356c15bd3aad97" + ] + ] + ], + "versions": [ + "versions.yml:md5,c7fa6cfa252a61c600a73d6341d47557" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-06T16:41:07.4512" + }, + "sarscov2 - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test.bck:md5,157526d333b88523cb15ac4efe00738f", + "test.des:md5,3a9ea6d336e113a74d7fdca5e7b623fc", + "test.prj:md5,b937b1565cb4c983c8fcd3780d3e151e", + "test.sds:md5,e7729db27ac7a5a109c9d48cfcdc9015", + "test.ssp:md5,53524efdea3d8989201419a29e81ec1f", + "test.suf:md5,ef7482260705bb8146acbbbdce6c0068", + "test.tis:md5,b7c40f06b1309dc6f37849eeb86dfd22" + ] + ] + ], + "1": [ + "versions.yml:md5,c7fa6cfa252a61c600a73d6341d47557" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "test.bck:md5,157526d333b88523cb15ac4efe00738f", + "test.des:md5,3a9ea6d336e113a74d7fdca5e7b623fc", + "test.prj:md5,b937b1565cb4c983c8fcd3780d3e151e", + "test.sds:md5,e7729db27ac7a5a109c9d48cfcdc9015", + "test.ssp:md5,53524efdea3d8989201419a29e81ec1f", + "test.suf:md5,ef7482260705bb8146acbbbdce6c0068", + "test.tis:md5,b7c40f06b1309dc6f37849eeb86dfd22" + ] + ] + ], + "versions": [ + "versions.yml:md5,c7fa6cfa252a61c600a73d6341d47557" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-06T16:40:05.874954" + } +} \ No newline at end of file diff --git a/modules/nf-core/last/lastdb/tests/nextflow.config b/modules/nf-core/last/lastdb/tests/nextflow.config new file mode 100644 index 0000000..50628ec --- /dev/null +++ b/modules/nf-core/last/lastdb/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + 
withName: LAST_LASTDB { + ext.args = '-Q0' + } + +} diff --git a/modules/nf-core/last/lastdb/tests/tags.yml b/modules/nf-core/last/lastdb/tests/tags.yml new file mode 100644 index 0000000..a679271 --- /dev/null +++ b/modules/nf-core/last/lastdb/tests/tags.yml @@ -0,0 +1,2 @@ +last/lastdb: + - "modules/nf-core/last/lastdb/**" diff --git a/modules/nf-core/last/mafswap/environment.yml b/modules/nf-core/last/mafswap/environment.yml new file mode 100644 index 0000000..cc112af --- /dev/null +++ b/modules/nf-core/last/mafswap/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "last_mafswap" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::last=1542" diff --git a/modules/nf-core/last/mafswap/main.nf b/modules/nf-core/last/mafswap/main.nf new file mode 100644 index 0000000..875a6af --- /dev/null +++ b/modules/nf-core/last/mafswap/main.nf @@ -0,0 +1,46 @@ +process LAST_MAFSWAP { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/last:1542--h43eeafb_1' : + 'biocontainers/last:1542--h43eeafb_1' }" + + input: + tuple val(meta), path(maf) + + output: + tuple val(meta), path("*.maf.gz"), emit: maf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + set -o pipefail + maf-swap $args $maf | gzip --no-name > ${prefix}.swapped.maf.gz + + # maf-swap has no --version option but lastdb, part of the same package, has. 
+ cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastdb --version 2>&1 | sed 's/lastdb //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo stub | gzip --no-name > ${prefix}.swapped.maf.gz + + # maf-swap has no --version option but lastdb, part of the same package, has. + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastdb --version 2>&1 | sed 's/lastdb //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/last/mafswap/meta.yml b/modules/nf-core/last/mafswap/meta.yml new file mode 100644 index 0000000..aa0fc09 --- /dev/null +++ b/modules/nf-core/last/mafswap/meta.yml @@ -0,0 +1,44 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "last_mafswap" +description: Reorder alignments in a MAF file +keywords: + - LAST + - reorder + - alignment + - MAF +tools: + - "last": + description: "LAST finds & aligns related regions of sequences." + homepage: "https://gitlab.com/mcfrith/last" + documentation: "https://gitlab.com/mcfrith/last/-/blob/main/doc/" + tool_dev_url: "https://gitlab.com/mcfrith/last" + licence: ["GPL v3-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - maf: + type: file + description: Multiple Aligment Format (MAF) file, optionally compressed with gzip + pattern: "*.{maf.gz,maf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - maf: + type: file + description: Multiple Aligment Format (MAF) file, compressed with gzip + pattern: "*.{maf.gz}" +authors: + - "@charles-plessy" +maintainers: + - "@charles-plessy" diff --git a/modules/nf-core/last/mafswap/tests/main.nf.test b/modules/nf-core/last/mafswap/tests/main.nf.test new file mode 100644 index 0000000..7aa7704 --- /dev/null +++ b/modules/nf-core/last/mafswap/tests/main.nf.test @@ -0,0 +1,57 @@ +nextflow_process { + + name "Test Process LAST_MAFSWAP" + script "../main.nf" + process "LAST_MAFSWAP" + + tag "modules" + tag "modules_nfcore" + tag "last" + tag "last/mafswap" + + test("sarscov2 - contigs - genome") { + + when { + process { + """ + input[0] = [ + [ id:'contigs.genome' ], // meta map + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - contigs - genome - stub") { + + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'contigs.genome' ], // meta map + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/last/mafswap/tests/main.nf.test.snap b/modules/nf-core/last/mafswap/tests/main.nf.test.snap new file mode 100644 index 0000000..cc085df --- /dev/null +++ b/modules/nf-core/last/mafswap/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "sarscov2 - contigs - genome": { + "content": [ + { + "0": [ + [ + { + "id": "contigs.genome" + }, + "contigs.genome.swapped.maf.gz:md5,394f4aa24ef8ff1eaa7258bd319cb8e3" + ] + ], + "1": [ + "versions.yml:md5,9819f8873c15cc665cbee998cefb72dd" + ], + "maf": [ + [ + { + 
"id": "contigs.genome" + }, + "contigs.genome.swapped.maf.gz:md5,394f4aa24ef8ff1eaa7258bd319cb8e3" + ] + ], + "versions": [ + "versions.yml:md5,9819f8873c15cc665cbee998cefb72dd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-04T17:59:43.47396" + }, + "sarscov2 - contigs - genome - stub": { + "content": [ + { + "0": [ + [ + { + "id": "contigs.genome" + }, + "contigs.genome.swapped.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" + ] + ], + "1": [ + "versions.yml:md5,9819f8873c15cc665cbee998cefb72dd" + ], + "maf": [ + [ + { + "id": "contigs.genome" + }, + "contigs.genome.swapped.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" + ] + ], + "versions": [ + "versions.yml:md5,9819f8873c15cc665cbee998cefb72dd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-06T19:41:29.891962" + } +} \ No newline at end of file diff --git a/modules/nf-core/last/mafswap/tests/tags.yml b/modules/nf-core/last/mafswap/tests/tags.yml new file mode 100644 index 0000000..698db64 --- /dev/null +++ b/modules/nf-core/last/mafswap/tests/tags.yml @@ -0,0 +1,2 @@ +last/mafswap: + - "modules/nf-core/last/mafswap/**" diff --git a/modules/nf-core/last/split/environment.yml b/modules/nf-core/last/split/environment.yml new file mode 100644 index 0000000..7d76b55 --- /dev/null +++ b/modules/nf-core/last/split/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "last_split" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::last=1542" diff --git a/modules/nf-core/last/split/main.nf b/modules/nf-core/last/split/main.nf new file mode 100644 index 0000000..410d16f --- /dev/null +++ b/modules/nf-core/last/split/main.nf @@ -0,0 +1,70 @@ +process LAST_SPLIT { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ 
workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/last:1542--h43eeafb_1' : + 'biocontainers/last:1542--h43eeafb_1' }" + + input: + tuple val(meta), path(maf) + + output: + tuple val(meta), path("*.maf.gz"), emit: maf + tuple val(meta), path("*.tsv") , emit: multiqc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + set -o pipefail + + function calculate_psl_metrics() { + awk 'BEGIN { + FS="\t"; # Set field separator as tab + totalMatches = 0; + totalAlignmentLength = 0; + print "Sample\tTotalAlignmentLength\tPercentSimilarity"; # Header for MultiQC + } + { + totalMatches += \$1 + \$3; # Sum matches and repMatches + totalAlignmentLength += \$1 + \$2 + \$3 + \$6 + \$8; # Sum matches, misMatches, repMatches, qBaseInsert, and tBaseInsert + } + END { + percentSimilarity = (totalAlignmentLength > 0) ? (totalMatches / totalAlignmentLength * 100) : 0; + print "$meta.id" "\t" totalAlignmentLength "\t" percentSimilarity; # Data in TSV format + }' + } + + zcat < $maf | + last-split $args | + tee >(gzip --no-name > ${prefix}.maf.gz) | + maf-convert psl | + calculate_psl_metrics > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(last-split --version 2>&1 | sed 's/last-split //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + echo stub | gzip --no-name > ${prefix}.maf.gz + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(last-split --version 2>&1 | sed 's/last-split //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/last/split/meta.yml b/modules/nf-core/last/split/meta.yml new file mode 100644 index 0000000..2e23f8b --- /dev/null +++ b/modules/nf-core/last/split/meta.yml @@ -0,0 +1,51 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "last_split" +description: Find split or spliced alignments in a MAF file +keywords: + - LAST + - split + - spliced + - alignment + - MAF +tools: + - "last": + description: "LAST finds & aligns related regions of sequences." + homepage: "https://gitlab.com/mcfrith/last" + documentation: "https://gitlab.com/mcfrith/last/-/blob/main/doc/" + tool_dev_url: "https://gitlab.com/mcfrith/last" + licence: ["GPL v3-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - maf: + type: file + description: Multiple Aligment Format (MAF) file, compressed with gzip + pattern: "*.{maf.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - maf: + type: file + description: Multiple Aligment Format (MAF) file, compressed with gzip + pattern: "*.{maf.gz}" + - multiqc: + type: file + description: Alignment summary for MultiQC + pattern: "*.tsv" +authors: + - "@aleksandrabliznina" + - "@charles-plessy" +maintainers: + - "@charles-plessy" diff --git a/modules/nf-core/last/split/tests/main.nf.test b/modules/nf-core/last/split/tests/main.nf.test new file mode 100644 index 0000000..4460d69 --- /dev/null +++ b/modules/nf-core/last/split/tests/main.nf.test @@ -0,0 +1,58 @@ +// nf-core modules test last/split +nextflow_process { + + name "Test Process LAST_SPLIT" + script "../main.nf" + process "LAST_SPLIT" + + tag "modules" + tag "modules_nfcore" + tag "last" + tag "last/split" + + test("sarscov2 - contigs_genome") { + + when { + process { + """ + input[0] = [ + [ id:'sarscov.contigs.genome' ], // meta map + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - contigs_genome - stub") { + + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'sarscov.contigs.genome' ], // meta map + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/last/split/tests/main.nf.test.snap b/modules/nf-core/last/split/tests/main.nf.test.snap new file mode 100644 index 0000000..953a654 --- /dev/null +++ b/modules/nf-core/last/split/tests/main.nf.test.snap @@ -0,0 +1,100 @@ +{ + "sarscov2 - contigs_genome": { + "content": [ + { + "0": [ + [ + { + "id": "sarscov.contigs.genome" + }, + 
"sarscov.contigs.genome.maf.gz:md5,689cb18ff7098ff90eaf87017f590208" + ] + ], + "1": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8" + ] + ], + "2": [ + "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" + ], + "maf": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.maf.gz:md5,689cb18ff7098ff90eaf87017f590208" + ] + ], + "multiqc": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8" + ] + ], + "versions": [ + "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-02T11:45:00.535348" + }, + "sarscov2 - contigs_genome - stub": { + "content": [ + { + "0": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" + ] + ], + "1": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" + ], + "maf": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260" + ] + ], + "multiqc": [ + [ + { + "id": "sarscov.contigs.genome" + }, + "sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-02T11:45:21.243325" + } +} \ No newline at end of file diff --git a/modules/nf-core/last/split/tests/tags.yml b/modules/nf-core/last/split/tests/tags.yml new file mode 100644 index 0000000..950878a --- /dev/null +++ b/modules/nf-core/last/split/tests/tags.yml @@ -0,0 +1,2 @@ +last/split: + - "modules/nf-core/last/split/**" diff --git a/modules/nf-core/last/train/environment.yml 
b/modules/nf-core/last/train/environment.yml new file mode 100644 index 0000000..5edaf64 --- /dev/null +++ b/modules/nf-core/last/train/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "last_train" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::last=1542" diff --git a/modules/nf-core/last/train/main.nf b/modules/nf-core/last/train/main.nf new file mode 100644 index 0000000..9f592b5 --- /dev/null +++ b/modules/nf-core/last/train/main.nf @@ -0,0 +1,64 @@ +process LAST_TRAIN { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/last:1542--h43eeafb_1' : + 'biocontainers/last:1542--h43eeafb_1' }" + + input: + tuple val(meta), path(fastx) + path index + + output: + tuple val(meta), path("*.train"), emit: param_file + tuple val(meta), path("*.tsv") , emit: multiqc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + INDEX_NAME=\$(basename \$(ls $index/*.des) .des) + + last-train \\ + $args \\ + -P $task.cpus \\ + ${index}/\$INDEX_NAME \\ + $fastx \\ + > ${prefix}.\$INDEX_NAME.train + + echo "id\tsubstitution_percent_identity\tlast -t\tlast -a\tlast -A\tlast -b\tlast -B\tlast -S" > ${prefix}.train.tsv + printf "\$(basename ${prefix}.\$INDEX_NAME.train .target.train)\t" >> ${prefix}.train.tsv + grep 'substitution percent identity' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$5}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -t' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$2}' | sed -e 's/-t//' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -a' ${prefix}.\$INDEX_NAME.train | 
tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -A' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -b' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -B' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv + grep 'last -S' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' >> ${prefix}.train.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastdb --version | sed 's/lastdb //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + INDEX_NAME=\$(basename \$(ls $index/*.des) .des) + touch ${prefix}.\$INDEX_NAME.train + touch ${prefix}.train.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + last: \$(lastdb --version | sed 's/lastdb //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/last/train/meta.yml b/modules/nf-core/last/train/meta.yml new file mode 100644 index 0000000..d55e827 --- /dev/null +++ b/modules/nf-core/last/train/meta.yml @@ -0,0 +1,54 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "last_train" +description: Find suitable score parameters for sequence alignment +keywords: + - LAST + - train + - fastq + - fasta +tools: + - "last": + description: "LAST finds & aligns related regions of sequences." + homepage: "https://gitlab.com/mcfrith/last" + documentation: "https://gitlab.com/mcfrith/last/-/blob/main/doc/last-train.rst" + tool_dev_url: "https://gitlab.com/mcfrith/last" + licence: ["GPL v3-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - fastx: + type: file + description: FASTA/FASTQ file + pattern: "*.{fasta,fastq}" + - index: + type: directory + description: Directory containing the files of the LAST index + pattern: "lastdb/" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - param_file: + type: file + description: Trained parameter file + pattern: "*.train" + - multiqc: + type: file + description: Alignment parameter summary for MultiQC + pattern: "*.tsv" +authors: + - "@aleksandrabliznina" + - "@charles-plessy" + - "@U13bs1125" +maintainers: + - "@charles-plessy" diff --git a/modules/nf-core/last/train/tests/main.nf.test b/modules/nf-core/last/train/tests/main.nf.test new file mode 100644 index 0000000..a4168f2 --- /dev/null +++ b/modules/nf-core/last/train/tests/main.nf.test @@ -0,0 +1,85 @@ +nextflow_process { + + name "Test Process LAST_TRAIN" + script "../main.nf" + process "LAST_TRAIN" + + tag "modules" + tag "modules_nfcore" + tag "last" + tag "last/train" + tag "untar" + + test("sarscov2 - genome - contigs") { + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [id:'genome'], // meta map + file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'contigs', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true), + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 - genome - contigs - stub") { + + options "-stub" + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [id:'genome'], 
// meta map + file(params.test_data['sarscov2']['genome']['lastdb_tar_gz'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'contigs', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true), + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/last/train/tests/main.nf.test.snap b/modules/nf-core/last/train/tests/main.nf.test.snap new file mode 100644 index 0000000..a3a0cc8 --- /dev/null +++ b/modules/nf-core/last/train/tests/main.nf.test.snap @@ -0,0 +1,108 @@ +{ + "sarscov2 - genome - contigs - stub": { + "content": [ + { + "0": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs..des.train:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,c5578547acf9e77e1e8f6bf796e32ac2" + ], + "multiqc": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "param_file": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs..des.train:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c5578547acf9e77e1e8f6bf796e32ac2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-10T14:50:11.528587" + }, + "sarscov2 - genome - contigs": { + "content": [ + { + "0": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.genome.train:md5,3d7e1c630705d83c6a11b6f28d5aefcb" + ] + ], + "1": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,f09bcd1a111241a3439258a43c2a1a4e" + ] + ], + "2": [ + "versions.yml:md5,38234cf053c708e57cc080990f777411" + ], + "multiqc": [ + [ + 
{ + "id": "contigs", + "single_end": false + }, + "contigs.train.tsv:md5,f09bcd1a111241a3439258a43c2a1a4e" + ] + ], + "param_file": [ + [ + { + "id": "contigs", + "single_end": false + }, + "contigs.genome.train:md5,3d7e1c630705d83c6a11b6f28d5aefcb" + ] + ], + "versions": [ + "versions.yml:md5,38234cf053c708e57cc080990f777411" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-03T18:04:34.914789" + } +} \ No newline at end of file diff --git a/modules/nf-core/last/train/tests/tags.yml b/modules/nf-core/last/train/tests/tags.yml new file mode 100644 index 0000000..8418b5f --- /dev/null +++ b/modules/nf-core/last/train/tests/tags.yml @@ -0,0 +1,2 @@ +last/train: + - "modules/nf-core/last/train/**" diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index ca39fb6..2121492 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.21 + - bioconda::multiqc=1.23 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 47ac352..459dfea 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,14 +3,16 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.21--pyhdfd78af_0' : - 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.23--pyhdfd78af_0' : + 'biocontainers/multiqc:1.23--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" path(multiqc_config) path(extra_multiqc_config) path(multiqc_logo) + path(replace_names) + path(sample_names) output: path "*multiqc_report.html", emit: report @@ -26,6 +28,8 @@ process MULTIQC { def config = multiqc_config ? 
"--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + def replace = replace_names ? "--replace-names ${replace_names}" : '' + def samples = sample_names ? "--sample-names ${sample_names}" : '' """ multiqc \\ --force \\ @@ -33,6 +37,8 @@ process MULTIQC { $config \\ $extra_config \\ $logo \\ + $replace \\ + $samples \\ . cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index 45a9bc3..382c08c 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -29,6 +29,19 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + - sample_names: + type: file + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. 
+ pattern: "*.{tsv}" output: - report: type: file diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index f1c4242..6aa27f4 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -17,6 +17,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -41,6 +43,8 @@ nextflow_process { input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -66,6 +70,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index bfebd80..45e95e5 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -2,14 +2,14 @@ "multiqc_versions_single": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.2" }, - "timestamp": "2024-02-29T08:48:55.657331" + "timestamp": "2024-07-10T12:41:34.562023" }, "multiqc_stub": { "content": [ @@ -17,25 +17,25 @@ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.2" }, - "timestamp": "2024-02-29T08:49:49.071937" + "timestamp": "2024-07-10T11:27:11.933869532" }, "multiqc_versions_config": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.2" }, - 
"timestamp": "2024-02-29T08:49:25.457567" + "timestamp": "2024-07-10T11:26:56.709849369" } } \ No newline at end of file diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/seqtk/cutn/environment.yml similarity index 61% rename from modules/nf-core/fastqc/environment.yml rename to modules/nf-core/seqtk/cutn/environment.yml index 1787b38..a57afbb 100644 --- a/modules/nf-core/fastqc/environment.yml +++ b/modules/nf-core/seqtk/cutn/environment.yml @@ -1,7 +1,7 @@ -name: fastqc +name: seqtk_cutn channels: - conda-forge - bioconda - defaults dependencies: - - bioconda::fastqc=0.12.1 + - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/cutn/main.nf b/modules/nf-core/seqtk/cutn/main.nf new file mode 100644 index 0000000..c2344a8 --- /dev/null +++ b/modules/nf-core/seqtk/cutn/main.nf @@ -0,0 +1,49 @@ +process SEQTK_CUTN { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : + 'biocontainers/seqtk:1.4--he4a0461_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.bed") , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + seqtk \\ + cutN \\ + $args \\ + -g $fasta \\ + > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/seqtk/cutn/meta.yml b/modules/nf-core/seqtk/cutn/meta.yml new file mode 100644 index 0000000..1082867 --- /dev/null +++ b/modules/nf-core/seqtk/cutn/meta.yml @@ -0,0 +1,41 @@ +name: seqtk_cutn +description: Generates a BED file containing genomic locations of lengths of N. +keywords: + - cut + - fasta + - seqtk +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. Seqtk mergepe command merges pair-end reads into one interleaved file. + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: A single fasta file to be split. + pattern: "*.{fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bed: + type: file + description: The output bed which summarised locations of cuts + pattern: "*.{bed}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" +maintainers: + - "@DLBPointon" diff --git a/modules/nf-core/seqtk/cutn/tests/main.nf.test b/modules/nf-core/seqtk/cutn/tests/main.nf.test new file mode 100644 index 0000000..a38ed41 --- /dev/null +++ b/modules/nf-core/seqtk/cutn/tests/main.nf.test @@ -0,0 +1,57 @@ +nextflow_process { + + name "Test Process SEQTK_CUTN" + script "../main.nf" + process "SEQTK_CUTN" + + tag "modules" + tag "modules_nfcore" + tag "seqtk" + tag "seqtk/cutn" + + test("homo_21_cut") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.bed[0][1]).name + ).match("genome_cut") + } + ) + } + } + + test("homo_21_cut_stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], + file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/seqtk/cutn/tests/main.nf.test.snap b/modules/nf-core/seqtk/cutn/tests/main.nf.test.snap new file mode 100644 index 0000000..998beda --- /dev/null +++ b/modules/nf-core/seqtk/cutn/tests/main.nf.test.snap @@ -0,0 +1,70 @@ +{ + "genome_cut": { + "content": [ + "test.bed" + ], + "timestamp": "2024-02-22T16:02:14.744148" + }, + "homo_21_cut_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + 
"versions.yml:md5,3da8ed2738f3c093d1e62d796fd76428" + ], + "bed": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,3da8ed2738f3c093d1e62d796fd76428" + ] + } + ], + "timestamp": "2024-02-22T16:02:23.596389" + }, + "homo_21_cut": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bed:md5,16cbba84e3a4bdbb52217afb5051f948" + ] + ], + "1": [ + "versions.yml:md5,3da8ed2738f3c093d1e62d796fd76428" + ], + "bed": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bed:md5,16cbba84e3a4bdbb52217afb5051f948" + ] + ], + "versions": [ + "versions.yml:md5,3da8ed2738f3c093d1e62d796fd76428" + ] + } + ], + "timestamp": "2024-02-22T16:02:14.695205" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqtk/cutn/tests/tags.yml b/modules/nf-core/seqtk/cutn/tests/tags.yml new file mode 100644 index 0000000..13c64cc --- /dev/null +++ b/modules/nf-core/seqtk/cutn/tests/tags.yml @@ -0,0 +1,2 @@ +seqtk/cutn: + - "modules/nf-core/seqtk/cutn/**" diff --git a/nextflow.config b/nextflow.config index 5242373..c75282a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,9 +9,17 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags - // Input options + // Mandatory options input = null + target = null + + // Dotplot parameter + skip_dotplot_m2m = false + skip_dotplot_m2o = false + skip_dotplot_o2m = false + skip_dotplot_o2o = false + dotplot_options = '' + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' @@ -50,6 +58,16 @@ params { max_cpus = 16 max_time = '240.h' + seed = 'YASS' + targetName = 'target' + m2m = false + + // Alignment options + lastal_args = '-C2 -D1e9' + lastal_extr_args = '' + last_split_mismap = '1e-05' + lastal_params = null + // Schema validation default options validationFailUnrecognisedParams = false validationLenientMode = false @@ 
-175,6 +193,7 @@ profiles { executor.memory = 8.GB } test { includeConfig 'conf/test.config' } + test_small{ includeConfig 'conf/test_small.config'} // Just a pair of fungal genomes test_full { includeConfig 'conf/test_full.config' } } @@ -239,7 +258,7 @@ manifest { description = """Pairwise alignment pipeline (genome to genome or reads to genome)""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '1.0dev' + version = '1.0.0' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index c3c84b9..312b1bd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["input", "target", "outdir"], "properties": { "input": { "type": "string", @@ -23,6 +23,19 @@ "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/pairgenomealign/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, + "target": { + "type": "string", + "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", + "format": "file-path", + "description": "Path or URL to a FASTA genome file for the _target_ genome.", + "fa_icon": "far fa-file-code" + }, + "targetName": { + "type": "string", + "default": "target", + "help_text": "By default the _target_ genome is named `target` and this name is concatenated with the sample IDs using `___` as a separator to construct alignment file names. Use this option to provide a more informative name for the target genome.", + "description": "Target genome name." 
+ }, "outdir": { "type": "string", "format": "directory-path", @@ -43,6 +56,85 @@ } } }, + "alignment_options": { + "title": "Alignment options", + "type": "object", + "description": "Arguments for the lastdb, last-train, lastal and last-split programs.", + "default": "", + "properties": { + "m2m": { + "type": "boolean", + "description": "Make a many to many alignment", + "help_text": "This adds time and can consume a considerable amount of space; use only if you need that data, for instance in the case of a self-alignment", + "fa_icon": "fas fa-arrows-alt" + }, + "seed": { + "type": "string", + "enum": ["YASS", "NEAR", "MAM8", "RY128"], + "help_text": "LAST creates a database of seed sequences in the _target_ genome, and provides different ways to generate these seeds. The default (`YASS`) searches for long-and-weak similarities that allow for mismatches but not gaps. Among alternatives, there are `NEAR` for short-and-strong (near-identical) similarities with many gaps (insertions and deletions), `MAM8` to find weak similarities with high sensitivity, but low speed and high memory usage, or `RY128` that reduces run time and memory use, by only seeking seeds at ~1/128 of positions in each sequence, which is useful when the purpose of running this pipeline is only to generate whole-genome dotplots, or when sensitivity for tiny fragments may be unnecessary or undesirable. See the [LAST seeds documentation](https://gitlab.com/mcfrith/last/-/blob/main/doc/last-seeds.rst) for details.", + "description": "Select the LAST seed to index the _target_ genome.", + "default": "YASS", + "fa_icon": "fas fa-seedling" + }, + "lastal_params": { + "type": "string", + "description": "Path to a file containing alignment parameters or a scoring matrix. 
If this option is used, `last-train` will be skipped and alignment parameters will be the same for each query.", + "fa_icon": "far fa-file-alt" + }, + "lastal_args": { + "type": "string", + "default": "-C2 -D1e9", + "description": "Arguments passed to both `last-train` and `lastal`.", + "fa_icon": "fas fa-align-center" + }, + "lastal_extr_args": { + "type": "string", + "description": "Arguments passed only to `lastal` (useful when they are not recognised by `last-train`).", + "fa_icon": "fas fa-align-center" + }, + "last_split_mismap": { + "type": "string", + "default": 0.00001, + "fa_icon": "fas fa-cut", + "description": "Mismap probability cutoff for `last-split`." + } + }, + "fa_icon": "fas fa-cogs" + }, + "dotplot_parameters": { + "title": "Dotplot parameters", + "type": "object", + "description": "Customise dot-plots or skip them.", + "default": "", + "properties": { + "dotplot_options": { + "type": "string", + "description": "Extra arguments passed to `last-dotplot` to customise the output. 
See .", + "fa_icon": "fas fa-cog" + }, + "skip_dotplot_o2m": { + "type": "boolean", + "description": "Do not generate the one-to-many alignment dot-plot.", + "fa_icon": "fas fa-forward" + }, + "skip_dotplot_o2o": { + "type": "boolean", + "description": "Do not generate the one-to-one alignment dot-plot.", + "fa_icon": "fas fa-forward" + }, + "skip_dotplot_m2o": { + "type": "boolean", + "description": "Do not generate the many-to-one alignment dot-plot.", + "fa_icon": "fas fa-forward" + }, + "skip_dotplot_m2m": { + "type": "boolean", + "description": "Do not generate the many-to-many alignment dot-plot.", + "fa_icon": "fas fa-forward" + } + }, + "fa_icon": "fas fa-cogs" + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -280,6 +372,12 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/alignment_options" + }, + { + "$ref": "#/definitions/dotplot_parameters" + }, { "$ref": "#/definitions/reference_genome_options" }, diff --git a/subworkflows/local/pairalign_m2m/main.nf b/subworkflows/local/pairalign_m2m/main.nf new file mode 100644 index 0000000..1bbe385 --- /dev/null +++ b/subworkflows/local/pairalign_m2m/main.nf @@ -0,0 +1,119 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { LAST_DOTPLOT as ALIGNMENT_DOTPLOT_M2O } from '../../../modules/nf-core/last/dotplot/main' +include { LAST_DOTPLOT as ALIGNMENT_DOTPLOT_M2M } from '../../../modules/nf-core/last/dotplot/main' +include { LAST_DOTPLOT as ALIGNMENT_DOTPLOT_O2O } from '../../../modules/nf-core/last/dotplot/main' +include { LAST_DOTPLOT as ALIGNMENT_DOTPLOT_O2M } from '../../../modules/nf-core/last/dotplot/main' +include { LAST_LASTAL as ALIGNMENT_LASTAL_M2M } from '../../../modules/nf-core/last/lastal/main' +include { LAST_LASTDB as ALIGNMENT_LASTDB 
} from '../../../modules/nf-core/last/lastdb/main' +include { LAST_SPLIT as ALIGNMENT_SPLIT_M2O } from '../../../modules/nf-core/last/split/main' +include { LAST_SPLIT as ALIGNMENT_SPLIT_O2O } from '../../../modules/nf-core/last/split/main' +include { LAST_SPLIT as ALIGNMENT_SPLIT_O2M } from '../../../modules/nf-core/last/split/main' +include { LAST_TRAIN as ALIGNMENT_TRAIN } from '../../../modules/nf-core/last/train/main' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow PAIRALIGN_M2M { + + take: + ch_target // channel: target file read in from --target + ch_queries // channel: query sequences found in samplesheet read in from --input + ch_target_bed // channel: position of poly-N stretches in the target genome + ch_queries_bed // channel: position of poly-N stretches in the query genomes + + main: + + // Index the target genome + // + ALIGNMENT_LASTDB ( + ch_target + ) + + // Train alignment parameters + // + ALIGNMENT_TRAIN ( + ch_queries, + ALIGNMENT_LASTDB.out.index.map { row -> row[1] } // Remove metadata map + ) + + // Align queries to target. This is a many-to-many alignment + // + ALIGNMENT_LASTAL_M2M ( + ch_queries.join(ALIGNMENT_TRAIN.out.param_file), + ALIGNMENT_LASTDB.out.index.map { row -> row[1] } // Remove metadata map + ) + + // Optionally plot the many-to-many alignment + // + if (! (params.skip_dotplot_m2m) ) { + ALIGNMENT_DOTPLOT_M2M ( + ALIGNMENT_LASTAL_M2M.out.maf.join(ch_queries_bed), + ch_target_bed, + 'png' + ) + } + + // Compute the one-to-many alignment and optionally plot it + // + ALIGNMENT_SPLIT_O2M ( + ALIGNMENT_LASTAL_M2M.out.maf + ) + if (! 
(params.skip_dotplot_o2m) ) { + ALIGNMENT_DOTPLOT_O2M ( + ALIGNMENT_SPLIT_O2M.out.maf.join(ch_queries_bed), + ch_target_bed, + 'png' + ) + } + + // Compute the many-to-one alignment and optionally plot it + // + ALIGNMENT_SPLIT_M2O ( + ALIGNMENT_LASTAL_M2M.out.maf + ) + if (! (params.skip_dotplot_m2o) ) { + ALIGNMENT_DOTPLOT_M2O ( + ALIGNMENT_SPLIT_M2O.out.maf.join(ch_queries_bed), + ch_target_bed, + 'png' + ) + } + + // Compute the one-to-one alignment and optionally plot it + // + ALIGNMENT_SPLIT_O2O ( + ALIGNMENT_SPLIT_M2O.out.maf + ) + if (! (params.skip_dotplot_o2o) ) { + ALIGNMENT_DOTPLOT_O2O ( + ALIGNMENT_SPLIT_O2O.out.maf.join(ch_queries_bed), + ch_target_bed, + 'png' + ) + } + + emit: + + multiqc = Channel.empty() + .mix( ALIGNMENT_TRAIN.out.multiqc.collect{ it[1]} ) + .mix(ALIGNMENT_SPLIT_O2O.out.multiqc.collect{ it[1]} ) + m2m = ALIGNMENT_LASTAL_M2M.out.maf + m2o = ALIGNMENT_SPLIT_M2O.out.maf + o2m = ALIGNMENT_SPLIT_O2M.out.maf + o2o = ALIGNMENT_SPLIT_O2O.out.maf + versions = ALIGNMENT_LASTDB.out.versions +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/subworkflows/local/pairalign_m2o/main.nf b/subworkflows/local/pairalign_m2o/main.nf new file mode 100644 index 0000000..8d902d2 --- /dev/null +++ b/subworkflows/local/pairalign_m2o/main.nf @@ -0,0 +1,88 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { LAST_DOTPLOT as ALIGNMENT_DOTPLOT_M2O } from '../../../modules/nf-core/last/dotplot/main' +include { LAST_DOTPLOT as ALIGNMENT_DOTPLOT_O2O } from '../../../modules/nf-core/last/dotplot/main' +include { LAST_LASTAL as ALIGNMENT_LASTAL_M2O } from '../../../modules/nf-core/last/lastal/main' +include { 
LAST_LASTDB as ALIGNMENT_LASTDB } from '../../../modules/nf-core/last/lastdb/main' +include { LAST_SPLIT as ALIGNMENT_SPLIT_O2O } from '../../../modules/nf-core/last/split/main' +include { LAST_TRAIN as ALIGNMENT_TRAIN } from '../../../modules/nf-core/last/train/main' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow PAIRALIGN_M2O { + + take: + ch_target // channel: target file read in from --target + ch_queries // channel: query sequences found in samplesheet read in from --input + ch_target_bed // channel: position of poly-N stretches in the target genome + ch_queries_bed // channel: position of poly-N stretches in the query genomes + + main: + + // Index the target genome + // + ALIGNMENT_LASTDB ( + ch_target + ) + + // Train alignment parameters + // + ALIGNMENT_TRAIN ( + ch_queries, + ALIGNMENT_LASTDB.out.index.map { row -> row[1] } // Remove metadata map + ) + + // Align queries to target. + // This directly computes a many-to-one alignment because of parameter modules + // + ALIGNMENT_LASTAL_M2O ( + ch_queries.join(ALIGNMENT_TRAIN.out.param_file), + ALIGNMENT_LASTDB.out.index.map { row -> row[1] } // Remove metadata map + ) + + // Optionally plot the many-to-one alignment + // + if (! (params.skip_dotplot_m2o) ) { + ALIGNMENT_DOTPLOT_M2O ( + ALIGNMENT_LASTAL_M2O.out.maf.join(ch_queries_bed), + ch_target_bed, + 'png' + ) + } + + // Compute the one-to-one alignment and optionally plot it + // + ALIGNMENT_SPLIT_O2O ( + ALIGNMENT_LASTAL_M2O.out.maf + ) + if (! 
(params.skip_dotplot_o2o) ) { + ALIGNMENT_DOTPLOT_O2O ( + ALIGNMENT_SPLIT_O2O.out.maf.join(ch_queries_bed), + ch_target_bed, + 'png' + ) + } + + emit: + + multiqc = Channel.empty() + .mix( ALIGNMENT_TRAIN.out.multiqc.collect{ it[1]} ) + .mix(ALIGNMENT_SPLIT_O2O.out.multiqc.collect{ it[1]} ) + m2o = ALIGNMENT_LASTAL_M2O.out.maf + o2o = ALIGNMENT_SPLIT_O2O.out.maf + versions = ALIGNMENT_LASTDB.out.versions +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/subworkflows/local/utils_nfcore_pairgenomealign_pipeline/main.nf b/subworkflows/local/utils_nfcore_pairgenomealign_pipeline/main.nf index 87776d4..d322a1d 100644 --- a/subworkflows/local/utils_nfcore_pairgenomealign_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_pairgenomealign_pipeline/main.nf @@ -82,22 +82,6 @@ workflow PIPELINE_INITIALISATION { // Channel .fromSamplesheet("input") - .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } - } - .groupTuple() - .map { - validateInputSamplesheet(it) - } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] - } .set { ch_samplesheet } emit: @@ -202,7 +186,6 @@ def genomeExistsError() { // Generate methods description for MultiQC // def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this list. // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ @@ -216,7 +199,6 @@ def toolCitationText() { } def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. 
// Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ @@ -249,7 +231,6 @@ def methodsDescriptionText(mqc_methods_yaml) { meta["tool_citations"] = "" meta["tool_bibliography"] = "" - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") // meta["tool_bibliography"] = toolBibliographyText() diff --git a/workflows/pairgenomealign.nf b/workflows/pairgenomealign.nf index f32678d..0b289f9 100644 --- a/workflows/pairgenomealign.nf +++ b/workflows/pairgenomealign.nf @@ -4,10 +4,15 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-validation' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { ASSEMBLYSCAN } from '../modules/nf-core/assemblyscan/main' +include { MULTIQC_ASSEMBLYSCAN_PLOT_DATA } from '../modules/local/multiqc_assemblyscan_plot_data' +include { PAIRALIGN_M2M } from '../subworkflows/local/pairalign_m2m/main' +include { SEQTK_CUTN as CUTN_TARGET } from '../modules/nf-core/seqtk/cutn/main' +include { SEQTK_CUTN as CUTN_QUERY } from '../modules/nf-core/seqtk/cutn/main' +include { PAIRALIGN_M2O } from '../subworkflows/local/pairalign_m2o/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_pairgenomealign_pipeline' @@ -20,25 +25,70 @@ include { methodsDescriptionText } 
from '../subworkflows/local/utils_nfcore_pair workflow PAIRGENOMEALIGN { take: - ch_samplesheet // channel: samplesheet read in from --input + ch_samplesheet // channel: samplesheet read in from --input + ch_targetgenome // channel: genome file read in from --target main: ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() + // Extract coordinates of poly-N regions; they are often contig boundaries in scaffolds // - // MODULE: Run FastQC + CUTN_TARGET ( + // Avoid file name conflicts when target genome is also in the list of queries + ch_targetgenome.map { meta, file -> [ [id:'targetGenome'] , file ] } + ) + CUTN_QUERY ( + ch_samplesheet + ) + + // Extract statistics on contig length and GC content // - FASTQC ( + ASSEMBLYSCAN ( ch_samplesheet ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + // Parse assembly-scan's JSON for MultiQC + MULTIQC_ASSEMBLYSCAN_PLOT_DATA ( + ASSEMBLYSCAN.out.json.collect{it[1]} + ) + + // Prefix query ids with target genome name before producing alignment files + // + ch_samplesheet = ch_samplesheet + .map { row -> [ [id: params.targetName + '___' + row[0].id] , row.tail() ] } + ch_seqtk_cutn_query = CUTN_QUERY.out.bed + .map { row -> [ [id: params.targetName + '___' + row[0].id] , row.tail() ] } + // Align with either the many-to-many or the many-to-one subworkflow + // and collect the output under a fixed name // + if (!(params.m2m)) { + PAIRALIGN_M2O ( + ch_targetgenome, + ch_samplesheet, + CUTN_TARGET.out.bed, + ch_seqtk_cutn_query + ) + pairalign_out = PAIRALIGN_M2O.out + } else { + PAIRALIGN_M2M ( + ch_targetgenome, + ch_samplesheet, + CUTN_TARGET.out.bed, + ch_seqtk_cutn_query + ) + pairalign_out = PAIRALIGN_M2M.out + } + // Collate and save software versions // + + ch_versions = ch_versions + .mix( CUTN_TARGET.out.versions) + .mix(ASSEMBLYSCAN.out.versions) + .mix( pairalign_out.versions) + softwareVersionsToYAML(ch_versions) 
.collectFile( storeDir: "${params.outdir}/pipeline_info", @@ -69,10 +119,12 @@ workflow PAIRGENOMEALIGN { ch_methods_description = Channel.value( methodsDescriptionText(ch_multiqc_custom_methods_description)) - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( + ch_multiqc_files = ch_multiqc_files + .mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + .mix(MULTIQC_ASSEMBLYSCAN_PLOT_DATA.out.tsv) + .mix(pairalign_out.multiqc) + .mix(ch_collated_versions) + .mix( ch_methods_description.collectFile( name: 'methods_description_mqc.yaml', sort: true @@ -83,7 +135,9 @@ workflow PAIRGENOMEALIGN { ch_multiqc_files.collect(), ch_multiqc_config.toList(), ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() + ch_multiqc_logo.toList(), + [], + [] ) emit: