diff --git a/.gitpod.yml b/.gitpod.yml index 461186376..5907fb59c 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -7,8 +7,9 @@ tasks: vscode: extensions: # based on nf-core.nf-core-extensionpack - #- esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code - - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files + #{%- if code_linters -%} + - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code + - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files{% endif %} - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar - mechatroner.rainbow-csv # Highlight columns in csv files in different colors - nextflow.nextflow # Nextflow syntax highlighting diff --git a/CHANGELOG.md b/CHANGELOG.md index 56e634389..750e3e085 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,12 +3,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v0.0.1dev - [date] +## 0.0.1dev - [2024-10-16] Initial release of nf-core/circrna, created with the [nf-core](https://nf-co.re/) template. ### `Added` +- Multiple methods for BSJ detection (CIRIquant, find_circ, segemehl, mapsplice, circexplorer2, circrna_finder, DCC) +- Multiple methods for circRNA quantification (psirc-quant, CIRIquant, sum and max aggregations) +- DB and GTF-based circRNA annotation +- MiRNA target prediction (TargetScan, miRanda) and correlation analysis +- Basic statistical analyses (CircTest, CIRIquant differential expression) + ### `Fixed` ### `Dependencies` diff --git a/CITATIONS.md b/CITATIONS.md index 0103d5451..a4d226ee1 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,14 +10,158 @@ ## Pipeline tools +- [BEDTools](https://pubmed.ncbi.nlm.nih.gov/20110278/) + + > Quinlan AR, Hall IM. 
BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. doi: 10.1093/bioinformatics/btq033. Epub 2010 Jan 28. PubMed PMID: 20110278; PubMed Central PMCID: PMC2832824. + +- [Bowtie](https://doi.org/10.1186/gb-2009-10-3-r25) + + > Langmead, B., Trapnell, C., Pop, M. et al., 2009. Ultrafast and memory-efficient alignment of short DNA sequences to the human genome. Genome Biol 10, R25. doi: 10.1186/gb-2009-10-3-r25 + +- [Bowtie2](https://dx.doi.org/10.1038/nmeth.1923) + + > Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: 10.1038/nmeth.1923. + +- [BWA](https://www.ncbi.nlm.nih.gov/pubmed/19451168/) + + > Li H, Durbin R. Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics. 2009 Jul 15;25(14):1754-60. doi: 10.1093/bioinformatics/btp324. Epub 2009 May 18. PubMed PMID: 19451168; PubMed Central PMCID: PMC2705234. + +- [CIRCexplorer2](https://doi.org/10.1101/gr.202895.115) + + > Zhang XO, Dong R, Zhang Y, Zhang JL, Luo Z, Zhang J, Chen LL, Yang L. (2016). Diverse alternative back-splicing and alternative splicing landscape of circular RNAs. Genome Res. 2016 Sep;26(9):1277-87. + +- [circRNA finder](https://doi.org/10.1016/j.celrep.2014.10.062) + + > Westholm, J.O., Lai, E.C., et al. (2016). Genome-wide Analysis of Drosophila Circular RNAs Reveals Their Structural and Sequence Properties and Age-Dependent Neural Accumulation Westholm et al. Cell Reports. + +- [CIRIquant](https://doi.org/10.1038/s41467-019-13840-9) + + > Zhang, J., Chen, S., Yang, J. et al. (2020). Accurate quantification of circular RNAs identifies extensive circular isoform switching events. Nat Commun 11, 90. + +- [DCC](https://doi.org/10.1093/bioinformatics/btv656) + + > Jun Cheng, Franziska Metge, Christoph Dieterich, (2016). Specific identification and quantification of circular RNAs from sequencing data, Bioinformatics, 32(7), 1094–1096. 
+ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +- [find circ](https://doi.org/10.1038/nature11928) + + > Memczak, S., Jens, M., Elefsinioti, A., Torti, F., Krueger, J., Rybak, A., Maier, L., Mackowiak, S. D., Gregersen, L. H., Munschauer, M., Loewer, A., Ziebold, U., Landthaler, M., Kocks, C., le Noble, F., & Rajewsky, N. (2013). Circular RNAs are a large class of animal RNAs with regulatory potency. Nature, 495(7441), 333–338. + +- [GATK](https://pubmed.ncbi.nlm.nih.gov/20644199/) + + > McKenna A, Hanna M, Banks E, et al.: The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303. doi: 10.1101/gr.107524.110. Epub 2010 Jul 19. PubMed PMID: 20644199; PubMed Central PMCID: PMC2928508. + +- [HISAT2](https://pubmed.ncbi.nlm.nih.gov/31375807/) + + > Kim D, Paggi JM, Park C, Bennett C, Salzberg SL. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. 2019 Aug;37(8):907-915. doi: 10.1038/s41587-019-0201-4. Epub 2019 Aug 2. PubMed PMID: 31375807. + +- [MapSplice2](https://doi.org/10.1093/nar/gkq622) + + > Wang, K., Liu J., et al. (2010) MapSplice: Accurate mapping of RNA-seq reads for splice junction discovery, Nucleic Acids Research, 38(18), 178. + +- [miRanda](https://doi.org/10.1186/gb-2003-5-1-r1) + + > Enright, A.J., John, B., Gaul, U. et al. (2003). MicroRNA targets in Drosophila. Genome Biol 5, R1. + +- [find circ](https://doi.org/10.1038/nature11928) + + > Memczak, S., Jens, M., Elefsinioti, A., Torti, F., Krueger, J., Rybak, A., Maier, L., Mackowiak, S. D., Gregersen, L. H., Munschauer, M., Loewer, A., Ziebold, U., Landthaler, M., Kocks, C., le Noble, F., & Rajewsky, N. (2013). Circular RNAs are a large class of animal RNAs with regulatory potency. 
Nature, 495(7441), 333–338. + +- [GATK](https://pubmed.ncbi.nlm.nih.gov/20644199/) + + > McKenna A, Hanna M, Banks E, et al.: The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303. doi: 10.1101/gr.107524.110. Epub 2010 Jul 19. PubMed PMID: 20644199; PubMed Central PMCID: PMC2928508. + +- [HISAT2](https://pubmed.ncbi.nlm.nih.gov/31375807/) + + > Kim D, Paggi JM, Park C, Bennett C, Salzberg SL. Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype Graph-based genome alignment and genotyping with HISAT2 and HISAT-genotype. Nat Biotechnol. 2019 Aug;37(8):907-915. doi: 10.1038/s41587-019-0201-4. Epub 2019 Aug 2. PubMed PMID: 31375807. + +- [MapSplice2](https://doi.org/10.1093/nar/gkq622) + + > Wang, K., Liu J., et al. (2010) MapSplice: Accurate mapping of RNA-seq reads for splice junction discovery, Nucleic Acids Research, 38(18), 178. + +- [miRanda](https://doi.org/10.1186/gb-2003-5-1-r1) + + > Enright, A.J., John, B., Gaul, U. et al. (2003). MicroRNA targets in Drosophila. Genome Biol 5, R1. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [R](https://www.R-project.org/) + + > R Core Team (2020). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. + + - [biomaRt](https://doi.org/10.1038/nprot.2009.97) + + > Durinck S, Spellman PT, Birney E, Huber W. (2009). Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt. Nat Protoc. 4(8):1184-91. 
+ + - [circlize](https://doi.org/10.1093/bioinformatics/btu393) + + > Zuguang Gu, Lei Gu, Roland Eils, Matthias Schlesner, Benedikt Brors (2014). circlize implements and enhances circular visualization in R , Bioinformatics, 30,(19) 2811–2812. + + - [DESeq2](https://doi.org/10.1186/s13059-014-0550-8) + + > Love, M.I., Huber, W. & Anders, S. (2014). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol 15, 550. + + - [EnhancedVolcano](https://bioconductor.org/packages/release/bioc/html/EnhancedVolcano.html) + + > Blighe K, Rana S, Lewis M (2020). EnhancedVolcano: Publication-ready volcano plots with enhanced colouring and labeling. + + - [ggplot2](https://ggplot2.tidyverse.org) + + > Wickham H (2016). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. ISBN 978-3-319-24277-4. + + - [ggpubr](https://rpkgs.datanovia.com/ggpubr/) + + > Kassambara A. (2020). ggpubr: 'ggplot2' Based Publication Ready Plots. + + - [ihw](https://doi.org/10.1038/nmeth.3885) + + > Ignatiadis, N., Klaus, B., Zaugg, J. et al. (2016). Data-driven hypothesis weighting increases detection power in genome-scale multiple testing. Nat Methods 13, 577–580. + + - [PCAtools](https://bioconductor.org/packages/release/bioc/html/PCAtools.html) + + > Blighe K, Lun A (2020). PCAtools: PCAtools: Everything Principal Components Analysis. + + - [pheatmap](https://cran.r-project.org/package=pheatmap) + + > Kolde, R. (2019) Pretty Heatmaps. + + - [pvclust](https://doi.org/10.1093/bioinformatics/btl117) + + > Suzuki R., Shimodaira H., (2006). Pvclust: an R package for assessing the uncertainty in hierarchical clustering, Bioinformatics, 22(12), 1540–1542. + +- [SAMtools](https://pubmed.ncbi.nlm.nih.gov/19505943/) + + > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. 
doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PubMed PMID: 19505943; PubMed Central PMCID: PMC2723002. + +- [Segemehl](https://doi.org/10.1371/journal.pcbi.1000502) + + > Hoffmann S, Otto C, Kurtz S, Sharma CM, Khaitovich P, Vogel J, Stadler PF, Hackermueller J: "Fast mapping of short sequences with mismatches, insertions and deletions using index structures", PLoS Comput Biol (2009) vol. 5 (9) pp. e1000502. + +- [STAR](https://pubmed.ncbi.nlm.nih.gov/23104886/) + + > Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M, Gingeras TR. STAR: ultrafast universal RNA-seq aligner Bioinformatics. 2013 Jan 1;29(1):15-21. doi: 10.1093/bioinformatics/bts635. Epub 2012 Oct 25. PubMed PMID: 23104886; PubMed Central PMCID: PMC3530905. + +- [StringTie2](https://pubmed.ncbi.nlm.nih.gov/31842956/) + + > Kovaka S, Zimin AV, Pertea GM, Razaghi R, Salzberg SL, Pertea M. Transcriptome assembly from long-read RNA-seq alignments with StringTie2 Genome Biol. 2019 Dec 16;20(1):278. doi: 10.1186/s13059-019-1910-1. PubMed PMID: 31842956; PubMed Central PMCID: PMC6912988. + +- [TargetScan](https://doi.org/10.7554/elife.05005) + + > Agarwal V, Bell GW, Nam JW, Bartel DP. (2015). Predicting effective microRNA target sites in mammalian mRNAs. Elife, 4:e05005. + +- [ViennaRNA](https://doi.org/10.1186/1748-7188-6-26) + + > Lorenz, R., Bernhart, S.H., Höner zu Siederdissen, C. et al. (2011). ViennaRNA Package 2.0. Algorithms Mol Biol 6, 26. + +## Test data References + +> Cao D. An autoregulation loop in fust-1 for circular RNA regulation in Caenorhabditis elegans. Genetics. 2021 Nov 5;219(3):iyab145. doi: 10.1093/genetics/iyab145. PMID: 34740247; PMCID: PMC8570788. 
+ ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index f5fc95718..b1bda54a1 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,10 @@ [![GitHub Actions Linting Status](https://github.com/nf-core/circrna/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/circrna/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/circrna/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/) +[![GitHub Actions CI Status](https://github.com/nf-core/circrna/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/circrna/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/nf-core/circrna/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/circrna/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/circrna/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) + +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with 
singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -19,45 +22,84 @@ ## Introduction -**nf-core/circrna** is a bioinformatics pipeline that ... - - - - - - -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +**nf-core/circrna** is a bioinformatics pipeline to analyse total RNA sequencing data obtained from organisms with a reference genome and annotation. It takes a samplesheet and FASTQ files as input, performs quality control (QC), trimming, back-splice junction (BSJ) detection, annotation, quantification and miRNA target prediction of circular RNAs. + +The pipeline is still under development, but the BSJ detection and quantification steps are already implemented and functional. The following features are planned to be implemented soon: + +- Isoform-level circRNA detection and quantification +- circRNA-miRNA interaction analysis using [SPONGE](https://doi.org/10.1093/bioinformatics/btz314) and [spongEffects](https://doi.org/10.1093/bioinformatics/btad276) +- Improved downstream analyses + +If you want to contribute, feel free to create an issue or pull request on the [GitHub repository](https://github.com/nf-core/circrna) or join the [Slack channel](https://nf-co.re/join/slack). 
+ +## Pipeline summary + +![Metro Map](./docs/images/metro-map.png) + +- Raw read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +- Adapter trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)) +- BSJ detection + - [`CIRIquant`](https://github.com/Kevinzjy/CIRIquant) + - [`STAR 2-Pass mode`](https://github.com/alexdobin/STAR) + - [`CIRCexplorer2`](https://circexplorer2.readthedocs.io/en/latest/) + - [`circRNA finder`](https://github.com/orzechoj/circRNA_finder) + - [`DCC`](https://github.com/dieterich-lab/DCC) + - [`find circ`](https://github.com/marvin-jens/find_circ) + - [`MapSplice`](http://www.netlab.uky.edu/p/bioinfo/MapSplice2) + - [`Segemehl`](https://www.bioinf.uni-leipzig.de/Software/segemehl/) +- circRNA annotation + - Based on a GTF file + - Based on database files (if provided) +- Extract circRNA sequences and build circular transcriptome +- Merge circular transcriptome with linear transcriptome derived from provided GTF +- Quantification of combined circular and linear transcriptome + - [`psirc-quant`](https://github.com/Christina-hshi/psirc) +- miRNA binding affinity analysis (only if the `mature` parameter is provided) + - Normalizes miRNA expression (only if the `mirna_expression` parameter is provided) + - Binding site prediction + - [`miRanda`](http://cbio.mskcc.org/miRNA2003/miranda.html) + - [`TargetScan`](http://www.targetscan.org/cgi-bin/targetscan/data_download.vert72.cgi) + - Perform majority vote on binding sites + - Compute correlations between miRNA and transcript expression levels (only if the `mirna_expression` parameter is provided) +- Statistical tests (only if the `phenotype` parameter is provided) + - [`CircTest`](https://github.com/dieterich-lab/CircTest) +- MultiQC report [`MultiQC`](http://multiqc.info/) ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - - Now, you can run the pipeline using: - +```bash +nextflow run nf-core/circrna \ + -profile \ + --input samplesheet.csv \ + --outdir +``` + +> [!WARNING] +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). + +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/circrna/usage) and the [parameter documentation](https://nf-co.re/circrna/parameters). + +## Pipeline output + +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/circrna/results) tab on the nf-core website pipeline page. +For more details about the output files and reports, please refer to the +[output documentation](https://nf-co.re/circrna/output). ```bash nextflow run nf-core/circrna \ @@ -79,11 +121,22 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/circrna was originally written by Barry Digby, Nico Trummer. +nf-core/circrna was originally written by [Barry Digby](https://github.com/BarryDigby). +It was later refactored, extended and improved by [Nico Trummer](https://github.com/nictru). 
+ +We thank the following people for their extensive assistance in the development of this pipeline (in alphabetical order): -We thank the following people for their extensive assistance in the development of this pipeline: +- [Alexander Peltzer](https://github.com/apeltzer) +- [Ben Whittle](https://github.com/bj-w) +- [Kevin Menden](https://github.com/KevinMenden) +- [Malte Weyrich](https://github.com/mweyrich28) +- [Marieke Vromman](https://github.com/MariekeVromman) +- [Maxime Garcia](https://github.com/maxulysse) +- [Phil Ewels](https://github.com/ewels) - +## Acknowledgements + +![SFI](./docs/images/Genomics-Data-Science-original.png) ## Contributions and Support @@ -96,7 +149,12 @@ For further information or help, don't hesitate to get in touch on the [Slack `# - +> **nf-core/circrna: a portable workflow for the quantification, miRNA target prediction and differential expression analysis of circular RNAs.** +> +> Barry Digby, Stephen P. Finn, & Pilib Ó Broin +> +> [BMC Bioinformatics 24, 27 (2023)](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-022-05125-8) +> doi: [10.1186/s12859-022-05125-8](https://doi.org/10.1186/s12859-022-05125-8) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7b..8d222a80e 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,fastq_1,fastq_2,strandedness +SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz,forward +SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,,auto diff --git a/assets/schema_annotation.json b/assets/schema_annotation.json new file mode 100644 index 000000000..3eeae0eca --- /dev/null +++ b/assets/schema_annotation.json @@ -0,0 +1,34 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/circrna/master/assets/schema_annotation.json", + "title": "nf-core/circrna pipeline - params.annotation schema", + "description": "Schema for the file provided with params.annotation", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Annotation file name must be provided and cannot contain spaces", + "meta": ["id"] + }, + "file": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.bed$", + "errorMessage": "Annotation file must be provided and must be a BED file" + }, + "min_overlap": { + "type": "number", + "minimum": 0, + "maximum": 1, + "default": 0.9, + "errorMessage": "Minimum overlap must be a number between 0 and 1", + "meta": ["min_overlap"] + } + }, + "required": ["name", "file"] + } +} diff --git a/assets/schema_input.json b/assets/schema_input.json index fd10663ae..8d288c210 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -26,6 +26,13 @@ "exists": true, "pattern": 
"^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + }, + "strandedness": { + "type": "string", + "enum": ["unstranded", "forward", "reverse", "auto"], + "default": "auto", + "errorMessage": "Strandedness must be one of 'unstranded', 'forward', 'reverse' or 'auto'", + "meta": ["strandedness"] + } }, "required": ["sample", "fastq_1"] } diff --git a/assets/schema_phenotype.json b/assets/schema_phenotype.json new file mode 100644 index 000000000..740ec5b2d --- /dev/null +++ b/assets/schema_phenotype.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/circrna/master/assets/schema_phenotype.json", + "title": "nf-core/circrna pipeline - params.phenotype schema", + "description": "Schema for the file provided with params.phenotype", + "type": "array", + "items": { + "type": "object", + "properties": { + "sample": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample name must be provided and cannot contain spaces" + }, + "condition": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Condition name must be provided and cannot contain spaces" + } + }, + "required": ["sample", "condition"] + } +} diff --git a/bin/targetscan_format.sh b/bin/targetscan_format.sh new file mode 100755 index 000000000..9321221bb --- /dev/null +++ b/bin/targetscan_format.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +## Author: Barry Digby +## License: MIT + +## Script that converts miRbase (mature.fa) file to +## TargetScan compatibility. The motivation for doing +## this is the mature.fa file contains many more +## species than TargetScans miR_Family_Info.txt file. + +## Strategy is simply to output a 3 column tab delim +## text file containing: +## 1. miR ID +## 2. miR (7bp) seed sequence from mature seq +## 3. Species ID (set to 0000, not important for output). 
+ +## Subset mature.fa according to the species provided by user to '--genome'. + +## Stage input mature.fa file, species +MATURE="$1" + +## Uncompress if necessary +if [ ${MATURE: -3} == ".gz" ]; then + gunzip -f $MATURE + MATURE=${MATURE%%.gz} +fi + +## Convert to TargetScan +## Isolate the sequences +grep -v ">" $MATURE > mature_sequence +## Extract seed sequence (7bp after 1st) +awk '{print substr($1, 2, 7)}' mature_sequence > seed_sequence +## Isolate ID (awk last field (NF)) +grep ">" $MATURE | awk -F ' ' '{print $NF}' > miR_ID +## Combine +paste miR_ID seed_sequence > targetscan_tmp.txt +## Correct delimiter, add dummy species +awk -v OFS="\t" '{print $1, $2, "0000"}' targetscan_tmp.txt > mature.txt + +## Tidy the work dir (comment these for debugging scratch dirs) +rm -rf mature_sequence +rm -rf miR_ID +rm -rf targetscan_tmp.txt +rm -rf seed_sequence diff --git a/conf/base.config b/conf/base.config index aaf44bfed..3c4ef541d 100644 --- a/conf/base.config +++ b/conf/base.config @@ -24,7 +24,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } diff --git a/conf/full.config b/conf/full.config new file mode 100644 index 000000000..0c6aa64b5 --- /dev/null +++ b/conf/full.config @@ -0,0 +1,17 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config for full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines parameters so that minimal tests are converted to full size pipeline test. 
+ + Use as follows: + nextflow run nf-core/circrna -profile test,full,<docker/singularity> --outdir <OUTDIR> + nextflow run nf-core/circrna -profile test_igenomes,full,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +params { + tools = 'circexplorer2,ciriquant,find_circ,circrna_finder,mapsplice,dcc,segemehl' + min_tools = 2 +} diff --git a/conf/igenomes.config b/conf/igenomes.config index 3f1143775..ab2b7f045 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -13,28 +13,37 @@ params { genomes { 'GRCh37' { fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" + mature = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/SmallRNA/mature.fa" mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" + species_id = "hsa" } 'GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" + bowtie = 
"${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/SmallRNA/mature.fa" + mito_name = "chrM" + species_id = "hsa" + } + 'CHM13' { + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" + bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" + gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf" + gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" } 'CHM13' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" @@ -46,395 +55,465 @@ params { } 'GRCm38' { fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" + mature = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/SmallRNA/mature.fa" mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" + species_id = "mmu" } 'TAIR10' { fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" + mature = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/SmallRNA/mature.fa" mito_name = "Mt" + species_id = "ath" } 'EB2' { fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = 
"${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" + mature = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/SmallRNA/mature.fa" + species_id = "bsu" } 'UMD3.1' { fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" + mature = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "bta" } 'WBcel235' { fasta = 
"${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/SmallRNA/mature.fa" mito_name = "MtDNA" - macs_gsize = "9e7" + species_id = "cel" } 'CanFam3.1' { fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" + mature = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "cfa" } 'GRCz10' { fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "dre" } 'BDGP6' { fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" 
bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/SmallRNA/mature.fa" mito_name = "M" - macs_gsize = "1.2e8" + species_id = "dme" } 'EquCab2' { fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" + mature = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "eca" } 'EB1' { fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" + mature = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/SmallRNA/mature.fa" + species_id = "eco" } 'Galgal4' { fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "gga" } 'Gm01' { fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" star = 
"${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" + mature = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/SmallRNA/mature.fa" + species_id = "gmx" } 'Mmul_1' { fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" + mature = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "mml" } 'IRGSP-1.0' { fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" + bowtie = 
"${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/SmallRNA/mature.fa" mito_name = "Mt" + species_id = "osa" } 'CHIMP2.1.4' { fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" + mature = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "ptr" } 'Rnor_5.0' { fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = 
"${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "rno" } 'Rnor_6.0' { fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "rno" } 'R64-1-1' { fasta = 
"${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/SmallRNA/mature.fa" mito_name = "MT" - macs_gsize = "1.2e7" + species_id = "sce" } 'EF2' { fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" bed12 = 
"${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" + mature = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/SmallRNA/mature.fa" mito_name = "MT" - macs_gsize = "1.21e7" + species_id = "spo" } 'Sbi1' { fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" + mature = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/SmallRNA/mature.fa" + species_id = "sbi" } 'Sscrofa10.2' { fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" + mature = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/SmallRNA/mature.fa" mito_name = "MT" + species_id = "ssc" } 'AGPv3' { fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/SmallRNA/mature.fa" mito_name = "Mt" + species_id = "zma" } 'hg38' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/SmallRNA/mature.fa" mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" + species_id = "hsa" } 'hg19' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" + mature = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/SmallRNA/mature.fa" mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" + species_id = "hsa" } 'mm10' { fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BowtieIndex/" bowtie2 = 
"${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" + mature = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/SmallRNA/mature.fa" mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" + species_id = "mmu" } 'bosTau8' { fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/SmallRNA/mature.fa" mito_name = "chrM" + species_id = "bta" } 'ce10' { fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" + bowtie = 
"${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" + mature = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/SmallRNA/mature.fa" mito_name = "chrM" - macs_gsize = "9e7" + species_id = "cel" } 'canFam3' { fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" + bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" mito_name = "chrM" + species_id = "cfa" } 'danRer10' { fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = 
"${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" + bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" mito_name = "chrM" - macs_gsize = "1.37e9" + species_id = "dre" } 'dm6' { fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/SmallRNA/mature.fa" mito_name = "chrM" - macs_gsize = "1.2e8" + species_id = "dme" } 'equCab2' { fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" + 
fasta_fai = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" + mature = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/SmallRNA/mature.fa" mito_name = "chrM" + species_id = "eca" } 'galGal4' { fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" + mature = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/SmallRNA/mature.fa" mito_name = "chrM" + species_id = "gga" } 'panTro4' { 
fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" + mature = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/SmallRNA/mature.fa" mito_name = "chrM" + species_id = "ptr" } 'rn6' { fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" + mature = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/SmallRNA/mature.fa" mito_name = "chrM" + species_id = 
"rno" } 'sacCer3' { fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" + mature = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/SmallRNA/mature.fa" mito_name = "chrM" - macs_gsize = "1.2e7" + species_id = "sce" } 'susScr3' { fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa.fai" bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" + bowtie = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BowtieIndex/" bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" + mature = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/SmallRNA/mature.fa" mito_name = "chrM" + species_id 
= "ssc" } } } diff --git a/conf/modules.config b/conf/modules.config index d266a387f..8fbad91e5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,16 +18,1079 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: FASTQC { - ext.args = '--quiet' + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] + } + + withName: CAT_FASTQ { + publishDir = [ + path: { "${params.outdir}/preprocessing/merged_samples" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: SAMTOOLS_FAIDX { + publishDir = [ + path: { "${params.outdir}/references/index/fasta" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: '.*:FASTQC_TRIMGALORE:FASTQC' { + publishDir = [ + path: { "${params.outdir}/quality_control/fastqc" }, + mode: params.publish_dir_mode, + pattern: "*.{html,zip}" + ] + } + + withName: '.*:FASTQC_TRIMGALORE:TRIMGALORE' { + ext.args = { + [ + "--fastqc_args '-t ${task.cpus}' ", + params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' + ].join(' ').trim() + } + publishDir = [ + [ + path: { "${params.outdir}/quality_control/trimgalore" }, + mode: params.publish_dir_mode, + pattern: "*.fq.gz", + enabled: params.save_trimmed + ], + [ + path: { "${params.outdir}/quality_control/trimgalore" }, + mode: params.publish_dir_mode, + pattern: "*.txt" + ] + ] + } + + // PREPARE GENOME + withName: CLEAN_FASTA { + ext.args2 = '\'/>/{ gsub(\$2, "",\$2);gsub(" ", "") };{print}\'' + publishDir = [ + path: { "${params.outdir}/references/genome/clean_fasta" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? 
filename : null } + ] + } + + withName: GTFFILTER { + ext.suffix = "filtered.gtf" + publishDir = [ + path: { "${params.outdir}/references/genome/filtered_gtf" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: SEQKIT_SPLIT { + ext.args = "-i --by-id-prefix \"\"" + publishDir = [ + path: { "${params.outdir}/references/genome/chromosomes" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: BOWTIE_BUILD { + ext.when = { !params.bowtie && params.tools.split(',').contains('mapsplice') } + publishDir = [ + path: { "${params.outdir}/references/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: BOWTIE2_BUILD { + ext.when = { !params.bowtie2 && params.tools.split(',').contains('find_circ') } + publishDir = [ + path: { "${params.outdir}/references/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: BWA_INDEX { + ext.when = { !params.bwa && params.tools.split(',').contains('ciriquant') } + publishDir = [ + path: { "${params.outdir}/references/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: HISAT2_EXTRACTSPLICESITES { + ext.when = { params.tools.split(',').contains('ciriquant') } + publishDir = [ + path: { "${params.outdir}/references/index/hisat2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? 
filename : null } + ] + } + + withName: HISAT2_BUILD { + ext.when = { params.tools.split(',').contains('ciriquant') } + publishDir = [ + path: { "${params.outdir}/references/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: STAR_GENOMEGENERATE { + ext.when = { !params.star && ( params.tools.split(',').contains('circexplorer2') || params.tools.split(',').contains('dcc') || params.tools.split(',').contains('circrna_finder') ) } + ext.args = [ "", + params.sjdboverhang ? "--sjdbOverhang ${params.sjdboverhang}" : '', + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/references/index" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + // circRNA + + withName: '.*:SEGEMEHL:INDEX' { + publishDir = [ + path: { "${params.outdir}/references/index/segemehl" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: '.*:SEGEMEHL:ALIGN' { + ext.args = [ "", + "-b", + "-S" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/segemehl/intermediates" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:SEGEMEHL:EXTRACT' { + // Keep only rows with ";C;" in column 4 + // Print $1 $2 $3 $1:$2-$3 $5 $6 + ext.args = "-v FS='\\t' -v OFS='\\t' '{ if (\$4 ~ /;C;/) { print \$1, \$2, \$3, \$1 \":\" \$2 \"-\" \$3 \":\" \$6, \$5, \$6 } }'" + ext.suffix = "segemehl_extracted.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/segemehl/extracted" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? 
filename : null } + ] + } + + withName: '.*:SEGEMEHL:SORT' { + ext.args = "-k1,1 -k2,2n -k3,3n -k4,4 -k6,6" + ext.suffix = "segemehl_sorted.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/segemehl/sorted" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:SEGEMEHL:GROUP' { + ext.summary_col = 5 + ext.args = "-g 1,2,3,4,6 -o count" + ext.suffix = "segemehl_grouped.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/segemehl/grouped" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:SEGEMEHL:UNIFY' { + ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$4, \$6, \$5 }'" + ext.suffix = "segemehl.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/segemehl/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.segemehl.bed" + ] + } + + withName: '.*:STAR2PASS:PASS_1' { + ext.when = { params.tools.split(',').contains('circexplorer2') || params.tools.split(',').contains('circrna_finder') } + ext.args = [ "", + "--chimOutType Junctions WithinBAM", + "--outSAMunmapped Within", + "--outFilterType BySJout", + "--outReadsUnmapped None", + "--readFilesCommand zcat", + "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}", + "--limitSjdbInsertNsj ${params.limitSjdbInsertNsj}", + "--chimJunctionOverhangMin ${params.chimJunctionOverhangMin}", + "--chimSegmentMin ${params.chimSegmentMin}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/star/1st_pass/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? 
filename : null } + ] + } + + withName: '.*:STAR2PASS:SJDB' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/star/sjdb/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:STAR2PASS:PASS_2' { + ext.args = [ "", + params.tools.split(',').contains('circrna_finder') ? "--chimOutType Junctions SeparateSAMold" : "--chimOutType Junctions WithinBAM", + "--outSAMunmapped Within", + "--outFilterType BySJout", + "--outReadsUnmapped None", + "--readFilesCommand zcat", + "--sjdbFileChrStartEnd dataset.SJ.out.tab", + "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}", + "--limitSjdbInsertNsj ${params.limitSjdbInsertNsj}", + "--chimJunctionOverhangMin ${params.chimJunctionOverhangMin}", + "--chimSegmentMin ${params.chimSegmentMin}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/star/2nd_pass/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:CIRCEXPLORER2:REFERENCE' { + ext.args = [ "", + "-genePredExt", + "-geneNameAsName2" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/references/bsj_detection/circexplorer2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: '.*:CIRCEXPLORER2:PARSE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/circexplorer2/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? 
filename : null } + ] + } + + withName: '.*:CIRCEXPLORER2:ANNOTATE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/circexplorer2/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:CIRCEXPLORER2:UNIFY' { + ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$1 \":\" \$2 \"-\" \$3 \":\" \$6, \$13, \$6 }'" + ext.suffix = "circexplorer2.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/circexplorer2/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.circexplorer2.bed" + ] + } + + withName: '.*:CIRCRNA_FINDER:MAIN' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/circrna_finder/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null }, + pattern: "*.bed" + ] + } + + withName: '.*:CIRCRNA_FINDER:UNIFY' { + ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$1 \":\" \$2 \"-\" \$3 \":\" \$6, \$5, \$6 }'" + ext.suffix = "circrna_finder.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/circrna_finder/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.circrna_finder.bed" + ] + } + + withName: '.*:FIND_CIRC:ALIGN' { + ext.args = { "--very-sensitive --mm -D 20 --score-min=C,-15,0 -q " + + (!meta.strandedness || meta.strandedness == 'unstranded' || meta.strandedness == 'auto' ? '' : + meta.strandedness == 'forward' ? ' --norc' : ' --nofw') } + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/find_circ/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? 
filename : null } + ] + } + + withName: '.*:FIND_CIRC:SAMTOOLS_INDEX' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/find_circ/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:FIND_CIRC:SAMTOOLS_VIEW' { + ext.prefix = { "${meta.id}_unmapped" } + ext.args = "-hf 4" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/find_circ/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:FIND_CIRC:ANCHORS' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/find_circ/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:FIND_CIRC:MAIN' { + ext.args = { !meta.strandedness || meta.strandedness == 'unstranded' || meta.strandedness == 'auto' ? '' : + meta.strandedness == 'forward' ? ' --norc' : ' --nofw' } + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/find_circ/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:FIND_CIRC:UNIFY' { + // Keep only rows with CIRCULAR, UNAMBIGUOUS_BP and ANCHOR_UNIQUE in $18 + ext.args = "-v FS='\\t' -v OFS='\\t' '{ if (\$18 ~ /CIRCULAR/ && \$18 ~ /UNAMBIGUOUS_BP/ && \$18 ~ /ANCHOR_UNIQUE/) { print \$1, \$2, \$3, \$1 \":\" \$2 \"-\" \$3 \":\" \$6, \$5, \$6 } }'" + ext.suffix = "find_circ.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/find_circ/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + pattern: "*.find_circ.bed" + ] + } + + withName: '.*:BSJ_DETECTION:CIRIQUANT:MAIN' { + ext.args = "--no-gene" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/ciriquant/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:CIRIQUANT:UNIFY' { + // Drop all rows starting with # + // $count is $14 until the dot (never has decimals) + // Print $1 $4 $5 $1:$4-$5:$7 $count $7 + ext.args = "-v OFS='\\t' '{ count = substr(\$14, 1, index(\$14, \".\") - 1); print \$1, \$4, \$5, \$1 \":\" \$4 \"-\" \$5 \":\" \$7, count, \$7 }'" + ext.suffix = "ciriquant.bed" + + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/ciriquant/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.ciriquant.bed" + ] + } + + withName: '.*:DCC:MATE1_1ST_PASS' { + ext.prefix = { "${meta.id}_mate1" } + ext.args = [ "", + "--chimOutType Junctions WithinBAM", + "--outSAMunmapped Within", + "--outFilterType BySJout", + "--outReadsUnmapped None", + "--readFilesCommand zcat", + "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}", + "--limitSjdbInsertNsj ${params.limitSjdbInsertNsj}", + "--chimJunctionOverhangMin ${params.chimJunctionOverhangMin}", + "--chimSegmentMin ${params.chimSegmentMin}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/intermediates/mate1/1st_pass" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:DCC:MATE1_SJDB' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/intermediates/mate1/sjdb" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? 
filename : null } + ] + } + + withName: '.*:DCC:MATE1_2ND_PASS' { + ext.prefix = { "${meta.id}_mate1" } + ext.args = [ "", + "--chimOutType Junctions WithinBAM", + "--outSAMunmapped Within", + "--outFilterType BySJout", + "--outReadsUnmapped None", + "--readFilesCommand zcat", + "--sjdbFileChrStartEnd dataset.SJ.out.tab", + "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}", + "--limitSjdbInsertNsj ${params.limitSjdbInsertNsj}", + "--chimJunctionOverhangMin ${params.chimJunctionOverhangMin}", + "--chimSegmentMin ${params.chimSegmentMin}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/intermediates/mate1/2nd_pass" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:DCC:MATE2_1ST_PASS' { + ext.prefix = { "${meta.id}_mate2" } + ext.args = [ "", + "--chimOutType Junctions WithinBAM", + "--outSAMunmapped Within", + "--outFilterType BySJout", + "--outReadsUnmapped None", + "--readFilesCommand zcat", + "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}", + "--limitSjdbInsertNsj ${params.limitSjdbInsertNsj}", + "--chimJunctionOverhangMin ${params.chimJunctionOverhangMin}", + "--chimSegmentMin ${params.chimSegmentMin}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/intermediates/mate2/1st_pass" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:DCC:MATE2_SJDB' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/intermediates/mate2/sjdb" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? 
filename : null } + ] + } + + withName: '.*:DCC:MATE2_2ND_PASS' { + ext.prefix = { "${meta.id}_mate2" } + ext.args = [ "", + "--chimOutType Junctions WithinBAM", + "--outSAMunmapped Within", + "--outFilterType BySJout", + "--outReadsUnmapped None", + "--readFilesCommand zcat", + "--sjdbFileChrStartEnd dataset.SJ.out.tab", + "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}", + "--limitSjdbInsertNsj ${params.limitSjdbInsertNsj}", + "--chimJunctionOverhangMin ${params.chimJunctionOverhangMin}", + "--chimSegmentMin ${params.chimSegmentMin}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/intermediates/mate2/2nd_pass" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:DCC:MAIN' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:DCC:UNIFY' { + ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$1 \":\" \$2 \"-\" \$3 \":\" \$4, \$5, \$4 }'" + ext.suffix = "dcc.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/dcc/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.dcc.bed" + ] + } + + withName: '.*:MAPSPLICE:REFERENCE' { + ext.args = [ "", + "-genePredExt", + "-geneNameAsName2" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/references/bsj_detection/mapsplice" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? 
filename : null } + ] + } + + withName: '.*:MAPSPLICE:ALIGN' { + ext.args = [ "", + "--seglen ${params.seglen}", + "--min-intron ${params.min_intron}", + "--max-intron ${params.max_intron}", + "--min-map-len ${params.min_map_len}", + "--min-fusion-distance ${params.min_fusion_distance}", + "--fusion-non-canonical" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/mapsplice/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:MAPSPLICE:PARSE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/mapsplice/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:MAPSPLICE:ANNOTATE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/mapsplice/intermediates/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:MAPSPLICE:UNIFY' { + ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$1 \":\" \$2 \"-\" \$3 \":\" \$6, \$10, \$6 }'" + ext.suffix = "mapsplice.bed" + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/mapsplice/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.mapsplice.bed" + ] + } + + withName: 'FILTER_BSJS' { + // Make sure score is higher or equal to the threshold + ext.args = { "-v FS='\\t' -v OFS='\\t' '{ if (\$5 >= ${params.bsj_reads}) { print } }'" } + ext.suffix = {"${meta.tool}.filtered.bed"} + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/filtered" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + + withName: 'EXTRACT_COUNTS' { + // Add meta.id as header + // Keep columns 4,5 + ext.args = { "-v FS='\\t' -v OFS='\\t' 'BEGIN { print \"id\", \"${meta.id}\" } { print \$4, \$5 }'" } + ext.suffix = {"counts.tsv"} + publishDir = [ + enabled: false + ] + } + + withName: 'COMBINE_COUNTS_PER_TOOL' { + ext.args = "-f 1 -t -O" + maxRetries = 3 + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'BED_ADD_SAMPLE_TOOL' { + ext.args = { "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \$4, \$5, \$6, \"${meta.id}\", \"${meta.tool}\" }'" } + ext.prefix = { "${meta.id}_${meta.tool}" } + ext.suffix = { "meta.bed" } + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/meta" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'COMBINE_TOOLS_PER_SAMPLE' { + ext.suffix = "combined.bed" + + publishDir = [ + path: { "${params.outdir}/bsj_detection/samples/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'COMBINE_SAMPLES' { + ext.suffix = "combined.bed" + + publishDir = [ + path: { "${params.outdir}/bsj_detection/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: FASTA_COMBINED { + ext.suffix = "fasta" + publishDir = [ + path: { "${params.outdir}/bsj_detection/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: FASTA_PER_SAMPLE { + ext.suffix = "fasta" + publishDir = [ + path: { "${params.outdir}/bsj_detection/samples/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + + withName: FASTA_PER_SAMPLE_TOOL { + ext.suffix = { "${meta.tool}.fasta" } + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/fasta" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:ANNOTATE_(COMBINED|PER_SAMPLE|PER_SAMPLE_TOOL):INTERSECT_GTF' { + ext.args = "-loj" + ext.suffix = "intersect_gtf.bed" + } + + withName: '.*:ANNOTATE_COMBINED:INTERSECT_GTF' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:ANNOTATE_PER_SAMPLE:INTERSECT_GTF' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/samples/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:ANNOTATE_PER_SAMPLE_TOOL:INTERSECT_GTF' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/annotated" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? 
filename : null } + ] + } + + withName: '.*:ANNOTATE_(COMBINED|PER_SAMPLE|PER_SAMPLE_TOOL):INGEST_DATABASE_NAMES' { + ext.args = { "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$3, \"${meta.id}:\" \$4, \$5, \$6 }'" } + ext.suffix = "named.bed" + + publishDir = [ + path: { "${params.outdir}/references/named_databases" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null } + ] + } + + withName: '.*:ANNOTATE_(COMBINED|PER_SAMPLE|PER_SAMPLE_TOOL):INTERSECT_DATABASE' { + ext.args = { "-f ${meta.min_overlap} -r -loj -wa -wb" } + ext.suffix = "intersect_database.bed" + } + + withName: '.*:ANNOTATE_COMBINED:INTERSECT_DATABASE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:ANNOTATE_PER_SAMPLE:INTERSECT_DATABASE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/samples/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:ANNOTATE_PER_SAMPLE_TOOL:INTERSECT_DATABASE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/annotated" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:ANNOTATE_(COMBINED|PER_SAMPLE|PER_SAMPLE_TOOL):ANNOTATE' { + ext.prefix = { "${meta.id}.annotated" } + maxRetries = 3 + } + + withName: '.*:ANNOTATE_COMBINED:ANNOTATE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: '.*:ANNOTATE_PER_SAMPLE:ANNOTATE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/samples/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:ANNOTATE_PER_SAMPLE_TOOL:ANNOTATE' { + publishDir = [ + path: { "${params.outdir}/bsj_detection/tools/${meta.tool}/annotated" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: ADD_BACKSPLICE { + ext.args = "-c fastx '{ if (\$name ~ /^circ_/) { \$seq = \$seq substr(\$seq, 1, 25) } print \">\" \$name; print \$seq }'" + ext.suffix = "backspliced.fa" + publishDir = [ + path: { "${params.outdir}/mirna_prediction" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: UNIFY_MIRANDA { + ext.args = "-v FS='\\t' -v OFS='\\t' 'NR>1 { print \$1, \$2, \$7, \$8, \"miranda\" }'" + ext.suffix = "miranda.tsv" + publishDir = [ + path: { "${params.outdir}/mirna_prediction/binding_sites/tools/miranda/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: UNIFY_TARGETSCAN { + ext.args = "-v FS='\\t' -v OFS='\\t' 'NR>1 { print \$2, \$1, \$6, \$7, \"targetscan\" }'" + ext.suffix = "targetscan.tsv" + publishDir = [ + path: { "${params.outdir}/mirna_prediction/binding_sites/tools/targetscan/unified" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + + withName: COMBINE_BINDINGSITES { + ext.prefix = "bindingsites.tsv" + } + + withName: COMBINE_TRANSCRIPTOME_GTFS { + ext.args = "-k 1,1 -k4,4n -k5,5n" + ext.suffix = "combined.gtf" + publishDir = [ + path: { "${params.outdir}/quantification/transcriptome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: EXCLUDE_OVERLONG_TRANSCRIPTS { + ext.args = "-v FS='\\t' -v OFS='\\t' '\$5-\$4 <= 10000 { print }'" + ext.suffix = "filtered.gtf" + publishDir = [ + path: { "${params.outdir}/quantification/transcriptome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: TRANSCRIPTOME { + ext.args = "-w" + publishDir = [ + path: { "${params.outdir}/quantification/transcriptome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: MARK_CIRCULAR { + // GAWK process that marks FASTA headers. + // Leaves headers starting with "ENS" and non-header lines as is. + // Adds "\tC" to the end of the header for all other headers + ext.args = "-v FS='\\t' -v OFS='\\t' '{ if (!/^>circ_/) { print } else { print \$1 \"\\tC\" } }'" + ext.suffix = "marked.fasta" + publishDir = [ + path: { "${params.outdir}/quantification/transcriptome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: '.*:QUANTIFICATION:CIRIQUANT:MAIN' { + publishDir = [ + [ + path: { "${params.outdir}/quantification/ciriquant/results/transcripts" }, + mode: params.publish_dir_mode, + pattern: "*/gene/*_out.gtf", + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename.split('/').last() } + ], + [ + path: { "${params.outdir}/quantification/ciriquant/results/genes" }, + mode: params.publish_dir_mode, + pattern: "*/gene/*_genes.list", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename.split('/').last() } + ], + [ + path: { "${params.outdir}/quantification/ciriquant/results/circs" }, + mode: params.publish_dir_mode, + pattern: "*/*.gtf", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename.split('/').last() } + ] + ] + } + + withName: '.*:QUANTIFICATION:CIRIQUANT:EXTRACT_CIRC' { + // PYGTFTK process that transforms a GTF file into circ_id, gene_id, count + ext.args = "-k circ_id,gene_id,score -s \\\t" + ext.suffix = "circ.tsv" + publishDir = [ + path: { "${params.outdir}/quantification/ciriquant/circ" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:QUANTIFICATION:CIRIQUANT:EXTRACT_GENES' { + ext.args = "-v FS='\\t' -v OFS='\\t' '{ print \$1, \$2, \$9 }'" + ext.suffix = "gene.tsv" + publishDir = [ + path: { "${params.outdir}/quantification/ciriquant/gene" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: ".*:CIRIQUANT:JOIN_(GENE|CIRC)" { + publishDir = [ + path: { "${params.outdir}/quantification/ciriquant/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: ".*:CIRIQUANT:JOIN_CIRC" { + ext.metacols = "circ_id,gene_id" + } + + withName: PSIRC_INDEX { + publishDir = [ + path: { "${params.outdir}/references/index/psirc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? 
filename : null } + ] + } + + withName: RUN_PSIRC_QUANT { + publishDir = [ + path: { "${params.outdir}/quantification/samples/${meta.id}/psirc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CUSTOM_TX2GENE' { + publishDir = [ + path: { "${params.outdir}/quantification/transcriptome" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: TXIMETA_TXIMETA { + publishDir = [ + path: { "${params.outdir}/quantification/samples/${meta.id}/tximeta" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: TXIMETA_TXIMPORT { + publishDir = [ + path: { "${params.outdir}/quantification/samples/${meta.id}/tximport" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: ".*:PSIRC_QUANT:JOIN_(GENE|TX)_(COUNTS|TPM)" { + ext.args = "-f 1,2 -t" + label = "process_medium" + maxRetries = 3 + publishDir = [ + path: { "${params.outdir}/quantification/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: ".*:SPLIT_TYPES_(COUNTS|TPM)" { + publishDir = [ + path: { "${params.outdir}/quantification/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: MERGE_EXPERIMENTS { + publishDir = [ + path: { "${params.outdir}/quantification/combined" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:MIRNA_PREDICTION:DESEQ2_NORMALIZATION' { + publishDir = [ + path: { "${params.outdir}/mirna_prediction/mirna_expression" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + + withName: '.*:MIRNA_PREDICTION:MIRNA_FILTERING' { + publishDir = [ + path: { "${params.outdir}/mirna_prediction/mirna_expression" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: TARGETSCAN_DATABASE { + publishDir = [ + path: { "${params.outdir}/references/mirna_prediction/targetscan" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_reference ) ? filename : null }, + pattern: "mature.txt" + ] + } + + withName: TARGETSCAN { + ext.prefix = { "${meta.id}.targetscan" } + publishDir = [ + path: { "${params.outdir}/mirna_prediction/binding_sites/tools/targetscan/output" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.txt" + ] + } + + withName: MIRANDA { + ext.prefix = { "${meta.id}.miranda" } + ext.args = "-strict" + publishDir = [ + path: { "${params.outdir}/mirna_prediction/binding_sites/tools/miranda/output" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.txt" + ] + } + + withName: MIRNA_TARGETS { + publishDir = [ + path: { "${params.outdir}/mirna_prediction/binding_sites/targets" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*.txt" + ] + } + + withName: COMBINE_BINDINGSITES { + publishDir = [ + path: { "${params.outdir}/mirna_prediction/binding_sites/majority_vote" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: MAJORITY_VOTE { + publishDir = [ + path: { "${params.outdir}/mirna_prediction/binding_sites/majority_vote" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + ] + } + + withName: '.*:MIRNA_PREDICTION:COMPUTE_CORRELATIONS' { + publishDir = [ + path: { "${params.outdir}/mirna_prediction/correlation" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: CIRCTEST_PREPARE { + publishDir = [ + path: { "${params.outdir}/statistical_tests/circtest" }, + mode: params.publish_dir_mode, + saveAs: { filename -> ( filename != 'versions.yml' && params.save_intermediates ) ? filename : null } + ] + } + + withName: CIRCTEST_CIRCTEST { + publishDir = [ + path: { "${params.outdir}/statistical_tests/circtest" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CIRIQUANT_PREPDE { + publishDir = [ + path: { "${params.outdir}/statistical_tests/ciriquant_de/ciriquant_prep" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: STRINGTIE_PREPDE { + publishDir = [ + path: { "${params.outdir}/statistical_tests/ciriquant_de/stringtie_prep" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CIRIQUANT_DE { publishDir = [ - path: { "${params.outdir}/multiqc" }, + path: { "${params.outdir}/statistical_tests/ciriquant_de" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? 
"--title \"$params.multiqc_title\"" : '' } + } } diff --git a/conf/test.config b/conf/test.config index 4c30298f3..4c3d51c7f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -22,11 +22,19 @@ params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 6.h - // Genome references - genome = 'R64-1-1' + // Test input data + input = "${params.test_data_base}/samples.csv" + fasta = "${params.test_data_base}/reference/chrI.fa" + gtf = "${params.test_data_base}/reference/chrI.gtf" + mature = "${params.test_data_base}/reference/mature.fa" + tools = "circexplorer2" + phenotype = "${params.test_data_base}/phenotype.csv" + skip_trimming = false + outdir = "results/" + bsj_reads = 2 } diff --git a/conf/test_full.config b/conf/test_full.config index 448193d73..094a3f072 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -1,24 +1,2 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running full-size tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a full size pipeline test. 
- - Use as follows: - nextflow run nf-core/circrna -profile test_full, --outdir - ----------------------------------------------------------------------------------------- -*/ - -params { - config_profile_name = 'Full test profile' - config_profile_description = 'Full test dataset to check pipeline function' - - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Genome references - genome = 'R64-1-1' -} +includeConfig 'test.config' +includeConfig 'full.config' diff --git a/conf/test_igenomes.config b/conf/test_igenomes.config new file mode 100644 index 000000000..d23ddbe8f --- /dev/null +++ b/conf/test_igenomes.config @@ -0,0 +1,27 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests using igenomes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a minimal pipeline test. 
+ + Use as follows: + nextflow run nf-core/circrna -profile test_igenomes, --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Minimal igenomes profile' + config_profile_description = 'Minimal igenomes test dataset to check pipeline function' + + // Input data for minimal test using igenomes + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circrna/samples.csv' + + genome = 'ce10' + tool = 'circexplorer2' + phenotype = 'https://raw.githubusercontent.com/nf-core/test-datasets/circrna/phenotype.csv' + skip_trimming = false + star = null // igenomes STAR version is not compatible + outdir = 'results/' + bsj_reads = 2 +} diff --git a/docs/images/Genomics-Data-Science-original.png b/docs/images/Genomics-Data-Science-original.png new file mode 100644 index 000000000..50e9c02b7 Binary files /dev/null and b/docs/images/Genomics-Data-Science-original.png differ diff --git a/docs/images/metro-map.png b/docs/images/metro-map.png new file mode 100644 index 000000000..2f4f8af89 Binary files /dev/null and b/docs/output.md b/docs/output.md index 23814600e..b88860a98 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,22 +2,44 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report generated from the [full-sized test dataset](https://github.com/nf-core/test-datasets/tree/circrna) for the pipeline using a command similar to the one below: -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
- - +```bash +nextflow run nf-core/circrna -profile test_full, +``` -## Pipeline overview +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- references: Indices for various tools and intermediate reference genome files +- preprocessing: Per-sample concatenated FASTQ files +- quality_control + - fastqc: FastQC reports for raw reads + - trimgalore: Trim Galore! reports for trimmed reads +- bsj_detection + - combined: Combined BSJ calls across all samples + - samples: Per sample BSJ calls + - tools: Per tool and sample BSJ calls +- quantification + - combined: Quantification results for linear and circular transcripts across samples + - samples: Per sample quantification results + - transcriptome: Combined linear and circular transcriptome, based on GTF file and detected BSJs +- mirna_prediction + - binding_sites + - correlation + - mirna_expression +- statistical_tests + - circtest +- multiqc +- pipeline_info -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +## Quality Control ### FastQC +:::note +The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +::: +
Output files @@ -25,37 +47,467 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `*_fastqc.html`: FastQC report containing quality metrics. - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +::: note +The FastQC plots in this directory are generated relative to the raw, input reads. They may contain adapter sequence and regions of low quality. +::: +
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) + +![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) + +![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) + +### TrimGalore + +
+Output files + +- `trimgalore/` + - `*.fq.gz`: If `--save_trimmed` is specified, FastQ files **after** adapter trimming will be placed in this directory. + - `*_trimming_report.txt`: Log file generated by Trim Galore!. +- `trimgalore/fastqc/` + - `*_fastqc.html`: FastQC report containing quality metrics for read 1 (_and read2 if paired-end_) **after** adapter trimming. + - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. + +
+ +[Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) is a wrapper tool around Cutadapt and FastQC to perform quality and adapter trimming on FastQ files. By default, Trim Galore! will automatically detect and trim the appropriate adapter sequence. + +![MultiQC - cutadapt trimmed sequence length plot](images/mqc_cutadapt_trimmed.png) + ### MultiQC
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `quality_control/MultiQC/` + - `Raw_Reads_MultiQC.html`: Summary reports of unprocessed RNA-Seq reads. + - `Trimmed_Reads_MultiQC.html`: Summary reports of processed RNA-Seq reads.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. `nf-core` outputs HTML reports for sequencing read quality control. + +## Reference files + +
+Output files + +- `references` + - `index` + - `bowtie`: Directory containing `Bowtie` indices. + - `bowtie2`: Directory containing `Bowtie2` indices. + - `bwa`: Directory containing `BWA` indices. + - `fasta`: Directory containing FASTA index (`.fai`). + - `hisat2`: Directory containing `HISAT2` indices. + - `segemehl`: Directory containing `Segemehl` index file. + - `star`: Directory containing `STAR` indices. + - `genome` + - `clean_fasta`: Directory containing a FASTA file with reduced headers, since MapSplice has problems with multiple header fields. + - `filtered_gtf`: Directory containing a GTF file with only entries that reside on chromosomes present in the reference FASTA file. + - `chromosomes`: Directory containing individual FASTA files for each chromosome. + - `bsh_detection` + - `circexplorer2`: Directory containing the `CIRCexplorer2` annotation file. + - `mapsplice`: Directory containing the `MapSplice` annotation file. + - `mirna_prediction` + - `targetscan`: Directory containing the TargetScan miRNA database. + +
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +nf-core/circrna will add the reference files to the output directory if `save_reference` is set to `true`. The resulting files, especially the aligner indices, can be used for speeding up future runs (if the `resume` option cannot be used). In order to achieve this, copy the indices to a location outside of the pipeline's output directory and provide the path to the indices via the corresponding aligner flags (check the [parameters documentation](https://nf-co.re/circrna/parameters/#reference-genome-options) for more information). -### Pipeline information +## Pipeline info
Output files -- `pipeline_info/` +- `pipeline_info` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`.
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +## BSJ detection + +The rough workflow for the BSJ detection looks like this: + +1. Each tool detects BSJs in each sample and quantifies how many reads support each BSJ. +2. Bring the tool outputs into a common format. +3. Apply a threshold (parameter `bsj_reads`) to the BSJ reads to filter out lowly supported BSJs. +4. Combine all tool-specific BSJ calls per sample into a single file. +5. Filter out BSJs that are not supported by at least as many tools as specified by `min_tools`. +6. Merge all samples into a single file. This now represents the "circular transcriptome". + +### Per tool + +
+Output files available for all tools + +- `unified`: Directory containing the BSJ calls in the BED6 format. +- `filtered`: Based on `unified`, but filtered for BSJs with at least `bsj_reads` supporting reads. +- `masked`: Based on `filtered`, but scores are replaced by a dot (.) +- `annotated`: Based on `masked`, but with additional columns for the circRNA type, the host gene(s), host transcript(s) and potential database hits. Contains a BED and a GTF file for each sample. +- `fasta`: Extracted sequences of the circRNAs in FASTA format. Based on `masked`. +- `intermediates`: Contains intermediate files generated by the BSJ detection tools, as explained below. +- `${tool}.csv`: Number of reads that the tool found supporting the BSJ. + +
+ +An exception to the above is `star`, which is not used as a standalone BSJ detection tool, but the output of a 2-pass STAR alignment is used by `CIRCexplorer2`, `circRNA finder` and `DCC`. + +#### CIRCexplorer2 + +
+Output files + +- `bsj_detection/tools/circexplorer2/intermediates/${sample_id}/` + - `*.bed`: Intermediate file generated by `CIRCexplorer2 parse` module, identifying STAR fusion junctions for downstream annotation. + - `*.txt`: Output files generated by `CIRCexplorer2 annotate` module, based on BED 12 format containing circRNA genomic location information, exon cassette composition and an additional 6 columns specifying circRNA annotations. Full descriptions of the 18 columns can be found in the `CIRCexplorer2` [documentation](https://circexplorer2.readthedocs.io/en/latest/modules/annotate/#output). + +
+ +[CIRCexplorer2](https://circexplorer2.readthedocs.io/en/latest/) uses `*.Chimeric.out.junction` files generated from `STAR` 2 pass mode to extract back-splice junction sites using the `CIRCexplorer2 parse` module. Following this, `CIRCexplorer2 annotate` performs re-alignment of reads to the back-splice junction sites to determine the precise positions of downstream donor and upstream acceptor splice sites. Back-splice junction sites are subsequently updated and annotated using the customised annotation text file. + +#### circRNA finder + +
+Output files + +- `bsj_detection/tools/circrna_finder/intermediates/${sample_id}/` + + - `*.filteredJunctions.bed`: A bed file with **all** circular junctions found by the pipeline. The score column indicates the number reads spanning each junction. + - `*.s_filteredJunctions.bed`: A bed file with those junctions in `*.filteredJunctions.bed` that are flanked by GT-AG splice sites. The score column indicates the number reads spanning each junction. + - `*.s_filteredJunctions_fw.bed`: A bed file with the same circular junctions as in file (b), but here the score column gives the average number of forward spliced reads at both splice sites around each circular junction. + +
+ +[circRNA finder](https://github.com/orzechoj/circRNA_finder) uses `*.Chimeric.out.sam`, `*.Chimeric.out.junction` & `*.SJ.out.tab` from STAR 2nd pass files to identify circular RNAs in RNA-Seq data. + +#### CIRIquant + +
+Output files + +- `bsj_detection/tools/ciriquant/intermediates/${sample_id}/` + - `*.log`: A `CIRIerror.log` file which should be empty, and a `${sample_id}.log` file which contains the output log of `CIRIquant`. + - `*.bed`: `CIRI2` output file in BED 6 format. + - `*.gtf`: Output file from `CIRIquant` in GTF format. Full description of the columns available in the `CIRIquant` [documentation](https://ciriquant-cookbook.readthedocs.io/en/latest/quantification.html#output-format). + - `align/` + - `*.sorted.{bam, bam.bai}`: (Sorted and indexed) bam file from `HISAT2` alignment of RNA-Seq reads. + - `circ/` + - `*.ciri`: `CIRI2` output file. + - `*_denovo.sorted.{bam, bam.bai}`: (Sorted and indexed) bam file from `BWA` alignment of candidate circular reads to the pseudo reference. + - `*_index.*.ht2`: `BWA` index files of the pseudo reference. + - `*_index.fa`: Reference FASTA file of candidate circular reads. + +
+ +[CIRIquant](https://github.com/Kevinzjy/CIRIquant) operates by aligning RNA-Seq reads using `HISAT2` and [CIRI2](https://sourceforge.net/projects/ciri/files/CIRI2/) to identify putative circRNAs. Next, a pseudo reference index is generated using `bwa index` by concatenating the two full-length sequences of the putative back-splice junction regions. Candidate circular reads are re-aligned against this pseudo reference using `bwa mem`, and back-splice junction reads are determined if they can be linearly and completely aligned to the putative back-splice junction regions. + +#### DCC + +
+Output files + +- `/bsj_detection/tools/dcc/intermediates/${sample_id}/` + - `*.txt`: Output file from `DCC` containing position and BSJ read counts of circRNAs. + +
+ +[DCC](https://github.com/dieterich-lab/DCC) identifies back-splice junction sites from `*Chimeric.out.junction`, `*SJ.out.tab` & `*Aligned.sortedByCoord.out.bam` files generated by `STAR` 2 pass mode, mapping the paired end reads both jointly and separately (`STAR` does not output read pairs that contain more than one chimeric junction thus a more granular approach is taken by `DCC` to fully characterise back-splice junctions in reads). + +`DCC` then performs a series of filtering steps on candidate circular reads: + +1. Mapping of mates must be consistent with a circular RNA template i.e. align to the back-splice junction. +2. Filtering by a minimum number of junction reads per replicate (nf-core/circrna has set this parameter to `-Nr 1 1` allowing all reads). +3. Circular reads are not allowed to span more than one gene. +4. Circular reads aligning to mitochondrial genome are removed. +5. Circular reads that lack a canonical (GT/AG) splicing signal at the circRNA junction borders are removed. + +#### Find circ + +
+Output files + +- `bsj_detection/tools/find_circ/intermediates/${sample_id}/` + - `*_anchors.qfa.gz`: 20mer anchors extracted from unmapped reads. + - `*_unmapped.bam`: Unmapped RNA-Seq reads to reference genome. + - `*.sites.bed`: Output from `find_circ`, first six columns are in standard BED format. A description of the remaining columns is available in the `find_circ` [documentation](https://github.com/marvin-jens/find_circ#output-format). + - `*.sites.log`: Summary statistics of candidate circular reads in the sample. + - `*.sites.reads`: Tab delimited file containing circRNA ID & sequence. + +
+ +[find circ](https://github.com/marvin-jens/find_circ) utilises `Bowtie2` short read mapper to align RNA-Seq reads to the genome. Reads that align fully and contiguously are discarded. Unmapped reads are converted to 20mers and aligned independently to find unique anchor positions within spliced exons - anchors that align in reverse orientation indicate circular RNA junctions. Anchor alignments are extended and must meet the following criteria: + +1. Breakpoints flanked by GT/AG splice sites. +2. Unambiguous breakpoint detection. +3. Maximum 2 mismatches in extension procedure. +4. Breakpoint cannot reside more than 2nt inside a 20mer anchor. +5. 2 reads must support the junction. + +#### MapSplice + +
+Output files + +- `bsj_detection/tools/mapsplice/intermediates/${sample_id}/` + - `alignments.bam`: Bam file containing aligned reads and fusion alignments. + - `deletions.txt`: Report of deletions. + - `Fusion output files`: + - `fusions_raw.txt`: raw fusion junctions without filtering + - `fusion_candidates.txt`: filtered fusion junctions + - `fusions_well_annotated.txt`: annotated fusion junction candidates (align to annotation file provided) + - `fusions_not_well_annotated.txt`: fusions that do not align with supplied annotations + - `circular_RNAs.txt`: circular RNAs reported. + - `insertions.txt`: Report of Insertions. + - `junctions.txt`: Reported splice junctions. + - `stats.txt`: Read alignment, Junction statistics. + +
+ +[MapSplice](http://www.netlab.uky.edu/p/bioinfo/MapSplice2) first splits reads into segments, and maps them to reference genome by using `Bowtie`. `MapSplice` attempts to fix unmapped segments as gapped alignments, with each gap corresponding to a splice junction. Finally a remapping step is used to identify back-spliced alignments that are in the presence of small exons. + +#### Segemehl + +
+Output files + +- `bsj_detection/tools/segemehl/intermediates/${sample_id}/` + - `*.bam`: Aligned reads in BAM format + - `*.mult.bed`: Thus, this bed file contains all splice events of a read. The start and end positions indicate the nucleotide after the first split (i.e. the beginning of the first intron) and the nucleotide before the last split (i.e. the end of the last intron), respectively. The name and score are equivalent to the one in the \*.sngl file described above. The following fields 7 & 8 (thickStart and thickEnd) should be the identical to fields 2 & 3. Field 9 holds the color information for the item in RGB encoding (itemRGB). Field 10 (blockCount) indicates the number of splits represented by the BED item. Field 11 is a comma separated list of the intron sizes (blockSizes). Field 12 is the comma separated list of intron starts (blockStarts). + - `*.sngl.bed`: The bed file contains all single splice events predicted in the split read alignments. + - `*.trns.bed`: The custom text file contains all single split alignments predicted to be in trans, i.e. split alignments that are located on different chromosomes and/or different strands. + +
+ +`Segemehl` implements split read alignment mode for reads that failed the attempt of collinear alignment. The algorithm will consider circular alignments. Circular splits are output to `${sample_id}.sngl.bed` and parsed using customised scripts to produce counts representative of `Segemehl` quantification. + +#### STAR + +
+Output files + +- `bsj_detection/tools/star` + - `1st_pass` + - `*.Aligned.out.bam`: Coordinate sorted bam file containing aligned reads and chimeric reads. + - `*.Chimeric.out.junction`: Each line contains the details of chimerically aligned reads. Full descriptions of columns can be found in `STAR` [documentation](https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf) (section 5.4). + - `*.Log.final.out`: Summary mapping statistics after mapping job is complete, useful for quality control. The statistics are calculated for each read (single- or paired-end) and then summed or averaged over all reads. + - `*.Log.out`: Main log file with a lot of detailed information about the run. This file is most useful for troubleshooting and debugging. + - `*.Log.progress.out`: Reports job progress statistics, such as the number of processed reads, % of mapped reads etc. + - `*.SJ.out.tab`: High confidence collapsed splice junctions in tab-delimited form. Full description of columns can be found in `STAR` [documentation](https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf) (section 4.4). + - `2nd_pass` + - `*.Aligned.out.bam`: Coordinate sorted bam file containing aligned reads and chimeric reads. + - `*.Chimeric.out.junction`: Each line contains the details of chimerically aligned reads. Full descriptions of columns can be found in `STAR` [documentation](https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf) (section 5.4). + - `*.Chimeric.out.sam`: Chimeric alignments in SAM format. + - `*.Log.final.out`: Summary mapping statistics after mapping job is complete, useful for quality control. The statistics are calculated for each read (single- or paired-end) and then summed or averaged over all reads. + - `*.Log.out`: Main log file with a lot of detailed information about the run. This file is most useful for troubleshooting and debugging. 
+ - `*.Log.progress.out`: Reports job progress statistics, such as the number of processed reads, % of mapped reads etc. + - `*.SJ.out.tab`: High confidence collapsed splice junctions in tab-delimited form. Full description of columns can be found in `STAR` [documentation](https://physiology.med.cornell.edu/faculty/skrabanek/lab/angsd/lecture_notes/STARmanual.pdf) (section 4.4). + - `sjdb` + - `dataset.SJ.out.tab`: Chromosome, start, end & strand coordinates of novel splice junctions for **all samples** aligned using STAR 1st pass. + +
+ +STAR in 2-pass mode is used to identify novel splice junctions in RNA-Seq data. The first pass of STAR is used to generate a genome index and align reads to the reference genome. The second pass of STAR uses the splice junctions identified in the first pass to align reads to the reference genome. This does not increase the number of detected novel junctions, but allows for more sensitive detection of splice reads mapping to novel junctions. + +### Per sample + +
+Output files + +- `bsj_detection/samples/${sample_id}/` + - `*.grouped.bed`: Grouped BSJ calls in BED format. Score column represents the number of tools that support the BSJ. + - `*.filtered.bed`: Based on `*.grouped.bed`, but filtered for BSJs with at least `min_tools` supporting tools. + - `*.intersect_gtf.bed`: Intersection of `*.filtered.bed` with the reference GTF file. Intermediate file for annotation. + - `*.intersect_database.bed`: Intersection of `*.filtered.bed` with the database BED file. Intermediate file for annotation. + - `*.annotated.bed`: Annotated BSJ calls in BED format, based on `*.filtered.bed`. + - `*.annotated.gtf`: Annotated BSJ calls in GTF format, based on `*.filtered.bed`. + - `*.fa`: Extracted sequences of the circRNAs in FASTA format, based on `*.filtered.bed`. + - `*.upset.png`: Sample-specific upset plot of BSJ calls across tools. + +
+ +nf-core/circrna produces a sample-specific set of BSJ calls. The BSJ calls are filtered for BSJs with at least `min_tools` supporting tools. The filtered BSJ calls are then annotated with the reference GTF file and the database BED file. An upset plot is generated to visualise the overlap of BSJ calls across tools. + +### Combined + +
+Output files + +- `bsj_detection/combined/` + - `*.combined.bed`: Unique BSJ calls across samples in BED format. + - `*.intersect_gtf.bed`: Intersection of `*.filtered.bed` with the reference GTF file. Intermediate file for annotation. + - `*.intersect_database.bed`: Intersection of `*.filtered.bed` with the database BED file. Intermediate file for annotation. + - `*.annotated.bed`: Annotated BSJ calls in BED format, based on `*.filtered.bed`. + - `*.annotated.gtf`: Annotated BSJ calls in GTF format, based on `*.filtered.bed`. + - `*.fa`: Extracted sequences of the circRNAs in FASTA format, based on `*.filtered.bed`. + - `*.upset.png`: Combined upset plot of BSJ calls across samples. + +
+ +nf-core/circrna combines the sample-specific BSJ calls into a single file. The filtered BSJ calls are then annotated with the reference GTF file and the database BED file. An upset plot is generated to visualise the overlap of BSJ calls across samples. + +## Quantification + +Since we now know the BSJ locations, we can quantify their expression by mapping the reads to the region between the BSJ start and end coordinates. As each read can potentially originate from both linear and circular transcripts, the pipeline performs a joint quantification of the linear and circular transcriptome. +The quantification is performed using psirc-quant, which is a wrapper around `kallisto`. It allows for inferential-uncertainty aware quantification of linear and circular transcripts. + +### Transcriptome + +
+Output files + +- `quantification/transcriptome/` + - `*.combined.gtf`: Combined linear and circular transcriptome in GTF format. + - `*.filtered.gtf`: Filtered linear and circular transcriptome in GTF format, based on `*.combined.gtf`. + - `*.fasta`: Combined linear and circular transcriptome in FASTA format, based on `*.filtered.gtf`. + - `*.marked.fasta`: Transcript sequences in FASTA format with the circRNA sequences marked with a `C` field in the header. + - `*.tx2gene.tsv`: Transcript to gene mapping file. + +
+ +### Per sample + +
+Output files
+
+- `quantification/samples/${sample_id}/`
+  - `psirc`
+    - `*.abundance.h5`: Abundance estimates in HDF5 format.
+    - `*.abundance.tsv`: Abundance estimates in TSV format.
+    - `*.run_info.json`: Run information in JSON format.
+    - `pseudoalignments.bam`: Pseudoalignments in BAM format.
+    - `pseudoalignments.bai`: Index file for pseudoalignments.
+  - `tximeta/`
+    - `*.rds`: RDS file containing the sample-specific transcript quantification data.
+  - `tximport/`
+    - `*.gene_counts_length_scaled.tsv`: Gene counts scaled by transcript length.
+    - `*.gene_counts_scaled.tsv`: Gene counts scaled by library size.
+    - `*.gene_counts.tsv`: Gene counts.
+    - `*.gene_lengths.tsv`: Gene lengths.
+    - `*.gene_tpm.tsv`: Gene TPM values.
+    - `*.transcript_counts.tsv`: Transcript counts.
+    - `*.transcript_lengths.tsv`: Transcript lengths.
+    - `*.transcript_tpm.tsv`: Transcript TPM values.
+
+
+ +nf-core/circrna performs quantification of linear and circular transcripts using `psirc-quant`. The quantification results are stored in HDF5 and TSV format. The pipeline also generates a `tximeta` RDS file containing the sample-specific transcript quantification data. The `tximport` directory contains gene and transcript counts, lengths and TPM values. + +### Combined + +
+Output files + +- `quantification/combined/` + - `gene_counts.csv`: Count matrix of genes across samples. + - `gene_tpm.csv`: TPM matrix of genes across samples. + - `tx_counts.csv`: Count matrix of transcripts across samples. + - `tx_tpm.csv`: TPM matrix of transcripts across samples. + - `linear.tsv`: Count matrix of linear transcripts across samples. + - `circular.tsv`: Count matrix of circular transcripts across samples. + - `experiments.merged.rds`: RDS file containing a SummarizedExperiment with the merged transcript quantification data. + +
+ +nf-core/circrna combines the sample-specific quantification results into proper count matrices. It also generates an RDS file containing a SummarizedExperiment with the merged transcript quantification data. + +## miRNA Prediction + +### Binding Sites + +#### Tools + +This section contains predicted binding sites for miRNA-target interactions generated by various computational tools. +Each tool utilizes unique algorithms and criteria to identify potential miRNA binding sites on target genomic sequences, providing complementary insights into miRNA regulatory networks. + +##### miRanda + +
+Output files + +- `mirna_prediction/bindingsites/tools/miranda/output` + - `*.miranda.txt`: Raw predictions from `miRanda`. +- `mirna_prediction/bindingsites/tools/miranda/unified` + - `*.miranda.tsv`: Unified predictions from `miRanda`. + +
+ +[miRanda](http://cbio.mskcc.org/miRNA2003/miranda.html) performs miRNA target prediction of a genomic sequence against a miRNA database in 2 phases: + +1. First a dynamic programming local alignment is carried out between the query miRNA sequence and the reference sequence. This alignment procedure scores based on sequence complementarity and not on sequence identity. +2. Secondly, the algorithm takes high-scoring alignments detected from phase 1 and estimates the thermodynamic stability of RNA duplexes based on these alignments. This second phase of the method utilises folding routines from the `RNAlib` library, part of the [ViennaRNA](https://www.tbi.univie.ac.at/RNA/) package. + +##### TargetScan + +
+Output files + +- `mirna_prediction/bindingsites/tools/targetscan/output` + - `*.targetscan.txt`: Raw predictions from `TargetScan`. +- `mirna_prediction/bindingsites/tools/targetscan/unified` + - `*.targetscan.tsv`: Unified predictions from `TargetScan`. + +
+ +[TargetScan](http://www.targetscan.org/vert_72/) predicts biological targets of miRNAs by searching for the presence of conserved 8mer, 7mer, and 6mer sites within the circRNA mature sequence that match the seed region of each miRNA. + +#### Targets + +
+Output files + +- `mirna_prediction/binding_sites/targets` + - `*_miRNA_targets.txt`: Filtered target miRNAs of circRNAs called by quantification tools. Columns are self explanatory: miRNA, Score, Energy_KcalMol, Start, End, Site_type. + +
+
+nf-core/circrna performs miRNA target filtering on `miRanda` and `TargetScan` predictions:
+
+1. miRNA must be called by both `miRanda` and `TargetScan`.
+2. If a site within the circRNA mature sequence shares duplicate miRNA IDs overlapping the same coordinates, the miRNA with the highest score is kept.
+
+#### Majority Vote
+
+
+Output files + +- `mirna_prediction/binding_sites/majority_vote` + - `mirna.targets.tsv`: Stores miRNA-target mappings with all targets listed per miRNA, making it compact and suitable for bulk analyses. + - `mirna.majority.tsv`: Lists each miRNA-target interaction on a separate line, which is helpful for detailed analysis of each interaction independently. + +
+ +nf-core/circrna performs a majority vote on the predicted miRNA targets from [TargetScan](http://www.targetscan.org/vert_72/) and [miRanda](http://cbio.mskcc.org/miRNA2003/miranda.html) based on a +threshold specified by the user. + +### miRNA Expression + +
+Output files + +- `mirna_prediction/mirna_expression/` + - `mirna.normalized_counts.tsv`: Contains normalized miRNA expression of all samples. + - `mirna.normalized_counts_filtered.tsv`: Contains miRNA expression after filtering. + +
+ +nf-core/circrna processes miRNA expression data by normalizing and filtering it for further analysis. + +### Correlation + +
+Output files + +- `mirna_prediction/correlation` + - `*.tsv`: Files named after the specific miRNA containing correlation results for that miRNA with its target transcripts. + +
+ +nf-core/circrna computes correlations between miRNA and transcript expression levels and writes the results to individual TSV files for each miRNA-target interaction specified in the input binding sites file. diff --git a/docs/usage.md b/docs/usage.md index de4b1c3b9..9edad67ea 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,16 +1,16 @@ # nf-core/circrna: Usage -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/circrna/usage](https://nf-co.re/circrna/usage) +## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/rnaseq/usage](https://nf-co.re/rnaseq/usage) > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ -## Introduction +## Pipeline parameters - +Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration except for parameters; see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' @@ -18,48 +18,133 @@ You will need to create a samplesheet with information about the samples you wou ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. 
to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes. If you set the strandedness value to `auto` the pipeline will use the tool-defaults throughout the pipeline. ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +sample,fastq_1,fastq_2,strandedness +CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,auto +CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz,auto +CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz,auto ``` ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 4 columns to match those defined in the table below. A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. 
```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +sample,fastq_1,fastq_2,strandedness +CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz,forward +CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz,forward +CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz,forward +TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,,reverse +TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,,reverse +TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,,reverse +TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,,reverse ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". 
| +| Column | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `strandedness` | Sample strand-specificity. Must be one of `unstranded`, `forward`, `reverse` or `auto`. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +## BSJ detection + +This part of the pipeline is responsible for the detection of back-splice junctions (BSJs) in the input data. The following tools are currently supported: + +- `CIRCexplorer2` +- `circRNA finder` +- `CIRIquant` +- `DCC` +- `find circ` +- `MapSplice` +- `Segemehl` + +The tools to be used can be specified using the `tools` parameter. +Each of the tools also quantifies how many reads support each BSJ. You can specify a cutoff for the minimum number of reads supporting a BSJ using the `bsj_reads` parameter. +Additionally, the parameter `min_tools` can be used to specify how many tools a BSJ has to be detected by to be considered as a valid hit. + +For instructions on how to interpret the output of this section, please check out the [output documentation](https://nf-co.re/circrna/output#bsj-detection). + +## Annotation + +The annotation is generally based on the reference GTF file. It can also utilize BED files that are provided by the various circRNA databases. 
+The GTF-based annotation allows setting the parameter `exon_boundary` to specify a window around exons. If the BSJ is within this window, it will be annotated as a circRNA - otherwise, it will be annotated as an exon-intron circRNA (EI-circRNA). The default value is `0`.
+
+For the database-based annotation, an additional sample sheet is required:
+
+```csv title="annotation.csv"
+name,file,min_overlap
+db1,db1.bed,0.9
+db2,db2.bed,0.8
+```
+
+| Column        | Description                                                                                   |
+| ------------- | --------------------------------------------------------------------------------------------- |
+| `name`        | Name of the database. This will be used as a prefix for the region names in the output files.  |
+| `file`        | Path to the BED file. The file has to be a valid BED6 file.                                    |
+| `min_overlap` | Minimum bidirectional overlap required between the BSJ and the region in the BED file.         |
+
+The output of the annotation step will be bundled with the outputs of the BSJ detection step.
+
+## miRNA prediction
+
+This section allows looking for miRNA binding sites in the circRNAs.
+The following tools are currently supported:
+
+- `miRanda`
+- `TargetScan`
+
+This section will only be executed if the `mature` parameter is provided.
+The parameter `mature` should point to a FASTA file containing mature miRNA sequences.
+By providing a TSV file containing the miRNA expression of all samples via `mirna_expression`, this
+sub-workflow will perform additional normalization and filtering of `mirna_expression` and `mature` before
+executing the miRNA binding site prediction.
+
+To view the outputs of the module, please see the output [documentation](https://nf-co.re/circrna/dev/output#mirna-prediction).
+
+## Statistical tests
+
+Currently, only [CircTest](https://github.com/dieterich-lab/CircTest) is supported for the statistical analysis of the circRNA expression data. The `phenotype` parameter is required for this step.
+ +A valid example of a `phenotype.csv` file (matching the "Full samplesheet") is shown here: + +```csv title="phenotype.csv" +sample,condition +CONTROL_REP1,control +CONTROL_REP2,control +CONTROL_REP3,control +TREATMENT_REP1,treatment +TREATMENT_REP2,treatment +TREATMENT_REP3,treatment +``` + +Note that `TREATMENT_REP3` only has one entry in the `phenotype.csv` file, even though it has two entries in the `samplesheet.csv` file. +If the `phenotype` parameter is provided, the phenotype information will also be added to the `SummarizedExperiment` object, that results from the "Quantification" step. + ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/circrna --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run \ + nf-core/circrna \ + --input \ + --outdir \ + --gtf \ + --fasta \ + --igenomes_ignore \ + --genome null \ + -profile docker ``` +> **NB:** Loading iGenomes configuration remains the default for reasons of consistency with other workflows, but should be disabled when not using iGenomes, applying the recommended usage above. + This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. Note that the pipeline will create the following files in your working directory: @@ -104,9 +189,11 @@ When you run the above command, Nextflow automatically pulls the pipeline code f nextflow pull nf-core/circrna ``` +When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. + ### Reproducibility -It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. 
If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. +It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. First, go to the [nf-core/circrna releases page](https://github.com/nf-core/circrna/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. @@ -146,14 +233,19 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - Includes links to test data so needs no other parameters - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/) + - Pulls software from Docker Hub: [`nfcore/circrna`](https://hub.docker.com/r/nfcore/circrna/) - `singularity` - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) + - Pulls software from Docker Hub: [`nfcore/circrna`](https://hub.docker.com/r/nfcore/circrna/) - `podman` - A generic configuration profile to be used with [Podman](https://podman.io/) + - Pulls software from Docker Hub: [`nfcore/circrna`](https://hub.docker.com/r/nfcore/circrna/) - `shifter` - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) + - Pulls software from Docker Hub: [`nfcore/circrna`](https://hub.docker.com/r/nfcore/circrna/) - `charliecloud` - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + - Pulls software from Docker Hub: [`nfcore/circrna`](https://hub.docker.com/r/nfcore/circrna/) - `apptainer` - A 
generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `wave` @@ -163,7 +255,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof ### `-resume` -Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. For input to be considered the same, not only the names must be identical but the files' contents as well. For more info about this parameter, see [this blog post](https://www.nextflow.io/blog/2019/demystifying-nextflow-resume.html). +Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. @@ -189,6 +281,53 @@ To use a different container from the default container or conda environment spe A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. +Command error: +.command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. +Work dir: +/home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb + +Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` + +```` + +To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. 
+
+#### For beginners
+
+As a first step to bypass this error, you could try to increase the amount of CPUs, memory, and time for the whole pipeline. Therefore, you can try to increase the resources for the parameters `--max_cpus`, `--max_memory`, and `--max_time`. Based on the error above, you have to increase the amount of memory. Therefore, you can go to the [parameter documentation of rnaseq](https://nf-co.re/rnaseq/3.9/parameters) and scroll down to the `show hidden parameter` button to get the default value for `--max_memory`. In this case 128GB, so you can then try to run your pipeline again with `--max_memory 200GB -resume` to skip all processes that were already calculated. If you cannot increase the resources of the complete pipeline, you can try to adapt the resources for a single process as mentioned below.
+
+#### Advanced option on process level
+
+To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq GitHub repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN).
+We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so, based on the search results, the file we want is `modules/nf-core/star/align/main.nf`.
+If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9).
+The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure a subset of processes having similar computing requirements.
+The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. +Providing you haven't set any other standard nf-core parameters to **cap** the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. +The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. + +```nextflow +process { + withName: 'NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN' { + memory = 100.GB + } +} +```` + +> **NB:** We specify the full process name i.e. `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. +> +> If you get a warning suggesting that the process selector isn't recognised check that the process name has been specified correctly. + +### Custom Containers + +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. + +To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. + +### Custom Tool Arguments + +A pipeline might not always support every possible argument or option of a particular tool used in pipeline. 
Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. + To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. ### nf-core/configs diff --git a/main.nf b/main.nf index 7b13c9792..b86c49f45 100644 --- a/main.nf +++ b/main.nf @@ -26,10 +26,13 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_circ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` -params.fasta = getGenomeAttribute('fasta') +params.fasta = getGenomeAttribute('fasta') +params.gtf = getGenomeAttribute('gtf') +params.bwa = getGenomeAttribute('bwa') +params.star = getGenomeAttribute('star') +params.bowtie = getGenomeAttribute('bowtie') +params.bowtie2 = getGenomeAttribute('bowtie2') +params.mature = getGenomeAttribute('mature') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -41,17 +44,32 @@ params.fasta = getGenomeAttribute('fasta') // WORKFLOW: Run main analysis pipeline depending on type of input // workflow NFCORE_CIRCRNA { - take: - samplesheet // channel: samplesheet read in from --input + ch_samplesheet main: + ch_versions = Channel.empty() + // - // WORKFLOW: Run pipeline + // WORKFLOW: Run nf-core/circrna workflow // + ch_fasta = Channel.value([[id: "fasta"], file(params.fasta, checkIfExists:true)]) + ch_gtf = Channel.value([[id: "gtf"], file(params.gtf, checkIfExists:true)]) + ch_mature = params.mature ? Channel.value([[id: "mature"], file(params.mature, checkIfExists:true)]) : Channel.empty() + ch_phenotype = params.phenotype ? 
Channel.value([[id: "phenotype"], file(params.phenotype, checkIfExists:true)]) : Channel.empty() + ch_annotation = params.annotation ? Channel.fromSamplesheet("annotation") : Channel.empty() + ch_mirna = params.mature && params.mirna_expression ? Channel.value([[id: "mirna"], file(params.mirna_expression, checkIfExists:true)]) : Channel.empty() + CIRCRNA ( - samplesheet + ch_samplesheet, + ch_phenotype, + ch_fasta, + ch_gtf, + ch_mature, + ch_annotation, + ch_versions, + ch_mirna ) emit: multiqc_report = CIRCRNA.out.multiqc_report // channel: /path/to/multiqc_report.html diff --git a/modules.json b/modules.json index d04c08092..ef6b1b012 100644 --- a/modules.json +++ b/modules.json @@ -5,20 +5,231 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bedtools/getfasta": { + "branch": "master", + "git_sha": "cdcdd5e3d806f0ff3983c40c69e0b07bb44ec299", + "installed_by": ["modules"] + }, + "bedtools/groupby": { + "branch": "master", + "git_sha": "3b248b84694d1939ac4bb33df84bf6233a34d668", + "installed_by": ["modules"] + }, + "bedtools/intersect": { + "branch": "master", + "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", + "installed_by": ["modules"] + }, + "bedtools/sort": { + "branch": "master", + "git_sha": "571a5feac4c9ce0a8df0bc15b94230e7f3e8db47", + "installed_by": ["modules"] + }, + "bioawk": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"], + "patch": "modules/nf-core/bioawk/bioawk.diff" + }, + "bowtie/align": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": ["modules"] + }, + "bowtie/build": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bowtie2/align": { + "branch": "master", + "git_sha": "e4bad511789f16d0df39ee306b2cd50418365048", + "installed_by": ["modules"] + }, + "bowtie2/build": { + "branch": "master", + "git_sha": 
"1fea64f5132a813ec97c1c6d3a74e0aee7142b6d", + "installed_by": ["modules"] + }, + "bwa/index": { + "branch": "master", + "git_sha": "e0ff65e1fb313677de09f5f477ae3da30ce19b7b", + "installed_by": ["modules"] + }, + "cat/cat": { + "branch": "master", + "git_sha": "9437e6053dccf4aafa022bfd6e7e9de67e625af8", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "4fc983ad0b30e6e32696fa7d980c76c7bfe1c03e", + "installed_by": ["modules"] + }, + "circexplorer2/annotate": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "circexplorer2/parse": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "csvtk/join": { + "branch": "master", + "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20", + "installed_by": ["modules"] + }, + "csvtk/split": { + "branch": "master", + "git_sha": "614abbf126f287a3068dc86997b2e1b6a93abe20", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "de45447d060b8c8b98575bc637a4a575fd0638e1", + "installed_by": ["modules"] + }, + "custom/gtffilter": { + "branch": "master", + "git_sha": "a0aee18374b7f072aa0f89f4d66f5a3a9f8176d2", + "installed_by": ["modules"] + }, + "custom/tx2gene": { + "branch": "master", + "git_sha": "ec155021a9104441bf6a9bae3b55d1b5b0bfdb3a", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "gawk": { + "branch": "master", + "git_sha": "cf3ed075695639b0a0924eb0901146df1996dc08", + "installed_by": ["modules"] + }, + "gffread": { + "branch": "master", + "git_sha": "6c996d7fbe0816dcbb68ce587ad5f873313682a1", + "installed_by": ["modules"] + }, + "gnu/sort": { + "branch": "master", + "git_sha": "a3cc42943548378b726610f45bb5a79ab3f0b633", + "installed_by": ["modules"] + }, + "hisat2/align": { + "branch": "master", + 
"git_sha": "400037f54de4b0c42712ec5a499d9fd9e66250d1", + "installed_by": ["modules"] + }, + "hisat2/build": { + "branch": "master", + "git_sha": "400037f54de4b0c42712ec5a499d9fd9e66250d1", + "installed_by": ["modules"] + }, + "hisat2/extractsplicesites": { + "branch": "master", + "git_sha": "400037f54de4b0c42712ec5a499d9fd9e66250d1", + "installed_by": ["modules"] + }, + "miranda": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "04fbbc7c43cebc0b95d5b126f6d9fe4effa33519", + "installed_by": ["modules"] + }, + "samtools/flagstat": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/idxstats": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/index": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": ["bam_sort_stats_samtools", "modules"] + }, + "samtools/sort": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": ["bam_sort_stats_samtools", "modules"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/view": { + "branch": "master", + "git_sha": "6c2309aaec566c0d44a6cf14d4b2d0c51afe2e91", + "installed_by": ["modules"] + }, + "segemehl/align": { + "branch": "master", + "git_sha": "9a6b0745dbb5359286d36dee2183ffab240abba0", + "installed_by": ["modules"] + }, + "segemehl/index": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "star/align": { + "branch": "master", + "git_sha": 
"a21faa6a3481af92a343a10926f59c189a2c16c9", + "installed_by": ["modules"] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "a21faa6a3481af92a343a10926f59c189a2c16c9", + "installed_by": ["modules"] + }, + "stringtie/stringtie": { + "branch": "master", + "git_sha": "b1b959609bda44341120aed1766329909f54b8d0", + "installed_by": ["modules"] + }, + "trimgalore": { + "branch": "master", + "git_sha": "a98418419ae6c9df3cf6cf108d1e1aba71037d5a", + "installed_by": ["modules"] + }, + "tximeta/tximport": { + "branch": "master", + "git_sha": "5d095e8413da1f4c72b7d07ce87f75c09482486f", + "installed_by": ["modules"] } } }, "subworkflows": { "nf-core": { + "bam_sort_stats_samtools": { + "branch": "master", + "git_sha": "46eca555142d6e597729fcb682adcc791796f514", + "installed_by": ["subworkflows"] + }, + "bam_stats_samtools": { + "branch": "master", + "git_sha": "0eacd714effe5aac1c1de26593873960b3346cab", + "installed_by": ["bam_sort_stats_samtools", "subworkflows"] + }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082", diff --git a/modules/local/annotation/environment.yml b/modules/local/annotation/environment.yml new file mode 100644 index 000000000..8c3113528 --- /dev/null +++ b/modules/local/annotation/environment.yml @@ -0,0 +1,5 @@ +name: annotation +channels: + - conda-forge +dependencies: + - pandas=1.5.2 diff --git a/modules/local/annotation/main.nf b/modules/local/annotation/main.nf new file mode 100644 index 000000000..01b51eda8 --- /dev/null +++ b/modules/local/annotation/main.nf @@ -0,0 +1,23 @@ +process ANNOTATION { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + input: + tuple val(meta), path(gtf_intersection), path(db_intersections) + val(exon_boundary) + + output: + tuple val(meta), path("${prefix}.bed"), emit: bed + tuple val(meta), path("${prefix}.gtf"), emit: gtf + + path "versions.yml" , emit: versions + + script: + prefix = task.ext.prefix ?: meta.id + template 'annotation.py' +} diff --git a/modules/local/annotation/templates/annotation.py b/modules/local/annotation/templates/annotation.py new file mode 100755 index 000000000..b0ff285be --- /dev/null +++ b/modules/local/annotation/templates/annotation.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python + +import pandas as pd +import numpy as np +import platform +import csv + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. 
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + + +columns = { + 0: 'chr', + 1: 'start', + 2: 'end', + 3: 'name', + 4: 'score', + 5: 'strand', + 9: 'tx_start', + 10: 'tx_end', + 14: 'attributes' +} + +attributes = ['gene_id', 'gene_name', 'transcript_id'] + +exon_boundary = int("${exon_boundary}") + +try: + df = pd.read_csv("${gtf_intersection}", sep="\\t", header=None, usecols=columns.keys()) +except pd.errors.EmptyDataError: + raise ValueError("Intersection between circRNAs and GTF file is empty.") +df = df.rename(columns=columns) + +# Extract circRNAs without match +mask = df['tx_start'] == -1 +df_intergenic = df[mask] +df = df[~mask] +df_intergenic['type'] = 'intergenic-circRNA' +df_intergenic['gene_id'] = 'intergenic_' + df_intergenic['name'] +df_intergenic['gene_name'] = 'intergenic_' + df_intergenic['name'] +df_intergenic['transcript_id'] = 'intergenic_' + df_intergenic['name'] + +# Convert attributes to a dictionary +df['attributes'] = df['attributes'].apply(lambda row: dict([[value.strip(r'"') for value in entry.strip().split(' ', 1)] for entry in row.split(';') if entry])) +# Make sure all attributes are present +df_incomplete = df['attributes'].apply(lambda row: ", ".join([key for key in attributes if key not in row])) +df_incomplete = df_incomplete[df_incomplete != ""] +if len(df_incomplete) > 0: + counts = df_incomplete.value_counts() + counts.name = 'count' + counts.index.name = 'missing' + raise ValueError(f"The following attributes are missing in the intersection file:\\n\\n{counts.to_frame()}") +# Keep only the attributes we want +df['attributes'] = df['attributes'].apply(lambda row: {key: row[key] for key in attributes if key in row}) +# Convert attributes to columns +df = pd.concat([df.drop(['attributes'], axis=1), 
df['attributes'].apply(pd.Series)], axis=1) + +df['any_outside'] = (df['start'] < df['tx_start'] - exon_boundary) | (df['end'] > df['tx_end'] + exon_boundary) +# Perfect is inverse of any_outside +df['perfect'] = ~df['any_outside'] +# Drop any_outside +df = df.drop(['any_outside', 'tx_start', 'tx_end'], axis=1) + +df = df.groupby(['chr', 'start', 'end', 'strand']).aggregate({ + 'name': lambda x: x.iloc[0], + 'score': lambda x: x.iloc[0], + 'gene_id': lambda x: list(x), + 'gene_name': lambda x: list(x), + 'transcript_id': lambda x: list(x), + 'perfect': lambda x: list(x) +}) + +def filter_perfect(row, col): + if any(row['perfect']): + matching_values = [value for value, perfectness in zip(row[col], row['perfect']) if perfectness] + else: + matching_values = row[col] + valid_values = set([value for value in matching_values if type(value) == str]) + return ",".join(valid_values) if valid_values else "NaN" + +def determine_type(row): + if row["no_transcript"]: + return "ciRNA" + if any(row['perfect']): + return "circRNA" + else: + return 'EI-circRNA' + +df['no_transcript'] = df['transcript_id'].apply(lambda x: all([type(value) != str and np.isnan(value) for value in x])) +df['type'] = df.apply(lambda row: determine_type(row), axis=1) +df['gene_id'] = df.apply(lambda row: filter_perfect(row, 'gene_id'), axis=1) +df['gene_name'] = df.apply(lambda row: filter_perfect(row, 'gene_name'), axis=1) +df['transcript_id'] = df.apply(lambda row: filter_perfect(row, 'transcript_id'), axis=1) +# Drop perfect +df = df.drop(['perfect'], axis=1) + +df = df.reset_index() +df_intergenic = df_intergenic.reset_index() +bed_order = ['chr', 'start', 'end', 'name', 'score', 'strand', 'type', 'gene_id', 'gene_name', 'transcript_id'] +df = df[bed_order] +df_intergenic = df_intergenic[bed_order] + +df = pd.concat([df, df_intergenic], axis=0) + +db_intersections = "${db_intersections}".split() +has_db = len(db_intersections) > 0 + +if has_db: + db_colnames = ['chr', 'start', 'end', 'name', 
'score', 'strand', 'db_chr', 'db_start', 'db_end', 'db_name', 'db_score', 'db_strand'] + db_usecols = ['chr', 'start', 'end', 'name', 'score', 'strand', 'db_name'] + df_databases = pd.concat([pd.read_csv(db_path, sep="\\t", names=db_colnames, usecols=db_usecols) for db_path in db_intersections]) + + # Group by chr, start, end, name, score, strand, and aggregate the db_name to list + df_databases = df_databases.groupby(['chr', 'start', 'end', 'name', 'score', 'strand']).aggregate({ + 'db_name': lambda x: ",".join([val for val in x if val != '.']) + }) + + df_databases['db_name'] = df_databases['db_name'].apply(lambda x: x if x else '.') + + df = df.merge(df_databases, how='left', on=['chr', 'start', 'end', 'name', 'score', 'strand']) +else: + df['db_name'] = "." + +# Sort by chr, start, end +df = df.sort_values(['chr', 'start', 'end']) + +df.to_csv("${prefix}.bed", sep='\\t', index=False, header=False) + +# Convert to GTF +df['source'] = 'circRNA' +df['frame'] = '.' +df['attributes'] = 'gene_id "' + df['gene_id'] + '"; gene_name "' + df['gene_name'] + '"; transcript_id "circ_' + df['name'] + '"; db_ids "' + df['db_name'] + '";' + +gtf_order = ['chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'frame', 'attributes'] +df = df[gtf_order] + +df.to_csv("${prefix}.gtf", sep='\\t', index=False, header=False, quoting=csv.QUOTE_NONE) + +# Versions + +versions = { + "${task.process}": { + "python": platform.python_version(), + "pandas": pd.__version__, + "numpy": np.__version__ + } +} + +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/circexplorer2/reference/main.nf b/modules/local/circexplorer2/reference/main.nf new file mode 100644 index 000000000..f16fa9c0b --- /dev/null +++ b/modules/local/circexplorer2/reference/main.nf @@ -0,0 +1,38 @@ +process CIRCEXPLORER2_REFERENCE { + tag "$gtf" + label 'process_single' + + conda "bioconda::ucsc-gtftogenepred=377 bioconda::ucsc-genepredtobed=377 
bioconda::bedtools=2.27.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-d7ee3552d06d8acebbc660507b48487c7369e221:07daadbfe8182aa3c974c7b78924d5c8730b922d-0' : + 'biocontainers/mulled-v2-d7ee3552d06d8acebbc660507b48487c7369e221:07daadbfe8182aa3c974c7b78924d5c8730b922d-0' }" + + input: + path gtf + + output: + path("${prefix}.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = gtf.simpleName + def VERSION = '377' + """ + gtfToGenePred \ + $args \ + $gtf \ + ${prefix}.genepred + + awk -v OFS="\\t" '{print \$12, \$1, \$2, \$3, \$4, \$5, \$6, \$7, \$8, \$9, \$10}' ${prefix}.genepred > ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + awk: \$(awk --version | head -n1 | cut -d' ' -f3 | sed 's/,//g' ) + ucsc: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/circrna_finder/main.nf b/modules/local/circrna_finder/main.nf new file mode 100644 index 000000000..e9a407205 --- /dev/null +++ b/modules/local/circrna_finder/main.nf @@ -0,0 +1,34 @@ +process CIRCRNA_FINDER { + tag "$meta.id" + label 'process_low' + + conda "bioconda::circrna_finder=1.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/circrna_finder%3A1.2--pl5321hdfd78af_1' : + 'biocontainers/circrna_finder:1.2--pl5321hdfd78af_1' }" + + input: + tuple val(meta), path(star_input, stageAs: 'input/') + + output: + tuple val(meta), path("${prefix}.filteredJunctions.bed"), emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = 'v1.2' + """ + postProcessStarAlignment.pl --starDir input/ --outDir ./ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + awk: \$(awk --version | head -n1 | cut -d' ' -f3 | sed 's/,//g' ) + cat: \$(cat --version | head -n 1 | sed -e 's/cat (GNU coreutils) //') + circRNA_finder: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/circtest/circtest/main.nf b/modules/local/circtest/circtest/main.nf new file mode 100644 index 000000000..be3494af9 --- /dev/null +++ b/modules/local/circtest/circtest/main.nf @@ -0,0 +1,24 @@ +process CIRCTEST_CIRCTEST { + label 'process_medium' + + conda "conda-forge::r-base=4.2.2 conda-forge::r-aod=1.3.2 conda-forge::r-ggplot2=3.4.0 conda-forge::r-plyr=1.8.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-c79b00aa4647c739dbe7e8480789d3ba67988f2e:0' : + 'biocontainers/mulled-v2-c79b00aa4647c739dbe7e8480789d3ba67988f2e:0' }" + + input: + tuple val(meta) , path(gene_counts), path(circ_counts) + tuple val(meta2), path(phenotype) + + output: + tuple val(meta), path("${prefix}_summary.txt"), emit: summary + tuple val(meta), path("*.pdf") , emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: meta.id + template 'circtest.R' +} diff --git a/modules/local/circtest/circtest/templates/circtest.R b/modules/local/circtest/circtest/templates/circtest.R new file mode 100755 index 000000000..debdf0395 --- /dev/null +++ b/modules/local/circtest/circtest/templates/circtest.R @@ -0,0 +1,475 @@ +#!/usr/bin/env Rscript + +require(aod) +require(plyr) +require(ggplot2) + +## CircTest functions +## Package: CircTest (https://github.com/dieterich-lab/CircTest) +## Version: 0.1.1 +## Author(s): Jun Cheng, Tobias Jakobi +## License: GPL + +## SUMMMARY + +#'@title Summarizes data +## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%). +## data: a data frame. +## measurevar: the name of a column that contains the variable to be summariezed +## groupvars: a vector containing names of columns that contain grouping variables +## na.rm: a boolean that indicates whether to ignore NA's +## conf.interval: the percent range of the confidence interval (default is 95%) +#' +summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, + conf.interval=.95, .drop=TRUE) { + + require(plyr) + + # New version of length which can handle NA's: if na.rm==T, don't count them + length2 <- function (x, na.rm=FALSE) { + if (na.rm) sum(!is.na(x)) + else length(x) + } + + # This does the summary. 
For each group's data frame, return a vector with + # N, mean, and sd + datac <- ddply(data, groupvars, .drop=.drop, + .fun = function(xx, col) { + c(N = length2(xx[[col]], na.rm=na.rm), + mean = mean (xx[[col]], na.rm=na.rm), + sd = sd (xx[[col]], na.rm=na.rm) + ) + }, + measurevar + ) + + # Rename the "mean" column + datac <- rename(datac, c("mean" = measurevar)) + + datac\$se <- datac\$sd / sqrt(datac\$N) # Calculate standard error of the mean + + # Confidence interval multiplier for standard error + # Calculate t-statistic for confidence interval: + # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1 + ciMult <- qt(conf.interval/2 + .5, datac\$N-1) + datac\$ci <- datac\$se * ciMult + + return(datac) +} + +## RATIO PLOT + +Circ.ratioplot <- function(Circ,Linear,CircCoordinates = None,plotrow='1',size=24,ncol=2,groupindicator1=NULL,groupindicator2=NULL,x='Conditions',y='circRNA/(circRNA+Linear)',lab_legend='groupindicator1', circle_description = c(1:3), gene_column = None, y_axis_range = 1, colour_mode = "colour"){ + + if( !is.null(groupindicator1) & length(groupindicator1) != ncol(Circ)-length(circle_description) ){ + stop("If provided, the length of groupindicator1 should be equal to the number of samples.") + } + if( !is.null(groupindicator2) & length(groupindicator2) != ncol(Circ)-length(circle_description) ){ + stop("If provided, the length of groupindicator2 should be equal to the number of samples.") + } + if(is.null(groupindicator1)){ + stop("At least one grouping should be provided through groupindicator1.") + } + if(!is.null(groupindicator2)){ + twolevel <- TRUE + }else{ + twolevel <- FALSE + } + + rownames.circ <- rownames(Circ) + Circ <- data.frame(lapply(Circ, as.character), stringsAsFactors=FALSE) + rownames(Circ) <- rownames.circ + + rownames.linear <- rownames(Linear) + Linear <- data.frame(lapply(Linear, as.character), stringsAsFactors=FALSE) + rownames(Linear) <- rownames.linear + + if(!missing(CircCoordinates)){ + 
rownames.circCoordinates <- rownames(CircCoordinates) + CircCoordinates <- data.frame(lapply(CircCoordinates, as.character), stringsAsFactors=FALSE) + rownames(CircCoordinates) <- rownames.circCoordinates + }else{ + CircCoordinates <- data.frame(Circ[,circle_description]) + rownames(CircCoordinates) <- rownames.circ + rownames.circCoordinates <- rownames(CircCoordinates) + CircCoordinates <- data.frame(lapply(CircCoordinates, as.character), stringsAsFactors=FALSE) + rownames(CircCoordinates) <- rownames.circCoordinates + } + + groupindicator1 <- factor(groupindicator1,levels=unique(groupindicator1)) + groupindicator2 <- factor(groupindicator2,levels=unique(groupindicator2)) + + # Get gene name, if no annotation, output NULL + if (is.character(plotrow)){ + if ( ! plotrow %in% rownames(CircCoordinates) ){ + stop("Specified 'plotrow' not found.") + } + }else{ + if ( is.numeric(plotrow) ){ + if ( ! plotrow %in% 1:nrow(CircCoordinates) ){ + stop("Specified 'plotrow' not found.") + } + }else{ + stop("Specified plotrow should be ONE rowname or ONE rownumber.") + } + } + # Choose your own column containing the gene name using gene_column. 
The genename will be displayed in the plot title if available + if (missing(gene_column)){ + genename = NULL + }else{ + genename <- as.character(CircCoordinates[plotrow,gene_column]) + if (genename == '.'){ + genename = NULL + } + } + if(twolevel){ + plotdat <- summarySE( data.frame(Ratio=as.numeric(Circ[plotrow,-circle_description])/(as.numeric(Linear[plotrow,-circle_description])+as.numeric(Circ[plotrow,-circle_description])), + groupindicator1, + groupindicator2), + measurevar='Ratio',groupvars=c('groupindicator1','groupindicator2') ) + }else{ + plotdat <- summarySE( data.frame(Ratio=as.numeric(Circ[plotrow,-circle_description])/(as.numeric(Linear[plotrow,-circle_description])+as.numeric(Circ[plotrow,-circle_description])), + groupindicator1), + measurevar='Ratio',groupvars=c('groupindicator1') ) + } +# construct plot + Q <- ggplot(plotdat, aes(x=groupindicator1, y=Ratio)) + + geom_boxplot() + theme_classic() + + theme(axis.text.x = element_blank())+ + theme(axis.text.y = element_text(size=size+4))+ + theme(axis.ticks = element_line(colour = 'black', size = 1)) + + theme(axis.ticks.x = element_blank())+ + theme(legend.title=element_blank()) + + theme(text=element_text(size=size+4))+ + theme(legend.text=element_text(size=size)) + + theme(plot.title = element_text(size=size)) + + theme(axis.text.y = element_text(margin=margin(5,5,10,5,"pt")))+ + ggtitle(paste("Annotation: ", genename, "\\nChr ", toString(Circ[plotrow,circle_description]),sep="")) + + ylab("circRNA/(circRNA + Linear RNA)") + + xlab("Sample") + + geom_errorbar(aes(ymin=Ratio, ymax=Ratio+se), width=.2 , size=2) + + geom_bar(stat="identity",aes(fill=groupindicator1), color = "black", size=2) + + if (colour_mode == "bw"){ + Q <- Q + scale_fill_grey(start = 0.0, end = 1) + } else { + Q <- Q + scale_fill_discrete(name=lab_legend) + } + + Q <- Q + + theme(legend.position="bottom") + + theme(axis.ticks.length = unit(0.5, "cm")) + + theme(panel.background = element_blank(), + panel.grid.major = 
element_blank(), + panel.grid.minor = element_blank(), + axis.line = element_line(colour = "black"), + panel.border = element_rect(colour = "black", fill=NA, size=3)) + + guides(fill=guide_legend( + keywidth=0.3, + keyheight=0.3, + default.unit="inch") + ) + scale_y_continuous(expand=c(0,0), limits= c(0, y_axis_range)) + + if(twolevel){ + Q <- Q + facet_wrap( ~ groupindicator2,ncol=ncol ) + } + + print(Q) +} + +## LINEPLOT + +Circ.lineplot <- function(Circ,Linear,CircCoordinates = None,plotrow='1',size=18,ncol=2,groupindicator1=NULL,groupindicator2=NULL,x='Conditions',y='Counts', circle_description = c(1:3), gene_column = None){ + + require(ggplot2) + + if( !is.null(groupindicator1) & length(groupindicator1) != ncol(Circ)-length(circle_description) ){ + stop("If provided, the length of groupindicator1 should be equal to the number of samples.") + } + if( !is.null(groupindicator2) & length(groupindicator2) != ncol(Circ)-length(circle_description) ){ + stop("If provided, the length of groupindicator2 should be equal to the number of samples.") + } + if(is.null(groupindicator1)){ + stop("At least one grouping should be provided through groupindicator1.") + } + if(!is.null(groupindicator2)){ + twolevel <- TRUE + }else{ + twolevel <- FALSE + } + + rownames.circ <- rownames(Circ) + Circ <- data.frame(lapply(Circ, as.character), stringsAsFactors=FALSE) + rownames(Circ) <- rownames.circ + + rownames.linear <- rownames(Linear) + Linear <- data.frame(lapply(Linear, as.character), stringsAsFactors=FALSE) + rownames(Linear) <- rownames.linear + + # if CircCoordinates are available, use them, otherwise get more information from the Circ table, as indicated by the circle_description columns. 
+ if(!missing(CircCoordinates)){ + rownames.circCoordinates <- rownames(CircCoordinates) + CircCoordinates <- data.frame(lapply(CircCoordinates, as.character), stringsAsFactors=FALSE) + rownames(CircCoordinates) <- rownames.circCoordinates + }else{ + CircCoordinates <- data.frame(Circ[,circle_description]) + rownames(CircCoordinates) <- rownames.circ + rownames.circCoordinates <- rownames(CircCoordinates) + CircCoordinates <- data.frame(lapply(CircCoordinates, as.character), stringsAsFactors=FALSE) + rownames(CircCoordinates) <- rownames.circCoordinates + } + + groupindicator1 <- factor(groupindicator1,levels=unique(groupindicator1)) + groupindicator2 <- factor(groupindicator2,levels=unique(groupindicator2)) + + # Get gene name, if no annotation, output NULL + if (is.character(plotrow)){ + if ( ! plotrow %in% rownames(CircCoordinates) ){ + stop("Specified 'plotrow' not found.") + } + }else{ + if ( is.numeric(plotrow) ){ + if ( ! plotrow %in% 1:nrow(CircCoordinates) ){ + stop("Specified 'plotrow' not found.") + } + }else{ + stop("Specified plotrow should be ONE rowname or ONE rownumber.") + } + } + # Choose your own column containing the gene name using gene_column. 
The genename will be displayed in the plot title if available + if (missing(gene_column)){ + genename = NULL + }else{ + genename <- as.character(CircCoordinates[plotrow,gene_column]) + if (genename == '.'){ + genename = NULL + } + } + + plot.func <- function(row=plotrow){ + if(twolevel){ + plotdat <- summarySE(data.frame(Counts=c(as.numeric(Circ[row,-circle_description]),as.numeric(Linear[row,-circle_description])), + groupindicator1, + groupindicator2, + Type=c(rep('circRNA',ncol(Circ)-length(circle_description)),rep('linear RNA',ncol(Circ)-length(circle_description))) + ), measurevar='Counts',groupvars=c('Type','groupindicator1','groupindicator2') ) + }else{ + plotdat <- summarySE(data.frame(Counts=c(as.numeric(Circ[row,-circle_description]),as.numeric(Linear[row,-circle_description])), + groupindicator1, + Type=c(rep('circRNA',ncol(Circ)-length(circle_description)),rep('linear RNA',ncol(Circ)-length(circle_description))) + ), measurevar='Counts',groupvars=c('Type','groupindicator1') ) + } + + Q=ggplot(plotdat, aes(x=groupindicator1, y=Counts, group=Type,colour=Type)) + + theme(text=element_text(size=size))+ + theme_bw()+ + labs( list(title=paste(toString(Circ[row,circle_description]),genename,sep=" "),x=x,y=y) ) + + ggtitle(paste(toString(Circ[row,circle_description]),genename,sep=" "))+ + geom_errorbar(aes(ymin=Counts-se, ymax=Counts+se), width=.1, position=position_dodge(.1) ) + + xlab("Condition") + + geom_line(position=position_dodge(.1)) + + geom_point(position=position_dodge(.1)) + if (twolevel){ + Q = Q + facet_wrap( ~ groupindicator2,ncol=ceiling(sqrt(length(levels(groupindicator2)))) ) + } + + print(Q) + } + + return(plot.func(row=plotrow)) +} + +## FILTER + +Circ.filter <- function(circ=circ,linear=linear,Nreplicates=3,filter.sample=4,filter.count=5,percentage=1, circle_description=c(1:3)){ + del_row=c() + for ( i in 1:nrow(circ) ){ + if ( sum(circ[i,-circle_description]>=filter.count) 0){ + new_dat=circ[-del_row,] + return(new_dat) + } else { + 
return(circ) + } +} + +circ_max_perc <- function(circ=circ,linear=linear,Nreplicates=3){ + # convert to vector + circ = as.numeric(circ) + linear = as.numeric(linear) + if( length(circ) != length(linear) ){ + stop ('Number of samples in circRNA is not equal to Hostgene.') + } + Ngroups = length(circ)/Nreplicates + # calculate percentage + circ_sum = unname(tapply(circ, (seq_along(1:length(circ))-1) %/% Nreplicates, sum )) + linear_sum = unname(tapply(linear, (seq_along(1:length(linear))-1) %/% Nreplicates, sum )) + perc = max(circ_sum / (circ_sum+linear_sum),na.rm=T) + return(perc) +} + +## CIRC TEST + +Circ.test <- function(Circ, Linear, CircCoordinates=None, group, alpha=0.05, plotsig=T, circle_description = c(1:3)){ + + # Requre packge + require(aod) + + # check whether the input matrix are correct + if ( nrow(Circ)!=nrow(Linear) | ncol(Circ) != ncol(Linear)){ + stop('Circ data and Linear data are not matched, dimention different.') + } + + # A vector for pvalue and directions indicator + p.val <- c() + direction <- c() + + # groups + if ( length(group) != ncol(Circ)-length(circle_description) ){ + print(length(group)) + print(ncol(Circ)-length(circle_description)) + stop("length of 'group' must be equal to the number of samples of 'Circ' and 'Linear'. ") + } + group <- factor(group) + counter <- 0 + + ## test + # construct test matrix for each circRNA + + tmp_df = Circ[,FALSE] + + for (j in seq(1,length(unique(group)))){ + tmp_df[paste("group_",j,"_ratio_mean",sep="")] <- NA + } + + for ( i in rownames(Circ) ){ + counter <- counter+1 + + # total read counts vector + tot <- round( as.numeric(Linear[i,-circle_description]) + as.numeric(Circ[i,-circle_description]) ) + + # circRNA read counts + circ <- as.numeric(Circ[i,-circle_description]) + + # if there is 0 in the total count vector, the model will fail. 
So permute 0 to 1 + if ( 0 %in% tot ){ + tot[tot==0]=1 + } + + if (counter %% 1000 == 0){ + message(paste(counter, "candidates processed")) + } + + tmp_rations <- data.frame(Ratio=as.numeric(Circ[i,-circle_description])/(as.numeric(Linear[i,-circle_description])+as.numeric(Circ[i,-circle_description])), + group=group) + for (rep_group in seq(1,max(as.numeric(levels(group))),1)){ + tmp_df[i, paste("group_",rep_group,"_ratio_mean",sep="")] <- mean(na.omit(unlist(tmp_rations[tmp_rations\$group==rep_group,1]))) + } + + # Constract data frame + testdat = data.frame(tot,circ,group) + + ## do test + # Null model + fitNull <- betabin(cbind(circ,tot-circ) ~ 1, ~ 1, data=testdat) + # Alternative model + fitAlt <- betabin(cbind(circ,tot-circ) ~ group, ~ 1, data=testdat) + # test models + a <- anova(fitNull,fitAlt) + p.value <- a@anova.table[,11][2] + p.val <- c( p.val, p.value ) + } + message(paste(counter, "candidates processed in total")) + + Circ\$direction <- direction + p.adj <- p.adjust(p.val,n=sum(!is.na(p.val)),'BH') + # select significant ones + sig_dat <- Circ[p.adj<=alpha & !is.na(p.adj),] + sig_ratios <- tmp_df[p.adj<=alpha & !is.na(p.adj),] + sig_p <- p.adj[p.adj<=alpha & !is.na(p.adj)] + + # sort by p-val + sig_dat <- sig_dat[order(sig_p),] + sig_ratios <- sig_ratios[order(sig_p),] + sig_p <- sort(sig_p) + + # A summary table + if (missing(CircCoordinates)){ + summary_table <- data.frame(sig_dat[,circle_description],sig_p,sig_dat[,circle_description]) + + rownames(summary_table) <- rownames(sig_dat) + names(summary_table) <- c(names(sig_dat)[circle_description],"sig_p",names(sig_ratios)[circle_description]) + } else { + summary_table <- cbind(CircCoordinates[rownames(sig_dat),],sig_p,sig_ratios) + colnames(summary_table) <- c(colnames(CircCoordinates),"sig_p",colnames(sig_ratios)) + } + + message(paste(nrow(summary_table), "candidates passed the specified thresholds")) + + # return all objects in a list + return(list(summary_table=summary_table, + 
sig.dat=sig_dat, + p.val=p.val, + p.adj=p.adj, + sig_p=sig_p, + ratios=sig_ratios + ) + ) +} + +## MAIN + +circs = read.table("${circ_counts}", header=T, sep="\\t") +genes = read.table("${gene_counts}", header=T, sep="\\t") +pheno = read.csv ("${phenotype}" , header=T, row.names = "sample") + +circs <- Circ.filter(circ = circs, linear = genes, filter.sample = 2, filter.count = 5, percentage = 0.00001) +genes <- genes[rownames(circs),] + +description <- c(1) +pheno <- pheno[colnames(circs[,-description]),,drop=FALSE] + +test <- Circ.test(circs, genes, group=as.numeric(as.factor(pheno\$condition)), circle_description = description) +write.table(test\$summary_table, "${prefix}_summary.txt", row.names=F) + + +pdf("circ_linear_ratio_plots.pdf", width = 8, height = 10) +for (i in rownames(test\$summary_table)) { + Circ.ratioplot(circs, genes, plotrow=i, groupindicator1=pheno\$condition, + lab_legend = 'condition', circle_description = description ) +} +dev.off() + +pdf("circ_linear_line_plots.pdf", width = 8, height = 10) +for (i in rownames(test\$summary_table)) { + Circ.lineplot(circs, genes, plotrow=i, groupindicator1=pheno\$condition, + circle_description = description ) +} +dev.off() + + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' aod:', packageVersion('aod')), + paste(' plyr:', packageVersion('plyr')), + paste(' ggplot2:', packageVersion('ggplot2')) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/local/circtest/prepare/environment.yml 
b/modules/local/circtest/prepare/environment.yml new file mode 100644 index 000000000..02becc7cb --- /dev/null +++ b/modules/local/circtest/prepare/environment.yml @@ -0,0 +1,5 @@ +name: circtest_prepare +channels: + - conda-forge +dependencies: + - conda-forge::r-base=4.2.1 diff --git a/modules/local/circtest/prepare/main.nf b/modules/local/circtest/prepare/main.nf new file mode 100644 index 000000000..496fe6afc --- /dev/null +++ b/modules/local/circtest/prepare/main.nf @@ -0,0 +1,21 @@ +process CIRCTEST_PREPARE { + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "biocontainers/r-base:4.2.1" + + input: + tuple val(meta), path(gene_counts), path(circ_counts) + + output: + tuple val(meta), path('*_genes.tsv'), path('*_circs.tsv'), emit: counts, optional: true + + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: meta.id + template 'prepare.R' +} diff --git a/modules/local/circtest/prepare/templates/prepare.R b/modules/local/circtest/prepare/templates/prepare.R new file mode 100644 index 000000000..d5dcdebb0 --- /dev/null +++ b/modules/local/circtest/prepare/templates/prepare.R @@ -0,0 +1,39 @@ +#!/usr/bin/env Rscript + +circ <- read.table("${circ_counts}", header=T, sep="\\t", check.names = FALSE) +gene <- read.table("${gene_counts}", header=T, sep="\\t", check.names = FALSE, row.names = 1) + +gene <- gene[circ\$gene_id, ] + +rownames(circ) <- circ\$tx +rownames(gene) <- rownames(circ) +circ\$tx <- NULL + +if (nrow(circ) != nrow(gene)) { + stop("Number of rows in circ and gene counts do not match") +} + +if (nrow(circ) > 0) { + write.table(circ, "${prefix}_circs.tsv", sep="\\t", quote=F, row.names=T) + write.table(gene, "${prefix}_genes.tsv", sep="\\t", quote=F, row.names=T) +} + +################################################ +################################################ +## VERSIONS FILE ## +################################################ 
+################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/local/ciriquant/Dockerfile b/modules/local/ciriquant/Dockerfile new file mode 100644 index 000000000..cbfa3525a --- /dev/null +++ b/modules/local/ciriquant/Dockerfile @@ -0,0 +1,4 @@ +FROM community.wave.seqera.io/library/bioconductor-edger_bwa_hisat2_pysam_pruned:f14fb8726c7f0cf8 + +# Install custom fork +RUN pip install git+https://github.com/nictru/CIRIquant.git@e4916ca7b3370cef54d76ca162be64792d8c1b16 diff --git a/modules/local/ciriquant/ciriquant/main.nf b/modules/local/ciriquant/ciriquant/main.nf new file mode 100644 index 000000000..8c28130e2 --- /dev/null +++ b/modules/local/ciriquant/ciriquant/main.nf @@ -0,0 +1,65 @@ +process CIRIQUANT { + tag "$meta.id" + label 'process_high' + + container "docker.io/nicotru/ciriquant:1.0.4" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(bed) + tuple val(meta3), path(gtf) + tuple val(meta4), path(fasta) + tuple val(meta5), path(bwa) + tuple val(meta6), path(hisat2) + + output: + tuple val(meta), path("${prefix}/${prefix}.gtf") , emit: gtf + tuple val(meta), path("${prefix}/gene/${prefix}_genes.list"), emit: gene_list, optional: true + tuple val(meta), path("${prefix}/gene/${prefix}_out.gtf") , emit: gene_gtf, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '2.1.0' + def strandedness = meta.strandedness ?: 'auto' + def library_type = strandedness == 'auto' ? '' : strandedness == 'unstranded' ? 
'-l 0' : strandedness == 'forward' ? '-l 1' : '-l 2' + def reads_string = meta.single_end ? "-r ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def bed_string = bed ? "--bed ${bed}" : '' + """ + BWA=`which bwa` + HISAT2=`which hisat2` + STRINGTIE=`which stringtie` + SAMTOOLS=`which samtools` + + BWA_FILE=`ls ${bwa}/*.bwt` + BWA_PREFIX=`basename \$BWA_FILE .bwt` + + HISAT2_FILE=`ls ${hisat2}/*.1.ht2` + HISAT2_PREFIX=`basename \$HISAT2_FILE .1.ht2` + + printf "name: ciriquant\\ntools:\\n bwa: \$BWA\\n hisat2: \$HISAT2\\n stringtie: \$STRINGTIE\\n samtools: \$SAMTOOLS\\n\\nreference:\\n fasta: ${fasta}\\n gtf: ${gtf}\\n bwa_index: ${bwa}/\$BWA_PREFIX\\n hisat_index: ${hisat2}/\$HISAT2_PREFIX" > config.yml + + CIRIquant \\ + -t ${task.cpus} \\ + ${reads_string} \\ + ${bed_string} \\ + --config config.yml \\ + -o ${prefix} \\ + -p ${prefix} \\ + ${library_type} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + ciriquant: \$(echo \$(CIRIquant --version 2>&1) | sed 's/CIRIquant //g' ) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + stringtie: \$(stringtie --version 2>&1) + hisat2: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/ciriquant/de/main.nf b/modules/local/ciriquant/de/main.nf new file mode 100644 index 000000000..0e3eb14a8 --- /dev/null +++ b/modules/local/ciriquant/de/main.nf @@ -0,0 +1,35 @@ +process CIRIQUANT_DE { + tag "$meta.id" + label 'process_high' + + container "docker.io/nicotru/ciriquant:1.0.4" + + input: + tuple val(meta), path(library), path(expression), path(gene) + + output: + tuple val(meta), path("${circ_path}"), emit: circ + tuple val(meta), path("${gene_path}"), emit: gene + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + circ_path = "${prefix}.circ.csv" + gene_path = "${prefix}.gene.csv" + """ + 
CIRI_DE_replicate \\ + --lib ${library} \\ + --bsj ${expression} \\ + --gene ${gene} \\ + --out ${circ_path} \\ + --out2 ${gene_path} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ciriquant: \$(echo \$(CIRIquant --version 2>&1) | sed 's/CIRIquant //g' ) + END_VERSIONS + """ +} diff --git a/modules/local/ciriquant/prepde/main.nf b/modules/local/ciriquant/prepde/main.nf new file mode 100644 index 000000000..fda4f2c3b --- /dev/null +++ b/modules/local/ciriquant/prepde/main.nf @@ -0,0 +1,42 @@ +process CIRIQUANT_PREPDE { + tag "$meta.id" + label 'process_high' + + container "docker.io/nicotru/ciriquant:1.0.4" + + input: + tuple val(meta), val(samples), path(gtfs), val(conditions) + + output: + tuple val(meta), path("${prefix}_library.tsv") , emit: library + tuple val(meta), path("${prefix}_annotation.tsv"), emit: annotation + tuple val(meta), path("${prefix}_expression.tsv"), emit: expression + tuple val(meta), path("${prefix}_ratio.tsv") , emit: gene + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + samplesheet = [samples, gtfs, conditions] + .transpose() + .collect{ sample, gtf, condition -> + "${sample}\\t${gtf}\\t${condition}" }.join('\\n') + """ + echo -e "${samplesheet}" > samples.txt + + prep_CIRIquant -i samples.txt \\ + --lib ${prefix}_library.tsv \\ + --circ ${prefix}_annotation.tsv \\ + --bsj ${prefix}_expression.tsv \\ + --ratio ${prefix}_ratio.tsv \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ciriquant: \$(echo \$(CIRIquant --version 2>&1) | sed 's/CIRIquant //g' ) + END_VERSIONS + """ +} diff --git a/modules/local/combine_beds/environment.yml b/modules/local/combine_beds/environment.yml new file mode 100644 index 000000000..57f25392c --- /dev/null +++ b/modules/local/combine_beds/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - 
conda-forge::polars=1.8.2 + - conda-forge::upsetplot=0.9.0 diff --git a/modules/local/combine_beds/main.nf b/modules/local/combine_beds/main.nf new file mode 100644 index 000000000..7a63b3f4b --- /dev/null +++ b/modules/local/combine_beds/main.nf @@ -0,0 +1,26 @@ +process COMBINE_BEDS { + tag "$meta.id" + label "process_low" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/polars_upsetplot:0fc26c37f7821606' : + 'community.wave.seqera.io/library/polars_upsetplot:3382b69d3c1f6bf1' }" + + input: + tuple val(meta), path(beds) + val(max_shift) + val(min_tools) + val(min_samples) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: combined + path "*.png" , emit: plots, optional: true + path "*.json" , emit: multiqc, optional: true + path "versions.yml" , emit: versions + + script: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "bed" + template "combine.py" +} diff --git a/modules/local/combine_beds/templates/combine.py b/modules/local/combine_beds/templates/combine.py new file mode 100644 index 000000000..bdd523e13 --- /dev/null +++ b/modules/local/combine_beds/templates/combine.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +import platform +import base64 +import json + +import polars as pl +import upsetplot +import matplotlib +import matplotlib.pyplot as plt + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. 
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +max_shift = int("${max_shift}") +min_tools = int("${min_tools}") +min_samples = int("${min_samples}") +meta_id = "{meta_id}" +prefix = "${prefix}" + +df = pl.scan_csv("*.bed", + separator="\\t", + has_header=False, + new_columns=["chr", "start", "end", "name", "score", "strand", "sample", "tool"]) + +for col in ["end", "start"]: + df = df.sort(["chr", col]) + df = df.with_columns(**{f"{col}_group": pl.col(col).diff().fill_null(0).gt(max_shift).cum_sum()}) + +df = (df.group_by(["chr", "start_group", "end_group"]) + .agg( pl.col("start").median().round().cast(int), + pl.col("end").median().round().cast(int), + pl.col("sample").unique().alias("samples"), + pl.col("tool").unique().alias("tools"), + pl.col("sample").n_unique().alias("n_samples"), + pl.col("tool").n_unique().alias("n_tools")) + .with_columns(name=pl.col("chr").cast(str) + ":" + pl.col("start").cast(str) + "-" + pl.col("end").cast(str), + score=pl.lit("."), + strand=pl.lit("."))) + +for col in ["samples", "tools"]: + series = pl.Series(df.select(col).collect()) + if series.explode().n_unique() == 1: + continue + memberships = series.to_list() + dataset = upsetplot.from_memberships(memberships) + upsetplot.plot(dataset, + orientation='horizontal', + show_counts=True, + subset_size="count") + plot_file = f"{prefix}_{col}.upset.png" + plt.savefig(plot_file) + + image_string = base64.b64encode(open(plot_file, "rb").read()).decode("utf-8") + image_html = f'
' + + multiqc = { + 'id': f"{meta_id}_upset_{col}", + 'parent_id': "upset_plots", + 'parent_name': 'UpSet Plots', + 'parent_description': 'UpSet plots showing the overlap between tools for each sample', + 'section_name': f'UpSet {col}: {meta_id} ', + 'description': f'UpSet plot showing the overlap between {col} for {meta_id}', + 'plot_type': 'image', + 'data': image_html + } + + with open(f"{prefix}_{col}.upset_mqc.json", "w") as f: + f.write(json.dumps(multiqc, indent=4)) + + +df = (df.filter((pl.col("n_tools") >= min_tools) & (pl.col("n_samples") >= min_samples)) + .select(["chr", "start", "end", "name", "score", "strand"])) + +df.collect().write_csv("${prefix}.${suffix}", separator="\\t", include_header=False) + +# Versions + +versions = { + "${task.process}": { + "python": platform.python_version(), + "polars": pl.__version__, + "upsetplot": upsetplot.__version__, + "matplotlib": matplotlib.__version__ + } +} + +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/local/compute_correlations/environment.yml b/modules/local/compute_correlations/environment.yml new file mode 100644 index 000000000..efe728347 --- /dev/null +++ b/modules/local/compute_correlations/environment.yml @@ -0,0 +1,7 @@ +name: "compute_correlations" +channels: + - conda-forge + - defaults + - bioconda +dependencies: + - "bioconda::bioconductor-fishpond=2.8.0--r43hdfd78af_0" diff --git a/modules/local/compute_correlations/main.nf b/modules/local/compute_correlations/main.nf new file mode 100644 index 000000000..4a72b5eb8 --- /dev/null +++ b/modules/local/compute_correlations/main.nf @@ -0,0 +1,29 @@ +process COMPUTE_CORRELATIONS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bioconductor-fishpond:2.8.0--r43hdfd78af_0' : + 'biocontainers/bioconductor-fishpond:2.8.0--r43hdfd78af_0' }" + + input: + tuple val(meta), path(bindingsites) + tuple val(meta2), path(mirna_expression) + tuple val(meta3), path(transcript_rds) + + output: + tuple val(meta), path("*.tsv"), emit: correlations, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'compute_correlations.R' + + stub: + """ + touch ${meta.id}.circrna_correlation.tsv + """ +} diff --git a/modules/local/compute_correlations/templates/compute_correlations.R b/modules/local/compute_correlations/templates/compute_correlations.R new file mode 100644 index 000000000..d9976226d --- /dev/null +++ b/modules/local/compute_correlations/templates/compute_correlations.R @@ -0,0 +1,62 @@ +#!/usr/bin/env Rscript + +library(fishpond) +suppressMessages(library(SummarizedExperiment)) + +tx_expression <- readRDS('${transcript_rds}') +mi_expression <- read.table('${mirna_expression}', header=TRUE, row.names=1, sep='\\t') +interactions <- read.table('${bindingsites}', sep='\\t') + +tx_expression <- scaleInfReps(tx_expression) +tx_expression <- labelKeep(tx_expression) # Here one can perform custom filtering + +if (!any(mcols(tx_expression)\$keep)) { + stop('No transcripts left after filtering') +} + +result_cols <- c('stat', 'log2FC', 'pvalue', 'locfdr', 'qvalue') + +# Iterate rows of interactions +for (i in 1:nrow(interactions)) { + # Get miRNA and target gene + miRNA <- interactions[i, 1] + targets <- unlist(strsplit(interactions[i, 2], ',')) + + mirna_expression <- mi_expression[miRNA,] + transcript_expression <- tx_expression[targets,] + + if (!any(mcols(transcript_expression)\$keep)) { + print(paste('No transcripts left after filtering for miRNA', miRNA)) + next + } + + # Add miRNA expression to colData so that it can be used for correlation + colData(transcript_expression) <- cbind( + 
colData(transcript_expression), + t(mirna_expression[, rownames(colData(transcript_expression))]) + ) + + result <- rowData(swish(transcript_expression, miRNA, cor = "${params.mirna_correlation}"))[, result_cols] + result <- result[complete.cases(result), ] + write.table(result, paste0(miRNA, '.tsv'), sep = '\\t') +} + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/local/dcc/main.nf b/modules/local/dcc/main.nf new file mode 100644 index 000000000..429d7dc59 --- /dev/null +++ b/modules/local/dcc/main.nf @@ -0,0 +1,62 @@ +process DCC { + tag "$meta.id" + label 'process_high' + + conda "bioconda::circtools=1.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/circtools:1.2.1--pyh7cba7a3_0' : + 'biocontainers/circtools:1.2.1--pyh7cba7a3_0' }" + + input: + tuple val(meta), path(pairs), path(mate1), path(mate2) + path fasta + path gtf + + output: + tuple val(meta), path("${prefix}.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def strandedness = meta.strandedness ?: 'auto' + def strand_args = strandedness == 'auto' || strandedness == 'unstranded' ? '-N' : strandedness == 'forward' ? 
'' : '-ss' + if(meta.single_end){ + """ + sed -i 's/^chr//g' $gtf + + mkdir ${prefix} && mv ${prefix}.Chimeric.out.junction ${prefix} && printf "${prefix}/${prefix}.Chimeric.out.junction" > samplesheet + + DCC @samplesheet -D -an $gtf -F -M -Nr 1 1 -A $fasta $strand_args -T ${task.cpus} + + awk '{print \$6}' CircCoordinates >> strand + paste CircRNACount strand | tail -n +2 | awk -v OFS="\\t" '{print \$1,\$2,\$3,\$5,\$4}' >> ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dcc: \$(DCC --version) + END_VERSIONS + """ + }else{ + """ + sed -i 's/^chr//g' $gtf + + mkdir ${prefix} && mv ${prefix}.Chimeric.out.junction ${prefix} && printf "${prefix}/${prefix}.Chimeric.out.junction" > samplesheet + mkdir ${prefix}_mate1 && mv ${prefix}_mate1.Chimeric.out.junction ${prefix}_mate1 && printf "${prefix}_mate1/${prefix}_mate1.Chimeric.out.junction" > mate1file + mkdir ${prefix}_mate2 && mv ${prefix}_mate2.Chimeric.out.junction ${prefix}_mate2 && printf "${prefix}_mate2/${prefix}_mate2.Chimeric.out.junction" > mate2file + + DCC @samplesheet -mt1 @mate1file -mt2 @mate2file -D -an $gtf -Pi -F -M -Nr 1 1 -A $fasta $strand_args -T ${task.cpus} + + awk '{print \$6}' CircCoordinates >> strand + paste CircRNACount strand | tail -n +2 | awk -v OFS="\\t" '{print \$1,\$2,\$3,\$5,\$4}' >> ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dcc: \$(DCC --version) + END_VERSIONS + """ + } +} diff --git a/modules/local/deseq2/normalization/environment.yml b/modules/local/deseq2/normalization/environment.yml new file mode 100644 index 000000000..8eb117c31 --- /dev/null +++ b/modules/local/deseq2/normalization/environment.yml @@ -0,0 +1,7 @@ +name: deseq2_normalization +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bioconductor-deseq2=1.34.0 diff --git a/modules/local/deseq2/normalization/main.nf b/modules/local/deseq2/normalization/main.nf new file mode 100644 index 000000000..74cb8b5a3 --- /dev/null +++ 
b/modules/local/deseq2/normalization/main.nf @@ -0,0 +1,32 @@ +process DESEQ2_NORMALIZATION { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-deseq2:1.34.0--r41hc247a5b_3' : + 'biocontainers/bioconductor-deseq2:1.34.0--r41hc247a5b_3' }" + + input: + tuple val(meta), path(counts) + + output: + tuple val(meta), path("${meta.id}.normalized_counts.tsv"), emit: normalized + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'deseq_normalization.R' + + stub: + """ + touch ${meta.id}.normalized_counts.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-deseq2: \$(Rscript -e "library(DESeq2); cat(as.character(packageVersion('DESeq2')))") + END_VERSIONS + """ +} diff --git a/modules/local/deseq2/normalization/templates/deseq_normalization.R b/modules/local/deseq2/normalization/templates/deseq_normalization.R new file mode 100644 index 000000000..91b366dc5 --- /dev/null +++ b/modules/local/deseq2/normalization/templates/deseq_normalization.R @@ -0,0 +1,50 @@ +#!/usr/bin/env Rscript + +library(DESeq2) + +raw_counts <- read.table("$counts", sep = "\\t", header = TRUE, stringsAsFactors = FALSE, check.names = FALSE) +samples <- colnames(raw_counts)[-c(1)] + +row.names(raw_counts) <- raw_counts\$miRNA +data <- raw_counts[, -1] +mirna_names <- data.frame(miRNA = raw_counts\$miRNA, order = seq_len(nrow(raw_counts))) + +# normalize using DeSeq2, Library Size Estimation +meta_data <- data.frame(samples) +row.names(meta_data) <- meta_data\$samples +all(colnames(data) %in% rownames(meta_data)) +all(colnames(data) == rownames(meta_data)) + +dds <- DESeqDataSetFromMatrix(countData = data, colData = meta_data, design = ~ 1) +dds <- estimateSizeFactors(dds) +sizeFactors(dds) +normalized_counts <- 
DESeq2::counts(dds, normalized = TRUE) + +# add miRNA IDs back to counts table +merged_data <- merge(mirna_names, normalized_counts, + by.x = "miRNA", by.y = "row.names") +merged_data <- merged_data[order(merged_data\$order), ] + +norm_data <- subset(merged_data, select = -c(order)) + +write.table(norm_data, paste0("${meta.id}.normalized_counts.tsv"), quote = FALSE, sep = "\\t", row.names = FALSE) + +# TODO: (Can be done later) Add support for Samplesheet so that we can eliminate batch effects + + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +deseq2.version <- as.character(packageVersion('DESeq2')) + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' bioconductor-deseq2:', deseq2.version) + ), +'versions.yml') diff --git a/modules/local/fail_on_empty/main.nf b/modules/local/fail_on_empty/main.nf new file mode 100644 index 000000000..fc7daa6aa --- /dev/null +++ b/modules/local/fail_on_empty/main.nf @@ -0,0 +1,18 @@ +process FAIL_ON_EMPTY { + tag "$meta.id" + + input: + tuple val(meta), path(bed) + path(waitFor, stageAs: 'waitFor*.txt') + + exec: + if (!bed) { + log.error (( + "No circular RNAs were found by at least ${params.min_tools} tools and in at least ${params.min_samples} samples.\n") + + "Feel free to check the preliminary results in '${params.outdir}'\n" + + (params.save_intermediates ? 
"" : + "You can enable saving intermediate files by setting the parameter 'save_intermediates' to 'true'.")) + + exit 1 + } +} diff --git a/modules/local/find_circ/anchors/main.nf b/modules/local/find_circ/anchors/main.nf new file mode 100644 index 000000000..9bccb403b --- /dev/null +++ b/modules/local/find_circ/anchors/main.nf @@ -0,0 +1,31 @@ +process FIND_CIRC_ANCHORS { + tag "$meta.id" + label "process_high" + + conda "bioconda::find_circ=1.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/find_circ%3A1.2--hdfd78af_0' : + 'biocontainers/find_circ:1.2--hdfd78af_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("${prefix}_anchors.qfa.gz"), emit: anchors + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2' + """ + unmapped2anchors.py $bam | gzip > ${prefix}_anchors.qfa.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + find_circ: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/find_circ/find_circ/main.nf b/modules/local/find_circ/find_circ/main.nf new file mode 100644 index 000000000..8a4ca128c --- /dev/null +++ b/modules/local/find_circ/find_circ/main.nf @@ -0,0 +1,53 @@ +process FIND_CIRC { + tag "$meta.id" + label "process_high" + + conda "bioconda::find_circ=1.2 bioconda::bowtie2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-c27e472038a09e49d9147bc52903e12836302c12:60ffb3b15a2c40c669f8d38382b1e6e4b065f5e4-0' : + 'biocontainers/mulled-v2-c27e472038a09e49d9147bc52903e12836302c12:60ffb3b15a2c40c669f8d38382b1e6e4b065f5e4-0' }" + + input: + tuple val(meta), path(anchors) + tuple val(meta2), path(index) + path fasta + + output: + tuple val(meta), path("${prefix}.sites.bed"), emit: bed + path("${prefix}.sites.reads") , emit: reads + path("${prefix}.sites.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + args = task.ext.args ?: "" + args2 = task.ext.args2 ?: "" + def strand_arg = meta.strandedness && (meta.strandedness == 'forward' || meta.strandedness == 'reverse') ? "--stranded" : "" + def VERSION = '1.2' + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/.rev.1.bt2//"` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/.rev.1.bt2l//"` + [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1 + + bowtie2 \\ + --threads $task.cpus \\ + --reorder \\ + --mm \\ + -D 20 \\ + --score-min=C,-15,0 \\ + -q \\ + -x \$INDEX \\ + $args \\ + -U $anchors | \\ + find_circ.py --genome=$fasta $strand_arg $args2 --prefix=${prefix} --stats=${prefix}.sites.log --reads=${prefix}.sites.reads > ${prefix}.sites.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + find_circ: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/majority_vote/environment.yml b/modules/local/majority_vote/environment.yml new file mode 100644 index 000000000..262d8798a --- /dev/null +++ b/modules/local/majority_vote/environment.yml @@ -0,0 +1,7 @@ +name: annotation +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::polars=1.5.0 + - conda-forge::pyyaml=6.0.2 diff --git 
a/modules/local/majority_vote/main.nf b/modules/local/majority_vote/main.nf
new file mode 100644
index 000000000..33f417ba2
--- /dev/null
+++ b/modules/local/majority_vote/main.nf
@@ -0,0 +1,36 @@
+process MAJORITY_VOTE {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/polars_pyyaml:962a0cf7480258c7' :
+        'community.wave.seqera.io/library/polars_pyyaml:ad93db0d7bcd508e' }"
+
+    input:
+    tuple val(meta), path(bindingsites)
+
+    output:
+    tuple val(meta), path("${meta.id}.majority.tsv"), emit: tsv
+    tuple val(meta), path("${meta.id}.targets.tsv") , emit: targets
+    path "versions.yml"                             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    min_tools = params.mirna_min_tools
+    template 'majority.py'
+
+    stub:
+    """
+    touch ${meta.id}.majority.tsv
+    touch ${meta.id}.targets.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        python: \$(python --version | sed 's/Python //g')
+        polars: \$(python -c "import polars; print(polars.__version__)")
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/majority_vote/templates/majority.py b/modules/local/majority_vote/templates/majority.py
new file mode 100644
index 000000000..4812301ad
--- /dev/null
+++ b/modules/local/majority_vote/templates/majority.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+import platform
+
+import polars as pl
+import yaml
+
+paths = "${bindingsites}".split(" ")
+
+df = pl.scan_csv(paths,
+    separator="\\t",
+    has_header=False,
+    new_columns=['mirna', 'target', 'start', 'end', 'tool'])
+
+df = df.select(["mirna", "target", "tool"])
+
+df = df.group_by(['mirna', 'target']).agg(pl.col("tool").n_unique())
+
+df = df.filter(pl.col("tool") >= int("${min_tools}")) \\
+    .select(["mirna", "target"])
+
+df = df.collect()
+
+df.write_csv('${meta.id}.majority.tsv', separator='\\t', include_header=False)
+
+# Create targets file
+
+df
= df.group_by('mirna').agg(pl.col("target").str.concat(",")) + +df.write_csv('${meta.id}.targets.tsv', separator='\\t', include_header=False) + +# Create version file +versions = { + "${task.process}" : { + "python": platform.python_version(), + "polars": pl.__version__, + } +} + +with open("versions.yml", "w") as f: + f.write(yaml.dump(versions)) diff --git a/modules/local/mapsplice/align/main.nf b/modules/local/mapsplice/align/main.nf new file mode 100644 index 000000000..a90930d6e --- /dev/null +++ b/modules/local/mapsplice/align/main.nf @@ -0,0 +1,75 @@ +process MAPSPLICE_ALIGN { + tag "$meta.id" + label 'process_high' + + conda "bioconda::mapsplice=2.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mapsplice:2.2.1--py27h07887db_0': + 'biocontainers/mapsplice:2.2.1--py27h07887db_0' }" + + input: + tuple val(meta), path(reads) + path bowtie_index + tuple val(meta2), path(chromosomes, stageAs: 'chromosomes/*') + path gtf + + output: + tuple val(meta), path("${prefix}/fusions_raw.txt"), emit: raw_fusions + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = 'v2.2.1' + def gtf_prefix = gtf.toString() - ~/.gtf/ + if(meta.single_end){ + def handleGzip_R1 = reads[0].getExtension() == 'gz' ? "gzip -d -f ${reads[0]}" : '' + def read1 = reads[0].getExtension() == 'gz' ? reads[0].toString() - ~/.gz/ : reads[0] + """ + $handleGzip_R1 + + mapsplice.py \\ + -c chromosomes \\ + -x $gtf_prefix \\ + -1 ${read1} \\ + -p ${task.cpus} \\ + --bam \\ + --gene-gtf $gtf \\ + -o $prefix \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mapsplice: $VERSION + END_VERSIONS + """ + } else { + def handleGzip_R1 = reads[0].getExtension() == 'gz' ? 
"gzip -d -f ${reads[0]}" : '' + def handleGzip_R2 = reads[1].getExtension() == 'gz' ? "gzip -d -f ${reads[1]}" : '' + def read1 = reads[0].getExtension() == 'gz' ? reads[0].toString() - ~/.gz/ : reads[0] + def read2 = reads[1].getExtension() == 'gz' ? reads[1].toString() - ~/.gz/ : reads[1] + """ + $handleGzip_R1 + $handleGzip_R2 + + mapsplice.py \\ + -c chromosomes \\ + -x $gtf_prefix \\ + -1 ${read1} \\ + -2 ${read2} \\ + -p ${task.cpus} \\ + --bam \\ + --gene-gtf $gtf \\ + -o $prefix \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mapsplice: $VERSION + END_VERSIONS + """ + } +} diff --git a/modules/local/matrix/join_samples/environment.yml b/modules/local/matrix/join_samples/environment.yml new file mode 100644 index 000000000..d1d6cd8ed --- /dev/null +++ b/modules/local/matrix/join_samples/environment.yml @@ -0,0 +1,7 @@ +name: join_samples +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::polars=1.5.0 + - conda-forge::pyyaml=6.0.2 diff --git a/modules/local/matrix/join_samples/main.nf b/modules/local/matrix/join_samples/main.nf new file mode 100644 index 000000000..a301a9cbf --- /dev/null +++ b/modules/local/matrix/join_samples/main.nf @@ -0,0 +1,35 @@ +process JOIN_SAMPLES { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'oras://community.wave.seqera.io/library/polars_pyyaml:962a0cf7480258c7' : + 'community.wave.seqera.io/library/polars_pyyaml:ad93db0d7bcd508e' }" + + input: + tuple val(meta), val(samples), path(matrices) + + output: + tuple val(meta), path("${meta.id}.joined.tsv"), emit: joined + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + metacols = task.ext.metacols ?: "gene_id,gene_name" + has_header = task.ext.has_header == null ? 
true : task.ext.has_header + template 'join.py' + + stub: + """ + touch ${meta.id}.joined.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + polars: \$(python -c "import polars; print(polars.__version__)") + END_VERSIONS + """ +} diff --git a/modules/local/matrix/join_samples/templates/join.py b/modules/local/matrix/join_samples/templates/join.py new file mode 100644 index 000000000..30351c332 --- /dev/null +++ b/modules/local/matrix/join_samples/templates/join.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +import platform + +import polars as pl +import yaml + +samples = "${samples.join(' ')}".split(" ") +matrices = "${matrices}".split(" ") +metacols = "${metacols}".split(",") + +dfs = { + sample: + pl.scan_csv(matrix, + separator="\\t", + new_columns=metacols + [sample], + has_header="${has_header}" == "true") + .group_by(metacols).agg(pl.sum(sample).alias(sample)) + for sample, matrix in zip(samples, matrices)} + +df_order = pl.concat([df.select(metacols) for df in dfs.values()]).unique().sort(metacols[0]) + +dfs_sorted = [ + df_order.join(df, on=metacols, how="left", coalesce=True) + .select(sample) + for sample, df in dfs.items() + ] + +df = pl.concat([df_order] + dfs_sorted, how="horizontal").fill_null(0) + +df.collect().write_csv("${meta.id}.joined.tsv", separator="\\t") + +# Create version file +versions = { + "${task.process}" : { + "python": platform.python_version(), + "polars": pl.__version__, + } +} + +with open("versions.yml", "w") as f: + f.write(yaml.dump(versions)) diff --git a/modules/local/mirna_filtering/main.nf b/modules/local/mirna_filtering/main.nf new file mode 100644 index 000000000..03efeea48 --- /dev/null +++ b/modules/local/mirna_filtering/main.nf @@ -0,0 +1,24 @@ +process MIRNA_FILTERING { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/r-base:4.2.1' :
+        'biocontainers/r-base:4.2.1' }"
+
+    input:
+    tuple val(meta), path(normalized_counts)
+    val(mirna_min_sample_percentage)
+    val(mirna_min_reads)
+
+    output:
+    tuple val(meta), path("${meta.id}.normalized_counts_filtered.tsv"), emit: filtered
+    path "versions.yml"                                               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    template 'mirna_filtering.R'
+}
diff --git a/modules/local/mirna_filtering/templates/mirna_filtering.R b/modules/local/mirna_filtering/templates/mirna_filtering.R
new file mode 100644
index 000000000..588eb2601
--- /dev/null
+++ b/modules/local/mirna_filtering/templates/mirna_filtering.R
@@ -0,0 +1,51 @@
+#!/usr/bin/env Rscript
+
+expression_norm <- read.table("$normalized_counts",
+    sep = "\\t",
+    header = TRUE,
+    stringsAsFactors = FALSE,
+    check.names = FALSE
+)
+
+samples <- colnames(expression_norm)[-c(1)]
+
+# keep miRNAs with counts >= mirna_min_reads in at least mirna_min_sample_percentage of samples
+if (length(samples) < 5) {
+    stop("Cannot perform filtering on less than 5 samples")
+}
+
+sample_nr_cutoff <- ceiling($mirna_min_sample_percentage * length(samples))
+rows_to_keep <- c()
+
+for (i in seq_len(nrow(expression_norm))) {
+    mirna_per_sample <- 0
+    # column 1 holds the miRNA IDs; columns 2..ncol are the sample counts
+    # (was `5:ncol(...)`, which silently skipped the first three samples)
+    for (j in 2:ncol(expression_norm)) {
+        if (expression_norm[i, j] >= $mirna_min_reads) {
+            mirna_per_sample <- mirna_per_sample + 1
+        }
+    }
+    if (mirna_per_sample >= sample_nr_cutoff) {
+        rows_to_keep <- append(rows_to_keep, i)
+    }
+}
+
+filtered_data <- expression_norm[rows_to_keep, ]
+
+write.table(filtered_data, paste0("${meta.id}.normalized_counts_filtered.tsv"),
+    quote = FALSE, sep = "\\t",
+    row.names = FALSE)
+
+################################################
+################################################
+##              VERSIONS FILE                 ##
+################################################
+################################################
+
+r.version <- strsplit(version[['version.string']], ' ')[[1]][3]
+
+writeLines(
+    c(
+        '"${task.process}":',
+        paste('    r-base:', r.version)
+    ),
+'versions.yml')
diff --git a/modules/local/mirna_targets/main.nf b/modules/local/mirna_targets/main.nf
new file mode 100644
index 000000000..e525a9f5e
--- /dev/null
+++ b/modules/local/mirna_targets/main.nf
@@ -0,0 +1,44 @@
+process MIRNA_TARGETS {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::bedtools=2.30.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bedtools:2.30.0--h7d7f7ad_2':
+        'biocontainers/bedtools:2.30.0--h7d7f7ad_2' }"
+
+    input:
+    tuple val(meta), path(targetscan), path(miranda)
+
+    output:
+    tuple val(meta), path("${prefix}.mirna_targets.txt"), emit: results
+    path "versions.yml"                                 , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    ## reformat and sort miRanda, TargetScan outputs, convert to BED for overlaps.
+    tail -n +2 $targetscan | sort -k1,1 -k4n | awk -v OFS="\\t" '{print \$1, \$2, \$4, \$5, \$9}' | awk -v OFS="\\t" '{print \$2, \$3, \$4, \$1, "0", \$5}' > targetscan.bed
+    tail -n +2 $miranda | sort -k2,2 -k7n | awk -v OFS="\\t" '{print \$2, \$1, \$3, \$4, \$7, \$8}' | awk -v OFS="\\t" '{print \$2, \$5, \$6, \$1, \$3, \$4}' > miranda.bed
+
+    ## intersect, consolidate miRanda, TargetScan information about miRs.
+    ## -wa to output miRanda hits - targetscan makes it difficult to resolve duplicate miRNAs at MRE sites.
+    bedtools intersect -a miranda.bed -b targetscan.bed -wa > ${prefix}.mirnas.tmp
+    bedtools intersect -a targetscan.bed -b miranda.bed | awk '{print \$6}' > mirna_type
+
+    ## remove duplicate miRNA entries at MRE sites.
+    ## strategy: sort by circs, sort by start position, sort by site type - the goal is to take the best site type (i.e rank site type found at MRE site).
+ paste ${prefix}.mirnas.tmp mirna_type | sort -k3n -k2n -k7r | awk -v OFS="\\t" '{print \$4,\$1,\$2,\$3,\$5,\$6,\$7}' | awk -F "\\t" '{if (!seen[\$1,\$2,\$3,\$4,\$5,\$6]++)print}' | sort -k1,1 -k3n > ${prefix}.mirna_targets.tmp + echo -e "circRNA\\tmiRNA\\tStart\\tEnd\\tScore\\tEnergy_KcalMol\\tSite_type" | cat - ${prefix}.mirna_targets.tmp > ${prefix}.mirna_targets.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + awk: \$(awk --version | head -n1 | cut -d' ' -f3 | sed 's/,//g' ) + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/local/psirc/index/environment.yml b/modules/local/psirc/index/environment.yml new file mode 100644 index 000000000..e0303603c --- /dev/null +++ b/modules/local/psirc/index/environment.yml @@ -0,0 +1,6 @@ +name: psirc_index +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::psirc=1.0.0 diff --git a/modules/local/psirc/index/main.nf b/modules/local/psirc/index/main.nf new file mode 100644 index 000000000..c0cae5967 --- /dev/null +++ b/modules/local/psirc/index/main.nf @@ -0,0 +1,26 @@ +process PSIRC_INDEX { + tag "${meta.id}" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/psirc:1.0.0--he1fd2f9_0' : + 'biocontainers/psirc:1.0.0--he1fd2f9_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("psirc.index"), emit: index + path "versions.yml", emit: versions + + script: + """ + psirc-quant index -i psirc.index --make-unique $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + psirc-quant: \$(psirc-quant version | sed -n 's/^psirc-quant, version \\([0-9.]*\\).*\$/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/local/psirc/quant/environment.yml b/modules/local/psirc/quant/environment.yml new file mode 100644 index 000000000..222b4e893 --- /dev/null +++ b/modules/local/psirc/quant/environment.yml @@ -0,0 +1,6 @@ +name: psirc_quant +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::psirc=1.0.0 diff --git a/modules/local/psirc/quant/main.nf b/modules/local/psirc/quant/main.nf new file mode 100644 index 000000000..862e9ccdb --- /dev/null +++ b/modules/local/psirc/quant/main.nf @@ -0,0 +1,33 @@ +process PSIRC_QUANT { + tag "${meta.id}" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/psirc:1.0.0--he1fd2f9_0' : + 'biocontainers/psirc:1.0.0--he1fd2f9_0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + tuple val(meta3), path(gtf) + tuple val(meta4), path(chrom_sizes) + val(bootstrap_samples) + + output: + tuple val(meta), path("${meta.id}"), emit: directory + path "versions.yml" , emit: versions + + script: + def single_end = meta.single_end ? "--single -l 76 -s 20" : "" + def genomebam = gtf ? "--genomebam -g $gtf" : "" + def chromosomes = chrom_sizes ? 
"-c $chrom_sizes" : "" + """ + psirc-quant quant -t $task.cpus -i $index -o $meta.id $single_end $reads -b $bootstrap_samples $genomebam $chromosomes + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + psirc-quant: \$(psirc-quant version | sed -n 's/^psirc-quant, version \\([0-9.]*\\).*\$/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/local/pygtftk/tabulate/environment.yml b/modules/local/pygtftk/tabulate/environment.yml new file mode 100644 index 000000000..cf20b1f94 --- /dev/null +++ b/modules/local/pygtftk/tabulate/environment.yml @@ -0,0 +1,6 @@ +name: pygtftk_tabulate +channels: + - conda-forge + - bioconda +dependencies: + - pygtftk=1.6.2 diff --git a/modules/local/pygtftk/tabulate/main.nf b/modules/local/pygtftk/tabulate/main.nf new file mode 100644 index 000000000..0c150213f --- /dev/null +++ b/modules/local/pygtftk/tabulate/main.nf @@ -0,0 +1,36 @@ +process PYGTFTK_TABULATE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pygtftk:1.6.2--py39h4e691d4_2' : + 'biocontainers/pygtftk:1.6.2--py39h4e691d4_2' }" + + input: + tuple val(meta), path(gtf) + + output: + tuple val(meta), path("$outfile"), emit: table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: gtf.extension + outfile = "${prefix}.${suffix}" + """ + gtftk tabulate \\ + $args \\ + -i $gtf | \\ + grep -v '^tabulate()' > ${outfile} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gtftk: \$(gtftk -v | awk '{print substr(\$2, 2)}') + END_VERSIONS + """ +} diff --git a/modules/local/quantification/merge_experiments/environment.yml b/modules/local/quantification/merge_experiments/environment.yml new file mode 100644 index 000000000..07f95f055 --- /dev/null +++ b/modules/local/quantification/merge_experiments/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "merge_experiments" +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::bioconductor-rtracklayer=1.62.0" diff --git a/modules/local/quantification/merge_experiments/main.nf b/modules/local/quantification/merge_experiments/main.nf new file mode 100644 index 000000000..32fadcd87 --- /dev/null +++ b/modules/local/quantification/merge_experiments/main.nf @@ -0,0 +1,32 @@ +process MERGE_EXPERIMENTS { + tag "$meta.id" + label "process_medium" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bioconductor-rtracklayer:1.62.0--r43ha9d7317_0' : + 'biocontainers/bioconductor-rtracklayer:1.62.0--r43ha9d7317_0' }" + + input: + tuple val(meta), path(experiments) + tuple val(meta2), path(phenotype) + tuple val(meta3), path(gtf) + tuple val(meta4), path(tpm) + + output: + tuple val(meta), path("${meta.id}.merged.rds"), emit: merged + path "versions.yml" , emit: versions + + script: + template "merge_experiments.r" + + stub: + """ + touch ${meta.id}.merged.rds + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-summarizedexperiment: \$(Rscript -e "library(SummarizedExperiment); cat(as.character(packageVersion('SummarizedExperiment')))") + END_VERSIONS + """ +} diff --git a/modules/local/quantification/merge_experiments/templates/merge_experiments.r b/modules/local/quantification/merge_experiments/templates/merge_experiments.r new file mode 100644 index 000000000..4e82e57da --- /dev/null +++ b/modules/local/quantification/merge_experiments/templates/merge_experiments.r @@ -0,0 +1,62 @@ +#!/usr/bin/env Rscript --vanilla + +library(SummarizedExperiment) + +paths <- c('${experiments.join("\', \'")}') +experiments <- lapply(paths, readRDS) + +annotation <- rtracklayer::import('${gtf}') +tpm <- read.table('${tpm}', header=TRUE, row.names=1)[, -1] + +se_assays <- list() + +for (se in experiments) { + assays <- assays(se) + # Iterate over named list of assays + for (assay_name in names(assays)) { + assay <- assays[[assay_name]] + + # Add assay to se_assays for its name + if (is.null(se_assays[[assay_name]])) { + se_assays[[assay_name]] <- assay + } else { + se_assays[[assay_name]] <- cbind(se_assays[[assay_name]], assay) + } + } +} + +se_cbind <- do.call(SummarizedExperiment::cbind, experiments) +se <- SummarizedExperiment(assays = se_assays, colData = colData(se_cbind), rowData = rowData(se_cbind)) + +# Join phenotype data +phenotype_path <- '${phenotype}' +if (file.exists(phenotype_path)) { + 
phenotype <- read.csv(phenotype_path, stringsAsFactors = FALSE) + colData(se) <- merge(colData(se), phenotype, by.x="names", by.y=colnames(phenotype)[1]) +} + +# Convert string columns to factors +for (col in colnames(colData(se))) { + if (is.character(colData(se)[[col]]) && !(col == "names")) { + colData(se)[[col]] <- as.factor(colData(se)[[col]]) + } +} + +rownames(colData(se)) <- colData(se)\$names +colData(se)\$names <- NULL + +# Add transcript annotation +annotation <- annotation[match(rownames(se), annotation\$transcript_id),] +rowData(se) <- annotation + +# Add TPM +assay(se, "tpm", withDimnames = FALSE) <- tpm[rownames(se), rownames(colData(se))] + +saveRDS(se, '${meta.id}.merged.rds') + +writeLines( + c( + '"${task.process}":', + paste(' bioconductor-summarizedexperiment:', packageVersion('SummarizedExperiment')) + ), +'versions.yml') diff --git a/modules/local/quantification/split_types/environment.yml b/modules/local/quantification/split_types/environment.yml new file mode 100644 index 000000000..0c6dab505 --- /dev/null +++ b/modules/local/quantification/split_types/environment.yml @@ -0,0 +1,6 @@ +name: split_types +channels: + - conda-forge + - bioconda +dependencies: + - gawk=5.1.0 diff --git a/modules/local/quantification/split_types/main.nf b/modules/local/quantification/split_types/main.nf new file mode 100644 index 000000000..a95b18af6 --- /dev/null +++ b/modules/local/quantification/split_types/main.nf @@ -0,0 +1,43 @@ +process SPLIT_TYPES { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("linear.tsv") , emit: linear + tuple val(meta), path("circular.tsv"), emit: circular + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + awk -F'\\t' \\ + 'NR==1 {print > "circular.tsv"; print > "linear.tsv"} \\ + NR>1 {if (\$1 ~ /^circ_/) print > "circular.tsv"; else print > "linear.tsv"}' ${input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + + stub: + """ + touch linear.tsv + touch circular.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/seqkit/split/environment.yml b/modules/local/seqkit/split/environment.yml new file mode 100644 index 000000000..d557b8b31 --- /dev/null +++ b/modules/local/seqkit/split/environment.yml @@ -0,0 +1,7 @@ +name: seqkit_split +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::seqkit=2.8.0 diff --git a/modules/local/seqkit/split/main.nf b/modules/local/seqkit/split/main.nf new file mode 100644 index 000000000..1fbc0c8d6 --- /dev/null +++ b/modules/local/seqkit/split/main.nf @@ -0,0 +1,36 @@ +process SEQKIT_SPLIT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqkit:2.8.0--h9ee0642_0' : + 'biocontainers/seqkit:2.8.0--h9ee0642_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("${prefix}/*"), emit: split + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + seqkit \\ + split \\ + $args \\ + --threads $task.cpus \\ + $fasta \\ + --out-dir ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(echo \$(seqkit 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/star/sjdb/main.nf b/modules/local/star/sjdb/main.nf new file mode 100644 index 000000000..835f4e212 --- /dev/null +++ b/modules/local/star/sjdb/main.nf @@ -0,0 +1,32 @@ +process SJDB { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(sjdb) + val(bsj_reads) + + output: + tuple val(meta), path("dataset.SJ.out.tab"), emit: sjtab + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def VERSION = '1.3.4' + """ + mkdir tmp + cat *.tab | awk -v BSJ=${bsj_reads} '(\$7 >= BSJ && \$6==0)' | cut -f1-6 | sort -T ./tmp/ | uniq > dataset.SJ.out.tab + rm -rf tmp + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mawk: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/stringtie/prepde/main.nf b/modules/local/stringtie/prepde/main.nf new file mode 100644 index 000000000..3d24e65f0 --- /dev/null +++ b/modules/local/stringtie/prepde/main.nf @@ -0,0 +1,44 @@ +process STRINGTIE_PREPDE { + tag "$meta.id" + label 'process_low' + + conda "bioconda::stringtie=2.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/stringtie:2.2.1--hecb563c_2' : + 'biocontainers/stringtie:2.2.1--hecb563c_2' }" + + input: + tuple val(meta), val(samples), path(gtfs) + + output: + tuple val(meta), path("${prefix}_transcript_count_matrix.csv") , emit: transcript_matrix + tuple val(meta), path("${prefix}_gene_count_matrix.csv") , emit: gene_matrix + + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + samplesheet = [samples, gtfs] + .transpose() + .collect{ sample, gtf -> + "${sample}\\t${gtf}" }.join('\\n') + transcript_path = "${prefix}_transcript_count_matrix.csv" + gene_path = "${prefix}_gene_count_matrix.csv" + """ + echo -e "${samplesheet}" > samples.txt + + prepDE.py -i samples.txt \\ + -g ${gene_path} \\ + -t ${transcript_path} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stringtie: \$(stringtie --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/local/targetscan/database/main.nf b/modules/local/targetscan/database/main.nf new file mode 100644 index 000000000..8ba3ef71c --- /dev/null +++ b/modules/local/targetscan/database/main.nf @@ -0,0 +1,30 @@ +process TARGETSCAN_DATABASE { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(mature) + + output: + tuple val(meta), path("mature.txt") , emit: mature_txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def VERSION = '1.3.4' + """ + targetscan_format.sh $mature + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mawk: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/targetscan/predict/main.nf b/modules/local/targetscan/predict/main.nf new file mode 100644 index 000000000..450611c2d --- /dev/null +++ b/modules/local/targetscan/predict/main.nf @@ -0,0 +1,38 @@ +process TARGETSCAN { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::targetscan=7.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/targetscan:7.0--pl5321hdfd78af_0' : + 'biocontainers/targetscan:7.0--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(mature_txt) + + output: + tuple val(meta), path("${prefix}.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "7.0" + """ + ##format for targetscan + cat $fasta | grep ">" | sed 's/>//g' > id + cat $fasta | grep -v ">" > seq + paste id seq | awk -v OFS="\\t" '{print \$1, "0000", \$2}' > ${prefix}_ts.txt + # run targetscan + targetscan_70.pl mature.txt ${prefix}_ts.txt ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + awk: \$(awk --version | head -n1 | cut -d' ' -f3 | sed 's/,//g' ) + targetscan: $VERSION + END_VERSIONS + """ +} diff --git a/modules/local/tximeta/tximeta/environment.yml b/modules/local/tximeta/tximeta/environment.yml new file mode 100644 index 000000000..be4bcd30b --- /dev/null +++ 
b/modules/local/tximeta/tximeta/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "tximeta_tximeta" +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::bioconductor-tximeta=1.20.1" diff --git a/modules/local/tximeta/tximeta/main.nf b/modules/local/tximeta/tximeta/main.nf new file mode 100644 index 000000000..c1be1d37d --- /dev/null +++ b/modules/local/tximeta/tximeta/main.nf @@ -0,0 +1,34 @@ +process TXIMETA_TXIMETA { + tag "$meta.id" + label "process_medium" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-tximeta%3A1.20.1--r43hdfd78af_1' : + 'biocontainers/bioconductor-tximeta:1.20.1--r43hdfd78af_1' }" + + input: + tuple val(meta), path("quants/*") + val quant_type + + output: + tuple val(meta), path("*.rds"), emit: se + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: meta.id + template 'tximeta.r' + + stub: + """ + touch ${meta.id}.rds + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-tximeta: \$(Rscript -e "library(tximeta); cat(as.character(packageVersion('tximeta')))") + END_VERSIONS + """ +} diff --git a/modules/local/tximeta/tximeta/templates/tximeta.r b/modules/local/tximeta/tximeta/templates/tximeta.r new file mode 100755 index 000000000..878ada020 --- /dev/null +++ b/modules/local/tximeta/tximeta/templates/tximeta.r @@ -0,0 +1,66 @@ +#!/usr/bin/env Rscript --vanilla + +# Script for importing and processing transcript-level quantifications. +# Written by Lorena Pantano, later modified by Jonathan Manning, and released +# under the MIT license. 
+ +# Loading required libraries +library(tximeta) + +################################################ +################################################ +## Main script starts here ## +################################################ +################################################ + +# Define pattern for file names based on quantification type +pattern <- ifelse('$quant_type' == "kallisto", + ifelse(length(list.files('quants', pattern = "abundance.h5", recursive = T, full.names = T)) != 0, + "abundance.h5", + "abundance.tsv"), + "quant.sf") + +fns <- list.files('quants', pattern = pattern, recursive = T, full.names = T) +names <- basename(dirname(fns)) +names(fns) <- names + +coldata <- data.frame(files = fns, names = names) +rownames(coldata) <- coldata[["names"]] + +# Import transcript-level quantifications +se <- tximeta(coldata, type = '$quant_type', txOut = TRUE) + +# Save summarized experiment to file +saveRDS(se, file = paste0('$prefix', '.rds')) + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink(paste("R_sessionInfo.log", sep = '.')) +citation("tximeta") +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +tximeta.version <- as.character(packageVersion('tximeta')) + +writeLines( + c( + '"${task.process}":', + paste(' bioconductor-tximeta:', tximeta.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git 
a/modules/nf-core/bedtools/getfasta/environment.yml b/modules/nf-core/bedtools/getfasta/environment.yml new file mode 100644 index 000000000..a89401f2a --- /dev/null +++ b/modules/nf-core/bedtools/getfasta/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_getfasta +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/getfasta/main.nf b/modules/nf-core/bedtools/getfasta/main.nf new file mode 100644 index 000000000..b316117d4 --- /dev/null +++ b/modules/nf-core/bedtools/getfasta/main.nf @@ -0,0 +1,50 @@ +process BEDTOOLS_GETFASTA { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(bed) + path fasta + + output: + tuple val(meta), path("*.fa"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$fasta" == "${prefix}.fa") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + bedtools \\ + getfasta \\ + $args \\ + -fi $fasta \\ + -bed $bed \\ + -fo ${prefix}.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$fasta" == "${prefix}.fa") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + touch ${prefix}.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/getfasta/meta.yml b/modules/nf-core/bedtools/getfasta/meta.yml new file mode 100644 index 000000000..41917fe3f --- /dev/null +++ b/modules/nf-core/bedtools/getfasta/meta.yml @@ -0,0 +1,46 @@ +name: bedtools_getfasta +description: extract sequences in a FASTA file based on intervals defined in a feature file. +keywords: + - bed + - fasta + - getfasta +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/getfasta.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: Bed feature file + pattern: "*.{bed}" + - fasta: + type: file + description: Input fasta file + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: Output fasta file with extracted sequences + pattern: "*.{fa}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bedtools/getfasta/tests/main.nf.test b/modules/nf-core/bedtools/getfasta/tests/main.nf.test new file mode 100644 index 000000000..4da7552c8 --- /dev/null +++ b/modules/nf-core/bedtools/getfasta/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process BEDTOOLS_GETFASTA" + script "../main.nf" + process "BEDTOOLS_GETFASTA" + + tag "modules" + tag "modules_nfcore" + tag "bedtools" + tag "bedtools/getfasta" + + test("sarscov2 - bed - fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false], + file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true), + ] + + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bed - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false], + file(params.test_data['sarscov2']['genome']['test_bed'], checkIfExists: true), + ] + + input[1] = file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bedtools/getfasta/tests/main.nf.test.snap b/modules/nf-core/bedtools/getfasta/tests/main.nf.test.snap new file mode 100644 index 000000000..69bf33f74 --- /dev/null +++ b/modules/nf-core/bedtools/getfasta/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2 - bed - fasta": { + "content": [ + { + "0": [ 
+ [ + { + "id": "test", + "single_end": false + }, + "test.fa:md5,41c3a45a57a16c04f828d8f8bb52df70" + ] + ], + "1": [ + "versions.yml:md5,427b4f64b2f05f28f0beef96c9f0d310" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa:md5,41c3a45a57a16c04f828d8f8bb52df70" + ] + ], + "versions": [ + "versions.yml:md5,427b4f64b2f05f28f0beef96c9f0d310" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T14:16:19.383758985" + }, + "sarscov2 - bed - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,427b4f64b2f05f28f0beef96c9f0d310" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,427b4f64b2f05f28f0beef96c9f0d310" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-03T14:16:47.47010536" + } +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/getfasta/tests/tags.yml b/modules/nf-core/bedtools/getfasta/tests/tags.yml new file mode 100644 index 000000000..42ec3026c --- /dev/null +++ b/modules/nf-core/bedtools/getfasta/tests/tags.yml @@ -0,0 +1,2 @@ +bedtools/getfasta: + - "modules/nf-core/bedtools/getfasta/**" diff --git a/modules/nf-core/bedtools/groupby/environment.yml b/modules/nf-core/bedtools/groupby/environment.yml new file mode 100644 index 000000000..dab99ea1f --- /dev/null +++ b/modules/nf-core/bedtools/groupby/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_groupby +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/groupby/main.nf b/modules/nf-core/bedtools/groupby/main.nf new file mode 100644 index 000000000..063e7ba2a --- /dev/null +++ b/modules/nf-core/bedtools/groupby/main.nf @@ -0,0 +1,50 @@ +process 
BEDTOOLS_GROUPBY { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(bed) + val(summary_col) + + output: + tuple val(meta), path('*.bed'), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}.grouped" + def summary_col = task.ext.summary_col ? "-c ${task.ext.summary_col}" : "-c 5" + if ("$bed" == "${prefix}.bed") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + bedtools \\ + groupby \\ + -i $bed \\ + ${summary_col} \\ + $args \\ + > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/groupby/meta.yml b/modules/nf-core/bedtools/groupby/meta.yml new file mode 100644 index 000000000..bcbc561a0 --- /dev/null +++ b/modules/nf-core/bedtools/groupby/meta.yml @@ -0,0 +1,47 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: bedtools_groupby +description: Groups features in a BED file by given column(s) and computes summary statistics for each group to another column. 
+keywords: + - bed + - groupby + - bedtools +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/groupby.html + homepage: https://bedtools.readthedocs.io/en/latest/ + doi: 10.1093/bioinformatics/btq033 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - bed: + type: file + description: Input BED file + pattern: "*.{bed}" + - summary_column: + type: integer + description: Column to be summarized (1-based) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - bed: + type: file + description: Grouped by bed file with combined features + pattern: "*.{bed}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@mashehu" +maintainers: + - "@mashehu" diff --git a/modules/nf-core/bedtools/intersect/environment.yml b/modules/nf-core/bedtools/intersect/environment.yml new file mode 100644 index 000000000..2a3430508 --- /dev/null +++ b/modules/nf-core/bedtools/intersect/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_intersect +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/intersect/main.nf b/modules/nf-core/bedtools/intersect/main.nf new file mode 100644 index 000000000..d9e79e7fa --- /dev/null +++ b/modules/nf-core/bedtools/intersect/main.nf @@ -0,0 +1,59 @@ +process BEDTOOLS_INTERSECT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(intervals1), path(intervals2) + tuple val(meta2), path(chrom_sizes) + + output: + tuple val(meta), path("*.${extension}"), emit: intersect + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + //Extension of the output file. It is set by the user via "ext.suffix" in the config. Corresponds to the file format which depends on arguments (e. g., ".bed", ".bam", ".txt", etc.). + extension = task.ext.suffix ?: "${intervals1.extension}" + def sizes = chrom_sizes ? "-g ${chrom_sizes}" : '' + if ("$intervals1" == "${prefix}.${extension}" || + "$intervals2" == "${prefix}.${extension}") + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + bedtools \\ + intersect \\ + -a $intervals1 \\ + -b $intervals2 \\ + $args \\ + $sizes \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + extension = task.ext.suffix ?: "bed" + if ("$intervals1" == "${prefix}.${extension}" || + "$intervals2" == "${prefix}.${extension}") + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/intersect/meta.yml b/modules/nf-core/bedtools/intersect/meta.yml new file mode 100644 index 000000000..0939cb54a --- /dev/null +++ b/modules/nf-core/bedtools/intersect/meta.yml @@ -0,0 +1,59 @@ +name: bedtools_intersect +description: Allows one to screen for overlaps between two sets of genomic features. +keywords: + - bed + - intersect + - overlap +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/intersect.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intervals1: + type: file + description: BAM/BED/GFF/VCF + pattern: "*.{bam|bed|gff|vcf}" + - intervals2: + type: file + description: BAM/BED/GFF/VCF + pattern: "*.{bam|bed|gff|vcf}" + - meta2: + type: map + description: | + Groovy Map containing reference chromosome sizes + e.g. [ id:'test' ] + - chrom_sizes: + type: file + description: Chromosome sizes file + pattern: "*{.sizes,.txt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - intersect: + type: file + description: File containing the description of overlaps found between the two features + pattern: "*.${extension}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" +maintainers: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@sidorov-si" diff --git a/modules/nf-core/bedtools/sort/environment.yml b/modules/nf-core/bedtools/sort/environment.yml new file mode 100644 index 000000000..87b2e4252 --- /dev/null +++ b/modules/nf-core/bedtools/sort/environment.yml @@ -0,0 +1,7 @@ +name: bedtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bedtools=2.31.1 diff --git a/modules/nf-core/bedtools/sort/main.nf b/modules/nf-core/bedtools/sort/main.nf new file mode 100644 index 000000000..b833150a1 --- /dev/null +++ b/modules/nf-core/bedtools/sort/main.nf @@ -0,0 +1,54 @@ +process BEDTOOLS_SORT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bedtools:2.31.1--hf5e1c6e_0' : + 'biocontainers/bedtools:2.31.1--hf5e1c6e_0' }" + + input: + tuple val(meta), path(intervals) + path genome_file + + output: + tuple val(meta), path("*.${extension}"), emit: sorted + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def genome_cmd = genome_file ? "-g $genome_file" : "" + extension = task.ext.suffix ?: intervals.extension + if ("$intervals" == "${prefix}.${extension}") { + error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ } + """ + bedtools \\ + sort \\ + -i $intervals \\ + $genome_cmd \\ + $args \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + extension = task.ext.suffix ?: intervals.extension + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bedtools/sort/meta.yml b/modules/nf-core/bedtools/sort/meta.yml new file mode 100644 index 000000000..7c915f5f9 --- /dev/null +++ b/modules/nf-core/bedtools/sort/meta.yml @@ -0,0 +1,54 @@ +name: bedtools_sort +description: Sorts a feature file by chromosome and other criteria. +keywords: + - bed + - sort + - bedtools + - chromosome +tools: + - bedtools: + description: | + A set of tools for genomic analysis tasks, specifically enabling genome arithmetic (merge, count, complement) on various file types. + documentation: https://bedtools.readthedocs.io/en/latest/content/tools/sort.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intervals: + type: file + description: BED/BEDGRAPH + pattern: "*.{bed|bedGraph}" + - genome_file: + type: file + description: | + Optional reference genome 2 column file that defines the expected chromosome order. + pattern: "*.{fai,txt,chromsizes}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - sorted: + type: file + description: Sorted output file + pattern: "*.${extension}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@chris-cheshire" + - "@adamrtalbot" +maintainers: + - "@edmundmiller" + - "@sruthipsuresh" + - "@drpatelh" + - "@chris-cheshire" + - "@adamrtalbot" diff --git a/modules/nf-core/bedtools/sort/tests/main.nf.test b/modules/nf-core/bedtools/sort/tests/main.nf.test new file mode 100644 index 000000000..b1f36dd91 --- /dev/null +++ b/modules/nf-core/bedtools/sort/tests/main.nf.test @@ -0,0 +1,58 @@ +nextflow_process { + + name "Test Process BEDTOOLS_SORT" + script "../main.nf" + config "./nextflow.config" + process "BEDTOOLS_SORT" + + tag "modules" + tag "modules_nfcore" + tag "bedtools" + tag "bedtools/sort" + + test("test_bedtools_sort") { + + when { + process { + """ + input[0] = [ [ id:'test'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + + test("test_bedtools_sort_with_genome") { + + when { + process { + """ + input[0] = [ [ id:'test'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/bed/test.bed', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/sort/tests/main.nf.test.snap b/modules/nf-core/bedtools/sort/tests/main.nf.test.snap new file mode 100644 index 000000000..f10e8b984 --- /dev/null +++ b/modules/nf-core/bedtools/sort/tests/main.nf.test.snap @@ -0,0 +1,68 @@ 
+{ + "test_bedtools_sort_with_genome": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_out.testtext:md5,fe4053cf4de3aebbdfc3be2efb125a74" + ] + ], + "1": [ + "versions.yml:md5,cdbae2c7ebc41e534aaf0835779061f8" + ], + "sorted": [ + [ + { + "id": "test" + }, + "test_out.testtext:md5,fe4053cf4de3aebbdfc3be2efb125a74" + ] + ], + "versions": [ + "versions.yml:md5,cdbae2c7ebc41e534aaf0835779061f8" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T10:13:11.830452" + }, + "test_bedtools_sort": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_out.testtext:md5,fe4053cf4de3aebbdfc3be2efb125a74" + ] + ], + "1": [ + "versions.yml:md5,cdbae2c7ebc41e534aaf0835779061f8" + ], + "sorted": [ + [ + { + "id": "test" + }, + "test_out.testtext:md5,fe4053cf4de3aebbdfc3be2efb125a74" + ] + ], + "versions": [ + "versions.yml:md5,cdbae2c7ebc41e534aaf0835779061f8" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-19T10:16:40.535947" + } +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/sort/tests/nextflow.config b/modules/nf-core/bedtools/sort/tests/nextflow.config new file mode 100644 index 000000000..f203c99c5 --- /dev/null +++ b/modules/nf-core/bedtools/sort/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + + withName: BEDTOOLS_SORT { + ext.prefix = { "${meta.id}_out" } + ext.suffix = "testtext" + } + +} \ No newline at end of file diff --git a/modules/nf-core/bedtools/sort/tests/tags.yml b/modules/nf-core/bedtools/sort/tests/tags.yml new file mode 100644 index 000000000..47c85eead --- /dev/null +++ b/modules/nf-core/bedtools/sort/tests/tags.yml @@ -0,0 +1,2 @@ +bedtools/sort: + - "modules/nf-core/bedtools/sort/**" diff --git a/modules/nf-core/bioawk/bioawk.diff b/modules/nf-core/bioawk/bioawk.diff new file mode 100644 index 000000000..1303738f2 --- /dev/null +++ b/modules/nf-core/bioawk/bioawk.diff @@ -0,0 +1,24 @@ +Changes in module 
'nf-core/bioawk' +--- modules/nf-core/bioawk/main.nf ++++ modules/nf-core/bioawk/main.nf +@@ -20,15 +20,15 @@ + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + prefix = task.ext.prefix ?: "${meta.id}" ++ suffix = task.ext.suffix ?: input.extension ++ file_name = "${prefix}.${suffix}" + + def VERSION = '1.0' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + bioawk \\ + $args \\ + $input \\ +- > ${prefix} +- +- gzip ${prefix} ++ > ${file_name} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/bioawk/environment.yml b/modules/nf-core/bioawk/environment.yml new file mode 100644 index 000000000..5fdfd4176 --- /dev/null +++ b/modules/nf-core/bioawk/environment.yml @@ -0,0 +1,7 @@ +name: bioawk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bioawk=1.0 diff --git a/modules/nf-core/bioawk/main.nf b/modules/nf-core/bioawk/main.nf new file mode 100644 index 000000000..0fded517a --- /dev/null +++ b/modules/nf-core/bioawk/main.nf @@ -0,0 +1,37 @@ +process BIOAWK { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bioawk:1.0--h5bf99c6_6': + 'biocontainers/bioawk:1.0--h5bf99c6_6' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: input.extension + + def VERSION = '1.0' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + bioawk \\ + $args \\ + $input \\ + > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioawk: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/bioawk/meta.yml b/modules/nf-core/bioawk/meta.yml new file mode 100644 index 000000000..c9d001118 --- /dev/null +++ b/modules/nf-core/bioawk/meta.yml @@ -0,0 +1,46 @@ +name: "bioawk" +description: Bioawk is an extension to Brian Kernighan's awk, adding the support of several common biological data formats. +keywords: + - bioawk + - fastq + - fasta + - sam + - file manipulation + - awk +tools: + - "bioawk": + description: "BWK awk modified for biological data" + homepage: "https://github.com/lh3/bioawk" + documentation: "https://github.com/lh3/bioawk" + tool_dev_url: "https://github.com/lh3/bioawk" + licence: "['Free software license (https://github.com/lh3/bioawk/blob/master/README.awk#L1)']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Input sequence biological sequence file (optionally gzipped) to be manipulated via program specified in `$args`. 
+ pattern: "*.{bed,gff,sam,vcf,fastq,fasta,tab,bed.gz,gff.gz,sam.gz,vcf.gz,fastq.gz,fasta.gz,tab.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: | + Manipulated and gzipped version of input sequence file following program specified in `args`. + File name will be what is specified in `$prefix`. Do not include `.gz` suffix in `$prefix`! Output files` will be gzipped for you! + pattern: "*.gz" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/bowtie/align/environment.yml b/modules/nf-core/bowtie/align/environment.yml new file mode 100644 index 000000000..2617e6f0a --- /dev/null +++ b/modules/nf-core/bowtie/align/environment.yml @@ -0,0 +1,7 @@ +name: bowtie_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bowtie=1.3.0 diff --git a/modules/nf-core/bowtie/align/main.nf b/modules/nf-core/bowtie/align/main.nf new file mode 100644 index 000000000..29e9cd533 --- /dev/null +++ b/modules/nf-core/bowtie/align/main.nf @@ -0,0 +1,56 @@ +process BOWTIE_ALIGN { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-ffbf83a6b0ab6ec567a336cf349b80637135bca3:c84c7c55c45af231883d9ff4fe706ac44c479c36-0' : + 'biocontainers/mulled-v2-ffbf83a6b0ab6ec567a336cf349b80637135bca3:c84c7c55c45af231883d9ff4fe706ac44c479c36-0' }" + + input: + tuple val(meta), path(reads) + path index + + output: + tuple val(meta), path('*.bam'), emit: bam + tuple val(meta), path('*.out'), emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*fastq.gz'), optional:true, emit: fastq + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def unaligned = params.save_unaligned ? "--un ${prefix}.unmapped.fastq" : '' + def endedness = meta.single_end ? "$reads" : "-1 ${reads[0]} -2 ${reads[1]}" + """ + INDEX=`find -L ./ -name "*.3.ebwt" | sed 's/\\.3.ebwt\$//'` + bowtie \\ + --threads $task.cpus \\ + --sam \\ + -x \$INDEX \\ + -q \\ + $unaligned \\ + $args \\ + $endedness \\ + 2> >(tee ${prefix}.out >&2) \\ + | samtools view $args2 -@ $task.cpus -bS -o ${prefix}.bam - + + if [ -f ${prefix}.unmapped.fastq ]; then + gzip ${prefix}.unmapped.fastq + fi + if [ -f ${prefix}.unmapped_1.fastq ]; then + gzip ${prefix}.unmapped_1.fastq + gzip ${prefix}.unmapped_2.fastq + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie: \$(echo \$(bowtie --version 2>&1) | sed 's/^.*bowtie-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bowtie/align/meta.yml b/modules/nf-core/bowtie/align/meta.yml new file mode 100644 index 000000000..89eaedd6c --- /dev/null +++ b/modules/nf-core/bowtie/align/meta.yml @@ -0,0 +1,50 @@ +name: bowtie_align +description: Align reads to a reference genome using bowtie +keywords: + - align + - map + - fastq + - fasta + - genome + - reference +tools: + - bowtie: + 
description: | + bowtie is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bowtie-bio.sourceforge.net/index.shtml + documentation: http://bowtie-bio.sourceforge.net/manual.shtml + arxiv: arXiv:1303.3997 + licence: ["Artistic-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - index: + type: file + description: Bowtie genome index files + pattern: "*.ebwt" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Unaligned FastQ files + pattern: "*.fastq.gz" +authors: + - "@kevinmenden" +maintainers: + - "@kevinmenden" diff --git a/modules/nf-core/bowtie/build/environment.yml b/modules/nf-core/bowtie/build/environment.yml new file mode 100644 index 000000000..0907b0f84 --- /dev/null +++ b/modules/nf-core/bowtie/build/environment.yml @@ -0,0 +1,7 @@ +name: bowtie_build +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bowtie=1.3.0 diff --git a/modules/nf-core/bowtie/build/main.nf b/modules/nf-core/bowtie/build/main.nf new file mode 100644 index 000000000..05e22fe8c --- /dev/null +++ b/modules/nf-core/bowtie/build/main.nf @@ -0,0 +1,30 @@ +process BOWTIE_BUILD { + tag "$fasta" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bowtie:1.3.0--py38hed8969a_1' : + 'biocontainers/bowtie:1.3.0--py38hed8969a_1' }" + + input: + path fasta + + output: + path 'bowtie' , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + mkdir bowtie + bowtie-build --threads $task.cpus $fasta bowtie/${fasta.baseName} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie: \$(echo \$(bowtie --version 2>&1) | sed 's/^.*bowtie-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bowtie/build/meta.yml b/modules/nf-core/bowtie/build/meta.yml new file mode 100644 index 000000000..262855f42 --- /dev/null +++ b/modules/nf-core/bowtie/build/meta.yml @@ -0,0 +1,35 @@ +name: bowtie_build +description: Create bowtie index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bowtie: + description: | + bowtie is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. 
+ homepage: http://bowtie-bio.sourceforge.net/index.shtml + documentation: http://bowtie-bio.sourceforge.net/manual.shtml + arxiv: arXiv:1303.3997 + licence: ["Artistic-2.0"] +input: + - fasta: + type: file + description: Input genome fasta file +output: + - index: + type: file + description: Bowtie genome index files + pattern: "*.ebwt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" + - "@drpatelh" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/bowtie2/align/environment.yml b/modules/nf-core/bowtie2/align/environment.yml new file mode 100644 index 000000000..d2796359a --- /dev/null +++ b/modules/nf-core/bowtie2/align/environment.yml @@ -0,0 +1,9 @@ +name: bowtie2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bowtie2=2.5.2 + - bioconda::samtools=1.18 + - conda-forge::pigz=2.6 diff --git a/modules/nf-core/bowtie2/align/main.nf b/modules/nf-core/bowtie2/align/main.nf new file mode 100644 index 000000000..809525ad3 --- /dev/null +++ b/modules/nf-core/bowtie2/align/main.nf @@ -0,0 +1,117 @@ +process BOWTIE2_ALIGN { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:f70b31a2db15c023d641c32f433fb02cd04df5a6-0' : + 'biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:f70b31a2db15c023d641c32f433fb02cd04df5a6-0' }" + + input: + tuple val(meta) , path(reads) + tuple val(meta2), path(index) + tuple val(meta3), path(fasta) + val save_unaligned + val sort_bam + + output: + tuple val(meta), path("*.sam") , emit: sam , optional:true + tuple val(meta), path("*.bam") , emit: bam , optional:true + tuple val(meta), path("*.cram") , emit: cram , optional:true + tuple val(meta), path("*.csi") , emit: csi , optional:true + tuple val(meta), path("*.crai") , emit: crai , optional:true + tuple val(meta), path("*.log") , emit: log + tuple val(meta), path("*fastq.gz") , emit: fastq , optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def args2 = task.ext.args2 ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + + def unaligned = "" + def reads_args = "" + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : "" + reads_args = "-U ${reads}" + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : "" + reads_args = "-1 ${reads[0]} -2 ${reads[1]}" + } + + def samtools_command = sort_bam ? 'sort' : 'view' + def extension_pattern = /(--output-fmt|-O)+\s+(\S+)/ + def extension_matcher = (args2 =~ extension_pattern) + def extension = extension_matcher.getCount() > 0 ? extension_matcher[0][2].toLowerCase() : "bam" + def reference = fasta && extension=="cram" ? 
"--reference ${fasta}" : "" + if (!fasta && extension=="cram") error "Fasta reference is required for CRAM output" + + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"` + [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1 + + bowtie2 \\ + -x \$INDEX \\ + $reads_args \\ + --threads $task.cpus \\ + $unaligned \\ + $args \\ + 2> >(tee ${prefix}.bowtie2.log >&2) \\ + | samtools $samtools_command $args2 --threads $task.cpus ${reference} -o ${prefix}.${extension} - + + if [ -f ${prefix}.unmapped.fastq.1.gz ]; then + mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz + fi + + if [ -f ${prefix}.unmapped.fastq.2.gz ]; then + mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def args2 = task.ext.args2 ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def extension_pattern = /(--output-fmt|-O)+\s+(\S+)/ + def extension = (args2 ==~ extension_pattern) ? (args2 =~ extension_pattern)[0][2].toLowerCase() : "bam" + def create_unmapped = "" + if (meta.single_end) { + create_unmapped = save_unaligned ? "touch ${prefix}.unmapped.fastq.gz" : "" + } else { + create_unmapped = save_unaligned ? "touch ${prefix}.unmapped_1.fastq.gz && touch ${prefix}.unmapped_2.fastq.gz" : "" + } + def reference = fasta && extension=="cram" ? 
"--reference ${fasta}" : "" + if (!fasta && extension=="cram") error "Fasta reference is required for CRAM output" + + def create_index = "" + if (extension == "cram") { + create_index = "touch ${prefix}.crai" + } else if (extension == "bam") { + create_index = "touch ${prefix}.csi" + } + + """ + touch ${prefix}.${extension} + ${create_index} + touch ${prefix}.bowtie2.log + ${create_unmapped} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + +} diff --git a/modules/nf-core/bowtie2/align/meta.yml b/modules/nf-core/bowtie2/align/meta.yml new file mode 100644 index 000000000..38610e0ed --- /dev/null +++ b/modules/nf-core/bowtie2/align/meta.yml @@ -0,0 +1,95 @@ +name: bowtie2_align +description: Align reads to a reference genome using bowtie2 +keywords: + - align + - map + - fasta + - fastq + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test', single_end:false ]
+  - index:
+      type: file
+      description: Bowtie2 genome index files
+      pattern: "*.bt2"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Bowtie2 genome fasta file
+      pattern: "*.fasta"
+  - save_unaligned:
+      type: boolean
+      description: |
+        Save reads that do not map to the reference (true) or discard them (false)
+        (default: false)
+  - sort_bam:
+      type: boolean
+      description: use samtools sort (true) or samtools view (false)
+      pattern: "true or false"
+output:
+  - sam:
+      type: file
+      description: Output SAM file containing read alignments
+      pattern: "*.sam"
+  - bam:
+      type: file
+      description: Output BAM file containing read alignments
+      pattern: "*.bam"
+  - cram:
+      type: file
+      description: Output CRAM file containing read alignments
+      pattern: "*.cram"
+  - csi:
+      type: file
+      description: Output SAM/BAM index for large inputs
+      pattern: "*.csi"
+  - crai:
+      type: file
+      description: Output CRAM index
+      pattern: "*.crai"
+  - log:
+      type: file
+      description: Alignment log
+      pattern: "*.log"
+  - fastq:
+      type: file
+      description: Unaligned FastQ files
+      pattern: "*.fastq.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
+maintainers:
+  - "@joseespinosa"
+  - "@drpatelh"
diff --git a/modules/nf-core/bowtie2/align/tests/cram_crai.config b/modules/nf-core/bowtie2/align/tests/cram_crai.config
new file mode 100644
index 000000000..03f1d5e51
--- /dev/null
+++ b/modules/nf-core/bowtie2/align/tests/cram_crai.config
@@ -0,0 +1,5 @@
+process {
+    withName: BOWTIE2_ALIGN {
+        ext.args2 = '--output-fmt cram --write-index'
+    }
+}
diff --git a/modules/nf-core/bowtie2/align/tests/large_index.config b/modules/nf-core/bowtie2/align/tests/large_index.config
new file mode 100644
index 000000000..fdc1c59dd
--- /dev/null
+++ 
b/modules/nf-core/bowtie2/align/tests/large_index.config @@ -0,0 +1,5 @@ +process { + withName: BOWTIE2_BUILD { + ext.args = '--large-index' + } +} \ No newline at end of file diff --git a/modules/nf-core/bowtie2/align/tests/main.nf.test b/modules/nf-core/bowtie2/align/tests/main.nf.test new file mode 100644 index 000000000..03aeaf9ee --- /dev/null +++ b/modules/nf-core/bowtie2/align/tests/main.nf.test @@ -0,0 +1,623 @@ +nextflow_process { + + name "Test Process BOWTIE2_ALIGN" + script "../main.nf" + process "BOWTIE2_ALIGN" + tag "modules" + tag "modules_nfcore" + tag "bowtie2" + tag "bowtie2/build" + tag "bowtie2/align" + + test("sarscov2 - fastq, index, fasta, false, false - bam") { + + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, index, fasta, false, false - sam") { + + config "./sam.config" + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], 
checkIfExists: true) + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.sam[0][1]).readLines()[0..4], + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, index, fasta, false, false - sam2") { + + config "./sam2.config" + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.sam[0][1]).readLines()[0..4], + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, index, fasta, false, true - bam") { + + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], 
checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], index, fasta, false, false - bam") { + + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], index, fasta, false, true - bam") { + + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], 
file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, large_index, fasta, false, false - bam") { + + config "./large_index.config" + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], large_index, fasta, false, false - bam") { + + config "./large_index.config" + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], 
file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], index, fasta, true, false - bam") { + + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, index, fasta, true, false - bam") { + + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = 
false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.log, + process.out.fastq, + process.out.versions + ).match() } + + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], index, fasta, true, true - cram") { + + config "./cram_crai.config" + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = true //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.cram[0][1]).name, + file(process.out.crai[0][1]).name + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], index, fasta, false, false - stub") { + + options "-stub" + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], 
file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.csi[0][1]).name, + file(process.out.log[0][1]).name, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, index, fasta, true, false - stub") { + + options "-stub" + setup { + run("BOWTIE2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = [ + [ id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = BOWTIE2_BUILD.out.index + input[2] = [[ id:'test'], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)] + input[3] = false //save_unaligned + input[4] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.csi[0][1]).name, + file(process.out.log[0][1]).name, + process.out.fastq, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bowtie2/align/tests/main.nf.test.snap b/modules/nf-core/bowtie2/align/tests/main.nf.test.snap new file mode 100644 index 000000000..028e7da68 --- /dev/null +++ b/modules/nf-core/bowtie2/align/tests/main.nf.test.snap @@ -0,0 +1,311 @@ +{ + "sarscov2 - [fastq1, fastq2], large_index, fasta, false, false - bam": { + "content": [ + "test.bam", + [ + [ + { + "id": "test", + "single_end": false + }, + "test.bowtie2.log:md5,bd89ce1b28c93bf822bae391ffcedd19" + ] + ], + [ + + ], + [ + "versions.yml:md5,01d18ab035146ea790e9a0f70adb758f" + ] + ], + "meta": { + "nf-test": "0.8.4", + 
"nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T13:19:25.337323" + }, + "sarscov2 - fastq, index, fasta, false, false - sam2": { + "content": [ + [ + "ERR5069949.2151832\t16\tMT192765.1\t17453\t42\t150M\t*\t0\t0\tACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGA\tAAAA versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + mkdir bowtie2 + touch bowtie2/${fasta.baseName}.{1..4}.bt2 + touch bowtie2/${fasta.baseName}.rev.{1,2}.bt2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bowtie2/build/meta.yml b/modules/nf-core/bowtie2/build/meta.yml new file mode 100644 index 000000000..2d6879919 --- /dev/null +++ b/modules/nf-core/bowtie2/build/meta.yml @@ -0,0 +1,46 @@ +name: bowtie2_build +description: Builds bowtie index for reference genome +keywords: + - build + - index + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test', single_end:false ] + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.bt2" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bowtie2/build/tests/main.nf.test b/modules/nf-core/bowtie2/build/tests/main.nf.test new file mode 100644 index 000000000..163760257 --- /dev/null +++ b/modules/nf-core/bowtie2/build/tests/main.nf.test @@ -0,0 +1,31 @@ +nextflow_process { + + name "Test Process BOWTIE2_BUILD" + script "modules/nf-core/bowtie2/build/main.nf" + process "BOWTIE2_BUILD" + tag "modules" + tag "modules_nfcore" + tag "bowtie2" + tag "bowtie2/build" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/modules/nf-core/bowtie2/build/tests/main.nf.test.snap b/modules/nf-core/bowtie2/build/tests/main.nf.test.snap new file mode 100644 index 000000000..6875e0213 --- /dev/null +++ b/modules/nf-core/bowtie2/build/tests/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "genome.1.bt2:md5,cbe3d0bbea55bc57c99b4bfa25b5fbdf", + "genome.2.bt2:md5,47b153cd1319abc88dda532462651fcf", + "genome.3.bt2:md5,4ed93abba181d8dfab2e303e33114777", + "genome.4.bt2:md5,c25be5f8b0378abf7a58c8a880b87626", + "genome.rev.1.bt2:md5,52be6950579598a990570fbcf5372184", + "genome.rev.2.bt2:md5,e3b4ef343dea4dd571642010a7d09597" + ] + ] + ], + "1": [ + "versions.yml:md5,1df11e9b82891527271c889c880d3974" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "genome.1.bt2:md5,cbe3d0bbea55bc57c99b4bfa25b5fbdf", + "genome.2.bt2:md5,47b153cd1319abc88dda532462651fcf", 
+ "genome.3.bt2:md5,4ed93abba181d8dfab2e303e33114777", + "genome.4.bt2:md5,c25be5f8b0378abf7a58c8a880b87626", + "genome.rev.1.bt2:md5,52be6950579598a990570fbcf5372184", + "genome.rev.2.bt2:md5,e3b4ef343dea4dd571642010a7d09597" + ] + ] + ], + "versions": [ + "versions.yml:md5,1df11e9b82891527271c889c880d3974" + ] + } + ], + "timestamp": "2023-11-23T11:51:01.107681997" + } +} \ No newline at end of file diff --git a/modules/nf-core/bowtie2/build/tests/tags.yml b/modules/nf-core/bowtie2/build/tests/tags.yml new file mode 100644 index 000000000..81aa61dab --- /dev/null +++ b/modules/nf-core/bowtie2/build/tests/tags.yml @@ -0,0 +1,2 @@ +bowtie2/build: + - modules/nf-core/bowtie2/build/** diff --git a/modules/nf-core/bwa/index/environment.yml b/modules/nf-core/bwa/index/environment.yml new file mode 100644 index 000000000..126e00344 --- /dev/null +++ b/modules/nf-core/bwa/index/environment.yml @@ -0,0 +1,7 @@ +name: bwa_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bwa=0.7.18 diff --git a/modules/nf-core/bwa/index/main.nf b/modules/nf-core/bwa/index/main.nf new file mode 100644 index 000000000..2e48b6caa --- /dev/null +++ b/modules/nf-core/bwa/index/main.nf @@ -0,0 +1,53 @@ +process BWA_INDEX { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bwa:0.7.18--he4a0461_0' : + 'biocontainers/bwa:0.7.18--he4a0461_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path(bwa) , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + def args = task.ext.args ?: '' + """ + mkdir bwa + bwa \\ + index \\ + $args \\ + -p bwa/${prefix} \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + """ + mkdir bwa + + touch bwa/${prefix}.amb + touch bwa/${prefix}.ann + touch bwa/${prefix}.bwt + touch bwa/${prefix}.pac + touch bwa/${prefix}.sa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/index/meta.yml b/modules/nf-core/bwa/index/meta.yml new file mode 100644 index 000000000..6bbc87a64 --- /dev/null +++ b/modules/nf-core/bwa/index/meta.yml @@ -0,0 +1,46 @@ +name: bwa_index +description: Create BWA index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: https://bio-bwa.sourceforge.net/bwa.shtml + arxiv: arXiv:1303.3997 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "*.{amb,ann,bwt,pac,sa}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@maxulysse" + - "@gallvp" diff --git a/modules/nf-core/bwa/index/tests/main.nf.test b/modules/nf-core/bwa/index/tests/main.nf.test new file mode 100644 index 000000000..af33e73ca --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process BWA_INDEX" + tag "modules_nfcore" + tag "modules" + tag "bwa" + tag "bwa/index" + script "../main.nf" + process "BWA_INDEX" + + test("BWA index") { + + when { + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bwa/index/tests/main.nf.test.snap b/modules/nf-core/bwa/index/tests/main.nf.test.snap new file mode 100644 index 000000000..7c8f04657 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "BWA index": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "1": [ + "versions.yml:md5,a64462ac7dfb21f4ade9b02e7f65c5bb" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + 
"genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "versions": [ + "versions.yml:md5,a64462ac7dfb21f4ade9b02e7f65c5bb" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-16T11:40:09.925307" + } +} \ No newline at end of file diff --git a/modules/nf-core/bwa/index/tests/tags.yml b/modules/nf-core/bwa/index/tests/tags.yml new file mode 100644 index 000000000..28bb483c4 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/tags.yml @@ -0,0 +1,2 @@ +bwa/index: + - modules/nf-core/bwa/index/** diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 000000000..17a04ef23 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 000000000..adbdbd7ba --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,79 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // choose appropriate concatenation tool depending on input and output format + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." 
+ } + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} + +// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz +def getFileSuffix(filename) { + def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ + return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) +} + diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 000000000..00a8db0bc --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 000000000..fcee2d19f --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,178 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_name_conflict") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'genome', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") } + ) + } + } + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/alignment/last/contigs.genome.maf.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.sizes', checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: 
true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 000000000..423571ba2 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + 
"MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + 
"GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 000000000..ec26b0fdc --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 000000000..fbc79783d --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 000000000..37b578f52 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 000000000..8c69b121f --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::coreutils=8.30 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 000000000..f132b2adc --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 
+1,79 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 000000000..db4ac3c79 --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,42 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 000000000..a71dcb8df --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,140 @@ +// NOTE The version snaps may not be consistant +// https://github.com/nf-core/modules/pull/4087#issuecomment-1767948035 +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git 
a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 000000000..43dfe28fc --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,169 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:30:39.816981" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:32:35.229332" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:34:00.058829" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + 
"test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:33:33.031555" + }, + "test_cat_fastq_paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:32:02.270935" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 000000000..6ac436140 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/circexplorer2/annotate/environment.yml b/modules/nf-core/circexplorer2/annotate/environment.yml new file mode 100644 index 000000000..def886e93 --- /dev/null +++ b/modules/nf-core/circexplorer2/annotate/environment.yml @@ -0,0 +1,7 @@ +name: circexplorer2_annotate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::circexplorer2=2.3.8 diff --git a/modules/nf-core/circexplorer2/annotate/main.nf 
b/modules/nf-core/circexplorer2/annotate/main.nf new file mode 100644 index 000000000..0e9fa0a02 --- /dev/null +++ b/modules/nf-core/circexplorer2/annotate/main.nf @@ -0,0 +1,50 @@ +process CIRCEXPLORER2_ANNOTATE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/circexplorer2:2.3.8--pyh864c0ab_1': + 'biocontainers/circexplorer2:2.3.8--pyh864c0ab_1' }" + + input: + tuple val(meta), path(junctions) + path(fasta) + path(gene_annotation) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + CIRCexplorer2 \\ + annotate \\ + -r $gene_annotation \\ + -g $fasta \\ + -b $junctions \\ + -o ${prefix}.txt \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + circexplorer2: \$(echo \$(CIRCexplorer2 --version 2>&1) ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + circexplorer2: \$(echo \$(CIRCexplorer2 --version 2>&1) ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/circexplorer2/annotate/meta.yml b/modules/nf-core/circexplorer2/annotate/meta.yml new file mode 100644 index 000000000..e11df81c2 --- /dev/null +++ b/modules/nf-core/circexplorer2/annotate/meta.yml @@ -0,0 +1,48 @@ +name: "circexplorer2_annotate" +description: Annotate circRNAs detected in the output from CIRCexplorer2 parse +keywords: + - circrna + - annotate +tools: + - "circexplorer2": + description: "Circular RNA analysis toolkits" + homepage: "https://github.com/YangLab/CIRCexplorer2/" + documentation: "https://circexplorer2.readthedocs.io/en/latest/" + doi: "10.1101/gr.202895.115" + licence: 
"['MIT License']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - junctions: + type: file + description: Reformatted junctions file + pattern: "*.{junction}" + - fasta: + type: file + description: Genome FASTA file + pattern: "*.{fa,fasta}" + - gene_annotation: + type: file + description: Reformatted GTF file for CIRCexplorer2 + pattern: "*.{txt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - txt: + type: file + description: Annotated circRNA TXT file + pattern: "*.{txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@BarryDigby" +maintainers: + - "@BarryDigby" diff --git a/modules/nf-core/circexplorer2/parse/environment.yml b/modules/nf-core/circexplorer2/parse/environment.yml new file mode 100644 index 000000000..52e172e7e --- /dev/null +++ b/modules/nf-core/circexplorer2/parse/environment.yml @@ -0,0 +1,7 @@ +name: circexplorer2_parse +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::circexplorer2=2.3.8 diff --git a/modules/nf-core/circexplorer2/parse/main.nf b/modules/nf-core/circexplorer2/parse/main.nf new file mode 100644 index 000000000..db7a0063f --- /dev/null +++ b/modules/nf-core/circexplorer2/parse/main.nf @@ -0,0 +1,49 @@ +process CIRCEXPLORER2_PARSE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/circexplorer2:2.3.8--pyh864c0ab_1': + 'biocontainers/circexplorer2:2.3.8--pyh864c0ab_1' }" + + input: + tuple val(meta), path(fusions) + + output: + tuple val(meta), path("*.bed"), emit: junction + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def aligner = "${fusions}".endsWith(".junction") ? "-t STAR" : "${fusions}".endsWith(".txt") ? "-t MapSplice" : "${fusions}".endsWith(".bam") ? "-t BWA" : "-t segemehl" + if ("${fusions}" == "${prefix}.bed") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + CIRCexplorer2 \\ + parse \\ + $aligner \\ + $fusions \\ + -b ${prefix}.bed \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + circexplorer2: \$( echo \$(CIRCexplorer2 --version 2>&1) ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + circexplorer2: \$( echo \$(CIRCexplorer2 --version 2>&1) ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/circexplorer2/parse/meta.yml b/modules/nf-core/circexplorer2/parse/meta.yml new file mode 100644 index 000000000..ef3ebf856 --- /dev/null +++ b/modules/nf-core/circexplorer2/parse/meta.yml @@ -0,0 +1,41 @@ +name: "circexplorer2_parse" +description: CIRCexplorer2 parses fusion junction files from multiple aligners to prepare them for CIRCexplorer2 annotate. +keywords: + - parse + - circrna + - splice +tools: + - "circexplorer2": + description: "Circular RNA analysis toolkit" + homepage: "https://github.com/YangLab/CIRCexplorer2/" + documentation: "https://circexplorer2.readthedocs.io/en/latest/" + doi: "10.1101/gr.202895.115" + licence: "['MIT License']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fusions: + type: file + description: BAM (BWA), BED (Segemehl), TXT (MapSplice), or Junction (STAR) file. Aligner will be autodetected based on file suffix. + pattern: "*.{bam,junction,bed,txt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: Standardized fusion junction file suitable for CIRCexplorer2 annotate module. + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@BarryDigby" +maintainers: + - "@BarryDigby" diff --git a/modules/nf-core/csvtk/join/environment.yml b/modules/nf-core/csvtk/join/environment.yml new file mode 100644 index 000000000..5b6c6468f --- /dev/null +++ b/modules/nf-core/csvtk/join/environment.yml @@ -0,0 +1,7 @@ +name: csvtk_join +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::csvtk=0.30.0 diff --git a/modules/nf-core/csvtk/join/main.nf b/modules/nf-core/csvtk/join/main.nf new file mode 100644 index 000000000..5f3afeeae --- /dev/null +++ b/modules/nf-core/csvtk/join/main.nf @@ -0,0 +1,49 @@ +process CSVTK_JOIN { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0': + 'biocontainers/csvtk:0.30.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv) + + output: + tuple val(meta), path("${prefix}.${out_extension}"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? 
"tsv" : "csv" + """ + csvtk \\ + join \\ + $args \\ + --num-cpus $task.cpus \\ + --out-file ${prefix}.${out_extension} \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? "tsv" : "csv" + """ + touch ${prefix}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/join/meta.yml b/modules/nf-core/csvtk/join/meta.yml new file mode 100644 index 000000000..a75ec40f0 --- /dev/null +++ b/modules/nf-core/csvtk/join/meta.yml @@ -0,0 +1,41 @@ +name: csvtk_join +description: Join two or more CSV (or TSV) tables by selected fields into a single table +keywords: + - join + - tsv + - csv +tools: + - csvtk: + description: A cross-platform, efficient, practical CSV/TSV toolkit + homepage: http://bioinf.shenwei.me/csvtk + documentation: http://bioinf.shenwei.me/csvtk + tool_dev_url: https://github.com/shenwei356/csvtk + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV formatted files + pattern: "*.{csv,tsv}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "version.yml" + - csv: + type: file + description: Joined CSV/TSV file + pattern: "*.{csv,tsv}" +authors: + - "@anoronh4" +maintainers: + - "@anoronh4" diff --git a/modules/nf-core/csvtk/join/tests/main.nf.test b/modules/nf-core/csvtk/join/tests/main.nf.test new file mode 100644 index 000000000..3cf178c4f --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/main.nf.test @@ -0,0 +1,64 @@ +nextflow_process { + + name "Test Process CSVTK_JOIN" + script "../main.nf" + process "CSVTK_JOIN" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/join" + + test("join - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("join - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/csvtk/join/tests/main.nf.test.snap b/modules/nf-core/csvtk/join/tests/main.nf.test.snap new file mode 100644 index 000000000..b124788bb --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "join - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d0ad82ca096c7e05eb9f9a04194c9e30" + ] + ], + "1": [ + 
"versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d0ad82ca096c7e05eb9f9a04194c9e30" + ] + ], + "versions": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ] + } + ], + "timestamp": "2024-05-21T15:45:44.045434" + }, + "join - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e76147e4eca968d23543e7007522f1d3" + ] + } + ], + "timestamp": "2024-05-21T15:45:55.59201" + } +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/join/tests/nextflow.config b/modules/nf-core/csvtk/join/tests/nextflow.config new file mode 100644 index 000000000..1b14393a9 --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CSVTK_JOIN { + ext.args = "--fields 'ID;ID' -p -e -d \"\t\" -D \",\"" + } +} diff --git a/modules/nf-core/csvtk/join/tests/tags.yml b/modules/nf-core/csvtk/join/tests/tags.yml new file mode 100644 index 000000000..6c3a0fa6b --- /dev/null +++ b/modules/nf-core/csvtk/join/tests/tags.yml @@ -0,0 +1,2 @@ +csvtk/join: + - "modules/nf-core/csvtk/join/**" diff --git a/modules/nf-core/csvtk/split/environment.yml b/modules/nf-core/csvtk/split/environment.yml new file mode 100644 index 000000000..ec08bb439 --- /dev/null +++ b/modules/nf-core/csvtk/split/environment.yml @@ -0,0 +1,7 @@ +name: csvtk_split +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::csvtk=0.30.0 diff --git a/modules/nf-core/csvtk/split/main.nf b/modules/nf-core/csvtk/split/main.nf new file mode 100644 index 000000000..1b7d5dd15 --- /dev/null +++ b/modules/nf-core/csvtk/split/main.nf @@ -0,0 +1,56 @@ +process CSVTK_SPLIT { + tag "$meta.id" + label 'process_low' + + conda 
"${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.30.0--h9ee0642_0' : + 'biocontainers/csvtk:0.30.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv) + val in_format + val out_format + + output: + tuple val(meta), path("*.${out_extension}"), emit: split_csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def delimiter = in_format == "tsv" ? "--tabs" : (in_format == "csv" ? "--delimiter ',' " : in_format) + def out_delimiter = out_format == "tsv" ? "--out-tabs" : (out_format == "csv" ? "--out-delimiter ',' " : out_format) + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + sed -i.bak '/^##/d' $csv + csvtk \\ + split \\ + $args \\ + --num-cpus $task.cpus \\ + $delimiter \\ + $out_delimiter \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e 's/csvtk v//g' )) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = args.contains('--out-delimiter "\t"') || args.contains('-D "\t"') || args.contains("-D \$'\t'") ? 
"tsv" : "csv" + """ + touch ${prefix}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/split/meta.yml b/modules/nf-core/csvtk/split/meta.yml new file mode 100644 index 000000000..6ff78aa01 --- /dev/null +++ b/modules/nf-core/csvtk/split/meta.yml @@ -0,0 +1,49 @@ +name: csvtk_split +description: Splits CSV/TSV into multiple files according to column values +keywords: + - split + - csv + - tsv +tools: + - csvtk: + description: CSVTK is a cross-platform, efficient and practical CSV/TSV toolkit that allows rapid data investigation and manipulation. + homepage: https://bioinf.shenwei.me/csvtk/ + documentation: https://bioinf.shenwei.me/csvtk/ + tool_dev_url: https://github.com/shenwei356/csvtk + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV file + pattern: "*.{csv,tsv}" + - in_format: + type: string + description: Input format (csv, tab, or a delimiting character) + pattern: "*" + - out_format: + type: string + description: Output format (csv, tab, or a delimiting character) + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - split_csv: + type: file + description: Split CSV/TSV file + pattern: "*.{csv,tsv}" +authors: + - "@SusiJo" +maintainers: + - "@SusiJo" diff --git a/modules/nf-core/csvtk/split/tests/main.nf.test b/modules/nf-core/csvtk/split/tests/main.nf.test new file mode 100644 index 000000000..f3c499266 --- /dev/null +++ b/modules/nf-core/csvtk/split/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process CSVTK_SPLIT" + script "../main.nf" + process "CSVTK_SPLIT" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/split" + + test("split - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true) ] + ] + input[1] = "tsv" + input[2] = "tsv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("split - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + '/generic/tsv/test.tsv', checkIfExists: true) ] + ] + input[1] = "tsv" + input[2] = "tsv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/csvtk/split/tests/main.nf.test.snap b/modules/nf-core/csvtk/split/tests/main.nf.test.snap new file mode 100644 index 000000000..f0ec9def0 --- /dev/null +++ b/modules/nf-core/csvtk/split/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "split - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,b17a61b0c41b19f7df3740979d68a8a0" + ], + "split_csv": [ + [ + { + "id": "test" + }, + 
"test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,b17a61b0c41b19f7df3740979d68a8a0" + ] + } + ], + "timestamp": "2024-05-22T10:02:46.053585" + }, + "split - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test-1.tsv:md5,2827284f1a6f41dd14ef82fb6a36ebad", + "test-11.tsv:md5,6c5555d689c4e685d35d6e394ad6e1e6", + "test-2.tsv:md5,589a2add7f0b8e998d4959e5d883e7d5", + "test-4.tsv:md5,e51cd0bfc35f5353d1fb75f723772ed0", + "test-NA.tsv:md5,20afd42832c6cf5821f9862d285c9350" + ] + ] + ], + "1": [ + "versions.yml:md5,b17a61b0c41b19f7df3740979d68a8a0" + ], + "split_csv": [ + [ + { + "id": "test" + }, + [ + "test-1.tsv:md5,2827284f1a6f41dd14ef82fb6a36ebad", + "test-11.tsv:md5,6c5555d689c4e685d35d6e394ad6e1e6", + "test-2.tsv:md5,589a2add7f0b8e998d4959e5d883e7d5", + "test-4.tsv:md5,e51cd0bfc35f5353d1fb75f723772ed0", + "test-NA.tsv:md5,20afd42832c6cf5821f9862d285c9350" + ] + ] + ], + "versions": [ + "versions.yml:md5,b17a61b0c41b19f7df3740979d68a8a0" + ] + } + ], + "timestamp": "2024-05-22T10:02:35.8578" + } +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/split/tests/nextflow.config b/modules/nf-core/csvtk/split/tests/nextflow.config new file mode 100644 index 000000000..8f5a6f7ee --- /dev/null +++ b/modules/nf-core/csvtk/split/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CSVTK_SPLIT { + ext.args = "-C \'&\' --fields \'first_name\' " + } +} diff --git a/modules/nf-core/csvtk/split/tests/tags.yml b/modules/nf-core/csvtk/split/tests/tags.yml new file mode 100644 index 000000000..0d7dc029d --- /dev/null +++ b/modules/nf-core/csvtk/split/tests/tags.yml @@ -0,0 +1,2 @@ +csvtk/split: + - "modules/nf-core/csvtk/split/**" diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 000000000..b48ced269 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 
@@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.20 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf new file mode 100644 index 000000000..105f9265a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -0,0 +1,24 @@ +process CUSTOM_DUMPSOFTWAREVERSIONS { + label 'process_single' + + // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.20--pyhdfd78af_0' : + 'biocontainers/multiqc:1.20--pyhdfd78af_0' }" + + input: + path versions + + output: + path "software_versions.yml" , emit: yml + path "software_versions_mqc.yml", emit: mqc_yml + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + template 'dumpsoftwareversions.py' +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml new file mode 100644 index 000000000..5f15a5fde --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -0,0 +1,37 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: custom_dumpsoftwareversions +description: Custom module used to dump software versions within the nf-core pipeline template +keywords: + - custom + - dump + - version +tools: + - custom: + description: Custom module used to dump software versions within the nf-core pipeline template + homepage: https://github.com/nf-core/tools + documentation: https://github.com/nf-core/tools + licence: ["MIT"] +input: + - versions: + type: file + description: YML file containing software versions + 
pattern: "*.yml" +output: + - yml: + type: file + description: Standard YML file containing software versions + pattern: "software_versions.yml" + - mqc_yml: + type: file + description: MultiQC custom content YML file containing software versions + pattern: "software_versions_mqc.yml" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py new file mode 100755 index 000000000..da0334085 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + + +"""Provide functions to merge multiple versions.yml files.""" + + +import yaml +import platform +from textwrap import dedent + + +def _make_versions_html(versions): + """Generate a tabular HTML output of all versions for MultiQC.""" + html = [ + dedent( + """\\ + + + + + + + + + + """ + ) + ] + for process, tmp_versions in sorted(versions.items()): + html.append("") + for i, (tool, version) in enumerate(sorted(tmp_versions.items())): + html.append( + dedent( + f"""\\ + + + + + + """ + ) + ) + html.append("") + html.append("
Process Name Software Version
{process if (i == 0) else ''}{tool}{version}
") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 000000000..b1e1630bb --- /dev/null +++ 
b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,43 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + file(process.out.mqc_yml[0]).readLines()[0..10], + file(process.out.yml[0]).readLines()[0..7] + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 000000000..5f59a936d --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,76d454d92244589d32455833f7c1ba6d" + ], + [ + "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", + " \\n\\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n \\n \\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n\\n\\n \\n\\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\" + ], + [ + "CUSTOM_DUMPSOFTWAREVERSIONS:", + " python: 3.11.7", + " yaml: 5.4.1", + "TOOL1:", + " tool1: 0.11.9", + "TOOL2:", + " tool2: '1.9'", + "Workflow:" + ] + ], + "timestamp": "2024-01-09T23:01:18.710682" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 000000000..405aa24ae --- /dev/null +++ 
b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/modules/nf-core/custom/gtffilter/environment.yml b/modules/nf-core/custom/gtffilter/environment.yml new file mode 100644 index 000000000..115f41235 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "custom_gtffilter" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "conda-forge::python=3.9.5" diff --git a/modules/nf-core/custom/gtffilter/main.nf b/modules/nf-core/custom/gtffilter/main.nf new file mode 100644 index 000000000..b682ff8c5 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/main.nf @@ -0,0 +1,37 @@ +process CUSTOM_GTFFILTER { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + tuple val(meta), path(gtf) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "gtf" + (gtf.extension == 'gz' ? '.gz' : '') + template 'gtffilter.py' + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "gtf" + (gtf.extension == 'gz' ? 
'.gz' : '') + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | cut -d ' ' -f 2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/gtffilter/meta.yml b/modules/nf-core/custom/gtffilter/meta.yml new file mode 100644 index 000000000..2c8692218 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/meta.yml @@ -0,0 +1,51 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_gtffilter" +description: Filter a gtf file to keep only regions that are located on a chromosome represented in a given fasta file +keywords: + - gtf + - fasta + - filter +tools: + - "gtffilter": + description: "Filter a gtf file to keep only regions that are located on a chromosome represented in a given fasta file" + tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/gtffilter/main.nf" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + - gtf: + type: file + description: GTF file + pattern: "*.{gtf}" + + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fasta,fa}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + + - gtf: + type: file + description: Filtered GTF file + pattern: "*.{gtf}" + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@nictru" +maintainers: + - "@nictru" diff --git a/modules/nf-core/custom/gtffilter/templates/gtffilter.py b/modules/nf-core/custom/gtffilter/templates/gtffilter.py new file mode 100644 index 000000000..764ec2eff --- /dev/null +++ b/modules/nf-core/custom/gtffilter/templates/gtffilter.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python + +# Written by Olga Botvinnik with subsequent reworking by Jonathan Manning and Nico Trummer. 
+ +# MIT License + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import logging +import re +import gzip +import statistics +import platform +from typing import Set + +# Create a logger +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger("fasta_gtf_filter") +logger.setLevel(logging.INFO) + + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. 
+ """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + + +def extract_fasta_seq_names(fasta_name: str) -> Set[str]: + """Extracts the sequence names from a FASTA file.""" + + is_gz = fasta_name.endswith(".gz") + open_fn = gzip.open if is_gz else open + + with open_fn(fasta_name) as fasta: + sequences = set() + for line in fasta: + line = line.decode("utf-8") if is_gz else line + if line.startswith(">"): + sequences.add(line[1:].split(None, 1)[0]) + + return sequences + + +def tab_delimited(file: str) -> float: + """Check if file is tab-delimited and return median number of tabs.""" + with open(file, "r") as f: + data = f.read(102400) + return statistics.median(line.count("\\t") for line in data.split("\\n")) + + +def filter_gtf( + fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool +) -> None: + """Filter GTF file based on FASTA sequence names.""" + if tab_delimited(gtf_in) != 8: + raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.") + + seq_names_in_genome = extract_fasta_seq_names(fasta) + logger.info(f"Extracted chromosome sequence names from {fasta}") + logger.debug( + "All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)) + ) + + seq_names_in_gtf = set() + try: + is_gz = gtf_in.endswith(".gz") + open_fn = gzip.open if is_gz else open + with open_fn(gtf_in) as gtf, open_fn(filtered_gtf_out, "wb" if is_gz else "w") as out: + line_count = 0 + for line in gtf: + line = line.decode("utf-8") if is_gz else line + seq_name = line.split("\\t")[0] + seq_names_in_gtf.add(seq_name) # Add sequence name to the set + + if seq_name in seq_names_in_genome: + if skip_transcript_id_check or re.search( + r'transcript_id "([^"]+)"', line + ): + out.write(line.encode() if is_gz else line) + line_count += 1 + + if line_count 
== 0: + raise ValueError("All GTF lines removed by filters") + + except IOError as e: + logger.error(f"File operation failed: {e}") + return + + logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf))) + logger.info( + f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}" + ) + + +filter_gtf("${fasta}", "${gtf}", "${prefix}.${suffix}", False) + +# Versions + +versions = {"${task.process}": {"python": platform.python_version()}} + +with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions)) diff --git a/modules/nf-core/custom/gtffilter/tests/main.nf.test b/modules/nf-core/custom/gtffilter/tests/main.nf.test new file mode 100644 index 000000000..252d11a16 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/tests/main.nf.test @@ -0,0 +1,115 @@ +nextflow_process { + + name "Test Process CUSTOM_GTFFILTER" + script "../main.nf" + process "CUSTOM_GTFFILTER" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/gtffilter" + + test("test_custom_gtffilter") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_custom_gtffilter_gzip") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) 
+ } + } + + test("test_custom_gtffilter - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_custom_gtffilter_gzip - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ] + input[1] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/gtffilter/tests/main.nf.test.snap b/modules/nf-core/custom/gtffilter/tests/main.nf.test.snap new file mode 100644 index 000000000..787dd42e1 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/tests/main.nf.test.snap @@ -0,0 +1,134 @@ +{ + "test_custom_gtffilter_gzip": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "1": [ + "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "versions": [ + "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:11.091273747" + }, + "test_custom_gtffilter": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "1": [ 
+ "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,aa8b2aa1e0b5fbbba3b04d471e1b0535" + ] + ], + "versions": [ + "versions.yml:md5,39c43040514c93566d2e3dca39e54cf2" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:03.654104046" + }, + "test_custom_gtffilter_gzip - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:24.216284615" + }, + "test_custom_gtffilter - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,4547ffaa530b6d65b2dd1f607d7f85e3" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-15T14:23:17.765499066" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/gtffilter/tests/tags.yml b/modules/nf-core/custom/gtffilter/tests/tags.yml new file mode 100644 index 000000000..34dda2178 --- /dev/null +++ b/modules/nf-core/custom/gtffilter/tests/tags.yml @@ -0,0 +1,2 @@ +custom/gtffilter: + - "modules/nf-core/custom/gtffilter/**" diff --git a/modules/nf-core/custom/tx2gene/environment.yml b/modules/nf-core/custom/tx2gene/environment.yml new file mode 100644 index 000000000..a859dc881 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "custom_tx2gene" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.9.5 diff --git a/modules/nf-core/custom/tx2gene/main.nf b/modules/nf-core/custom/tx2gene/main.nf new file mode 100644 index 000000000..99c00aa06 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/main.nf @@ -0,0 +1,36 @@ +process CUSTOM_TX2GENE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + tuple val(meta), path(gtf) + tuple val(meta2), path ("quants/*") + val quant_type + val id + val extra + + output: + tuple val(meta), path("*tx2gene.tsv"), emit: tx2gene + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'tx2gene.py' + + stub: + """ + touch ${meta.id}.tx2gene.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/tx2gene/meta.yml b/modules/nf-core/custom/tx2gene/meta.yml new file mode 100644 index 000000000..d991bf1be --- /dev/null +++ b/modules/nf-core/custom/tx2gene/meta.yml @@ -0,0 +1,65 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_tx2gene" +description: Make a transcript/gene mapping from a GTF and cross-reference with transcript quantifications. 
+keywords: + - gene + - gtf + - pseudoalignment + - transcript +tools: + - "custom": + description: | + "Custom module to create a transcript to gene mapping from a GTF and + check it against transcript quantifications" + tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/tx2gene/main.nf" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing reference information related to the GTF file + e.g. `[ id:'yeast' ]` + - gtf: + type: file + description: An annotation file of the reference genome in GTF format + pattern: "*.gtf" + - meta2: + type: map + description: | + Groovy Map containing information related to the experiment as a whole + e.g. `[ id:'SRP123456' ]` + - quants: + type: directory + description: Paths to subdirectories corresponding to + sample-wise runs of Salmon or Kallisto + - quant_type: + type: string + description: Quantification type, 'kallisto' or 'salmon' + - id: + type: string + description: Gene ID attribute in the GTF file (default= gene_id) + - extra: + type: string + description: Extra gene attribute in the GTF file (default= gene_name) + +output: + - meta: + type: map + description: | + Groovy Map containing reference information related to the GTF file + e.g. `[ id:'yeast' ]` + - tx2gene: + type: file + description: A transcript/ gene mapping table in TSV format + pattern: "*.tx2gene.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@pinin4fjords" +maintainers: + - "@pinin4fjords" diff --git a/modules/nf-core/custom/tx2gene/templates/tx2gene.py b/modules/nf-core/custom/tx2gene/templates/tx2gene.py new file mode 100755 index 000000000..7fd0de64e --- /dev/null +++ b/modules/nf-core/custom/tx2gene/templates/tx2gene.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +# Written by Lorena Pantano with subsequent reworking by Jonathan Manning. Released under the MIT license. 
+ +import logging +import argparse +import glob +import os +import platform +import re +from collections import Counter, defaultdict, OrderedDict +from collections.abc import Set +from typing import Dict + +# Configure logging +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +def format_yaml_like(data: dict, indent: int = 0) -> str: + """Formats a dictionary to a YAML-like string. + + Args: + data (dict): The dictionary to format. + indent (int): The current indentation level. + + Returns: + str: A string formatted as YAML. + """ + yaml_str = "" + for key, value in data.items(): + spaces = " " * indent + if isinstance(value, dict): + yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}" + else: + yaml_str += f"{spaces}{key}: {value}\\n" + return yaml_str + +def read_top_transcripts(quant_dir: str, file_pattern: str) -> Set[str]: + """ + Read the top 100 transcripts from the quantification file. + + Parameters: + quant_dir (str): Directory where quantification files are located. + file_pattern (str): Pattern to match quantification files. + + Returns: + set: A set containing the top 100 transcripts. + """ + try: + # Find the quantification file within the directory + quant_file_path = glob.glob(os.path.join(quant_dir, "*", file_pattern))[0] + with open(quant_file_path, "r") as file_handle: + # Read the file and extract the top 100 transcripts + return {line.split()[0] for i, line in enumerate(file_handle) if i > 0 and i <= 100} + except IndexError: + # Log an error and raise a FileNotFoundError if the quant file does not exist + logger.error("No quantification files found.") + raise FileNotFoundError("Quantification file not found.") + + +def discover_transcript_attribute(gtf_file: str, transcripts: Set[str]) -> str: + """ + Discover the attribute in the GTF that corresponds to transcripts, prioritizing 'transcript_id'. 
+ + Parameters: + gtf_file (str): Path to the GTF file. + transcripts (Set[str]): A set of transcripts to match in the GTF file. + + Returns: + str: The attribute name that corresponds to transcripts in the GTF file. + """ + + votes = Counter() + with open(gtf_file) as inh: + # Read GTF file, skipping header lines + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\\t") + + # Use regular expression to correctly split the attributes string + attributes_str = cols[8] + attributes = dict(re.findall(r'(\\S+) "(.*?)(? Dict[str, str]: + """ + Parse the attributes column of a GTF file. + + :param attributes_text: The attributes column as a string. + :return: A dictionary of the attributes. + """ + # Split the attributes string by semicolon and strip whitespace + attributes = attributes_text.strip().split(";") + attr_dict = OrderedDict() + + # Iterate over each attribute pair + for attribute in attributes: + # Split the attribute into key and value, ensuring there are two parts + parts = attribute.strip().split(" ", 1) + if len(parts) == 2: + key, value = parts + # Remove any double quotes from the value + value = value.replace('"', "") + attr_dict[key] = value + + return attr_dict + + +def map_transcripts_to_gene( + quant_type: str, gtf_file: str, quant_dir: str, gene_id: str, extra_id_field: str, output_file: str +) -> bool: + """ + Map transcripts to gene names and write the output to a file. + + Parameters: + quant_type (str): The quantification method used (e.g., 'salmon'). + gtf_file (str): Path to the GTF file. + quant_dir (str): Directory where quantification files are located. + gene_id (str): The gene ID attribute in the GTF file. + extra_id_field (str): Additional ID field in the GTF file. + output_file (str): The output file path. + + Returns: + bool: True if the operation was successful, False otherwise. 
+ """ + # Read the top transcripts based on quantification type + transcripts = read_top_transcripts(quant_dir, "quant.sf" if quant_type == "salmon" else "abundance.tsv") + # Discover the attribute that corresponds to transcripts in the GTF + transcript_attribute = discover_transcript_attribute(gtf_file, transcripts) + + # Open GTF and output file to write the mappings + # Initialize the set to track seen combinations + seen = set() + + with open(gtf_file) as inh, open(output_file, "w") as output_handle: + output_handle.write(f"{transcript_attribute}\\t{gene_id}\\t{extra_id_field}\\n") + # Parse each line of the GTF, mapping transcripts to genes + for line in filter(lambda x: not x.startswith("#"), inh): + cols = line.split("\\t") + attr_dict = parse_attributes(cols[8]) + if gene_id in attr_dict and transcript_attribute in attr_dict: + # Create a unique identifier for the transcript-gene combination + transcript_gene_pair = (attr_dict[transcript_attribute], attr_dict[gene_id]) + + # Check if the combination has already been seen + if transcript_gene_pair not in seen: + # If it's a new combination, write it to the output and add to the seen set + extra_id = attr_dict.get(extra_id_field, attr_dict[gene_id]) + output_handle.write(f"{attr_dict[transcript_attribute]}\\t{attr_dict[gene_id]}\\t{extra_id}\\n") + seen.add(transcript_gene_pair) + + return True + + +# Main function to parse arguments and call the mapping function +if __name__ == "__main__": + if '${task.ext.prefix}' != "null": + prefix = "${task.ext.prefix}." + elif '$meta.id' != "null": + prefix = '${meta.id}.' 
+ else: + prefix = '' + + if not map_transcripts_to_gene('$quant_type', '$gtf', 'quants', '$id', '$extra', f"{prefix}tx2gene.tsv"): + logger.error("Failed to map transcripts to genes.") + + # Write the versions + versions_this_module = {} + versions_this_module["${task.process}"] = {"python": platform.python_version()} + with open("versions.yml", "w") as f: + f.write(format_yaml_like(versions_this_module)) diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test b/modules/nf-core/custom/tx2gene/tests/main.nf.test new file mode 100644 index 000000000..b15592798 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test @@ -0,0 +1,81 @@ +nextflow_process { + + name "Test Process CUSTOM_TX2GENE" + script "../main.nf" + process "CUSTOM_TX2GENE" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/tx2gene" + tag "untar" + + setup { + + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/kallisto_results.tar.gz', checkIfExists: true) + ]) + """ + } + } + } + + test("saccharomyces_cerevisiae - gtf") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true) + ]) + input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[2] = 'kallisto' + input[3] = 'gene_id' + input[4] = 'gene_name' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.tx2gene).match('tx2gene') }, + { assert snapshot(process.out.versions).match('versions') } + ) + } + } + + test("saccharomyces_cerevisiae - gtf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 
'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true) + ]) + input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[2] = 'kallisto' + input[3] = 'gene_id' + input[4] = 'gene_name' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.tx2gene).match('tx2gene - stub') }, + { assert snapshot(process.out.versions).match('versions - stub') } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap new file mode 100644 index 000000000..1e76e10d6 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,fb8145d7fbc6043ba031249b23ecda50" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-26T13:14:18.218251" + }, + "tx2gene": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,0e2418a69d2eba45097ebffc2f700bfe" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-26T13:14:18.21054" + }, + "tx2gene - stub": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.tx2gene.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-26T13:14:25.915434" + }, + "versions - stub": { + "content": [ + [ + "versions.yml:md5,5613eefbca41377128f1d8dc09b9fb60" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-26T13:14:25.919243" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/tx2gene/tests/tags.yml b/modules/nf-core/custom/tx2gene/tests/tags.yml new file mode 100644 index 000000000..493fbc3b1 --- /dev/null +++ b/modules/nf-core/custom/tx2gene/tests/tags.yml @@ -0,0 +1,2 @@ +custom/tx2gene: + - 
"modules/nf-core/custom/tx2gene/**" diff --git a/modules/nf-core/gawk/environment.yml b/modules/nf-core/gawk/environment.yml new file mode 100644 index 000000000..3d98a08b0 --- /dev/null +++ b/modules/nf-core/gawk/environment.yml @@ -0,0 +1,7 @@ +name: gawk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::gawk=5.3.0 diff --git a/modules/nf-core/gawk/main.nf b/modules/nf-core/gawk/main.nf new file mode 100644 index 000000000..ca4689297 --- /dev/null +++ b/modules/nf-core/gawk/main.nf @@ -0,0 +1,55 @@ +process GAWK { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.3.0' : + 'biocontainers/gawk:5.3.0' }" + + input: + tuple val(meta), path(input) + path(program_file) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + def args2 = task.ext.args2 ?: '' // args2 is used to specify a program when no program file has been given + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.getExtension()}" + + program = program_file ? "-f ${program_file}" : "${args2}" + + """ + awk \\ + ${args} \\ + ${program} \\ + ${input} \\ + > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.getExtension()}" + def create_cmd = suffix.endsWith("gz") ? 
"echo '' | gzip >" : "touch" + + """ + ${create_cmd} ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gawk/meta.yml b/modules/nf-core/gawk/meta.yml new file mode 100644 index 000000000..2b6033b0b --- /dev/null +++ b/modules/nf-core/gawk/meta.yml @@ -0,0 +1,50 @@ +name: "gawk" +description: | + If you are like many computer users, you would frequently like to make changes in various text files + wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. + The job is easy with awk, especially the GNU implementation gawk. +keywords: + - gawk + - awk + - txt + - text + - file parsing +tools: + - "gawk": + description: "GNU awk" + homepage: "https://www.gnu.org/software/gawk/" + documentation: "https://www.gnu.org/software/gawk/manual/" + tool_dev_url: "https://www.gnu.org/prep/ftp.html" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: The input file - Specify the logic that needs to be executed on this file on the `ext.args2` or in the program file + pattern: "*" + - program_file: + type: file + description: Optional file containing logic for awk to execute. If you don't wish to use a file, you can use `ext.args2` to specify the logic. + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: The output file - specify the name of this file using `ext.prefix` and the extension using `ext.suffix` + pattern: "*" +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/gawk/tests/main.nf.test b/modules/nf-core/gawk/tests/main.nf.test new file mode 100644 index 000000000..fce82ca95 --- /dev/null +++ b/modules/nf-core/gawk/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process GAWK" + script "../main.nf" + process "GAWK" + + tag "modules" + tag "modules_nfcore" + tag "gawk" + + test("convert fasta to bed") { + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("convert fasta to bed with program file") { + config "./nextflow_with_program_file.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = Channel.of('BEGIN {FS="\t"}; {print \$1 FS "0" FS \$2}').collectFile(name:"program.txt") + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/gawk/tests/main.nf.test.snap b/modules/nf-core/gawk/tests/main.nf.test.snap new file mode 100644 index 000000000..4f3a759c6 --- /dev/null +++ b/modules/nf-core/gawk/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "convert fasta to bed with program file": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + 
"test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "1": [ + "versions.yml:md5,842acc9870dc8ac280954047cb2aa23a" + ], + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions": [ + "versions.yml:md5,842acc9870dc8ac280954047cb2aa23a" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.03.0" + }, + "timestamp": "2024-05-17T15:20:02.495430346" + }, + "convert fasta to bed": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "1": [ + "versions.yml:md5,842acc9870dc8ac280954047cb2aa23a" + ], + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions": [ + "versions.yml:md5,842acc9870dc8ac280954047cb2aa23a" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.03.0" + }, + "timestamp": "2024-05-17T15:19:53.291809648" + } +} \ No newline at end of file diff --git a/modules/nf-core/gawk/tests/nextflow.config b/modules/nf-core/gawk/tests/nextflow.config new file mode 100644 index 000000000..6e5d43a35 --- /dev/null +++ b/modules/nf-core/gawk/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: GAWK { + ext.suffix = "bed" + ext.args2 = '\'BEGIN {FS="\t"}; {print \$1 FS "0" FS \$2}\'' + } +} diff --git a/modules/nf-core/gawk/tests/nextflow_with_program_file.config b/modules/nf-core/gawk/tests/nextflow_with_program_file.config new file mode 100644 index 000000000..693ad4196 --- /dev/null +++ b/modules/nf-core/gawk/tests/nextflow_with_program_file.config @@ -0,0 +1,5 @@ +process { + withName: GAWK { + ext.suffix = "bed" + } +} diff --git a/modules/nf-core/gawk/tests/tags.yml b/modules/nf-core/gawk/tests/tags.yml new file mode 100644 index 000000000..72e4531d2 --- /dev/null +++ b/modules/nf-core/gawk/tests/tags.yml @@ -0,0 +1,2 @@ +gawk: + - "modules/nf-core/gawk/**" diff --git a/modules/nf-core/gffread/environment.yml b/modules/nf-core/gffread/environment.yml new 
file mode 100644 index 000000000..c6df58ad0 --- /dev/null +++ b/modules/nf-core/gffread/environment.yml @@ -0,0 +1,7 @@ +name: gffread +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gffread=0.12.7 diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf new file mode 100644 index 000000000..da55cbab7 --- /dev/null +++ b/modules/nf-core/gffread/main.nf @@ -0,0 +1,60 @@ +process GFFREAD { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gffread:0.12.7--hdcf5f25_4' : + 'biocontainers/gffread:0.12.7--hdcf5f25_4' }" + + input: + tuple val(meta), path(gff) + path fasta + + output: + tuple val(meta), path("*.gtf") , emit: gtf , optional: true + tuple val(meta), path("*.gff3") , emit: gffread_gff , optional: true + tuple val(meta), path("*.fasta"), emit: gffread_fasta , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 'fasta' : 'gff3' ) + def fasta_arg = fasta ? "-g $fasta" : '' + def output_name = "${prefix}.${extension}" + def output = extension == "fasta" ? "$output_name" : "-o $output_name" + def args_sorted = args.replaceAll(/(.*)(-[wxy])(.*)/) { all, pre, param, post -> "$pre $post $param" }.trim() + // args_sorted = Move '-w', '-x', and '-y' to the end of the args string as gffread expects the file name after these parameters + if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + gffread \\ + $gff \\ + $fasta_arg \\ + $args_sorted \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("-T") ? 'gtf' : ( ( ['-w', '-x', '-y' ].any { args.contains(it) } ) ? 'fasta' : 'gff3' ) + def output_name = "${prefix}.${extension}" + if ( "$output_name" in [ "$gff", "$fasta" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch $output_name + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml new file mode 100644 index 000000000..c06028208 --- /dev/null +++ b/modules/nf-core/gffread/meta.yml @@ -0,0 +1,55 @@ +name: gffread +description: Validate, filter, convert and perform various other operations on GFF files +keywords: + - gff + - conversion + - validation +tools: + - gffread: + description: GFF/GTF utility providing format conversions, region filtering, FASTA sequence extraction and more. + homepage: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + documentation: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + tool_dev_url: https://github.com/gpertea/gffread + doi: 10.12688/f1000research.23297.1 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. [ id:'test' ] + - gff: + type: file + description: A reference file in either the GFF3, GFF2 or GTF format. + pattern: "*.{gff, gtf}" + - fasta: + type: file + description: A multi-fasta file with the genomic sequences + pattern: "*.{fasta,fa,faa,fas,fsa}" +output: + - meta: + type: map + description: | + Groovy Map containing meta data + e.g. 
[ id:'test' ] + - gtf: + type: file + description: GTF file resulting from the conversion of the GFF input file if '-T' argument is present + pattern: "*.{gtf}" + - gffread_gff: + type: file + description: GFF3 file resulting from the conversion of the GFF input file if '-T' argument is absent + pattern: "*.gff3" + - gffread_fasta: + type: file + description: Fasta file produced when either of '-w', '-x', '-y' parameters is present + pattern: "*.fasta" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" +maintainers: + - "@edmundmiller" + - "@gallvp" diff --git a/modules/nf-core/gffread/tests/main.nf.test b/modules/nf-core/gffread/tests/main.nf.test new file mode 100644 index 000000000..4cd13dcd3 --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test @@ -0,0 +1,223 @@ +nextflow_process { + + name "Test Process GFFREAD" + script "../main.nf" + process "GFFREAD" + + tag "gffread" + tag "modules_nfcore" + tag "modules" + + test("sarscov2-gff3-gtf") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gffread_gff == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gtf-stub") { + + options '-stub' + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gffread_gff == [] }, + { assert 
process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gff3") { + + config "./nextflow-gff3.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-gff3-stub") { + + options '-stub' + config "./nextflow-gff3.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_fasta == [] } + ) + } + + } + + test("sarscov2-gff3-fasta") { + + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-fasta-stub") { + + options '-stub' + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = 
file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gtf == [] }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-fasta-fail-catch") { + + options '-stub' + config "./nextflow-fasta.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id: 'genome'], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert ! process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + +} diff --git a/modules/nf-core/gffread/tests/main.nf.test.snap b/modules/nf-core/gffread/tests/main.nf.test.snap new file mode 100644 index 000000000..15262320d --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test.snap @@ -0,0 +1,272 @@ +{ + "sarscov2-gff3-gtf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,1ea0ae98d3388e0576407dc4a24ef428" + ] + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:48:56.496187" + }, + "sarscov2-gff3-gff3": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + 
"gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,c4e5da6267c6bee5899a2c204ae1ad91" + ] + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:49:00.892782" + }, + "sarscov2-gff3-gtf-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + + ], + "gtf": [ + [ + { + "id": "test" + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:26.975666" + }, + "sarscov2-gff3-fasta-stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gffread_gff": [ + + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:44.34792" + }, + "sarscov2-gff3-gff3-stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "gtf": [ + + ], + "versions": [ + 
"versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T11:11:35.221671" + }, + "sarscov2-gff3-fasta": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" + ] + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + [ + { + "id": "test" + }, + "test.fasta:md5,5f8108fb51739a0588ccf0a251de919a" + ] + ], + "gffread_gff": [ + + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-04-09T10:54:02.88143" + } +} \ No newline at end of file diff --git a/modules/nf-core/gffread/tests/nextflow-fasta.config b/modules/nf-core/gffread/tests/nextflow-fasta.config new file mode 100644 index 000000000..ac6cb1484 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow-fasta.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-w -S' + } +} diff --git a/modules/nf-core/gffread/tests/nextflow-gff3.config b/modules/nf-core/gffread/tests/nextflow-gff3.config new file mode 100644 index 000000000..afe0830e5 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow-gff3.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '' + } +} diff --git a/modules/nf-core/gffread/tests/nextflow.config b/modules/nf-core/gffread/tests/nextflow.config new file mode 100644 index 000000000..74b25094f --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-T' + } +} diff --git a/modules/nf-core/gffread/tests/tags.yml b/modules/nf-core/gffread/tests/tags.yml new file mode 100644 index 000000000..055760656 --- /dev/null +++ b/modules/nf-core/gffread/tests/tags.yml @@ -0,0 +1,2 @@ +gffread: + - modules/nf-core/gffread/** diff --git 
a/modules/nf-core/gnu/sort/environment.yml b/modules/nf-core/gnu/sort/environment.yml new file mode 100644 index 000000000..eb9b77edd --- /dev/null +++ b/modules/nf-core/gnu/sort/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: gnu_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::coreutils=9.3 diff --git a/modules/nf-core/gnu/sort/main.nf b/modules/nf-core/gnu/sort/main.nf new file mode 100644 index 000000000..e1167666f --- /dev/null +++ b/modules/nf-core/gnu/sort/main.nf @@ -0,0 +1,51 @@ +process GNU_SORT { + tag "$meta.id" + label "process_low" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/coreutils:9.3': + 'biocontainers/coreutils:9.3' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), file( "${output_file}" ) , emit: sorted + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.extension}" + output_file = "${prefix}.${suffix}" + def VERSION = "9.3" // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + if ("$input" == "$output_file") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + sort ${args} ${input} > ${output_file} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.extension}" + output_file = "${prefix}.${suffix}" + def VERSION = "9.3" + + if ("$input" == "$output_file") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${output_file} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + coreutils: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/gnu/sort/meta.yml b/modules/nf-core/gnu/sort/meta.yml new file mode 100644 index 000000000..9d961750c --- /dev/null +++ b/modules/nf-core/gnu/sort/meta.yml @@ -0,0 +1,41 @@ +name: "gnu_sort" +description: | + Writes a sorted concatenation of file/s +keywords: + - GNU + - sort + - merge compare +tools: + - sort: + description: "Writes a sorted concatenation of file/s" + homepage: "https://github.com/vgl-hub/gfastats" + documentation: "https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html" + licence: ["GPL"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Draft assembly file + pattern: "*.{txt,bed,interval,genome,bins}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - sorted: + type: file + description: The sorted txt file generated by sort + pattern: "*.{txt,bed,interval,genome,bins}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@DLBPointon" +maintainers: + - "@DLBPointon" diff --git a/modules/nf-core/gnu/sort/tests/main.nf.test b/modules/nf-core/gnu/sort/tests/main.nf.test new file mode 100644 index 000000000..e40301871 --- /dev/null +++ b/modules/nf-core/gnu/sort/tests/main.nf.test @@ -0,0 +1,120 @@ +nextflow_process { + + name "Test Process GNU_SORT" + script "modules/nf-core/gnu/sort/main.nf" + process "GNU_SORT" + + tag "modules" + tag "modules_nfcore" + tag "gnu" + tag "gnu/sort" + + test("unsorted_genome_sort") { + config "./sort_simple_bed.config" + + when { + process { + """ + input[0] = [ + [id:'genome_test'], + file(params.test_data['generic']['unsorted_data']['unsorted_text']['genome_file'], + checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.sorted[0][1]).name + ).match("genome_sort") + } + ) + } + + } + + test("unsorted_intervals_sort") { + config "./sort_simple_bed.config" + when { + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['generic']['unsorted_data']['unsorted_text']['intervals'], + checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert snapshot( + file(process.out.sorted[0][1]).name + ).match("interval_sort") + } + ) + } + + } + + test("unsorted_csv_sort") { + config "./sort_complex.config" + + when { + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['generic']['unsorted_data']['unsorted_text']['numbers_csv'], + checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert 
snapshot(process.out).match() }, + { assert snapshot( + file(process.out.sorted[0][1]).name + ).match("csv_sort") + } + ) + } + + } + + test("unsorted_csv_sort_stub") { + config "./sort_complex.config" + options "-stub" + + when { + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['generic']['unsorted_data']['unsorted_text']['numbers_csv'], + checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + +} diff --git a/modules/nf-core/gnu/sort/tests/main.nf.test.snap b/modules/nf-core/gnu/sort/tests/main.nf.test.snap new file mode 100644 index 000000000..63891bc4b --- /dev/null +++ b/modules/nf-core/gnu/sort/tests/main.nf.test.snap @@ -0,0 +1,164 @@ +{ + "unsorted_csv_sort": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv.sorted:md5,0b52d1b4c4a0c6e972c6f94aafd75a1d" + ] + ], + "1": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ], + "sorted": [ + [ + { + "id": "test" + }, + "test.csv.sorted:md5,0b52d1b4c4a0c6e972c6f94aafd75a1d" + ] + ], + "versions": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-14T11:13:44.714632791" + }, + "interval_sort": { + "content": [ + "test.bed.sorted" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-14T11:13:37.962807086" + }, + "unsorted_csv_sort_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv.sorted:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ], + "sorted": [ + [ + { + "id": "test" + }, + "test.csv.sorted:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-14T11:13:51.456258705" + }, + 
"csv_sort": { + "content": [ + "test.csv.sorted" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-14T11:13:44.725431761" + }, + "unsorted_genome_sort": { + "content": [ + { + "0": [ + [ + { + "id": "genome_test" + }, + "genome_test.bed.sorted:md5,fd97f7efafdbbfa71d9b560f10b4b048" + ] + ], + "1": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ], + "sorted": [ + [ + { + "id": "genome_test" + }, + "genome_test.bed.sorted:md5,fd97f7efafdbbfa71d9b560f10b4b048" + ] + ], + "versions": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-14T11:13:31.041778719" + }, + "genome_sort": { + "content": [ + "genome_test.bed.sorted" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-14T11:13:31.060201722" + }, + "unsorted_intervals_sort": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.bed.sorted:md5,abbce903ef263d38b2f71856387799ab" + ] + ], + "1": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ], + "sorted": [ + [ + { + "id": "test" + }, + "test.bed.sorted:md5,abbce903ef263d38b2f71856387799ab" + ] + ], + "versions": [ + "versions.yml:md5,dd412503ec9dd665203e083ea44326cb" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-14T11:13:37.951397547" + } +} \ No newline at end of file diff --git a/modules/nf-core/gnu/sort/tests/sort_complex.config b/modules/nf-core/gnu/sort/tests/sort_complex.config new file mode 100644 index 000000000..103eaaf6f --- /dev/null +++ b/modules/nf-core/gnu/sort/tests/sort_complex.config @@ -0,0 +1,6 @@ +process { + withName: GNU_SORT { + ext.args = { "-t ';' -g -k 1,1 -k 2,2" } + ext.suffix = { "csv.sorted" } + } +} \ No newline at end of file diff --git a/modules/nf-core/gnu/sort/tests/sort_simple_bed.config b/modules/nf-core/gnu/sort/tests/sort_simple_bed.config new file mode 100644 
index 000000000..d7d52e0f2 --- /dev/null +++ b/modules/nf-core/gnu/sort/tests/sort_simple_bed.config @@ -0,0 +1,6 @@ +process { + withName: GNU_SORT { + ext.args = { "-k1,1 -k2,2n" } + ext.suffix = { "bed.sorted" } + } +} \ No newline at end of file diff --git a/modules/nf-core/gnu/sort/tests/sort_simple_genome.config b/modules/nf-core/gnu/sort/tests/sort_simple_genome.config new file mode 100644 index 000000000..4dcec3855 --- /dev/null +++ b/modules/nf-core/gnu/sort/tests/sort_simple_genome.config @@ -0,0 +1,6 @@ +process { + withName: GNU_SORT { + ext.args = { "-k1,1 -k2,2n" } + ext.suffix = { "genome.sorted" } + } +} \ No newline at end of file diff --git a/modules/nf-core/gnu/sort/tests/tags.yml b/modules/nf-core/gnu/sort/tests/tags.yml new file mode 100644 index 000000000..ac40e376d --- /dev/null +++ b/modules/nf-core/gnu/sort/tests/tags.yml @@ -0,0 +1,2 @@ +gnu/sort: + - "modules/nf-core/gnu/sort/**" diff --git a/modules/nf-core/hisat2/align/environment.yml b/modules/nf-core/hisat2/align/environment.yml new file mode 100644 index 000000000..0c1415f94 --- /dev/null +++ b/modules/nf-core/hisat2/align/environment.yml @@ -0,0 +1,8 @@ +name: hisat2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::hisat2=2.2.1 + - bioconda::samtools=1.16.1 diff --git a/modules/nf-core/hisat2/align/main.nf b/modules/nf-core/hisat2/align/main.nf new file mode 100644 index 000000000..2289a9fc0 --- /dev/null +++ b/modules/nf-core/hisat2/align/main.nf @@ -0,0 +1,93 @@ +process HISAT2_ALIGN { + tag "$meta.id" + label 'process_high' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' : + 'biocontainers/mulled-v2-a97e90b3b802d1da3d6958e0867610c718cb5eb1:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + tuple val(meta3), path(splicesites) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.log") , emit: summary + tuple val(meta), path("*fastq.gz"), optional:true, emit: fastq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '2.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + + def strandedness = '' + if (meta.strandedness == 'forward') { + strandedness = meta.single_end ? '--rna-strandness F' : '--rna-strandness FR' + } else if (meta.strandedness == 'reverse') { + strandedness = meta.single_end ? '--rna-strandness R' : '--rna-strandness RF' + } + ss = "$splicesites" ? "--known-splicesite-infile $splicesites" : '' + def seq_center = params.seq_center ? "--rg-id ${prefix} --rg SM:$prefix --rg CN:${params.seq_center.replaceAll('\\s','_')}" : "--rg-id ${prefix} --rg SM:$prefix" + if (meta.single_end) { + def unaligned = params.save_unaligned ? 
"--un-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.1.ht2" | sed 's/\\.1.ht2\$//'` + hisat2 \\ + -x \$INDEX \\ + -U $reads \\ + $strandedness \\ + $ss \\ + --summary-file ${prefix}.hisat2.summary.log \\ + --threads $task.cpus \\ + $seq_center \\ + $unaligned \\ + $args \\ + | samtools view -bS -F 4 -F 256 - > ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hisat2: $VERSION + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + def unaligned = params.save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.1.ht2" | sed 's/\\.1.ht2\$//'` + hisat2 \\ + -x \$INDEX \\ + -1 ${reads[0]} \\ + -2 ${reads[1]} \\ + $strandedness \\ + $ss \\ + --summary-file ${prefix}.hisat2.summary.log \\ + --threads $task.cpus \\ + $seq_center \\ + $unaligned \\ + --no-mixed \\ + --no-discordant \\ + $args \\ + | samtools view -bS -F 4 -F 8 -F 256 - > ${prefix}.bam + + if [ -f ${prefix}.unmapped.fastq.1.gz ]; then + mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz + fi + if [ -f ${prefix}.unmapped.fastq.2.gz ]; then + mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hisat2: $VERSION + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/hisat2/align/meta.yml b/modules/nf-core/hisat2/align/meta.yml new file mode 100644 index 000000000..b23eab75b --- /dev/null +++ b/modules/nf-core/hisat2/align/meta.yml @@ -0,0 +1,67 @@ +name: hisat2_align +description: Align RNA-Seq reads to a reference with HISAT2 +keywords: + - align + - fasta + - genome + - reference +tools: + - hisat2: + description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a 
single reference genome. + homepage: https://daehwankimlab.github.io/hisat2/ + documentation: https://daehwankimlab.github.io/hisat2/manual/ + doi: "10.1038/s41587-019-0201-4" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - index: + type: file + description: HISAT2 genome index file + pattern: "*.ht2" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - splicesites: + type: file + description: Splices sites in gtf file + pattern: "*.{txt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - summary: + type: file + description: Aligment log + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ntoda03" + - "@ramprasadn" +maintainers: + - "@ntoda03" + - "@ramprasadn" diff --git a/modules/nf-core/hisat2/align/tests/main.nf.test b/modules/nf-core/hisat2/align/tests/main.nf.test new file mode 100644 index 000000000..3a520e9a0 --- /dev/null +++ b/modules/nf-core/hisat2/align/tests/main.nf.test @@ -0,0 +1,218 @@ +nextflow_process { + + name "Test Process HISAT2_ALIGN" + script "../main.nf" + process "HISAT2_ALIGN" + tag "modules" + tag "modules_nfcore" + tag "hisat2" + tag "hisat2/align" + tag "hisat2/build" + tag "hisat2/extractsplicesites" + + test("Single-End") { + + setup { + run("HISAT2_EXTRACTSPLICESITES") { + script "../../extractsplicesites/main.nf" + process { + """ + input[0] = Channel.of([ + 
[id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ]) + """ + } + } + + run("HISAT2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[1] = Channel.of([ [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ]) + input[2] = HISAT2_EXTRACTSPLICESITES.out.txt + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) + input[1] = HISAT2_BUILD.out.index + input[2] = HISAT2_EXTRACTSPLICESITES.out.txt + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.summary).match("se_summary") }, + { assert snapshot(process.out.fastq).match("se_fastq") }, + { assert snapshot(process.out.versions).match("se_versions") } + ) + } + } + + test("Paired-End") { + + setup { + run("HISAT2_EXTRACTSPLICESITES") { + script "../../extractsplicesites/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ]) + """ + } + } + + run("HISAT2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[1] = Channel.of([ [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ]) + input[2] = HISAT2_EXTRACTSPLICESITES.out.txt + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { 
+ """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = HISAT2_BUILD.out.index + input[2] = HISAT2_EXTRACTSPLICESITES.out.txt + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.summary).match("pe_summary") }, + { assert snapshot(process.out.fastq).match("pe_fastq") }, + { assert snapshot(process.out.versions).match("pe_versions") } + ) + } + } + + test("Single-End No Splice Sites") { + + setup { + run("HISAT2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[1] = [[:],[]] + input[2] = [[:],[]] + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = HISAT2_BUILD.out.index + input[2] = [[:],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.summary).match("se_no_ss_summary") }, + { assert snapshot(process.out.fastq).match("se_no_ss_fastq") }, + { assert snapshot(process.out.versions).match("se_no_ss_versions") } + ) + } + } + + test("Paired-End No Splice Sites") { + + setup { + run("HISAT2_BUILD") { + script "../../build/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[1] = [[:],[]] + input[2] = [[:],[]] + """ + } + } + } + + when { + params { + outdir = 
"$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + input[1] = HISAT2_BUILD.out.index + input[2] = [[:],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.summary).match("pe_no_ss_summary") }, + { assert snapshot(process.out.fastq).match("pe_no_ss_fastq") }, + { assert snapshot(process.out.versions).match("pe_no_ss_versions") } + ) + } + } +} diff --git a/modules/nf-core/hisat2/align/tests/main.nf.test.snap b/modules/nf-core/hisat2/align/tests/main.nf.test.snap new file mode 100644 index 000000000..a80fa3c50 --- /dev/null +++ b/modules/nf-core/hisat2/align/tests/main.nf.test.snap @@ -0,0 +1,122 @@ +{ + "se_versions": { + "content": [ + [ + "versions.yml:md5,ceb638f44ebdaf09ba1f5c5c409585e2" + ] + ], + "timestamp": "2023-10-16T15:14:50.269895296" + }, + "se_no_ss_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.hisat2.summary.log:md5,7b8a9e61b7646da1089b041333c41a87" + ] + ] + ], + "timestamp": "2023-10-16T15:15:22.897386626" + }, + "pe_no_ss_versions": { + "content": [ + [ + "versions.yml:md5,ceb638f44ebdaf09ba1f5c5c409585e2" + ] + ], + "timestamp": "2023-10-16T15:15:42.583699978" + }, + "se_no_ss_versions": { + "content": [ + [ + "versions.yml:md5,ceb638f44ebdaf09ba1f5c5c409585e2" + ] + ], + "timestamp": "2023-10-16T15:15:22.909407356" + }, + "pe_no_ss_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.hisat2.summary.log:md5,9839b31db795958cc4b70711a3414e9c" + ] + ] + ], + "timestamp": "2023-10-16T15:15:42.569775538" + }, + "pe_no_ss_fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-10-16T15:15:42.576881608" + }, + "se_summary": 
{ + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.hisat2.summary.log:md5,7b8a9e61b7646da1089b041333c41a87" + ] + ] + ], + "timestamp": "2023-10-16T15:14:50.252466896" + }, + "pe_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.hisat2.summary.log:md5,9839b31db795958cc4b70711a3414e9c" + ] + ] + ], + "timestamp": "2023-10-16T15:15:09.881690889" + }, + "pe_fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-10-16T15:15:09.888696129" + }, + "se_no_ss_fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-10-16T15:15:22.904010016" + }, + "se_fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-10-16T15:14:50.264366105" + }, + "pe_versions": { + "content": [ + [ + "versions.yml:md5,ceb638f44ebdaf09ba1f5c5c409585e2" + ] + ], + "timestamp": "2023-10-16T15:15:09.894683308" + } +} \ No newline at end of file diff --git a/modules/nf-core/hisat2/align/tests/tags.yml b/modules/nf-core/hisat2/align/tests/tags.yml new file mode 100644 index 000000000..3a46cc896 --- /dev/null +++ b/modules/nf-core/hisat2/align/tests/tags.yml @@ -0,0 +1,4 @@ +hisat2/align: + - modules/nf-core/hisat2/align/** + - modules/nf-core/hisat2/build/** + - modules/nf-core/hisat2/extractsplicesites/** diff --git a/modules/nf-core/hisat2/build/environment.yml b/modules/nf-core/hisat2/build/environment.yml new file mode 100644 index 000000000..2e67cd3ea --- /dev/null +++ b/modules/nf-core/hisat2/build/environment.yml @@ -0,0 +1,7 @@ +name: hisat2_build +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::hisat2=2.2.1 diff --git a/modules/nf-core/hisat2/build/main.nf b/modules/nf-core/hisat2/build/main.nf new file mode 100644 index 000000000..766e8731d --- /dev/null +++ b/modules/nf-core/hisat2/build/main.nf @@ -0,0 +1,64 @@ +process HISAT2_BUILD { + tag "$fasta" + label 'process_high' + label 'process_high_memory' + + // WARN: Version information not provided by tool on CLI. 
Please update version string below when bumping container versions. + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hisat2:2.2.1--h1b792b2_3' : + 'biocontainers/hisat2:2.2.1--h1b792b2_3' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(gtf) + tuple val(meta3), path(splicesites) + + output: + tuple val(meta), path("hisat2") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def avail_mem = 0 + if (!task.memory) { + log.info "[HISAT2 index build] Available memory not known - defaulting to 0. Specify process memory requirements to change this." + } else { + log.info "[HISAT2 index build] Available memory: ${task.memory}" + avail_mem = task.memory.toGiga() + } + + def ss = '' + def exon = '' + def extract_exons = '' + def hisat2_build_memory = params.hisat2_build_memory ? (params.hisat2_build_memory as nextflow.util.MemoryUnit).toGiga() : 0 + if (avail_mem >= hisat2_build_memory) { + log.info "[HISAT2 index build] At least ${hisat2_build_memory} GB available, so using splice sites and exons to build HISAT2 index" + extract_exons = gtf ? "hisat2_extract_exons.py $gtf > ${gtf.baseName}.exons.txt" : "" + ss = splicesites ? "--ss $splicesites" : "" + exon = gtf ? "--exon ${gtf.baseName}.exons.txt" : "" + } else { + log.info "[HISAT2 index build] Less than ${hisat2_build_memory} GB available, so NOT using splice sites and exons to build HISAT2 index." + log.info "[HISAT2 index build] Use --hisat2_build_memory [small number] to skip this check." + } + def VERSION = '2.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
+ """ + mkdir hisat2 + $extract_exons + hisat2-build \\ + -p $task.cpus \\ + $ss \\ + $exon \\ + $args \\ + $fasta \\ + hisat2/${fasta.baseName} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hisat2: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/hisat2/build/meta.yml b/modules/nf-core/hisat2/build/meta.yml new file mode 100644 index 000000000..6c28eb21c --- /dev/null +++ b/modules/nf-core/hisat2/build/meta.yml @@ -0,0 +1,61 @@ +name: hisat2_build +description: Builds HISAT2 index for reference genome +keywords: + - build + - index + - fasta + - genome + - reference +tools: + - hisat2: + description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. + homepage: https://daehwankimlab.github.io/hisat2/ + documentation: https://daehwankimlab.github.io/hisat2/manual/ + doi: "10.1038/s41587-019-0201-4" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference fasta file + pattern: "*.{fa,fasta,fna}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - gtf: + type: file + description: Reference gtf annotation file + pattern: "*.{gtf}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - splicesites: + type: file + description: Splices sites in gtf file + pattern: "*.{txt}" +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - index: + type: file + description: HISAT2 genome index file + pattern: "*.ht2" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ntoda03" +maintainers: + - "@ntoda03" diff --git a/modules/nf-core/hisat2/build/tests/main.nf.test b/modules/nf-core/hisat2/build/tests/main.nf.test new file mode 100644 index 000000000..5b31debc4 --- /dev/null +++ b/modules/nf-core/hisat2/build/tests/main.nf.test @@ -0,0 +1,53 @@ +nextflow_process { + + name "Test Process HISAT2_BUILD" + script "../main.nf" + process "HISAT2_BUILD" + tag "modules" + tag "modules_nfcore" + tag "hisat2" + tag "hisat2/build" + tag "hisat2/extractsplicesites" + + test("Should run without failures") { + + setup { + run("HISAT2_EXTRACTSPLICESITES") { + script "../../extractsplicesites/main.nf" + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ]) + """ + } + } + } + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[1] = Channel.of([ [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ]) + input[2] = HISAT2_EXTRACTSPLICESITES.out.txt + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/hisat2/build/tests/main.nf.test.snap b/modules/nf-core/hisat2/build/tests/main.nf.test.snap new file mode 100644 index 000000000..c7d364dbc --- /dev/null +++ b/modules/nf-core/hisat2/build/tests/main.nf.test.snap @@ -0,0 +1,49 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "genome" + }, + [ + "genome.1.ht2:md5,057cfa8a22b97ee9cff4c8d342498803", + 
"genome.2.ht2:md5,47b153cd1319abc88dda532462651fcf", + "genome.3.ht2:md5,4ed93abba181d8dfab2e303e33114777", + "genome.4.ht2:md5,c25be5f8b0378abf7a58c8a880b87626", + "genome.5.ht2:md5,91198831aaba993acac1734138c5f173", + "genome.6.ht2:md5,265e1284ce85686516fae5d35540994a", + "genome.7.ht2:md5,9013eccd91ad614d7893c739275a394f", + "genome.8.ht2:md5,33cdeccccebe80329f1fdbee7f5874cb" + ] + ] + ], + "1": [ + "versions.yml:md5,e36ef3cd73d19ccf2378c9358fe942c0" + ], + "index": [ + [ + { + "id": "genome" + }, + [ + "genome.1.ht2:md5,057cfa8a22b97ee9cff4c8d342498803", + "genome.2.ht2:md5,47b153cd1319abc88dda532462651fcf", + "genome.3.ht2:md5,4ed93abba181d8dfab2e303e33114777", + "genome.4.ht2:md5,c25be5f8b0378abf7a58c8a880b87626", + "genome.5.ht2:md5,91198831aaba993acac1734138c5f173", + "genome.6.ht2:md5,265e1284ce85686516fae5d35540994a", + "genome.7.ht2:md5,9013eccd91ad614d7893c739275a394f", + "genome.8.ht2:md5,33cdeccccebe80329f1fdbee7f5874cb" + ] + ] + ], + "versions": [ + "versions.yml:md5,e36ef3cd73d19ccf2378c9358fe942c0" + ] + } + ], + "timestamp": "2023-10-16T14:42:22.381609786" + } +} \ No newline at end of file diff --git a/modules/nf-core/hisat2/build/tests/tags.yml b/modules/nf-core/hisat2/build/tests/tags.yml new file mode 100644 index 000000000..a7faecb27 --- /dev/null +++ b/modules/nf-core/hisat2/build/tests/tags.yml @@ -0,0 +1,3 @@ +hisat2/build: + - modules/nf-core/hisat2/build/** + - modules/nf-core/hisat2/extractsplicesites/** diff --git a/modules/nf-core/hisat2/extractsplicesites/environment.yml b/modules/nf-core/hisat2/extractsplicesites/environment.yml new file mode 100644 index 000000000..4b03e5e46 --- /dev/null +++ b/modules/nf-core/hisat2/extractsplicesites/environment.yml @@ -0,0 +1,7 @@ +name: hisat2_extractsplicesites +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::hisat2=2.2.1 diff --git a/modules/nf-core/hisat2/extractsplicesites/main.nf b/modules/nf-core/hisat2/extractsplicesites/main.nf new file mode 100644 
index 000000000..b0c8513aa --- /dev/null +++ b/modules/nf-core/hisat2/extractsplicesites/main.nf @@ -0,0 +1,31 @@ +process HISAT2_EXTRACTSPLICESITES { + tag "$gtf" + label 'process_medium' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hisat2:2.2.1--h1b792b2_3' : + 'biocontainers/hisat2:2.2.1--h1b792b2_3' }" + + input: + tuple val(meta), path(gtf) + + output: + tuple val(meta), path("*.splice_sites.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def VERSION = '2.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + hisat2_extract_splice_sites.py $gtf > ${gtf.baseName}.splice_sites.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hisat2: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/hisat2/extractsplicesites/meta.yml b/modules/nf-core/hisat2/extractsplicesites/meta.yml new file mode 100644 index 000000000..40d77ce00 --- /dev/null +++ b/modules/nf-core/hisat2/extractsplicesites/meta.yml @@ -0,0 +1,44 @@ +name: hisat2_extractsplicesites +description: Extracts splicing sites from a gtf files +keywords: + - splicing + - gtf + - genome + - reference +tools: + - hisat2: + description: HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes as well as to a single reference genome. 
+ homepage: https://daehwankimlab.github.io/hisat2/ + documentation: https://daehwankimlab.github.io/hisat2/manual/ + doi: "10.1038/s41587-019-0201-4" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - gtf: + type: file + description: Reference gtf annotation file + pattern: "*.{gtf}" +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - splicesites: + type: file + description: Splices sites in gtf file + pattern: "*.{splice_sites.txt}" +authors: + - "@ntoda03" + - "@ramprasadn" +maintainers: + - "@ntoda03" + - "@ramprasadn" diff --git a/modules/nf-core/hisat2/extractsplicesites/tests/main.nf.test b/modules/nf-core/hisat2/extractsplicesites/tests/main.nf.test new file mode 100644 index 000000000..72eb6d53b --- /dev/null +++ b/modules/nf-core/hisat2/extractsplicesites/tests/main.nf.test @@ -0,0 +1,35 @@ +nextflow_process { + + name "Test Process HISAT2_EXTRACTSPLICESITES" + script "../main.nf" + process "HISAT2_EXTRACTSPLICESITES" + tag "modules" + tag "modules_nfcore" + tag "hisat2" + tag "hisat2/extractsplicesites" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [id:'genome'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gtf', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path("${process.out.txt[0][1]}").exists() }, + { assert snapshot(process.out.versions).match() } + ) + } + } +} diff --git a/modules/nf-core/hisat2/extractsplicesites/tests/main.nf.test.snap b/modules/nf-core/hisat2/extractsplicesites/tests/main.nf.test.snap new file mode 100644 index 000000000..17f1c8ebf --- /dev/null +++ 
b/modules/nf-core/hisat2/extractsplicesites/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,eeea7231fe197810659b8bad4133aff2" + ] + ], + "timestamp": "2024-01-18T20:56:30.71763" + } +} \ No newline at end of file diff --git a/modules/nf-core/hisat2/extractsplicesites/tests/tags.yml b/modules/nf-core/hisat2/extractsplicesites/tests/tags.yml new file mode 100644 index 000000000..4b0ed4010 --- /dev/null +++ b/modules/nf-core/hisat2/extractsplicesites/tests/tags.yml @@ -0,0 +1,2 @@ +hisat2/extractsplicesites: + - modules/nf-core/hisat2/extractsplicesites/** diff --git a/modules/nf-core/miranda/environment.yml b/modules/nf-core/miranda/environment.yml new file mode 100644 index 000000000..a04ca7f84 --- /dev/null +++ b/modules/nf-core/miranda/environment.yml @@ -0,0 +1,7 @@ +name: miranda +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::miranda=3.3a diff --git a/modules/nf-core/miranda/main.nf b/modules/nf-core/miranda/main.nf new file mode 100644 index 000000000..47a98253e --- /dev/null +++ b/modules/nf-core/miranda/main.nf @@ -0,0 +1,50 @@ +process MIRANDA { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/miranda:3.3a--h779adbc_3': + 'biocontainers/miranda:3.3a--h779adbc_3' }" + + input: + tuple val(meta), path(query) + path(mirbase) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + miranda \\ + $mirbase \\ + $query \\ + $args \\ + -out ${prefix}.out + + echo "miRNA\tTarget\tScore\tEnergy_KcalMol\tQuery_Start\tQuery_End\tSubject_Start\tSubject_End\tAln_len\tSubject_Identity\tQuery_Identity" > ${prefix}.txt + grep -A 1 "Scores for this hit:" ${prefix}.out | sort | grep ">" | cut -c 2- | tr ' ' '\t' >> ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + miranda: \$(echo \$(miranda -v | sed -n 4p | sed 's/^.*miranda v//; s/microRNA.*\$//' )) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + miranda: \$(echo \$(miranda -v | sed -n 4p | sed 's/^.*miranda v//; s/microRNA.*\$//' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/miranda/meta.yml b/modules/nf-core/miranda/meta.yml new file mode 100644 index 000000000..d3950fd19 --- /dev/null +++ b/modules/nf-core/miranda/meta.yml @@ -0,0 +1,45 @@ +name: "miranda" +description: miRanda is an algorithm for finding genomic targets for microRNAs +keywords: + - microrna + - mirna + - target prediction +tools: + - "miranda": + description: "An algorithm for finding genomic targets for microRNAs" + homepage: "https://cbio.mskcc.org/miRNA2003/miranda.html" + documentation: "https://cbio.mskcc.org/miRNA2003/miranda.html" + doi: "10.1186/gb-2003-5-1-r1" + licence: "GNU Public License" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - query: + type: file + description: FASTA file containing the microRNA query sequences + pattern: "*.{fa,fasta}" + - mirbase: + type: file + description: FASTA file containing the sequence(s) to be scanned + pattern: "*.{fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - txt: + type: file + description: Reformatted TXT file containing microRNA targets + pattern: "*.{txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@BarryDigby" +maintainers: + - "@BarryDigby" diff --git a/modules/nf-core/samtools/faidx/environment.yml b/modules/nf-core/samtools/faidx/environment.yml new file mode 100644 index 000000000..f8450fa56 --- /dev/null +++ b/modules/nf-core/samtools/faidx/environment.yml @@ -0,0 +1,10 @@ +name: samtools_faidx + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - bioconda::htslib=1.20 + - bioconda::samtools=1.20 diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 000000000..bdcdbc954 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,50 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(fai) + + output: + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' + """ + ${fastacmd} + touch ${fasta}.fai + + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 000000000..f3c25de20 --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,65 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta + - faidx +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fa: + type: file + description: FASTA file + pattern: "*.{fa}" + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" +maintainers: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/faidx/tests/main.nf.test b/modules/nf-core/samtools/faidx/tests/main.nf.test new file mode 100644 index 000000000..17244ef2e --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/main.nf.test @@ -0,0 +1,122 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FAIDX" + script "../main.nf" + process "SAMTOOLS_FAIDX" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/faidx" + + test("test_samtools_faidx") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_bgzip") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true)] + + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, 
+ { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_fasta") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_stub_fasta") { + + config "./nextflow2.config" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_samtools_faidx_stub_fai") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/faidx/tests/main.nf.test.snap b/modules/nf-core/samtools/faidx/tests/main.nf.test.snap new file mode 100644 index 000000000..3223b72bc --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/main.nf.test.snap @@ -0,0 +1,249 @@ +{ + "test_samtools_faidx": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + 
"genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ], + "fa": [ + + ], + "fai": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:42:14.779784761" + }, + "test_samtools_faidx_bgzip": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474" + ] + ], + "3": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ], + "fa": [ + + ], + "fai": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474" + ] + ], + "versions": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:42:20.256633877" + }, + "test_samtools_faidx_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "extract.fa:md5,6a0774a0ad937ba0bfd2ac7457d90f36" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + "extract.fa:md5,6a0774a0ad937ba0bfd2ac7457d90f36" + ] + ], + "fai": [ + + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": 
"2024-05-28T15:42:25.632577273" + }, + "test_samtools_faidx_stub_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "extract.fa:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ], + "fa": [ + [ + { + "id": "test", + "single_end": false + }, + "extract.fa:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "fai": [ + + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:42:31.058424849" + }, + "test_samtools_faidx_stub_fai": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ], + "fa": [ + + ], + "fai": [ + [ + { + "id": "test", + "single_end": false + }, + "genome.fasta.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5" + ] + ], + "gzi": [ + + ], + "versions": [ + "versions.yml:md5,2db78952923a61e05d50b95518b21856" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:42:36.479929617" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/faidx/tests/nextflow.config b/modules/nf-core/samtools/faidx/tests/nextflow.config new file mode 100644 index 000000000..f76a3ba09 --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_FAIDX { + ext.args = 'MT192765.1 -o extract.fa' + } + +} diff --git a/modules/nf-core/samtools/faidx/tests/nextflow2.config b/modules/nf-core/samtools/faidx/tests/nextflow2.config new file mode 100644 index 000000000..33ebbd5df --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/nextflow2.config @@ -0,0 +1,6 @@ +process { + + withName: SAMTOOLS_FAIDX { + 
ext.args = '-o extract.fa' + } +} diff --git a/modules/nf-core/samtools/faidx/tests/tags.yml b/modules/nf-core/samtools/faidx/tests/tags.yml new file mode 100644 index 000000000..e4a839481 --- /dev/null +++ b/modules/nf-core/samtools/faidx/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/faidx: + - modules/nf-core/samtools/faidx/** diff --git a/modules/nf-core/samtools/flagstat/environment.yml b/modules/nf-core/samtools/flagstat/environment.yml new file mode 100644 index 000000000..68b81558e --- /dev/null +++ b/modules/nf-core/samtools/flagstat/environment.yml @@ -0,0 +1,8 @@ +name: samtools_flagstat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf new file mode 100644 index 000000000..754d84b73 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools \\ + flagstat \\ + --threads ${task.cpus} \\ + $bam \\ + > ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml new file mode 100644 index 000000000..97991358e --- /dev/null +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -0,0 +1,51 @@ +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type +keywords: + - stats + - mapping + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test b/modules/nf-core/samtools/flagstat/tests/main.nf.test new file mode 100644 index 000000000..3b648a37d --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FLAGSTAT" + script "../main.nf" + process "SAMTOOLS_FLAGSTAT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/flagstat" + + test("BAM") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("BAM - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + """ + } + } + + then { 
+ assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap new file mode 100644 index 000000000..23989c612 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "BAM - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,f606681ef971cbb548a4d9e3fbabdbc2" + ], + "flagstat": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,f606681ef971cbb548a4d9e3fbabdbc2" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:17:28.002887" + }, + "BAM": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ], + "1": [ + "versions.yml:md5,f606681ef971cbb548a4d9e3fbabdbc2" + ], + "flagstat": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ], + "versions": [ + "versions.yml:md5,f606681ef971cbb548a4d9e3fbabdbc2" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:17:13.330971" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/flagstat/tests/tags.yml b/modules/nf-core/samtools/flagstat/tests/tags.yml new file mode 100644 index 000000000..2d2b7255e --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/flagstat: + - modules/nf-core/samtools/flagstat/** diff --git a/modules/nf-core/samtools/idxstats/environment.yml b/modules/nf-core/samtools/idxstats/environment.yml new file mode 100644 index 000000000..eb6c88099 --- /dev/null +++ 
b/modules/nf-core/samtools/idxstats/environment.yml @@ -0,0 +1,8 @@ +name: samtools_idxstats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf new file mode 100644 index 000000000..2ea2a5ccd --- /dev/null +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_IDXSTATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.idxstats"), emit: idxstats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + samtools \\ + idxstats \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml new file mode 100644 index 000000000..344e92a3f --- /dev/null +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -0,0 +1,52 @@ +name: samtools_idxstats +description: Reports alignment summary statistics for a BAM/CRAM/SAM file +keywords: + - stats + - mapping + - counts + - 
chromosome + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test b/modules/nf-core/samtools/idxstats/tests/main.nf.test new file mode 100644 index 000000000..5fd1fc78e --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test @@ -0,0 +1,53 @@ +nextflow_process { + + name "Test Process SAMTOOLS_IDXSTATS" + script "../main.nf" + process "SAMTOOLS_IDXSTATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/idxstats" + + test("bam") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) 
+ """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("bam - stub") { + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + }} diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap new file mode 100644 index 000000000..a5ac8104e --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,7acbcb2a8ec6436ba7b2916d3ff13351" + ], + "idxstats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7acbcb2a8ec6436ba7b2916d3ff13351" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:17:56.180093" + }, + "bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ], + "1": [ + "versions.yml:md5,7acbcb2a8ec6436ba7b2916d3ff13351" + ], + "idxstats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ], + "versions": [ + "versions.yml:md5,7acbcb2a8ec6436ba7b2916d3ff13351" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": 
"2024-07-22T14:17:41.408704" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/idxstats/tests/tags.yml b/modules/nf-core/samtools/idxstats/tests/tags.yml new file mode 100644 index 000000000..d3057c61f --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/idxstats: + - modules/nf-core/samtools/idxstats/** diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 000000000..260d516be --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,8 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 000000000..e002585b9 --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def extension = file(input).getExtension() == 'cram' ? + "crai" : args.contains("-c") ? "csi" : "bai" + """ + touch ${input}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 000000000..01a4ee03e --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,57 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config new file mode 100644 index 000000000..0ed260efa --- /dev/null +++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_INDEX { + ext.args = '-c' + } + +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test new file mode 100644 index 000000000..ca34fb5cd --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test @@ -0,0 +1,140 @@ +nextflow_process { + + name "Test Process SAMTOOLS_INDEX" + script "../main.nf" + process "SAMTOOLS_INDEX" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/index" + + test("bai") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("crai") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + 
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("csi") { + config "./csi.nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.csi[0][1]).name, + process.out.versions + ).match() } + ) + } + } + + test("bai - stub") { + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("crai - stub") { + options "-stub" + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("csi - stub") { + options "-stub" + config "./csi.nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git 
a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap new file mode 100644 index 000000000..799d199ce --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap @@ -0,0 +1,250 @@ +{ + "csi - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + + ], + "crai": [ + + ], + "csi": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:53.9057" + }, + "crai - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + + ], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:45.931558" + }, + "bai - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.paired_end.sorted.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "crai": [ + + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:34.807525" + }, + "csi": { + "content": [ + "test.paired_end.sorted.bam.csi", + [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:52:55.688799" + }, + "crai": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + + ], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:17.609533" + }, + "bai": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ], + "crai": [ + + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,802c9776d9c5e95314e888cf18e96d77" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T16:51:04.16585" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/index/tests/tags.yml 
b/modules/nf-core/samtools/index/tests/tags.yml new file mode 100644 index 000000000..e0f58a7a3 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/index: + - modules/nf-core/samtools/index/** diff --git a/modules/nf-core/samtools/sort/environment.yml b/modules/nf-core/samtools/sort/environment.yml new file mode 100644 index 000000000..36a12eab0 --- /dev/null +++ b/modules/nf-core/samtools/sort/environment.yml @@ -0,0 +1,8 @@ +name: samtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf new file mode 100644 index 000000000..8e019099c --- /dev/null +++ b/modules/nf-core/samtools/sort/main.nf @@ -0,0 +1,73 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta) , path(bam) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + def reference = fasta ? "--reference ${fasta}" : "" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ + """ + samtools cat \\ + --threads $task.cpus \\ + ${bam} \\ + | \\ + samtools sort \\ + $args \\ + -T ${prefix} \\ + --threads $task.cpus \\ + ${reference} \\ + -o ${prefix}.${extension} \\ + - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt cram") ? "cram" : + "bam" + """ + touch ${prefix}.${extension} + if [ "${extension}" == "bam" ]; + then + touch ${prefix}.${extension}.csi + elif [ "${extension}" == "cram" ]; + then + touch ${prefix}.${extension}.crai + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml new file mode 100644 index 000000000..341a7d0eb --- /dev/null +++ b/modules/nf-core/samtools/sort/meta.yml @@ -0,0 +1,71 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file(s) + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fa,fasta,fna}" + optional: true +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Sorted CRAM file + pattern: "*.{cram}" + - crai: + type: file + description: CRAM index file (optional) + pattern: "*.crai" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@matthdsm" +maintainers: + - "@drpatelh" + - "@ewels" + - "@matthdsm" diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test b/modules/nf-core/samtools/sort/tests/main.nf.test new file mode 100644 index 000000000..c2ea9c72a --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test @@ -0,0 +1,128 @@ +nextflow_process { + + name "Test Process SAMTOOLS_SORT" + script "../main.nf" + process "SAMTOOLS_SORT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/sort" + + test("bam") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.bam, + process.out.csi.collect { it.collect { it instanceof Map ? 
it : file(it).name } }, + process.out.versions + ).match()} + ) + } + } + + test("cram") { + + config "./nextflow_cram.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.cram.collect { it.collect { it instanceof Map ? it : file(it).name } }, + process.out.crai.collect { it.collect { it instanceof Map ? it : file(it).name } }, + process.out.versions + ).match()} + ) + } + } + + test("bam - stub") { + + options "-stub" + config "./nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("cram - stub") { + + options "-stub" + config "./nextflow_cram.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'fasta' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { 
assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test.snap b/modules/nf-core/samtools/sort/tests/main.nf.test.snap new file mode 100644 index 000000000..da38d5d15 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test.snap @@ -0,0 +1,192 @@ +{ + "cram": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram.crai" + ] + ], + [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T17:19:37.196205" + }, + "bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "crai": [ + + ], + "cram": [ + + ], + "csi": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:54:46.580756" + }, + "cram - stub": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + + ], + "4": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ], + "bam": [ + + 
], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cram": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.cram:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:57:30.505698" + }, + "bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,21c992d59615936b99f2ad008aa54400" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam.csi" + ] + ], + [ + "versions.yml:md5,7a360de20e1d7a6f15a5e8fbe0a9c062" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T15:54:25.872954" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/sort/tests/nextflow.config b/modules/nf-core/samtools/sort/tests/nextflow.config new file mode 100644 index 000000000..f642771f5 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + ext.args = "--write-index" + } + +} diff --git a/modules/nf-core/samtools/sort/tests/nextflow_cram.config b/modules/nf-core/samtools/sort/tests/nextflow_cram.config new file mode 100644 index 000000000..3a8c0188b --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow_cram.config @@ -0,0 +1,8 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + ext.args = "--write-index --output-fmt cram" + } + +} diff --git a/modules/nf-core/samtools/sort/tests/tags.yml b/modules/nf-core/samtools/sort/tests/tags.yml new file mode 100644 index 000000000..cd63ea208 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/tags.yml @@ -0,0 +1,3 @@ +samtools/sort: + - modules/nf-core/samtools/sort/** + - 
tests/modules/nf-core/samtools/sort/** diff --git a/modules/nf-core/samtools/stats/environment.yml b/modules/nf-core/samtools/stats/environment.yml new file mode 100644 index 000000000..1cc83bd95 --- /dev/null +++ b/modules/nf-core/samtools/stats/environment.yml @@ -0,0 +1,8 @@ +name: samtools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 000000000..982bc28e7 --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? 
"--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 000000000..735ff8122 --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,63 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test b/modules/nf-core/samtools/stats/tests/main.nf.test new file mode 100644 index 000000000..28a77db28 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test @@ -0,0 +1,112 @@ +nextflow_process { + + name "Test Process SAMTOOLS_STATS" + script "../main.nf" + process "SAMTOOLS_STATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/stats" + + test("bam") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + } + + test("cram") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai', checkIfExists: true) + ]) + input[1] = 
Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + } + + test("bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + } + + test("cram - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + } +} diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test.snap b/modules/nf-core/samtools/stats/tests/main.nf.test.snap new file mode 100644 index 000000000..3828f3788 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test.snap @@ -0,0 +1,142 @@ +{ + "cram": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,c9d39b38c22de2057fc2f89949090975" + ] + ], + "1": [ + 
"versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,c9d39b38c22de2057fc2f89949090975" + ] + ], + "versions": [ + "versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:20:24.885816" + }, + "bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:20:39.310713" + }, + "cram - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T14:21:04.771199" + }, + "bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d522a1fa016b259d6a55620ae53dcd63" + ] + ], + "1": [ + "versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d522a1fa016b259d6a55620ae53dcd63" + ] + ], + "versions": [ + "versions.yml:md5,b3b70b126f867fdbb7dcea5e36e49d4a" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": 
"2024-07-22T14:19:06.645466" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/stats/tests/tags.yml b/modules/nf-core/samtools/stats/tests/tags.yml new file mode 100644 index 000000000..7c28e30f3 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/stats: + - modules/nf-core/samtools/stats/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 000000000..150c37777 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,8 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.20 + - bioconda::htslib=1.20 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 000000000..dc611448c --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,77 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(input), path(index) + tuple val(meta2), path(fasta) + path qname + + output: + tuple val(meta), path("${prefix}.bam"), emit: bam, optional: true + tuple val(meta), path("${prefix}.cram"), emit: cram, optional: true + tuple val(meta), path("${prefix}.sam"), emit: sam, optional: true + tuple val(meta), path("${prefix}.${file_type}.bai"), emit: bai, optional: true + tuple val(meta), path("${prefix}.${file_type}.csi"), emit: csi, optional: true + tuple val(meta), path("${prefix}.${file_type}.crai"), emit: crai, optional: true + tuple val(meta), path("${prefix}.unselected.${file_type}"), emit: unselected, optional: true + tuple val(meta), path("${prefix}.unselected.${file_type}.{bai,csi,crsi}"), emit: unselected_index, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + readnames = qname ? "--qname-file ${qname} --output-unselected ${prefix}.unselected.${file_type}": "" + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + ${readnames} \\ + $args \\ + -o ${prefix}.${file_type} \\ + $input \\ + $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + def index = args.contains("--write-index") ? "touch ${prefix}.${file_type}.csi" : "" + + """ + touch ${prefix}.${file_type} + ${index} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 000000000..27be60d08 --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,98 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) + pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - qname: + type: file + description: Optional file with read names to output only select alignments + pattern: "*.{txt,list}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: optional filtered/converted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: optional filtered/converted CRAM file + pattern: "*.{cram}" + - sam: + type: file + description: optional filtered/converted SAM file + pattern: "*.{sam}" + # bai, csi, and crai are created with `--write-index` + - bai: + type: file + description: optional BAM file index + pattern: "*.{bai}" + - csi: + type: file + description: optional tabix BAM file index + pattern: "*.{csi}" + - crai: + type: file + description: optional CRAM file index + pattern: "*.{crai}" + # unselected and unselected_index are created when passing a qname + - unselected: + type: file + description: optional file with unselected alignments + pattern: "*.unselected.{bam,cram,sam}" + - unselected_index: + type: file + description: index for the "unselected" file + pattern: "*.unselected.{bai,csi,crai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/view/tests/bam.config 
b/modules/nf-core/samtools/view/tests/bam.config new file mode 100644 index 000000000..c10d10811 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/bam_index.config b/modules/nf-core/samtools/view/tests/bam_index.config new file mode 100644 index 000000000..771ae033a --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam_index.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam --write-index" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/main.nf.test b/modules/nf-core/samtools/view/tests/main.nf.test new file mode 100644 index 000000000..37b81a916 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test @@ -0,0 +1,214 @@ +nextflow_process { + + name "Test Process SAMTOOLS_VIEW" + script "../main.nf" + process "SAMTOOLS_VIEW" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/view" + + test("bam") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true), + [] + ]) + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("bam_bam") }, + { assert snapshot(process.out.bai).match("bam_bai") }, + { assert snapshot(process.out.crai).match("bam_crai") }, + { assert snapshot(process.out.cram).match("bam_cram") }, + { assert snapshot(process.out.csi).match("bam_csi") }, + { assert snapshot(process.out.sam).match("bam_sam") }, + { assert snapshot(process.out.versions).match("bam_versions") } + ) + } + } + + test("cram") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram.crai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.cram[0][1]).name).match("cram_cram") }, + { assert snapshot(process.out.bai).match("cram_bai") }, + { assert snapshot(process.out.bam).match("cram_bam") }, + { assert snapshot(process.out.crai).match("cram_crai") }, + { assert snapshot(process.out.csi).match("cram_csi") }, + { assert snapshot(process.out.sam).match("cram_sam") }, + { assert snapshot(process.out.versions).match("cram_versions") } + ) + } + } + + test("cram_to_bam") { + + config "./bam.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), + [] + ]) + input[1] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("cram_to_bam_bam") }, + { assert snapshot(process.out.bai).match("cram_to_bam_bai") }, + { assert snapshot(process.out.crai).match("cram_to_bam_crai") }, + { assert snapshot(process.out.cram).match("cram_to_bam_cram") }, + { assert snapshot(process.out.csi).match("cram_to_bam_csi") }, + { assert snapshot(process.out.sam).match("cram_to_bam_sam") }, + { assert snapshot(process.out.versions).match("cram_to_bam_versions") } + ) + } + } + + 
test("cram_to_bam_index") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), + [] + ]) + input[1] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("cram_to_bam_index_bam") }, + { assert snapshot(file(process.out.csi[0][1]).name).match("cram_to_bam_index_csi") }, + { assert snapshot(process.out.bai).match("cram_to_bam_index_bai") }, + { assert snapshot(process.out.crai).match("cram_to_bam_index_crai") }, + { assert snapshot(process.out.cram).match("cram_to_bam_index_cram") }, + { assert snapshot(process.out.sam).match("cram_to_bam_index_sam") }, + { assert snapshot(process.out.versions).match("cram_to_bam_index_versions") } + ) + } + } + + test("cram_to_bam_index_qname") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), + [] + ]) + input[1] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = Channel.of("testN:2817", "testN:2814").collectFile(name: "readnames.list", newLine: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("cram_to_bam_index_qname_bam") }, + { assert snapshot(file(process.out.csi[0][1]).name).match("cram_to_bam_index_qname_csi") }, + { assert 
snapshot(process.out.bai).match("cram_to_bam_index_qname_bai") }, + { assert snapshot(process.out.crai).match("cram_to_bam_index_qname_crai") }, + { assert snapshot(process.out.cram).match("cram_to_bam_index_qname_cram") }, + { assert snapshot(process.out.sam).match("cram_to_bam_index_qname_sam") }, + { assert snapshot(file(process.out.unselected[0][1]).name).match("cram_to_bam_index_qname_unselected") }, + { assert snapshot(file(process.out.unselected_index[0][1]).name).match("cram_to_bam_index_qname_unselected_csi") }, + { assert snapshot(process.out.versions).match("cram_to_bam_index_qname_versions") } + ) + } + } + + test("bam_stub") { + + options "-stub" + config "./bam_index.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true), + [] + ]) + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("bam_stub_bam") }, + { assert snapshot(file(process.out.csi[0][1]).name).match("bam_stub_csi") }, + { assert snapshot(process.out.bai).match("bam_stub_bai") }, + { assert snapshot(process.out.crai).match("bam_stub_crai") }, + { assert snapshot(process.out.cram).match("bam_stub_cram") }, + { assert snapshot(process.out.sam).match("bam_stub_sam") }, + { assert snapshot(process.out.versions).match("bam_stub_versions") } + ) + } + } +} diff --git a/modules/nf-core/samtools/view/tests/main.nf.test.snap b/modules/nf-core/samtools/view/tests/main.nf.test.snap new file mode 100644 index 000000000..6bcce9fea --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test.snap @@ -0,0 +1,508 @@ +{ + "bam_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:51.256068" + }, + "cram_to_bam_index_csi": { + "content": [ 
+ "test.bam.csi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:12.958617" + }, + "bam_stub_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:32.065301" + }, + "bam_bai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:51.258578" + }, + "bam_stub_bai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:32.071284" + }, + "bam_stub_versions": { + "content": [ + [ + "versions.yml:md5,6cd41a9a3b4a95271ec011ea990a2838" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:43:20.390692583" + }, + "cram_to_bam_index_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:12.972288" + }, + "cram_to_bam_sam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:04.999247" + }, + "cram_to_bam_index_sam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:12.976457" + }, + "cram_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:56.497581" + }, + "cram_csi": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:56.50038" + }, + "cram_to_bam_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:04.992239" + }, + "cram_to_bam_index_qname_csi": { + "content": [ + "test.bam.csi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": 
"2024-02-12T19:38:23.325496" + }, + "bam_stub_sam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:32.079529" + }, + "cram_cram": { + "content": [ + "test.cram" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:56.490286" + }, + "bam_csi": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:51.262882" + }, + "cram_to_bam_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:04.989247" + }, + "cram_to_bam_index_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:12.967681" + }, + "cram_to_bam_index_qname_versions": { + "content": [ + [ + "versions.yml:md5,6cd41a9a3b4a95271ec011ea990a2838" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:43:15.007493874" + }, + "cram_to_bam_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:04.982361" + }, + "cram_to_bam_index_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:12.95456" + }, + "cram_to_bam_index_versions": { + "content": [ + [ + "versions.yml:md5,6cd41a9a3b4a95271ec011ea990a2838" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:43:09.472376824" + }, + "cram_to_bam_bai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:04.98601" + }, + "cram_to_bam_versions": { + "content": [ + [ + "versions.yml:md5,6cd41a9a3b4a95271ec011ea990a2838" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, 
+ "timestamp": "2024-05-28T15:43:04.080050906" + }, + "cram_bam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:56.495512" + }, + "bam_stub_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:32.076908" + }, + "cram_to_bam_index_qname_bai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:23.328458" + }, + "cram_to_bam_index_qname_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:23.330789" + }, + "cram_bai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:56.493129" + }, + "bam_stub_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:32.074313" + }, + "cram_to_bam_index_qname_bam": { + "content": [ + "test.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:23.322874" + }, + "cram_to_bam_index_qname_unselected": { + "content": [ + "test.unselected.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:23.322874" + }, + "cram_to_bam_index_qname_unselected_csi": { + "content": [ + "test.unselected.bam.csi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:23.328458" + }, + "bam_versions": { + "content": [ + [ + "versions.yml:md5,6cd41a9a3b4a95271ec011ea990a2838" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:42:52.978954857" + }, + "cram_to_bam_index_qname_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": 
"2024-02-12T19:38:23.333248" + }, + "bam_crai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:51.259774" + }, + "bam_cram": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:51.261287" + }, + "cram_to_bam_csi": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:04.995454" + }, + "cram_sam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:56.502625" + }, + "cram_versions": { + "content": [ + [ + "versions.yml:md5,6cd41a9a3b4a95271ec011ea990a2838" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T15:42:58.400776109" + }, + "bam_sam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:37:51.264651" + }, + "cram_to_bam_index_bai": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:12.962863" + }, + "cram_to_bam_index_qname_sam": { + "content": [ + [ + + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:23.337634" + }, + "bam_stub_csi": { + "content": [ + "test.bam.csi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:38:32.068596" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/tags.yml b/modules/nf-core/samtools/view/tests/tags.yml new file mode 100644 index 000000000..4fdf1dd12 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/view: + - "modules/nf-core/samtools/view/**" diff --git a/modules/nf-core/segemehl/align/environment.yml b/modules/nf-core/segemehl/align/environment.yml new 
file mode 100644 index 000000000..e7dbc6628 --- /dev/null +++ b/modules/nf-core/segemehl/align/environment.yml @@ -0,0 +1,7 @@ +name: segemehl_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::segemehl=0.3.4 diff --git a/modules/nf-core/segemehl/align/main.nf b/modules/nf-core/segemehl/align/main.nf new file mode 100644 index 000000000..fa829a73a --- /dev/null +++ b/modules/nf-core/segemehl/align/main.nf @@ -0,0 +1,59 @@ +process SEGEMEHL_ALIGN { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/segemehl:0.3.4--hc2ea5fd_5': + 'biocontainers/segemehl:0.3.4--hc2ea5fd_5' }" + + input: + tuple val(meta), path(reads) + path(fasta) + path(index) + + output: + tuple val(meta), path("${prefix}/${prefix}.${suffix}"), emit: alignment + tuple val(meta), path("${prefix}/${prefix}.trns.txt") , emit: trans_alignments, optional: true + tuple val(meta), path("${prefix}/${prefix}.mult.bed") , emit: multi_bed, optional: true + tuple val(meta), path("${prefix}/${prefix}.sngl.bed") , emit: single_bed, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def reads = meta.single_end ? "-q ${reads}" : "-q ${reads[0]} -p ${reads[1]}" + suffix = ( args.contains("-b") || args.contains("--bamabafixoida") ) ? 
"bam" : "sam" + """ + mkdir -p $prefix + + segemehl.x \\ + -t $task.cpus \\ + -d $fasta \\ + -i $index \\ + $reads \\ + $args \\ + -o ${prefix}/${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + segemehl: \$(echo \$(segemehl.x 2>&1 | grep "ge5dee" | awk -F Z '{print substr(\$1, 2, 6)}' )) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = ( args.contains("-b") || args.contains("--bamabafixoida") ) ? "bam" : "sam" + """ + mkdir -p $prefix + touch ${prefix}/${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + segemehl: \$(echo \$(segemehl.x 2>&1 | grep "ge5dee" | awk -F Z '{print substr(\$1, 2, 6)}' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/segemehl/align/meta.yml b/modules/nf-core/segemehl/align/meta.yml new file mode 100644 index 000000000..fc8e43bab --- /dev/null +++ b/modules/nf-core/segemehl/align/meta.yml @@ -0,0 +1,73 @@ +name: "segemehl_align" +description: A multi-split mapping algorithm for circular RNA, splicing, trans-splicing and fusion detection +keywords: + - alignment + - circrna + - splicing + - fusions +tools: + - "segemehl": + description: "A multi-split mapping algorithm for circular RNA, splicing, trans-splicing and fusion detection" + homepage: "https://www.bioinf.uni-leipzig.de/Software/segemehl/" + documentation: "https://www.bioinf.uni-leipzig.de/Software/segemehl/" + doi: "10.1186/gb-2014-15-2-r34" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: FASTA or FASTQ files + pattern: "*.{fa,fasta,fq,fastq,fq.gz,fastq.gz}" + - fasta: + type: file + description: Reference genome FASTA file used to construct Segemehl + pattern: "*.{fa,fasta}" + - index: + type: file + description: Segemehl Index file from SEGEMEHL_INDEX + pattern: "*.idx" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - alignment: + type: file + description: | + File containing genomic alignments in SAM format + (please add "-b" flag to task.ext.args for BAM) + pattern: "*.{sam,bam}" + - trans_alignments: + type: file + description: | + Custom text file containing all single split alignments predicted to be in trans + (optional, only if -S flag is set in task.ext.args) + pattern: "*.trns.txt" + - single_bed: + type: file + description: | + Bed file containing all single splice events predicted + in the split read alignments. + (optional, only if -S flag is set in task.ext.args) + pattern: "*.sngl.bed" + - multi_bed: + type: file + description: | + Bed file containing all splice events predicted + in the split read alignments. 
+ (optional, only if -S flag is set in task.ext.args) + pattern: "*.mult.bed" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@BarryDigby" + - "@nictru" +maintainers: + - "@nictru" diff --git a/modules/nf-core/segemehl/align/tests/main.nf.test b/modules/nf-core/segemehl/align/tests/main.nf.test new file mode 100644 index 000000000..c1b4921e7 --- /dev/null +++ b/modules/nf-core/segemehl/align/tests/main.nf.test @@ -0,0 +1,140 @@ +nextflow_process { + + name "Test Process SEGEMEHL_ALIGN" + script "../main.nf" + process "SEGEMEHL_ALIGN" + tag "modules" + tag "modules_nfcore" + tag "segemehl" + tag "segemehl/align" + tag "segemehl/index" + + setup { + run("SEGEMEHL_INDEX") { + script "../../../segemehl/index/main.nf" + process { + """ + input[0] = Channel.of([ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + } + + test("homo_sapiens - single_end") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = SEGEMEHL_INDEX.out.index + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.alignment[0][1]).exists() }, + { assert snapshot(process.out.versions).match("homo_sapiens - single_end - versions") } + ) + } + } + + test("homo_sapiens - paired_end") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = Channel.of([ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = SEGEMEHL_INDEX.out.index + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.alignment[0][1]).exists() }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - versions") } + ) + } + } + + test("homo_sapiens - split - single_end") { + config "./split.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = SEGEMEHL_INDEX.out.index + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.alignment[0][1]).exists() }, + { assert path(process.out.trans_alignments[0][1]).exists() }, + { assert path(process.out.multi_bed[0][1]).exists() }, + { assert path(process.out.single_bed[0][1]).exists() }, + { assert snapshot(process.out.versions).match("homo_sapiens - split - single_end - versions") } + ) + } + } + + test("homo_sapiens - split - paired_end") { + config "./split.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = Channel.of([ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = 
SEGEMEHL_INDEX.out.index + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.alignment[0][1]).exists() }, + { assert path(process.out.trans_alignments[0][1]).exists() }, + { assert path(process.out.multi_bed[0][1]).exists() }, + { assert path(process.out.single_bed[0][1]).exists() }, + { assert snapshot(process.out.versions).match("homo_sapiens - split - paired_end - versions") } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/segemehl/align/tests/main.nf.test.snap b/modules/nf-core/segemehl/align/tests/main.nf.test.snap new file mode 100644 index 000000000..c914bc3d1 --- /dev/null +++ b/modules/nf-core/segemehl/align/tests/main.nf.test.snap @@ -0,0 +1,50 @@ +{ + "homo_sapiens - paired_end - versions": { + "content": [ + [ + "versions.yml:md5,0c6afcd6ae65e27a0ea87f5b42c853eb" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-05-30T12:58:05.434115758" + }, + "homo_sapiens - single_end - versions": { + "content": [ + [ + "versions.yml:md5,0c6afcd6ae65e27a0ea87f5b42c853eb" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-05-30T12:57:56.488707635" + }, + "homo_sapiens - split - single_end - versions": { + "content": [ + [ + "versions.yml:md5,0c6afcd6ae65e27a0ea87f5b42c853eb" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-05-30T13:06:11.217385877" + }, + "homo_sapiens - split - paired_end - versions": { + "content": [ + [ + "versions.yml:md5,0c6afcd6ae65e27a0ea87f5b42c853eb" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-05-30T13:06:29.757385118" + } +} \ No newline at end of file diff --git a/modules/nf-core/segemehl/align/tests/split.config b/modules/nf-core/segemehl/align/tests/split.config new file mode 100644 index 000000000..d4f6aab83 --- /dev/null +++ b/modules/nf-core/segemehl/align/tests/split.config @@ -0,0 +1,5 @@ 
+process{ + withName: SEGEMEHL_ALIGN { + ext.args = "-S" + } +} \ No newline at end of file diff --git a/modules/nf-core/segemehl/align/tests/tags.yml b/modules/nf-core/segemehl/align/tests/tags.yml new file mode 100644 index 000000000..6e7bf26ee --- /dev/null +++ b/modules/nf-core/segemehl/align/tests/tags.yml @@ -0,0 +1,2 @@ +segemehl/align: + - modules/nf-core/segemehl/align/** diff --git a/modules/nf-core/segemehl/index/environment.yml b/modules/nf-core/segemehl/index/environment.yml new file mode 100644 index 000000000..c06413305 --- /dev/null +++ b/modules/nf-core/segemehl/index/environment.yml @@ -0,0 +1,7 @@ +name: segemehl_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::segemehl=0.3.4 diff --git a/modules/nf-core/segemehl/index/main.nf b/modules/nf-core/segemehl/index/main.nf new file mode 100644 index 000000000..ea912c6ef --- /dev/null +++ b/modules/nf-core/segemehl/index/main.nf @@ -0,0 +1,46 @@ +process SEGEMEHL_INDEX { + tag "$fasta" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/segemehl:0.3.4--hc2ea5fd_5': + 'biocontainers/segemehl:0.3.4--hc2ea5fd_5' }" + + input: + path fasta + + output: + path "*.idx", emit: index + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = "${fasta.baseName}" + """ + segemehl.x \\ + -t $task.cpus \\ + -d $fasta \\ + -x ${prefix}.idx \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + segemehl: \$(echo \$(segemehl.x 2>&1 | grep "ge5dee" | awk -F Z '{print substr(\$1, 2, 6)}' )) + END_VERSIONS + """ + + stub: + def prefix = "${fasta.baseName}" + """ + touch ${prefix}.idx + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + segemehl: \$(echo \$(segemehl.x 2>&1 | grep "ge5dee" | awk -F Z '{print substr(\$1, 2, 6)}' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/segemehl/index/meta.yml b/modules/nf-core/segemehl/index/meta.yml new file mode 100644 index 000000000..f2b9eb225 --- /dev/null +++ b/modules/nf-core/segemehl/index/meta.yml @@ -0,0 +1,32 @@ +name: "segemehl_index" +description: Generate genome indices for segemehl align +keywords: + - index + - circrna + - splicing + - fusions +tools: + - "segemehl": + description: "A multi-split mapping algorithm for circular RNA, splicing, trans-splicing and fusion detection" + homepage: "https://www.bioinf.uni-leipzig.de/Software/segemehl/" + documentation: "https://www.bioinf.uni-leipzig.de/Software/segemehl/" + doi: "10.1186/gb-2014-15-2-r34" + licence: "GPL v3" +input: + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fa,fasta}" +output: + - index: + type: file + description: Segemehl index file + pattern: "*.{idx}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@BarryDigby" +maintainers: + - "@BarryDigby" diff --git a/modules/nf-core/star/align/environment.yml 
b/modules/nf-core/star/align/environment.yml new file mode 100644 index 000000000..8bd58cff5 --- /dev/null +++ b/modules/nf-core/star/align/environment.yml @@ -0,0 +1,10 @@ +name: star_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::star=2.7.10a + - bioconda::samtools=1.18 + - bioconda::htslib=1.18 + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/align/main.nf b/modules/nf-core/star/align/main.nf new file mode 100644 index 000000000..8e9c48b1c --- /dev/null +++ b/modules/nf-core/star/align/main.nf @@ -0,0 +1,109 @@ +process STAR_ALIGN { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + tuple val(meta2), path(index) + tuple val(meta3), path(gtf) + val star_ignore_sjdbgtf + val seq_platform + val seq_center + + output: + tuple val(meta), path('*Log.final.out') , emit: log_final + tuple val(meta), path('*Log.out') , emit: log_out + tuple val(meta), path('*Log.progress.out'), emit: log_progress + path "versions.yml" , emit: versions + + tuple val(meta), path('*d.out.bam') , optional:true, emit: bam + tuple val(meta), path('*sortedByCoord.out.bam') , optional:true, emit: bam_sorted + tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript + tuple val(meta), path('*Aligned.unsort.out.bam') , optional:true, emit: bam_unsorted + tuple val(meta), path('*fastq.gz') , optional:true, emit: fastq + tuple val(meta), path('*.tab') , optional:true, emit: tab + tuple val(meta), path('*.SJ.out.tab') , optional:true, emit: spl_junc_tab + tuple 
val(meta), path('*.ReadsPerGene.out.tab') , optional:true, emit: read_per_gene_tab + tuple val(meta), path('*.out.junction') , optional:true, emit: junction + tuple val(meta), path('*.out.sam') , optional:true, emit: sam + tuple val(meta), path('*.wig') , optional:true, emit: wig + tuple val(meta), path('*.bg') , optional:true, emit: bedgraph + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads1 = [], reads2 = [] + meta.single_end ? [reads].flatten().each{reads1 << it} : reads.eachWithIndex{ v, ix -> ( ix & 1 ? reads2 : reads1) << v } + def ignore_gtf = star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf" + def seq_platform = seq_platform ? "'PL:$seq_platform'" : "" + def seq_center = seq_center ? "'CN:$seq_center'" : "" + def attrRG = args.contains("--outSAMattrRGline") ? "" : "--outSAMattrRGline 'ID:$prefix' $seq_center 'SM:$prefix' $seq_platform" + def out_sam_type = (args.contains('--outSAMtype')) ? '' : '--outSAMtype BAM Unsorted' + def mv_unsorted_bam = (args.contains('--outSAMtype BAM Unsorted SortedByCoordinate')) ? "mv ${prefix}.Aligned.out.bam ${prefix}.Aligned.unsort.out.bam" : '' + """ + STAR \\ + --genomeDir $index \\ + --readFilesIn ${reads1.join(",")} ${reads2.join(",")} \\ + --runThreadN $task.cpus \\ + --outFileNamePrefix $prefix. 
\\ + $out_sam_type \\ + $ignore_gtf \\ + $attrRG \\ + $args + + $mv_unsorted_bam + + if [ -f ${prefix}.Unmapped.out.mate1 ]; then + mv ${prefix}.Unmapped.out.mate1 ${prefix}.unmapped_1.fastq + gzip ${prefix}.unmapped_1.fastq + fi + if [ -f ${prefix}.Unmapped.out.mate2 ]; then + mv ${prefix}.Unmapped.out.mate2 ${prefix}.unmapped_2.fastq + gzip ${prefix}.unmapped_2.fastq + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}Xd.out.bam + touch ${prefix}.Log.final.out + touch ${prefix}.Log.out + touch ${prefix}.Log.progress.out + touch ${prefix}.sortedByCoord.out.bam + touch ${prefix}.toTranscriptome.out.bam + touch ${prefix}.Aligned.unsort.out.bam + touch ${prefix}.Aligned.sortedByCoord.out.bam + touch ${prefix}.unmapped_1.fastq.gz + touch ${prefix}.unmapped_2.fastq.gz + touch ${prefix}.tab + touch ${prefix}.SJ.out.tab + touch ${prefix}.ReadsPerGene.out.tab + touch ${prefix}.Chimeric.out.junction + touch ${prefix}.out.sam + touch ${prefix}.Signal.UniqueMultiple.str1.out.wig + touch ${prefix}.Signal.UniqueMultiple.str1.out.bg + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/star/align/meta.yml b/modules/nf-core/star/align/meta.yml new file mode 100644 index 000000000..e80dbb7dd --- /dev/null +++ b/modules/nf-core/star/align/meta.yml @@ -0,0 +1,115 @@ +name: star_align +description: Align reads to a reference genome using STAR +keywords: + - align + - fasta + - genome + - 
reference +tools: + - star: + description: | + STAR is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/alexdobin/STAR + manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf + doi: 10.1093/bioinformatics/bts635 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - index: + type: directory + description: STAR genome index + pattern: "star" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - gtf: + type: file + description: Annotation GTF file + pattern: "*.{gtf}" + - star_ignore_sjdbgtf: + type: boolean + description: Ignore annotation GTF file + - seq_platform: + type: string + description: Sequencing platform + - seq_center: + type: string + description: Sequencing center +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - log_final: + type: file + description: STAR final log file + pattern: "*Log.final.out" + - log_out: + type: file + description: STAR lot out file + pattern: "*Log.out" + - log_progress: + type: file + description: STAR log progress file + pattern: "*Log.progress.out" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam_sorted: + type: file + description: Sorted BAM file of read alignments (optional) + pattern: "*sortedByCoord.out.bam" + - bam_transcript: + type: file + description: Output BAM file of transcriptome alignment (optional) + pattern: "*toTranscriptome.out.bam" + - bam_unsorted: + type: file + description: Unsorted 
BAM file of read alignments (optional) + pattern: "*Aligned.unsort.out.bam" + - fastq: + type: file + description: Unmapped FastQ files (optional) + pattern: "*fastq.gz" + - tab: + type: file + description: STAR output tab file(s) (optional) + pattern: "*.tab" + - junction: + type: file + description: STAR chimeric junction output file (optional) + pattern: "*.out.junction" + - wig: + type: file + description: STAR output wiggle format file(s) (optional) + pattern: "*.wig" + - bedgraph: + type: file + description: STAR output bedGraph format file(s) (optional) + pattern: "*.bg" +authors: + - "@kevinmenden" + - "@drpatelh" + - "@praveenraj2018" +maintainers: + - "@kevinmenden" + - "@drpatelh" + - "@praveenraj2018" diff --git a/modules/nf-core/star/align/tests/main.nf.test b/modules/nf-core/star/align/tests/main.nf.test new file mode 100644 index 000000000..6ecd77863 --- /dev/null +++ b/modules/nf-core/star/align/tests/main.nf.test @@ -0,0 +1,268 @@ +nextflow_process { + + name "Test Process STAR_ALIGN" + script "../main.nf" + process "STAR_ALIGN" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/align" + tag "star/genomegenerate" + + setup { + run("STAR_GENOMEGENERATE") { + script "../../../star/genomegenerate/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + } + + test("homo_sapiens - single_end") { + config "./nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true) ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ 
id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - single_end - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - single_end - log_out") }, + { assert snapshot(process.out.bam).match("homo_sapiens - single_end - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - single_end - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - single_end - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - single_end - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - single_end - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - single_end - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - single_end - junction") }, + { assert snapshot(process.out.log_progress).match("homo_sapiens - single_end - log_progress") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - single_end - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - single_end - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - single_end - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - single_end - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - single_end - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - single_end - versions") } + ) + } + } + + test("homo_sapiens - paired_end") { + config "./nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - log_out") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - junction") }, + { assert snapshot(process.out.log_progress).match("homo_sapiens - paired_end - log_progress") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - paired_end - wig") }, + { assert 
snapshot(process.out.versions).match("homo_sapiens - paired_end - versions") } + ) + } + } + + test("homo_sapiens - paired_end - arriba") { + config "./nextflow.arriba.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - arriba - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - arriba - log_out") }, + { assert snapshot(file(process.out.log_progress[0][1]).name).match("homo_sapiens - paired_end - arriba - log_progress") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - arriba - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - arriba - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - arriba - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - arriba - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - arriba - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - arriba - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - arriba - junction") }, + { assert 
snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - arriba - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - arriba - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - arriba - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - arriba - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - paired_end - arriba - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - arriba - versions") } + ) + } + } + + test("homo_sapiens - paired_end - starfusion") { + config "./nextflow.starfusion.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - starfusion - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - starfusion - log_out") }, + { assert snapshot(file(process.out.log_progress[0][1]).name).match("homo_sapiens - paired_end - starfusion - log_progress") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - starfusion - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - starfusion - bam_sorted") }, + { assert 
snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - starfusion - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - starfusion - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - starfusion - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - starfusion - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - starfusion - junction") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - starfusion - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - starfusion - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - starfusion - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - starfusion - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - paired_end - starfusion - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - starfusion - versions") } + ) + } + } + + test("homo_sapiens - paired_end - multiple") { + config "./nextflow.config" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/fastq/test_rnaseq_2.fastq.gz', checkIfExists: true) + ] + ]) + input[1] = STAR_GENOMEGENERATE.out.index + input[2] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + input[3] = false + input[4] = 'illumina' + input[5] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.log_final[0][1]).name).match("homo_sapiens - paired_end - multiple - log_final") }, + { assert snapshot(file(process.out.log_out[0][1]).name).match("homo_sapiens - paired_end - multiple - log_out") }, + { assert snapshot(file(process.out.log_progress[0][1]).name).match("homo_sapiens - paired_end - multiple - log_progress") }, + { assert snapshot(process.out.bam).match("homo_sapiens - paired_end - multiple - bam") }, + { assert snapshot(process.out.bam_sorted).match("homo_sapiens - paired_end - multiple - bam_sorted") }, + { assert snapshot(process.out.bam_transcript).match("homo_sapiens - paired_end - multiple - bam_transcript") }, + { assert snapshot(process.out.bam_unsorted).match("homo_sapiens - paired_end - multiple - bam_unsorted") }, + { assert snapshot(process.out.bedgraph).match("homo_sapiens - paired_end - multiple - bedgraph") }, + { assert snapshot(process.out.fastq).match("homo_sapiens - paired_end - multiple - fastq") }, + { assert snapshot(process.out.junction).match("homo_sapiens - paired_end - multiple - junction") }, + { assert snapshot(process.out.read_per_gene_tab).match("homo_sapiens - paired_end - multiple - read_per_gene_tab") }, + { assert snapshot(process.out.sam).match("homo_sapiens - paired_end - multiple - sam") }, + { assert snapshot(process.out.spl_junc_tab).match("homo_sapiens - paired_end - multiple - spl_junc_tab") }, + { assert snapshot(process.out.tab).match("homo_sapiens - paired_end - multiple - tab") }, + { assert snapshot(process.out.wig).match("homo_sapiens - paired_end - multiple - wig") }, + { assert snapshot(process.out.versions).match("homo_sapiens - paired_end - multiple - versions") } + ) + } + } +} \ No newline at end of file diff --git 
a/modules/nf-core/star/align/tests/main.nf.test.snap b/modules/nf-core/star/align/tests/main.nf.test.snap new file mode 100644 index 000000000..08edb914b --- /dev/null +++ b/modules/nf-core/star/align/tests/main.nf.test.snap @@ -0,0 +1,769 @@ +{ + "homo_sapiens - paired_end - multiple - bam_sorted": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,ab07c21d63ab0a6c07d171d213c81d5a" + ] + ] + ], + "timestamp": "2023-12-04T18:01:19.968225733" + }, + "homo_sapiens - paired_end - multiple - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.857804" + }, + "homo_sapiens - paired_end - arriba - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,5155c9fd1f787ad6d7d80987fb06219c" + ] + ] + ], + "timestamp": "2023-12-04T17:56:12.347549723" + }, + "homo_sapiens - single_end - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.24701" + }, + "homo_sapiens - paired_end - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.383818" + }, + "homo_sapiens - paired_end - arriba - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T17:56:12.431212643" + }, + "homo_sapiens - paired_end - multiple - bedgraph": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Signal.Unique.str1.out.bg:md5,d7bf8b70b436ca048a62513e1d0ece3a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,686d58493b9eb445b56ace4d67f76ef6" + ] + ] + ] + ], + "timestamp": "2023-12-04T18:01:20.07119229" + }, + "homo_sapiens - paired_end - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.368841" + }, + "homo_sapiens - paired_end - arriba - bedgraph": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.102537" + }, + "homo_sapiens - single_end - junction": { + "content": [ + [ + + ] + ], + 
"timestamp": "2023-11-23T13:22:55.185369" + }, + "homo_sapiens - paired_end - arriba - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,5155c9fd1f787ad6d7d80987fb06219c" + ] + ] + ], + "timestamp": "2023-12-04T17:56:12.268388251" + }, + "homo_sapiens - single_end - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.216183" + }, + "homo_sapiens - paired_end - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.327236" + }, + "homo_sapiens - single_end - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T17:53:26.664210196" + }, + "homo_sapiens - paired_end - multiple - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:29:01.022176" + }, + "homo_sapiens - paired_end - arriba - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.15277" + }, + "homo_sapiens - paired_end - multiple - junction": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.52923" + }, + "homo_sapiens - paired_end - multiple - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,069877e053714e23010fe4e1c003b4a2" + ] + ] + ], + "timestamp": "2023-12-04T18:01:20.189486201" + }, + "homo_sapiens - paired_end - starfusion - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:27:55.905883" + }, + "homo_sapiens - paired_end - starfusion - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.192302" + }, + "homo_sapiens - paired_end - multiple - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.661837" + }, + "homo_sapiens - paired_end - multiple - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:29:00.966417" + }, + "homo_sapiens - paired_end - starfusion - bam": { + "content": [ + [ + [ 
+ { + "id": "test", + "single_end": false + }, + "test.Aligned.out.bam:md5,bcad07b838f6762fc01eea52b5cd3f84" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.53235164" + }, + "homo_sapiens - paired_end - arriba - junction": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.202776" + }, + "homo_sapiens - single_end - bedgraph": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + [ + "test.Signal.Unique.str1.out.bg:md5,c56fc1472776fb927eaf62d973da5f9a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,e93373cf6f2a2a9506e2efdb260cdd4f" + ] + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.394863748" + }, + "homo_sapiens - paired_end - arriba - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.251962" + }, + "homo_sapiens - paired_end - starfusion - bam_sorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.040843" + }, + "homo_sapiens - single_end - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.154172" + }, + "homo_sapiens - paired_end - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,b9ee1c607e07323bc1652ef3babb543f" + ] + ] + ], + "timestamp": "2023-12-04T17:54:11.934832258" + }, + "homo_sapiens - paired_end - arriba - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:06.998817" + }, + "homo_sapiens - paired_end - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:23:33.259699" + }, + "homo_sapiens - paired_end - arriba - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:25:06.849451" + }, + "homo_sapiens - paired_end - multiple - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T18:01:20.393705142" + }, + "homo_sapiens - paired_end - starfusion - bam_transcript": { + "content": [ + [ + + ] + ], + 
"timestamp": "2023-11-23T13:27:56.082408" + }, + "homo_sapiens - paired_end - starfusion - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,19c3faa1bfa9a0cc5e4c45f17065b53a" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.818041322" + }, + "homo_sapiens - single_end - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.175307" + }, + "homo_sapiens - paired_end - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,844af19ab0fc8cd9a3f75228445aca0d" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.255481058" + }, + "homo_sapiens - paired_end - starfusion - bedgraph": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.155413" + }, + "homo_sapiens - single_end - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.144852" + }, + "homo_sapiens - paired_end - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T17:54:12.343840482" + }, + "homo_sapiens - paired_end - multiple - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,069877e053714e23010fe4e1c003b4a2" + ] + ] + ], + "timestamp": "2023-12-04T18:01:20.291692062" + }, + "homo_sapiens - single_end - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.sortedByCoord.out.bam:md5,c6cfaccaf91bc7fdabed3cfe236d4535" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.265642675" + }, + "homo_sapiens - paired_end - arriba - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.444214" + }, + "homo_sapiens - paired_end - log_progress": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Log.progress.out:md5,b2bd061d6cbaaf3d6d3b1fed547f69b8" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.126063825" + }, + "homo_sapiens - paired_end - arriba - 
log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:25:06.829799" + }, + "homo_sapiens - paired_end - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.300509" + }, + "homo_sapiens - paired_end - arriba - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.300383" + }, + "homo_sapiens - paired_end - multiple - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,ab07c21d63ab0a6c07d171d213c81d5a" + ] + ] + ], + "timestamp": "2023-12-04T18:01:19.851247126" + }, + "homo_sapiens - paired_end - multiple - fastq": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.462257" + }, + "homo_sapiens - single_end - bam_sorted": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Aligned.sortedByCoord.out.bam:md5,c6cfaccaf91bc7fdabed3cfe236d4535" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.335457371" + }, + "homo_sapiens - paired_end - arriba - bam_sorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:06.94699" + }, + "homo_sapiens - paired_end - starfusion - junction": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Chimeric.out.junction:md5,c10ef219f4a30e83711b995bc5e40dba" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.641115828" + }, + "homo_sapiens - single_end - tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,75a516ab950fb958f40b29996474949c" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.580593434" + }, + "homo_sapiens - paired_end - starfusion - versions": { + "content": [ + [ + "versions.yml:md5,2e6b6d8809f5a17f38f4d27c45dcb22f" + ] + ], + "timestamp": "2023-12-04T17:59:58.907317103" + }, + "homo_sapiens - paired_end - multiple - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.330463" + }, + "homo_sapiens - paired_end - arriba - 
log_progress": { + "content": [ + "test.Log.progress.out" + ], + "timestamp": "2023-11-23T13:25:06.86866" + }, + "homo_sapiens - paired_end - bedgraph": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.Signal.Unique.str1.out.bg:md5,d7bf8b70b436ca048a62513e1d0ece3a", + "test.Signal.UniqueMultiple.str1.out.bg:md5,686d58493b9eb445b56ace4d67f76ef6" + ] + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.064121304" + }, + "homo_sapiens - paired_end - starfusion - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.118974" + }, + "homo_sapiens - paired_end - starfusion - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.264699" + }, + "homo_sapiens - paired_end - multiple - log_progress": { + "content": [ + "test.Log.progress.out" + ], + "timestamp": "2023-11-23T13:29:01.076947" + }, + "homo_sapiens - paired_end - arriba - bam_unsorted": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:25:07.050409" + }, + "homo_sapiens - paired_end - bam_sorted": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.sortedByCoord.out.bam:md5,b9ee1c607e07323bc1652ef3babb543f" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.002180537" + }, + "homo_sapiens - single_end - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.SJ.out.tab:md5,75a516ab950fb958f40b29996474949c" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.50932751" + }, + "homo_sapiens - paired_end - starfusion - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,19c3faa1bfa9a0cc5e4c45f17065b53a" + ] + ] + ], + "timestamp": "2023-12-04T17:59:58.731699486" + }, + "homo_sapiens - single_end - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:22:55.126286" + }, + "homo_sapiens - paired_end - log_final": { + "content": [ + "test.Log.final.out" + ], + 
"timestamp": "2023-11-23T13:23:33.253884" + }, + "homo_sapiens - single_end - log_final": { + "content": [ + "test.Log.final.out" + ], + "timestamp": "2023-11-23T13:22:55.11799" + }, + "homo_sapiens - paired_end - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.287684" + }, + "homo_sapiens - paired_end - starfusion - log_progress": { + "content": [ + "test.Log.progress.out" + ], + "timestamp": "2023-11-23T13:27:55.971484" + }, + "homo_sapiens - paired_end - multiple - bam_transcript": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.264176" + }, + "homo_sapiens - paired_end - multiple - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:29:01.596406" + }, + "homo_sapiens - single_end - read_per_gene_tab": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:22:55.205936" + }, + "homo_sapiens - paired_end - junction": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.340653" + }, + "homo_sapiens - paired_end - spl_junc_tab": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.SJ.out.tab:md5,844af19ab0fc8cd9a3f75228445aca0d" + ] + ] + ], + "timestamp": "2023-12-04T17:54:12.185730856" + }, + "homo_sapiens - paired_end - starfusion - sam": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.300637" + }, + "homo_sapiens - paired_end - arriba - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.Aligned.out.bam:md5,c1b1747f5873f2d17762725636e891d5" + ] + ] + ], + "timestamp": "2023-12-04T17:56:12.190560178" + }, + "homo_sapiens - single_end - log_progress": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.Log.progress.out:md5,b2bd061d6cbaaf3d6d3b1fed547f69b8" + ] + ] + ], + "timestamp": "2023-12-04T17:53:26.450352138" + }, + "homo_sapiens - paired_end - starfusion - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:27:56.422018" + }, + 
"homo_sapiens - paired_end - wig": { + "content": [ + [ + + ] + ], + "timestamp": "2023-11-23T13:23:33.429457" + }, + "homo_sapiens - paired_end - starfusion - log_out": { + "content": [ + "test.Log.out" + ], + "timestamp": "2023-11-23T13:27:55.93945" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/align/tests/nextflow.arriba.config b/modules/nf-core/star/align/tests/nextflow.arriba.config new file mode 100644 index 000000000..2324b9e58 --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.arriba.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM Unsorted --outSAMunmapped Within --outBAMcompression 0 --outFilterMultimapNmax 50 --peOverlapNbasesMin 10 --alignSplicedMateMapLminOverLmate 0.5 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentMin 10 --chimOutType WithinBAM HardClip --chimJunctionOverhangMin 10 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 --chimSegmentReadGapMax 3 --chimMultimapNmax 50' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/nextflow.config b/modules/nf-core/star/align/tests/nextflow.config new file mode 100644 index 000000000..c4ac58088 --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM SortedByCoordinate --outWigType bedGraph --outWigStrand Unstranded' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/nextflow.starfusion.config b/modules/nf-core/star/align/tests/nextflow.starfusion.config new file 
mode 100644 index 000000000..467b64977 --- /dev/null +++ b/modules/nf-core/star/align/tests/nextflow.starfusion.config @@ -0,0 +1,14 @@ +process { + + withName: STAR_GENOMEGENERATE { + ext.args = '--genomeSAindexNbases 9' + } + + withName: STAR_ALIGN { + ext.args = '--readFilesCommand zcat --outSAMtype BAM Unsorted --outReadsUnmapped None --twopassMode Basic --outSAMstrandField intronMotif --outSAMunmapped Within --chimSegmentMin 12 --chimJunctionOverhangMin 8 --chimOutJunctionFormat 1 --alignSJDBoverhangMin 10 --alignMatesGapMax 100000 --alignIntronMax 100000 --alignSJstitchMismatchNmax 5 -1 5 5 --chimMultimapScoreRange 3 --chimScoreJunctionNonGTAG -4 --chimMultimapNmax 20 --chimNonchimScoreDropMin 10 --peOverlapNbasesMin 12 --peOverlapMMp 0.1 --alignInsertionFlush Right --alignSplicedMateMapLminOverLmate 0 --alignSplicedMateMapLmin 30' + } + +} + +// Fix chown issue for the output star folder +docker.runOptions = '--platform=linux/amd64 -u $(id -u):$(id -g)' diff --git a/modules/nf-core/star/align/tests/tags.yml b/modules/nf-core/star/align/tests/tags.yml new file mode 100644 index 000000000..8beace16e --- /dev/null +++ b/modules/nf-core/star/align/tests/tags.yml @@ -0,0 +1,2 @@ +star/align: + - modules/nf-core/star/align/** diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml new file mode 100644 index 000000000..791f255e5 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -0,0 +1,10 @@ +name: star_genomegenerate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 + - bioconda::htslib=1.18 + - bioconda::star=2.7.10a + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf new file mode 100644 index 000000000..b8855715b --- /dev/null +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -0,0 +1,119 @@ +process STAR_GENOMEGENERATE { + tag "$fasta" + label 
'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(gtf) + + output: + tuple val(meta), path("star") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def include_gtf = gtf ? "--sjdbGTFfile $gtf" : '' + if (args_list.contains('--genomeSAindexNbases')) { + """ + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + $include_gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + samtools faidx $fasta + NUM_BASES=`gawk '{sum = sum + \$2}END{if ((log(sum)/log(2))/2 - 1 > 14) {printf "%.0f", 14} else {printf "%.0f", (log(sum)/log(2))/2 - 1}}' ${fasta}.fai` + + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + $include_gtf \\ + --runThreadN $task.cpus \\ + --genomeSAindexNbases \$NUM_BASES \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; 
s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } + + stub: + if (gtf) { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/exonGeTrInfo.tab + touch star/exonInfo.tab + touch star/geneInfo.tab + touch star/genomeParameters.txt + touch star/sjdbInfo.txt + touch star/sjdbList.fromGTF.out.tab + touch star/sjdbList.out.tab + touch star/transcriptInfo.tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/genomeParameters.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml new file mode 100644 index 000000000..1061e1b8d --- /dev/null +++ b/modules/nf-core/star/genomegenerate/meta.yml @@ -0,0 +1,53 @@ +name: star_genomegenerate +description: Create index for STAR +keywords: + - index + - fasta + - genome + - reference +tools: + - star: + description: | + STAR is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. 
+ homepage: https://github.com/alexdobin/STAR + manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf + doi: 10.1093/bioinformatics/bts635 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file of the reference genome + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - gtf: + type: file + description: GTF file of the reference genome +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: directory + description: Folder containing the star index files + pattern: "star" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" + - "@drpatelh" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test b/modules/nf-core/star/genomegenerate/tests/main.nf.test new file mode 100644 index 000000000..c17c8ba45 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test @@ -0,0 +1,115 @@ +nextflow_process { + + name "Test Process STAR_GENOMEGENERATE" + script "../main.nf" + process "STAR_GENOMEGENERATE" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/genomegenerate" + + test("fasta_gtf") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() 
}.sort().toString()).match("fasta_gtf_index") }, + { assert snapshot(process.out.versions).match("fasta_gtf_versions") } + ) + } + } + + test("fasta_gtf_stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_gtf_stub_index") }, + { assert snapshot(process.out.versions).match("fasta_gtf_stub_versions") } + ) + } + } + + test("fasta") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_index") }, + { assert snapshot(process.out.versions).match("fasta_versions") } + ) + } + + } + + test("fasta_stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_stub_index") }, + { assert snapshot(process.out.versions).match("fasta_stub_versions") } + ) + } + + } + +} diff --git 
a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap new file mode 100644 index 000000000..5653d6e6c --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap @@ -0,0 +1,90 @@ +{ + "fasta_gtf_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:31.798555" + }, + "fasta_stub_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:55:07.521209" + }, + "fasta_gtf_stub_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:46.478098" + }, + "fasta_gtf_stub_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:46.491657" + }, + "fasta_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:57.552329" + }, + "fasta_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:57.560541" + }, + "fasta_gtf_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, 
geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:31.786814" + }, + "fasta_stub_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:55:07.517472" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/tests/tags.yml b/modules/nf-core/star/genomegenerate/tests/tags.yml new file mode 100644 index 000000000..79f619bfe --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/tags.yml @@ -0,0 +1,2 @@ +star/genomegenerate: + - modules/nf-core/star/genomegenerate/** diff --git a/modules/nf-core/stringtie/stringtie/environment.yml b/modules/nf-core/stringtie/stringtie/environment.yml new file mode 100644 index 000000000..7a0eccdb8 --- /dev/null +++ b/modules/nf-core/stringtie/stringtie/environment.yml @@ -0,0 +1,7 @@ +name: stringtie_stringtie +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::stringtie=2.2.1 diff --git a/modules/nf-core/stringtie/stringtie/main.nf b/modules/nf-core/stringtie/stringtie/main.nf new file mode 100644 index 000000000..6e25ba27d --- /dev/null +++ b/modules/nf-core/stringtie/stringtie/main.nf @@ -0,0 +1,68 @@ +process STRINGTIE_STRINGTIE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/stringtie:2.2.1--hecb563c_2' : + 'biocontainers/stringtie:2.2.1--hecb563c_2' }" + + input: + tuple val(meta), path(bam) + path annotation_gtf + + output: + tuple val(meta), path("*.transcripts.gtf"), emit: transcript_gtf + tuple val(meta), path("*.abundance.txt") , emit: abundance + tuple val(meta), path("*.coverage.gtf") , optional: true, emit: coverage_gtf + tuple val(meta), path("*.ballgown") , optional: true, emit: ballgown + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = annotation_gtf ? "-G $annotation_gtf" : "" + def ballgown = annotation_gtf ? "-b ${prefix}.ballgown" : "" + def coverage = annotation_gtf ? "-C ${prefix}.coverage.gtf" : "" + + def strandedness = '' + if (meta.strandedness == 'forward') { + strandedness = '--fr' + } else if (meta.strandedness == 'reverse') { + strandedness = '--rf' + } + """ + stringtie \\ + $bam \\ + $strandedness \\ + $reference \\ + -o ${prefix}.transcripts.gtf \\ + -A ${prefix}.gene.abundance.txt \\ + $coverage \\ + $ballgown \\ + -p $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stringtie: \$(stringtie --version 2>&1) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.transcripts.gtf + touch ${prefix}.gene.abundance.txt + touch ${prefix}.coverage.gtf + touch ${prefix}.ballgown + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + stringtie: \$(stringtie --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/stringtie/stringtie/meta.yml b/modules/nf-core/stringtie/stringtie/meta.yml new file mode 100644 index 000000000..d8ebdd88a --- /dev/null +++ b/modules/nf-core/stringtie/stringtie/meta.yml @@ -0,0 +1,58 @@ +name: stringtie_stringtie +description: Transcript assembly and quantification for RNA-Se +keywords: + - 
transcript + - assembly + - quantification + - gtf +tools: + - stringtie2: + description: | + Transcript assembly and quantification for RNA-Seq + homepage: https://ccb.jhu.edu/software/stringtie/index.shtml + documentation: https://ccb.jhu.edu/software/stringtie/index.shtml?t=manual + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + Stringtie transcript gtf output(s). + - annotation_gtf: + type: file + description: | + Annotation gtf file (optional). +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - transcript_gtf: + type: file + description: transcript gtf + pattern: "*.{transcripts.gtf}" + - coverage_gtf: + type: file + description: coverage gtf + pattern: "*.{coverage.gtf}" + - abudance: + type: file + description: abudance + pattern: "*.{abudance.txt}" + - ballgown: + type: file + description: for running ballgown + pattern: "*.{ballgown}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/stringtie/stringtie/tests/main.nf.test b/modules/nf-core/stringtie/stringtie/tests/main.nf.test new file mode 100644 index 000000000..00efe8f1a --- /dev/null +++ b/modules/nf-core/stringtie/stringtie/tests/main.nf.test @@ -0,0 +1,116 @@ +nextflow_process { + + name "Test Process STRINGTIE_STRINGTIE" + script "../main.nf" + process "STRINGTIE_STRINGTIE" + tag "modules" + tag "modules_nfcore" + tag "stringtie" + tag "stringtie/stringtie" + + test("sarscov2 [bam] - forward strandedness") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test', strandedness:'forward' ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam", 
checkIfExists: true) ] + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.transcript_gtf).match("fs_transcript_gtf") }, + { assert snapshot(process.out.abundance).match("fs_abundance") }, + { assert snapshot(process.out.versions).match("fs_versions") } + ) + } + } + + test("sarscov2 [bam] - forward strandedness + reference annotation") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test', strandedness:'forward' ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam", checkIfExists: true) ] + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.transcript_gtf).match("fs_gtf_transcript_gtf") }, + { assert snapshot(process.out.abundance).match("fs_gtf_abundance") }, + { assert snapshot(process.out.ballgown).match("fs_gtf_ballgown") }, + { assert snapshot(process.out.versions).match("fs_gtf_versions") } + ) + } + } + + test("sarscov2 [bam] - reverse strandedness") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test', strandedness:'reverse' ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam", checkIfExists: true) ] + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.transcript_gtf).match("rs_transcript_gtf") }, + { assert snapshot(process.out.abundance).match("rs_abundance") }, + { assert snapshot(process.out.versions).match("rs_versions") } + ) + } + } + + test("sarscov2 [bam] - reverse strandedness + reference annotation") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test', strandedness:'reverse' ], // meta map + [ 
file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam", checkIfExists: true) ] + ] + input[1] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.transcript_gtf).match("rs_gtf_transcript_gtf") }, + { assert snapshot(process.out.abundance).match("rs_gtf_abundance") }, + { assert snapshot(process.out.ballgown).match("rs_gtf_ballgown") }, + { assert snapshot(process.out.versions).match("rs_gtf_versions") } + ) + } + } +} diff --git a/modules/nf-core/stringtie/stringtie/tests/main.nf.test.snap b/modules/nf-core/stringtie/stringtie/tests/main.nf.test.snap new file mode 100644 index 000000000..bf7516364 --- /dev/null +++ b/modules/nf-core/stringtie/stringtie/tests/main.nf.test.snap @@ -0,0 +1,186 @@ +{ + "fs_abundance": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.gene.abundance.txt:md5,d6f5c8cadb8458f1df0427cf790246e3" + ] + ] + ], + "timestamp": "2023-11-23T13:55:41.032044613" + }, + "fs_transcript_gtf": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.transcripts.gtf:md5,569137af5be452413086b50653a97203" + ] + ] + ], + "timestamp": "2023-11-23T13:55:41.017978904" + }, + "rs_abundance": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "reverse" + }, + "test.gene.abundance.txt:md5,d6f5c8cadb8458f1df0427cf790246e3" + ] + ] + ], + "timestamp": "2023-11-23T13:56:13.601112933" + }, + "fs_gtf_versions": { + "content": [ + [ + "versions.yml:md5,3410e8ac349d18c85ddee89337851d38" + ] + ], + "timestamp": "2023-11-23T13:56:00.523797974" + }, + "fs_gtf_transcript_gtf": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.transcripts.gtf:md5,f56cf8aba2c4a5673bc7963ba5f12d04" + ] + ] + ], + "timestamp": "2023-11-23T13:56:00.475164879" + }, + "rs_versions": { + 
"content": [ + [ + "versions.yml:md5,3410e8ac349d18c85ddee89337851d38" + ] + ], + "timestamp": "2023-11-23T13:56:13.623892691" + }, + "rs_gtf_transcript_gtf": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "reverse" + }, + "test.transcripts.gtf:md5,bb346053a8c156b803b055133376c7fa" + ] + ] + ], + "timestamp": "2023-11-23T13:56:22.693599559" + }, + "fs_gtf_abundance": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "forward" + }, + "test.gene.abundance.txt:md5,7d8bce7f2a922e367cedccae7267c22e" + ] + ] + ], + "timestamp": "2023-11-23T13:56:00.482135418" + }, + "rs_gtf_ballgown": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "reverse" + }, + [ + "e2t.ctab:md5,e981c0038295ae54b63cedb1083f1540", + "e_data.ctab:md5,879b6696029d19c4737b562e9d149218", + "i2t.ctab:md5,8a117c8aa4334b4c2d4711932b006fb4", + "i_data.ctab:md5,be3abe09740603213f83d50dcf81427f", + "t_data.ctab:md5,3b66c065da73ae0dd41cc332eff6a818" + ] + ] + ] + ], + "timestamp": "2023-11-23T13:56:22.715698347" + }, + "rs_transcript_gtf": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "reverse" + }, + "test.transcripts.gtf:md5,31c34aec2bf36bb0ea3c16c2afeeeb1f" + ] + ] + ], + "timestamp": "2023-11-23T13:56:13.590054035" + }, + "rs_gtf_versions": { + "content": [ + [ + "versions.yml:md5,3410e8ac349d18c85ddee89337851d38" + ] + ], + "timestamp": "2023-11-23T13:56:22.725513476" + }, + "fs_gtf_ballgown": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "forward" + }, + [ + "e2t.ctab:md5,e981c0038295ae54b63cedb1083f1540", + "e_data.ctab:md5,6b4cf69bc03f3f69890f972a0e8b7471", + "i2t.ctab:md5,8a117c8aa4334b4c2d4711932b006fb4", + "i_data.ctab:md5,be3abe09740603213f83d50dcf81427f", + "t_data.ctab:md5,3b66c065da73ae0dd41cc332eff6a818" + ] + ] + ] + ], + "timestamp": "2023-11-23T13:56:00.494299817" + }, + "fs_versions": { + "content": [ + [ + "versions.yml:md5,3410e8ac349d18c85ddee89337851d38" + ] + ], + "timestamp": "2023-11-23T13:55:41.049417582" + 
}, + "rs_gtf_abundance": { + "content": [ + [ + [ + { + "id": "test", + "strandedness": "reverse" + }, + "test.gene.abundance.txt:md5,7385b870b955dae2c2ab78a70cf05cce" + ] + ] + ], + "timestamp": "2023-11-23T13:56:22.701059059" + } +} diff --git a/modules/nf-core/stringtie/stringtie/tests/nextflow.config b/modules/nf-core/stringtie/stringtie/tests/nextflow.config new file mode 100644 index 000000000..e3aaa0999 --- /dev/null +++ b/modules/nf-core/stringtie/stringtie/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'STRINGTIE_STRINGTIE' { + ext.args = '' + } +} diff --git a/modules/nf-core/stringtie/stringtie/tests/tags.yml b/modules/nf-core/stringtie/stringtie/tests/tags.yml new file mode 100644 index 000000000..da9b051c3 --- /dev/null +++ b/modules/nf-core/stringtie/stringtie/tests/tags.yml @@ -0,0 +1,2 @@ +stringtie/stringtie: + - modules/nf-core/stringtie/stringtie/** diff --git a/modules/nf-core/trimgalore/environment.yml b/modules/nf-core/trimgalore/environment.yml new file mode 100644 index 000000000..0981320c1 --- /dev/null +++ b/modules/nf-core/trimgalore/environment.yml @@ -0,0 +1,10 @@ +name: trimgalore + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - bioconda::cutadapt=3.4 + - bioconda::trim-galore=0.6.7 diff --git a/modules/nf-core/trimgalore/main.nf b/modules/nf-core/trimgalore/main.nf new file mode 100644 index 000000000..24ead8714 --- /dev/null +++ b/modules/nf-core/trimgalore/main.nf @@ -0,0 +1,75 @@ +process TRIMGALORE { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/trim-galore:0.6.7--hdfd78af_0' : + 'biocontainers/trim-galore:0.6.7--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*{3prime,5prime,trimmed,val}*.fq.gz"), emit: reads + tuple val(meta), path("*report.txt") , emit: log , optional: true + tuple val(meta), path("*unpaired*.fq.gz") , emit: unpaired, optional: true + tuple val(meta), path("*.html") , emit: html , optional: true + tuple val(meta), path("*.zip") , emit: zip , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Calculate number of --cores for TrimGalore based on value of task.cpus + // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 + // See: https://github.com/nf-core/atacseq/pull/65 + def cores = 1 + if (task.cpus) { + cores = (task.cpus as int) - 4 + if (meta.single_end) cores = (task.cpus as int) - 3 + if (cores < 1) cores = 1 + if (cores > 8) cores = 8 + } + + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def args_list = args.split("\\s(?=--)").toList() + args_list.removeAll { it.toLowerCase().contains('_r2 ') } + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + trim_galore \\ + ${args_list.join(' ')} \\ + --cores $cores \\ + --gzip \\ + ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ + } else { + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + trim_galore \\ + $args \\ + --cores $cores \\ + --paired \\ + --gzip \\ + ${prefix}_1.fastq.gz \\ + ${prefix}_2.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/trimgalore/meta.yml b/modules/nf-core/trimgalore/meta.yml new file mode 100644 index 000000000..e649088ce --- /dev/null +++ b/modules/nf-core/trimgalore/meta.yml @@ -0,0 +1,68 @@ +name: trimgalore +description: Trim FastQ files using Trim Galore! +keywords: + - trimming + - adapters + - sequencing adapters + - fastq +tools: + - trimgalore: + description: | + A wrapper tool around Cutadapt and FastQC to consistently apply quality + and adapter trimming to FastQ files, with some extra functionality for + MspI-digested RRBS-type (Reduced Representation Bisufite-Seq) libraries. + homepage: https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ + documentation: https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input adapter trimmed FastQ files of size 1 and 2 for + single-end and paired-end data, respectively. 
+ pattern: "*{3prime,5prime,trimmed,val}*.fq.gz" + - unpaired: + type: file + description: | + FastQ files containing unpaired reads from read 1 or read 2 + pattern: "*unpaired*.fq.gz" + - html: + type: file + description: FastQC report (optional) + pattern: "*_{fastqc.html}" + - zip: + type: file + description: FastQC report archive (optional) + pattern: "*_{fastqc.zip}" + - log: + type: file + description: Trim Galore! trimming report + pattern: "*_{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/trimgalore/tests/main.nf.test b/modules/nf-core/trimgalore/tests/main.nf.test new file mode 100644 index 000000000..43904ac32 --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test @@ -0,0 +1,103 @@ +nextflow_process { + + name "Test Process TRIMGALORE" + script "../main.nf" + process "TRIMGALORE" + tag "modules" + tag "modules_nfcore" + tag "trimgalore" + + test("test_trimgalore_single_end") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true) ] + ] + """ + } + } + + then { + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1)).getText().contains(report1_line) } + } + }, + { assert 
snapshot(process.out.versions).match() } + ) + } + } + + test("test_trimgalore_paired_end") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) + ] + ] + """ + } + } + + then { + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1).get(0)).getText().contains(report1_line) } + } + }, + { report2_lines.each { report2_line -> + { assert path(process.out.log.get(0).get(1).get(1)).getText().contains(report2_line) } + } + }, + { assert snapshot(process.out.versions).match() } + ) + } + } +} diff --git a/modules/nf-core/trimgalore/tests/main.nf.test.snap b/modules/nf-core/trimgalore/tests/main.nf.test.snap new file mode 100644 index 000000000..082c55004 --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test.snap @@ -0,0 +1,26 @@ +{ + "test_trimgalore_single_end": { + "content": [ + [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-29T16:33:20.401347" + }, + "test_trimgalore_paired_end": { + "content": [ + [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ] + ], + 
"meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-29T16:33:28.960497" + } +} \ No newline at end of file diff --git a/modules/nf-core/trimgalore/tests/tags.yml b/modules/nf-core/trimgalore/tests/tags.yml new file mode 100644 index 000000000..e9937691a --- /dev/null +++ b/modules/nf-core/trimgalore/tests/tags.yml @@ -0,0 +1,2 @@ +trimgalore: + - modules/nf-core/trimgalore/** diff --git a/modules/nf-core/tximeta/tximport/environment.yml b/modules/nf-core/tximeta/tximport/environment.yml new file mode 100644 index 000000000..24b202222 --- /dev/null +++ b/modules/nf-core/tximeta/tximport/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "tximeta_tximport" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::bioconductor-tximeta=1.20.1" diff --git a/modules/nf-core/tximeta/tximport/main.nf b/modules/nf-core/tximeta/tximport/main.nf new file mode 100644 index 000000000..b0cce8536 --- /dev/null +++ b/modules/nf-core/tximeta/tximport/main.nf @@ -0,0 +1,47 @@ +process TXIMETA_TXIMPORT { + label "process_medium" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bioconductor-tximeta%3A1.20.1--r43hdfd78af_0' : + 'biocontainers/bioconductor-tximeta:1.20.1--r43hdfd78af_0' }" + + input: + tuple val(meta), path("quants/*") + tuple val(meta2), path(tx2gene) + val quant_type + + output: + tuple val(meta), path("*gene_tpm.tsv") , emit: tpm_gene + tuple val(meta), path("*gene_counts.tsv") , emit: counts_gene + tuple val(meta), path("*gene_counts_length_scaled.tsv"), emit: counts_gene_length_scaled + tuple val(meta), path("*gene_counts_scaled.tsv") , emit: counts_gene_scaled + tuple val(meta), path("*gene_lengths.tsv") , emit: lengths_gene + tuple val(meta), path("*transcript_tpm.tsv") , emit: tpm_transcript + tuple val(meta), path("*transcript_counts.tsv") , emit: counts_transcript + tuple val(meta), path("*transcript_lengths.tsv") , emit: lengths_transcript + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'tximport.r' + + stub: + """ + touch ${meta.id}.gene_tpm.tsv + touch ${meta.id}.gene_counts.tsv + touch ${meta.id}.gene_counts_length_scaled.tsv + touch ${meta.id}.gene_counts_scaled.tsv + touch ${meta.id}.gene_lengths.tsv + touch ${meta.id}.transcript_tpm.tsv + touch ${meta.id}.transcript_counts.tsv + touch ${meta.id}.transcript_lengths.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-tximeta: \$(Rscript -e "library(tximeta); cat(as.character(packageVersion('tximeta')))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/tximeta/tximport/meta.yml b/modules/nf-core/tximeta/tximport/meta.yml new file mode 100644 index 000000000..9ee5fd365 --- /dev/null +++ b/modules/nf-core/tximeta/tximport/meta.yml @@ -0,0 +1,120 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "tximeta_tximport" +description: | + Import transcript-level abundances and estimated counts for gene-level + analysis packages +keywords: + - gene + 
- kallisto + - pseudoalignment + - salmon + - transcript +tools: + - "tximeta": + description: "Transcript Quantification Import with Automatic Metadata" + homepage: "https://bioconductor.org/packages/release/bioc/html/tximeta.html" + documentation: "https://bioconductor.org/packages/release/bioc/vignettes/tximeta/inst/doc/tximeta.html" + tool_dev_url: "https://github.com/thelovelab/tximeta" + doi: "10.1371/journal.pcbi.1007664" + licence: ["GPL-2"] + +input: + - meta: + type: map + description: | + Groovy Map containing information related to the experiment as a whole + e.g. `[ id:'SRP123456' ]` + - quants: + type: directory + description: Paths to subdirectories corresponding to + sample-wise runs of Salmon or Kallisto + - meta2: + type: map + description: | + Groovy Map containing reference information related to the species + reference e.g. `[ id:'yeast' ]` + - tx2gene: + type: file + description: A transcript to gene mapping table such as those generated + by custom/tx2gene + pattern: "*.{csv,tsv}" + - meta3: + type: map + description: | + Groovy Map containing information related to the experiment as a whole + e.g. `[ id:'SRP123456' ]` + - coldata: + type: file + description: | + Optional 'coldata' file equivalent to a sample sheet where the first + column corresponds to the sample names (directory names in the input + salmon/ kallisto results) + pattern: "*.{csv,tsv}" + - quant_type: + type: string + description: Quantification type, 'kallisto' or 'salmon' + +output: + - meta: + type: map + description: | + Groovy Map containing information related to the experiment as a whole + e.g. 
`[ id:'SRP123456' ]` + - tpm_gene: + type: file + description: | + Abundance (TPM) values derived from tximport output after + summarizeToGene(), without a 'countsFromAbundance' specification + pattern: "*gene_tpm.tsv" + - counts_gene: + type: file + description: | + Count values derived from tximport output after + summarizeToGene(), without a 'countsFromAbundance' specification + pattern: "*gene_counts.tsv" + - counts_gene_length_scaled: + type: file + description: | + Count values derived from tximport output after summarizeToGene(), with + a 'countsFromAbundance' specification of 'lengthScaledTPM' + pattern: "*gene_counts_length_scaled.tsv" + - counts_gene_scaled: + type: file + description: | + Count values derived from tximport output after summarizeToGene(), with + a 'countsFromAbundance' specification of 'scaledTPM' + pattern: "*gene_counts_scaled.tsv" + - lengths_gene: + type: file + description: | + Length values derived from tximport output after summarizeToGene(), + without a 'countsFromAbundance' specification + pattern: "*gene_lengths.tsv" + - tpm_transcript: + type: file + description: | + Abundance (TPM) values derived from tximport output without + summarizeToGene(), without a 'countsFromAbundance' specification + pattern: "*transcript_tpm.tsv" + - counts_transcript: + type: file + description: | + Count values derived from tximport output without + summarizeToGene(), without a 'countsFromAbundance' specification + pattern: "*transcript_counts.tsv" + - lengths_transcript: + type: file + description: | + Length values derived from tximport output without summarizeToGene(), + without a 'countsFromAbundance' specification + pattern: "*gene_lengths.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@pinin4fjords" +maintainers: + - "@pinin4fjords" diff --git a/modules/nf-core/tximeta/tximport/templates/tximport.r b/modules/nf-core/tximeta/tximport/templates/tximport.r new file mode 
100755 index 000000000..40d79eb93 --- /dev/null +++ b/modules/nf-core/tximeta/tximport/templates/tximport.r @@ -0,0 +1,218 @@ +#!/usr/bin/env Rscript --vanilla + +# Script for importing and processing transcript-level quantifications. +# Written by Lorena Pantano, later modified by Jonathan Manning, and released +# under the MIT license. + +# Loading required libraries +library(SummarizedExperiment) +library(tximport) + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Build a table from a SummarizedExperiment object +#' +#' This function takes a SummarizedExperiment object and a specific slot name to extract +#' assay data. It then combines the first two columns of the rowData with the specified +#' assay data slot into a new data table. +#' +#' @param se.obj A SummarizedExperiment object from which to build the table. +#' @param slot The name of the slot in the assays list from which to extract data. +#' +#' @return A data frame combining the first two columns of the rowData with the assay data from the specified slot. + +build_table <- function(se.obj, slot) { + cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) +} + +#' Write a table to a file from a SummarizedExperiment object with given parameters +#' +#' This function generates a table from a SummarizedExperiment object using specified parameters +#' and writes the resulting table to a file. The file name is constructed using a prefix and a +#' suffix from the parameters, and the table is written with tab separation, without quoting text, +#' and without row names. +#' +#' @param params A list containing the parameters needed for file generation and table writing. +#' The list should include: +#' - `obj`: A SummarizedExperiment object from which to build the table. 
+#' - `slot`: The name of the slot in the assays list from which to extract data. +#' - `suffix`: Suffix to use for generating the file name. +#' +#' @return NULL The function is called for its side effect of writing a file and does not return anything. + +write_se_table <- function(params, prefix) { + file_name <- paste0(prefix, ".", params\$suffix) + write.table(build_table(params\$obj, params\$slot), file_name, + sep="\t", quote=FALSE, row.names = FALSE) +} + +#' Read Transcript Metadata from a Given Path +#' +#' This function reads transcript metadata from a specified file path. The file is expected to +#' be a tab-separated values file without headers, containing transcript information. The function +#' checks if the file is empty and stops execution with an error message if so. It reads the file +#' into a data frame, expecting columns for transcript IDs, gene IDs, and gene names. Additional +#' processing is done to ensure compatibility with a predefined data structure (e.g., `txi[[1]]`), +#' including adding missing entries and reordering based on the transcript IDs found in `txi[[1]]`. +#' +#' @param tinfo_path The file path to the transcript information file. +#' +#' @return A list containing three elements: +#' - `transcript`: A data frame with transcript IDs, gene IDs, and gene names, indexed by transcript IDs. +#' - `gene`: A data frame with unique gene IDs and gene names. +#' - `tx2gene`: A data frame mapping transcript IDs to gene IDs. 
+ +read_transcript_info <- function(tinfo_path){ + info <- file.info(tinfo_path) + if (info\$size == 0) { + stop("tx2gene file is empty") + } + + transcript_info <- read.csv(tinfo_path, sep="\t", header = TRUE, + col.names = c("tx", "gene_id", "gene_name")) + + extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) + transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) + transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] + rownames(transcript_info) <- transcript_info[["tx"]] + + list(transcript = transcript_info, + gene = unique(transcript_info[,2:3]), + tx2gene = transcript_info[,1:2]) +} + +#' Create a SummarizedExperiment Object +#' +#' Constructs a SummarizedExperiment object using provided matrices for counts, abundance, and length, +#' along with metadata for columns and rows. This function facilitates the organization of experimental +#' data (e.g., RNA-seq or other high-throughput data) in a structured format that is convenient for +#' further analyses and visualization. +#' +#' @param counts A matrix or DataFrame containing counts data, with rows as features (e.g., genes) and +#' columns as samples. +#' @param abundance A matrix or DataFrame containing abundance data (e.g., TPM or FPKM) with the same +#' dimensions and row/column names as the counts data. +#' @param length A matrix or DataFrame containing feature lengths, matching the dimensions and row/column +#' names of the counts data. +#' @param col_data A DataFrame containing sample-level metadata, with rows corresponding to columns in the +#' counts, abundance, and length matrices. +#' @param row_data A DataFrame containing feature-level metadata, with rows corresponding to features in +#' the counts, abundance, and length matrices. +#' +#' @return A SummarizedExperiment object containing the supplied data and metadata. 
+ +create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { + SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), + colData = col_data, + rowData = row_data) +} + +################################################ +################################################ +## Main script starts here ## +################################################ +################################################ + +# Define pattern for file names based on quantification type +pattern <- ifelse('$quant_type' == "kallisto", "abundance.tsv", "quant.sf") +fns <- list.files('quants', pattern = pattern, recursive = T, full.names = T) +names <- basename(dirname(fns)) +names(fns) <- names +dropInfReps <- '$quant_type' == "kallisto" + +# Import transcript-level quantifications +txi <- tximport(fns, type = '$quant_type', txOut = TRUE, dropInfReps = dropInfReps) + +# Read transcript and sample data +transcript_info <- read_transcript_info('$tx2gene') + +# Make coldata just to appease the summarizedexperiment +coldata <- data.frame(files = fns, names = names) +rownames(coldata) <- coldata[["names"]] + +# Create initial SummarizedExperiment object +se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], + DataFrame(coldata), transcript_info\$transcript) + +# Setting parameters for writing tables +params <- list( + list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), + list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), + list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") +) + +# Process gene-level data if tx2gene mapping is available +if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info\$tx2gene)) { + tx2gene <- transcript_info\$tx2gene + gi <- summarizeToGene(txi, tx2gene = tx2gene) + gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") + gi.s <- summarizeToGene(txi, tx2gene = tx2gene, 
countsFromAbundance = "scaledTPM") + + gene_info <- transcript_info\$gene[match(rownames(gi[[1]]), transcript_info\$gene[["gene_id"]]),] + rownames(gene_info) <- gene_info[["tx"]] + + col_data_frame <- DataFrame(coldata) + + # Create gene-level SummarizedExperiment objects + gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], + col_data_frame, gene_info) + gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], + col_data_frame, gene_info) + gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], + col_data_frame, gene_info) + + params <- c(params, list( + list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), + list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), + list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"), + list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"), + list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv") + )) +} + +# Writing tables for each set of parameters + +prefix <- '' +if ('$task.ext.prefix' != 'null'){ + prefix = '$task.ext.prefix' +} else if ('$meta.id' != 'null'){ + prefix = '$meta.id' +} + +done <- lapply(params, write_se_table, prefix) + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink(paste(prefix, "R_sessionInfo.log", sep = '.')) +citation("tximeta") +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +tximeta.version <- as.character(packageVersion('tximeta')) + +writeLines( + c( + '"${task.process}":', + 
paste(' bioconductor-tximeta:', tximeta.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/nf-core/tximeta/tximport/tests/main.nf.test b/modules/nf-core/tximeta/tximport/tests/main.nf.test new file mode 100644 index 000000000..5cf6af83e --- /dev/null +++ b/modules/nf-core/tximeta/tximport/tests/main.nf.test @@ -0,0 +1,193 @@ +nextflow_process { + + name "Test Process TXIMETA_TXIMPORT" + script "../main.nf" + process "TXIMETA_TXIMPORT" + + tag "modules" + tag "modules_nfcore" + tag "custom/tx2gene" + tag "tximeta" + tag "tximeta/tximport" + tag "untar" + + test("saccharomyces_cerevisiae - kallisto - gtf") { + + setup { + + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/kallisto_results.tar.gz', checkIfExists: true) + ]) + """ + } + } + run("CUSTOM_TX2GENE") { + script "../../../custom/tx2gene/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true) + ]) + input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[2] = 'kallisto' + input[3] = 'gene_id' + input[4] = 'gene_name' + """ + } + } + } + + when { + process { + """ + input[0] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[1] = CUSTOM_TX2GENE.out.tx2gene + input[2] = 'kallisto' + """ + } + } + + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts_gene).match('counts_gene_kallisto') }, + { assert snapshot(process.out.counts_gene_length_scaled).match('counts_gene_length_scaled_kallisto') }, + { 
assert snapshot(process.out.counts_gene_scaled).match('counts_gene_scaled_kallisto') }, + { assert snapshot(process.out.counts_transcript).match('counts_transcript_kallisto') }, + { assert snapshot(process.out.lengths_gene).match('lengths_gene_kallisto') }, + { assert snapshot(process.out.lengths_transcript).match('lengths_transcript_kallisto') }, + { assert snapshot(process.out.tpm_gene).match('tpm_gene_kallisto') }, + { assert snapshot(process.out.tpm_transcript).match('tpm_transcript_kallisto') }, + { assert snapshot(process.out.versions).match('versions_kallisto') } + ) + } + } + + test("saccharomyces_cerevisiae - kallisto - gtf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ [], [] ]) + input[1] = Channel.of([ [], [] ]) + input[2] = 'kallisto' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts_gene).match('counts_gene_kallisto - stub') }, + { assert snapshot(process.out.counts_gene_length_scaled).match('counts_gene_length_scaled_kallisto - stub') }, + { assert snapshot(process.out.counts_gene_scaled).match('counts_gene_scaled_kallisto - stub') }, + { assert snapshot(process.out.counts_transcript).match('counts_transcript_kallisto - stub') }, + { assert snapshot(process.out.lengths_gene).match('lengths_gene_kallisto - stub') }, + { assert snapshot(process.out.lengths_transcript).match('lengths_transcript_kallisto - stub') }, + { assert snapshot(process.out.tpm_gene).match('tpm_gene_kallisto - stub') }, + { assert snapshot(process.out.tpm_transcript).match('tpm_transcript_kallisto - stub') }, + { assert snapshot(process.out.versions).match('versions_kallisto - stub') } + ) + } + + } + test("saccharomyces_cerevisiae - salmon - gtf") { + + setup { + + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 
'genomics/eukaryotes/saccharomyces_cerevisiae/salmon_results.tar.gz', checkIfExists: true) + ]) + """ + } + } + run("CUSTOM_TX2GENE") { + script "../../../custom/tx2gene/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/saccharomyces_cerevisiae/genome_gfp.gtf', checkIfExists: true) + ]) + input[1] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[2] = 'salmon' + input[3] = 'gene_id' + input[4] = 'gene_name' + """ + } + } + } + + when { + process { + """ + input[0] = UNTAR.out.untar.map { meta, dir -> [ meta, dir.listFiles().collect() ] } + input[1] = CUSTOM_TX2GENE.out.tx2gene + input[2] = 'salmon' + """ + } + } + + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts_gene).match('counts_gene_salmon') }, + { assert snapshot(process.out.counts_gene_length_scaled).match('counts_gene_length_scaled_salmon') }, + { assert snapshot(process.out.counts_gene_scaled).match('counts_gene_scaled_salmon') }, + { assert snapshot(process.out.counts_transcript).match('counts_transcript_salmon') }, + { assert snapshot(process.out.lengths_gene).match('lengths_gene_salmon') }, + { assert snapshot(process.out.lengths_transcript).match('lengths_transcript_salmon') }, + { assert snapshot(process.out.tpm_gene).match('tpm_gene_salmon') }, + { assert snapshot(process.out.tpm_transcript).match('tpm_transcript_salmon') }, + { assert snapshot(process.out.versions).match('versions_salmon') } + ) + } + + } + + test("saccharomyces_cerevisiae - salmon - gtf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ [], [] ]) + input[1] = Channel.of([ [], [] ]) + input[2] = 'salmon' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts_gene).match('counts_gene_salmon - stub') }, + { assert 
snapshot(process.out.counts_gene_length_scaled).match('counts_gene_length_scaled_salmon - stub') }, + { assert snapshot(process.out.counts_gene_scaled).match('counts_gene_scaled_salmon - stub') }, + { assert snapshot(process.out.counts_transcript).match('counts_transcript_salmon - stub') }, + { assert snapshot(process.out.lengths_gene).match('lengths_gene_salmon - stub') }, + { assert snapshot(process.out.lengths_transcript).match('lengths_transcript_salmon - stub') }, + { assert snapshot(process.out.tpm_gene).match('tpm_gene_salmon - stub') }, + { assert snapshot(process.out.tpm_transcript).match('tpm_transcript_salmon - stub') }, + { assert snapshot(process.out.versions).match('versions_salmon - stub') } + ) + } + } +} + diff --git a/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap b/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap new file mode 100644 index 000000000..3cd0ee9e4 --- /dev/null +++ b/modules/nf-core/tximeta/tximport/tests/main.nf.test.snap @@ -0,0 +1,594 @@ +{ + "tpm_transcript_salmon - stub": { + "content": [ + [ + [ + [ + + ], + "[].transcript_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.683744" + }, + "lengths_gene_kallisto - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_lengths.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.126128" + }, + "counts_gene_scaled_salmon - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_counts_scaled.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.654405" + }, + "counts_gene_kallisto - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + 
"timestamp": "2024-05-28T12:35:16.112898" + }, + "lengths_transcript_salmon - stub": { + "content": [ + [ + [ + [ + + ], + "[].transcript_lengths.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.67148" + }, + "versions_salmon - stub": { + "content": [ + [ + "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.690592" + }, + "counts_gene_length_scaled_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_counts_length_scaled.tsv:md5,4944841ac711124d29673b6b6ed16ef3" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.621599" + }, + "lengths_transcript_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.transcript_lengths.tsv:md5,db6d8ab9f8e1123d5984fd534b4347dc" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.876208" + }, + "counts_transcript_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.transcript_counts.tsv:md5,42e0106e75fa97c1c684c6d9060f1724" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.62725" + }, + "counts_transcript_kallisto - stub": { + "content": [ + [ + [ + [ + + ], + "[].transcript_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.122852" + }, + "counts_transcript_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.transcript_counts.tsv:md5,ff0f5be09ca7a322672c0074ba35da17" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.866731" + }, + "lengths_gene_salmon - stub": { + "content": [ + [ + [ + [ + + ], + 
"[].gene_lengths.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.6654" + }, + "tpm_gene_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_tpm.tsv:md5,6076364cc78741a4f8bc8935a045d13d" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.881193" + }, + "tpm_transcript_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.transcript_tpm.tsv:md5,7a334b565e1e865efb1caf615f194ef7" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.886363" + }, + "tpm_gene_salmon - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.677538" + }, + "lengths_transcript_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.transcript_lengths.tsv:md5,f974b52840431a5dae57bcb615badbf1" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.632822" + }, + "counts_gene_length_scaled_kallisto - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_counts_length_scaled.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.11652" + }, + "tpm_gene_kallisto - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.133742" + }, + "counts_transcript_salmon - stub": { + "content": [ + [ + [ + [ + + ], + "[].transcript_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": 
"2024-05-28T12:35:50.660144" + }, + "counts_gene_scaled_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_counts_scaled.tsv:md5,39d14e361434978b3cadae901a26a028" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.624732" + }, + "counts_gene_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_counts.tsv:md5,c14cab7e15cfac73ec0602dc2c404551" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.852188" + }, + "versions_salmon": { + "content": [ + [ + "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.892224" + }, + "counts_gene_length_scaled_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_counts_length_scaled.tsv:md5,5f92a6784f6edc5e3b336c71c3ee7daf" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.857451" + }, + "tpm_gene_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_tpm.tsv:md5,85d108269769ae0d841247b9b9ed922d" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.636454" + }, + "lengths_transcript_kallisto - stub": { + "content": [ + [ + [ + [ + + ], + "[].transcript_lengths.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.129712" + }, + "lengths_gene_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_lengths.tsv:md5,db6becdf807fd164a9c63dd1dd916d9c" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.630042" + }, + "counts_gene_scaled_kallisto - stub": { + "content": [ + [ + [ + [ + + ], + 
"[].gene_counts_scaled.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.119638" + }, + "tpm_transcript_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.transcript_tpm.tsv:md5,65862ed9d4a05abfab952e680dc0e49d" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.639525" + }, + "lengths_gene_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_lengths.tsv:md5,1691ea2677612805cd699265c83024d7" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.871162" + }, + "counts_gene_length_scaled_salmon - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_counts_length_scaled.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.605613" + }, + "counts_gene_kallisto": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_counts.tsv:md5,e89c28692ea214396b2d4cb702a804c3" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.61832" + }, + "versions_kallisto": { + "content": [ + [ + "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:34:59.642751" + }, + "counts_gene_salmon - stub": { + "content": [ + [ + [ + [ + + ], + "[].gene_counts.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:50.598457" + }, + "versions_kallisto - stub": { + "content": [ + [ + "versions.yml:md5,6ff317cceddc686f84d79cb976e1e28b" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.141689" + }, + "tpm_transcript_kallisto - stub": { + 
"content": [ + [ + [ + [ + + ], + "[].transcript_tpm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:16.137716" + }, + "counts_gene_scaled_salmon": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.gene_counts_scaled.tsv:md5,fdfb3d23aaf5d4316d81247ec4664ca0" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-28T12:35:32.862272" + } +} \ No newline at end of file diff --git a/modules/nf-core/tximeta/tximport/tests/tags.yml b/modules/nf-core/tximeta/tximport/tests/tags.yml new file mode 100644 index 000000000..fc96a89e0 --- /dev/null +++ b/modules/nf-core/tximeta/tximport/tests/tags.yml @@ -0,0 +1,2 @@ +tximeta/tximport: + - "modules/nf-core/tximeta/tximport/**" diff --git a/nextflow.config b/nextflow.config index 7f032f1ec..a13ef5768 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,16 +9,78 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options input = null + outdir = null + phenotype = null + annotation = null + mirna_expression = null + + //> circRNA + tools = 'circexplorer2' + bsj_reads = 1 + max_shift = 1 + min_samples = 1 + min_tools = 1 + exon_boundary = 0 + save_intermediates = false + quantification_tools = 'ciriquant,psirc' + bootstrap_samples = 30 + + //> miRNA + mirna_expression = null + mirna_min_reads = 5 + mirna_min_sample_percentage = 0.2 + mirna_tools = 'miranda,targetscan' + mirna_min_tools = 1 + mirna_correlation = 'pearson' // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false + bowtie = null + bowtie2 = null + bwa = null + star = null + hisat2 = null + hisat2_build_memory = '200.GB' + segemehl = null + save_reference = true + mature = null + + // Trimming + min_trimmed_reads = 10000 + clip_r1 = null + clip_r2 = null + three_prime_clip_r1 = null + 
three_prime_clip_r2 = null + trim_nextseq = null + save_trimmed = false + skip_trimming = false + + // Alignment options + + //> STAR + chimJunctionOverhangMin = 10 + alignSJDBoverhangMin = 10 + chimSegmentMin = 10 + sjdboverhang = 100 + limitSjdbInsertNsj = 1000000 + + //> MAPSPLICE + seglen = 25 + min_intron = 20 + max_intron = 1000000 + min_map_len = 40 + min_fusion_distance = 200 + + //> MISC + save_unaligned = false + seq_center = null // MultiQC options + skip_fastqc = false multiqc_config = null multiqc_title = null multiqc_logo = null @@ -141,6 +203,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.runOptions = '--no-mount tmp --writable-tmpfs' } wave { apptainer.ociAutoPull = true @@ -154,8 +217,10 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_igenomes { includeConfig 'conf/test_igenomes.config' } + full { includeConfig 'conf/full.config' } + test_full { includeConfig 'conf/test_full.config' } } // Load nf-core custom profiles from different Institutions diff --git a/nextflow_schema.json b/nextflow_schema.json index b24ff3dbf..5ce48b2b5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -27,19 +27,227 @@ "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" + "fa_icon": "fas fa-folder-open", + "default": null }, - "email": { + "phenotype": { "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "format": "file-path", + "exists": true, + "schema": "assets/schema_phenotype.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "description": "Phenotype CSV file specifying the experimental design. If provided, the pipeline will run CIRCTEST.", + "help_text": "There are two rules for providing the phenotype CSV file. 1) The 'sample' column must match the sample sheets 'sample' column. 2) The response variable containing the phenotype of primary interest in the experiment must have the column name condition. All other columns included in the file are controlled for in the `DESeq2` design. \n\n| sample \t| condition \t| replicate \t|\n|-----------\t|-----------\t|-----------\t|\n| control_1 \t| ctr \t| 1 \t|\n| control_2 \t| ctr \t| 2 \t|\n| control_3 \t| ctr \t| 3 \t|\n| treated_1 \t| trt \t| 1 \t|\n| treated_2 \t| trt \t| 2 \t|\n| treated_3 \t| trt \t| 3 \t|\n\nThe above phenotype file will identify differentially expressed circRNAs/mRNAs between control and treatment cells, whilst controlling for the effect of variation between replicates: ` ~ replicates + condition`", + "fa_icon": "fas fa-file-csv" }, - "multiqc_title": { + "annotation": { "type": "string", - "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" + "format": "file-path", + "exists": true, + "schema": "assets/schema_annotation.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$", + "description": "Path to a CSV file containing BED files that should be used for annotation.", + "help_text": "The annotation file should be a CSV file with the following columns: `name`, `file` and `min_overlap`. 
The `name` column should contain a unique identifier for the annotation, the `file` column should contain the path to the BED file and the `min_overlap` column should contain the minimum overlap required for a circRNA to be considered as overlapping with the annotation. The `min_overlap` column is optional and defaults to 0.9 if not provided.", + "fa_icon": "fas fa-file-csv" + } + } + }, + "circrna_options": { + "title": "circRNA Options", + "type": "object", + "fa_icon": "fas fa-circle-notch", + "description": "Parameters for circrna discovery.", + "required": ["tools"], + "properties": { + "tools": { + "type": "string", + "fa_icon": "fas fa-wrench", + "description": "Comma separated list of circRNA quantification tools to use. Supported tools: ciriquant, circexplorer2, find_circ, circrna_finder, mapsplice, dcc, segemehl", + "pattern": "^(ciriquant|circexplorer2|find_circ|circrna_finder|mapsplice|dcc|segemehl)(,(ciriquant|circexplorer2|find_circ|circrna_finder|mapsplice|dcc|segemehl))*$", + "help_text": "Select one or a combination of circRNA quantification tools for the pipeline e.g:\n--tool 'circexplorer2, ciriquant, find_circ'\n\nN.B: Selecting more than one circRNA quantification tool will trigger the circRNA filtering parameter --min_tools", + "default": "circexplorer2" + }, + "bsj_reads": { + "type": "integer", + "fa_icon": "fas fa-circle-notch", + "description": "Minimum number of reads spanning circRNA back-splice junction required for circRNA to be output by workflow.", + "help_text": "Filter low confidence circRNAs by removing circRNAs with read counts below a specified value. 
To disable, set the value to 1 (default).", + "default": 1, + "minimum": 1 + }, + "max_shift": { + "type": "integer", + "fa_icon": "fas fa-file-plus-minus", + "description": "If both start and end of a pair of BSJs are within max_shift bp, they are considered as the same BSJ.", + "default": 1, + "minimum": 0 + }, + "min_tools": { + "type": "integer", + "fa_icon": "fas fa-intersection", + "description": "Specify the minimum number of tools circRNAs must be called by to be output by the workflow.", + "help_text": "When multiple circRNA quantification tools have been provided to `--tool`, set a filtering method whereby circRNAs are output if they have been called by at least *n* quantification tools.\n\nSetting `--min_tools` to 1 is the same as taking the union, all circRNAs are included in the output.\n\nSetting `--min_tools` to 2 will output circRNAs that have been called by at least 2 quantification tools and so on.", + "default": 1, + "minimum": 1, + "maximum": 7 + }, + "min_samples": { + "type": "integer", + "fa_icon": "fas fa-intersection", + "description": "Minimum number of samples a circRNA must be detected in to be output by the workflow.", + "help_text": "Filter circRNAs by removing circRNAs detected in fewer samples than the specified value. To disable, set the value to 1 (default).", + "default": 1, + "minimum": 1 + }, + "exon_boundary": { + "type": "integer", + "description": "Specify the distance at which the annotation script decides if a candidate is a circRNA or EI-circRNA.", + "help_text": "During annotation, if one of the start or end position of a circular candidate imperfectly overlaps an exon boundary, the script will consider positions within 'exon_boundary' (default 0bp) to be an exonic circRNA. If they fall outside of this range, the candidate is assumed to be an exonic-intronic circRNA, and the entire underlying sequence is taken for miRNA analysis, as opposed to just the exonic sequences for canonical exonic circRNAs. 
", + "default": 0 + }, + "quantification_tools": { + "type": "string", + "fa_icon": "fas fa-wrench", + "description": "Comma separated list of circRNA quantification tools to use. Supported tools: ciriquant, psirc", + "help_text": "Select one or a combination of circRNA quantification tools for the pipeline e.g:\n--quantification_tools 'ciriquant,psirc'", + "default": "ciriquant,psirc", + "pattern": "^((ciriquant|psirc)(,(ciriquant|psirc))*)+$" + }, + "bootstrap_samples": { + "type": "integer", + "description": "Number of bootstrap samples to use during psirc quantification.", + "default": 30 + } + } + }, + "mirna_options": { + "title": "miRNA options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define paths and threasholds for miRNA analysis.", + "properties": { + "mirna_expression": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/tsv", + "pattern": "^\\S+\\.tsv$", + "description": "path to tab-separated file providing the expression counts of mirnas, which are created in pipeline 'smrnaseq'. \n\nmirna \t sample1 \t sample2 \t sample3 \t\nid1\t count_sample1 \t count_sample2 \t count_sample3 \t\nid2 \t ... \t ... \t ... \t \n", + "fa_icon": "fas fa-file-tsv" + }, + "mirna_min_sample_percentage": { + "type": "number", + "fa_icon": "fas fa-circle-notch", + "description": "Minimum percentage of samples, a miRNA has to be expressed in to pass filtering.", + "help_text": "The mirna_min_percentage parameter sets the minimum percentage of samples in which a miRNA must be expressed to pass filtering. 
The default value is 0.2, which means a miRNA must be detected in at least 20% of the samples to be included in the analysis.", + "default": 0.2, + "minimum": 0 + }, + "mirna_min_reads": { + "type": "integer", + "fa_icon": "fas fa-circle-notch", + "description": "Minimum number of reads a miRNA is required to have to pass filtering.", + "help_text": "This parameter determines the minimum number of reads that a miRNA must have to pass filtering. The default is 5, meaning a miRNA must have at least 5 reads across the samples to be considered for analysis.", + "default": 5, + "minimum": 0 + }, + "mirna_correlation": { + "type": "string", + "fa_icon": "fas fa-wrench", + "description": "Specifies the type of correlation to be used when analyzing the relationship between miRNA and transcript expression levels. Valid options are 'pearson' or 'spearman'.", + "help_text": "Select the correlation method to be applied in the correlation analysis of miRNAs.", + "default": "pearson", + "pattern": "^(pearson|spearman)$" + }, + "mirna_tools": { + "type": "string", + "fa_icon": "fas fa-wrench", + "description": "Comma separated list of miRNA binding site prediction tools to use. Supported tools: miranda, targetscan.", + "help_text": "Select one or a combination of miRNA binding site prediction tools for the pipeline e.g:\n--mirna_tools 'miranda,targetscan'", + "default": "miranda,targetscan", + "pattern": "^((miranda|targetscan)?,?)*[^,]+$" + }, + "mirna_min_tools": { + "type": "integer", + "fa_icon": "fas fa-intersection", + "description": "Specify the number of votes required for a miRNA to be further considered in downstream analysis.", + "help_text": "Controls the number of votes required for a binding site prediction to be considered valid. If a miRNA binding site was predicted by two different tools (e.g., miRanda and TargetScan), it receives two votes. 
By specifying additional tools for miRNA binding site prediction (using the 'mirna_tools' parameter), you can adjust the number of votes required for a binding site to be considered valid.", + "default": 1, + "minimum": 1, + "maximum": 4 + } + } + }, + "alignment_options": { + "title": "Alignment Options", + "type": "object", + "description": "Parameters used by aligners pertinent to circRNA detection", + "default": "", + "fa_icon": "fas fa-align-center", + "properties": { + "sjdboverhang": { + "type": "integer", + "description": "*only used at the genome generation step* tells STAR how many bases to concatenate from donor and acceptor sides of the junctions.", + "default": 100 + }, + "chimJunctionOverhangMin": { + "type": "integer", + "description": "Minimum overhang for a chimeric junction", + "default": 10 + }, + "alignSJDBoverhangMin": { + "type": "integer", + "description": "Minimum overhang for annotated junctions", + "default": 10 + }, + "limitSjdbInsertNsj": { + "type": "integer", + "description": "Maximum number of junctions to be inserted to the genome on the fly at the mapping stage, including those from annotations and those detected in the 1st step of the 2-pass run", + "default": 1000000 + }, + "chimSegmentMin": { + "type": "integer", + "description": "Minimum length of chimeric segment. Must be set to a positive value to detect circular junctions.", + "default": 10 + }, + "seglen": { + "type": "integer", + "description": "Segment length. Default 25", + "default": 25 + }, + "min_intron": { + "type": "integer", + "description": "Minimum intron length. Default 20", + "default": 20 + }, + "max_intron": { + "type": "integer", + "description": "Maximum intron length. Default 1000000", + "default": 1000000 + }, + "min_map_len": { + "type": "integer", + "description": "Minimum alignment length. 
Default 40", + "default": 40 + }, + "min_fusion_distance": { + "type": "integer", + "description": "Minimum distance between two gapped segments to be considered as fusion candidate. Must set to lower values to be sensitive to circular candidates (e.g 200).", + "default": 200 + }, + "seq_center": { + "type": "string", + "description": "Sequencing center information to be added to read group of BAM files.", + "fa_icon": "fas fa-synagogue" + }, + "save_unaligned": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Where possible, save unaligned reads from either STAR, HISAT2 or Salmon to the results directory.", + "help_text": "This may either be in the form of FastQ or BAM files depending on the options available for that particular tool.", + "default": false } } }, @@ -49,11 +257,17 @@ "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", "properties": { + "save_reference": { + "type": "boolean", + "description": "Save generated reference genome files such as indices, chromosome FASTA files.", + "default": true, + "fa_icon": "fas fa-save" + }, "genome": { "type": "string", "description": "Name of iGenomes reference.", "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "help_text": "By using a reference genome build on iGenomes, the gtf, mature, species and index files (bar HISAT2 and segemehl) will be automatically downloaded for you. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." 
}, "fasta": { "type": "string", @@ -62,8 +276,67 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nThis parameter is *mandatory* if `--genome` is not specified.", + "fa_icon": "fas fa-book" + }, + "gtf": { + "type": "string", + "fa_icon": "fas fa-address-book", + "mimetype": "text/plain", + "description": "Path to reference GTF file.", + "help_text": "This parameter is *mandatory* if `--genome` is not specified. Needs to contain the following attributes: `gene_id`, `transcript_id` and `gene_name`.", + "pattern": "\\.gtf$" + }, + "mature": { + "type": "string", + "description": "Path to FASTA file with mature miRNAs. This parameter needs to be specified to perform miRNA interaction analyses.", + "mimetype": "text/plain", + "help_text": "Typically this will be the `mature.fa` file from miRBase. Can be given either as a plain text `.fa` file or a compressed `.gz` file.", + "fa_icon": "fas fa-wheelchair", + "default": null + }, + "bowtie": { + "type": "string", + "fa_icon": "fas fa-bold", + "description": "Path to Bowtie index files, surrounded by quotes. No glob pattern required.", + "default": null + }, + "bowtie2": { + "type": "string", + "fa_icon": "fas fa-bold", + "description": "Path to Bowtie2 index files, surrounded by quotes. No glob pattern required.", + "default": null + }, + "bwa": { + "type": "string", + "fa_icon": "fas fa-bold", + "description": "Path to BWA index directory, surrounded by quotes. 
No glob pattern required.", + "default": null + }, + "hisat2": { + "type": "string", + "description": "Path to Hisat2 index directory, surrounded by quotes. No glob pattern required.", + "default": null, + "fa_icon": "fab fa-bold" + }, + "hisat2_build_memory": { + "type": "string", + "default": "200.GB", + "fa_icon": "fas fa-memory", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "description": "Minimum memory required to use splice sites and exons in the HiSAT2 index build process.", + "help_text": "HiSAT2 requires a huge amount of RAM to build a genome index for larger genomes, if including splice sites and exons e.g. the human genome might typically require 200GB. If you specify less than this threshold for the `HISAT2_BUILD` process then the splice sites and exons will be ignored, meaning that the process will require a lot less memory. If you are working with a small genome, set this parameter to a lower value to reduce the threshold for skipping this check. If using a larger genome, consider supplying more memory to the `HISAT2_BUILD` process." + }, + "segemehl": { + "type": "string", + "default": null, + "fa_icon": "fab fa-stripe-s", + "description": "Path to Segemehl Index **file**." + }, + "star": { + "type": "string", + "fa_icon": "far fa-star", + "description": "Path to STAR index directory, surrounded by quotes. No glob pattern required." 
}, "igenomes_ignore": { "type": "boolean", @@ -82,6 +355,66 @@ } } }, + "read_trimming_options": { + "title": "Read trimming options", + "type": "object", + "fa_icon": "fas fa-cut", + "description": "Options to adjust read trimming criteria.", + "properties": { + "skip_trimming": { + "type": "boolean", + "description": "Skip the adapter trimming step.", + "help_text": "Use this if your input FastQ files have already been trimmed outside of the workflow or if you're very confident that there is no adapter contamination in your data.", + "fa_icon": "fas fa-fast-forward", + "default": false + }, + "save_trimmed": { + "type": "boolean", + "description": "Save the trimmed FastQ files in the results directory.", + "help_text": "By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete.", + "fa_icon": "fas fa-save", + "default": false + }, + "skip_fastqc": { + "type": "boolean", + "description": "Skip FastQC quality control of the sequencing reads.", + "fa_icon": "fas fa-terminal", + "default": false + }, + "clip_r1": { + "type": "integer", + "description": "Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads).", + "fa_icon": "fas fa-cut" + }, + "clip_r2": { + "type": "integer", + "description": "Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only).", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r1": { + "type": "integer", + "description": "Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed.", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r2": { + "type": "integer", + "description": "Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed.", + "fa_icon": "fas fa-cut" + }, + "trim_nextseq": { + "type": "integer", + "description": "Instructs Trim Galore to 
apply the --nextseq=X option, to trim based on quality after removing poly-G tails.", + "help_text": "This enables the option Cutadapt `--nextseq-trim=3'CUTOFF` option via Trim Galore, which will set a quality cutoff (that is normally given with -q instead), but qualities of G bases are ignored. This trimming is in common for the NextSeq- and NovaSeq-platforms, where basecalls without any signal are called as high-quality G bases.", + "fa_icon": "fas fa-cut" + }, + "min_trimmed_reads": { + "type": "integer", + "default": 10000, + "fa_icon": "fas fa-hand-paper", + "description": "Minimum number of trimmed reads below which samples are removed from further processing. Some downstream steps in the pipeline will fail if this threshold is too low." + } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -143,6 +476,12 @@ "fa_icon": "fas fa-question-circle", "hidden": true }, + "save_intermediates": { + "type": "boolean", + "description": "Save intermediate files.", + "default": false, + "fa_icon": "fas fa-save" + }, "publish_dir_mode": { "type": "string", "default": "copy", @@ -152,6 +491,13 @@ "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, "email_on_fail": { "type": "string", "description": "Email address for completion summary, only when pipeline fails.", @@ -166,6 +512,11 @@ "fa_icon": "fas fa-remove-format", "hidden": true }, + "multiqc_title": { + "type": "string", + "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" + }, "max_multiqc_email_size": { "type": "string", "description": "File size limit when attaching MultiQC reports to summary emails.", @@ -229,6 +580,18 @@ { "$ref": "#/$defs/reference_genome_options" }, + { + "$ref": "#/$defs/read_trimming_options" + }, + { + "$ref": "#/$defs/alignment_options" + }, + { + "$ref": "#/$defs/circrna_options" + }, + { + "$ref": "#/$defs/mirna_options" + }, { "$ref": "#/$defs/institutional_config_options" }, diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf new file mode 100644 index 000000000..853dfee27 --- /dev/null +++ b/subworkflows/local/annotation.nf @@ -0,0 +1,45 @@ +include { BEDTOOLS_INTERSECT as INTERSECT_GTF } from '../../modules/nf-core/bedtools/intersect' +include { GAWK as INGEST_DATABASE_NAMES } from '../../modules/nf-core/gawk' +include { GNU_SORT as COMBINE_DATABASES } from '../../modules/nf-core/gnu/sort' +include { BEDTOOLS_INTERSECT as INTERSECT_DATABASE } from '../../modules/nf-core/bedtools/intersect' +include { ANNOTATION as ANNOTATE } from '../../modules/local/annotation' + +workflow ANNOTATION { + take: + regions + ch_gtf + exon_boundary + ch_annotation + + main: + ch_versions = Channel.empty() + + INTERSECT_GTF( regions.combine(ch_gtf.map{meta, gtf -> gtf}), [[], []] ) + ch_versions = ch_versions.mix(INTERSECT_GTF.out.versions) + + INGEST_DATABASE_NAMES( ch_annotation, [] ) + ch_versions = 
ch_versions.mix(INGEST_DATABASE_NAMES.out.versions) + + INTERSECT_DATABASE( regions.combine(INGEST_DATABASE_NAMES.out.output) + .map{ meta1, regions, meta2, database -> + [[id: "${meta1.id}-${meta2.id}", + tool: meta1.tool, + original_meta: meta1, + min_overlap: meta2.min_overlap], regions, database] }, + [[], []]) + ch_versions = ch_versions.mix(INTERSECT_DATABASE.out.versions) + + ANNOTATE( INTERSECT_GTF.out.intersect + .join(INTERSECT_DATABASE.out.intersect + .map{ meta, bed -> [meta.original_meta, bed] } + .groupTuple(), remainder: true) + .map{ meta, gtf_intersection, db_intersections -> [meta, gtf_intersection, db_intersections ?: []]}, + exon_boundary ) + ch_versions = ch_versions.mix(ANNOTATE.out.versions) + + emit: + bed = ANNOTATE.out.bed + gtf = ANNOTATE.out.gtf + + versions = ch_versions +} diff --git a/subworkflows/local/bsj_detection.nf b/subworkflows/local/bsj_detection.nf new file mode 100644 index 000000000..3a4065c4c --- /dev/null +++ b/subworkflows/local/bsj_detection.nf @@ -0,0 +1,218 @@ +// MODULES +include { GAWK as EXTRACT_COUNTS } from '../../modules/nf-core/gawk' +include { CSVTK_JOIN as COMBINE_COUNTS_PER_TOOL } from '../../modules/nf-core/csvtk/join' +include { GAWK as FILTER_BSJS } from '../../modules/nf-core/gawk' +include { GAWK as BED_ADD_SAMPLE_TOOL } from '../../modules/nf-core/gawk' +include { COMBINE_BEDS as COMBINE_TOOLS_PER_SAMPLE } from '../../modules/local/combine_beds' +include { COMBINE_BEDS as COMBINE_SAMPLES } from '../../modules/local/combine_beds' +include { BEDTOOLS_GETFASTA as FASTA_COMBINED } from '../../modules/nf-core/bedtools/getfasta' +include { BEDTOOLS_GETFASTA as FASTA_PER_SAMPLE } from '../../modules/nf-core/bedtools/getfasta' +include { BEDTOOLS_GETFASTA as FASTA_PER_SAMPLE_TOOL } from '../../modules/nf-core/bedtools/getfasta' +include { FAIL_ON_EMPTY } from '../../modules/local/fail_on_empty' + +// SUBWORKFLOWS +include { SEGEMEHL } from './detection_tools/segemehl' +include { STAR2PASS } from 
'./detection_tools/star2pass' +include { CIRCEXPLORER2 } from './detection_tools/circexplorer2' +include { CIRCRNA_FINDER } from './detection_tools/circrna_finder' +include { FIND_CIRC } from './detection_tools/find_circ' +include { CIRIQUANT } from './detection_tools/ciriquant' +include { DCC } from './detection_tools/dcc' +include { MAPSPLICE } from './detection_tools/mapsplice' +include { ANNOTATION as ANNOTATE_COMBINED } from './annotation' +include { ANNOTATION as ANNOTATE_PER_SAMPLE } from './annotation' +include { ANNOTATION as ANNOTATE_PER_SAMPLE_TOOL } from './annotation' + +workflow BSJ_DETECTION { + + take: + reads + ch_fasta + ch_gtf + ch_annotation + bowtie_index + bowtie2_index + bwa_index + chromosomes + hisat2_index + star_index + bsj_reads + exon_boundary + + main: + ch_versions = Channel.empty() + ch_bsj_bed_per_sample_tool = Channel.empty() + ch_multiqc_files = Channel.empty() + fasta = ch_fasta.map{meta, fasta -> fasta} + gtf = ch_gtf.map{meta, gtf -> gtf} + + // STAR 2-PASS-MODE + star_ignore_sjdbgtf = true + seq_center = params.seq_center ?: '' + seq_platform = '' + STAR2PASS( reads, star_index, ch_gtf, bsj_reads, star_ignore_sjdbgtf, seq_center, seq_platform ) + ch_versions = ch_versions.mix(STAR2PASS.out.versions) + + // + // DISCOVERY TOOLS: + // + tools_selected = params.tools.split(',').collect{it.trim().toLowerCase()} + + if (tools_selected.size() == 0) { + error 'No tools selected for circRNA discovery.' 
+ } + + if (tools_selected.contains('segemehl')) { + SEGEMEHL( reads, fasta, params.segemehl ) + ch_versions = ch_versions.mix(SEGEMEHL.out.versions) + ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(SEGEMEHL.out.bed) + } + + if (tools_selected.contains('circexplorer2')) { + CIRCEXPLORER2( gtf, fasta, STAR2PASS.out.junction ) + ch_versions = ch_versions.mix(CIRCEXPLORER2.out.versions) + ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRCEXPLORER2.out.bed) + } + + if (tools_selected.contains('circrna_finder')) { + CIRCRNA_FINDER( fasta, STAR2PASS.out.sam, STAR2PASS.out.junction, + STAR2PASS.out.tab ) + ch_versions = ch_versions.mix(CIRCRNA_FINDER.out.versions) + ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRCRNA_FINDER.out.bed) + } + + if (tools_selected.contains('find_circ')) { + FIND_CIRC( reads, bowtie2_index, ch_fasta ) + ch_versions = ch_versions.mix(FIND_CIRC.out.versions) + ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(FIND_CIRC.out.bed) + } + + if (tools_selected.contains('ciriquant')) { + CIRIQUANT( reads, ch_gtf, ch_fasta, bwa_index, hisat2_index ) + ch_versions = ch_versions.mix(CIRIQUANT.out.versions) + ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(CIRIQUANT.out.bed) + } + + if (tools_selected.contains('dcc')) { + DCC( reads, ch_fasta, ch_gtf, star_index, STAR2PASS.out.junction, + star_ignore_sjdbgtf, seq_platform, seq_center, bsj_reads ) + ch_versions = ch_versions.mix(DCC.out.versions) + ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(DCC.out.bed) + } + + if (tools_selected.contains('mapsplice')) { + MAPSPLICE( reads, gtf, fasta, bowtie_index, chromosomes, + STAR2PASS.out.junction ) + ch_versions = ch_versions.mix(MAPSPLICE.out.versions) + ch_bsj_bed_per_sample_tool = ch_bsj_bed_per_sample_tool.mix(MAPSPLICE.out.bed) + } + + // + // QUANTIFY BSJs PER TOOL + // + + EXTRACT_COUNTS( ch_bsj_bed_per_sample_tool, [] ) + ch_versions = ch_versions.mix(EXTRACT_COUNTS.out.versions) 
+ + COMBINE_COUNTS_PER_TOOL( EXTRACT_COUNTS.out.output + .map{ meta, bed -> [[id: meta.tool], bed]} + .groupTuple() ) + ch_versions = ch_versions.mix(COMBINE_COUNTS_PER_TOOL.out.versions) + + // + // APPLY bsj_reads FILTER + // + + ch_bsj_bed_per_sample_tool_filtered = FILTER_BSJS( ch_bsj_bed_per_sample_tool, [] ).output + ch_versions = ch_versions.mix(FILTER_BSJS.out.versions) + + // + // MERGE BED FILES + // + + BED_ADD_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool_filtered, [] ) + ch_versions = ch_versions.mix(BED_ADD_SAMPLE_TOOL.out.versions) + ch_bsj_bed_per_sample_tool_meta = BED_ADD_SAMPLE_TOOL.out.output + + COMBINE_TOOLS_PER_SAMPLE( + ch_bsj_bed_per_sample_tool_meta + .map{ meta, bed -> [ [id: meta.id], bed ] } + .groupTuple(), + params.max_shift, + params.min_tools, + 1 + ) + ch_versions = ch_versions.mix(COMBINE_TOOLS_PER_SAMPLE.out.versions) + ch_bsj_bed_per_sample = COMBINE_TOOLS_PER_SAMPLE.out.combined + .filter{ meta, bed -> !bed.isEmpty() } + + COMBINE_SAMPLES( + ch_bsj_bed_per_sample_tool_meta.map{ meta, bed -> [[id: "all"], bed] }.groupTuple(), + params.max_shift, + params.min_tools, + params.min_samples + ) + ch_versions = ch_versions.mix(COMBINE_SAMPLES.out.versions) + ch_bsj_bed_combined = COMBINE_SAMPLES.out.combined + .filter{ meta, bed -> !bed.isEmpty() } + .collect() + + // + // ANNOTATION + // + + ANNOTATE_COMBINED( ch_bsj_bed_combined, ch_gtf, exon_boundary, ch_annotation ) + ch_versions = ch_versions.mix(ANNOTATE_COMBINED.out.versions) + ch_bsj_bed12_combined = ANNOTATE_COMBINED.out.bed.collect() + ch_bsj_gtf_combined = ANNOTATE_COMBINED.out.gtf.collect() + + ANNOTATE_PER_SAMPLE( ch_bsj_bed_per_sample, ch_gtf, exon_boundary, ch_annotation ) + ch_versions = ch_versions.mix(ANNOTATE_PER_SAMPLE.out.versions) + ch_bsj_bed12_per_sample = ANNOTATE_PER_SAMPLE.out.bed + ch_bsj_gtf_per_sample = ANNOTATE_PER_SAMPLE.out.gtf + + ANNOTATE_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool, ch_gtf, exon_boundary, ch_annotation ) + ch_versions = 
ch_versions.mix(ANNOTATE_PER_SAMPLE_TOOL.out.versions) + ch_bsj_bed12_per_sample_tool = ANNOTATE_PER_SAMPLE_TOOL.out.bed + ch_bsj_gtf_per_sample_tool = ANNOTATE_PER_SAMPLE_TOOL.out.gtf + + // + // FASTA WORKFLOW: + // + + FASTA_COMBINED( ch_bsj_bed_combined, fasta ) + ch_versions = ch_versions.mix(FASTA_COMBINED.out.versions) + ch_bsj_fasta_combined = FASTA_COMBINED.out.fasta + + FASTA_PER_SAMPLE( ch_bsj_bed_per_sample, fasta ) + ch_versions = ch_versions.mix(FASTA_PER_SAMPLE.out.versions) + ch_bsj_fasta_per_sample = FASTA_PER_SAMPLE.out.fasta + + FASTA_PER_SAMPLE_TOOL( ch_bsj_bed_per_sample_tool, fasta ) + ch_versions = ch_versions.mix(FASTA_PER_SAMPLE_TOOL.out.versions) + ch_bsj_fasta_per_sample_tool = FASTA_PER_SAMPLE_TOOL.out.fasta + + // STOP PIPELINE IF NO CIRCULAR RNAs WERE FOUND + FAIL_ON_EMPTY( + ch_bsj_bed_combined.ifEmpty([[id: "empty"], []]), + // Make sure to wait for per-sample results + Channel.empty() + .mix(ch_bsj_bed12_combined) + .mix(ch_bsj_bed12_per_sample) + .mix(ch_bsj_bed12_per_sample_tool) + .mix(ch_bsj_fasta_combined) + .mix(ch_bsj_fasta_per_sample) + .mix(ch_bsj_fasta_per_sample_tool) + .map{ meta, f -> f } + .collect() + ) + + emit: + bed = ch_bsj_bed_combined + bed12 = ch_bsj_bed12_combined + gtf = ch_bsj_gtf_combined + fasta = ch_bsj_fasta_combined + + multiqc_files = ch_multiqc_files + versions = ch_versions +} diff --git a/subworkflows/local/combine_transcriptomes.nf b/subworkflows/local/combine_transcriptomes.nf new file mode 100644 index 000000000..3faa855fc --- /dev/null +++ b/subworkflows/local/combine_transcriptomes.nf @@ -0,0 +1,39 @@ +include { GNU_SORT as COMBINE_TRANSCRIPTOME_GTFS } from '../../modules/nf-core/gnu/sort' +include { GAWK as EXCLUDE_OVERLONG_TRANSCRIPTS } from '../../modules/nf-core/gawk' +include { GFFREAD as TRANSCRIPTOME } from '../../modules/nf-core/gffread' + +workflow COMBINE_TRANSCRIPTOMES { + take: + ch_genome_fasta + ch_genome_gtf + ch_circ_gtf + + main: + ch_versions = Channel.empty() + + 
COMBINE_TRANSCRIPTOME_GTFS( + ch_genome_gtf.mix(ch_circ_gtf).map{meta, gtf -> gtf}.collect().map{[[id: "transcriptome"], it]}, + ) + ch_versions = ch_versions.mix(COMBINE_TRANSCRIPTOME_GTFS.out.versions) + + EXCLUDE_OVERLONG_TRANSCRIPTS( + COMBINE_TRANSCRIPTOME_GTFS.out.sorted, [] + ) + ch_versions = ch_versions.mix(EXCLUDE_OVERLONG_TRANSCRIPTS.out.versions) + + TRANSCRIPTOME( + EXCLUDE_OVERLONG_TRANSCRIPTS.out.output, + ch_genome_fasta.map{meta, fasta -> fasta} + ) + ch_versions = ch_versions.mix(TRANSCRIPTOME.out.versions) + + TRANSCRIPTOME.out.gffread_fasta.ifEmpty { + error 'No transcriptome fasta file produced.' + } + + emit: + fasta = TRANSCRIPTOME.out.gffread_fasta + gtf = EXCLUDE_OVERLONG_TRANSCRIPTS.out.output + + versions = ch_versions +} diff --git a/subworkflows/local/detection_tools/circexplorer2.nf b/subworkflows/local/detection_tools/circexplorer2.nf new file mode 100644 index 000000000..b3908e91d --- /dev/null +++ b/subworkflows/local/detection_tools/circexplorer2.nf @@ -0,0 +1,30 @@ +include { CIRCEXPLORER2_REFERENCE as REFERENCE } from '../../../modules/local/circexplorer2/reference' +include { CIRCEXPLORER2_PARSE as PARSE } from '../../../modules/nf-core/circexplorer2/parse' +include { CIRCEXPLORER2_ANNOTATE as ANNOTATE } from '../../../modules/nf-core/circexplorer2/annotate' +include { GAWK as UNIFY } from '../../../modules/nf-core/gawk' + +workflow CIRCEXPLORER2 { + take: + gtf + fasta + star_junctions + + main: + ch_versions = Channel.empty() + + REFERENCE( gtf ) + PARSE( star_junctions ) + ANNOTATE( PARSE.out.junction, fasta, REFERENCE.out.txt ) + UNIFY( ANNOTATE.out.txt + .map{ meta, txt -> [ meta + [tool: "circexplorer2"], txt ] }, [] ) + + ch_versions = ch_versions.mix(REFERENCE.out.versions) + ch_versions = ch_versions.mix(PARSE.out.versions) + ch_versions = ch_versions.mix(ANNOTATE.out.versions) + ch_versions = ch_versions.mix(UNIFY.out.versions) + + emit: + bed = UNIFY.out.output + + versions = ch_versions +} diff --git 
a/subworkflows/local/detection_tools/circrna_finder.nf b/subworkflows/local/detection_tools/circrna_finder.nf new file mode 100644 index 000000000..a45459394 --- /dev/null +++ b/subworkflows/local/detection_tools/circrna_finder.nf @@ -0,0 +1,28 @@ +include { CIRCRNA_FINDER as MAIN } from '../../../modules/local/circrna_finder' +include { GAWK as UNIFY } from '../../../modules/nf-core/gawk' + +workflow CIRCRNA_FINDER { + take: + fasta + star_sam + star_junctions + star_tab + + main: + ch_versions = Channel.empty() + + ch_joined = star_sam.join(star_junctions).join(star_tab) + .map{ meta, sam, junction, tab -> + [ meta + [tool: "circrna_finder"], [sam, junction, tab] ] } + + MAIN( ch_joined ) + UNIFY( MAIN.out.results, [] ) + + ch_versions = ch_versions.mix(MAIN.out.versions) + ch_versions = ch_versions.mix(UNIFY.out.versions) + + emit: + bed = UNIFY.out.output + + versions = ch_versions +} diff --git a/subworkflows/local/detection_tools/ciriquant.nf b/subworkflows/local/detection_tools/ciriquant.nf new file mode 100644 index 000000000..32f4246fc --- /dev/null +++ b/subworkflows/local/detection_tools/ciriquant.nf @@ -0,0 +1,26 @@ +include { CIRIQUANT as MAIN } from '../../../modules/local/ciriquant/ciriquant' +include { GAWK as UNIFY } from '../../../modules/nf-core/gawk' + +workflow CIRIQUANT { + take: + reads + ch_gtf + ch_fasta + bwa_index + hisat2_index + + main: + ch_versions = Channel.empty() + + MAIN( reads, [[], []], ch_gtf, ch_fasta, bwa_index, hisat2_index ) + UNIFY( MAIN.out.gtf.map{ meta, gtf -> + [ meta + [tool: "ciriquant"], gtf ] }, [] ) + + ch_versions = ch_versions.mix(MAIN.out.versions) + ch_versions = ch_versions.mix(UNIFY.out.versions) + + emit: + bed = UNIFY.out.output + + versions = ch_versions +} diff --git a/subworkflows/local/detection_tools/dcc.nf b/subworkflows/local/detection_tools/dcc.nf new file mode 100644 index 000000000..4bb1103d6 --- /dev/null +++ b/subworkflows/local/detection_tools/dcc.nf @@ -0,0 +1,67 @@ +include { STAR_ALIGN as 
MATE1_1ST_PASS } from '../../../modules/nf-core/star/align' +include { STAR_ALIGN as MATE1_2ND_PASS } from '../../../modules/nf-core/star/align' +include { SJDB as MATE1_SJDB } from '../../../modules/local/star/sjdb' +include { STAR_ALIGN as MATE2_1ST_PASS } from '../../../modules/nf-core/star/align' +include { STAR_ALIGN as MATE2_2ND_PASS } from '../../../modules/nf-core/star/align' +include { SJDB as MATE2_SJDB } from '../../../modules/local/star/sjdb' +include { DCC as MAIN } from '../../../modules/local/dcc' +include { GAWK as UNIFY } from '../../../modules/nf-core/gawk' + +workflow DCC { + take: + reads + ch_fasta + ch_gtf + star_index + star_junction + ignore_sjdbgtf + seq_platform + seq_center + bsj_reads + + main: + ch_versions = Channel.empty() + + mate1 = reads.filter{ meta, reads -> !meta.single_end } + .map{ meta, reads -> return [ [id: meta.id, single_end: true], reads[0] ] } + MATE1_1ST_PASS( mate1, star_index, ch_gtf, ignore_sjdbgtf, seq_platform, seq_center ) + MATE1_SJDB( MATE1_1ST_PASS.out.tab + .map{ meta, tab -> return tab }.collect().map{[[id: "mate1_sjdb"], it]}, bsj_reads ) + MATE1_2ND_PASS( mate1, star_index, MATE1_SJDB.out.sjtab, ignore_sjdbgtf, seq_platform, seq_center ) + + mate2 = reads.filter{ meta, reads -> !meta.single_end } + .map{ meta, reads -> return [ [id: meta.id, single_end: true], reads[1] ] } + MATE2_1ST_PASS( mate2, star_index, ch_gtf, ignore_sjdbgtf, seq_platform, seq_center ) + MATE2_SJDB( MATE2_1ST_PASS.out.tab + .map{ meta, tab -> return tab }.collect().map{[[id: "mate2_sjdb"], it]}, bsj_reads ) + MATE2_2ND_PASS( mate2, star_index, MATE2_SJDB.out.sjtab, ignore_sjdbgtf, seq_platform, seq_center ) + + dcc_stage = star_junction.map{ meta, junction -> return [ meta.id, meta, junction]} + .join( + MATE1_2ND_PASS.out.junction.map{ meta, junction -> return [ meta.id, junction] }, + remainder: true + ) + .join( + MATE2_2ND_PASS.out.junction.map{ meta, junction -> return [ meta.id, junction] }, + remainder: true + ) + .map{ id, 
meta, junction, mate1, mate2 -> return [ meta, junction, mate1, mate2 ]} + + dcc = dcc_stage.map{ it -> [ it[0], it[1], it[2] ?: [], it[3] ?: [] ] } + MAIN( dcc, ch_fasta.map{ meta, fasta -> fasta }, ch_gtf.map{ meta, gtf -> gtf } ) + UNIFY( MAIN.out.txt.map{ meta, txt -> [ meta + [tool: "dcc"], txt ] }, [] ) + + ch_versions = ch_versions.mix(MATE1_1ST_PASS.out.versions) + ch_versions = ch_versions.mix(MATE1_SJDB.out.versions) + ch_versions = ch_versions.mix(MATE1_2ND_PASS.out.versions) + ch_versions = ch_versions.mix(MATE2_1ST_PASS.out.versions) + ch_versions = ch_versions.mix(MATE2_SJDB.out.versions) + ch_versions = ch_versions.mix(MATE2_2ND_PASS.out.versions) + ch_versions = ch_versions.mix(MAIN.out.versions) + ch_versions = ch_versions.mix(UNIFY.out.versions) + + emit: + bed = UNIFY.out.output + + versions = ch_versions +} diff --git a/subworkflows/local/detection_tools/find_circ.nf b/subworkflows/local/detection_tools/find_circ.nf new file mode 100644 index 000000000..ce0cc557b --- /dev/null +++ b/subworkflows/local/detection_tools/find_circ.nf @@ -0,0 +1,36 @@ +include { BOWTIE2_ALIGN as ALIGN } from '../../../modules/nf-core/bowtie2/align' +include { SAMTOOLS_VIEW } from '../../../modules/nf-core/samtools/view' +include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index' +include { FIND_CIRC_ANCHORS as ANCHORS } from '../../../modules/local/find_circ/anchors' +include { FIND_CIRC as MAIN } from '../../../modules/local/find_circ/find_circ' +include { GAWK as UNIFY } from '../../../modules/nf-core/gawk' + +workflow FIND_CIRC { + take: + reads + bowtie2_index + ch_fasta + + main: + ch_versions = Channel.empty() + + ALIGN( reads, bowtie2_index, ch_fasta, false, true ) + SAMTOOLS_INDEX( ALIGN.out.bam ) + SAMTOOLS_VIEW( ALIGN.out.bam.join( SAMTOOLS_INDEX.out.bai ), ch_fasta, [] ) + ANCHORS( SAMTOOLS_VIEW.out.bam ) + MAIN( ANCHORS.out.anchors, bowtie2_index, ch_fasta.map{ meta, fasta -> fasta } ) + UNIFY( MAIN.out.bed.map{ meta, bed -> + [ meta + 
[tool: "find_circ"], bed ] }, [] ) + + ch_versions = ch_versions.mix(ALIGN.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW.out.versions) + ch_versions = ch_versions.mix(ANCHORS.out.versions) + ch_versions = ch_versions.mix(MAIN.out.versions) + ch_versions = ch_versions.mix(UNIFY.out.versions) + + emit: + bed = UNIFY.out.output + + versions = ch_versions +} diff --git a/subworkflows/local/detection_tools/mapsplice.nf b/subworkflows/local/detection_tools/mapsplice.nf new file mode 100644 index 000000000..f81124265 --- /dev/null +++ b/subworkflows/local/detection_tools/mapsplice.nf @@ -0,0 +1,36 @@ +include { CIRCEXPLORER2_REFERENCE as REFERENCE } from '../../../modules/local/circexplorer2/reference' +include { MAPSPLICE_ALIGN as ALIGN } from '../../../modules/local/mapsplice/align' +include { CIRCEXPLORER2_PARSE as PARSE } from '../../../modules/nf-core/circexplorer2/parse' +include { CIRCEXPLORER2_ANNOTATE as ANNOTATE } from '../../../modules/nf-core/circexplorer2/annotate' +include { GAWK as UNIFY } from '../../../modules/nf-core/gawk' + +workflow MAPSPLICE { + take: + reads + gtf + fasta + bowtie_index + chromosomes + star_junctions + + main: + ch_versions = Channel.empty() + + REFERENCE( gtf ) + ALIGN( reads, bowtie_index, chromosomes, gtf ) + PARSE( ALIGN.out.raw_fusions ) + ANNOTATE( PARSE.out.junction, fasta, REFERENCE.out.txt ) + UNIFY( ANNOTATE.out.txt.map{ meta, txt -> + [ meta + [tool: "mapsplice"], txt ] }, [] ) + + ch_versions = ch_versions.mix(REFERENCE.out.versions) + ch_versions = ch_versions.mix(ALIGN.out.versions) + ch_versions = ch_versions.mix(PARSE.out.versions) + ch_versions = ch_versions.mix(ANNOTATE.out.versions) + ch_versions = ch_versions.mix(UNIFY.out.versions) + + emit: + bed = UNIFY.out.output + + versions = ch_versions +} diff --git a/subworkflows/local/detection_tools/segemehl.nf b/subworkflows/local/detection_tools/segemehl.nf new file mode 100644 index 
000000000..9e8e2c0d9 --- /dev/null +++ b/subworkflows/local/detection_tools/segemehl.nf @@ -0,0 +1,37 @@ +include { SEGEMEHL_INDEX as INDEX } from '../../../modules/nf-core/segemehl/index' +include { SEGEMEHL_ALIGN as ALIGN } from '../../../modules/nf-core/segemehl/align' +include { GAWK as EXTRACT } from '../../../modules/nf-core/gawk' +include { GNU_SORT as SORT } from '../../../modules/nf-core/gnu/sort' +include { BEDTOOLS_GROUPBY as GROUP } from '../../../modules/nf-core/bedtools/groupby' +include { GAWK as UNIFY } from '../../../modules/nf-core/gawk' + +workflow SEGEMEHL { + take: + reads + fasta + index + + main: + ch_versions = Channel.empty() + + index = index ?: INDEX( fasta ).index + + ALIGN( reads, fasta, index ) + EXTRACT( ALIGN.out.single_bed + .map{ meta, bed -> [ meta + [tool: "segemehl"], bed ] }, [] ) + + SORT( EXTRACT.out.output ) + GROUP( SORT.out.sorted, 5 ) + UNIFY( GROUP.out.bed, [] ) + + ch_versions = ch_versions.mix(ALIGN.out.versions) + ch_versions = ch_versions.mix(EXTRACT.out.versions) + ch_versions = ch_versions.mix(SORT.out.versions) + ch_versions = ch_versions.mix(GROUP.out.versions) + ch_versions = ch_versions.mix(UNIFY.out.versions) + + emit: + bed = UNIFY.out.output + + versions = ch_versions +} diff --git a/subworkflows/local/detection_tools/star2pass.nf b/subworkflows/local/detection_tools/star2pass.nf new file mode 100644 index 000000000..4e216545b --- /dev/null +++ b/subworkflows/local/detection_tools/star2pass.nf @@ -0,0 +1,34 @@ +include { STAR_ALIGN as PASS_1 } from '../../../modules/nf-core/star/align' +include { STAR_ALIGN as PASS_2 } from '../../../modules/nf-core/star/align' +include { SJDB } from '../../../modules/local/star/sjdb' + + +workflow STAR2PASS { + take: + reads + star_index + ch_gtf + bsj_reads + ignore_sjdbgtf + seq_center + seq_platform + + main: + ch_versions = Channel.empty() + + PASS_1( reads, star_index, ch_gtf, ignore_sjdbgtf, seq_platform, seq_center) + sjdb = PASS_1.out.tab.map{ meta, tab -> return 
tab }.collect().map{[[id: "star_sjdb"], it]} + SJDB( sjdb, bsj_reads ) + PASS_2( reads, star_index, SJDB.out.sjtab, ignore_sjdbgtf, seq_platform, seq_center ) + + ch_versions = ch_versions.mix(PASS_1.out.versions) + ch_versions = ch_versions.mix(SJDB.out.versions) + ch_versions = ch_versions.mix(PASS_2.out.versions) + + emit: + junction = PASS_2.out.junction + sam = PASS_2.out.sam + tab = PASS_2.out.tab + + versions = ch_versions +} diff --git a/subworkflows/local/mirna/mirna_bindingsites.nf b/subworkflows/local/mirna/mirna_bindingsites.nf new file mode 100644 index 000000000..f6633ee5f --- /dev/null +++ b/subworkflows/local/mirna/mirna_bindingsites.nf @@ -0,0 +1,111 @@ +include { BIOAWK as ADD_BACKSPLICE } from '../../../modules/nf-core/bioawk' +include { MIRANDA } from '../../../modules/nf-core/miranda' +include { GAWK as UNIFY_MIRANDA } from '../../../modules/nf-core/gawk' +include { TARGETSCAN } from '../../../modules/local/targetscan/predict' +include { GAWK as UNIFY_TARGETSCAN } from '../../../modules/nf-core/gawk' +include { MIRNA_TARGETS } from '../../../modules/local/mirna_targets' +include { CAT_CAT as COMBINE_BINDINGSITES } from '../../../modules/nf-core/cat/cat' +include { MAJORITY_VOTE } from '../../../modules/local/majority_vote' + +workflow MIRNA_BINDINGSITES { + take: + transcriptome_fasta + circrna_bed12 + mirna_fasta + + main: + ch_versions = Channel.empty() + ch_predictions = Channel.empty() + + // miRNAs can potentially bind to circRNAs right at the backsplice site + // In this case, the miRNA binding sequence would partially overlap with start and end of the circRNA + // To account for this, the first 25bp of the circRNA are added to the end of the circRNA sequence + ADD_BACKSPLICE( transcriptome_fasta ) + ch_versions = ch_versions.mix(ADD_BACKSPLICE.out.versions) + + ch_transcriptome_batches = ADD_BACKSPLICE.out.output + .splitFasta(by: 100, file: true) + .map{ meta, file -> [[id: "batch_" + file.baseName.split("\\.").last()], file]} + + // + 
// MIRNA PREDICTION TOOLS: + // + tools_selected = params.mirna_tools.split(',').collect{it.trim().toLowerCase()} + + if (tools_selected.size() == 0) { + error 'No tools selected for miRNA discovery.' + } + + if (tools_selected.contains('targetscan')) { + // + // TARGETSCAN WORKFLOW: + // + TARGETSCAN( ch_transcriptome_batches, formatMiRNAForTargetScan( mirna_fasta ).collect() ) + UNIFY_TARGETSCAN( TARGETSCAN.out.txt, [] ) + + ch_versions = ch_versions.mix(TARGETSCAN.out.versions) + ch_versions = ch_versions.mix(UNIFY_TARGETSCAN.out.versions) + ch_predictions = ch_predictions.mix(UNIFY_TARGETSCAN.out.output) + } + + if (tools_selected.contains('miranda')) { + // + // MIRANDA WORKFLOW: + // + MIRANDA( ch_transcriptome_batches, mirna_fasta.map{meta, mature -> mature}.collect() ) + UNIFY_MIRANDA( MIRANDA.out.txt, [] ) + + ch_versions = ch_versions.mix(MIRANDA.out.versions) + ch_versions = ch_versions.mix(UNIFY_MIRANDA.out.versions) + ch_predictions = ch_predictions.mix(UNIFY_MIRANDA.out.output) + } + + // + // CONSOLIDATE PREDICTIONS WORKFLOW: + // + // TODO: This is an artifact and should be removed if we have a replacement + + // consolidate_targets = TARGETSCAN.out.txt.join(MIRANDA.out.txt).join(circrna_bed12) + consolidate_targets = TARGETSCAN.out.txt.join(MIRANDA.out.txt) + + MIRNA_TARGETS( consolidate_targets ) + + ch_versions = ch_versions.mix(MIRNA_TARGETS.out.versions) + + // + // MAJORITY VOTING: + // + MAJORITY_VOTE( ch_predictions.map{meta, file -> file}.collect().map{[[id: "mirna"], it]} ) + ch_versions = ch_versions.mix(MAJORITY_VOTE.out.versions) + + emit: + binding_sites = MAJORITY_VOTE.out.targets + + versions = ch_versions +} + +/* +======================================================================================== + FUNCTIONS +======================================================================================== +*/ +// Formatting miRNA input for targetscan +// takes mature.fa, iterates over entries (id, seq) and generates a new file +// 
writing: +// 1. miR ID +// 2. miR (7bp) seed sequence from mature seq +// 3. Species ID (set to 0000, not important for output). +// to new file +def formatMiRNAForTargetScan(ch_mature) { + + def ch_targetscan_meta_formatted = ch_mature + .map { meta, mature -> mature } + .splitFasta(record: [id: true, seqString: true]) + .map { record -> + return "${record.id}\t${record.seqString[1..7]}\t0000\n" + } + .collectFile(name: 'mature.txt') + + ch_targetscan_meta_formatted = ch_targetscan_meta_formatted.map { [[id: "mature_targetscan"], it] } + return ch_targetscan_meta_formatted +} diff --git a/subworkflows/local/mirna_prediction.nf b/subworkflows/local/mirna_prediction.nf new file mode 100644 index 000000000..b3fc46cc0 --- /dev/null +++ b/subworkflows/local/mirna_prediction.nf @@ -0,0 +1,85 @@ +// MODULES +include { BIOAWK as ADD_BACKSPLICE } from '../../modules/nf-core/bioawk' +include { DESEQ2_NORMALIZATION } from '../../modules/local/deseq2/normalization' +include { MIRNA_FILTERING } from '../../modules/local/mirna_filtering' +include { COMPUTE_CORRELATIONS } from '../../modules/local/compute_correlations' + +// SUBWORKFLOWS +include { MIRNA_BINDINGSITES } from './mirna/mirna_bindingsites' + +workflow MIRNA_PREDICTION { + + take: + transcriptome_fasta + circrna_annotation + ch_mature + ch_mirna + transcript_counts + quantification_rds + + main: + ch_versions = Channel.empty() + + // + // MIRNA NORMALIZATION WORKFLOW: + // + + if (params.mirna_expression) { + + ch_mirna_normalized = DESEQ2_NORMALIZATION( ch_mirna ).normalized + + ch_versions = ch_versions.mix(DESEQ2_NORMALIZATION.out.versions) + + ch_mirna_filtered = MIRNA_FILTERING(ch_mirna_normalized, + params.mirna_min_sample_percentage, + params.mirna_min_reads + ).filtered + + ch_versions = ch_versions.mix(MIRNA_FILTERING.out.versions) + + // + // MIRNA BINDING SITES: + // + + // Filtering miRNAs from ch_mature if they are not in ch_mirna_filtered. 
+ ch_uniq_mirnas = ch_mirna_filtered.map{ meta, path -> path }.splitCsv( sep: '\t' ).map{ it[0] }.unique().collect() + + ch_mature = ch_mature + .map{ meta, path -> + path + } + .splitFasta( record: [id:true, seqString:true] ) + .combine(ch_uniq_mirnas.map{ it -> [it]}) // Not sure why this mapping is necessary but I think it is + .filter{ record, mirnas -> + ch_uniq_mirnas.contains(record.id).value + }.map{ record, mirnas -> + ">${record.id}\n${record.seqString}" + } + .collectFile( name: 'mature_filtered.fa', newLine: true) + .map{ it -> [[id: 'mature_filtered'], it]} + } + + MIRNA_BINDINGSITES( transcriptome_fasta, circrna_annotation, ch_mature ) + ch_versions = ch_versions.mix(MIRNA_BINDINGSITES.out.versions) + + if (params.mirna_expression) { + // + // COMPUTE CORRELATION: + // + ch_binding_site_batches = MIRNA_BINDINGSITES.out.binding_sites + .splitText(by: 100, file: true) + .map{ meta, file -> [[id: "batch_" + file.baseName.split("\\.").last()], file]} + + COMPUTE_CORRELATIONS(ch_binding_site_batches, ch_mirna_filtered, quantification_rds) + + ch_correlation_results = COMPUTE_CORRELATIONS.out.correlations + .map{meta, results -> results} + .flatten().collect() + .map{results -> [[id: 'correlation'], results]} + + ch_versions = ch_versions.mix(COMPUTE_CORRELATIONS.out.versions) + } + + emit: + versions = ch_versions +} diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf new file mode 100644 index 000000000..cb87f6b47 --- /dev/null +++ b/subworkflows/local/prepare_genome.nf @@ -0,0 +1,72 @@ +include { CUSTOM_GTFFILTER as GTFFILTER } from '../../modules/nf-core/custom/gtffilter' +include { SEQKIT_SPLIT } from '../../modules/local/seqkit/split' +include { BOWTIE_BUILD } from '../../modules/nf-core/bowtie/build' +include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build' +include { BWA_INDEX } from '../../modules/nf-core/bwa/index' +include { HISAT2_EXTRACTSPLICESITES } from 
'../../modules/nf-core/hisat2/extractsplicesites' +include { HISAT2_BUILD } from '../../modules/nf-core/hisat2/build' +include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate' +include { GAWK as CLEAN_FASTA } from '../../modules/nf-core/gawk' +include { SAMTOOLS_FAIDX } from '../../modules/nf-core/samtools/faidx' + +workflow PREPARE_GENOME { + + take: + ch_fasta + ch_gtf + + main: + ch_versions = Channel.empty() + + // MapSplice cannot deal with extra field in the fasta headers + // this removes all additional fields in the headers of the input fasta file + if( params.tools.split(',').contains('mapsplice') ) { + CLEAN_FASTA(ch_fasta, []) + ch_fasta = CLEAN_FASTA.out.output + + ch_versions = ch_versions.mix(CLEAN_FASTA.out.versions) + } + + GTFFILTER(ch_gtf, ch_fasta) + ch_gtf = GTFFILTER.out.gtf + + SEQKIT_SPLIT(ch_fasta) + + BOWTIE_BUILD(ch_fasta.map{ meta, fasta -> fasta }) + + BOWTIE2_BUILD(ch_fasta) + + BWA_INDEX (ch_fasta) + + HISAT2_EXTRACTSPLICESITES(ch_gtf) + + HISAT2_BUILD(ch_fasta, ch_gtf, HISAT2_EXTRACTSPLICESITES.out.txt) + + STAR_GENOMEGENERATE(ch_fasta, ch_gtf) + + SAMTOOLS_FAIDX(ch_fasta, [[], []]) + + // Collect versions + ch_versions = ch_versions.mix(GTFFILTER.out.versions, + SEQKIT_SPLIT.out.versions, + BOWTIE_BUILD.out.versions, + BOWTIE2_BUILD.out.versions, + BWA_INDEX.out.versions, + HISAT2_EXTRACTSPLICESITES.out.versions, + HISAT2_BUILD.out.versions, + STAR_GENOMEGENERATE.out.versions, + SAMTOOLS_FAIDX.out.versions) + + emit: + gtf = ch_gtf + faidx = SAMTOOLS_FAIDX.out.fai + bowtie = params.bowtie ?: BOWTIE_BUILD.out.index + bowtie2 = params.bowtie2 ? Channel.value([[id: "bowtie2"], file(params.bowtie2, checkIfExists: true)]) : BOWTIE2_BUILD.out.index.collect() + bwa = params.bwa ? Channel.value([[id: "bwa"], file(params.bwa, checkIfExists: true)]) : BWA_INDEX.out.index.collect() + hisat2 = params.hisat2 ? 
Channel.value([[id: "hisat2"], file(params.hisat2, checkIfExists: true)]) : HISAT2_BUILD.out.index.collect() + star = params.star ? Channel.value([[id: "star"], file(params.star, checkIfExists: true)]) : STAR_GENOMEGENERATE.out.index.collect() + chromosomes = SEQKIT_SPLIT.out.split + splice_sites = HISAT2_EXTRACTSPLICESITES.out.txt.collect() + + versions = ch_versions +} diff --git a/subworkflows/local/quantification.nf b/subworkflows/local/quantification.nf new file mode 100644 index 000000000..0d2496c52 --- /dev/null +++ b/subworkflows/local/quantification.nf @@ -0,0 +1,75 @@ +include { PSIRC_QUANT } from './quantification_tools/psirc_quant' +include { CIRIQUANT } from './quantification_tools/ciriquant' + +workflow QUANTIFICATION { + take: + reads + ch_gtf + ch_fasta + ch_transcriptome_fasta + ch_transcriptome_gtf + circ_annotation_bed + circ_annotation_gtf + bootstrap_samples + ch_phenotype + ch_faidx + bwa_index + hisat2_index + + main: + ch_versions = Channel.empty() + ch_gene_counts = Channel.empty() + ch_circ_counts = Channel.empty() + ch_ciriquant = Channel.empty() + ch_stringtie = Channel.empty() + ch_rds = Channel.empty() + + tools_selected = params.quantification_tools.split(',').collect{it.trim().toLowerCase()} + if (tools_selected.size() == 0) { + error 'No tools selected for circRNA quantification.' 
+ } + + if (tools_selected.contains('psirc')) { + PSIRC_QUANT( + reads, + ch_transcriptome_fasta, + ch_transcriptome_gtf, + circ_annotation_bed, + circ_annotation_gtf, + bootstrap_samples, + ch_phenotype, + ch_faidx + ) + ch_gene_counts = ch_gene_counts + .mix(PSIRC_QUANT.out.gene_counts.map{meta, counts -> [meta + [quantification: 'psirc'], counts]}) + ch_circ_counts = ch_circ_counts + .mix(PSIRC_QUANT.out.circular_tx_counts.map{meta, counts -> [meta + [quantification: 'psirc'], counts]}) + ch_versions = ch_versions.mix(PSIRC_QUANT.out.versions) + ch_rds = ch_rds.mix(PSIRC_QUANT.out.rds) + } + + if (tools_selected.contains('ciriquant')) { + CIRIQUANT( + reads, + circ_annotation_bed, + ch_gtf, + ch_fasta, + bwa_index, + hisat2_index + ) + ch_versions = ch_versions.mix(CIRIQUANT.out.versions) + ch_gene_counts = ch_gene_counts.mix(CIRIQUANT.out.gene_tpm) + ch_circ_counts = ch_circ_counts.mix(CIRIQUANT.out.circ_cpm) + ch_ciriquant = ch_ciriquant.mix(CIRIQUANT.out.raw) + ch_stringtie = ch_stringtie.mix(CIRIQUANT.out.stringtie) + } + + emit: + gene = ch_gene_counts + circ = ch_circ_counts + ciriquant = ch_ciriquant + stringtie = ch_stringtie + rds = ch_rds + + versions = ch_versions +} diff --git a/subworkflows/local/quantification_tools/ciriquant.nf b/subworkflows/local/quantification_tools/ciriquant.nf new file mode 100644 index 000000000..0484a587b --- /dev/null +++ b/subworkflows/local/quantification_tools/ciriquant.nf @@ -0,0 +1,45 @@ +include { CIRIQUANT as MAIN } from '../../../modules/local/ciriquant/ciriquant' +include { PYGTFTK_TABULATE as EXTRACT_CIRC } from '../../../modules/local/pygtftk/tabulate' +include { GAWK as EXTRACT_GENES } from '../../../modules/nf-core/gawk' +include { JOIN_SAMPLES as JOIN_GENE } from '../../../modules/local/matrix/join_samples' +include { JOIN_SAMPLES as JOIN_CIRC } from '../../../modules/local/matrix/join_samples' + +workflow CIRIQUANT { + take: + reads + ch_bed + ch_gtf + ch_fasta + bwa_index + hisat2_index + + main: + 
ch_versions = Channel.empty() + + MAIN( reads, ch_bed, ch_gtf, ch_fasta, bwa_index, hisat2_index ) + ch_versions = ch_versions.mix(MAIN.out.versions) + + EXTRACT_CIRC( MAIN.out.gtf ) + ch_versions = ch_versions.mix(EXTRACT_CIRC.out.versions) + + EXTRACT_GENES( MAIN.out.gene_list, [] ) + ch_versions = ch_versions.mix(EXTRACT_GENES.out.versions) + + JOIN_GENE( + EXTRACT_GENES.out.output.map{meta, table -> [[id: 'gene'], meta.id, table]}.groupTuple() + ) + ch_versions = ch_versions.mix(JOIN_GENE.out.versions) + + JOIN_CIRC( + EXTRACT_CIRC.out.table.map{meta, table -> [[id: 'circ'], meta.id, table]}.groupTuple() + ) + ch_versions = ch_versions.mix(JOIN_CIRC.out.versions) + + emit: + gene_tpm = JOIN_GENE.out.joined + circ_cpm = JOIN_CIRC.out.joined + raw = MAIN.out.gtf + stringtie = MAIN.out.gene_gtf + + versions = ch_versions +} diff --git a/subworkflows/local/quantification_tools/psirc_quant.nf b/subworkflows/local/quantification_tools/psirc_quant.nf new file mode 100644 index 000000000..bba2d38ad --- /dev/null +++ b/subworkflows/local/quantification_tools/psirc_quant.nf @@ -0,0 +1,117 @@ +include { GAWK as MARK_CIRCULAR } from '../../../modules/nf-core/gawk' +include { PSIRC_INDEX } from '../../../modules/local/psirc/index' +include { PSIRC_QUANT as RUN_PSIRC_QUANT } from '../../../modules/local/psirc/quant' +include { CUSTOM_TX2GENE } from '../../../modules/nf-core/custom/tx2gene' +include { TXIMETA_TXIMPORT } from '../../../modules/nf-core/tximeta/tximport' +include { TXIMETA_TXIMETA } from '../../../modules/local/tximeta/tximeta' +include { MERGE_EXPERIMENTS } from '../../../modules/local/quantification/merge_experiments' +include { CSVTK_JOIN as JOIN_GENE_COUNTS } from '../../../modules/nf-core/csvtk/join' +include { CSVTK_JOIN as JOIN_GENE_TPM } from '../../../modules/nf-core/csvtk/join' +include { CSVTK_JOIN as JOIN_TX_COUNTS } from '../../../modules/nf-core/csvtk/join' +include { CSVTK_JOIN as JOIN_TX_TPM } from '../../../modules/nf-core/csvtk/join' +include { 
SPLIT_TYPES as SPLIT_TYPES_COUNTS } from '../../../modules/local/quantification/split_types' +include { SPLIT_TYPES as SPLIT_TYPES_TPM } from '../../../modules/local/quantification/split_types' + +workflow PSIRC_QUANT { + take: + reads + ch_transcriptome_fasta + ch_transcriptome_gtf + circ_annotation_bed + circ_annotation_gtf + bootstrap_samples + ch_phenotype + ch_faidx + + main: + ch_versions = Channel.empty() + + MARK_CIRCULAR(ch_transcriptome_fasta, []) + ch_versions = ch_versions.mix(MARK_CIRCULAR.out.versions) + + PSIRC_INDEX(MARK_CIRCULAR.out.output) + RUN_PSIRC_QUANT(reads, PSIRC_INDEX.out.index.collect(), MARK_CIRCULAR.out.output, ch_faidx, bootstrap_samples) + + CUSTOM_TX2GENE( + ch_transcriptome_gtf, + RUN_PSIRC_QUANT.out.directory.map{meta, quant -> quant}.collect().map{[[id: "quant"], it]}, + "kallisto", + "gene_id", + "gene_name" + ) + + TXIMETA_TXIMETA( + RUN_PSIRC_QUANT.out.directory, + "kallisto" + ) + + TXIMETA_TXIMPORT( + RUN_PSIRC_QUANT.out.directory, + CUSTOM_TX2GENE.out.tx2gene, + "kallisto" + ) + + ch_versions = ch_versions.mix( + PSIRC_INDEX.out.versions, + RUN_PSIRC_QUANT.out.versions, + CUSTOM_TX2GENE.out.versions, + TXIMETA_TXIMETA.out.versions, + TXIMETA_TXIMPORT.out.versions + ) + + JOIN_GENE_COUNTS( + TXIMETA_TXIMPORT.out.counts_gene.map{meta, counts -> counts}.collect().map{[[id: "gene_counts"], it]} + ) + + JOIN_GENE_TPM( + TXIMETA_TXIMPORT.out.tpm_gene.map{meta, tpm -> tpm}.collect().map{[[id: "gene_tpm"], it]} + ) + + JOIN_TX_COUNTS( + TXIMETA_TXIMPORT.out.counts_transcript.map{meta, counts -> counts}.collect().map{[[id: "tx_counts"], it]} + ) + + JOIN_TX_TPM( + TXIMETA_TXIMPORT.out.tpm_transcript.map{meta, tpm -> tpm}.collect().map{[[id: "tx_tpm"], it]} + ) + + SPLIT_TYPES_COUNTS( + JOIN_TX_COUNTS.out.csv + ) + + SPLIT_TYPES_TPM( + JOIN_TX_TPM.out.csv + ) + + + MERGE_EXPERIMENTS( + TXIMETA_TXIMETA.out.se.map{meta, se -> se}.collect().map{[[id: "experiments"], it]}, + ch_phenotype.ifEmpty([[], []]), + ch_transcriptome_gtf, + 
JOIN_TX_TPM.out.csv + ) + + ch_versions = ch_versions.mix( + JOIN_GENE_COUNTS.out.versions, + JOIN_GENE_TPM.out.versions, + JOIN_TX_COUNTS.out.versions, + JOIN_TX_TPM.out.versions, + SPLIT_TYPES_COUNTS.out.versions, + SPLIT_TYPES_TPM.out.versions, + MERGE_EXPERIMENTS.out.versions + ) + + emit: + se = MERGE_EXPERIMENTS.out.merged + rds = MERGE_EXPERIMENTS.out.merged + gene_counts = JOIN_GENE_COUNTS.out.csv + gene_tpm = JOIN_GENE_TPM.out.csv + tx_counts = JOIN_TX_COUNTS.out.csv + tx_tpm = JOIN_TX_TPM.out.csv + linear_tx_counts = SPLIT_TYPES_COUNTS.out.linear + linear_tx_tpm = SPLIT_TYPES_TPM.out.linear + circular_tx_counts = SPLIT_TYPES_COUNTS.out.circular + circular_tx_tpm = SPLIT_TYPES_TPM.out.circular + + versions = ch_versions +} diff --git a/subworkflows/local/statistical_tests.nf b/subworkflows/local/statistical_tests.nf new file mode 100644 index 000000000..4abc3605c --- /dev/null +++ b/subworkflows/local/statistical_tests.nf @@ -0,0 +1,77 @@ +include { CIRCTEST_PREPARE } from '../../modules/local/circtest/prepare' +include { CIRCTEST_CIRCTEST } from '../../modules/local/circtest/circtest' +include { CIRIQUANT_PREPDE } from '../../modules/local/ciriquant/prepde' +include { STRINGTIE_PREPDE } from '../../modules/local/stringtie/prepde' +include { CIRIQUANT_DE } from '../../modules/local/ciriquant/de' + +workflow STATISTICAL_TESTS { + take: + ch_gene_counts + ch_circ_counts + ch_ciriquant + ch_stringtie + ch_phenotype + + main: + ch_versions = Channel.empty() + + ch_counts = ch_gene_counts.join(ch_circ_counts) + + CIRCTEST_PREPARE(ch_counts) + ch_versions = ch_versions.mix(CIRCTEST_PREPARE.out.versions) + + CIRCTEST_CIRCTEST(CIRCTEST_PREPARE.out.counts, + ch_phenotype) + ch_versions = ch_versions.mix(CIRCTEST_CIRCTEST.out.versions) + + ch_phenotype_annotations = ch_phenotype + .map{ meta, table -> table.text } + .splitCsv( header: true ) + + ch_condition_samples = ch_phenotype_annotations + .map{ annotations -> [annotations.sample, annotations.condition] } + 
.join(ch_ciriquant.map{ meta, gtf -> [meta.id, gtf] }) + .join(ch_stringtie.map{ meta, gtf -> [meta.id, gtf] }) + .map{ sample, condition, ciriquant, stringtie -> [condition, [sample, ciriquant, stringtie]] } + .groupTuple() + .map{ condition, samples -> + [condition, samples.sort({ a, b -> a[0] <=> b[0] }).transpose()] + } + .map{ condition, samples -> + [condition, samples[0], samples[1], samples[2]] + } + + ch_condition_pairs = ch_condition_samples + .combine(ch_condition_samples) + .filter{ c_control, s_control, f_ciri_control, f_stringtie_control, c_treatment, s_treatment, f_ciri_treatment, f_stringtie_treatment + -> c_control > c_treatment } + .map{ c_control, s_control, f_ciri_control, f_stringtie_control, c_treatment, s_treatment, f_ciri_treatment, f_stringtie_treatment -> + [ [id: "${c_control}_${c_treatment}"], + s_control + s_treatment, + f_ciri_control + f_ciri_treatment, + f_stringtie_control + f_stringtie_treatment, + ['C'] * s_control.size() + ['T'] * s_treatment.size() + ]} + + CIRIQUANT_PREPDE(ch_condition_pairs + .map{meta, samples, ciri, stringtie, conditions -> [meta, samples, ciri, conditions]} + ) + ch_versions = ch_versions.mix(CIRIQUANT_PREPDE.out.versions) + + STRINGTIE_PREPDE(ch_condition_pairs + .map{meta, samples, ciri, stringtie, conditions -> [meta, samples, stringtie]} + ) + ch_versions = ch_versions.mix(STRINGTIE_PREPDE.out.versions) + + CIRIQUANT_DE( + CIRIQUANT_PREPDE.out.library.join( + CIRIQUANT_PREPDE.out.expression + ).join( + STRINGTIE_PREPDE.out.gene_matrix + ) + ) + ch_versions = ch_versions.mix(CIRIQUANT_DE.out.versions) + + emit: + versions = ch_versions +} diff --git a/subworkflows/local/utils_nfcore_circrna_pipeline/main.nf b/subworkflows/local/utils_nfcore_circrna_pipeline/main.nf index aae831258..bb303899a 100644 --- a/subworkflows/local/utils_nfcore_circrna_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_circrna_pipeline/main.nf @@ -168,6 +168,12 @@ def validateInputSamplesheet(input) { error("Please check 
input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") } + // Check that multiple runs of the same sample are of the same strandedness i.e. auto / unstranded / forward / reverse + def strandedness_ok = metas.collect{ it.strandedness }.unique().size == 1 + if (!strandedness_ok) { + error("Please check input samplesheet -> Multiple runs of a sample must be of the same strandedness: ${metas[0].id}") + } + return [ metas[0], fastqs ] } // @@ -260,4 +266,3 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } - diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/main.nf b/subworkflows/nf-core/bam_sort_stats_samtools/main.nf new file mode 100644 index 000000000..b716375b0 --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/main.nf @@ -0,0 +1,50 @@ +// +// Sort, index BAM file and run samtools stats, flagstat and idxstats +// + +include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' +include { BAM_STATS_SAMTOOLS } from '../bam_stats_samtools/main' + +workflow BAM_SORT_STATS_SAMTOOLS { + take: + ch_bam // channel: [ val(meta), [ bam ] ] + ch_fasta // channel: [ val(meta), path(fasta) ] + + main: + + ch_versions = Channel.empty() + + SAMTOOLS_SORT ( ch_bam, ch_fasta ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first()) + + SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + + SAMTOOLS_SORT.out.bam + .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) + .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) + .map { + meta, bam, bai, csi -> + if (bai) { + [ meta, bam, bai ] + } else { + [ meta, bam, csi ] + } + } + .set { ch_bam_bai } + + BAM_STATS_SAMTOOLS ( ch_bam_bai, ch_fasta ) + ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) + + emit: + bam = 
SAMTOOLS_SORT.out.bam // channel: [ val(meta), [ bam ] ] + bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] + csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] + + stats = BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), [ stats ] ] + flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), [ flagstat ] ] + idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), [ idxstats ] ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml b/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml new file mode 100644 index 000000000..e01f9ccf6 --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml @@ -0,0 +1,70 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: bam_sort_stats_samtools +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +components: + - samtools/sort + - samtools/index + - samtools/stats + - samtools/idxstats + - samtools/flagstat + - bam_stats_samtools +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" +# TODO Update when we decide on a standard for subworkflow docs +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test new file mode 100644 index 000000000..821a3cf50 --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test @@ -0,0 +1,134 @@ +nextflow_workflow { + + name "Test Workflow BAM_SORT_STATS_SAMTOOLS" + script "../main.nf" + workflow "BAM_SORT_STATS_SAMTOOLS" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/bam_sort_stats_samtools" + tag "bam_sort_stats_samtools" + tag "subworkflows/bam_stats_samtools" + tag "bam_stats_samtools" + tag "samtools" + tag "samtools/index" + tag "samtools/sort" + tag "samtools/stats" + tag "samtools/idxstats" + tag "samtools/flagstat" + + test("test_bam_sort_stats_samtools_single_end") { + + when { + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + 
""" + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.bam.get(0).get(1) ==~ ".*.bam"}, + { assert workflow.out.bai.get(0).get(1) ==~ ".*.bai"}, + { assert snapshot( + workflow.out.flagstat, + workflow.out.idxstats, + workflow.out.stats, + workflow.out.versions).match() } + ) + } + } + + test("test_bam_sort_stats_samtools_paired_end") { + + when { + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.bam.get(0).get(1) ==~ ".*.bam"}, + { assert workflow.out.bai.get(0).get(1) ==~ ".*.bai"}, + { assert snapshot( + workflow.out.flagstat, + workflow.out.idxstats, + workflow.out.stats, + workflow.out.versions).match() } + ) + } + } + + test("test_bam_sort_stats_samtools_single_end - stub") { + + options "-stub" + + when { + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match() } + ) + } + } + + test("test_bam_sort_stats_samtools_paired_end - stub") { + + options "-stub" + + when { + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ 
id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match() } + ) + } + } +} diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test.snap b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test.snap new file mode 100644 index 000000000..b7f4da177 --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test.snap @@ -0,0 +1,330 @@ +{ + "test_bam_sort_stats_samtools_single_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,2191911d72575a2358b08b1df64ccb53" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,613e048487662c694aa4a2f73ca96a20" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d32de3b3716a11039cef2367c3c1a56e" + ] + ], + [ + "versions.yml:md5,494b5530a1aa29fd5867cf655bebbfe1", + "versions.yml:md5,9fcb0cd845bfb1f89d83201bb20649b4", + "versions.yml:md5,bacc323ec4055d6f69f07a09089772d1", + "versions.yml:md5,ce946e97097c6a9ccf834a3f91f6da30", + "versions.yml:md5,d6c8dae685f1b7d050165fc15c7a20b5" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T17:02:44.34964" + }, + "test_bam_sort_stats_samtools_paired_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,cca83e4fc9406fc3875b5e60055d6574" + ] + ], + [ + "versions.yml:md5,494b5530a1aa29fd5867cf655bebbfe1", + "versions.yml:md5,9fcb0cd845bfb1f89d83201bb20649b4", + "versions.yml:md5,bacc323ec4055d6f69f07a09089772d1", + 
"versions.yml:md5,ce946e97097c6a9ccf834a3f91f6da30", + "versions.yml:md5,d6c8dae685f1b7d050165fc15c7a20b5" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T17:03:02.583095" + }, + "test_bam_sort_stats_samtools_single_end - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + "versions.yml:md5,494b5530a1aa29fd5867cf655bebbfe1", + "versions.yml:md5,9fcb0cd845bfb1f89d83201bb20649b4", + "versions.yml:md5,bacc323ec4055d6f69f07a09089772d1", + "versions.yml:md5,ce946e97097c6a9ccf834a3f91f6da30", + "versions.yml:md5,d6c8dae685f1b7d050165fc15c7a20b5" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "flagstat": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "idxstats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,494b5530a1aa29fd5867cf655bebbfe1", + "versions.yml:md5,9fcb0cd845bfb1f89d83201bb20649b4", + 
"versions.yml:md5,bacc323ec4055d6f69f07a09089772d1", + "versions.yml:md5,ce946e97097c6a9ccf834a3f91f6da30", + "versions.yml:md5,d6c8dae685f1b7d050165fc15c7a20b5" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T17:03:22.328703" + }, + "test_bam_sort_stats_samtools_paired_end - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + "versions.yml:md5,494b5530a1aa29fd5867cf655bebbfe1", + "versions.yml:md5,9fcb0cd845bfb1f89d83201bb20649b4", + "versions.yml:md5,bacc323ec4055d6f69f07a09089772d1", + "versions.yml:md5,ce946e97097c6a9ccf834a3f91f6da30", + "versions.yml:md5,d6c8dae685f1b7d050165fc15c7a20b5" + ], + "bai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "csi": [ + + ], + "flagstat": [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "idxstats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,494b5530a1aa29fd5867cf655bebbfe1", + 
"versions.yml:md5,9fcb0cd845bfb1f89d83201bb20649b4", + "versions.yml:md5,bacc323ec4055d6f69f07a09089772d1", + "versions.yml:md5,ce946e97097c6a9ccf834a3f91f6da30", + "versions.yml:md5,d6c8dae685f1b7d050165fc15c7a20b5" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.3" + }, + "timestamp": "2024-07-22T17:03:38.833662" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/tests/tags.yml b/subworkflows/nf-core/bam_sort_stats_samtools/tests/tags.yml new file mode 100644 index 000000000..30b69d6a4 --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/bam_sort_stats_samtools: + - subworkflows/nf-core/bam_sort_stats_samtools/** diff --git a/subworkflows/nf-core/bam_stats_samtools/main.nf b/subworkflows/nf-core/bam_stats_samtools/main.nf new file mode 100644 index 000000000..44d4c010a --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/main.nf @@ -0,0 +1,32 @@ +// +// Run SAMtools stats, flagstat and idxstats +// + +include { SAMTOOLS_STATS } from '../../../modules/nf-core/samtools/stats/main' +include { SAMTOOLS_IDXSTATS } from '../../../modules/nf-core/samtools/idxstats/main' +include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/samtools/flagstat/main' + +workflow BAM_STATS_SAMTOOLS { + take: + ch_bam_bai // channel: [ val(meta), path(bam), path(bai) ] + ch_fasta // channel: [ val(meta), path(fasta) ] + + main: + ch_versions = Channel.empty() + + SAMTOOLS_STATS ( ch_bam_bai, ch_fasta ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions) + + SAMTOOLS_FLAGSTAT ( ch_bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) + + SAMTOOLS_IDXSTATS ( ch_bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions) + + emit: + stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), path(stats) ] + flagstat = SAMTOOLS_FLAGSTAT.out.flagstat // channel: [ val(meta), path(flagstat) ] + idxstats = 
SAMTOOLS_IDXSTATS.out.idxstats // channel: [ val(meta), path(idxstats) ] + + versions = ch_versions // channel: [ path(versions.yml) ] +} diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml new file mode 100644 index 000000000..809bf736b --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -0,0 +1,43 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: bam_stats_samtools +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +components: + - samtools/stats + - samtools/idxstats + - samtools/flagstat +input: + - ch_bam_bai: + description: | + The input channel containing the BAM/CRAM and it's index + Structure: [ val(meta), path(bam), path(bai) ] + - ch_fasta: + description: | + Reference genome fasta file + Structure: [ path(fasta) ] +output: + - stats: + description: | + File containing samtools stats output + Structure: [ val(meta), path(stats) ] + - flagstat: + description: | + File containing samtools flagstat output + Structure: [ val(meta), path(flagstat) ] + - idxstats: + description: | + File containing samtools idxstats output + Structure: [ val(meta), path(idxstats)] + - versions: + description: | + Files containing software versions + Structure: [ path(versions.yml) ] +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/subworkflows/nf-core/fastqc_trimgalore.nf b/subworkflows/nf-core/fastqc_trimgalore.nf new file mode 100644 index 000000000..50f9dc97b --- /dev/null +++ b/subworkflows/nf-core/fastqc_trimgalore.nf @@ -0,0 +1,48 @@ +// +// Read QC, UMI extraction and trimming +// + +include { FASTQC } from '../../modules/nf-core/fastqc/main' +include { TRIMGALORE } from '../../modules/nf-core/trimgalore/main' + +workflow FASTQC_TRIMGALORE { + take: + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // 
boolean: true/false + skip_trimming // boolean: true/false + + main: + ch_versions = Channel.empty() + fastqc_html = Channel.empty() + fastqc_zip = Channel.empty() + if (!skip_fastqc) { + FASTQC ( reads ).html.set { fastqc_html } + fastqc_zip = FASTQC.out.zip + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } + + trim_reads = reads + trim_html = Channel.empty() + trim_zip = Channel.empty() + trim_log = Channel.empty() + trimgalore_versions = Channel.empty() + if (!skip_trimming) { + TRIMGALORE ( reads ).reads.set { trim_reads } + trim_html = TRIMGALORE.out.html + trim_zip = TRIMGALORE.out.zip + trim_log = TRIMGALORE.out.log + ch_versions = ch_versions.mix(TRIMGALORE.out.versions.first()) + } + + emit: + reads = trim_reads // channel: [ val(meta), [ reads ] ] + + fastqc_html // channel: [ val(meta), [ html ] ] + fastqc_zip // channel: [ val(meta), [ zip ] ] + + trim_html // channel: [ val(meta), [ html ] ] + trim_zip // channel: [ val(meta), [ zip ] ] + trim_log // channel: [ val(meta), [ txt ] ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/workflows/circrna.nf b/workflows/circrna.nf deleted file mode 100644 index 1f63f055c..000000000 --- a/workflows/circrna.nf +++ /dev/null @@ -1,97 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_circrna_pipeline' - -/* 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow CIRCRNA { - - take: - ch_samplesheet // channel: samplesheet read in from --input - main: - - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - // - // MODULE: Run FastQC - // - FASTQC ( - ch_samplesheet - ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - - // - // Collate and save software versions - // - softwareVersionsToYAML(ch_versions) - .collectFile( - storeDir: "${params.outdir}/pipeline_info", - name: 'nf_core_' + 'pipeline_software_' + 'mqc_' + 'versions.yml', - sort: true, - newLine: true - ).set { ch_collated_versions } - - - // - // MODULE: MultiQC - // - ch_multiqc_config = Channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() - - summary_params = paramsSummaryMap( - workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
- file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( - ch_methods_description.collectFile( - name: 'methods_description_mqc.yaml', - sort: true - ) - ) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList(), - [], - [] - ) - - emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] - -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ diff --git a/workflows/circrna/main.nf b/workflows/circrna/main.nf new file mode 100644 index 000000000..16f3b524a --- /dev/null +++ b/workflows/circrna/main.nf @@ -0,0 +1,248 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? 
Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// SUBWORKFLOWS: +include { paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryMultiqc } from '../../subworkflows/nf-core/utils_nfcore_pipeline' +include { validateInputSamplesheet } from '../../subworkflows/local/utils_nfcore_circrna_pipeline' + +include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' +include { PREPARE_GENOME } from '../../subworkflows/local/prepare_genome' +include { BSJ_DETECTION } from '../../subworkflows/local/bsj_detection' +include { COMBINE_TRANSCRIPTOMES } from '../../subworkflows/local/combine_transcriptomes' +include { QUANTIFICATION } from '../../subworkflows/local/quantification' +include { MIRNA_PREDICTION } from '../../subworkflows/local/mirna_prediction' +include { STATISTICAL_TESTS } from '../../subworkflows/local/statistical_tests' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// MODULES: +include { MULTIQC } from '../../modules/nf-core/multiqc/main' +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main' + +// SUBWORKFLOWS: +include { FASTQC_TRIMGALORE } from '../../subworkflows/nf-core/fastqc_trimgalore' +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +workflow CIRCRNA { + take: + ch_samplesheet + ch_phenotype + ch_fasta + ch_gtf + ch_mature + ch_annotation + ch_versions + ch_mirna + + main: + + ch_multiqc_files = Channel.empty() + + // + // 1. Pre-processing + // + + // SUBWORKFLOW: + ch_samplesheet + .map { + meta, fastq_1, fastq_2 -> + if (!fastq_2) { + return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] + } else { + return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] + } + } + .groupTuple() + .map { + validateInputSamplesheet(it) + } + .map { + meta, fastqs -> + return [ meta, fastqs.flatten() ] + } + .branch { + meta, fastqs -> + single : fastqs.size() == 1 + return [ meta, fastqs ] + multiple: fastqs.size() > 1 + return [ meta, fastqs ] + } + .set { ch_fastq } + + // MODULE: + // Concatenate FastQ files from same sample if required + CAT_FASTQ (ch_fastq.multiple) + .reads + .mix(ch_fastq.single) + .set { ch_cat_fastq } + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) + + // SUBORKFLOW: + // Prepare index files &/or use iGenomes if chosen. + PREPARE_GENOME ( + ch_fasta, + ch_gtf + ) + + ch_gtf = PREPARE_GENOME.out.gtf + bowtie_index = PREPARE_GENOME.out.bowtie + bowtie2_index = PREPARE_GENOME.out.bowtie2 + bwa_index = PREPARE_GENOME.out.bwa + chromosomes = PREPARE_GENOME.out.chromosomes + hisat2_index = PREPARE_GENOME.out.hisat2 + star_index = PREPARE_GENOME.out.star + ch_versions = ch_versions.mix(PREPARE_GENOME.out.versions) + + // MODULE: Run FastQC, trimgalore! + FASTQC_TRIMGALORE ( + ch_cat_fastq, + params.skip_fastqc, + params.skip_trimming + ) + ch_versions = ch_versions.mix(FASTQC_TRIMGALORE.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMGALORE.out.trim_log.collect{it[1]}.ifEmpty([])) + + // + // 2. 
BSJ Discovery + // + + BSJ_DETECTION( + FASTQC_TRIMGALORE.out.reads, + ch_fasta, + ch_gtf, + ch_annotation, + bowtie_index, + bowtie2_index, + bwa_index, + chromosomes, + hisat2_index, + star_index, + params.bsj_reads, + params.exon_boundary + ) + + ch_multiqc_files = ch_multiqc_files.mix(BSJ_DETECTION.out.multiqc_files) + ch_versions = ch_versions.mix(BSJ_DETECTION.out.versions) + + COMBINE_TRANSCRIPTOMES( + ch_fasta, + ch_gtf, + BSJ_DETECTION.out.gtf + ) + + ch_versions = ch_versions.mix(COMBINE_TRANSCRIPTOMES.out.versions) + + // + // 3. circRNA quantification + // + + QUANTIFICATION( + FASTQC_TRIMGALORE.out.reads, + ch_gtf, + ch_fasta, + COMBINE_TRANSCRIPTOMES.out.fasta, + COMBINE_TRANSCRIPTOMES.out.gtf, + BSJ_DETECTION.out.bed12, + BSJ_DETECTION.out.gtf, + params.bootstrap_samples, + ch_phenotype, + PREPARE_GENOME.out.faidx, + PREPARE_GENOME.out.bwa, + PREPARE_GENOME.out.hisat2 + ) + + ch_versions = ch_versions.mix(QUANTIFICATION.out.versions) + + // + // 4. miRNA prediction + // + + if (params.mature) { + MIRNA_PREDICTION( + COMBINE_TRANSCRIPTOMES.out.fasta, + BSJ_DETECTION.out.bed12, + ch_mature, + ch_mirna, + QUANTIFICATION.out.circ, + QUANTIFICATION.out.rds + ) + ch_versions = ch_versions.mix(MIRNA_PREDICTION.out.versions) + } + + // + // 5. Statistical tests + // + + STATISTICAL_TESTS( + QUANTIFICATION.out.gene, + QUANTIFICATION.out.circ, + QUANTIFICATION.out.ciriquant, + QUANTIFICATION.out.stringtie, + ch_phenotype + ) + + ch_versions = ch_versions.mix(STATISTICAL_TESTS.out.versions) + + + // + // Collate and save software versions + // + softwareVersionsToYAML(ch_versions) + .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true) + .set { ch_collated_versions } + + // MultiQC + ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() + ch_multiqc_logo = params.multiqc_logo ? 
Channel.fromPath(params.multiqc_logo) : Channel.empty() + summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") + ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) + + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() + ) + + emit: + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/
Process Name \\", + " \\ Software Version
CUSTOM_DUMPSOFTWAREVERSIONSpython3.11.7
yaml5.4.1
TOOL1tool10.11.9
TOOL2tool21.9
WorkflowNextflow