diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index f2c8a55..9c53149 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -128,8 +128,8 @@ body: label: Were you able to successfully run the latest version of the workflow with the demo data? description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? options: - - yes - - no + - 'yes' + - 'no' - other (please describe below) validations: required: true diff --git a/CHANGELOG.md b/CHANGELOG.md index 3321011..c49acff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [v1.0.3] +### Added +- Publish stringtie transcriptome fasta and GFF files to output dir. ### Fixed - More informative error message upon read duplicate detection. ### Updated diff --git a/README.md b/README.md index 31c8767..51ab7e9 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,8 @@ Output files may be aggregated including information for all samples or provided | Alignment index per chromosome | ./{{ alias }}/bams/{{ alias }}.{{ chromosome }}.tagged.bam.bai | Genomic alignment index file per chromosome. | per-sample | | Alignment output per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam | Genomic alignment output file with aggregated chromosomes (when using --merge_bam). 
| per-sample | | Alignment index per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam.bai | Genomic alignment index file with aggregated chromosomes (when using --merge_bam). | per-sample | +| Transcriptome sequence | ./{{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by Stringtie during transcript discovery stage | per-sample | +| Transcriptome annotation | ./{{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by Stringtie during transcript discovery stage | per-sample | diff --git a/docs/07_outputs.md b/docs/07_outputs.md index db24a99..ff439cd 100644 --- a/docs/07_outputs.md +++ b/docs/07_outputs.md @@ -19,3 +19,5 @@ Output files may be aggregated including information for all samples or provided | Alignment index per chromosome | ./{{ alias }}/bams/{{ alias }}.{{ chromosome }}.tagged.bam.bai | Genomic alignment index file per chromosome. | per-sample | | Alignment output per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam | Genomic alignment output file with aggregated chromosomes (when using --merge_bam). | per-sample | | Alignment index per sample | ./{{ alias }}/bams/{{ alias }}.tagged.sorted.bam.bai | Genomic alignment index file with aggregated chromosomes (when using --merge_bam). 
| per-sample | +| Transcriptome sequence | ./{{ alias }}/{{ alias }}.transcriptome.fa.gz | Transcriptome generated by Stringtie during transcript discovery stage | per-sample | +| Transcriptome annotation | ./{{ alias }}/{{ alias }}.transcriptome.gff.gz | Transcriptome annotation generated by Stringtie during transcript discovery stage | per-sample | diff --git a/main.nf b/main.nf index bed925f..5e6f990 100644 --- a/main.nf +++ b/main.nf @@ -114,7 +114,7 @@ process output { publishDir "${params.out_dir}", mode: 'copy', pattern: "*umap*.{tsv,png}", saveAs: { filename -> "${meta.alias}/umap/$filename" } publishDir "${params.out_dir}", mode: 'copy', - pattern: "*{images,counts,gene_expression,transcript_expression,kneeplot,saturation,config,tags,whitelist}*", + pattern: "*{images,counts,gene_expression,transcript_expression,kneeplot,saturation,config,tags,whitelist,transcriptome,annotation}*", saveAs: { filename -> "${meta.alias}/$filename" } input: diff --git a/nextflow.config b/nextflow.config index 018acfe..fe9c944 100644 --- a/nextflow.config +++ b/nextflow.config @@ -75,7 +75,7 @@ manifest { description = 'Identification of cell- and UMI barcodes from single-cell sequencing.' 
mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = '1.0.2' + version = '1.0.3' } epi2melabs { diff --git a/output_definition.json b/output_definition.json index 90e9b30..fc0be21 100644 --- a/output_definition.json +++ b/output_definition.json @@ -135,6 +135,22 @@ "mime-type": "application/gzip", "optional": true, "type": "per-sample" + }, + "transcriptome_fasta": { + "filepath": "./{{ alias }}/{{ alias }}.transcriptome.fa.gz", + "title": "Transcriptome sequence", + "description": "Transcriptome generated by Stringtie during transcript discovery stage", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" + }, + "transcriptome_annotation": { + "filepath": "./{{ alias }}/{{ alias }}.transcriptome.gff.gz", + "title": "Transcriptome annotation", + "description": "Transcriptome annotation generated by Stringtie during transcript discovery stage", + "mime-type": "application/gzip", + "optional": false, + "type": "per-sample" } } } \ No newline at end of file diff --git a/subworkflows/process_bams.nf b/subworkflows/process_bams.nf index c98d240..baaab18 100644 --- a/subworkflows/process_bams.nf +++ b/subworkflows/process_bams.nf @@ -244,7 +244,6 @@ process combine_uncorrect_bcs { } - process combine_chrom_bams { // Merge all chromosome bams by sample_id label "singlecell" @@ -284,30 +283,32 @@ process stringtie { output: tuple val(meta), val(chr), - path("transcriptome.fa"), + path("${meta.alias}.transcriptome.fa"), path("chr.gtf"), - path("stringtie.gff"), + path("${meta.alias}.stringtie.gff"), path("reads.fastq"), emit: read_tr_map script: if (meta.kit_name=="5prime") """ + # Add chromosome label (-l) to generated transcripts + # so we don't get name collisions during file merge later samtools view -h align.bam ${chr} \ - | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l stringtie \ - -o stringtie.gff - ) \ - | samtools fastq > reads.fastq + | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G 
chr.gtf -l "${chr}.stringtie" \ + -o "${meta.alias}.stringtie.gff" - ) \ + | samtools fastq > reads.fastq # Get transcriptome sequence - gffread -g ref_genome.fa -w transcriptome.fa stringtie.gff + gffread -g ref_genome.fa -w "${meta.alias}.transcriptome.fa" "${meta.alias}.stringtie.gff" """ else - """ + """ # Data from 3prime and multiome kits must be flipped to the transcript strand before building transcriptome. workflow-glue process_bam_for_stringtie align.bam ${chr} \ - | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l stringtie \ - -o stringtie.gff - ) \ + | tee >(stringtie -L ${params.stringtie_opts} -p ${task.cpus} -G chr.gtf -l "${chr}.stringtie" \ + -o "${meta.alias}.stringtie.gff" - ) \ | samtools fastq > reads.fastq # Get transcriptome sequence - gffread -g ref_genome.fa -w transcriptome.fa stringtie.gff + gffread -g ref_genome.fa -w "${meta.alias}.transcriptome.fa" "${meta.alias}.stringtie.gff" """ } @@ -371,6 +372,9 @@ process assign_features { val(chr), path("${meta.alias}.${chr}.feature_assigns.tsv"), emit: feature_assigns + tuple val(meta), + path("gffcompare.annotated.gtf"), + emit: annotation """ # gffcomapre maps transcript reference IDs to query transcripts. 
gffcompare -o gffcompare -r chr.gtf stringtie.gff
@@ -475,6 +479,31 @@ process umap_reduce_expression_matrix {
     """
 }
 
+process merge_transcriptome {
+    // Merge the per-chromosome transcriptome FASTAs and annotation files
+    // of one sample into a single gzipped FASTA and a single gzipped GFF.
+    label "singlecell"
+    cpus 1
+    memory "2GB"
+    input:
+        // Inputs are staged under fasta/ and gffs/ with numbered names.
+        tuple val(meta),
+            path('fasta/?.fa'),
+            path('gffs/?.gff')
+    output:
+        tuple val(meta),
+            path("${meta.alias}.transcriptome.gff.gz"),
+            path("${meta.alias}.transcriptome.fa.gz"),
+            emit: merged_annotation
+    """
+    # Concatenate transcriptome files, remove comments (from gff) and compress.
+    # sed is used for comment removal because grep -v exits non-zero when no
+    # lines remain, which would fail the task if the shell runs with pipefail.
+    find fasta/ -name '*.fa' -exec cat {} + | gzip > "${meta.alias}.transcriptome.fa.gz"
+    find gffs/ -name '*.gff' -exec cat {} + | sed '/^#/d' | gzip > "${meta.alias}.transcriptome.gff.gz"
+    """
+}
+
 process pack_images {
     label "singlecell"
 
@@ -618,6 +647,13 @@ workflow process_bams {
             .concat(umi_gene_saturation.out.saturation_curve)
             .groupTuple())
 
+        merge_transcriptome(
+            assign_features.out.annotation.groupTuple()
+            .join(stringtie.out.read_tr_map.groupTuple())
+            .map{
+                meta, ann_tr_gff, chr, tr_fa, ref_gtf, str_gff, fastq ->
+                [meta, tr_fa, ann_tr_gff]})
+
         // Tidy up channels prior to output
         proc_expresion_out = process_expression_matrix.out.gene_matrix_processed_tsv
             .concat(process_expression_matrix.out.transcript_matrix_processed_tsv)
@@ -635,6 +671,7 @@ workflow process_bams {
             .join(tagged_bams)
             .join(combine_uncorrect_bcs.out)
             .join(pack_images.out)
+            .join(merge_transcriptome.out)
             .map{it -> it.flatten()}
 
         // Emit sperately for use in the report