nf-core · maxulysse · May 29, 2024 · May 31, 2024 · Jun 10, 2024 · Jun 10, 2024
diff --git a/main.nf b/main.nf
@@ -14,19 +14,13 @@
     GENOME PARAMETER VALUES
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
+
 params.fasta             = getGenomeAttribute('fasta')
-params.fasta_fai         = getGenomeAttribute('fasta_fai')
-params.dict              = getGenomeAttribute('dict')
+params.gene_bed          = getGenomeAttribute('bed12')
 params.gtf               = getGenomeAttribute('gtf')
-params.gff               = getGenomeAttribute('gff')
-params.exon_bed          = getGenomeAttribute('exon_bed')
-params.star_index        = getGenomeAttribute('star')
-params.dbsnp             = getGenomeAttribute('dbsnp')
-params.dbsnp_tbi         = getGenomeAttribute('dbsnp_tbi')
-params.known_indels      = getGenomeAttribute('known_indels')
-params.known_indels_tbi  = getGenomeAttribute('known_indels_tbi')
 params.snpeff_db         = getGenomeAttribute('snpeff_db')
 params.snpeff_genome     = getGenomeAttribute('snpeff_genome')
+params.star_index        = getGenomeAttribute('star')
 params.vep_cache_version = getGenomeAttribute('vep_cache_version')
 params.vep_genome        = getGenomeAttribute('vep_genome')
 params.vep_species       = getGenomeAttribute('vep_species')
@@ -61,29 +55,18 @@ workflow NFCORE_RNAVAR {
 
     ch_versions = Channel.empty()
 
-    // Initialize fasta file with meta map:
-    ch_fasta_raw      = params.fasta                   ? Channel.fromPath(params.fasta).map{ it -> [ [id:it.baseName], it ] }.collect()    : Channel.empty()
-
     // Initialize file channels based on params, defined in the params.genomes[params.genome] scope
-    ch_dbsnp          = params.dbsnp                   ? Channel.fromPath(params.dbsnp).collect()                                          : Channel.value([])
-    ch_known_indels   = params.known_indels            ? Channel.fromPath(params.known_indels).collect()                                   : Channel.value([])
-    ch_gff            = params.gff                     ? Channel.fromPath(params.gff).collect()                                            : Channel.empty()
-    ch_gtf_raw        = params.gtf                     ? Channel.fromPath(params.gtf).map{ gtf -> [ [ id:gtf.baseName ], gtf ] }.collect() : Channel.empty()
+    ch_dbsnp          = params.dbsnp              ? Channel.fromPath(params.dbsnp).collect()        : Channel.value([])
+    ch_known_indels   = params.known_indels       ? Channel.fromPath(params.known_indels).collect() : Channel.value([])
 
-    // Initialize variant annotation associated channels
+    // Initialize value channels based on params, defined in the params.genomes[params.genome] scope
     snpeff_db         = params.snpeff_db          ?: Channel.empty()
     vep_cache_version = params.vep_cache_version  ?: Channel.empty()
     vep_genome        = params.vep_genome         ?: Channel.empty()
     vep_species       = params.vep_species        ?: Channel.empty()
 
-    seq_platform      = params.seq_platform ?: []
-    seq_center        = params.seq_center   ?: []
-
-    // Initialize value channels based on params, defined in the params.genomes[params.genome] scope
-    snpeff_db                   = params.snpeff_db          ?:  Channel.empty()
-    vep_cache_version           = params.vep_cache_version  ?:  Channel.empty()
-    vep_genome                  = params.vep_genome         ?:  Channel.empty()
-    vep_species                 = params.vep_species        ?:  Channel.empty()
+    seq_platform      = params.seq_platform       ?: []
+    seq_center        = params.seq_center         ?: []
 
     vep_extra_files = []
 
@@ -99,16 +82,22 @@ workflow NFCORE_RNAVAR {
         vep_extra_files.add(file(params.spliceai_snv_tbi, checkIfExists: true))
     }
 
-    PREPARE_GENOME(
-        ch_fasta_raw,
-        ch_gff,
-        ch_gtf_raw,
-        ch_dbsnp,
-        ch_known_indels,
-        params.feature_type)
+    PREPARE_GENOME (
+        params.fasta,
+        params.gtf,
+        params.gff,
+        params.additional_fasta,
+        params.transcript_fasta,
+        params.gene_bed,
+        params.star_index,
+        params.gencode,
+        params.featurecounts_group_type,
+        params.skip_gtf_filter
+    )
 
     ch_fasta            = PREPARE_GENOME.out.fasta
     ch_star_index       = PREPARE_GENOME.out.star_index
+    ch_gff              = PREPARE_GENOME.out.gff
     ch_gtf              = PREPARE_GENOME.out.gtf
     ch_dict             = params.dict           ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect()
                                                 : PREPARE_GENOME.out.dict

diff --git a/nextflow.config b/nextflow.config
@@ -19,6 +19,15 @@ params {
     save_merged_fastq               = false
     feature_type                    = 'exon'
 
+    dbsnp                           = null
+    dbsnp_tbi                       = null
+    dict                            = null
+    exon_bed                        = null
+    fasta_fai                       = null
+    gff                             = null
+    known_indels                    = null
+    known_indels_tbi                = null
+
     // Sequence read information
     read_length                     = 150 // Required for STAR to build index and align reads
 

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -91,6 +91,10 @@
                     "description": "Path to GFF3 annotation file.",
                     "help_text": "This parameter must be specified if `--genome` or `--gtf` are not specified."
                 },
+                "gene_bed": {
+                    "type": "string",
+                    "description": "Path to BED file containing gene intervals. This will be created from the GTF file if not specified."
+                },
                 "exon_bed": {
                     "type": "string",
                     "description": "Path to BED file containing exon intervals. This will be created from the GTF file if not specified."

diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
@@ -1,80 +1,193 @@
 //
-// Prepare reference genome files
+// Uncompress and prepare reference genome files
 //
 
 include { BEDTOOLS_MERGE                    } from '../../../modules/nf-core/bedtools/merge/main'
 include { BEDTOOLS_SORT                     } from '../../../modules/nf-core/bedtools/sort/main'
+include { CUSTOM_CATADDITIONALFASTA         } from '../../../modules/nf-core/custom/catadditionalfasta'
+include { CUSTOM_GETCHROMSIZES              } from '../../../modules/nf-core/custom/getchromsizes'
 include { GATK4_CREATESEQUENCEDICTIONARY    } from '../../../modules/nf-core/gatk4/createsequencedictionary/main'
 include { GFFREAD                           } from '../../../modules/nf-core/gffread/main'
 include { GTF2BED                           } from '../../../modules/local/gtf2bed'
+include { GTF_FILTER                        } from '../../../modules/local/gtf_filter'
+include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../../modules/nf-core/gunzip'
+include { GUNZIP as GUNZIP_FASTA            } from '../../../modules/nf-core/gunzip'
+include { GUNZIP as GUNZIP_GENE_BED         } from '../../../modules/nf-core/gunzip'
+include { GUNZIP as GUNZIP_GFF              } from '../../../modules/nf-core/gunzip'
+include { GUNZIP as GUNZIP_GTF              } from '../../../modules/nf-core/gunzip'
+include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../../modules/nf-core/gunzip'
 include { SAMTOOLS_FAIDX                    } from '../../../modules/nf-core/samtools/faidx/main'
 include { STAR_GENOMEGENERATE               } from '../../../modules/nf-core/star/genomegenerate/main'
-include { GUNZIP as GUNZIP_FASTA            } from '../../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GTF              } from '../../../modules/nf-core/gunzip/main'
 include { TABIX_TABIX as TABIX_DBSNP        } from '../../../modules/nf-core/tabix/tabix/main'
 include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../../modules/nf-core/tabix/tabix/main'
+include { UNTAR as UNTAR_STAR_INDEX         } from '../../../modules/nf-core/untar'
 
 workflow PREPARE_GENOME {
     take:
-    ch_fasta_raw      // file: /path/to/genome.fasta
-    ch_gff            // file: /path/to/genome.gff
-    ch_gtf_raw        // file: /path/to/genome.gtf
-    ch_dbsnp
-    ch_known_indels
-    feature_type
+    fasta                    //      file: /path/to/genome.fasta
+    gtf                      //      file: /path/to/genome.gtf
+    gff                      //      file: /path/to/genome.gff
+    additional_fasta         //      file: /path/to/additional.fasta
+    transcript_fasta         //      file: /path/to/transcript.fasta
+    gene_bed                 //      file: /path/to/gene.bed
+    dbsnp                    //      file: /path/to/dbsnp.vcf
+    known_indels             //      file: /path/to/known_indels.vcf
+    star_index               // directory: /path/to/star/index/
+    featurecounts_group_type //    string: The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts
+    gencode                  //   boolean: whether the genome is from GENCODE
+    skip_gtf_filter          //   boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs
 
     main:
     ch_versions = Channel.empty()
 
-    //Unzip reference genome files if needed
-
-    if (params.fasta.endsWith('.gz')) {
-        GUNZIP_FASTA(ch_fasta_raw)
+    //
+    // Uncompress genome fasta file if required
+    //
+    if (fasta.endsWith('.gz')) {
+        ch_fasta    = GUNZIP_FASTA ( [ [:], fasta ] ).gunzip.map { it[1] }
+        ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions)
+    } else {
+        ch_fasta = Channel.value(file(fasta))
+    }
 
-        ch_fasta = GUNZIP_FASTA.out.gunzip
+    //
+    // Uncompress GTF annotation file or create from GFF3 if required
+    //
+    if (gtf || gff) {
+        if (gtf) {
+            if (gtf.endsWith('.gz')) {
+                ch_gtf      = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
+                ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
+            } else {
+                ch_gtf = Channel.value(file(gtf))
+            }
+        } else if (gff) {
+            if (gff.endsWith('.gz')) {
+                ch_gff      = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
+                ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
+            } else {
+                ch_gff = Channel.value(file(gff))
+            }
+            ch_gtf      = GFFREAD ( ch_gff ).gtf
+            ch_versions = ch_versions.mix(GFFREAD.out.versions)
+        }
 
-    } else {
-        ch_fasta = ch_fasta_raw
+        // Determine whether to filter the GTF or not
+        def filter_gtf =
+            ((
+                // Condition 1: Transcript FASTA file is not provided
+                !transcript_fasta
+            )) &&
+            (
+                // Condition 2: --skip_gtf_filter is not provided
+                !skip_gtf_filter
+            )
+        if (filter_gtf) {
+            GTF_FILTER ( ch_fasta, ch_gtf )
+            ch_gtf = GTF_FILTER.out.genome_gtf
+            ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
+        }
     }
 
-    if (params.gtf.endsWith('.gz')) {
-        GUNZIP_GTF(ch_gtf_raw)
+    //
+    // Uncompress additional fasta file and concatenate with reference fasta and gtf files
+    //
+    def biotype = gencode ? "gene_type" : featurecounts_group_type
+    if (additional_fasta) {
+        if (additional_fasta.endsWith('.gz')) {
+            ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], additional_fasta ] ).gunzip.map { it[1] }
+            ch_versions  = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions)
+        } else {
+            ch_add_fasta = Channel.value(file(additional_fasta))
+        }
 
-        ch_gtf = GUNZIP_GTF.out.gunzip
+        CUSTOM_CATADDITIONALFASTA (
+            ch_fasta.combine(ch_gtf).map { fasta, gtf -> [ [:], fasta, gtf ] },
+            ch_add_fasta.map { [ [:], it ] },
+            biotype
+        )
+        ch_fasta    = CUSTOM_CATADDITIONALFASTA.out.fasta.map { it[1] }.first()
+        ch_gtf      = CUSTOM_CATADDITIONALFASTA.out.gtf.map { it[1] }.first()
+        ch_versions = ch_versions.mix(CUSTOM_CATADDITIONALFASTA.out.versions)
+    }
+
+    //
+    // Uncompress gene BED annotation file or create from GTF if required
+    //
+    if (gene_bed) {
+        if (gene_bed.endsWith('.gz')) {
+            ch_gene_bed = GUNZIP_GENE_BED ( [ [:], gene_bed ] ).gunzip.map { it[1] }
+            ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions)
+        } else {
+            ch_gene_bed = Channel.value(file(gene_bed))
+        }
+    } else {
+        ch_gene_bed = GTF2BED ( ch_gtf ).bed
+        ch_versions = ch_versions.mix(GTF2BED.out.versions)
+    }
 
+    //
+    // Uncompress transcript fasta file / create if required
+    //
+    if (transcript_fasta) {
+        if (transcript_fasta.endsWith('.gz')) {
+            ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], transcript_fasta ] ).gunzip.map { it[1] }
+            ch_versions         = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions)
+        } else {
+            ch_transcript_fasta = Channel.value(file(transcript_fasta))
+        }
+        if (gencode) {
+            PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta )
+            ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta
+            ch_versions         = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
+        }
     } else {
-        ch_gtf = ch_gtf_raw
+        ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta
+        ch_versions         = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
     }
 
+    //
+    // Create chromosome sizes file
+    //
+    CUSTOM_GETCHROMSIZES ( ch_fasta.map { [ [:], it ] } )
+    ch_fai         = CUSTOM_GETCHROMSIZES.out.fai.map { it[1] }
+    ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] }
+    ch_versions    = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions)
+
     GATK4_CREATESEQUENCEDICTIONARY(ch_fasta)
-    GFFREAD(ch_gff, ch_fasta)
-    SAMTOOLS_FAIDX(ch_fasta, [['id':'genome'], []])
     TABIX_DBSNP(ch_dbsnp.flatten().map{ it -> [ [ id:it.baseName ], it ] })
     TABIX_KNOWN_INDELS(ch_known_indels.flatten().map{ it -> [ [ id:it.baseName ], it ] } )
 
-    ch_gtf = ch_gtf.mix(GFFREAD.out.gtf)
-
-    GTF2BED(ch_gtf, feature_type)
-    STAR_GENOMEGENERATE(ch_fasta, ch_gtf)
-
     ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
-    ch_versions = ch_versions.mix(GFFREAD.out.versions)
-    ch_versions = ch_versions.mix(GTF2BED.out.versions)
-    ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
-    ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
     ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions)
     ch_versions = ch_versions.mix(TABIX_KNOWN_INDELS.out.versions)
 
+    //
+    // Uncompress STAR index or generate from scratch if required
+    //
+    ch_star_index = Channel.empty()
+    if (star_index) {
+        if (star_index.endsWith('.tar.gz')) {
+            ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] }
+            ch_versions   = ch_versions.mix(UNTAR_STAR_INDEX.out.versions)
+        } else {
+            ch_star_index = Channel.value(file(star_index))
+        }
+    } else {
+            ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] } ).index.map { it[1] }
+            ch_versions   = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
+    }
+
     emit:
-    dict             = GATK4_CREATESEQUENCEDICTIONARY.out.dict                              //    path: genome.fasta.dict
-    exon_bed         = GTF2BED.out.bed.map{ bed -> [ [ id:bed.baseName ], bed ] }.collect() //    path: exon.bed
-    fasta            = ch_fasta
-    fasta_fai        = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] }                     //    path: genome.fasta.fai
-    gtf              = ch_gtf                                                               //    path: genome.gtf
-    star_index       = STAR_GENOMEGENERATE.out.index                                        //    path: star/index/
+    chrom_sizes      = ch_chrom_sizes            // channel: path(genome.sizes)
     dbsnp_tbi        = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect()              // path: dbsnb.vcf.gz.tbi
+    dict             = GATK4_CREATESEQUENCEDICTIONARY.out.dict                              //    path: genome.fasta.dict
+    fai              = ch_fai                    // channel: path(genome.fai)
+    fasta            = ch_fasta                  // channel: path(genome.fasta)
+    gene_bed         = ch_gene_bed               // channel: path(gene.bed)
+    gtf              = ch_gtf                    // channel: path(genome.gtf)
     known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect()       // path: {known_indels*}.vcf.gz.tbi
-    versions         = ch_versions                                                          // channel: [ versions.yml ]
-    // bedtools_sort    = ch_bedtools_sort    // path: sort.bed
-    // bedtools_merge   = ch_bedtools_merge   // path: merge.bed
+    star_index       = ch_star_index             // channel: path(star/index/)
+    transcript_fasta = ch_transcript_fasta       // channel: path(transcript.fasta)
+    versions         = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
 }
diff --git a/workflows/rnavar/main.nf b/workflows/rnavar/main.nf
@@ -214,7 +214,7 @@ workflow RNAVAR {
 
             // Gather QC reports
             ch_reports  = ch_reports.mix(ch_bqsr_table.map{ meta, table -> table})
-            ch_versions     = ch_versions.mix(GATK4_BASERECALIBRATOR.out.versions)
+            ch_versions = ch_versions.mix(GATK4_BASERECALIBRATOR.out.versions)
 
             ch_bam_applybqsr       = ch_splitncigar_bam_bai.join(ch_bqsr_table)
             ch_bam_recalibrated_qc = Channel.empty()
@@ -252,8 +252,8 @@ workflow RNAVAR {
             ch_dbsnp_for_haplotypecaller = [[id:'null'], []]
             ch_dbsnp_for_haplotypecaller_tbi = [[id:'null'], []]
         } else {
-            ch_dbsnp_for_haplotypecaller     = ch_dbsnp.map{ vcf -> [[id:'dbsnp'], vcf] }
-            ch_dbsnp_for_haplotypecaller_tbi = ch_dbsnp_tbi.map{ tbi -> [[id:'dbsnp'], tbi] }
+            ch_dbsnp_for_haplotypecaller     = ch_dbsnp.map{ it -> [[id:it.baseName], it] },
+            ch_dbsnp_for_haplotypecaller_tbi = ch_dbsnp_tbi.map{ it -> [[id:it.baseName], it] }
         }
 
         ch_haplotypecaller_vcf = Channel.empty()