Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reference files usage #141

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 21 additions & 32 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,13 @@
GENOME PARAMETER VALUES
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

params.fasta = getGenomeAttribute('fasta')
params.fasta_fai = getGenomeAttribute('fasta_fai')
params.dict = getGenomeAttribute('dict')
params.gene_bed = getGenomeAttribute('bed12')
params.gtf = getGenomeAttribute('gtf')
params.gff = getGenomeAttribute('gff')
params.exon_bed = getGenomeAttribute('exon_bed')
params.star_index = getGenomeAttribute('star')
params.dbsnp = getGenomeAttribute('dbsnp')
params.dbsnp_tbi = getGenomeAttribute('dbsnp_tbi')
params.known_indels = getGenomeAttribute('known_indels')
params.known_indels_tbi = getGenomeAttribute('known_indels_tbi')
params.snpeff_db = getGenomeAttribute('snpeff_db')
params.snpeff_genome = getGenomeAttribute('snpeff_genome')
params.star_index = getGenomeAttribute('star')
params.vep_cache_version = getGenomeAttribute('vep_cache_version')
params.vep_genome = getGenomeAttribute('vep_genome')
params.vep_species = getGenomeAttribute('vep_species')
Expand Down Expand Up @@ -61,29 +55,18 @@ workflow NFCORE_RNAVAR {

ch_versions = Channel.empty()

// Initialize fasta file with meta map:
ch_fasta_raw = params.fasta ? Channel.fromPath(params.fasta).map{ it -> [ [id:it.baseName], it ] }.collect() : Channel.empty()

// Initialize file channels based on params, defined in the params.genomes[params.genome] scope
ch_dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([])
ch_known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([])
ch_gff = params.gff ? Channel.fromPath(params.gff).collect() : Channel.empty()
ch_gtf_raw = params.gtf ? Channel.fromPath(params.gtf).map{ gtf -> [ [ id:gtf.baseName ], gtf ] }.collect() : Channel.empty()
ch_dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([])
ch_known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([])

// Initialize variant annotation associated channels
// Initialize value channels based on params, defined in the params.genomes[params.genome] scope
snpeff_db = params.snpeff_db ?: Channel.empty()
vep_cache_version = params.vep_cache_version ?: Channel.empty()
vep_genome = params.vep_genome ?: Channel.empty()
vep_species = params.vep_species ?: Channel.empty()

seq_platform = params.seq_platform ?: []
seq_center = params.seq_center ?: []

// Initialize value channels based on params, defined in the params.genomes[params.genome] scope
snpeff_db = params.snpeff_db ?: Channel.empty()
vep_cache_version = params.vep_cache_version ?: Channel.empty()
vep_genome = params.vep_genome ?: Channel.empty()
vep_species = params.vep_species ?: Channel.empty()
seq_platform = params.seq_platform ?: []
seq_center = params.seq_center ?: []

vep_extra_files = []

Expand All @@ -99,16 +82,22 @@ workflow NFCORE_RNAVAR {
vep_extra_files.add(file(params.spliceai_snv_tbi, checkIfExists: true))
}

PREPARE_GENOME(
ch_fasta_raw,
ch_gff,
ch_gtf_raw,
ch_dbsnp,
ch_known_indels,
params.feature_type)
PREPARE_GENOME (
params.fasta,
params.gtf,
params.gff,
params.additional_fasta,
params.transcript_fasta,
params.gene_bed,
params.star_index,
params.gencode,
params.featurecounts_group_type,
params.skip_gtf_filter
)

ch_fasta = PREPARE_GENOME.out.fasta
ch_star_index = PREPARE_GENOME.out.star_index
ch_gff = PREPARE_GENOME.out.gff
ch_gtf = PREPARE_GENOME.out.gtf
ch_dict = params.dict ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect()
: PREPARE_GENOME.out.dict
Expand Down
9 changes: 9 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ params {
save_merged_fastq = false
feature_type = 'exon'

dbsnp = null
dbsnp_tbi = null
dict = null
exon_bed = null
fasta_fai = null
gff = null
known_indels = null
known_indels_tbi = null

// Sequence read information
read_length = 150 // Required for STAR to build index and align reads

Expand Down
4 changes: 4 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@
"description": "Path to GFF3 annotation file.",
"help_text": "This parameter must be specified if `--genome` or `--gtf` are not specified."
},
"gene_bed": {
"type": "string",
"description": "Path to BED file containing gene intervals. This will be created from the GTF file if not specified."
},
"exon_bed": {
"type": "string",
"description": "Path to BED file containing exon intervals. This will be created from the GTF file if not specified."
Expand Down
193 changes: 153 additions & 40 deletions subworkflows/local/prepare_genome/main.nf
Original file line number Diff line number Diff line change
@@ -1,80 +1,193 @@
//
// Prepare reference genome files
// Uncompress and prepare reference genome files
//

include { BEDTOOLS_MERGE } from '../../../modules/nf-core/bedtools/merge/main'
include { BEDTOOLS_SORT } from '../../../modules/nf-core/bedtools/sort/main'
include { CUSTOM_CATADDITIONALFASTA } from '../../../modules/nf-core/custom/catadditionalfasta'
include { CUSTOM_GETCHROMSIZES } from '../../../modules/nf-core/custom/getchromsizes'
include { GATK4_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/gatk4/createsequencedictionary/main'
include { GFFREAD } from '../../../modules/nf-core/gffread/main'
include { GTF2BED } from '../../../modules/local/gtf2bed'
include { GTF_FILTER } from '../../../modules/local/gtf_filter'
include { GUNZIP as GUNZIP_ADDITIONAL_FASTA } from '../../../modules/nf-core/gunzip'
include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip'
include { GUNZIP as GUNZIP_GENE_BED } from '../../../modules/nf-core/gunzip'
include { GUNZIP as GUNZIP_GFF } from '../../../modules/nf-core/gunzip'
include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip'
include { GUNZIP as GUNZIP_TRANSCRIPT_FASTA } from '../../../modules/nf-core/gunzip'
include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main'
include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate/main'
include { GUNZIP as GUNZIP_FASTA } from '../../../modules/nf-core/gunzip/main'
include { GUNZIP as GUNZIP_GTF } from '../../../modules/nf-core/gunzip/main'
include { TABIX_TABIX as TABIX_DBSNP } from '../../../modules/nf-core/tabix/tabix/main'
include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../../modules/nf-core/tabix/tabix/main'
include { UNTAR as UNTAR_STAR_INDEX } from '../../../modules/nf-core/untar'

workflow PREPARE_GENOME {
take:
ch_fasta_raw // file: /path/to/genome.fasta
ch_gff // file: /path/to/genome.gff
ch_gtf_raw // file: /path/to/genome.gtf
ch_dbsnp
ch_known_indels
feature_type
fasta // file: /path/to/genome.fasta
gtf // file: /path/to/genome.gtf
gff // file: /path/to/genome.gff
additional_fasta // file: /path/to/additional.fasta
transcript_fasta // file: /path/to/transcript.fasta
gene_bed // file: /path/to/gene.bed
dbsnp // file: /path/to/dbsnp.vcf
known_indels // file: /path/to/known_indels.vcf
star_index // directory: /path/to/star/index/
featurecounts_group_type // string: The attribute type used to group feature types in the GTF file when generating the biotype plot with featureCounts
gencode // boolean: whether the genome is from GENCODE
skip_gtf_filter // boolean: Skip filtering of GTF for valid scaffolds and/ or transcript IDs

main:
ch_versions = Channel.empty()

//Unzip reference genome files if needed

if (params.fasta.endsWith('.gz')) {
GUNZIP_FASTA(ch_fasta_raw)
//
// Uncompress genome fasta file if required
//
if (fasta.endsWith('.gz')) {
ch_fasta = GUNZIP_FASTA ( [ [:], fasta ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_FASTA.out.versions)
} else {
ch_fasta = Channel.value(file(fasta))
}

ch_fasta = GUNZIP_FASTA.out.gunzip
//
// Uncompress GTF annotation file or create from GFF3 if required
//
if (gtf || gff) {
if (gtf) {
if (gtf.endsWith('.gz')) {
ch_gtf = GUNZIP_GTF ( [ [:], gtf ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions)
} else {
ch_gtf = Channel.value(file(gtf))
}
} else if (gff) {
if (gff.endsWith('.gz')) {
ch_gff = GUNZIP_GFF ( [ [:], gff ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_GFF.out.versions)
} else {
ch_gff = Channel.value(file(gff))
}
ch_gtf = GFFREAD ( ch_gff ).gtf
ch_versions = ch_versions.mix(GFFREAD.out.versions)
}

} else {
ch_fasta = ch_fasta_raw
// Determine whether to filter the GTF or not
def filter_gtf =
((
// Condition 1: Transcript FASTA file is not provided
!transcript_fasta
)) &&
(
// Condition 2: --skip_gtf_filter is not provided
!skip_gtf_filter
)
if (filter_gtf) {
GTF_FILTER ( ch_fasta, ch_gtf )
ch_gtf = GTF_FILTER.out.genome_gtf
ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
}
}

if (params.gtf.endsWith('.gz')) {
GUNZIP_GTF(ch_gtf_raw)
//
// Uncompress additional fasta file and concatenate with reference fasta and gtf files
//
def biotype = gencode ? "gene_type" : featurecounts_group_type
if (additional_fasta) {
if (additional_fasta.endsWith('.gz')) {
ch_add_fasta = GUNZIP_ADDITIONAL_FASTA ( [ [:], additional_fasta ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_ADDITIONAL_FASTA.out.versions)
} else {
ch_add_fasta = Channel.value(file(additional_fasta))
}

ch_gtf = GUNZIP_GTF.out.gunzip
CUSTOM_CATADDITIONALFASTA (
ch_fasta.combine(ch_gtf).map { fasta, gtf -> [ [:], fasta, gtf ] },
ch_add_fasta.map { [ [:], it ] },
biotype
)
ch_fasta = CUSTOM_CATADDITIONALFASTA.out.fasta.map { it[1] }.first()
ch_gtf = CUSTOM_CATADDITIONALFASTA.out.gtf.map { it[1] }.first()
ch_versions = ch_versions.mix(CUSTOM_CATADDITIONALFASTA.out.versions)
}

//
// Uncompress gene BED annotation file or create from GTF if required
//
if (gene_bed) {
if (gene_bed.endsWith('.gz')) {
ch_gene_bed = GUNZIP_GENE_BED ( [ [:], gene_bed ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_GENE_BED.out.versions)
} else {
ch_gene_bed = Channel.value(file(gene_bed))
}
} else {
ch_gene_bed = GTF2BED ( ch_gtf ).bed
ch_versions = ch_versions.mix(GTF2BED.out.versions)
}

//
// Uncompress transcript fasta file / create if required
//
if (transcript_fasta) {
if (transcript_fasta.endsWith('.gz')) {
ch_transcript_fasta = GUNZIP_TRANSCRIPT_FASTA ( [ [:], transcript_fasta ] ).gunzip.map { it[1] }
ch_versions = ch_versions.mix(GUNZIP_TRANSCRIPT_FASTA.out.versions)
} else {
ch_transcript_fasta = Channel.value(file(transcript_fasta))
}
if (gencode) {
PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta )
ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta
ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
}
} else {
ch_gtf = ch_gtf_raw
ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta
ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
}

//
// Create chromosome sizes file
//
CUSTOM_GETCHROMSIZES ( ch_fasta.map { [ [:], it ] } )
ch_fai = CUSTOM_GETCHROMSIZES.out.fai.map { it[1] }
ch_chrom_sizes = CUSTOM_GETCHROMSIZES.out.sizes.map { it[1] }
ch_versions = ch_versions.mix(CUSTOM_GETCHROMSIZES.out.versions)

GATK4_CREATESEQUENCEDICTIONARY(ch_fasta)
GFFREAD(ch_gff, ch_fasta)
SAMTOOLS_FAIDX(ch_fasta, [['id':'genome'], []])
TABIX_DBSNP(ch_dbsnp.flatten().map{ it -> [ [ id:it.baseName ], it ] })
TABIX_KNOWN_INDELS(ch_known_indels.flatten().map{ it -> [ [ id:it.baseName ], it ] } )

ch_gtf = ch_gtf.mix(GFFREAD.out.gtf)

GTF2BED(ch_gtf, feature_type)
STAR_GENOMEGENERATE(ch_fasta, ch_gtf)

ch_versions = ch_versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
ch_versions = ch_versions.mix(GFFREAD.out.versions)
ch_versions = ch_versions.mix(GTF2BED.out.versions)
ch_versions = ch_versions.mix(SAMTOOLS_FAIDX.out.versions)
ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions)
ch_versions = ch_versions.mix(TABIX_KNOWN_INDELS.out.versions)

//
// Uncompress STAR index or generate from scratch if required
//
ch_star_index = Channel.empty()
if (star_index) {
if (star_index.endsWith('.tar.gz')) {
ch_star_index = UNTAR_STAR_INDEX ( [ [:], star_index ] ).untar.map { it[1] }
ch_versions = ch_versions.mix(UNTAR_STAR_INDEX.out.versions)
} else {
ch_star_index = Channel.value(file(star_index))
}
} else {
ch_star_index = STAR_GENOMEGENERATE ( ch_fasta.map { [ [:], it ] }, ch_gtf.map { [ [:], it ] } ).index.map { it[1] }
ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions)
}

emit:
dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict
exon_bed = GTF2BED.out.bed.map{ bed -> [ [ id:bed.baseName ], bed ] }.collect() // path: exon.bed
fasta = ch_fasta
fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai
gtf = ch_gtf // path: genome.gtf
star_index = STAR_GENOMEGENERATE.out.index // path: star/index/
chrom_sizes = ch_chrom_sizes // channel: path(genome.sizes)
dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnb.vcf.gz.tbi
dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict
fai = ch_fai // channel: path(genome.fai)
fasta = ch_fasta // channel: path(genome.fasta)
gene_bed = ch_gene_bed // channel: path(gene.bed)
gtf = ch_gtf // channel: path(genome.gtf)
known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi
versions = ch_versions // channel: [ versions.yml ]
// bedtools_sort = ch_bedtools_sort // path: sort.bed
// bedtools_merge = ch_bedtools_merge // path: merge.bed
star_index = ch_star_index // channel: path(star/index/)
transcript_fasta = ch_transcript_fasta // channel: path(transcript.fasta)
versions = ch_versions.ifEmpty(null) // channel: [ versions.yml ]
}
6 changes: 3 additions & 3 deletions workflows/rnavar/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ workflow RNAVAR {

// Gather QC reports
ch_reports = ch_reports.mix(ch_bqsr_table.map{ meta, table -> table})
ch_versions = ch_versions.mix(GATK4_BASERECALIBRATOR.out.versions)
ch_versions = ch_versions.mix(GATK4_BASERECALIBRATOR.out.versions)

ch_bam_applybqsr = ch_splitncigar_bam_bai.join(ch_bqsr_table)
ch_bam_recalibrated_qc = Channel.empty()
Expand Down Expand Up @@ -252,8 +252,8 @@ workflow RNAVAR {
ch_dbsnp_for_haplotypecaller = [[id:'null'], []]
ch_dbsnp_for_haplotypecaller_tbi = [[id:'null'], []]
} else {
ch_dbsnp_for_haplotypecaller = ch_dbsnp.map{ vcf -> [[id:'dbsnp'], vcf] }
ch_dbsnp_for_haplotypecaller_tbi = ch_dbsnp_tbi.map{ tbi -> [[id:'dbsnp'], tbi] }
ch_dbsnp_for_haplotypecaller = ch_dbsnp.map{ it -> [[id:it.baseName], it] },
ch_dbsnp_for_haplotypecaller_tbi = ch_dbsnp_tbi.map{ it -> [[id:it.baseName], it] }
}

ch_haplotypecaller_vcf = Channel.empty()
Expand Down
Loading