Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simplify TABIX generation + refactor #48

Merged
merged 28 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Initial release of nf-core/references, created with the [nf-core](https://nf-co.
- [41](https://github.com/nf-core/references/pull/41) - Better sarek tests
- [41](https://github.com/nf-core/references/pull/41) - Better publishing for sarek related files
- [43](https://github.com/nf-core/references/pull/43) - Fasta is no longer a required asset
- [48](https://github.com/nf-core/references/pull/48) - Simplify VCF tabix index generation and related assets
- [48](https://github.com/nf-core/references/pull/48) - Code refactoring (new subworkflows for each type of operation)

### Fixed

Expand Down
22 changes: 14 additions & 8 deletions assets/genomes/test/default_extended.yml
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
- genome: "GRCh38_chr21"
dbsnp_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa"
fasta_dict: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.dict"
fasta_fai: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.fai"
fasta_sizes: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.sizes"
germline_resource_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
gff: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.gff"
gtf: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.gtf"
known_indels_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
known_snps_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
mito_name: "MT"
readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md"
source: "nf-core/references"
source_dbsnp: "GATK_BUNDLE"
source_germline_resource: "GATK_BUNDLE"
source_known_indels: "GATK_BUNDLE"
source_known_snps: "GATK_BUNDLE"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
splice_sites: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.splice_sites.txt"
transcript_fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genome.transcripts.fa"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
# macs_gsize: "1.2e7"
- genome: "GRCh38_chr21"
readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md"
source: "nf-core/references"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
- genome: "GRCh38_chr21"
readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md"
source: "nf-core/references"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
29 changes: 17 additions & 12 deletions assets/genomes/test/default_full.yml
Original file line number Diff line number Diff line change
@@ -1,27 +1,32 @@
- genome: "GRCh38_chr21"
dbsnp_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
dbsnp_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi"
fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa"
fasta_dict: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.dict"
fasta_fai: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.fai"
fasta_sizes: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.fa.sizes"
germline_resource_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
germline_resource_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz.tbi"
gff: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.gff"
gtf: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.gtf"
intervals_bed: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/GRCh38_chr21.bed"
known_indels_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
known_indels_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz.tbi"
known_snps_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
known_snps_vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi"
mito_name: "MT"
readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md"
source: "nf-core/references"
source_dbsnp: "GATK_BUNDLE"
source_germline_resource: "GATK_BUNDLE"
source_known_indels: "GATK_BUNDLE"
source_known_snps: "GATK_BUNDLE"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
splice_sites: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genes_chr21.splice_sites.txt"
transcript_fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/genome.transcripts.fa"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz.tbi"
# macs_gsize: "1.2e7"
- genome: "GRCh38_chr21"
readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md"
source: "nf-core/references"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz.tbi"
- genome: "GRCh38_chr21"
readme: "https://raw.githubusercontent.com/nf-core/test-datasets/references/references/GRCh38_chr21/README.md"
source: "nf-core/references"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
vcf_tbi: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz.tbi"
20 changes: 12 additions & 8 deletions assets/genomes/test/pipelines/sarek.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
- genome: "testdata.GRCh38_chr22"
dbsnp_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
fasta: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta"
germline_resource_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
known_indels_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
known_snps_vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz"
source: "nf-core/references"
source_dbsnp: "GATK_BUNDLE"
source_germline_resource: "GATK_BUNDLE"
source_known_indels: "GATK_BUNDLE"
source_known_snps: "GATK_BUNDLE"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
- genome: "testdata.GRCh38_chr22"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
source: "nf-core/references"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
- genome: "testdata.GRCh38_chr22"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
source: "nf-core/references"
vcf: "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
8 changes: 4 additions & 4 deletions assets/genomes/test/pipelines/sarek_s3_muliple_glob.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# from sarek igenomes.config
- genome: GRCh37
known_indels_vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz"
source_known_indels: "GATK_BUNDLE"
vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
source: "GATK"
- genome: GRCh38
known_indels_vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz"
source_known_indels: "GATK_BUNDLE"
vcf: "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz"
source_vcf: "GATK_BUNDLE"
species: "Homo_sapiens"
source: "GATK"
36 changes: 3 additions & 33 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,10 @@
"errorMessage": "Where the references came from",
"meta": ["source"]
},
"source_dbsnp": {
"source_vcf": {
"type": "string",
"errorMessage": "Where the references came from",
"meta": ["source_dbsnp"]
},
"source_germline_resource": {
"type": "string",
"errorMessage": "Where the references came from",
"meta": ["source_germline_resource"]
},
"source_known_indels": {
"type": "string",
"errorMessage": "Where the references came from",
"meta": ["source_known_indels"]
},
"source_known_snps": {
"type": "string",
"errorMessage": "Where the references came from",
"meta": ["source_known_snps"]
"meta": ["source_vcf"]
},
"species": {
"type": "string",
Expand Down Expand Up @@ -88,22 +73,7 @@
"pattern": "^\\S+\\.f(ast|n)?a(\\.gz)?$",
"errorMessage": "TODO"
},
"dbsnp_vcf": {
"type": "string",
"pattern": "^\\S+\\.vcf\\.gz$",
"errorMessage": "TODO"
},
"known_snps_vcf": {
"type": "string",
"pattern": "^\\S+\\.vcf\\.gz$",
"errorMessage": "TODO"
},
"known_indels_vcf": {
"type": "string",
"pattern": "^\\S+\\.vcf\\.gz$",
"errorMessage": "TODO"
},
"germline_resource_vcf": {
"vcf": {
"type": "string",
"pattern": "^\\S+\\.vcf\\.gz$",
"errorMessage": "TODO"
Expand Down
58 changes: 23 additions & 35 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -165,17 +165,8 @@ output {
'star' {
path 'star'
}
'tabix_dbsnp' {
path { meta, vcf -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_dbsnp}/${file}" } }
}
'tabix_germline_resource' {
path { meta, vcf -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_germline_resource}/${file}" } }
}
'tabix_known_indels' {
path { meta, vcf -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_known_indels}/${file}" } }
}
'tabix_known_snps' {
path { meta, vcf -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_known_snps}/${file}" } }
'vcf_tbi' {
path { meta, tbi -> { file -> "${meta.species}/${meta.source}/${meta.id}/Annotation/${meta.source_vcf}/${file}" } }
}
}
/*
Expand All @@ -197,28 +188,25 @@ workflow NFCORE_REFERENCES {
REFERENCES(input, tools)

emit:
bowtie1 = REFERENCES.out.bowtie1
bowtie2 = REFERENCES.out.bowtie2
bwamem1 = REFERENCES.out.bwamem1
bwamem2 = REFERENCES.out.bwamem2
dbsnp_vcf_tbi = REFERENCES.out.dbsnp_vcf_tbi
dragmap = REFERENCES.out.dragmap
fasta = REFERENCES.out.fasta
fasta_dict = REFERENCES.out.fasta_dict
fasta_fai = REFERENCES.out.fasta_fai
germline_resource_vcf_tbi = REFERENCES.out.germline_resource_vcf_tbi
gffread = REFERENCES.out.gff_gtf
hisat2 = REFERENCES.out.hisat2
hisat2_splice_sites = REFERENCES.out.hisat2_splice_sites
intervals = REFERENCES.out.intervals_bed
kallisto = REFERENCES.out.kallisto
known_indels_vcf_tbi = REFERENCES.out.known_indels_vcf_tbi
known_snps_vcf_tbi = REFERENCES.out.known_snps_vcf_tbi
msisensorpro = REFERENCES.out.msisensorpro
rsem = REFERENCES.out.rsem
rsem_transcript_fasta = REFERENCES.out.rsem_transcript_fasta
salmon = REFERENCES.out.salmon
sizes = REFERENCES.out.sizes
star = REFERENCES.out.star
versions = REFERENCES.out.versions
bowtie1 = REFERENCES.out.bowtie1
bowtie2 = REFERENCES.out.bowtie2
bwamem1 = REFERENCES.out.bwamem1
bwamem2 = REFERENCES.out.bwamem2
dragmap = REFERENCES.out.dragmap
fasta = REFERENCES.out.fasta
fasta_dict = REFERENCES.out.fasta_dict
fasta_fai = REFERENCES.out.fasta_fai
gffread = REFERENCES.out.gff_gtf
hisat2 = REFERENCES.out.hisat2
hisat2_splice_sites = REFERENCES.out.hisat2_splice_sites
intervals = REFERENCES.out.intervals_bed
kallisto = REFERENCES.out.kallisto
msisensorpro = REFERENCES.out.msisensorpro
rsem = REFERENCES.out.rsem
rsem_transcript_fasta = REFERENCES.out.rsem_transcript_fasta
salmon = REFERENCES.out.salmon
sizes = REFERENCES.out.sizes
star = REFERENCES.out.star
vcf_tbi = REFERENCES.out.vcf_tbi
versions = REFERENCES.out.versions
}
67 changes: 67 additions & 0 deletions subworkflows/local/create_align_index/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
include { BOWTIE_BUILD as BOWTIE1_BUILD } from '../../../modules/nf-core/bowtie/build'
include { BOWTIE2_BUILD } from '../../../modules/nf-core/bowtie2/build'
include { BWAMEM2_INDEX } from '../../../modules/nf-core/bwamem2/index'
include { BWA_INDEX as BWAMEM1_INDEX } from '../../../modules/nf-core/bwa/index'
include { DRAGMAP_HASHTABLE } from '../../../modules/nf-core/dragmap/hashtable'

// Subworkflow that builds aligner indices from `fasta`.
// Each of the five tools (bowtie1, bowtie2, bwa-mem1, bwa-mem2, dragmap) is run
// only when its matching `run_*` input is truthy; outputs of tools that are not
// run stay as empty channels.
workflow CREATE_ALIGN_INDEX {
maxulysse marked this conversation as resolved.
Show resolved Hide resolved
take:
// `fasta` is handed unchanged to every index-building module below.
// NOTE(review): presumably a channel of [meta, fasta] tuples as nf-core modules expect — confirm against the caller.
fasta
// One switch per tool; each is only tested for truthiness in the `if` blocks below.
run_bowtie1
run_bowtie2
run_bwamem1
run_bwamem2
run_dragmap

main:
// Default every output to an empty channel so the emit section is always
// defined, whichever tools are enabled.
bowtie1_index = Channel.empty()
bowtie2_index = Channel.empty()
bwamem1_index = Channel.empty()
bwamem2_index = Channel.empty()
dragmap_hashmap = Channel.empty()

// Accumulates the `versions` output of every module that actually runs.
versions = Channel.empty()

// bowtie1 (BOWTIE_BUILD included under the alias BOWTIE1_BUILD)
if (run_bowtie1) {
BOWTIE1_BUILD(fasta)

bowtie1_index = BOWTIE1_BUILD.out.index
versions = versions.mix(BOWTIE1_BUILD.out.versions)
}

// bowtie2
if (run_bowtie2) {
BOWTIE2_BUILD(fasta)

bowtie2_index = BOWTIE2_BUILD.out.index
versions = versions.mix(BOWTIE2_BUILD.out.versions)
}

// bwa-mem1 (BWA_INDEX included under the alias BWAMEM1_INDEX)
if (run_bwamem1) {
BWAMEM1_INDEX(fasta)

bwamem1_index = BWAMEM1_INDEX.out.index
versions = versions.mix(BWAMEM1_INDEX.out.versions)
}

// bwa-mem2
if (run_bwamem2) {
BWAMEM2_INDEX(fasta)

bwamem2_index = BWAMEM2_INDEX.out.index
versions = versions.mix(BWAMEM2_INDEX.out.versions)
}

// dragmap: the module's output is named `hashmap`, not `index`
if (run_dragmap) {
DRAGMAP_HASHTABLE(fasta)

dragmap_hashmap = DRAGMAP_HASHTABLE.out.hashmap
versions = versions.mix(DRAGMAP_HASHTABLE.out.versions)
}

// All six channels are always emitted; those for disabled tools are empty.
emit:
bowtie1_index
bowtie2_index
bwamem1_index
bwamem2_index
dragmap_hashmap
versions
}
Loading
Loading