From ceffcddddcf5257d6038d87038dfa1357fde4da5 Mon Sep 17 00:00:00 2001 From: aidaanva Date: Fri, 18 Oct 2024 11:56:30 +0200 Subject: [PATCH 1/3] work on adding mva --- conf/modules.config | 11 ++ modules.json | 5 + .../nf-core/multivcfanalyzer/environment.yml | 7 + modules/nf-core/multivcfanalyzer/main.nf | 100 +++++++++++++++ modules/nf-core/multivcfanalyzer/meta.yml | 120 ++++++++++++++++++ nextflow.config | 14 ++ .../local/reference_indexing_single.nf | 5 +- workflows/eager.nf | 17 +++ 8 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/multivcfanalyzer/environment.yml create mode 100644 modules/nf-core/multivcfanalyzer/main.nf create mode 100644 modules/nf-core/multivcfanalyzer/meta.yml diff --git a/conf/modules.config b/conf/modules.config index b03153561..305b7963f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1177,4 +1177,15 @@ process { pattern: '*.{glf,beagle}.gz' ] } + + withName: MULTIVCFANALYZER { + tag = { "${meta.reference}" } + ext.prefix = { "multivcfanalyzer_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/consensus_sequence/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.{fasta.gz,tsv,txt}' + ] + } } diff --git a/modules.json b/modules.json index 7b4d35fce..799268af8 100644 --- a/modules.json +++ b/modules.json @@ -175,6 +175,11 @@ "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", "installed_by": ["modules"] }, + "multivcfanalyzer": { + "branch": "master", + "git_sha": "b34fb117e397e72a070cde8adf21b758670c90f5", + "installed_by": ["modules"] + }, "picard/createsequencedictionary": { "branch": "master", "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", diff --git a/modules/nf-core/multivcfanalyzer/environment.yml b/modules/nf-core/multivcfanalyzer/environment.yml new file mode 100644 index 000000000..37f96f5e3 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/environment.yml @@ -0,0 +1,7 @@ +name: multivcfanalyzer +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multivcfanalyzer=0.85.2 diff --git a/modules/nf-core/multivcfanalyzer/main.nf b/modules/nf-core/multivcfanalyzer/main.nf new file mode 100644 index 000000000..f3a91a8a8 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/main.nf @@ -0,0 +1,100 @@ +process MULTIVCFANALYZER { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1': + 'biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }" + + input: + tuple val(meta), path(vcfs) + tuple val(meta2), path(fasta) + tuple val(meta3), path(snpeff_results) + tuple val(meta4), path(gff) + val allele_freqs + val genotype_quality + val coverage + val homozygous_freq + val heterozygous_freq + tuple val(meta5), path(gff_exclude) + + + output: + tuple val(meta), path('fullAlignment.fasta.gz') , emit: full_alignment + tuple val(meta), path('info.txt') , emit: info_txt + tuple val(meta), path('snpAlignment.fasta.gz') , emit: snp_alignment + tuple val(meta), path('snpAlignmentIncludingRefGenome.fasta.gz') , emit: snp_genome_alignment + tuple val(meta), path('snpStatistics.tsv') , emit: snpstatistics + tuple val(meta), path('snpTable.tsv') , emit: snptable + tuple val(meta), path('snpTableForSnpEff.tsv') , emit: snptable_snpeff + tuple val(meta), path('snpTableWithUncertaintyCalls.tsv') , emit: snptable_uncertainty + tuple val(meta), path('structureGenotypes.tsv') , emit: structure_genotypes + tuple val(meta), path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing + tuple val(meta), path('MultiVCFAnalyzer.json') , emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input + def args2 = task.ext.args2 ?: '' + + def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA" + def cmd_gff = gff ? "${gff}" : "NA" + def cmd_allele_freqs = allele_freqs ? "T" : "F" + def cmd_gff_exclude = gff_exclude ? "${gff}" : "NA" + + """ + multivcfanalyzer \\ + ${cmd_snpeff_results} \\ + ${fasta} \\ + ${cmd_gff} \\ + . \ + ${cmd_allele_freqs} \\ + ${genotype_quality} \\ + ${coverage} \\ + ${homozygous_freq} \\ + ${heterozygous_freq} \\ + ${cmd_gff_exclude} \\ + ${vcfs.sort().join(" ")} + + gzip \\ + $args2 \\ + fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' ) + END_VERSIONS + """ + stub: + + def args2 = task.ext.args2 ?: '' + + def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA" + def cmd_gff = gff ? "${gff}" : "NA" + def cmd_allele_freqs = allele_freqs ? "T" : "F" + def cmd_gff_exclude = gff_exclude ? "${gff}" : "NA" + + """ + echo "" | gzip > fullAlignment.fasta.gz + touch info.txt + echo "" | gzip > snpAlignment.fasta.gz + echo "" | gzip > snpAlignmentIncludingRefGenome.fasta.gz + touch snpStatistics.tsv + touch snpTable.tsv + touch snpTableForSnpEff.tsv + touch snpTableWithUncertaintyCalls.tsv + touch structureGenotypes.tsv + touch structureGenotypes_noMissingData-Columns.tsv + touch MultiVCFAnalyzer.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' ) + END_VERSIONS + + """ +} diff --git a/modules/nf-core/multivcfanalyzer/meta.yml b/modules/nf-core/multivcfanalyzer/meta.yml new file mode 100644 index 000000000..016637452 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/meta.yml @@ -0,0 +1,120 @@ +name: "multivcfanalyzer" +description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA +keywords: + - vcf + - ancient DNA + - aDNA + - SNP + - GATK UnifiedGenotyper + - SNP table +tools: + - "multivcfanalyzer": + description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository." + homepage: "https://github.com/alexherbig/MultiVCFAnalyzer" + documentation: "https://github.com/alexherbig/MultiVCFAnalyzer" + tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer" + doi: "10.1038/nature13591" + licence: ["GPL >=3"] +input: + - vcfs: + type: file + description: One or a list of uncompressed VCF file + pattern: "*.vcf" + - fasta: + type: file + description: Reference genome VCF was generated against + pattern: "*.{fasta,fna,fa}" + - snpeff_results: + type: file + description: Results from snpEff in txt format (Optional) + pattern: "*.txt" + - gff: + type: file + description: GFF file corresponding to reference genome fasta (Optional) + pattern: "*.gff" + - allele_freqs: + type: boolean + description: | + Whether to include the percentage of reads a given allele is + present in in the SNP table. + - genotype_quality: + type: integer + description: | + Minimum GATK genotyping threshold threshold of which a SNP call + falling under is 'discarded' + - coverage: + type: integer + description: | + Minimum number of a reads that a position must be covered by to be + reported + - homozygous_freq: + type: integer + description: Fraction of reads a base must have to be called 'homozygous' + - heterozygous_freq: + type: integer + description: | + Fraction of which whereby if a call falls above this value, and lower + than the homozygous threshold, a base will be called 'heterozygous'. + - gff_exclude: + type: file + description: | + file listing positions that will be 'filtered' (i.e. ignored) + (Optional) + pattern: "*.vcf" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - full_alignment: + type: file + description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls + pattern: ".fasta.gz" + - info_txt: + type: file + description: Information about the run + pattern: ".txt" + - snp_alignment: + type: file + description: A fasta file of just SNP positions with samples only + pattern: ".fasta.gz" + - snp_genome_alignment: + type: file + description: A fasta file of just SNP positions with reference genome + pattern: ".fasta.gz" + - snpstatistics: + type: file + description: Some basic statistics about the SNP calls of each sample + pattern: ".tsv" + - snptable: + type: file + description: Basic SNP table of combined positions taken from each VCF file + pattern: ".tsv" + - snptable_snpeff: + type: file + description: Input file for SnpEff + pattern: ".tsv" + - snptable_uncertainty: + type: file + description: Same as above, but with lower case characters indicating uncertain calls + pattern: ".tsv" + - structure_genotypes: + type: file + description: Input file for STRUCTURE + pattern: ".tsv" + - structure_genotypes_nomissing: + type: file + description: Alternate input file for STRUCTURE + pattern: ".tsv" + - json: + type: file + description: Summary statistics in MultiQC JSON format + pattern: ".json" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index dfee8fcbb..d86556490 100644 --- a/nextflow.config +++ b/nextflow.config @@ -235,8 +235,22 @@ params { genotyping_freebayes_skip_coverage = 0 genotyping_angsd_glmodel = 'samtools' genotyping_angsd_glformat = 'binary' + + //Consensus sequence + run_consensus = false + consensus_tool = null + consensus_multivcfanalyzer_write_allele_frequencies = false + consensus_multivcfanalyzer_min_genotype_quality = 30 + consensus_multivcfanalyzer_min_base_coverage = 5 + consensus_multivcfanalyzer_allele_freq_hom = 0.9 + consensus_multivcfanalyzer_allele_freq_het = 0.9 + consensus_multivcfanalyzer_additional_vcf_files = null + consensus_multivcfanalyzer_reference_gff_annotations = null + consensus_multivcfanalyzer_reference_gff_exclude = null + consensus_multivcfanalyzer_snpeff_results = null } + // Load base.config by default for all pipelines includeConfig 'conf/base.config' diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 4206b340c..6e97d4b25 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -91,7 +91,8 @@ workflow REFERENCE_INDEXING_SINGLE { def bedtools_feature = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" def genotyping_reference_ploidy = params.genotyping_reference_ploidy def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" - [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] + def consensus_sequence_mva_additional_vcf = params.consensus_multivcfanalyzer_additional_vcf_files != null ? file(params.consensus_multivcfanalyzer_additional_vcf_files, checkIfExists: true ) : "" + [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_sequence_mva_additional_vcf ] } ch_ref_index_single = ch_reference_for_mapping @@ -107,6 +108,7 @@ workflow REFERENCE_INDEXING_SINGLE { sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] dbsnp: [ meta, genotyping_gatk_dbsnp ] + mva_additional_vcfs: [ meta, consensus_sequence_mva_additional_vcf ] } emit: @@ -120,6 +122,7 @@ workflow REFERENCE_INDEXING_SINGLE { sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] dbsnp = ch_ref_index_single.dbsnp // [ meta, genotyping_gatk_dbsnp ] + mva_additional_vcfs = ch_ref_index_single.mva_additional_vcfs // [ meta, consensus_sequence_mva_additional_vcf ] versions = ch_versions } diff --git a/workflows/eager.nf b/workflows/eager.nf index 46739b7a7..0d3e86136 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -584,6 +584,23 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) } + // + // SUBWORKFLOW: Consensus sequence + // + if ( params.run_consensus_sequence ) { + ch_reference_for_consensus_sequence = REFERENCE_INDEXING.out.reference + // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel. + .map { + meta, fasta, fai, dict, mapindex, circular_target -> + [ meta, fasta, fai ] + } + CONSENSUS_SEQUENCE( + ch_vcf_for_consensus_sequence = GENOTYPE.out.vcf + ch_reference_for_consensus_sequence + ) + + } + // // Collate and save software versions // From 3fa9e13876cd00d9bf320fc3304d12ff6f249a5b Mon Sep 17 00:00:00 2001 From: aidaanva Date: Fri, 18 Oct 2024 11:56:30 +0200 Subject: [PATCH 2/3] work on adding mva --- conf/modules.config | 11 ++ modules.json | 5 + .../nf-core/multivcfanalyzer/environment.yml | 7 + modules/nf-core/multivcfanalyzer/main.nf | 100 +++++++++++++++ modules/nf-core/multivcfanalyzer/meta.yml | 120 ++++++++++++++++++ nextflow.config | 14 ++ .../local/reference_indexing_single.nf | 5 +- workflows/eager.nf | 17 +++ 8 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/multivcfanalyzer/environment.yml create mode 100644 modules/nf-core/multivcfanalyzer/main.nf create mode 100644 modules/nf-core/multivcfanalyzer/meta.yml diff --git a/conf/modules.config b/conf/modules.config index 09fbff1fc..c65c99d0b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1399,4 +1399,15 @@ process { pattern: '*.{glf,beagle}.gz' ] } + + withName: MULTIVCFANALYZER { + tag = { "${meta.reference}" } + ext.prefix = { "multivcfanalyzer_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/consensus_sequence/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.{fasta.gz,tsv,txt}' + ] + } } diff --git a/modules.json b/modules.json index a2fc85bbb..914df0e04 100644 --- a/modules.json +++ b/modules.json @@ -225,6 +225,11 @@ "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", "installed_by": ["modules"] }, + "multivcfanalyzer": { + "branch": "master", + "git_sha": "b34fb117e397e72a070cde8adf21b758670c90f5", + "installed_by": ["modules"] + }, "picard/createsequencedictionary": { "branch": "master", "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", diff --git a/modules/nf-core/multivcfanalyzer/environment.yml b/modules/nf-core/multivcfanalyzer/environment.yml new file mode 100644 index 000000000..37f96f5e3 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/environment.yml @@ -0,0 +1,7 @@ +name: multivcfanalyzer +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multivcfanalyzer=0.85.2 diff --git a/modules/nf-core/multivcfanalyzer/main.nf b/modules/nf-core/multivcfanalyzer/main.nf new file mode 100644 index 000000000..f3a91a8a8 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/main.nf @@ -0,0 +1,100 @@ +process MULTIVCFANALYZER { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1': + 'biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }" + + input: + tuple val(meta), path(vcfs) + tuple val(meta2), path(fasta) + tuple val(meta3), path(snpeff_results) + tuple val(meta4), path(gff) + val allele_freqs + val genotype_quality + val coverage + val homozygous_freq + val heterozygous_freq + tuple val(meta5), path(gff_exclude) + + + output: + tuple val(meta), path('fullAlignment.fasta.gz') , emit: full_alignment + tuple val(meta), path('info.txt') , emit: info_txt + tuple val(meta), path('snpAlignment.fasta.gz') , emit: snp_alignment + tuple val(meta), path('snpAlignmentIncludingRefGenome.fasta.gz') , emit: snp_genome_alignment + tuple val(meta), path('snpStatistics.tsv') , emit: snpstatistics + tuple val(meta), path('snpTable.tsv') , emit: snptable + tuple val(meta), path('snpTableForSnpEff.tsv') , emit: snptable_snpeff + tuple val(meta), path('snpTableWithUncertaintyCalls.tsv') , emit: snptable_uncertainty + tuple val(meta), path('structureGenotypes.tsv') , emit: structure_genotypes + tuple val(meta), path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing + tuple val(meta), path('MultiVCFAnalyzer.json') , emit: json + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input + def args2 = task.ext.args2 ?: '' + + def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA" + def cmd_gff = gff ? "${gff}" : "NA" + def cmd_allele_freqs = allele_freqs ? "T" : "F" + def cmd_gff_exclude = gff_exclude ? "${gff}" : "NA" + + """ + multivcfanalyzer \\ + ${cmd_snpeff_results} \\ + ${fasta} \\ + ${cmd_gff} \\ + . \ + ${cmd_allele_freqs} \\ + ${genotype_quality} \\ + ${coverage} \\ + ${homozygous_freq} \\ + ${heterozygous_freq} \\ + ${cmd_gff_exclude} \\ + ${vcfs.sort().join(" ")} + + gzip \\ + $args2 \\ + fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' ) + END_VERSIONS + """ + stub: + + def args2 = task.ext.args2 ?: '' + + def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA" + def cmd_gff = gff ? "${gff}" : "NA" + def cmd_allele_freqs = allele_freqs ? "T" : "F" + def cmd_gff_exclude = gff_exclude ? "${gff}" : "NA" + + """ + echo "" | gzip > fullAlignment.fasta.gz + touch info.txt + echo "" | gzip > snpAlignment.fasta.gz + echo "" | gzip > snpAlignmentIncludingRefGenome.fasta.gz + touch snpStatistics.tsv + touch snpTable.tsv + touch snpTableForSnpEff.tsv + touch snpTableWithUncertaintyCalls.tsv + touch structureGenotypes.tsv + touch structureGenotypes_noMissingData-Columns.tsv + touch MultiVCFAnalyzer.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' ) + END_VERSIONS + + """ +} diff --git a/modules/nf-core/multivcfanalyzer/meta.yml b/modules/nf-core/multivcfanalyzer/meta.yml new file mode 100644 index 000000000..016637452 --- /dev/null +++ b/modules/nf-core/multivcfanalyzer/meta.yml @@ -0,0 +1,120 @@ +name: "multivcfanalyzer" +description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA +keywords: + - vcf + - ancient DNA + - aDNA + - SNP + - GATK UnifiedGenotyper + - SNP table +tools: + - "multivcfanalyzer": + description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository." + homepage: "https://github.com/alexherbig/MultiVCFAnalyzer" + documentation: "https://github.com/alexherbig/MultiVCFAnalyzer" + tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer" + doi: "10.1038/nature13591" + licence: ["GPL >=3"] +input: + - vcfs: + type: file + description: One or a list of uncompressed VCF file + pattern: "*.vcf" + - fasta: + type: file + description: Reference genome VCF was generated against + pattern: "*.{fasta,fna,fa}" + - snpeff_results: + type: file + description: Results from snpEff in txt format (Optional) + pattern: "*.txt" + - gff: + type: file + description: GFF file corresponding to reference genome fasta (Optional) + pattern: "*.gff" + - allele_freqs: + type: boolean + description: | + Whether to include the percentage of reads a given allele is + present in in the SNP table. + - genotype_quality: + type: integer + description: | + Minimum GATK genotyping threshold threshold of which a SNP call + falling under is 'discarded' + - coverage: + type: integer + description: | + Minimum number of a reads that a position must be covered by to be + reported + - homozygous_freq: + type: integer + description: Fraction of reads a base must have to be called 'homozygous' + - heterozygous_freq: + type: integer + description: | + Fraction of which whereby if a call falls above this value, and lower + than the homozygous threshold, a base will be called 'heterozygous'. + - gff_exclude: + type: file + description: | + file listing positions that will be 'filtered' (i.e. ignored) + (Optional) + pattern: "*.vcf" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - full_alignment: + type: file + description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls + pattern: ".fasta.gz" + - info_txt: + type: file + description: Information about the run + pattern: ".txt" + - snp_alignment: + type: file + description: A fasta file of just SNP positions with samples only + pattern: ".fasta.gz" + - snp_genome_alignment: + type: file + description: A fasta file of just SNP positions with reference genome + pattern: ".fasta.gz" + - snpstatistics: + type: file + description: Some basic statistics about the SNP calls of each sample + pattern: ".tsv" + - snptable: + type: file + description: Basic SNP table of combined positions taken from each VCF file + pattern: ".tsv" + - snptable_snpeff: + type: file + description: Input file for SnpEff + pattern: ".tsv" + - snptable_uncertainty: + type: file + description: Same as above, but with lower case characters indicating uncertain calls + pattern: ".tsv" + - structure_genotypes: + type: file + description: Input file for STRUCTURE + pattern: ".tsv" + - structure_genotypes_nomissing: + type: file + description: Alternate input file for STRUCTURE + pattern: ".tsv" + - json: + type: file + description: Summary statistics in MultiQC JSON format + pattern: ".json" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index c8a86d059..b9da96e3a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -269,8 +269,22 @@ params { genotyping_freebayes_skip_coverage = 0 genotyping_angsd_glmodel = 'samtools' genotyping_angsd_glformat = 'binary' + + //Consensus sequence + run_consensus = false + consensus_tool = null + consensus_multivcfanalyzer_write_allele_frequencies = false + consensus_multivcfanalyzer_min_genotype_quality = 30 + consensus_multivcfanalyzer_min_base_coverage = 5 + consensus_multivcfanalyzer_allele_freq_hom = 0.9 + consensus_multivcfanalyzer_allele_freq_het = 0.9 + consensus_multivcfanalyzer_additional_vcf_files = null + consensus_multivcfanalyzer_reference_gff_annotations = null + consensus_multivcfanalyzer_reference_gff_exclude = null + consensus_multivcfanalyzer_snpeff_results = null } + // Load base.config by default for all pipelines includeConfig 'conf/base.config' diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 41feced1e..718909630 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -89,9 +89,10 @@ workflow REFERENCE_INDEXING_SINGLE { def bedtools_feature = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" def genotyping_reference_ploidy = params.genotyping_reference_ploidy def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" + def consensus_sequence_mva_additional_vcf = params.consensus_multivcfanalyzer_additional_vcf_files != null ? file(params.consensus_multivcfanalyzer_additional_vcf_files, checkIfExists: true ) : "" def circularmapper_elongated_fasta = params.fasta_circularmapper_elongatedfasta != null ? file( params.fasta_circularmapper_elongatedfasta, checkIfExists: true ) : "" def circularmapper_elongated_index = params.fasta_circularmapper_elongatedindex != null ? file( params.fasta_circularmapper_elongatedindex, checkIfExists: true ) : "" - [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index ] + [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_sequence_mva_additional_vcf, circularmapper_elongated_fasta, circularmapper_elongated_index ] } ch_ref_index_single = ch_reference_for_mapping @@ -108,6 +109,7 @@ workflow REFERENCE_INDEXING_SINGLE { sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] dbsnp: [ meta, genotyping_gatk_dbsnp ] + mva_additional_vcfs: [ meta, consensus_sequence_mva_additional_vcf ] } emit: @@ -122,6 +124,7 @@ workflow REFERENCE_INDEXING_SINGLE { sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] dbsnp = ch_ref_index_single.dbsnp // [ meta, genotyping_gatk_dbsnp ] + mva_additional_vcfs = ch_ref_index_single.mva_additional_vcfs // [ meta, consensus_sequence_mva_additional_vcf ] versions = ch_versions } diff --git a/workflows/eager.nf b/workflows/eager.nf index 0766744bd..a9ae083c6 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -588,6 +588,23 @@ workflow EAGER { ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) } + // + // SUBWORKFLOW: Consensus sequence + // + if ( params.run_consensus_sequence ) { + ch_reference_for_consensus_sequence = REFERENCE_INDEXING.out.reference + // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel. + .map { + meta, fasta, fai, dict, mapindex, circular_target -> + [ meta, fasta, fai ] + } + CONSENSUS_SEQUENCE( + ch_vcf_for_consensus_sequence = GENOTYPE.out.vcf + ch_reference_for_consensus_sequence + ) + + } + // // Collate and save software versions // From 2447756902ee089296512a1620bbaf4ac0b69ec7 Mon Sep 17 00:00:00 2001 From: aidaanva Date: Fri, 25 Oct 2024 11:58:37 +0200 Subject: [PATCH 3/3] add additional vcfs to reference_indexing_mutlti --- .../local/reference_indexing_multi.nf | 65 ++++++++++--------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 00276a5b9..391ca21bf 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -19,25 +19,26 @@ workflow REFERENCE_INDEXING_MULTI { // Import reference sheet and change empty arrays to empty strings for compatibility with single reference input ch_splitreferencesheet_for_branch = Channel.fromSamplesheet("fasta_sheet") .map{ - meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> - meta.ploidy = meta.genotyping_ploidy != null ? meta.genotyping_ploidy : params.genotyping_reference_ploidy - fai = fai != [] ? fai : "" - dict = dict != [] ? dict : "" - mapper_index = mapper_index != [] ? mapper_index : "" - circular_target = circular_target != [] ? circular_target : "" - circularmapper_elongatedfasta = circularmapper_elongatedfasta != [] ? circularmapper_elongatedfasta : "" - circularmapper_elongatedindex = circularmapper_elongatedindex != [] ? circularmapper_elongatedindex : "" - mitochondrion = mitochondrion != [] ? mitochondrion : "" - capture_bed = capture_bed != [] ? capture_bed : "" - pileupcaller_bed = pileupcaller_bed != [] ? pileupcaller_bed : "" - pileupcaller_snp = pileupcaller_snp != [] ? pileupcaller_snp : "" - hapmap = hapmap != [] ? hapmap : "" - pmd_masked_fasta = pmd_masked_fasta != [] ? pmd_masked_fasta : "" - pmd_bed_for_masking = pmd_bed_for_masking != [] ? pmd_bed_for_masking : "" - sexdet_bed = sexdet_bed != [] ? sexdet_bed : "" - bedtools_feature = bedtools_feature != [] ? bedtools_feature : "" - genotyping_gatk_dbsnp = genotyping_gatk_dbsnp != [] ? genotyping_gatk_dbsnp : "" - [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] + meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_additional_vcf_files -> + meta.ploidy = meta.genotyping_ploidy != null ? meta.genotyping_ploidy : params.genotyping_reference_ploidy + fai = fai != [] ? fai : "" + dict = dict != [] ? dict : "" + mapper_index = mapper_index != [] ? mapper_index : "" + circular_target = circular_target != [] ? circular_target : "" + circularmapper_elongatedfasta = circularmapper_elongatedfasta != [] ? circularmapper_elongatedfasta : "" + circularmapper_elongatedindex = circularmapper_elongatedindex != [] ? circularmapper_elongatedindex : "" + mitochondrion = mitochondrion != [] ? mitochondrion : "" + capture_bed = capture_bed != [] ? capture_bed : "" + pileupcaller_bed = pileupcaller_bed != [] ? pileupcaller_bed : "" + pileupcaller_snp = pileupcaller_snp != [] ? pileupcaller_snp : "" + hapmap = hapmap != [] ? hapmap : "" + pmd_masked_fasta = pmd_masked_fasta != [] ? pmd_masked_fasta : "" + pmd_bed_for_masking = pmd_bed_for_masking != [] ? pmd_bed_for_masking : "" + sexdet_bed = sexdet_bed != [] ? sexdet_bed : "" + bedtools_feature = bedtools_feature != [] ? bedtools_feature : "" + genotyping_gatk_dbsnp = genotyping_gatk_dbsnp != [] ? genotyping_gatk_dbsnp : "" + consensus_multivcfanalyzer_additional_vcf_files = consensus_multivcfanalyzer_additional_vcf_files != [] ? consensus_multivcfanalyzer_additional_vcf_files : "" + [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_additional_vcf_files ] } // GENERAL DESCRIPTION FOR NEXT SECTIONS @@ -53,7 +54,7 @@ workflow REFERENCE_INDEXING_MULTI { ch_input_from_referencesheet = ch_splitreferencesheet_for_branch .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> + meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_additional_vcf_files -> generated: [ meta, fasta, fai, dict, mapper_index ] circularmapper: [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ] mitochondrion_header: [ meta, mitochondrion ] @@ -65,6 +66,7 @@ workflow REFERENCE_INDEXING_MULTI { sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] dbsnp: [ meta, genotyping_gatk_dbsnp ] + mva_additional_vcfs: [ meta, consensus_multivcfanalyzer_additional_vcf_files ] } // Detect if fasta is gzipped or not @@ -201,16 +203,17 @@ workflow REFERENCE_INDEXING_MULTI { ch_indexmapper_for_reference = ch_fasta_for_mapperindex.skip.mix(ch_indexed_formix) emit: - reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex ] - elongated_reference = ch_input_from_referencesheet.circularmapper // [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ] - mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] - hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] - pmd_masked_fasta = ch_input_from_referencesheet.pmd_masked_fasta // [ meta, pmd_masked_fasta ] - pmd_bed_for_masking = ch_input_from_referencesheet.pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] - snp_capture_bed = ch_input_from_referencesheet.snp_bed // [ meta, capture_bed ] - pileupcaller_bed_snp = ch_input_from_referencesheet.pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] - sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] - bedtools_feature = ch_input_from_referencesheet.bedtools_feature // [ meta, bedtools_feature ] - dbsnp = ch_input_from_referencesheet.dbsnp // [ meta, genotyping_gatk_dbsnp ] + reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex ] + elongated_reference = ch_input_from_referencesheet.circularmapper // [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ] + mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] + hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] + pmd_masked_fasta = ch_input_from_referencesheet.pmd_masked_fasta // [ meta, pmd_masked_fasta ] + pmd_bed_for_masking = ch_input_from_referencesheet.pmd_bed_for_masking // [ meta, pmd_bed_for_masking ] + snp_capture_bed = ch_input_from_referencesheet.snp_bed // [ meta, capture_bed ] + pileupcaller_bed_snp = ch_input_from_referencesheet.pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] + bedtools_feature = ch_input_from_referencesheet.bedtools_feature // [ meta, bedtools_feature ] + dbsnp = ch_input_from_referencesheet.dbsnp // [ meta, genotyping_gatk_dbsnp ] + mva_additional_vcfs = ch_input_from_referencesheet.mva_additional_vcfs // [ meta, consensus_multivcfanalyzer_additional_vcf_files ] versions = ch_versions }