From ceffcddddcf5257d6038d87038dfa1357fde4da5 Mon Sep 17 00:00:00 2001
From: aidaanva <aida.andrades@gmail.com>
Date: Fri, 18 Oct 2024 11:56:30 +0200
Subject: [PATCH 1/3] work on adding mva

---
 conf/modules.config                           |  11 ++
 modules.json                                  |   5 +
 .../nf-core/multivcfanalyzer/environment.yml  |   7 +
 modules/nf-core/multivcfanalyzer/main.nf      | 100 +++++++++++++++
 modules/nf-core/multivcfanalyzer/meta.yml     | 120 ++++++++++++++++++
 nextflow.config                               |  14 ++
 .../local/reference_indexing_single.nf        |   5 +-
 workflows/eager.nf                            |  17 +++
 8 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 modules/nf-core/multivcfanalyzer/environment.yml
 create mode 100644 modules/nf-core/multivcfanalyzer/main.nf
 create mode 100644 modules/nf-core/multivcfanalyzer/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index b03153561..305b7963f 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -1177,4 +1177,15 @@ process {
             pattern: '*.{glf,beagle}.gz'
         ]
     }
+
+    withName: MULTIVCFANALYZER {
+        tag = { "${meta.reference}" }
+        ext.prefix = { "multivcfanalyzer_${meta.reference}" }
+        publishDir = [
+            path: { "${params.outdir}/consensus_sequence/" },
+            mode: params.publish_dir_mode,
+            enabled: true,
+            pattern: '*.{fasta.gz,tsv,txt}'
+        ]
+    }
 }
diff --git a/modules.json b/modules.json
index 7b4d35fce..799268af8 100644
--- a/modules.json
+++ b/modules.json
@@ -175,6 +175,11 @@
                         "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
                         "installed_by": ["modules"]
                     },
+                    "multivcfanalyzer": {
+                        "branch": "master",
+                        "git_sha": "b34fb117e397e72a070cde8adf21b758670c90f5",
+                        "installed_by": ["modules"]
+                    },
                     "picard/createsequencedictionary": {
                         "branch": "master",
                         "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447",
diff --git a/modules/nf-core/multivcfanalyzer/environment.yml b/modules/nf-core/multivcfanalyzer/environment.yml
new file mode 100644
index 000000000..37f96f5e3
--- /dev/null
+++ b/modules/nf-core/multivcfanalyzer/environment.yml
@@ -0,0 +1,7 @@
+name: multivcfanalyzer
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::multivcfanalyzer=0.85.2
diff --git a/modules/nf-core/multivcfanalyzer/main.nf b/modules/nf-core/multivcfanalyzer/main.nf
new file mode 100644
index 000000000..f3a91a8a8
--- /dev/null
+++ b/modules/nf-core/multivcfanalyzer/main.nf
@@ -0,0 +1,100 @@
+process MULTIVCFANALYZER {
+    tag "$fasta"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1':
+        'biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }"
+
+    input:
+    tuple val(meta), path(vcfs)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(snpeff_results)
+    tuple val(meta4), path(gff)
+    val allele_freqs
+    val genotype_quality
+    val coverage
+    val homozygous_freq
+    val heterozygous_freq
+    tuple val(meta5), path(gff_exclude)
+
+
+    output:
+    tuple val(meta), path('fullAlignment.fasta.gz')                       , emit: full_alignment
+    tuple val(meta), path('info.txt')                                     , emit: info_txt
+    tuple val(meta), path('snpAlignment.fasta.gz')                        , emit: snp_alignment
+    tuple val(meta), path('snpAlignmentIncludingRefGenome.fasta.gz')      , emit: snp_genome_alignment
+    tuple val(meta), path('snpStatistics.tsv')                            , emit: snpstatistics
+    tuple val(meta), path('snpTable.tsv')                                 , emit: snptable
+    tuple val(meta), path('snpTableForSnpEff.tsv')                        , emit: snptable_snpeff
+    tuple val(meta), path('snpTableWithUncertaintyCalls.tsv')             , emit: snptable_uncertainty
+    tuple val(meta), path('structureGenotypes.tsv')                       , emit: structure_genotypes
+    tuple val(meta), path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing
+    tuple val(meta), path('MultiVCFAnalyzer.json')                        , emit: json
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input
+    def args2 = task.ext.args2 ?: ''
+
+    def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
+    def cmd_gff            = gff ? "${gff}" : "NA"
+    def cmd_allele_freqs   = allele_freqs ? "T" : "F"
+    def cmd_gff_exclude    = gff_exclude ? "${gff}" : "NA"
+
+    """
+    multivcfanalyzer \\
+        ${cmd_snpeff_results} \\
+        ${fasta} \\
+        ${cmd_gff} \\
+        . \
+        ${cmd_allele_freqs}  \\
+        ${genotype_quality}  \\
+        ${coverage}  \\
+        ${homozygous_freq}  \\
+        ${heterozygous_freq}  \\
+        ${cmd_gff_exclude}  \\
+        ${vcfs.sort().join(" ")}
+
+    gzip \\
+        $args2 \\
+        fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
+    END_VERSIONS
+    """
+    stub:
+
+    def args2 = task.ext.args2 ?: ''
+
+    def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
+    def cmd_gff            = gff ? "${gff}" : "NA"
+    def cmd_allele_freqs   = allele_freqs ? "T" : "F"
+    def cmd_gff_exclude    = gff_exclude ? "${gff}" : "NA"
+
+    """
+    echo "" | gzip > fullAlignment.fasta.gz
+    touch info.txt
+    echo "" | gzip > snpAlignment.fasta.gz
+    echo "" | gzip > snpAlignmentIncludingRefGenome.fasta.gz
+    touch snpStatistics.tsv
+    touch snpTable.tsv
+    touch snpTableForSnpEff.tsv
+    touch snpTableWithUncertaintyCalls.tsv
+    touch structureGenotypes.tsv
+    touch structureGenotypes_noMissingData-Columns.tsv
+    touch MultiVCFAnalyzer.json
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
+    END_VERSIONS
+
+    """
+}
diff --git a/modules/nf-core/multivcfanalyzer/meta.yml b/modules/nf-core/multivcfanalyzer/meta.yml
new file mode 100644
index 000000000..016637452
--- /dev/null
+++ b/modules/nf-core/multivcfanalyzer/meta.yml
@@ -0,0 +1,120 @@
+name: "multivcfanalyzer"
+description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA
+keywords:
+  - vcf
+  - ancient DNA
+  - aDNA
+  - SNP
+  - GATK UnifiedGenotyper
+  - SNP table
+tools:
+  - "multivcfanalyzer":
+      description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository."
+      homepage: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      documentation: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      doi: "10.1038/nature13591"
+      licence: ["GPL >=3"]
+input:
+  - vcfs:
+      type: file
+      description: One or a list of uncompressed VCF file
+      pattern: "*.vcf"
+  - fasta:
+      type: file
+      description: Reference genome VCF was generated against
+      pattern: "*.{fasta,fna,fa}"
+  - snpeff_results:
+      type: file
+      description: Results from snpEff in txt format (Optional)
+      pattern: "*.txt"
+  - gff:
+      type: file
+      description: GFF file corresponding to reference genome fasta (Optional)
+      pattern: "*.gff"
+  - allele_freqs:
+      type: boolean
+      description: |
+        Whether to include the percentage of reads a given allele is
+        present in in the SNP table.
+  - genotype_quality:
+      type: integer
+      description: |
+        Minimum GATK genotyping threshold threshold of which a SNP call
+        falling under is 'discarded'
+  - coverage:
+      type: integer
+      description: |
+        Minimum number of a reads that a position must be covered by to be
+        reported
+  - homozygous_freq:
+      type: integer
+      description: Fraction of reads a base must have to be called 'homozygous'
+  - heterozygous_freq:
+      type: integer
+      description: |
+        Fraction of which whereby if a call falls above this value, and lower
+        than the homozygous threshold, a base will be called 'heterozygous'.
+  - gff_exclude:
+      type: file
+      description: |
+        file listing positions that will be 'filtered' (i.e. ignored)
+        (Optional)
+      pattern: "*.vcf"
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - full_alignment:
+      type: file
+      description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls
+      pattern: ".fasta.gz"
+  - info_txt:
+      type: file
+      description: Information about the run
+      pattern: ".txt"
+  - snp_alignment:
+      type: file
+      description: A fasta file of just SNP positions with samples only
+      pattern: ".fasta.gz"
+  - snp_genome_alignment:
+      type: file
+      description: A fasta file of just SNP positions with reference genome
+      pattern: ".fasta.gz"
+  - snpstatistics:
+      type: file
+      description: Some basic statistics about the SNP calls of each sample
+      pattern: ".tsv"
+  - snptable:
+      type: file
+      description: Basic SNP table of combined positions taken from each VCF file
+      pattern: ".tsv"
+  - snptable_snpeff:
+      type: file
+      description: Input file for SnpEff
+      pattern: ".tsv"
+  - snptable_uncertainty:
+      type: file
+      description: Same as above, but with lower case characters indicating uncertain calls
+      pattern: ".tsv"
+  - structure_genotypes:
+      type: file
+      description: Input file for STRUCTURE
+      pattern: ".tsv"
+  - structure_genotypes_nomissing:
+      type: file
+      description: Alternate input file for STRUCTURE
+      pattern: ".tsv"
+  - json:
+      type: file
+      description: Summary statistics in MultiQC JSON format
+      pattern: ".json"
+authors:
+  - "@jfy133"
+maintainers:
+  - "@jfy133"
diff --git a/nextflow.config b/nextflow.config
index dfee8fcbb..d86556490 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -235,8 +235,22 @@ params {
     genotyping_freebayes_skip_coverage       = 0
     genotyping_angsd_glmodel                 = 'samtools'
     genotyping_angsd_glformat                = 'binary'
+
+    //Consensus sequence
+    run_consensus                                        = false
+    consensus_tool                                       = null
+    consensus_multivcfanalyzer_write_allele_frequencies  = false
+    consensus_multivcfanalyzer_min_genotype_quality      = 30
+    consensus_multivcfanalyzer_min_base_coverage         = 5
+    consensus_multivcfanalyzer_allele_freq_hom           = 0.9
+    consensus_multivcfanalyzer_allele_freq_het           = 0.9
+    consensus_multivcfanalyzer_additional_vcf_files      = null
+    consensus_multivcfanalyzer_reference_gff_annotations = null
+    consensus_multivcfanalyzer_reference_gff_exclude     = null
+    consensus_multivcfanalyzer_snpeff_results            = null
 }
 
+
 // Load base.config by default for all pipelines
 includeConfig 'conf/base.config'
 
diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf
index 4206b340c..6e97d4b25 100644
--- a/subworkflows/local/reference_indexing_single.nf
+++ b/subworkflows/local/reference_indexing_single.nf
@@ -91,7 +91,8 @@ workflow REFERENCE_INDEXING_SINGLE {
                                     def bedtools_feature                      = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : ""
                                     def genotyping_reference_ploidy           = params.genotyping_reference_ploidy
                                     def genotyping_gatk_dbsnp                 = params.genotyping_gatk_dbsnp != null ? file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : ""
-                                    [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ]
+                                    def consensus_sequence_mva_additional_vcf = params.consensus_multivcfanalyzer_additional_vcf_files != null ? file(params.consensus_multivcfanalyzer_additional_vcf_files, checkIfExists: true ) : ""
+                                    [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_sequence_mva_additional_vcf ]
                                 }
 
     ch_ref_index_single = ch_reference_for_mapping
@@ -107,6 +108,7 @@ workflow REFERENCE_INDEXING_SINGLE {
                                     sexdeterrmine_bed:      [ meta, sexdet_bed ]
                                     bedtools_feature:       [ meta, bedtools_feature ]
                                     dbsnp:                  [ meta, genotyping_gatk_dbsnp ]
+                                    mva_additional_vcfs:    [ meta, consensus_sequence_mva_additional_vcf ]
                                 }
 
     emit:
@@ -120,6 +122,7 @@ workflow REFERENCE_INDEXING_SINGLE {
     sexdeterrmine_bed    = ch_ref_index_single.sexdeterrmine_bed     // [ meta, sexdet_bed ]
     bedtools_feature     = ch_ref_index_single.bedtools_feature      // [ meta, bedtools_feature ]
     dbsnp                = ch_ref_index_single.dbsnp                 // [ meta, genotyping_gatk_dbsnp ]
+    mva_additional_vcfs  = ch_ref_index_single.mva_additional_vcfs   // [ meta, consensus_sequence_mva_additional_vcf ]
     versions             = ch_versions
 
 }
diff --git a/workflows/eager.nf b/workflows/eager.nf
index 46739b7a7..0d3e86136 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -584,6 +584,23 @@ workflow EAGER {
         ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) )
     }
 
+    //
+    // SUBWORKFLOW: Consensus sequence
+    //
+    if ( params.run_consensus_sequence ) {
+        ch_reference_for_consensus_sequence = REFERENCE_INDEXING.out.reference
+            // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel.
+            .map {
+                meta, fasta, fai, dict, mapindex, circular_target ->
+                [ meta, fasta, fai ]
+            }
+    CONSENSUS_SEQUENCE(
+                        ch_vcf_for_consensus_sequence = GENOTYPE.out.vcf
+                        ch_reference_for_consensus_sequence
+    )
+
+    }
+
     //
     // Collate and save software versions
     //

From 3fa9e13876cd00d9bf320fc3304d12ff6f249a5b Mon Sep 17 00:00:00 2001
From: aidaanva <aida.andrades@gmail.com>
Date: Fri, 18 Oct 2024 11:56:30 +0200
Subject: [PATCH 2/3] work on adding mva

---
 conf/modules.config                           |  11 ++
 modules.json                                  |   5 +
 .../nf-core/multivcfanalyzer/environment.yml  |   7 +
 modules/nf-core/multivcfanalyzer/main.nf      | 100 +++++++++++++++
 modules/nf-core/multivcfanalyzer/meta.yml     | 120 ++++++++++++++++++
 nextflow.config                               |  14 ++
 .../local/reference_indexing_single.nf        |   5 +-
 workflows/eager.nf                            |  17 +++
 8 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 modules/nf-core/multivcfanalyzer/environment.yml
 create mode 100644 modules/nf-core/multivcfanalyzer/main.nf
 create mode 100644 modules/nf-core/multivcfanalyzer/meta.yml

diff --git a/conf/modules.config b/conf/modules.config
index 09fbff1fc..c65c99d0b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -1399,4 +1399,15 @@ process {
             pattern: '*.{glf,beagle}.gz'
         ]
     }
+
+    withName: MULTIVCFANALYZER {
+        tag = { "${meta.reference}" }
+        ext.prefix = { "multivcfanalyzer_${meta.reference}" }
+        publishDir = [
+            path: { "${params.outdir}/consensus_sequence/" },
+            mode: params.publish_dir_mode,
+            enabled: true,
+            pattern: '*.{fasta.gz,tsv,txt}'
+        ]
+    }
 }
diff --git a/modules.json b/modules.json
index a2fc85bbb..914df0e04 100644
--- a/modules.json
+++ b/modules.json
@@ -225,6 +225,11 @@
                         "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
                         "installed_by": ["modules"]
                     },
+                    "multivcfanalyzer": {
+                        "branch": "master",
+                        "git_sha": "b34fb117e397e72a070cde8adf21b758670c90f5",
+                        "installed_by": ["modules"]
+                    },
                     "picard/createsequencedictionary": {
                         "branch": "master",
                         "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447",
diff --git a/modules/nf-core/multivcfanalyzer/environment.yml b/modules/nf-core/multivcfanalyzer/environment.yml
new file mode 100644
index 000000000..37f96f5e3
--- /dev/null
+++ b/modules/nf-core/multivcfanalyzer/environment.yml
@@ -0,0 +1,7 @@
+name: multivcfanalyzer
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::multivcfanalyzer=0.85.2
diff --git a/modules/nf-core/multivcfanalyzer/main.nf b/modules/nf-core/multivcfanalyzer/main.nf
new file mode 100644
index 000000000..f3a91a8a8
--- /dev/null
+++ b/modules/nf-core/multivcfanalyzer/main.nf
@@ -0,0 +1,100 @@
+process MULTIVCFANALYZER {
+    tag "$fasta"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/multivcfanalyzer:0.85.2--hdfd78af_1':
+        'biocontainers/multivcfanalyzer:0.85.2--hdfd78af_1' }"
+
+    input:
+    tuple val(meta), path(vcfs)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(snpeff_results)
+    tuple val(meta4), path(gff)
+    val allele_freqs
+    val genotype_quality
+    val coverage
+    val homozygous_freq
+    val heterozygous_freq
+    tuple val(meta5), path(gff_exclude)
+
+
+    output:
+    tuple val(meta), path('fullAlignment.fasta.gz')                       , emit: full_alignment
+    tuple val(meta), path('info.txt')                                     , emit: info_txt
+    tuple val(meta), path('snpAlignment.fasta.gz')                        , emit: snp_alignment
+    tuple val(meta), path('snpAlignmentIncludingRefGenome.fasta.gz')      , emit: snp_genome_alignment
+    tuple val(meta), path('snpStatistics.tsv')                            , emit: snpstatistics
+    tuple val(meta), path('snpTable.tsv')                                 , emit: snptable
+    tuple val(meta), path('snpTableForSnpEff.tsv')                        , emit: snptable_snpeff
+    tuple val(meta), path('snpTableWithUncertaintyCalls.tsv')             , emit: snptable_uncertainty
+    tuple val(meta), path('structureGenotypes.tsv')                       , emit: structure_genotypes
+    tuple val(meta), path('structureGenotypes_noMissingData-Columns.tsv') , emit: structure_genotypes_nomissing
+    tuple val(meta), path('MultiVCFAnalyzer.json')                        , emit: json
+    path "versions.yml"                                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // def args = task.ext.args ?: '' // MultiVCFAnalyzer has strict and input ordering and all are mandatory. Deactivating $args to prevent breakage of input
+    def args2 = task.ext.args2 ?: ''
+
+    def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
+    def cmd_gff            = gff ? "${gff}" : "NA"
+    def cmd_allele_freqs   = allele_freqs ? "T" : "F"
+    def cmd_gff_exclude    = gff_exclude ? "${gff}" : "NA"
+
+    """
+    multivcfanalyzer \\
+        ${cmd_snpeff_results} \\
+        ${fasta} \\
+        ${cmd_gff} \\
+        . \
+        ${cmd_allele_freqs}  \\
+        ${genotype_quality}  \\
+        ${coverage}  \\
+        ${homozygous_freq}  \\
+        ${heterozygous_freq}  \\
+        ${cmd_gff_exclude}  \\
+        ${vcfs.sort().join(" ")}
+
+    gzip \\
+        $args2 \\
+        fullAlignment.fasta snpAlignment.fasta snpAlignmentIncludingRefGenome.fasta
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
+    END_VERSIONS
+    """
+    stub:
+
+    def args2 = task.ext.args2 ?: ''
+
+    def cmd_snpeff_results = snpeff_results ? "${snpeff_results}" : "NA"
+    def cmd_gff            = gff ? "${gff}" : "NA"
+    def cmd_allele_freqs   = allele_freqs ? "T" : "F"
+    def cmd_gff_exclude    = gff_exclude ? "${gff}" : "NA"
+
+    """
+    echo "" | gzip > fullAlignment.fasta.gz
+    touch info.txt
+    echo "" | gzip > snpAlignment.fasta.gz
+    echo "" | gzip > snpAlignmentIncludingRefGenome.fasta.gz
+    touch snpStatistics.tsv
+    touch snpTable.tsv
+    touch snpTableForSnpEff.tsv
+    touch snpTableWithUncertaintyCalls.tsv
+    touch structureGenotypes.tsv
+    touch structureGenotypes_noMissingData-Columns.tsv
+    touch MultiVCFAnalyzer.json
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        multivcfanalyzer: \$(echo \$(multivcfanalyzer --help | head -n 1) | cut -f 3 -d ' ' )
+    END_VERSIONS
+
+    """
+}
diff --git a/modules/nf-core/multivcfanalyzer/meta.yml b/modules/nf-core/multivcfanalyzer/meta.yml
new file mode 100644
index 000000000..016637452
--- /dev/null
+++ b/modules/nf-core/multivcfanalyzer/meta.yml
@@ -0,0 +1,120 @@
+name: "multivcfanalyzer"
+description: SNP table generator from GATK UnifiedGenotyper with functionality geared for aDNA
+keywords:
+  - vcf
+  - ancient DNA
+  - aDNA
+  - SNP
+  - GATK UnifiedGenotyper
+  - SNP table
+tools:
+  - "multivcfanalyzer":
+      description: "MultiVCFAnalyzer is a VCF file post-processing tool tailored for aDNA. License on Github repository."
+      homepage: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      documentation: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      tool_dev_url: "https://github.com/alexherbig/MultiVCFAnalyzer"
+      doi: "10.1038/nature13591"
+      licence: ["GPL >=3"]
+input:
+  - vcfs:
+      type: file
+      description: One or a list of uncompressed VCF file
+      pattern: "*.vcf"
+  - fasta:
+      type: file
+      description: Reference genome VCF was generated against
+      pattern: "*.{fasta,fna,fa}"
+  - snpeff_results:
+      type: file
+      description: Results from snpEff in txt format (Optional)
+      pattern: "*.txt"
+  - gff:
+      type: file
+      description: GFF file corresponding to reference genome fasta (Optional)
+      pattern: "*.gff"
+  - allele_freqs:
+      type: boolean
+      description: |
+        Whether to include the percentage of reads a given allele is
+        present in in the SNP table.
+  - genotype_quality:
+      type: integer
+      description: |
+        Minimum GATK genotyping threshold threshold of which a SNP call
+        falling under is 'discarded'
+  - coverage:
+      type: integer
+      description: |
+        Minimum number of a reads that a position must be covered by to be
+        reported
+  - homozygous_freq:
+      type: integer
+      description: Fraction of reads a base must have to be called 'homozygous'
+  - heterozygous_freq:
+      type: integer
+      description: |
+        Fraction of which whereby if a call falls above this value, and lower
+        than the homozygous threshold, a base will be called 'heterozygous'.
+  - gff_exclude:
+      type: file
+      description: |
+        file listing positions that will be 'filtered' (i.e. ignored)
+        (Optional)
+      pattern: "*.vcf"
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - bam:
+      type: file
+      description: Sorted BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - full_alignment:
+      type: file
+      description: Fasta a fasta file of all positions contained in the VCF files i.e. including ref calls
+      pattern: ".fasta.gz"
+  - info_txt:
+      type: file
+      description: Information about the run
+      pattern: ".txt"
+  - snp_alignment:
+      type: file
+      description: A fasta file of just SNP positions with samples only
+      pattern: ".fasta.gz"
+  - snp_genome_alignment:
+      type: file
+      description: A fasta file of just SNP positions with reference genome
+      pattern: ".fasta.gz"
+  - snpstatistics:
+      type: file
+      description: Some basic statistics about the SNP calls of each sample
+      pattern: ".tsv"
+  - snptable:
+      type: file
+      description: Basic SNP table of combined positions taken from each VCF file
+      pattern: ".tsv"
+  - snptable_snpeff:
+      type: file
+      description: Input file for SnpEff
+      pattern: ".tsv"
+  - snptable_uncertainty:
+      type: file
+      description: Same as above, but with lower case characters indicating uncertain calls
+      pattern: ".tsv"
+  - structure_genotypes:
+      type: file
+      description: Input file for STRUCTURE
+      pattern: ".tsv"
+  - structure_genotypes_nomissing:
+      type: file
+      description: Alternate input file for STRUCTURE
+      pattern: ".tsv"
+  - json:
+      type: file
+      description: Summary statistics in MultiQC JSON format
+      pattern: ".json"
+authors:
+  - "@jfy133"
+maintainers:
+  - "@jfy133"
diff --git a/nextflow.config b/nextflow.config
index c8a86d059..b9da96e3a 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -269,8 +269,22 @@ params {
     genotyping_freebayes_skip_coverage       = 0
     genotyping_angsd_glmodel                 = 'samtools'
     genotyping_angsd_glformat                = 'binary'
+
+    //Consensus sequence
+    run_consensus                                        = false
+    consensus_tool                                       = null
+    consensus_multivcfanalyzer_write_allele_frequencies  = false
+    consensus_multivcfanalyzer_min_genotype_quality      = 30
+    consensus_multivcfanalyzer_min_base_coverage         = 5
+    consensus_multivcfanalyzer_allele_freq_hom           = 0.9
+    consensus_multivcfanalyzer_allele_freq_het           = 0.9
+    consensus_multivcfanalyzer_additional_vcf_files      = null
+    consensus_multivcfanalyzer_reference_gff_annotations = null
+    consensus_multivcfanalyzer_reference_gff_exclude     = null
+    consensus_multivcfanalyzer_snpeff_results            = null
 }
 
+
 // Load base.config by default for all pipelines
 includeConfig 'conf/base.config'
 
diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf
index 41feced1e..718909630 100644
--- a/subworkflows/local/reference_indexing_single.nf
+++ b/subworkflows/local/reference_indexing_single.nf
@@ -89,9 +89,10 @@ workflow REFERENCE_INDEXING_SINGLE {
                                     def bedtools_feature                      = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : ""
                                     def genotyping_reference_ploidy           = params.genotyping_reference_ploidy
                                     def genotyping_gatk_dbsnp                 = params.genotyping_gatk_dbsnp != null ? file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : ""
+                                    def consensus_sequence_mva_additional_vcf = params.consensus_multivcfanalyzer_additional_vcf_files != null ? file(params.consensus_multivcfanalyzer_additional_vcf_files, checkIfExists: true ) : ""
                                     def circularmapper_elongated_fasta        = params.fasta_circularmapper_elongatedfasta != null ? file( params.fasta_circularmapper_elongatedfasta, checkIfExists: true ) : ""
                                     def circularmapper_elongated_index        = params.fasta_circularmapper_elongatedindex != null ? file( params.fasta_circularmapper_elongatedindex, checkIfExists: true ) : ""
-                                    [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, circularmapper_elongated_fasta, circularmapper_elongated_index ]
+                                    [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_sequence_mva_additional_vcf, circularmapper_elongated_fasta, circularmapper_elongated_index ]
                                 }
 
     ch_ref_index_single = ch_reference_for_mapping
@@ -108,6 +109,7 @@ workflow REFERENCE_INDEXING_SINGLE {
                                     sexdeterrmine_bed:      [ meta, sexdet_bed ]
                                     bedtools_feature:       [ meta, bedtools_feature ]
                                     dbsnp:                  [ meta, genotyping_gatk_dbsnp ]
+                                    mva_additional_vcfs:    [ meta, consensus_sequence_mva_additional_vcf ]
                                 }
 
     emit:
@@ -122,6 +124,7 @@ workflow REFERENCE_INDEXING_SINGLE {
     sexdeterrmine_bed    = ch_ref_index_single.sexdeterrmine_bed     // [ meta, sexdet_bed ]
     bedtools_feature     = ch_ref_index_single.bedtools_feature      // [ meta, bedtools_feature ]
     dbsnp                = ch_ref_index_single.dbsnp                 // [ meta, genotyping_gatk_dbsnp ]
+    mva_additional_vcfs  = ch_ref_index_single.mva_additional_vcfs   // [ meta, consensus_sequence_mva_additional_vcf ]
     versions             = ch_versions
 
 }
diff --git a/workflows/eager.nf b/workflows/eager.nf
index 0766744bd..a9ae083c6 100644
--- a/workflows/eager.nf
+++ b/workflows/eager.nf
@@ -588,6 +588,23 @@ workflow EAGER {
         ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) )
     }
 
+    //
+    // SUBWORKFLOW: Consensus sequence
+    //
+    if ( params.run_consensus_sequence ) {
+        ch_reference_for_consensus_sequence = REFERENCE_INDEXING.out.reference
+            // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel.
+            .map {
+                meta, fasta, fai, dict, mapindex, circular_target ->
+                [ meta, fasta, fai ]
+            }
+    CONSENSUS_SEQUENCE(
+                        ch_vcf_for_consensus_sequence = GENOTYPE.out.vcf
+                        ch_reference_for_consensus_sequence
+    )
+
+    }
+
     //
     // Collate and save software versions
     //

From 2447756902ee089296512a1620bbaf4ac0b69ec7 Mon Sep 17 00:00:00 2001
From: aidaanva <aida.andrades@gmail.com>
Date: Fri, 25 Oct 2024 11:58:37 +0200
Subject: [PATCH 3/3] add additional vcfs to reference_indexing_mutlti

---
 .../local/reference_indexing_multi.nf         | 65 ++++++++++---------
 1 file changed, 34 insertions(+), 31 deletions(-)

diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf
index 00276a5b9..391ca21bf 100644
--- a/subworkflows/local/reference_indexing_multi.nf
+++ b/subworkflows/local/reference_indexing_multi.nf
@@ -19,25 +19,26 @@ workflow REFERENCE_INDEXING_MULTI {
     // Import reference sheet and change empty arrays to empty strings for compatibility with single reference input
     ch_splitreferencesheet_for_branch = Channel.fromSamplesheet("fasta_sheet")
                                     .map{
-                                        meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ->
-                                            meta.ploidy                    = meta.genotyping_ploidy != null ? meta.genotyping_ploidy : params.genotyping_reference_ploidy
-                                            fai                            = fai != [] ? fai : ""
-                                            dict                           = dict != [] ? dict : ""
-                                            mapper_index                   = mapper_index != [] ? mapper_index : ""
-                                            circular_target                = circular_target != [] ? circular_target : ""
-                                            circularmapper_elongatedfasta = circularmapper_elongatedfasta != [] ? circularmapper_elongatedfasta : ""
-                                            circularmapper_elongatedindex = circularmapper_elongatedindex != [] ? circularmapper_elongatedindex : ""
-                                            mitochondrion                  = mitochondrion != [] ? mitochondrion : ""
-                                            capture_bed                    = capture_bed != [] ? capture_bed : ""
-                                            pileupcaller_bed               = pileupcaller_bed != [] ? pileupcaller_bed : ""
-                                            pileupcaller_snp               = pileupcaller_snp != [] ? pileupcaller_snp : ""
-                                            hapmap                         = hapmap != [] ? hapmap : ""
-                                            pmd_masked_fasta               = pmd_masked_fasta != [] ? pmd_masked_fasta : ""
-                                            pmd_bed_for_masking            = pmd_bed_for_masking != [] ? pmd_bed_for_masking : ""
-                                            sexdet_bed                     = sexdet_bed != [] ? sexdet_bed : ""
-                                            bedtools_feature               = bedtools_feature != [] ? bedtools_feature : ""
-                                            genotyping_gatk_dbsnp          = genotyping_gatk_dbsnp != [] ? genotyping_gatk_dbsnp : ""
-                                        [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ]
+                                        meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_additional_vcf_files ->
+                                            meta.ploidy                                     = meta.genotyping_ploidy != null ? meta.genotyping_ploidy : params.genotyping_reference_ploidy
+                                            fai                                             = fai != [] ? fai : ""
+                                            dict                                            = dict != [] ? dict : ""
+                                            mapper_index                                    = mapper_index != [] ? mapper_index : ""
+                                            circular_target                                 = circular_target != [] ? circular_target : ""
+                                            circularmapper_elongatedfasta                   = circularmapper_elongatedfasta != [] ? circularmapper_elongatedfasta : ""
+                                            circularmapper_elongatedindex                   = circularmapper_elongatedindex != [] ? circularmapper_elongatedindex : ""
+                                            mitochondrion                                   = mitochondrion != [] ? mitochondrion : ""
+                                            capture_bed                                     = capture_bed != [] ? capture_bed : ""
+                                            pileupcaller_bed                                = pileupcaller_bed != [] ? pileupcaller_bed : ""
+                                            pileupcaller_snp                                = pileupcaller_snp != [] ? pileupcaller_snp : ""
+                                            hapmap                                          = hapmap != [] ? hapmap : ""
+                                            pmd_masked_fasta                                = pmd_masked_fasta != [] ? pmd_masked_fasta : ""
+                                            pmd_bed_for_masking                             = pmd_bed_for_masking != [] ? pmd_bed_for_masking : ""
+                                            sexdet_bed                                      = sexdet_bed != [] ? sexdet_bed : ""
+                                            bedtools_feature                                = bedtools_feature != [] ? bedtools_feature : ""
+                                            genotyping_gatk_dbsnp                           = genotyping_gatk_dbsnp != [] ? genotyping_gatk_dbsnp : ""
+                                            consensus_multivcfanalyzer_additional_vcf_files = consensus_multivcfanalyzer_additional_vcf_files != [] ? consensus_multivcfanalyzer_additional_vcf_files : ""
+                                        [ meta - meta.subMap('genotyping_ploidy'), fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_additional_vcf_files ]
                                     }
 
     // GENERAL DESCRIPTION FOR NEXT SECTIONS
@@ -53,7 +54,7 @@ workflow REFERENCE_INDEXING_MULTI {
 
     ch_input_from_referencesheet = ch_splitreferencesheet_for_branch
                             .multiMap {
-                                meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ->
+                                meta, fasta, fai, dict, mapper_index, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_masked_fasta, pmd_bed_for_masking, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp, consensus_multivcfanalyzer_additional_vcf_files ->
                                 generated:            [ meta, fasta, fai, dict, mapper_index ]
                                 circularmapper:       [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ]
                                 mitochondrion_header: [ meta, mitochondrion ]
@@ -65,6 +66,7 @@ workflow REFERENCE_INDEXING_MULTI {
                                 sexdeterrmine_bed:    [ meta, sexdet_bed ]
                                 bedtools_feature:     [ meta, bedtools_feature ]
                                 dbsnp:                [ meta, genotyping_gatk_dbsnp ]
+                                mva_additional_vcfs:  [ meta, consensus_multivcfanalyzer_additional_vcf_files ]
                             }
 
     // Detect if fasta is gzipped or not
@@ -201,16 +203,17 @@ workflow REFERENCE_INDEXING_MULTI {
     ch_indexmapper_for_reference = ch_fasta_for_mapperindex.skip.mix(ch_indexed_formix)
 
     emit:
-    reference            = ch_indexmapper_for_reference                         // [ meta, fasta, fai, dict, mapindex ]
-    elongated_reference  = ch_input_from_referencesheet.circularmapper          // [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ]
-    mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header    // [ meta, mitochondrion ]
-    hapmap               = ch_input_from_referencesheet.angsd_hapmap            // [ meta, hapmap ]
-    pmd_masked_fasta     = ch_input_from_referencesheet.pmd_masked_fasta        // [ meta, pmd_masked_fasta ]
-    pmd_bed_for_masking  = ch_input_from_referencesheet.pmd_bed_for_masking     // [ meta, pmd_bed_for_masking ]
-    snp_capture_bed      = ch_input_from_referencesheet.snp_bed                 // [ meta, capture_bed ]
-    pileupcaller_bed_snp = ch_input_from_referencesheet.pileupcaller_bed_snp    // [ meta, pileupcaller_bed, pileupcaller_snp ]
-    sexdeterrmine_bed    = ch_input_from_referencesheet.sexdeterrmine_bed       // [ meta, sexdet_bed ]
-    bedtools_feature     = ch_input_from_referencesheet.bedtools_feature        // [ meta, bedtools_feature ]
-    dbsnp                = ch_input_from_referencesheet.dbsnp                   // [ meta, genotyping_gatk_dbsnp ]
+    reference            = ch_indexmapper_for_reference                          // [ meta, fasta, fai, dict, mapindex ]
+    elongated_reference  = ch_input_from_referencesheet.circularmapper           // [ meta, circular_target, circularmapper_elongatedfasta, circularmapper_elongatedindex ]
+    mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header     // [ meta, mitochondrion ]
+    hapmap               = ch_input_from_referencesheet.angsd_hapmap             // [ meta, hapmap ]
+    pmd_masked_fasta     = ch_input_from_referencesheet.pmd_masked_fasta         // [ meta, pmd_masked_fasta ]
+    pmd_bed_for_masking  = ch_input_from_referencesheet.pmd_bed_for_masking      // [ meta, pmd_bed_for_masking ]
+    snp_capture_bed      = ch_input_from_referencesheet.snp_bed                  // [ meta, capture_bed ]
+    pileupcaller_bed_snp = ch_input_from_referencesheet.pileupcaller_bed_snp     // [ meta, pileupcaller_bed, pileupcaller_snp ]
+    sexdeterrmine_bed    = ch_input_from_referencesheet.sexdeterrmine_bed        // [ meta, sexdet_bed ]
+    bedtools_feature     = ch_input_from_referencesheet.bedtools_feature         // [ meta, bedtools_feature ]
+    dbsnp                = ch_input_from_referencesheet.dbsnp                    // [ meta, genotyping_gatk_dbsnp ]
+    mva_additional_vcfs  = ch_input_from_referencesheet.mva_additional_vcfs // [ meta, consensus_multivcfanalyzer_additional_vcf_files ]
     versions             = ch_versions
 }