From 67220cef04be6baaf04eeb26f0d8cd930c7b8e9c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Jun 2023 16:26:14 +0200 Subject: [PATCH 001/110] add pileupcaller params to config and schema --- nextflow.config | 11 ++++++- nextflow_schema.json | 78 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 77 insertions(+), 12 deletions(-) diff --git a/nextflow.config b/nextflow.config index e696219c6..fc07899cd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -184,7 +184,16 @@ params { damage_manipulation_bamutils_softclip = false // Genotyping - genotyping_source = 'raw' + run_genotyping = false + genotyping_tool = null + genotyping_source = null + genotyping_pileupcaller_min_base_quality = 30 + genotyping_pileupcaller_min_map_quality = 30 + genotyping_pileupcaller_bedfile = null + genotyping_pileupcaller_snpfile = null + genotyping_pileupcaller_method = 'randomHaploid' + genotyping_pileupcaller_transitions_mode = 'AllSites' + } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index 81a3a8230..7b988a24d 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -848,16 +848,75 @@ "description": "Options for variant calling", "default": "", "properties": { + "run_genotyping": { + "type": "boolean", + "fa_icon": "fas fa-power-off", + "description": "Turn on genotyping of BAM files.", + "help_text": "Turns on genotyping. `--genotyping_source` and `--genotyping_tool` must also be provided together with this option." + }, "genotyping_source": { "type": "string", - "default": "raw", "description": "Specify which input BAM to use for genotyping.", "help_text": "Indicates which BAM file to use for genotyping, depending on what BAM processing modules you have turned on. Options are: 'raw' (to use the reads used as input for damage manipulation); 'pmd' (for pmdtools output); 'trimmed' (for base-clipped BAMs. 
Base-clipped-PMD-filtered BAMs if both filtering and trimming are requested); 'rescaled' (for mapDamage2 rescaling output).\nWarning: Depending on the parameters you provided, 'raw' can refer to all mapped reads, filtered reads (if bam filtering has been performed), or the deduplicated reads (if deduplication was performed).", "fa_icon": "fas fa-faucet", "enum": ["raw", "pmd", "trimmed", "rescaled"] + }, + "genotyping_tool": { + "type": "string", + "fa_icon": "fas fa-tools", + "enum": ["ug", "hc", "freebayes", "pileupcaller", "angsd"], + "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware that GATK 3.5 is officially deprecated by the Broad Institute.", + "description": "Specify which genotyper to use between: GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller." + }, + "genotyping_pileupcaller_min_base_quality": { + "type": "integer", + "default": 30, + "description": "The minimum base quality to be used for genotyping with pileupcaller.", + "help_text": "The minimum base quality to be used for genotyping with pileupCaller. Affects the `samtools mpileup` output that is used by `pileupCaller`. \n\n> Affects `-Q` parameter of `samtools mpileup`.", + "fa_icon": "fas fa-filter" + }, + "genotyping_pileupcaller_min_map_quality": { + "type": "integer", + "default": 30, + "fa_icon": "fas fa-filter", + "description": "The minimum mapping quality to be used for genotyping with pileupcaller.", + "help_text": "The minimum mapping quality to be used for genotyping with pileupCaller. Affects the `samtools mpileup` output that is used by `pileupCaller`. \n\n> Affects `-q` parameter of `samtools mpileup`." 
+ }, + "genotyping_pileupcaller_bedfile": { + "type": "string", + "default": "None", + "fa_icon": "fas fa-bed", + "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate a pileup for pileupCaller.", + "format": "file-path", + "description": "Specify the path to SNP panel in bed format for pileupCaller." + }, + "genotyping_pileupcaller_snpfile": { + "type": "string", + "default": "None", + "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/blob/master/CONVERTF/README) format, pileupCaller will call these sites.", + "fa_icon": "fas fa-sliders-h", + "format": "file-path", + "description": "Specify the path to SNP panel in EIGENSTRAT format for pileupCaller." + }, + "genotyping_pileupcaller_method": { + "type": "string", + "default": "randomHaploid", + "fa_icon": "fas fa-toolbox", + "description": "Specify the calling method to use.", + "help_text": "Specify the calling method to use.\n\n> Modifies pileupCaller parameter: `--randomHaploid` `--randomDiploid` `--majorityCall`", + "enum": ["randomHaploid", "randomDiploid", "majorityCall"] + }, + "genotyping_pileupcaller_transitions_mode": { + "type": "string", + "default": "AllSites", + "description": "Specify the calling mode for transitions.", + "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. \n\n> Modifies pileupCaller parameter: --skipTransitions --transitionsMissing", + "enum": ["AllSites", "TransitionsMissing", "SkipTransitions"], + "fa_icon": "fas fa-toggle-on" } }, - "fa_icon": "fas fa-sliders-h" + "fa_icon": "fas fa-sliders-h", + "help_text": "There are options for different genotypers (or genotype likelihood calculators) to be used. 
We suggest you read the documentation of each tool to find the ones that suit your needs.\n\nDocumentation for each tool:\n\n- GATK UnifiedGenotyper\n- GATK HaplotypeCaller\n- FreeBayes\n- ANGSD\n- sequenceTools pileupCaller\n\nGenotyping is performed per sample (i.e. after all types of libraries are merged), except for pileupCaller which gathers all double-stranded and single-stranded (same-type merged) libraries respectively.\nSome genotypers require additional files to be specified in the reference sheet, or using command line parameters. When using a reference sheet, only references with the required files specified in the respective columns will be used for genotyping. " }, "mitochondrial_to_nuclear_ratio": { "title": "Mitochondrial to Nuclear Ratio", @@ -1085,9 +1144,6 @@ { "$ref": "#/definitions/mapping" }, - { - "$ref": "#/definitions/adna_damage_analysis" - }, { "$ref": "#/definitions/bam_filtering" }, @@ -1098,25 +1154,25 @@ "$ref": "#/definitions/deduplication" }, { - "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" + "$ref": "#/definitions/damage_manipulation" }, { - "$ref": "#/definitions/mapping_statistics" + "$ref": "#/definitions/genotyping" }, { - "$ref": "#/definitions/damage_manipulation" + "$ref": "#/definitions/mitochondrial_to_nuclear_ratio" }, { - "$ref": "#/definitions/genotyping" + "$ref": "#/definitions/mapping_statistics" }, { "$ref": "#/definitions/adna_damage_analysis" }, { - "$ref": "#/definitions/contamination_estimation" + "$ref": "#/definitions/host_removal" }, { - "$ref": "#/definitions/host_removal" + "$ref": "#/definitions/contamination_estimation" } ] } From eb61d10d98d43162a08ca71db9982e37e800f188 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Jun 2023 16:26:48 +0200 Subject: [PATCH 002/110] genotyping parameter requirement checks --- workflows/eager.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/eager.nf b/workflows/eager.nf index c3239b949..f58f8ad1c 100644 --- 
a/workflows/eager.nf +++ b/workflows/eager.nf @@ -19,6 +19,8 @@ if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input sample // Check failing parameter combinations if ( params.bamfiltering_retainunmappedgenomicbam && params.bamfiltering_mappingquality > 0 ) { exit 1, ("[nf-core/eager] ERROR: You cannot both retain unmapped reads and perform quality filtering, as unmapped reads have a mapping quality of 0. Pick one or the other functionality.") } +if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } +if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! 
params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } From 1bf352973ad382084c534ff68ad525c8e2c3642c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 30 Jun 2023 11:21:40 +0200 Subject: [PATCH 003/110] Add required modules --- modules.json | 10 +++ modules/nf-core/samtools/mpileup/main.nf | 37 +++++++++ modules/nf-core/samtools/mpileup/meta.yml | 52 +++++++++++++ .../sequencetools/pileupcaller/main.nf | 45 +++++++++++ .../sequencetools/pileupcaller/meta.yml | 77 +++++++++++++++++++ 5 files changed, 221 insertions(+) create mode 100644 modules/nf-core/samtools/mpileup/main.nf create mode 100644 modules/nf-core/samtools/mpileup/meta.yml create mode 100644 modules/nf-core/sequencetools/pileupcaller/main.nf create mode 100644 modules/nf-core/sequencetools/pileupcaller/meta.yml diff --git a/modules.json b/modules.json index 566c4dbd2..55b043caf 100644 --- a/modules.json +++ b/modules.json @@ -180,6 +180,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "samtools/mpileup": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "samtools/sort": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -189,6 +194,11 @@ "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules", "bam_split_by_region"] + }, + "sequencetools/pileupcaller": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/samtools/mpileup/main.nf b/modules/nf-core/samtools/mpileup/main.nf new file mode 100644 index 000000000..d77249841 --- /dev/null +++ b/modules/nf-core/samtools/mpileup/main.nf @@ -0,0 +1,37 @@ +process SAMTOOLS_MPILEUP { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ 
workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + input: + tuple val(meta), path(input), path(intervals) + path fasta + + output: + tuple val(meta), path("*.mpileup.gz"), emit: mpileup + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def intervals = intervals ? "-l ${intervals}" : "" + """ + samtools mpileup \\ + --fasta-ref $fasta \\ + --output ${prefix}.mpileup \\ + $args \\ + $intervals \\ + $input + bgzip ${prefix}.mpileup + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/mpileup/meta.yml b/modules/nf-core/samtools/mpileup/meta.yml new file mode 100644 index 000000000..7597ef41a --- /dev/null +++ b/modules/nf-core/samtools/mpileup/meta.yml @@ -0,0 +1,52 @@ +name: samtools_mpileup +description: BAM +keywords: + - mpileup + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - fasta: + type: file + description: FASTA reference file + pattern: "*.{fasta,fa}" + - intervals: + type: file + description: Interval FILE + pattern: "*.bed" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - mpileup: + type: file + description: mpileup file + pattern: "*.{mpileup}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" diff --git a/modules/nf-core/sequencetools/pileupcaller/main.nf b/modules/nf-core/sequencetools/pileupcaller/main.nf new file mode 100644 index 000000000..d09dfc1b6 --- /dev/null +++ b/modules/nf-core/sequencetools/pileupcaller/main.nf @@ -0,0 +1,45 @@ +process SEQUENCETOOLS_PILEUPCALLER { + tag "$meta.id" + label 'process_low' + + conda "bioconda::sequencetools=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sequencetools:1.5.2--hec16e2b_1': + 'biocontainers/sequencetools:1.5.2--hec16e2b_1' }" + + input: + tuple val(meta), path(mpileup) + path snpfile + path sample_names_fn + + output: + tuple val(meta), path("*.geno"), path("*.snp"), path("*.ind") , optional:true, emit: eigenstrat + tuple val(meta), path("*.bed"), path("*.bim"), path("*.fam") , optional:true, emit: plink + tuple val(meta), path("*.freqsum.gz") , optional:true, emit: freqsum + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sample_names = sample_names_fn ? "--sampleNameFile ${sample_names_fn}" : '' + def args_list = args.tokenize() + // If no output format is set, freqsum is produced in stdout. 
+ freqsum_output = "-e" in args_list || "--eigenstratOut" in args_list || "-p" in args_list || "--plinkOut" in args_list ? '' : "| gzip -c > ${prefix}.freqsum.gz" + + """ + gzip -cdf ${mpileup} | \\ + pileupCaller \\ + -f ${snpfile} \\ + ${sample_names} \\ + ${args} \\ + ${freqsum_output} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sequencetools: \$(echo \$(pileupCaller --version 2>&1) ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sequencetools/pileupcaller/meta.yml b/modules/nf-core/sequencetools/pileupcaller/meta.yml new file mode 100644 index 000000000..7ab10cde2 --- /dev/null +++ b/modules/nf-core/sequencetools/pileupcaller/meta.yml @@ -0,0 +1,77 @@ +name: "sequencetools_pileupcaller" +description: PileupCaller is a tool to create genotype calls from bam files using read-sampling methods +keywords: + - genotyping + - mpileup + - random draw + - pseudohaploid + - pseudodiploid + - freqsum + - plink + - bed + - eigenstrat +tools: + - "sequencetools": + description: "Tools for population genetics on sequencing data" + homepage: "https://github.com/stschiff/sequenceTools" + documentation: "https://github.com/stschiff/sequenceTools#readme" + tool_dev_url: "https://github.com/stschiff/sequenceTools" + + licence: "['MIT']" + +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + + - mpileup: + type: file + description: samtools mpileup output. + + - snpfile: + type: file + description: | + Eigenstrat format .snp file of the sites in the mpileup file to call genotypes on. + Only alleles matching the Ref and Alt alleles of the provided snp file will be called. + + - calling_method: + type: value + description: The desired calling method for pileupcaller. One of 'randomHaploid', 'randomDiploid', or 'majorityCall'. + + - output_format: + type: value + description: The desired output format. One of 'PLINK', 'EIGENSTRAT', or 'FREQSUM'. 
+ +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + + - eigenstrat: + type: file + description: A tuple containing the output Eigenstrat-formatted geno, snp and ind files. + pattern: "*.{geno,snp,ind}.txt" + + - plink: + type: file + description: A tuple containing the output Plink-formatted bed, bim and fam files. + pattern: "*.{bed,bim,fam}" + + - freqsum: + type: file + description: The output freqsum-formatted file. + pattern: "*.freqsum.gz" + +authors: + - "@TCLamnidis" From 05f1b13c5b291643d9e4791f27d1107314ec5cd2 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 30 Jun 2023 11:53:35 +0200 Subject: [PATCH 004/110] Install modules --- modules.json | 20 +++++ .../eigenstratsnpcoverage/main.nf | 37 +++++++++ .../eigenstratsnpcoverage/meta.yml | 57 +++++++++++++ modules/nf-core/freebayes/main.nf | 51 ++++++++++++ modules/nf-core/freebayes/meta.yml | 82 ++++++++++++++++++ modules/nf-core/gatk/unifiedgenotyper/main.nf | 63 ++++++++++++++ .../nf-core/gatk/unifiedgenotyper/meta.yml | 74 +++++++++++++++++ modules/nf-core/gatk4/haplotypecaller/main.nf | 75 +++++++++++++++++ .../nf-core/gatk4/haplotypecaller/meta.yml | 83 +++++++++++++++++++ workflows/eager.nf | 12 +++ 10 files changed, 554 insertions(+) create mode 100644 modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf create mode 100644 modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml create mode 100644 modules/nf-core/freebayes/main.nf create mode 100644 modules/nf-core/freebayes/meta.yml create mode 100644 modules/nf-core/gatk/unifiedgenotyper/main.nf create mode 100644 modules/nf-core/gatk/unifiedgenotyper/meta.yml create mode 100644 modules/nf-core/gatk4/haplotypecaller/main.nf create mode 100644 modules/nf-core/gatk4/haplotypecaller/meta.yml 
diff --git a/modules.json b/modules.json index 55b043caf..f44578c97 100644 --- a/modules.json +++ b/modules.json @@ -85,6 +85,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "eigenstratdatabasetools/eigenstratsnpcoverage": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "falco": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -100,6 +105,21 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "freebayes": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "gatk/unifiedgenotyper": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "gatk4/haplotypecaller": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "gunzip": { "branch": "master", "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", diff --git a/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf new file mode 100644 index 000000000..66d439a47 --- /dev/null +++ b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf @@ -0,0 +1,37 @@ +process EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::eigenstratdatabasetools=1.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/eigenstratdatabasetools:1.1.0--hdfd78af_0': + 'biocontainers/eigenstratdatabasetools:1.1.0--hdfd78af_0' }" + + input: + tuple val(meta), path(geno), path(snp), path(ind) + + output: + tuple val(meta), path("*.tsv") , emit: tsv + tuple val(meta), path("*.json"), emit: json, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + eigenstrat_snp_coverage \\ + $args \\ + -g ${geno} \\ + -s ${snp} \\ + -i ${ind} \\ + -o ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eigenstratdatabasetools: \$(echo \$(eigenstrat_snp_coverage --version 2>&1) | sed 's/^.*eigenstrat_snp_coverage //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml new file mode 100644 index 000000000..87eaab00c --- /dev/null +++ b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml @@ -0,0 +1,57 @@ +name: "eigenstratdatabasetools_eigenstratsnpcoverage" +description: Provide the SNP coverage of each individual in an eigenstrat formatted dataset. +keywords: + - coverage + - eigenstrat + - eigenstratdatabasetools + - snp + - snps +tools: + - "eigenstratdatabasetools": + description: "A set of tools to compare and manipulate the contents of EigenStrat databases, and to calculate SNP coverage statistics in such databases." + + documentation: "https://github.com/TCLamnidis/EigenStratDatabaseTools/README.md" + tool_dev_url: "https://github.com/TCLamnidis/EigenStratDatabaseTools" + + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - geno: + type: file + description: An Eigenstrat formatted genotype file + pattern: "*.{geno}" + - snp: + type: file + description: An Eigenstrat formatted snp file + pattern: "*.{snp}" + - ind: + type: file + description: An Eigenstrat formatted individual file + pattern: "*.{ind}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tsv: + type: file + description: A TSV table with the number of covered SNPs per individual. + pattern: "*.{tsv}" + - json: + type: file + description: A json table with the number of covered SNPs per individual. + pattern: "*.{json}" + +authors: + - "@TCLamnidis" diff --git a/modules/nf-core/freebayes/main.nf b/modules/nf-core/freebayes/main.nf new file mode 100644 index 000000000..1466f085e --- /dev/null +++ b/modules/nf-core/freebayes/main.nf @@ -0,0 +1,51 @@ +process FREEBAYES { + tag "$meta.id" + label 'process_single' + + conda "bioconda::freebayes=1.3.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/freebayes:1.3.6--hbfe0e7f_2' : + 'biocontainers/freebayes:1.3.6--hbfe0e7f_2' }" + + input: + tuple val(meta), path(input_1), path(input_1_index), path(input_2), path(input_2_index), path(target_bed) + path fasta + path fasta_fai + path samples + path populations + path cnv + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = input_2 ? "${input_1} ${input_2}" : "${input_1}" + def targets_file = target_bed ? "--target ${target_bed}" : "" + def samples_file = samples ? 
"--samples ${samples}" : "" + def populations_file = populations ? "--populations ${populations}" : "" + def cnv_file = cnv ? "--cnv-map ${cnv}" : "" + + """ + freebayes \\ + -f $fasta \\ + $targets_file \\ + $samples_file \\ + $populations_file \\ + $cnv_file \\ + $args \\ + $input > ${prefix}.vcf + + bgzip ${prefix}.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + freebayes: \$(echo \$(freebayes --version 2>&1) | sed 's/version:\s*v//g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/freebayes/meta.yml b/modules/nf-core/freebayes/meta.yml new file mode 100644 index 000000000..17d83cba2 --- /dev/null +++ b/modules/nf-core/freebayes/meta.yml @@ -0,0 +1,82 @@ +name: freebayes +description: A haplotype-based variant detector +keywords: + - variant caller + - SNP + - genotyping + - somatic variant calling + - germline variant calling + - bacterial variant calling + - bayesian + +tools: + - freebayes: + description: Bayesian haplotype-based polymorphism discovery and genotyping + homepage: https://github.com/freebayes/freebayes + documentation: https://github.com/freebayes/freebayes + tool_dev_url: https://github.com/freebayes/freebayes + doi: "10.48550/arXiv.1207.3907" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai}" + - target_bed: + type: file + description: Optional - Limit analysis to targets listed in this BED-format FILE. + pattern: "*.bed" + - fasta: + type: file + description: reference fasta file + pattern: ".{fa,fa.gz,fasta,fasta.gz}" + - fasta_fai: + type: file + description: reference fasta file index + pattern: "*.{fa,fasta}.fai" + - samples: + type: file + description: Optional - Limit analysis to samples listed (one per line) in the FILE. 
+ pattern: "*.txt" + - populations: + type: file + description: Optional - Each line of FILE should list a sample and a population which it is part of. + pattern: "*.txt" + - cnv: + type: file + description: | + A copy number map BED file, which has either a sample-level ploidy: + sample_name copy_number + or a region-specific format: + seq_name start end sample_name copy_number + pattern: "*.bed" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" + - vcf: + type: file + description: Compressed VCF file + pattern: "*.vcf.gz" + +authors: + - "@maxibor" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gatk/unifiedgenotyper/main.nf b/modules/nf-core/gatk/unifiedgenotyper/main.nf new file mode 100644 index 000000000..fceb25e48 --- /dev/null +++ b/modules/nf-core/gatk/unifiedgenotyper/main.nf @@ -0,0 +1,63 @@ +process GATK_UNIFIEDGENOTYPER { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gatk=3.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk:3.5--hdfd78af_11': + 'biocontainers/gatk:3.5--hdfd78af_11' }" + + input: + tuple val(meta), path(input), path(index) + path fasta + path fai + path dict + path intervals + path contamination + path dbsnp + path comp + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def contamination_file = contamination ? "-contaminationFile ${contamination}" : "" + def dbsnp_file = dbsnp ? "--dbsnp ${dbsnp}" : "" + def comp_file = comp ? "--comp ${comp}" : "" + def intervals_file = intervals ? 
"--intervals ${intervals}" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK RealignerTargetCreator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + """ + gatk3 \\ + -Xmx${avail_mem}M \\ + -nt ${task.cpus} \\ + -T UnifiedGenotyper \\ + -I ${input} \\ + -R ${fasta} \\ + ${contamination_file} \\ + ${dbsnp_file} \\ + ${comp_file} \\ + ${intervals_file} \\ + -o ${prefix}.vcf \\ + $args + + gzip -n *.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk: \$(echo \$(gatk3 --version)) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk/unifiedgenotyper/meta.yml b/modules/nf-core/gatk/unifiedgenotyper/meta.yml new file mode 100644 index 000000000..fc40bd636 --- /dev/null +++ b/modules/nf-core/gatk/unifiedgenotyper/meta.yml @@ -0,0 +1,74 @@ +name: "gatk_unifiedgenotyper" +description: SNP and Indel variant caller on a per-locus basis +keywords: + - bam + - vcf + - variant calling +tools: + - "gatk": + description: "The full Genome Analysis Toolkit (GATK) framework, license restricted." + homepage: "https://gatk.broadinstitute.org/hc/en-us" + documentation: "https://github.com/broadinstitute/gatk-docs" + licence: "['https://software.broadinstitute.org/gatk/download/licensing', 'BSD', 'https://www.broadinstitute.org/gatk/about/#licensing']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: Sorted and indexed BAM/CRAM/SAM file + pattern: "*.bam" + - index: + type: file + description: BAM index file + pattern: "*.bai" + - fasta: + type: file + description: Reference file used to generate BAM file + pattern: ".{fasta,fa,fna}" + - fai: + type: file + description: Index of reference file used to generate BAM file + pattern: ".fai" + - dict: + type: file + description: GATK dict file for reference + pattern: ".dict" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + pattern: "*.intervals" + - contamination: + type: file + description: Tab-separated file containing fraction of contamination in sequencing data (per sample) to aggressively remove + pattern: "*" + - dbsnps: + type: file + description: VCF file containing known sites (optional) + pattern: "*" + - comp: + type: file + description: Comparison VCF file (optional) + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: VCF file containing called variants + pattern: "*.vcf.gz" + +authors: + - "@ilight1542" + - "@jfy133" diff --git a/modules/nf-core/gatk4/haplotypecaller/main.nf b/modules/nf-core/gatk4/haplotypecaller/main.nf new file mode 100644 index 000000000..478681bd1 --- /dev/null +++ b/modules/nf-core/gatk4/haplotypecaller/main.nf @@ -0,0 +1,75 @@ +process GATK4_HAPLOTYPECALLER { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::gatk4=4.4.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(intervals), path(dragstr_model) + path fasta + path fai + path dict + path dbsnp + path dbsnp_tbi + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.tbi") , optional:true, emit: tbi + tuple val(meta), path("*.realigned.bam"), optional:true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def dbsnp_command = dbsnp ? "--dbsnp $dbsnp" : "" + def interval_command = intervals ? "--intervals $intervals" : "" + def dragstr_command = dragstr_model ? "--dragstr-params-path $dragstr_model" : "" + def bamout_command = args.contains("--bam-writer-type") ? "--bam-output ${prefix.replaceAll('.g\\s*$', '')}.realigned.bam" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK HaplotypeCaller] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M" HaplotypeCaller \\ + --input $input \\ + --output ${prefix}.vcf.gz \\ + --reference $fasta \\ + $dbsnp_command \\ + $interval_command \\ + $dragstr_command \\ + $bamout_command \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bamout_command = args.contains("--bam-writer-type") ? "--bam-output ${prefix.replaceAll('.g\\s*$', '')}.realigned.bam" : "" + + def stub_realigned_bam = bamout_command ? 
"touch ${prefix.replaceAll('.g\\s*$', '')}.realigned.bam" : "" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + ${stub_realigned_bam} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/haplotypecaller/meta.yml b/modules/nf-core/gatk4/haplotypecaller/meta.yml new file mode 100644 index 000000000..27633cca6 --- /dev/null +++ b/modules/nf-core/gatk4/haplotypecaller/meta.yml @@ -0,0 +1,83 @@ +name: gatk4_haplotypecaller +description: Call germline SNPs and indels via local re-assembly of haplotypes +keywords: + - gatk4 + - haplotypecaller + - haplotype +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - dragstr_model: + type: file + description: Text file containing the DragSTR model of the used BAM/CRAM file (optional) + pattern: "*.txt" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - dbsnp: + type: file + description: VCF file containing known sites (optional) + - dbsnp_tbi: + type: file + description: VCF index of dbsnp (optional) + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: Compressed VCF file + pattern: "*.vcf.gz" + - tbi: + type: file + description: Index of VCF file + pattern: "*.vcf.gz.tbi" + - bam: + type: file + description: Assembled haplotypes and locally realigned reads + pattern: "*.realigned.bam" + +authors: + - "@suzannejin" + - "@FriederikeHanssen" diff --git a/workflows/eager.nf b/workflows/eager.nf index f58f8ad1c..a414327a8 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -75,6 +75,7 @@ include { MANIPULATE_DAMAGE } from '../subworkflows/local/manipulate include { METAGENOMICS_COMPLEXITYFILTER } from '../subworkflows/local/metagenomics_complexityfilter' include { ESTIMATE_CONTAMINATION } from '../subworkflows/local/estimate_contamination' include { CALCULATE_DAMAGE } from '../subworkflows/local/calculate_damage' +include { GENOTYPE } from 
'../subworkflows/local/genotype' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -385,6 +386,17 @@ workflow EAGER { ch_bams_for_genotyping = ch_dedupped_bams } + // + // SUBWORKFLOW: Genotyping + // + + if ( params.run_genotyping ) { + GENOTYPE( ch_genotyping_input, ch_fasta ) + + ch_versions = ch_versions.mix( GENOTYPE.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + // // MODULE: MultiQC // From 1b82eaa2944644bace1bfbf20069350878330d06 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 3 Jul 2023 19:54:29 +0200 Subject: [PATCH 005/110] wip genotyping --- subworkflows/local/genotype.nf | 65 ++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 subworkflows/local/genotype.nf diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf new file mode 100644 index 000000000..2441e1e68 --- /dev/null +++ b/subworkflows/local/genotype.nf @@ -0,0 +1,65 @@ +// +// Genotype the input data using the requested genotyper. +// + +include { SAMTOOLS_MPILEUP as SAMTOOLS_MPILEUP_PILEUPCALLER } from '../modules/nf-core/samtools/mpileup/main' +include { EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE } from '../modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main' +include { SEQUENCETOOLS_PILEUPCALLER } from '../modules/nf-core/sequencetools/pileupcaller/main' +include { GATK_UNIFIEDGENOTYPER } from '../modules/nf-core/gatk/unifiedgenotyper/main' +include { GATK4_HAPLOTYPECALLER } from '../modules/nf-core/gatk4/haplotypecaller/main' +include { FREEBAYES } from '../modules/nf-core/freebayes/main' +// TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. 
+ +workflow GENOTYPE { + take: + ch_bam_bai // [ [ meta ], bam , bai ] + ch_fasta // [ [ meta ], fasta ] + ch_snpcapture_bed // [ [ meta ], bed ] + ch_pileupcaller_bedfile // [ [ meta ], bed ] + ch_pileupcaller_snpfile // [ [ meta ], snp ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_pileupcaller_genotypes = Channel.empty() + ch_gatk_haplotypecaller_genotypes = Channel.empty() + ch_gatk_unifiedgenotyper_genotypes = Channel.empty() + ch_freebayes_genotypes = Channel.empty() + ch_angsd_genotypes = Channel.empty() + + + if ( params.genotyping_tool == 'pileupcaller' ) { + SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta ) + + /* + // TODO - this is not working yet. Need snpcapture Bed and pileupcaller snp file to add here. + SEQUENCETOOLS_PILEUPCALLER( ch_bam_bai, ch_fasta, ch_versions, ch_multiqc_files ) + */ + } + + if ( params.genotyping_tool == 'unifiedgenotyper' ) { + // TODO + } + + if ( params.genotyping_tool == 'haplotypecaller' ) { + // TODO + } + + if ( params.genotyping_tool == 'freebayes' ) { + // TODO + } + + if ( params.genotyping_tool == 'angsd' ) { + // TODO + } + + emit: + pileupcaller_genotypes = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] + gatk_haplotypecaller_genotypes = ch_gatk_haplotypecaller_genotypes // [ [ meta ], vcf ] ] + gatk_unifiedgenotyper_genotypes = ch_gatk_unifiedgenotyper_genotypes // [ [ meta ], vcf ] ] + freebayes_genotypes = ch_freebayes_genotypes // [ [ meta ], vcf ] ] + angsd_genotypes = ch_angsd_genotypes // [ [ meta ], glf ] ] + versions = ch_versions + mqc = ch_multiqc_files + +} From 0e8b3c702e9063f0fa387642261fcd65831a5d74 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 14 Jul 2023 11:34:17 +0200 Subject: [PATCH 006/110] Install GATK UG modules --- modules.json | 10 +++ modules/nf-core/gatk/indelrealigner/main.nf | 54 ++++++++++++++ modules/nf-core/gatk/indelrealigner/meta.yml | 71 +++++++++++++++++++ .../gatk/realignertargetcreator/main.nf | 53 
++++++++++++++ .../gatk/realignertargetcreator/meta.yml | 64 +++++++++++++++++ subworkflows/local/genotype.nf | 16 +++-- 6 files changed, 261 insertions(+), 7 deletions(-) create mode 100644 modules/nf-core/gatk/indelrealigner/main.nf create mode 100644 modules/nf-core/gatk/indelrealigner/meta.yml create mode 100644 modules/nf-core/gatk/realignertargetcreator/main.nf create mode 100644 modules/nf-core/gatk/realignertargetcreator/meta.yml diff --git a/modules.json b/modules.json index f44578c97..66c6deb73 100644 --- a/modules.json +++ b/modules.json @@ -110,6 +110,16 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "gatk/indelrealigner": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "gatk/realignertargetcreator": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "gatk/unifiedgenotyper": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/nf-core/gatk/indelrealigner/main.nf b/modules/nf-core/gatk/indelrealigner/main.nf new file mode 100644 index 000000000..6a057ae8e --- /dev/null +++ b/modules/nf-core/gatk/indelrealigner/main.nf @@ -0,0 +1,54 @@ +process GATK_INDELREALIGNER { + tag "$meta.id" + label 'process_single' + + conda "bioconda::gatk=3.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gatk:3.5--hdfd78af_11': + 'biocontainers/gatk:3.5--hdfd78af_11' }" + + input: + tuple val(meta), path(bam), path(bai), path(intervals) + path(fasta) + path(fai) + path(dict) + path(known_vcf) + + output: + tuple val(meta), path("*.bam"), path("*.bai"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def known = known_vcf ? "-known ${known_vcf}" : "" + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK IndelRealigner] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + """ + gatk3 \\ + -Xmx${avail_mem}M \\ + -T IndelRealigner \\ + -R ${fasta} \\ + -I ${bam} \\ + --targetIntervals ${intervals} \\ + ${known} \\ + -o ${prefix}.bam \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk: \$(echo \$(gatk3 --version)) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk/indelrealigner/meta.yml b/modules/nf-core/gatk/indelrealigner/meta.yml new file mode 100644 index 000000000..35ad28e8e --- /dev/null +++ b/modules/nf-core/gatk/indelrealigner/meta.yml @@ -0,0 +1,71 @@ +name: "gatk_indelrealigner" +description: Performs local realignment around indels to correct for mapping errors +keywords: + - bam + - vcf + - variant calling + - indel + - realignment +tools: + - "gatk": + description: "The full Genome Analysis Toolkit (GATK) framework, license restricted." 
+ homepage: "https://gatk.broadinstitute.org/hc/en-us" + documentation: "https://github.com/broadinstitute/gatk-docs" + licence: "['https://software.broadinstitute.org/gatk/download/licensing', 'BSD', 'https://www.broadinstitute.org/gatk/about/#licensing']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted and indexed BAM file + pattern: "*.bam" + - bai: + type: file + description: BAM index file + pattern: "*.bai" + - intervals: + type: file + description: Intervals file created by gatk3 RealignerTargetCreator + pattern: "*.{intervals,list}" + - fasta: + type: file + description: Reference file used to generate BAM file + pattern: ".{fasta,fa,fna}" + - fai: + type: file + description: Index of reference file used to generate BAM file + pattern: ".fai" + - dict: + type: file + description: GATK dict file for reference + pattern: ".dict" + - known_vcf: + type: file + description: Optional input VCF file(s) with known indels + pattern: ".vcf" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Sorted and indexed BAM file with local realignment around variants + pattern: "*.bam" + - bai: + type: file + description: Output BAM Index file + pattern: "*.bai" + +authors: + - "@jfy133" diff --git a/modules/nf-core/gatk/realignertargetcreator/main.nf b/modules/nf-core/gatk/realignertargetcreator/main.nf new file mode 100644 index 000000000..a4866417e --- /dev/null +++ b/modules/nf-core/gatk/realignertargetcreator/main.nf @@ -0,0 +1,53 @@ +process GATK_REALIGNERTARGETCREATOR { + tag "$meta.id" + label 'process_low' + + conda "bioconda::gatk=3.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk:3.5--hdfd78af_11': + 'biocontainers/gatk:3.5--hdfd78af_11' }" + + input: + tuple val(meta), path(input), path(index) + path fasta + path fai + path dict + path known_vcf + + output: + tuple val(meta), path("*.intervals"), emit: intervals + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def known = known_vcf ? "-known ${known_vcf}" : "" + if ("$input" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK RealignerTargetCreator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' 
+ } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + """ + gatk3 \\ + -Xmx${avail_mem}M \\ + -T RealignerTargetCreator \\ + -nt ${task.cpus} \\ + -I ${input} \\ + -R ${fasta} \\ + -o ${prefix}.intervals \\ + ${known} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk: \$(echo \$(gatk3 --version)) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk/realignertargetcreator/meta.yml b/modules/nf-core/gatk/realignertargetcreator/meta.yml new file mode 100644 index 000000000..c49d2a8d9 --- /dev/null +++ b/modules/nf-core/gatk/realignertargetcreator/meta.yml @@ -0,0 +1,64 @@ +name: "gatk_realignertargetcreator" +description: Generates a list of locations that should be considered for local realignment prior genotyping. +keywords: + - bam + - vcf + - variant calling + - indel + - realignment + - targets +tools: + - "gatk": + description: "The full Genome Analysis Toolkit (GATK) framework, license restricted." + homepage: "https://gatk.broadinstitute.org/hc/en-us" + documentation: "https://github.com/broadinstitute/gatk-docs" + licence: "['https://software.broadinstitute.org/gatk/download/licensing', 'BSD', 'https://www.broadinstitute.org/gatk/about/#licensing']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: Sorted and indexed BAM/CRAM/SAM file + pattern: "*.bam" + - index: + type: file + description: BAM index file + pattern: "*.bai" + - fasta: + type: file + description: Reference file used to generate BAM file + pattern: ".{fasta,fa,fna}" + - fai: + type: file + description: Index of reference file used to generate BAM file + pattern: ".fai" + - dict: + type: file + description: GATK dict file for reference + pattern: ".dict" + - known_vcf: + type: file + description: Optional input VCF file(s) with known indels + pattern: ".vcf" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - intervals: + type: file + description: File containg intervals that represent sites of extant and potential indels. + pattern: "*.intervals" + +authors: + - "@jfy133" diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 2441e1e68..73d52bf79 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -2,12 +2,14 @@ // Genotype the input data using the requested genotyper. 
// -include { SAMTOOLS_MPILEUP as SAMTOOLS_MPILEUP_PILEUPCALLER } from '../modules/nf-core/samtools/mpileup/main' -include { EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE } from '../modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main' -include { SEQUENCETOOLS_PILEUPCALLER } from '../modules/nf-core/sequencetools/pileupcaller/main' -include { GATK_UNIFIEDGENOTYPER } from '../modules/nf-core/gatk/unifiedgenotyper/main' -include { GATK4_HAPLOTYPECALLER } from '../modules/nf-core/gatk4/haplotypecaller/main' -include { FREEBAYES } from '../modules/nf-core/freebayes/main' +include { SAMTOOLS_MPILEUP as SAMTOOLS_MPILEUP_PILEUPCALLER } from '../../modules/nf-core/samtools/mpileup/main' +include { EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE } from '../../modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main' +include { SEQUENCETOOLS_PILEUPCALLER } from '../../modules/nf-core/sequencetools/pileupcaller/main' +include { GATK_INDELREALIGNER } from '../../modules/nf-core/gatk/indelrealigner/main' +include { GATK_REALIGNERTARGETCREATOR } from '../../modules/nf-core/gatk/realignertargetcreator/main' +include { GATK_UNIFIEDGENOTYPER } from '../../modules/nf-core/gatk/unifiedgenotyper/main' +include { GATK4_HAPLOTYPECALLER } from '../../modules/nf-core/gatk4/haplotypecaller/main' +include { FREEBAYES } from '../../modules/nf-core/freebayes/main' // TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. workflow GENOTYPE { @@ -29,7 +31,7 @@ workflow GENOTYPE { if ( params.genotyping_tool == 'pileupcaller' ) { - SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta ) + // SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta ) /* // TODO - this is not working yet. Need snpcapture Bed and pileupcaller snp file to add here. 
From c0cd49c4e5107caf4b2686d2dfd519529e80aa05 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 14 Jul 2023 12:01:11 +0200 Subject: [PATCH 007/110] started adding GATK_UG --- subworkflows/local/genotype.nf | 43 ++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 73d52bf79..66b2bfac7 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -5,8 +5,8 @@ include { SAMTOOLS_MPILEUP as SAMTOOLS_MPILEUP_PILEUPCALLER } from '../../modules/nf-core/samtools/mpileup/main' include { EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE } from '../../modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main' include { SEQUENCETOOLS_PILEUPCALLER } from '../../modules/nf-core/sequencetools/pileupcaller/main' -include { GATK_INDELREALIGNER } from '../../modules/nf-core/gatk/indelrealigner/main' include { GATK_REALIGNERTARGETCREATOR } from '../../modules/nf-core/gatk/realignertargetcreator/main' +include { GATK_INDELREALIGNER } from '../../modules/nf-core/gatk/indelrealigner/main' include { GATK_UNIFIEDGENOTYPER } from '../../modules/nf-core/gatk/unifiedgenotyper/main' include { GATK4_HAPLOTYPECALLER } from '../../modules/nf-core/gatk4/haplotypecaller/main' include { FREEBAYES } from '../../modules/nf-core/freebayes/main' @@ -40,7 +40,46 @@ workflow GENOTYPE { } if ( params.genotyping_tool == 'unifiedgenotyper' ) { - // TODO + // Use correct reference for each input bam/bai pair. 
+ ch_bams_for_multimap = ch_bam_bai + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + + ch_fasta_for_multimap = ch_fasta + .map { + // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + } + + ch_input_for_targetcreator = ch_bams_for_multimap + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, bam, bai, ref_meta, fasta -> + bam: [ meta, bam ] + fasta: fasta // no meta needed for fasta in GATK_* modules + } + + GATK_REALIGNERTARGETCREATOR( ch_input_for_targetcreator.bam, ch_input_for_targetcreator.fasta ) + ch_versions = ch_versions.mix( GATK_REALIGNERTARGETCREATOR.out.versions.first() ) + + ch_input_for_indelrealigner = ch_bam_bai + // TODO can I join to ch_bams_for_multimap instead? + .join( GATK_REALIGNERTARGETCREATOR.out.intervals ) + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, bam, bai, ref_meta, fasta -> + bam: [ meta, bam ] + fasta: fasta // no meta needed for fasta in GATK_* modules + } + // TODO check that the channel manipulations work as intended and what gets given to INDELREALIGNER + GATK_INDELREALIGNER( ch_input_for_gatk_ug.bam, ch_input_for_gatk_ug.fasta, GATK_REALIGNERTARGETCREATOR.out.intervals ) + // GATK_UNIFIEDGENOTYPER() } if ( params.genotyping_tool == 'haplotypecaller' ) { From 9e6b40731a2f07757d8137f77009cfd19e09ba1c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:07:32 +0200 Subject: [PATCH 008/110] Update gatk3 modules --- modules.json | 6 +-- modules/nf-core/gatk/indelrealigner/main.nf | 8 ++-- 
modules/nf-core/gatk/indelrealigner/meta.yml | 20 ++++++++++ .../gatk/realignertargetcreator/main.nf | 14 +++---- .../gatk/realignertargetcreator/meta.yml | 24 +++++++++++- modules/nf-core/gatk/unifiedgenotyper/main.nf | 18 ++++----- .../nf-core/gatk/unifiedgenotyper/meta.yml | 39 ++++++++++++++++++- 7 files changed, 102 insertions(+), 27 deletions(-) diff --git a/modules.json b/modules.json index 1a5c2a439..c7398e126 100644 --- a/modules.json +++ b/modules.json @@ -117,17 +117,17 @@ }, "gatk/indelrealigner": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "27902a200da5056a941cde0f15ec80878b5e837c", "installed_by": ["modules"] }, "gatk/realignertargetcreator": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "27902a200da5056a941cde0f15ec80878b5e837c", "installed_by": ["modules"] }, "gatk/unifiedgenotyper": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "27902a200da5056a941cde0f15ec80878b5e837c", "installed_by": ["modules"] }, "gatk4/haplotypecaller": { diff --git a/modules/nf-core/gatk/indelrealigner/main.nf b/modules/nf-core/gatk/indelrealigner/main.nf index 6a057ae8e..abcb245ca 100644 --- a/modules/nf-core/gatk/indelrealigner/main.nf +++ b/modules/nf-core/gatk/indelrealigner/main.nf @@ -9,10 +9,10 @@ process GATK_INDELREALIGNER { input: tuple val(meta), path(bam), path(bai), path(intervals) - path(fasta) - path(fai) - path(dict) - path(known_vcf) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dict) + tuple val(meta5), path(known_vcf) output: tuple val(meta), path("*.bam"), path("*.bai"), emit: bam diff --git a/modules/nf-core/gatk/indelrealigner/meta.yml b/modules/nf-core/gatk/indelrealigner/meta.yml index 35ad28e8e..6751f1117 100644 --- a/modules/nf-core/gatk/indelrealigner/meta.yml +++ b/modules/nf-core/gatk/indelrealigner/meta.yml @@ -31,18 +31,38 @@ input: type: file description: Intervals 
file created by gatk3 RealignerTargetCreator pattern: "*.{intervals,list}" + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - fasta: type: file description: Reference file used to generate BAM file pattern: ".{fasta,fa,fna}" + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - fai: type: file description: Index of reference file used to generate BAM file pattern: ".fai" + - meta4: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - dict: type: file description: GATK dict file for reference pattern: ".dict" + - meta5: + type: map + description: | + Groovy Map containing file meta-information for known_vcf. + e.g. [ id:'test', single_end:false ] - known_vcf: type: file description: Optional input VCF file(s) with known indels diff --git a/modules/nf-core/gatk/realignertargetcreator/main.nf b/modules/nf-core/gatk/realignertargetcreator/main.nf index a4866417e..623ac468f 100644 --- a/modules/nf-core/gatk/realignertargetcreator/main.nf +++ b/modules/nf-core/gatk/realignertargetcreator/main.nf @@ -8,11 +8,11 @@ process GATK_REALIGNERTARGETCREATOR { 'biocontainers/gatk:3.5--hdfd78af_11' }" input: - tuple val(meta), path(input), path(index) - path fasta - path fai - path dict - path known_vcf + tuple val(meta), path(bam), path(bai) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dict) + tuple val(meta5), path(known_vcf) output: tuple val(meta), path("*.intervals"), emit: intervals @@ -25,7 +25,7 @@ process GATK_REALIGNERTARGETCREATOR { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def known = known_vcf ? "-known ${known_vcf}" : "" - if ("$input" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" 
+ if ("$bam" == "${prefix}.bam") error "Input and output names are the same, set prefix in module configuration to disambiguate!" def avail_mem = 3072 if (!task.memory) { @@ -39,7 +39,7 @@ process GATK_REALIGNERTARGETCREATOR { -Xmx${avail_mem}M \\ -T RealignerTargetCreator \\ -nt ${task.cpus} \\ - -I ${input} \\ + -I ${bam} \\ -R ${fasta} \\ -o ${prefix}.intervals \\ ${known} \\ diff --git a/modules/nf-core/gatk/realignertargetcreator/meta.yml b/modules/nf-core/gatk/realignertargetcreator/meta.yml index c49d2a8d9..384c93e14 100644 --- a/modules/nf-core/gatk/realignertargetcreator/meta.yml +++ b/modules/nf-core/gatk/realignertargetcreator/meta.yml @@ -20,26 +20,46 @@ input: description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - input: + - bam: type: file description: Sorted and indexed BAM/CRAM/SAM file pattern: "*.bam" - - index: + - bai: type: file description: BAM index file pattern: "*.bai" + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - fasta: type: file description: Reference file used to generate BAM file pattern: ".{fasta,fa,fna}" + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - fai: type: file description: Index of reference file used to generate BAM file pattern: ".fai" + - meta4: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - dict: type: file description: GATK dict file for reference pattern: ".dict" + - meta5: + type: map + description: | + Groovy Map containing file meta-information for known_vcf. + e.g. 
[ id:'test', single_end:false ] - known_vcf: type: file description: Optional input VCF file(s) with known indels diff --git a/modules/nf-core/gatk/unifiedgenotyper/main.nf b/modules/nf-core/gatk/unifiedgenotyper/main.nf index fceb25e48..99e700a33 100644 --- a/modules/nf-core/gatk/unifiedgenotyper/main.nf +++ b/modules/nf-core/gatk/unifiedgenotyper/main.nf @@ -8,14 +8,14 @@ process GATK_UNIFIEDGENOTYPER { 'biocontainers/gatk:3.5--hdfd78af_11' }" input: - tuple val(meta), path(input), path(index) - path fasta - path fai - path dict - path intervals - path contamination - path dbsnp - path comp + tuple val(meta), path(bam), path(bai) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dict) + tuple val(meta5), path(intervals) + tuple val(meta6), path(contamination) + tuple val(meta7), path(dbsnp) + tuple val(meta8), path(comp) output: tuple val(meta), path("*.vcf.gz"), emit: vcf @@ -44,7 +44,7 @@ process GATK_UNIFIEDGENOTYPER { -Xmx${avail_mem}M \\ -nt ${task.cpus} \\ -T UnifiedGenotyper \\ - -I ${input} \\ + -I ${bam} \\ -R ${fasta} \\ ${contamination_file} \\ ${dbsnp_file} \\ diff --git a/modules/nf-core/gatk/unifiedgenotyper/meta.yml b/modules/nf-core/gatk/unifiedgenotyper/meta.yml index fc40bd636..f946411df 100644 --- a/modules/nf-core/gatk/unifiedgenotyper/meta.yml +++ b/modules/nf-core/gatk/unifiedgenotyper/meta.yml @@ -17,38 +17,73 @@ input: description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - input: + - bam: type: file description: Sorted and indexed BAM/CRAM/SAM file pattern: "*.bam" - - index: + - bai: type: file description: BAM index file pattern: "*.bai" + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - fasta: type: file description: Reference file used to generate BAM file pattern: ".{fasta,fa,fna}" + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test', single_end:false ] - fai: type: file description: Index of reference file used to generate BAM file pattern: ".fai" + - meta4: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - dict: type: file description: GATK dict file for reference pattern: ".dict" + - meta5: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] - intervals: type: file description: Bed file with the genomic regions included in the library (optional) pattern: "*.intervals" + - meta6: + type: map + description: | + Groovy Map containing file meta-information for the contamination file. + e.g. [ id:'test', single_end:false ] - contamination: type: file description: Tab-separated file containing fraction of contamination in sequencing data (per sample) to aggressively remove pattern: "*" + - meta7: + type: map + description: | + Groovy Map containing file meta-information for the dbsnps file. + e.g. [ id:'test', single_end:false ] - dbsnps: type: file description: VCF file containing known sites (optional) pattern: "*" + - meta8: + type: map + description: | + Groovy Map containing file meta-information for the VCF comparison file. + e.g. 
[ id:'test', single_end:false ] - comp: type: file description: Comparison VCF file (optional) From ecc4257266d642fe779eb508e2f8e40fc7273582 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:08:03 +0200 Subject: [PATCH 009/110] Add genotyping SWF --- workflows/eager.nf | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 2671a973b..cfd189c37 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -142,11 +142,11 @@ workflow EAGER { file(params.input) ) ch_versions = ch_versions.mix( INPUT_CHECK.out.versions ) - + // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ // ! There is currently no tooling to help you write a sample sheet schema - + // // SUBWORKFLOW: Indexing of reference files // @@ -438,12 +438,21 @@ workflow EAGER { ch_bams_for_genotyping = ch_dedupped_bams } + // TODO Merge library-level BAMs into sample-level BAMs for genotyping. + // // SUBWORKFLOW: Genotyping // if ( params.run_genotyping ) { - GENOTYPE( ch_genotyping_input, ch_fasta ) + ch_reference_for_genotyping = REFERENCE_INDEXING.out.reference + // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel. + .map{ + meta, fasta, fai, dict, mapindex, circular_target, mitochondrion -> + [ meta, fasta, fai, dict ] // dbsnp ] // TODO add dbsnp to reference sheet. 
+ } + + GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [] ) ch_versions = ch_versions.mix( GENOTYPE.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) From 5ca504384419a7cfe56caab8b52723dcff3439a3 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:08:22 +0200 Subject: [PATCH 010/110] work on gatk ug --- subworkflows/local/genotype.nf | 59 +++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 66b2bfac7..da2767598 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -15,7 +15,7 @@ include { FREEBAYES } from '../../module workflow GENOTYPE { take: ch_bam_bai // [ [ meta ], bam , bai ] - ch_fasta // [ [ meta ], fasta ] + ch_fasta_plus // [ [ meta ], fasta, fai, dict ] // TODO add dbSNP ch_snpcapture_bed // [ [ meta ], bed ] ch_pileupcaller_bedfile // [ [ meta ], bed ] ch_pileupcaller_snpfile // [ [ meta ], snp ] @@ -31,15 +31,15 @@ workflow GENOTYPE { if ( params.genotyping_tool == 'pileupcaller' ) { - // SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta ) + // SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta_plus ) /* // TODO - this is not working yet. Need snpcapture Bed and pileupcaller snp file to add here. - SEQUENCETOOLS_PILEUPCALLER( ch_bam_bai, ch_fasta, ch_versions, ch_multiqc_files ) + SEQUENCETOOLS_PILEUPCALLER( ch_bam_bai, ch_fasta_plus, ch_versions, ch_multiqc_files ) */ } - if ( params.genotyping_tool == 'unifiedgenotyper' ) { + if ( params.genotyping_tool == 'ug' ) { // Use correct reference for each input bam/bai pair. 
ch_bams_for_multimap = ch_bam_bai .map { @@ -47,7 +47,7 @@ workflow GENOTYPE { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) } - ch_fasta_for_multimap = ch_fasta + ch_fasta_for_multimap = ch_fasta_plus .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) @@ -56,16 +56,36 @@ workflow GENOTYPE { ch_input_for_targetcreator = ch_bams_for_multimap .combine( ch_fasta_for_multimap , by:0 ) .multiMap { - ignore_me, meta, bam, bai, ref_meta, fasta -> - bam: [ meta, bam ] - fasta: fasta // no meta needed for fasta in GATK_* modules + ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict -> // TODO add dbSNP + bam: [ meta, bam , bai ] + fasta: [ ref_meta, fasta ] + fai: [ ref_meta, fai ] + dict: [ ref_meta, dict ] } - GATK_REALIGNERTARGETCREATOR( ch_input_for_targetcreator.bam, ch_input_for_targetcreator.fasta ) + GATK_REALIGNERTARGETCREATOR( ch_input_for_targetcreator.bam, ch_input_for_targetcreator.fasta, ch_input_for_targetcreator.fai, ch_input_for_targetcreator.dict, [[], []] ) ch_versions = ch_versions.mix( GATK_REALIGNERTARGETCREATOR.out.versions.first() ) + + // ch_input_for_indelrealigner = ch_bam_bai + // // TODO can I join to ch_bams_for_multimap instead? or will that break ordering? 
+ // .join( GATK_REALIGNERTARGETCREATOR.out.intervals ) + // .map { + // // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + // WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + // } + // ch_input_for_indelrealigner = ch_bams_for_multimap + // .combine( ch_fasta_for_multimap , by:0 ) + // .multiMap { + // ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, mapindex, circular_target, mitochondrion -> + // bam: [ meta, bam, bai ] + // fasta: fasta // no meta needed for fasta in GATK_* modules // TODO add meta once modules get updated [ ref_meta, fasta ] + // fai: fai // no meta needed for fai in GATK_* modules // TODO add meta once modules get updated [ ref_meta, fai ] + // dict: dict // no meta needed for dict in GATK_* modules // TODO add meta once modules get updated [ ref_meta, dict ] + // } + + // Join the bam/bai pairs to the intervals file, then redo multiMap to get the correct ordering for each bam/reference/intervals set. ch_input_for_indelrealigner = ch_bam_bai - // TODO can I join to ch_bams_for_multimap instead? 
.join( GATK_REALIGNERTARGETCREATOR.out.intervals ) .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute @@ -73,16 +93,23 @@ workflow GENOTYPE { } .combine( ch_fasta_for_multimap , by:0 ) .multiMap { - ignore_me, meta, bam, bai, ref_meta, fasta -> - bam: [ meta, bam ] - fasta: fasta // no meta needed for fasta in GATK_* modules + ignore_me, meta, bam, bai, intervals, ref_meta, fasta, fai, dict -> // TODO add dbSNP + bam: [ meta, bam, bai, intervals ] + fasta: [ ref_meta, fasta ] + fai: [ ref_meta, fai ] + dict: [ ref_meta, dict ] } - // TODO check that the channel manipulations work as intended and what gets given to INDELREALIGNER - GATK_INDELREALIGNER( ch_input_for_gatk_ug.bam, ch_input_for_gatk_ug.fasta, GATK_REALIGNERTARGETCREATOR.out.intervals ) + + GATK_INDELREALIGNER( ch_input_for_indelrealigner.bam, ch_input_for_indelrealigner.fasta, ch_input_for_indelrealigner.fai, ch_input_for_indelrealigner.dict, [[], []] ) + ch_versions = ch_versions.mix( GATK_INDELREALIGNER.out.versions.first() ) // TODO is this actually needed, since all GATK modules have the same version? + + // Use realigned bams as input for UG. 
+ ch_bams_for_ug = GATK_INDELREALIGNER.out.bam + // GATK_UNIFIEDGENOTYPER() } - if ( params.genotyping_tool == 'haplotypecaller' ) { + if ( params.genotyping_tool == 'hc' ) { // TODO } From 89a7aabefdc5a7fed96480700ac4e8a6e1f2a9b3 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:14:26 +0200 Subject: [PATCH 011/110] Add gatk UG --- conf/modules.config | 45 ++++++++++++++++++++++++++ subworkflows/local/genotype.nf | 59 +++++++++++++++++++++------------- 2 files changed, 82 insertions(+), 22 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8bc4b884f..9f3f56406 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -805,4 +805,49 @@ process { enabled: true ] } + + // + // GENOTYPING + // + withName: GATK_REALIGNERTARGETCREATOR { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.args = [ + '' + ].join(' ').trim() + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/RTC" }, + mode: params.publish_dir_mode, + enabled: true + //enabled: false + ] + } + + withName: GATK_INDELREALIGNER { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.args = [ + '' + ].join(' ').trim() + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/IR" }, + mode: params.publish_dir_mode, + enabled: true + //enabled: false + ] + } + + withName: GATK_UNIFIEDGENOTYPER { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.args = [ + '' + ].join(' ').trim() + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/ug" }, + mode: params.publish_dir_mode, + enabled: true + //enabled: false + ] + } } diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index da2767598..48c4bd6ba 100644 --- 
a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -63,27 +63,15 @@ workflow GENOTYPE { dict: [ ref_meta, dict ] } - GATK_REALIGNERTARGETCREATOR( ch_input_for_targetcreator.bam, ch_input_for_targetcreator.fasta, ch_input_for_targetcreator.fai, ch_input_for_targetcreator.dict, [[], []] ) + GATK_REALIGNERTARGETCREATOR( + ch_input_for_targetcreator.bam, + ch_input_for_targetcreator.fasta, + ch_input_for_targetcreator.fai, + ch_input_for_targetcreator.dict, + [[], []] // No known_vcf + ) ch_versions = ch_versions.mix( GATK_REALIGNERTARGETCREATOR.out.versions.first() ) - - // ch_input_for_indelrealigner = ch_bam_bai - // // TODO can I join to ch_bams_for_multimap instead? or will that break ordering? - // .join( GATK_REALIGNERTARGETCREATOR.out.intervals ) - // .map { - // // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - // WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) - // } - // ch_input_for_indelrealigner = ch_bams_for_multimap - // .combine( ch_fasta_for_multimap , by:0 ) - // .multiMap { - // ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, mapindex, circular_target, mitochondrion -> - // bam: [ meta, bam, bai ] - // fasta: fasta // no meta needed for fasta in GATK_* modules // TODO add meta once modules get updated [ ref_meta, fasta ] - // fai: fai // no meta needed for fai in GATK_* modules // TODO add meta once modules get updated [ ref_meta, fai ] - // dict: dict // no meta needed for dict in GATK_* modules // TODO add meta once modules get updated [ ref_meta, dict ] - // } - // Join the bam/bai pairs to the intervals file, then redo multiMap to get the correct ordering for each bam/reference/intervals set. 
ch_input_for_indelrealigner = ch_bam_bai .join( GATK_REALIGNERTARGETCREATOR.out.intervals ) @@ -100,13 +88,40 @@ workflow GENOTYPE { dict: [ ref_meta, dict ] } - GATK_INDELREALIGNER( ch_input_for_indelrealigner.bam, ch_input_for_indelrealigner.fasta, ch_input_for_indelrealigner.fai, ch_input_for_indelrealigner.dict, [[], []] ) + GATK_INDELREALIGNER( + ch_input_for_indelrealigner.bam, + ch_input_for_indelrealigner.fasta, + ch_input_for_indelrealigner.fai, + ch_input_for_indelrealigner.dict, + [[], []] // No known_vcf + ) ch_versions = ch_versions.mix( GATK_INDELREALIGNER.out.versions.first() ) // TODO is this actually needed, since all GATK modules have the same version? - // Use realigned bams as input for UG. + // Use realigned bams as input for UG. combine with reference info to get correct ordering. ch_bams_for_ug = GATK_INDELREALIGNER.out.bam + .map { + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, bam, bai, intervals, ref_meta, fasta, fai, dict -> // TODO add dbSNP + bam: [ meta, bam, bai, intervals ] + fasta: [ ref_meta, fasta ] + fai: [ ref_meta, fai ] + dict: [ ref_meta, dict ] + // dbsnp: [ ref_meta, dbsnp ] // TODO add dbsnp + } - // GATK_UNIFIEDGENOTYPER() + GATK_UNIFIEDGENOTYPER( + ch_bams_for_ug.bam, + ch_bams_for_ug.fasta, + ch_bams_for_ug.fai, + ch_bams_for_ug.dict, + [[], []], // No intervals + [[], []], // No contamination + [[], []], // No dbsnp //TODO add dbsnp // ch_bams_for_ug.dbsnp + [[], []] // No comp + ) } if ( params.genotyping_tool == 'hc' ) { From 1ef7d1a64ce4b36d7136c72ce167b89f6f1b7985 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:17:46 +0200 Subject: [PATCH 012/110] no intervals in ug call --- subworkflows/local/genotype.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 48c4bd6ba..1a3a6b0f8 
100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -104,8 +104,8 @@ workflow GENOTYPE { } .combine( ch_fasta_for_multimap , by:0 ) .multiMap { - ignore_me, meta, bam, bai, intervals, ref_meta, fasta, fai, dict -> // TODO add dbSNP - bam: [ meta, bam, bai, intervals ] + ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict -> // TODO add dbSNP + bam: [ meta, bam, bai ] fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] dict: [ ref_meta, dict ] From 29a3e89751696e1db266ad94057ac28f6147bb84 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:18:52 +0200 Subject: [PATCH 013/110] add version --- subworkflows/local/genotype.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 1a3a6b0f8..18b54124d 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -122,6 +122,7 @@ workflow GENOTYPE { [[], []], // No dbsnp //TODO add dbsnp // ch_bams_for_ug.dbsnp [[], []] // No comp ) + ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) } if ( params.genotyping_tool == 'hc' ) { From 2d7b6224bd0f1716517cd05a5f97ceb03400a66f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:19:30 +0200 Subject: [PATCH 014/110] emit UG output --- subworkflows/local/genotype.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 18b54124d..d6715c6b2 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -123,6 +123,7 @@ workflow GENOTYPE { [[], []] // No comp ) ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) + ch_gatk_unifiedgenotyper_genotypes = GATK_UNIFIEDGENOTYPER.out.vcf } if ( params.genotyping_tool == 'hc' ) { From 0b4598e2f94ea90ac64bcc163f3f3069aa374629 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:27:00 +0200 Subject: 
[PATCH 015/110] tweak gatk UG outputs --- conf/modules.config | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 9f3f56406..320abbb46 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -816,10 +816,10 @@ process { ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/genotyping/RTC" }, - mode: params.publish_dir_mode, - enabled: true - //enabled: false + // path: { "${params.outdir}/genotyping/RTC" }, + // mode: params.publish_dir_mode, + // enabled: true + enabled: false ] } @@ -830,10 +830,10 @@ process { ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/genotyping/IR" }, - mode: params.publish_dir_mode, - enabled: true - //enabled: false + // path: { "${params.outdir}/genotyping/IR" }, + // mode: params.publish_dir_mode, + // enabled: true + enabled: false ] } @@ -847,7 +847,6 @@ process { path: { "${params.outdir}/genotyping/ug" }, mode: params.publish_dir_mode, enabled: true - //enabled: false ] } } From aab831f6458a77e6cb4501796c4da608e49f53ff Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:27:12 +0200 Subject: [PATCH 016/110] rename emissions --- subworkflows/local/genotype.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index d6715c6b2..ed2d1033f 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -139,12 +139,12 @@ workflow GENOTYPE { } emit: - pileupcaller_genotypes = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] - gatk_haplotypecaller_genotypes = ch_gatk_haplotypecaller_genotypes // [ [ meta ], vcf ] ] - gatk_unifiedgenotyper_genotypes = ch_gatk_unifiedgenotyper_genotypes // [ [ meta ], vcf ] ] - freebayes_genotypes = 
ch_freebayes_genotypes // [ [ meta ], vcf ] ] - angsd_genotypes = ch_angsd_genotypes // [ [ meta ], glf ] ] - versions = ch_versions - mqc = ch_multiqc_files + geno_pileupcaller = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] + geno_gatk_hc = ch_gatk_haplotypecaller_genotypes // [ [ meta ], vcf ] ] + geno_gatk_ug = ch_gatk_unifiedgenotyper_genotypes // [ [ meta ], vcf ] ] + geno_freebayes = ch_freebayes_genotypes // [ [ meta ], vcf ] ] + geno_angsd = ch_angsd_genotypes // [ [ meta ], glf ] ] + versions = ch_versions + mqc = ch_multiqc_files } From 6e8d8195809667025a0845800b7c3a8d52439eee Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 21 Jul 2023 16:28:28 +0200 Subject: [PATCH 017/110] delete leftover debug print from map.nf --- subworkflows/local/map.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index 1d2e2b3ed..72df19e75 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -66,7 +66,6 @@ workflow MAP { .groupTuple() .branch { meta, bam -> - println(bam.size()) merge: bam.size() > 1 skip: true } From 8b4b79d305440a5e860bb2fa70b97b26b30058a2 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 28 Jul 2023 11:01:09 +0200 Subject: [PATCH 018/110] Add params for gatk and gatkUG --- nextflow.config | 9 ++++++ nextflow_schema.json | 71 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/nextflow.config b/nextflow.config index f672eecb1..4af370a9c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -190,12 +190,21 @@ params { run_genotyping = false genotyping_tool = null genotyping_source = null + run_bcftools_stats = true genotyping_pileupcaller_min_base_quality = 30 genotyping_pileupcaller_min_map_quality = 30 genotyping_pileupcaller_bedfile = null genotyping_pileupcaller_snpfile = null genotyping_pileupcaller_method = 'randomHaploid' genotyping_pileupcaller_transitions_mode = 
'AllSites' + genotyping_gatk_call_conf = 30 + genotyping_gatk_ploidy = 2 + genotyping_gatk_dbsnp = null + genotyping_gatk_ug_downsample = 250 + genotyping_gatk_ug_out_mode = 'EMIT_VARIANTS_ONLY' + genotyping_gatk_ug_genotype_mode = 'SNP' + genotyping_gatk_ug_keep_realign_bam = false + genotyping_gatk_ug_defaultbasequalities = -1 } diff --git a/nextflow_schema.json b/nextflow_schema.json index eb0e27479..4653e2135 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -214,14 +214,12 @@ "type": "boolean", "description": "Display help text.", "fa_icon": "fas fa-question-circle", - "default": false, "hidden": true }, "version": { "type": "boolean", "description": "Display version and exit.", "fa_icon": "fas fa-question-circle", - "default": false, "hidden": true }, "publish_dir_mode": { @@ -245,7 +243,6 @@ "type": "boolean", "description": "Send plain-text email instead of HTML.", "fa_icon": "fas fa-remove-format", - "default": false, "hidden": true }, "max_multiqc_email_size": { @@ -260,7 +257,6 @@ "type": "boolean", "description": "Do not use coloured log outputs.", "fa_icon": "fas fa-palette", - "default": false, "hidden": true }, "hook_url": { @@ -299,7 +295,6 @@ "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", - "default": false, "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." }, @@ -307,7 +302,6 @@ "type": "boolean", "fa_icon": "far fa-check-circle", "description": "Validation of parameters fails when an unrecognised parameter is found.", - "default": false, "hidden": true, "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." 
}, @@ -315,7 +309,6 @@ "type": "boolean", "fa_icon": "far fa-check-circle", "description": "Validation of parameters in lenient more.", - "default": false, "hidden": true, "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } @@ -884,6 +877,13 @@ "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK 3.5 it is officially deprecated by the Broad Institute.", "description": "Specify which genotyper to use between: GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller." }, + "run_bcftools_stats": { + "type": "boolean", + "default": true, + "fa_icon": "far fa-chart-bar", + "description": "Turn on bcftools stats generation for VCF based variant calling statistics", + "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics." + }, "genotyping_pileupcaller_min_base_quality": { "type": "integer", "default": 30, @@ -926,9 +926,64 @@ "type": "string", "default": "AllSites", "description": "Specify the calling mode for transitions.", - "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. \n\n> Modifies pileupCaller parameter: --skipTransitions --transitionsMissing", + "help_text": "Specify if genotypes of transition SNPs should be called, set to missing, or excluded from the genotypes respectively. 
\n\n> Modifies pileupCaller parameter: `--skipTransitions` `--transitionsMissing`", "enum": ["AllSites", "TransitionsMissing", "SkipTransitions"], "fa_icon": "fas fa-toggle-on" + }, + "genotyping_gatk_call_conf": { + "type": "integer", + "default": 30, + "fa_icon": "fas fa-balance-scale-right", + "description": "Specify GATK phred-scaled confidence threshold.", + "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" + }, + "genotyping_gatk_ploidy": { + "type": "integer", + "default": 2, + "fa_icon": "fas fa-pastafarianism", + "description": "Specify GATK organism ploidy.", + "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: --sample-ploidy" + }, + "genotyping_gatk_dbsnp": { + "type": "string", + "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.", + "fa_icon": "fas fa-pen-alt" + }, + "genotyping_gatk_ug_downsample": { + "type": "integer", + "default": 250, + "fa_icon": "fas fa-icicles", + "description": "Maximum depth coverage allowed for genotyping before down-sampling is turned on.", + "help_text": "Maximum depth coverage allowed for genotyping before down-sampling is turned on. 
Any position with a coverage higher than this value will be randomly down-sampled to this many reads.\n\n> Modifies GATK UnifiedGenotyper parameter: `-dcov`" + }, + "genotyping_gatk_ug_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "description": "Specify GATK output mode.", + "enum": ["EMIT_VARIANTS_ONLY", "EMIT_ALL_CONFIDENT_SITES", "EMIT_ALL_SITES"], + "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confident sites.\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", + "fa_icon": "fas fa-bullhorn" + }, + "genotyping_gatk_ug_genotype_mode": { + "type": "string", + "default": "SNP", + "description": "Specify UnifiedGenotyper likelihood model.", + "enum": ["SNP", "INDEL", "BOTH", "GENERALPLOIDYSNP", "GENERALPLOIDYINDEL"], + "fa_icon": "fas fa-project-diagram", + "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call SNPs or INDELs etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" + }, + "genotyping_gatk_ug_keep_realign_bam": { + "type": "boolean", + "fa_icon": "far fa-save", + "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", + "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." + }, + "genotyping_gatk_ug_defaultbasequalities": { + "type": "integer", + "default": -1, + "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", + "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. 
Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to `-1` which is to not set any default quality (turned off). \n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`", + "fa_icon": "fas fa-redo-alt" } }, "fa_icon": "fas fa-sliders-h", From 857559276c4e135657024b09118c7139f4aa722a Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 28 Jul 2023 11:52:07 +0200 Subject: [PATCH 019/110] WIP adding params to GATK UG --- conf/modules.config | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 320abbb46..3f4d77e73 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -812,13 +812,10 @@ process { withName: GATK_REALIGNERTARGETCREATOR { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - '' + "--defaultBaseQualities ${params.gatk_ug_defaultbasequalities}", ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ - // path: { "${params.outdir}/genotyping/RTC" }, - // mode: params.publish_dir_mode, - // enabled: true enabled: false ] } @@ -826,21 +823,21 @@ process { withName: GATK_INDELREALIGNER { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - '' + "--defaultBaseQualities ${params.gatk_ug_defaultbasequalities}" ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ - // path: { "${params.outdir}/genotyping/IR" }, - // mode: params.publish_dir_mode, - // enabled: true - enabled: false + path: { "${params.outdir}/genotyping/IR" }, + mode: params.publish_dir_mode, + enabled: params.genotyping_gatk_ug_keep_realign_bam ] } withName: GATK_UNIFIEDGENOTYPER { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - '' + "--defaultBaseQualities ${params.gatk_ug_defaultbasequalities}" + // TODO add the rest of 
the parameters ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ From 60e7eec292af6125728dd5658300556ce68daae6 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 28 Jul 2023 15:43:58 +0200 Subject: [PATCH 020/110] reorder params --- nextflow.config | 2 +- subworkflows/local/genotype.nf | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/nextflow.config b/nextflow.config index 4af370a9c..950d66133 100644 --- a/nextflow.config +++ b/nextflow.config @@ -203,8 +203,8 @@ params { genotyping_gatk_ug_downsample = 250 genotyping_gatk_ug_out_mode = 'EMIT_VARIANTS_ONLY' genotyping_gatk_ug_genotype_mode = 'SNP' - genotyping_gatk_ug_keep_realign_bam = false genotyping_gatk_ug_defaultbasequalities = -1 + genotyping_gatk_ug_keep_realign_bam = false } diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index ed2d1033f..84b3ab0c5 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -19,6 +19,7 @@ workflow GENOTYPE { ch_snpcapture_bed // [ [ meta ], bed ] ch_pileupcaller_bedfile // [ [ meta ], bed ] ch_pileupcaller_snpfile // [ [ meta ], snp ] + ch_dbsnp // [ dbsnp ] main: ch_versions = Channel.empty() @@ -48,6 +49,7 @@ workflow GENOTYPE { } ch_fasta_for_multimap = ch_fasta_plus + .combine( ch_dbsnp ) // TODO This will need a 'by:0' once the dbsnp channel gets a meta. 
.map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) @@ -56,7 +58,7 @@ workflow GENOTYPE { ch_input_for_targetcreator = ch_bams_for_multimap .combine( ch_fasta_for_multimap , by:0 ) .multiMap { - ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict -> // TODO add dbSNP + ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> bam: [ meta, bam , bai ] fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] @@ -81,7 +83,7 @@ workflow GENOTYPE { } .combine( ch_fasta_for_multimap , by:0 ) .multiMap { - ignore_me, meta, bam, bai, intervals, ref_meta, fasta, fai, dict -> // TODO add dbSNP + ignore_me, meta, bam, bai, intervals, ref_meta, fasta, fai, dict, dbsnp -> bam: [ meta, bam, bai, intervals ] fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] @@ -104,12 +106,12 @@ workflow GENOTYPE { } .combine( ch_fasta_for_multimap , by:0 ) .multiMap { - ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict -> // TODO add dbSNP + ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> bam: [ meta, bam, bai ] fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] dict: [ ref_meta, dict ] - // dbsnp: [ ref_meta, dbsnp ] // TODO add dbsnp + dbsnp: [ ref_meta, dbsnp ] } GATK_UNIFIEDGENOTYPER( @@ -119,7 +121,7 @@ workflow GENOTYPE { ch_bams_for_ug.dict, [[], []], // No intervals [[], []], // No contamination - [[], []], // No dbsnp //TODO add dbsnp // ch_bams_for_ug.dbsnp + ch_bams_for_ug.dbsnp, [[], []] // No comp ) ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) From 2f9496bbcca8dc8a50c1f347f1cb296663164419 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 28 Jul 2023 15:44:33 +0200 Subject: [PATCH 021/110] add dbSNP placeholder to be able to test. 
parameters passes to gatk now --- conf/modules.config | 12 ++++++++---- subworkflows/local/genotype.nf | 4 ++-- workflows/eager.nf | 8 +++++++- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 3f4d77e73..3375093c9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -812,7 +812,7 @@ process { withName: GATK_REALIGNERTARGETCREATOR { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - "--defaultBaseQualities ${params.gatk_ug_defaultbasequalities}", + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ @@ -823,7 +823,7 @@ process { withName: GATK_INDELREALIGNER { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - "--defaultBaseQualities ${params.gatk_ug_defaultbasequalities}" + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ @@ -836,8 +836,12 @@ process { withName: GATK_UNIFIEDGENOTYPER { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - "--defaultBaseQualities ${params.gatk_ug_defaultbasequalities}" - // TODO add the rest of the parameters + "--sample_ploidy ${params.genotyping_gatk_ploidy}", + "-stand_call_conf ${params.genotyping_gatk_call_conf}", + "-dcov ${params.genotyping_gatk_ug_downsample}", + "--output_mode ${params.genotyping_gatk_ug_out_mode}", + "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", + params.genotyping_gatk_ug_defaultbasequalities > 0 ? 
"--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 84b3ab0c5..e65a380e3 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -30,7 +30,6 @@ workflow GENOTYPE { ch_freebayes_genotypes = Channel.empty() ch_angsd_genotypes = Channel.empty() - if ( params.genotyping_tool == 'pileupcaller' ) { // SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta_plus ) @@ -49,11 +48,12 @@ workflow GENOTYPE { } ch_fasta_for_multimap = ch_fasta_plus - .combine( ch_dbsnp ) // TODO This will need a 'by:0' once the dbsnp channel gets a meta. .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) } + .combine( ch_dbsnp ) // TODO This will need tweaking ('by:0'?) once the dbsnp channel gets a meta. + .dump(tag:"dbsnp") ch_input_for_targetcreator = ch_bams_for_multimap .combine( ch_fasta_for_multimap , by:0 ) diff --git a/workflows/eager.nf b/workflows/eager.nf index cfd189c37..06d07ebb0 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -134,6 +134,12 @@ workflow EAGER { // Contamination estimation hapmap_file = file(params.contamination_estimation_angsd_hapmap, checkIfExists:true) + // GATK dbSNP + if ( params.genotyping_gatk_dbsnp ) { + ch_dbsnp = Channel.fromPath(params.genotyping_gatk_dbsnp, checkIfExists: true) + } else { + ch_dbsnp = Channel.empty() + } // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // @@ -452,7 +458,7 @@ workflow EAGER { [ meta, fasta, fai, dict ] // dbsnp ] // TODO add dbsnp to reference sheet. 
} - GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [] ) + GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [], ch_dbsnp.ifEmpty([[]]) ) ch_versions = ch_versions.mix( GENOTYPE.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) From 020eba2d66c9b80dc289334a816663561618edf7 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 4 Aug 2023 10:43:53 +0200 Subject: [PATCH 022/110] convert bcftools_stats to skip --- nextflow.config | 2 +- nextflow_schema.json | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/nextflow.config b/nextflow.config index 950d66133..ba3d26f40 100644 --- a/nextflow.config +++ b/nextflow.config @@ -190,7 +190,7 @@ params { run_genotyping = false genotyping_tool = null genotyping_source = null - run_bcftools_stats = true + skip_bcftools_stats = false genotyping_pileupcaller_min_base_quality = 30 genotyping_pileupcaller_min_map_quality = 30 genotyping_pileupcaller_bedfile = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 4653e2135..62d444aa3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -877,9 +877,8 @@ "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK 3.5 it is officially deprecated by the Broad Institute.", "description": "Specify which genotyper to use between: GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller." 
}, - "run_bcftools_stats": { + "skip_bcftools_stats": { "type": "boolean", - "default": true, "fa_icon": "far fa-chart-bar", "description": "Turn on bcftools stats generation for VCF based variant calling statistics", "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics." From 2fa9508f6603b4b385525b3fe285fbf7f3c3c1bc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 4 Aug 2023 10:48:30 +0200 Subject: [PATCH 023/110] finish schema --- nextflow_schema.json | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 62d444aa3..d35d4e609 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -880,8 +880,8 @@ "skip_bcftools_stats": { "type": "boolean", "fa_icon": "far fa-chart-bar", - "description": "Turn on bcftools stats generation for VCF based variant calling statistics", - "help_text": "Runs `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIt will automatically include the FASTA reference for INDEL-related statistics." + "description": "Skip bcftools stats generation for VCF based variant calling statistics", + "help_text": "Disables running of `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIf run, `bcftools stats` will automatically include the FASTA reference for INDEL-related statistics." }, "genotyping_pileupcaller_min_base_quality": { "type": "integer", @@ -946,7 +946,11 @@ "genotyping_gatk_dbsnp": { "type": "string", "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.", - "fa_icon": "fas fa-pen-alt" + "fa_icon": "fas fa-pen-alt", + "description": "Specify VCF file for SNP annotation of output VCF files. Optional.
Gzip not accepted.", + "pattern": "^\\S+\\.vcf$", + "format": "file-path", + "mimetype": "VCF" }, "genotyping_gatk_ug_downsample": { "type": "integer", From b0da6bd67fe2e30297f801e8fa8de31b08a1466e Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 4 Aug 2023 11:12:10 +0200 Subject: [PATCH 024/110] merge conflict --- conf/modules.config | 2 +- docs/development/manual_tests.md | 9 +++++++++ modules.json | 5 +++++ subworkflows/local/genotype.nf | 1 + 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index e3faca711..299fd6cdc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -843,7 +843,7 @@ process { withName: GATK_REALIGNERTARGETCREATOR { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ - params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // For some reason, GATK complains if its default of -1 is actually provided ?_? ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index a02f22588..c6763005f 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -648,3 +648,12 @@ nextflow run . -profile test,docker \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 ``` +# GENOTYPING + +## GATK UG + +```bash +## Gatk on raw reads +## Expect: One VCF per sample/reference combination. 
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' -ansi-log false +``` diff --git a/modules.json b/modules.json index fd41aaed3..cf4cb7cbf 100644 --- a/modules.json +++ b/modules.json @@ -30,6 +30,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "bcftools/stats": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "bedtools/coverage": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index e65a380e3..5c508f2f3 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -10,6 +10,7 @@ include { GATK_INDELREALIGNER } from '../../module include { GATK_UNIFIEDGENOTYPER } from '../../modules/nf-core/gatk/unifiedgenotyper/main' include { GATK4_HAPLOTYPECALLER } from '../../modules/nf-core/gatk4/haplotypecaller/main' include { FREEBAYES } from '../../modules/nf-core/freebayes/main' +include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' // TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. 
workflow GENOTYPE { From 16d567a62f5b412d66753515ef17ae6dd32c08a0 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 11 Aug 2023 10:43:20 +0200 Subject: [PATCH 025/110] Update bcftools_stats --- modules.json | 2 +- modules/nf-core/bcftools/stats/main.nf | 60 +++++++++++++++++++++ modules/nf-core/bcftools/stats/meta.yml | 72 +++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/bcftools/stats/main.nf create mode 100644 modules/nf-core/bcftools/stats/meta.yml diff --git a/modules.json b/modules.json index cf4cb7cbf..4fc2d1cba 100644 --- a/modules.json +++ b/modules.json @@ -32,7 +32,7 @@ }, "bcftools/stats": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "e2693a7e2d773b92e0649b25880ee22fe82bb79d", "installed_by": ["modules"] }, "bedtools/coverage": { diff --git a/modules/nf-core/bcftools/stats/main.nf b/modules/nf-core/bcftools/stats/main.nf new file mode 100644 index 000000000..7ccb9bf6c --- /dev/null +++ b/modules/nf-core/bcftools/stats/main.nf @@ -0,0 +1,60 @@ +process BCFTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "bioconda::bcftools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(vcf), path(tbi) + tuple val(meta2), path(regions) + tuple val(meta3), path(targets) + tuple val(meta4), path(samples) + tuple val(meta5), path(exons) + tuple val(meta6), path(fasta) + + output: + tuple val(meta), path("*stats.txt"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions_file = regions ? "--regions-file ${regions}" : "" + def targets_file = targets ? 
"--targets-file ${targets}" : "" + def samples_file = samples ? "--samples-file ${samples}" : "" + def reference_fasta = fasta ? "--fasta-ref ${fasta}" : "" + def exons_file = exons ? "--exons ${exons}" : "" + """ + bcftools stats \\ + $args \\ + $regions_file \\ + $targets_file \\ + $samples_file \\ + $reference_fasta \\ + $exons_file \\ + $vcf > ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/stats/meta.yml b/modules/nf-core/bcftools/stats/meta.yml new file mode 100644 index 000000000..5850d25f7 --- /dev/null +++ b/modules/nf-core/bcftools/stats/meta.yml @@ -0,0 +1,72 @@ +name: bcftools_stats +description: Generates stats from VCF files +keywords: + - variant calling + - stats + - VCF +tools: + - stats: + description: | + Parses VCF or BCF and produces text file stats which is suitable for + machine processing and can be plotted using plot-vcfstats. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF input file + pattern: "*.{vcf}" + - tbi: + type: file + description: | + The tab index for the VCF file to be inspected. Optional: only required when parameter regions is chosen. + pattern: "*.tbi" + - regions: + type: file + description: | + Optionally, restrict the operation to regions listed in this file. 
(VCF, BED or tab-delimited) + - targets: + type: file + description: | + Optionally, restrict the operation to regions listed in this file (doesn't rely upon tbi index files) + - samples: + type: file + description: | + Optional, file of sample names to be included or excluded. + e.g. 'file.tsv' + - exons: + type: file + description: | + Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, optionally bgzip compressed). + e.g. 'exons.tsv.gz' + - fasta: + type: file + description: | + Faidx indexed reference sequence file to determine INDEL context. + e.g. 'reference.fa' +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: Text output file containing stats + pattern: "*_{stats.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@SusiJo" + - "@TCLamnidis" From 97f147148b796c141a0ac674837b1f06e14c74ab Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 11 Aug 2023 10:47:07 +0200 Subject: [PATCH 026/110] add bcftools stats to UG --- subworkflows/local/genotype.nf | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 5c508f2f3..1761ea4b9 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -30,6 +30,7 @@ workflow GENOTYPE { ch_gatk_unifiedgenotyper_genotypes = Channel.empty() ch_freebayes_genotypes = Channel.empty() ch_angsd_genotypes = Channel.empty() + ch_bcftools_stats = Channel.empty() if ( params.genotyping_tool == 'pileupcaller' ) { // SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta_plus ) @@ -48,6 +49,7 @@ workflow GENOTYPE { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) } + // RESULT: [ [combination_meta], 
[ref_meta], fasta, fai, dict, dbsnp ] ch_fasta_for_multimap = ch_fasta_plus .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute @@ -60,7 +62,7 @@ workflow GENOTYPE { .combine( ch_fasta_for_multimap , by:0 ) .multiMap { ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> - bam: [ meta, bam , bai ] + bam: [ meta, bam , bai ] fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] dict: [ ref_meta, dict ] @@ -85,7 +87,7 @@ workflow GENOTYPE { .combine( ch_fasta_for_multimap , by:0 ) .multiMap { ignore_me, meta, bam, bai, intervals, ref_meta, fasta, fai, dict, dbsnp -> - bam: [ meta, bam, bai, intervals ] + bam: [ meta, bam, bai, intervals ] fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] dict: [ ref_meta, dict ] @@ -108,7 +110,7 @@ workflow GENOTYPE { .combine( ch_fasta_for_multimap , by:0 ) .multiMap { ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> - bam: [ meta, bam, bai ] + bam: [ meta, bam, bai ] fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] dict: [ ref_meta, dict ] @@ -127,6 +129,29 @@ workflow GENOTYPE { ) ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) ch_gatk_unifiedgenotyper_genotypes = GATK_UNIFIEDGENOTYPER.out.vcf + + if ( ! params.skip_bcftools_stats ) { + ch_bcftools_input= ch_gatk_unifiedgenotyper_genotypes + .map { + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, vcf, ref_meta, fasta, fai, dict, dbsnp -> + vcf: [ meta, vcf, [] ] // bcftools stats module expects a tbi file with the vcf. 
+ fasta: [ ref_meta, fasta ] + } + + BCFTOOLS_STATS_GENOTYPING( + ch_bcftools_input.vcf, // vcf + [ [], [] ], // regions + [ [], [] ], // targets + [ [], [] ], // samples + [ [], [] ], // exons + ch_bcftools_input.fasta // fasta + ) + ch_versions = ch_versions.mix( BCFTOOLS_STATS_GENOTYPING.out.versions.first() ) + } } if ( params.genotyping_tool == 'hc' ) { From 64d0df59b67bdbe21910ca057fc927fc34acc4a8 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 11 Aug 2023 10:50:01 +0200 Subject: [PATCH 027/110] add todo comment --- subworkflows/local/genotype.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 1761ea4b9..70442efa6 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -131,6 +131,7 @@ workflow GENOTYPE { ch_gatk_unifiedgenotyper_genotypes = GATK_UNIFIEDGENOTYPER.out.vcf if ( ! params.skip_bcftools_stats ) { + // TODO this section could be moved outside the UG specific section into its own if clause and take input from HC and FB as well. 
ch_bcftools_input= ch_gatk_unifiedgenotyper_genotypes .map { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) From c5bbb3b234b3a4f947e007d4451e0502c4088134 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 11 Aug 2023 11:54:16 +0200 Subject: [PATCH 028/110] Add config for bcftools stats --- conf/modules.config | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 299fd6cdc..6a4270233 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -876,7 +876,17 @@ process { ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/genotyping/ug" }, + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + withName: BCFTOOLS_STATS_GENOTYPING { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, enabled: true ] From 2cf5eaca5d467374ea4cbc2db1baa6d6f24abb05 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 11 Aug 2023 11:54:22 +0200 Subject: [PATCH 029/110] record manual tests --- docs/development/manual_tests.md | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index c6763005f..4887f1558 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -648,12 +648,40 @@ nextflow run . -profile test,docker \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 ``` + # GENOTYPING ## GATK UG ```bash ## Gatk on raw reads -## Expect: One VCF per sample/reference combination. 
-nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' -ansi-log false +## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per bam. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## Gatk on trimmed reads +## Expect: One VCF per sample/reference combination, based on the trimmed bams. Also 1 bcftools_stats file per bam. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --run_trim_bam \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ + --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ + --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 +## Checked that the input bam for the UG jobs indeed had trimmed reads. (The full UDG sample has untrimmed bams.) +``` + +```bash +``` + +```bash +## Gatk on pmd-filtered reads +## Expect: One VCF per sample/reference combination, based on the pmd-filtered bams. Also 1 bcftools_stats file per bam. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'pmd' -ansi-log false -dump-channels --run_pmd_filtering +## Checked that the bams had fewer reads compared to the raw bams. +``` + +```bash +## Gatk on rescaled reads +## Expect: One VCF per sample/reference combination, based on the rescaled bams. Also 1 bcftools_stats file per bam. 
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'rescaled' -ansi-log false -dump-channels --run_mapdamage_rescaling ``` From 947ee60a95269a9c682e5b7bb8e6a891d7f9657f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 11 Aug 2023 12:01:25 +0200 Subject: [PATCH 030/110] remove unnecessary bash block --- docs/development/manual_tests.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 4887f1558..cefa4e3c8 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -670,9 +670,6 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Checked that the input bam for the UG jobs indeed had trimmed reads. (The full UDG sample has untrimmed bams.) ``` -```bash -``` - ```bash ## Gatk on pmd-filtered reads ## Expect: One VCF per sample/reference combination, based on the pmd-filtered bams. Also 1 bcftools_stats file per bam. 
From fd16f487974e2bd64b1a51424d408bdf553d2143 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 3 Nov 2023 11:36:18 +0100 Subject: [PATCH 031/110] attempt to add dbsnp to reference sheet --- subworkflows/local/reference_indexing.nf | 5 +++ .../local/reference_indexing_multi.nf | 38 ++++++++++--------- .../local/reference_indexing_single.nf | 8 +++- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 13357d4ea..0c2d31951 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -31,6 +31,7 @@ workflow REFERENCE_INDEXING { ch_pileupcaller_snp = REFERENCE_INDEXING_MULTI.out.pileupcaller_snp ch_sexdeterrmine_bed = REFERENCE_INDEXING_MULTI.out.sexdeterrmine_bed ch_bedtools_feature = REFERENCE_INDEXING_MULTI.out.bedtools_feature + ch_dbsnp = REFERENCE_INDEXING_MULTI.out.dbsnp ch_versions = ch_versions.mix( REFERENCE_INDEXING_MULTI.out.versions ) } else { // If input FASTA and/or indicies supplied @@ -43,6 +44,7 @@ workflow REFERENCE_INDEXING { ch_sexdeterrmine_bed = REFERENCE_INDEXING_SINGLE.out.sexdeterrmine_bed ch_bedtools_feature = REFERENCE_INDEXING_SINGLE.out.bedtools_feature ch_reference_for_mapping = REFERENCE_INDEXING_SINGLE.out.reference + ch_dbsnp = REFERENCE_INDEXING_SINGLE.out.dbsnp ch_versions = ch_versions.mix( REFERENCE_INDEXING_SINGLE.out.versions ) } @@ -80,6 +82,8 @@ workflow REFERENCE_INDEXING { ch_bedtools_feature = ch_bedtools_feature .filter{ it[1] != "" } + // TODO-DEV No filtering dbsnp because we always need the ploidy value from its meta. Will probably need a reference sheet validator to fix this.
+ emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] @@ -89,6 +93,7 @@ workflow REFERENCE_INDEXING { pileupcaller_snp = ch_pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_bedtools_feature // [ meta, bedtools_feature ] + dbsnp = ch_dbsnp // [ meta, dbsnp ] versions = ch_versions } diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 1ca630498..8310ef920 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -29,22 +29,24 @@ workflow REFERENCE_INDEXING_MULTI { ch_splitreferencesheet_for_branch = ch_splitreferencesheet_for_map .map { row -> - def meta = [:] - meta.id = row["reference_name"] - def fasta = file(row["fasta"], checkIfExists: true) // mandatory parameter! - def fai = row["fai"] != "" ? file(row["fai"], checkIfExists: true) : "" - def dict = row["dict"] != "" ? file(row["dict"], checkIfExists: true) : "" - def mapper_index = row["mapper_index"] != "" ? file(row["mapper_index"], checkIfExists: true) : "" - def circular_target = row["circular_target"] - def mitochondrion = row["mitochondrion_header"] - def capture_bed = row["snpcapture_bed"] != "" ? file(row["snpcapture_bed"], checkIfExists: true) : "" - def pileupcaller_bed = row["pileupcaller_bedfile"] != "" ? file(row["pileupcaller_bedfile"], checkIfExists: true) : "" - def pileupcaller_snp = row["pileupcaller_snpfile"] != "" ? file(row["pileupcaller_snpfile"], checkIfExists: true) : "" - def hapmap = row["hapmap_file"] != "" ? file(row["hapmap_file"], checkIfExists: true) : "" - def pmd_mask = row["pmdtools_masked_fasta"] != "" ? file(row["pmdtools_masked_fasta"], checkIfExists: true) : "" - def sexdet_bed = row["sexdeterrmine_snp_bed"] != "" ? 
file(row["sexdeterrmine_snp_bed"], checkIfExists: true) : "" - def bedtools_feature = row["bedtools_feature_file"] != "" ? file(row["bedtools_feature_file"], checkIfExists: true) : "" - [ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature ] + def meta = [:] + meta.id = row["reference_name"] + def fasta = file(row["fasta"], checkIfExists: true) // mandatory parameter! + def fai = row["fai"] != "" ? file(row["fai"], checkIfExists: true) : "" + def dict = row["dict"] != "" ? file(row["dict"], checkIfExists: true) : "" + def mapper_index = row["mapper_index"] != "" ? file(row["mapper_index"], checkIfExists: true) : "" + def circular_target = row["circular_target"] + def mitochondrion = row["mitochondrion_header"] + def capture_bed = row["snpcapture_bed"] != "" ? file(row["snpcapture_bed"], checkIfExists: true) : "" + def pileupcaller_bed = row["pileupcaller_bedfile"] != "" ? file(row["pileupcaller_bedfile"], checkIfExists: true) : "" + def pileupcaller_snp = row["pileupcaller_snpfile"] != "" ? file(row["pileupcaller_snpfile"], checkIfExists: true) : "" + def hapmap = row["hapmap_file"] != "" ? file(row["hapmap_file"], checkIfExists: true) : "" + def pmd_mask = row["pmdtools_masked_fasta"] != "" ? file(row["pmdtools_masked_fasta"], checkIfExists: true) : "" + def sexdet_bed = row["sexdeterrmine_snp_bed"] != "" ? file(row["sexdeterrmine_snp_bed"], checkIfExists: true) : "" + def bedtools_feature = row["bedtools_feature_file"] != "" ? file(row["bedtools_feature_file"], checkIfExists: true) : "" + def genotyping_gatk_ploidy = row["genotyping_gatk_ploidy"] != "" ? row["genotyping_gatk_ploidy"] : "" + def genotyping_gatk_dbsnp = row["genotyping_gatk_dbsnp"] != "" ? 
file(row["genotyping_gatk_dbsnp"], checkIfExists: true) : "" + [ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp ] } @@ -61,7 +63,7 @@ workflow REFERENCE_INDEXING_MULTI { ch_input_from_referencesheet = ch_splitreferencesheet_for_branch .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp -> generated: [ meta, fasta, fai, dict, mapper_index, circular_target ] mitochondrion_header: [ meta, mitochondrion ] angsd_hapmap: [ meta, hapmap ] @@ -70,6 +72,7 @@ ch_input_from_referencesheet = ch_splitreferencesheet_for_branch pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] + dbsnp: [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] // Include ploidy of the reference in dbsnp meta. 
} // Detect if fasta is gzipped or not @@ -216,5 +219,6 @@ ch_input_from_referencesheet = ch_splitreferencesheet_for_branch pileupcaller_snp = ch_input_from_referencesheet.pileupcaller_snp // [ meta, pileupcaller_snp, pileupcaller_bed ] sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_input_from_referencesheet.bedtools_feature // [ meta, bedtools_feature ] + dbsnp = ch_input_from_referencesheet.dbsnp // [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] versions = ch_versions } diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 43ebbf262..3fb1504f8 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -87,12 +87,14 @@ workflow REFERENCE_INDEXING_SINGLE { def pileupcaller_snp = "" def sexdet_bed = "" def bedtools_feature = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" - [ meta, fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature ] + def genotyping_gatk_ploidy = params.genotyping_gatk_ploidy != null ? params.genotyping_gatk_ploidy : "" + def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? 
file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" + [ meta, fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp ] } ch_ref_index_single = ch_reference_for_mapping .multiMap{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp -> reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] mito_header: [ meta, mitochondrion_header ] hapmap: [ meta, contamination_estimation_angsd_hapmap ] @@ -101,6 +103,7 @@ workflow REFERENCE_INDEXING_SINGLE { pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] + dbsnp: [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] // Include ploidy of the reference in dbsnp meta. 
} emit: @@ -112,6 +115,7 @@ workflow REFERENCE_INDEXING_SINGLE { pileupcaller_snp = ch_ref_index_single.pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] + dbsnp = ch_ref_index_single.dbsnp // [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] versions = ch_versions } From a056508027df57f35c703b75e145e867cf3b804f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 3 Nov 2023 11:57:11 +0100 Subject: [PATCH 032/110] pass dbsnp to genotyping --- subworkflows/local/genotype.nf | 7 ++++--- workflows/eager.nf | 10 ++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 70442efa6..91fd3496d 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -12,15 +12,16 @@ include { GATK4_HAPLOTYPECALLER } from '../../module include { FREEBAYES } from '../../modules/nf-core/freebayes/main' include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' // TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. - +// TODO Find a way to pass ploidy and dbsnp to the GATK modules. maybe ploidy should go in all reference metas +? 
workflow GENOTYPE { take: ch_bam_bai // [ [ meta ], bam , bai ] - ch_fasta_plus // [ [ meta ], fasta, fai, dict ] // TODO add dbSNP + ch_fasta_plus // [ [ meta ], fasta, fai, dict ] ch_snpcapture_bed // [ [ meta ], bed ] ch_pileupcaller_bedfile // [ [ meta ], bed ] ch_pileupcaller_snpfile // [ [ meta ], snp ] - ch_dbsnp // [ dbsnp ] + ch_dbsnp // [ [ meta ], dbsnp ] main: ch_versions = Channel.empty() diff --git a/workflows/eager.nf b/workflows/eager.nf index 8687001ad..7200c9907 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -554,12 +554,18 @@ workflow EAGER { if ( params.run_genotyping ) { ch_reference_for_genotyping = REFERENCE_INDEXING.out.reference // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel. - .map{ + .map { meta, fasta, fai, dict, mapindex, circular_target, mitochondrion -> [ meta, fasta, fai, dict ] // dbsnp ] // TODO add dbsnp to reference sheet. } + ch_dbsnp = REFERENCE_INDEXING.out.dbsnp + .map { + meta, dbsnp -> + final_dbsnp = dbsnp != "" ? 
dbsnp : [] + [ meta, final_dbsnp ] + } - GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [], ch_dbsnp.ifEmpty([[]]) ) + GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [], ch_dbsnp ) ch_versions = ch_versions.mix( GENOTYPE.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) From f312446819a93d458606183d5bc007ee5f85af00 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 24 Nov 2023 10:16:43 +0100 Subject: [PATCH 033/110] Include ploidy into ref_meta --- subworkflows/local/reference_indexing_multi.nf | 10 +++++----- subworkflows/local/reference_indexing_single.nf | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 8310ef920..af2966c20 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -31,6 +31,7 @@ workflow REFERENCE_INDEXING_MULTI { row -> def meta = [:] meta.id = row["reference_name"] + meta.ploidy = row["genotyping_gatk_ploidy"] != "" ? row["genotyping_gatk_ploidy"] : params.genotyping_gatk_ploidy // Use default value if none is specified. This info goes in the meta def fasta = file(row["fasta"], checkIfExists: true) // mandatory parameter! def fai = row["fai"] != "" ? file(row["fai"], checkIfExists: true) : "" def dict = row["dict"] != "" ? file(row["dict"], checkIfExists: true) : "" @@ -44,9 +45,8 @@ workflow REFERENCE_INDEXING_MULTI { def pmd_mask = row["pmdtools_masked_fasta"] != "" ? file(row["pmdtools_masked_fasta"], checkIfExists: true) : "" def sexdet_bed = row["sexdeterrmine_snp_bed"] != "" ? file(row["sexdeterrmine_snp_bed"], checkIfExists: true) : "" def bedtools_feature = row["bedtools_feature_file"] != "" ? file(row["bedtools_feature_file"], checkIfExists: true) : "" - def genotyping_gatk_ploidy = row["genotyping_gatk_ploidy"] != "" ? 
row["genotyping_gatk_ploidy"] : "" def genotyping_gatk_dbsnp = row["genotyping_gatk_dbsnp"] != "" ? file(row["genotyping_gatk_dbsnp"], checkIfExists: true) : "" - [ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp ] + [ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] } @@ -63,7 +63,7 @@ workflow REFERENCE_INDEXING_MULTI { ch_input_from_referencesheet = ch_splitreferencesheet_for_branch .multiMap { - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> generated: [ meta, fasta, fai, dict, mapper_index, circular_target ] mitochondrion_header: [ meta, mitochondrion ] angsd_hapmap: [ meta, hapmap ] @@ -72,7 +72,7 @@ ch_input_from_referencesheet = ch_splitreferencesheet_for_branch pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] - dbsnp: [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] // Include ploidy of the reference in dbsnp meta. + dbsnp: [ meta, genotyping_gatk_dbsnp ] // meta already carries the reference ploidy (set as meta.ploidy when the sheet row is parsed). 
} // Detect if fasta is gzipped or not @@ -219,6 +219,6 @@ ch_input_from_referencesheet = ch_splitreferencesheet_for_branch pileupcaller_snp = ch_input_from_referencesheet.pileupcaller_snp // [ meta, pileupcaller_snp, pileupcaller_bed ] sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_input_from_referencesheet.bedtools_feature // [ meta, bedtools_feature ] - dbsnp = ch_input_from_referencesheet.dbsnp // [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] + dbsnp = ch_input_from_referencesheet.dbsnp // [ meta, genotyping_gatk_dbsnp ] versions = ch_versions } diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 3fb1504f8..9aa19beab 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -87,14 +87,14 @@ workflow REFERENCE_INDEXING_SINGLE { def pileupcaller_snp = "" def sexdet_bed = "" def bedtools_feature = params.mapstats_bedtools_featurefile != null ? file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" - def genotyping_gatk_ploidy = params.genotyping_gatk_ploidy != null ? params.genotyping_gatk_ploidy : "" + def genotyping_gatk_ploidy = params.genotyping_gatk_ploidy def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? 
file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" - [ meta, fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp ] + [ meta + [ ploidy: genotyping_gatk_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] } ch_ref_index_single = ch_reference_for_mapping .multiMap{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_ploidy, genotyping_gatk_dbsnp -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] mito_header: [ meta, mitochondrion_header ] hapmap: [ meta, contamination_estimation_angsd_hapmap ] @@ -103,7 +103,7 @@ workflow REFERENCE_INDEXING_SINGLE { pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] - dbsnp: [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] // Include ploidy of the reference in dbsnp meta. 
+ dbsnp: [ meta, genotyping_gatk_dbsnp ] } emit: @@ -115,7 +115,7 @@ workflow REFERENCE_INDEXING_SINGLE { pileupcaller_snp = ch_ref_index_single.pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] - dbsnp = ch_ref_index_single.dbsnp // [ meta + [ ploidy: genotyping_gatk_ploidy ], genotyping_gatk_dbsnp ] + dbsnp = ch_ref_index_single.dbsnp // [ meta, genotyping_gatk_dbsnp ] versions = ch_versions } From 05675d5de90dd1dcec38f8051123966cbca90137 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 24 Nov 2023 10:51:55 +0100 Subject: [PATCH 034/110] gatk UG done with dbsnp --- subworkflows/local/genotype.nf | 16 +++++++++++----- workflows/eager.nf | 12 +++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 91fd3496d..8b55ffc0a 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -13,7 +13,7 @@ include { FREEBAYES } from '../../module include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' // TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. // TODO Find a way to pass ploidy and dbsnp to the GATK modules. maybe ploidy should go in all reference metas -? + workflow GENOTYPE { take: ch_bam_bai // [ [ meta ], bam , bai ] @@ -33,6 +33,14 @@ workflow GENOTYPE { ch_angsd_genotypes = Channel.empty() ch_bcftools_stats = Channel.empty() + // Replace missing dbsnps with empty lists + ch_dbsnp_for_gatk = ch_dbsnp + .map { + meta, dbsnp -> + final_dbsnp = dbsnp != "" ? 
dbsnp : [] + [ meta, final_dbsnp ] + } + if ( params.genotyping_tool == 'pileupcaller' ) { // SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta_plus ) @@ -50,14 +58,12 @@ workflow GENOTYPE { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) } - // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] ch_fasta_for_multimap = ch_fasta_plus + .join( ch_dbsnp_for_gatk ) // [ [ref_meta], fasta, fai, dict, dbsnp ] .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) - } - .combine( ch_dbsnp ) // TODO This will need tweaking ('by:0'?) once the dbsnp channel gets a meta. - .dump(tag:"dbsnp") + } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] ch_input_for_targetcreator = ch_bams_for_multimap .combine( ch_fasta_for_multimap , by:0 ) diff --git a/workflows/eager.nf b/workflows/eager.nf index 7200c9907..897358733 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -555,17 +555,11 @@ workflow EAGER { ch_reference_for_genotyping = REFERENCE_INDEXING.out.reference // Remove unnecessary files from the reference channel, so SWF doesn't break with each change to reference channel. .map { - meta, fasta, fai, dict, mapindex, circular_target, mitochondrion -> - [ meta, fasta, fai, dict ] // dbsnp ] // TODO add dbsnp to reference sheet. - } - ch_dbsnp = REFERENCE_INDEXING.out.dbsnp - .map { - meta, dbsnp -> - final_dbsnp = dbsnp != "" ? 
dbsnp : [] - [ meta, final_dbsnp ] + meta, fasta, fai, dict, mapindex, circular_target -> + [ meta, fasta, fai, dict ] } - GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [], ch_dbsnp ) + GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [], REFERENCE_INDEXING.out.dbsnp ) ch_versions = ch_versions.mix( GENOTYPE.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) From dcc3e47b6b71f47217bf1ed23c9622eca4229285 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 24 Nov 2023 10:55:19 +0100 Subject: [PATCH 035/110] fix indentation --- subworkflows/local/genotype.nf | 46 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 8b55ffc0a..01a3181dc 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -137,29 +137,29 @@ workflow GENOTYPE { ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) ch_gatk_unifiedgenotyper_genotypes = GATK_UNIFIEDGENOTYPER.out.vcf - if ( ! params.skip_bcftools_stats ) { - // TODO this section could be moved outside the UG specific section into its own if clause and take input from HC and FB as well. - ch_bcftools_input= ch_gatk_unifiedgenotyper_genotypes - .map { - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) - } - .combine( ch_fasta_for_multimap , by:0 ) - .multiMap { - ignore_me, meta, vcf, ref_meta, fasta, fai, dict, dbsnp -> - vcf: [ meta, vcf, [] ] // bcftools stats module expects a tbi file with the vcf. - fasta: [ ref_meta, fasta ] - } - - BCFTOOLS_STATS_GENOTYPING( - ch_bcftools_input.vcf, // vcf - [ [], [] ], // regions - [ [], [] ], // targets - [ [], [] ], // samples - [ [], [] ], // exons - ch_bcftools_input.fasta // fasta - ) - ch_versions = ch_versions.mix( BCFTOOLS_STATS_GENOTYPING.out.versions.first() ) - } + if ( ! 
params.skip_bcftools_stats ) { + // TODO this section could be moved outside the UG specific section into its own if clause and take input from HC and FB as well. + ch_bcftools_input= ch_gatk_unifiedgenotyper_genotypes + .map { + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, vcf, ref_meta, fasta, fai, dict, dbsnp -> + vcf: [ meta, vcf, [] ] // bcftools stats module expects a tbi file with the vcf. + fasta: [ ref_meta, fasta ] + } + + BCFTOOLS_STATS_GENOTYPING( + ch_bcftools_input.vcf, // vcf + [ [], [] ], // regions + [ [], [] ], // targets + [ [], [] ], // samples + [ [], [] ], // exons + ch_bcftools_input.fasta // fasta + ) + ch_versions = ch_versions.mix( BCFTOOLS_STATS_GENOTYPING.out.versions.first() ) + } } if ( params.genotyping_tool == 'hc' ) { From a91659e3bedc7eb761449bc7bb15bf059510251b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 24 Nov 2023 13:36:07 +0100 Subject: [PATCH 036/110] update haplotypecaller module --- modules.json | 2 +- .../gatk4/haplotypecaller/environment.yml | 7 ++++ modules/nf-core/gatk4/haplotypecaller/main.nf | 17 +++++----- .../nf-core/gatk4/haplotypecaller/meta.yml | 33 ++++++++++++++++--- 4 files changed, 46 insertions(+), 13 deletions(-) create mode 100644 modules/nf-core/gatk4/haplotypecaller/environment.yml diff --git a/modules.json b/modules.json index 3ad676bab..0cdb7079d 100644 --- a/modules.json +++ b/modules.json @@ -142,7 +142,7 @@ }, "gatk4/haplotypecaller": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "eab2bd29e589bd05da2b47c9bf95ef10b9508699", "installed_by": ["modules"] }, "gunzip": { diff --git a/modules/nf-core/gatk4/haplotypecaller/environment.yml b/modules/nf-core/gatk4/haplotypecaller/environment.yml new file mode 100644 index 000000000..0c8f32fa6 --- /dev/null +++ b/modules/nf-core/gatk4/haplotypecaller/environment.yml @@ -0,0 +1,7 
@@ +name: gatk4_haplotypecaller +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/haplotypecaller/main.nf b/modules/nf-core/gatk4/haplotypecaller/main.nf index 478681bd1..a6a71d562 100644 --- a/modules/nf-core/gatk4/haplotypecaller/main.nf +++ b/modules/nf-core/gatk4/haplotypecaller/main.nf @@ -2,18 +2,18 @@ process GATK4_HAPLOTYPECALLER { tag "$meta.id" label 'process_medium' - conda "bioconda::gatk4=4.4.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" input: - tuple val(meta), path(input), path(input_index), path(intervals), path(dragstr_model) - path fasta - path fai - path dict - path dbsnp - path dbsnp_tbi + tuple val(meta), path(input), path(input_index), path(intervals), path(dragstr_model) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dict) + tuple val(meta5), path(dbsnp) + tuple val(meta6), path(dbsnp_tbi) output: tuple val(meta), path("*.vcf.gz") , emit: vcf @@ -39,7 +39,8 @@ process GATK4_HAPLOTYPECALLER { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M" HaplotypeCaller \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + HaplotypeCaller \\ --input $input \\ --output ${prefix}.vcf.gz \\ --reference $fasta \\ diff --git a/modules/nf-core/gatk4/haplotypecaller/meta.yml b/modules/nf-core/gatk4/haplotypecaller/meta.yml index 27633cca6..2085c2db2 100644 --- a/modules/nf-core/gatk4/haplotypecaller/meta.yml +++ b/modules/nf-core/gatk4/haplotypecaller/meta.yml @@ -2,8 +2,8 @@ name: gatk4_haplotypecaller description: Call germline SNPs and indels via local re-assembly of haplotypes keywords: - gatk4 - - haplotypecaller - haplotype + - haplotypecaller tools: - 
gatk4: description: | @@ -14,7 +14,6 @@ tools: documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s doi: 10.1158/1538-7445.AM2017-3590 licence: ["Apache-2.0"] - input: - meta: type: map @@ -36,25 +35,49 @@ input: type: file description: Text file containing the DragSTR model of the used BAM/CRAM file (optional) pattern: "*.txt" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test', single_end:false ] - fasta: type: file description: The reference fasta file pattern: "*.fasta" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test', single_end:false ] - fai: type: file description: Index of reference fasta file pattern: "fasta.fai" + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test', single_end:false ] - dict: type: file description: GATK sequence dictionary pattern: "*.dict" + - meta5: + type: map + description: | + Groovy Map containing dbsnp information + e.g. [ id:'test', single_end:false ] - dbsnp: type: file description: VCF file containing known sites (optional) + - meta6: + type: map + description: | + Groovy Map containing dbsnp information + e.g. 
[ id:'test', single_end:false ] - dbsnp_tbi: type: file description: VCF index of dbsnp (optional) - output: - meta: type: map @@ -77,7 +100,9 @@ output: type: file description: Assembled haplotypes and locally realigned reads pattern: "*.realigned.bam" - authors: - "@suzannejin" - "@FriederikeHanssen" +maintainers: + - "@suzannejin" + - "@FriederikeHanssen" From c88aaa3c359f14ac89654885cd227b44c0306f20 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 24 Nov 2023 13:54:31 +0100 Subject: [PATCH 037/110] port UG channels to HC --- subworkflows/local/genotype.nf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 01a3181dc..a169673b7 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -164,6 +164,29 @@ workflow GENOTYPE { if ( params.genotyping_tool == 'hc' ) { // TODO + ch_bams_for_multimap = ch_bam_bai + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + + ch_fasta_for_multimap = ch_fasta_plus + .join( ch_dbsnp_for_gatk ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + .map { + // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] + + ch_input_for_hc = ch_bams_for_multimap + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> + bam: [ meta, bam , bai ] + fasta: [ ref_meta, fasta ] + fai: [ ref_meta, fai ] + dict: [ ref_meta, dict ] + dbsnp: [ ref_meta, dbsnp ] + } } if ( params.genotyping_tool == 'freebayes' ) { From 4e88678a7eb1fb8ce0ed25a50aa8285fcf8ec13b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Dec 2023 
15:26:30 +0100 Subject: [PATCH 038/110] Add gatk HC params. Update some gatk UG param text --- nextflow_schema.json | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index bd46a3f78..afd398e4f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -958,7 +958,7 @@ "default": "EMIT_VARIANTS_ONLY", "description": "Specify GATK output mode.", "enum": ["EMIT_VARIANTS_ONLY", "EMIT_ALL_CONFIDENT_SITES", "EMIT_ALL_SITES"], - "help_text": "If the GATK UnifiedGenotyper is selected, what type of VCF to create, i.e. produce calls for every site or just confidence sites.\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, this defines the output mode to use when producing the output VCF (i.e. produce calls for every site or just confidence sites.)\n\n> Modifies GATK UnifiedGenotyper parameter: `--output_mode`", "fa_icon": "fas fa-bullhorn" }, "genotyping_gatk_ug_genotype_mode": { @@ -967,20 +967,36 @@ "description": "Specify UnifiedGenotyper likelihood model.", "enum": ["SNP", "INDEL", "BOTH", "GENERALPLOIDYSNP", "GENERALPLOIDYINDEL"], "fa_icon": "fas fa-project-diagram", - "help_text": "If the GATK UnifiedGenotyper is selected, which likelihood model to follow, i.e. whether to call use SNPs or INDELS etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, this sets which likelihood model to follow, i.e. 
whether to call only SNPs or INDELS etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" }, "genotyping_gatk_ug_keep_realign_bam": { "type": "boolean", "fa_icon": "far fa-save", "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", - "help_text": "If provided when running GATK's UnifiedGenotyper, this will put into the output folder the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, providing this parameter will output the BAMs that have realigned reads (with GATK's (v3) IndelRealigner) around possible variants for improved genotyping in addition to the standard VCF output.\n\nThese BAMs will be stored in the same folder as the corresponding VCF files." }, "genotyping_gatk_ug_defaultbasequalities": { "type": "integer", "default": -1, "description": "Supply a default base quality if a read is missing a base quality score. Setting to -1 turns this off.", - "help_text": "When running GATK's UnifiedGenotyper, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to `-1` which is to not set any default quality (turned off). \n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`", + "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, specify a value to set base quality scores, if reads are missing this information. Might be useful if you have 'synthetically' generated reads (e.g. chopping up a reference genome). Default is set to `-1` which is to not set any default quality (turned off). 
\n\n> Modifies GATK UnifiedGenotyper parameter: `--defaultBaseQualities`", "fa_icon": "fas fa-redo-alt" + }, + "genotyping_gatk_hc_out_mode": { + "type": "string", + "default": "EMIT_VARIANTS_ONLY", + "fa_icon": "fas fa-bullhorn", + "description": "Specify GATK output mode.", + "help_text": "If GATK HaplotypeCaller is selected as the genotyping tool, this sets the type of sites that should be included in the output VCF (i.e. produce calls for every site or just confidence sites). \n\n> Modifies GATK HaplotypeCaller parameter: `--output-mode`", + "enum": ["EMIT_VARIANTS_ONLY", "EMIT_ALL_CONFIDENT_SITES", "EMIT_ALL_ACTIVE_SITES"] + }, + "genotyping_gatk_hc_emitrefconf": { + "type": "string", + "default": "GVCF", + "fa_icon": "fas fa-bullhorn", + "description": "Specify HaplotypeCaller mode for emitting reference confidence calls.", + "help_text": "If GATK HaplotypeCaller is selected as the genotyping tool, this sets the mode for emitting reference confidence calls.\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`", + "enum": ["NONE", "BP_RESOLUTION", "GVCF"] } }, "fa_icon": "fas fa-sliders-h", @@ -1071,8 +1087,7 @@ "fa_icon": "fab fa-creative-commons-sampling-plus" }, "skip_qualimap": { - "type": "boolean", - "default": "false" + "type": "boolean" }, "snpcapture_bed": { "type": "string", @@ -1267,16 +1282,13 @@ "$ref": "#/definitions/adna_damage_analysis" }, { - "$ref": "#/definitions/host_removal" - }, - { - "$ref": "#/definitions/contamination_estimation" + "$ref": "#/definitions/feature_annotation_statistics" }, { "$ref": "#/definitions/host_removal" }, { - "$ref": "#/definitions/feature_annotation_statistics" + "$ref": "#/definitions/contamination_estimation" } ] } From a8dc2d4c658f1c71a0b96febe3179432c4c4486b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Dec 2023 15:26:40 +0100 Subject: [PATCH 039/110] add gatk HC params --- nextflow.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow.config 
b/nextflow.config index b9c242316..26c8ba190 100644 --- a/nextflow.config +++ b/nextflow.config @@ -212,6 +212,8 @@ params { genotyping_gatk_ug_genotype_mode = 'SNP' genotyping_gatk_ug_defaultbasequalities = -1 genotyping_gatk_ug_keep_realign_bam = false + genotyping_gatk_hc_out_mode = 'EMIT_VARIANTS_ONLY' + genotyping_gatk_hc_emitrefconf = 'GVCF' } From bf9cd87576e7a2220f0d8343032d58a0f8fef7b9 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Dec 2023 15:27:07 +0100 Subject: [PATCH 040/110] add gatk HC. Add patterns to genotyping module publishDir --- conf/modules.config | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6a9a3896f..5509266a1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -878,7 +878,8 @@ process { publishDir = [ path: { "${params.outdir}/genotyping/IR" }, mode: params.publish_dir_mode, - enabled: params.genotyping_gatk_ug_keep_realign_bam + enabled: params.genotyping_gatk_ug_keep_realign_bam, + pattern: '*.{bam,bai}' ] } @@ -896,7 +897,26 @@ process { publishDir = [ path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, - enabled: true + enabled: true, + pattern: '*.vcf.gz' + ] + } + + withName: GATK4_HAPLOTYPECALLER { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.args = [ + // Option names have changed from underscore_separated to hyphen-separated in GATK4 + "--sample-ploidy ${params.genotyping_gatk_ploidy}", + "-stand-call-conf ${params.genotyping_gatk_call_conf}", + "--output-mode ${params.genotyping_gatk_hc_out_mode}", + "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", + ].join(' ').trim() + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.{vcf.gz,vcf.gz.tbi}' ] } @@ -906,7 +926,8 @@ process { publishDir = [ 
path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, - enabled: true + enabled: true, + pattern: '*.txt' ] } } From edb5e589320197b948a8221877be97722045583f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Dec 2023 15:27:33 +0100 Subject: [PATCH 041/110] add HC. fix indent. move bcftools --- subworkflows/local/genotype.nf | 69 ++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index a169673b7..6d59e1326 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -124,6 +124,7 @@ workflow GENOTYPE { dbsnp: [ ref_meta, dbsnp ] } + // TODO: Should the vcfs be indexed with bcftools index? VCFs from HC are indexed. GATK_UNIFIEDGENOTYPER( ch_bams_for_ug.bam, ch_bams_for_ug.fasta, @@ -136,57 +137,43 @@ workflow GENOTYPE { ) ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) ch_gatk_unifiedgenotyper_genotypes = GATK_UNIFIEDGENOTYPER.out.vcf - - if ( ! params.skip_bcftools_stats ) { - // TODO this section could be moved outside the UG specific section into its own if clause and take input from HC and FB as well. - ch_bcftools_input= ch_gatk_unifiedgenotyper_genotypes - .map { - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) - } - .combine( ch_fasta_for_multimap , by:0 ) - .multiMap { - ignore_me, meta, vcf, ref_meta, fasta, fai, dict, dbsnp -> - vcf: [ meta, vcf, [] ] // bcftools stats module expects a tbi file with the vcf. 
- fasta: [ ref_meta, fasta ] - } - - BCFTOOLS_STATS_GENOTYPING( - ch_bcftools_input.vcf, // vcf - [ [], [] ], // regions - [ [], [] ], // targets - [ [], [] ], // samples - [ [], [] ], // exons - ch_bcftools_input.fasta // fasta - ) - ch_versions = ch_versions.mix( BCFTOOLS_STATS_GENOTYPING.out.versions.first() ) - } } if ( params.genotyping_tool == 'hc' ) { - // TODO ch_bams_for_multimap = ch_bam_bai .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) } - ch_fasta_for_multimap = ch_fasta_plus + ch_fasta_for_multimap = ch_fasta_plus .join( ch_dbsnp_for_gatk ) // [ [ref_meta], fasta, fai, dict, dbsnp ] .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] - ch_input_for_hc = ch_bams_for_multimap + ch_input_for_hc = ch_bams_for_multimap .combine( ch_fasta_for_multimap , by:0 ) .multiMap { ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> - bam: [ meta, bam , bai ] + bam: [ meta, bam , bai, [], [] ] // No intervals or dragSTR model inputs to HC module fasta: [ ref_meta, fasta ] fai: [ ref_meta, fai ] dict: [ ref_meta, dict ] dbsnp: [ ref_meta, dbsnp ] } + + GATK4_HAPLOTYPECALLER( + ch_input_for_hc.bam, + ch_input_for_hc.fasta, + ch_input_for_hc.fai, + ch_input_for_hc.dict, + ch_input_for_hc.dbsnp, + [[], []] // No dbsnp_tbi + ) + ch_versions = ch_versions.mix( GATK4_HAPLOTYPECALLER.out.versions.first() ) + ch_gatk_haplotypecaller_genotypes = GATK4_HAPLOTYPECALLER.out.vcf // HC results go to the HC channel (emitted as geno_gatk_hc), not the UG one } if ( params.genotyping_tool == 'freebayes' ) { @@ -197,6 +184,32 @@ workflow GENOTYPE { // TODO } + // Run BCFTOOLS_STATS on output from GATK UG, HC and Freebayes + if ( !params.skip_bcftools_stats && ( params.genotyping_tool == 'hc' || params.genotyping_tool == 'ug' || 
params.genotyping_tool == 'freebayes' ) ) { + ch_bcftools_input= ch_gatk_unifiedgenotyper_genotypes + .mix( ch_gatk_haplotypecaller_genotypes ) + .mix( ch_freebayes_genotypes ) + .map { + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, vcf, ref_meta, fasta, fai, dict, dbsnp -> + vcf: [ meta, vcf, [] ] // bcftools stats module expects a tbi file with the vcf. + fasta: [ ref_meta, fasta ] + } + + BCFTOOLS_STATS_GENOTYPING( + ch_bcftools_input.vcf, // vcf + [ [], [] ], // regions + [ [], [] ], // targets + [ [], [] ], // samples + [ [], [] ], // exons + ch_bcftools_input.fasta // fasta + ) + ch_versions = ch_versions.mix( BCFTOOLS_STATS_GENOTYPING.out.versions.first() ) + } + emit: geno_pileupcaller = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] geno_gatk_hc = ch_gatk_haplotypecaller_genotypes // [ [ meta ], vcf ] ] From 49d531d3fdf1a491d78f86c2b9e9a2cbe875c752 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Dec 2023 15:27:45 +0100 Subject: [PATCH 042/110] HC manual tests. update UG tests --- docs/development/manual_tests.md | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 8238a502a..1765809b6 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -660,14 +660,15 @@ nextflow run . -profile test,docker \ ```bash ## Gatk on raw reads -## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per bam. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' -ansi-log false -dump-channels +## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. 
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels ``` ```bash -## Gatk on trimmed reads -## Expect: One VCF per sample/reference combination, based on the trimmed bams. Also 1 bcftools_stats file per bam. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --run_trim_bam \ +## Gatk on trimmed reads. Skip bcftools stats. +## Expect: One VCF per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No bcftools_stats file per VCF. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ + --run_trim_bam \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ @@ -677,13 +678,30 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ```bash ## Gatk on pmd-filtered reads -## Expect: One VCF per sample/reference combination, based on the pmd-filtered bams. Also 1 bcftools_stats file per bam. +## Expect: One VCF per sample/reference combination, based on the pmd-filtered bams (this actually shows on the IndelRealigner step and not the UG step). Also 1 bcftools_stats file per VCF. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'pmd' -ansi-log false -dump-channels --run_pmd_filtering ## Checked that the bams had fewer reads compared to the raw bams. 
``` ```bash ## Gatk on rescaled reads -## Expect: One VCF per sample/reference combination, based on the rescaled bams. Also 1 bcftools_stats file per bam. +## Expect: One VCF per sample/reference combination, based on the rescaled bams (this actually shows on the IndelRealigner step and not the UG step). Also 1 bcftools_stats file per VCF. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'rescaled' -ansi-log false -dump-channels --run_mapdamage_rescaling ``` + +## GATK HC + +```bash +## Gatk HC on raw reads +## Expect: One VCF + .tbi index per sample/reference combination. Also 1 bcftools_stats file per VCF. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## Gatk HC on trimmed reads, with different out mode and emit confidence. Skip bcftools stats. +## Expect: One VCF + .tbi index per sample/reference combination. Also 1 bcftools_stats file per VCF. +## Checked .command.sh for correct args. 
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw' -ansi-log false -dump-channels --skip_bcftools_stats \ + --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ + --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' +``` From f86bdad21883616d6f7a2574cfa3ef77383c1333 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Dec 2023 15:29:16 +0100 Subject: [PATCH 043/110] update TODOs --- subworkflows/local/genotype.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 6d59e1326..4392a39d2 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -12,7 +12,6 @@ include { GATK4_HAPLOTYPECALLER } from '../../module include { FREEBAYES } from '../../modules/nf-core/freebayes/main' include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' // TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. -// TODO Find a way to pass ploidy and dbsnp to the GATK modules. maybe ploidy should go in all reference metas workflow GENOTYPE { take: @@ -177,11 +176,11 @@ workflow GENOTYPE { } if ( params.genotyping_tool == 'freebayes' ) { - // TODO + // TODO Freebayes module needs updating to use the new meta format. 
} if ( params.genotyping_tool == 'angsd' ) { - // TODO + // TODO no module for angsd genotyping yet } // Run BCFTOOLS_STATS on output from GATK UG, HC and Freebayes From d174435dda3e874bbb49e2491f8617e1dea93b23 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 13 Dec 2023 13:18:45 +0100 Subject: [PATCH 044/110] update freebayes module --- modules.json | 2 +- modules/nf-core/freebayes/environment.yml | 7 ++++ modules/nf-core/freebayes/main.nf | 12 +++--- modules/nf-core/freebayes/meta.yml | 49 +++++++++++++++++++---- 4 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 modules/nf-core/freebayes/environment.yml diff --git a/modules.json b/modules.json index 0cdb7079d..e7fc77e86 100644 --- a/modules.json +++ b/modules.json @@ -122,7 +122,7 @@ }, "freebayes": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "77978839bef6d437f21edb900b49bcbc04f9f735", "installed_by": ["modules"] }, "gatk/indelrealigner": { diff --git a/modules/nf-core/freebayes/environment.yml b/modules/nf-core/freebayes/environment.yml new file mode 100644 index 000000000..6846080a2 --- /dev/null +++ b/modules/nf-core/freebayes/environment.yml @@ -0,0 +1,7 @@ +name: freebayes +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::freebayes=1.3.6 diff --git a/modules/nf-core/freebayes/main.nf b/modules/nf-core/freebayes/main.nf index 1466f085e..c07895c05 100644 --- a/modules/nf-core/freebayes/main.nf +++ b/modules/nf-core/freebayes/main.nf @@ -2,18 +2,18 @@ process FREEBAYES { tag "$meta.id" label 'process_single' - conda "bioconda::freebayes=1.3.6" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/freebayes:1.3.6--hbfe0e7f_2' : 'biocontainers/freebayes:1.3.6--hbfe0e7f_2' }" input: tuple val(meta), path(input_1), path(input_1_index), path(input_2), path(input_2_index), path(target_bed) - path fasta - path fasta_fai - path samples - path populations - path cnv + tuple val(ref_meta), path(fasta) + tuple val(ref_idx_meta), path(fasta_fai) + tuple val(samples_meta), path(samples) + tuple val(populations_meta), path(populations) + tuple val(cnv_meta), path(cnv) output: tuple val(meta), path("*.vcf.gz"), emit: vcf diff --git a/modules/nf-core/freebayes/meta.yml b/modules/nf-core/freebayes/meta.yml index 17d83cba2..1803b2b31 100644 --- a/modules/nf-core/freebayes/meta.yml +++ b/modules/nf-core/freebayes/meta.yml @@ -8,7 +8,6 @@ keywords: - germline variant calling - bacterial variant calling - bayesian - tools: - freebayes: description: Bayesian haplotype-based polymorphism discovery and genotyping @@ -17,18 +16,25 @@ tools: tool_dev_url: https://github.com/freebayes/freebayes doi: "10.48550/arXiv.1207.3907" licence: ["MIT"] - input: - meta: type: map description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - input: + - input_1: type: file description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - - input_index: + - input_1_index: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai}" + - input_2: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_2_index: type: file description: BAM/CRAM/SAM index file pattern: "*.{bai,crai}" @@ -36,22 +42,47 @@ input: type: file description: Optional - Limit analysis to targets listed in this BED-format FILE. pattern: "*.bed" + - ref_meta: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test_reference' ] - fasta: type: file description: reference fasta file pattern: ".{fa,fa.gz,fasta,fasta.gz}" + - ref_idx_meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test_reference' ] - fasta_fai: type: file description: reference fasta file index pattern: "*.{fa,fasta}.fai" + - samples_meta: + type: map + description: | + Groovy Map containing meta information for the samples file. + e.g. [ id:'test_samples' ] - samples: type: file description: Optional - Limit analysis to samples listed (one per line) in the FILE. pattern: "*.txt" + - populations_meta: + type: map + description: | + Groovy Map containing meta information for the populations file. + e.g. [ id:'test_populations' ] - populations: type: file description: Optional - Each line of FILE should list a sample and a population which it is part of. pattern: "*.txt" + - cnv_meta: + type: map + description: | + Groovy Map containing meta information for the cnv file. + e.g. [ id:'test_cnv' ] - cnv: type: file description: | @@ -60,23 +91,25 @@ input: or a region-specific format: seq_name start end sample_name copy_number pattern: "*.bed" - output: - meta: type: map description: | Groovy Map containing sample information e.g. 
[ id:'test', single_end:false ] - - version: + - versions: type: file description: File containing software version - pattern: "*.{version.txt}" + pattern: "versions.yml" - vcf: type: file description: Compressed VCF file pattern: "*.vcf.gz" - authors: - "@maxibor" - "@FriederikeHanssen" - "@maxulysse" +maintainers: + - "@maxibor" + - "@FriederikeHanssen" + - "@maxulysse" From 297ea51c4199302af0967cc1a2125ddd7ef4ffdc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 13 Dec 2023 14:37:50 +0100 Subject: [PATCH 045/110] Add Freebayes --- conf/modules.config | 16 +++++++++++++++ nextflow.config | 5 ++++- nextflow_schema.json | 21 ++++++++++++++++++++ subworkflows/local/genotype.nf | 36 +++++++++++++++++++++++++++++++++- 4 files changed, 76 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 5509266a1..8154b9d0f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -920,6 +920,22 @@ process { ] } + withName: FREEBAYES { + tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + ext.args = [ + "-C ${params.genotyping_freebayes_min_alternate_count}", + "-p ${params.genotyping_freebayes_ploidy}", + params.genotyping_freebayes_skip_coverage != 0 ? 
"-g ${params.genotyping_freebayes_skip_coverage}" : "", + ].join(' ').trim() + ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.vcf.gz' + ] + } + withName: BCFTOOLS_STATS_GENOTYPING { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } diff --git a/nextflow.config b/nextflow.config index 26c8ba190..b27b9d5b0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -194,6 +194,7 @@ params { damage_manipulation_bamutils_softclip = false // Genotyping + // TODO GATK and FREEBAYES ploidy options should be merged run_genotyping = false genotyping_tool = null genotyping_source = null @@ -214,7 +215,9 @@ params { genotyping_gatk_ug_keep_realign_bam = false genotyping_gatk_hc_out_mode = 'EMIT_VARIANTS_ONLY' genotyping_gatk_hc_emitrefconf = 'GVCF' - + genotyping_freebayes_min_alternate_count = 1 + genotyping_freebayes_ploidy = 2 + genotyping_freebayes_skip_coverage = 0 } // Load base.config by default for all pipelines diff --git a/nextflow_schema.json b/nextflow_schema.json index afd398e4f..5bbc2a36c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -997,6 +997,27 @@ "description": "Specify HaplotypeCaller mode for emitting reference confidence calls.", "help_text": "If GATK HaplotypeCaller is selected as the genotyping tool, this sets the mode for emitting reference confidence calls.\n\n> Modifies GATK HaplotypeCaller parameter: `--emit-ref-confidence`", "enum": ["NONE", "BP_RESOLUTION", "GVCF"] + }, + "genotyping_freebayes_min_alternate_count": { + "type": "integer", + "default": 1, + "description": "Specify minimum required supporting observations of an alternate allele to consider a variant.", + "help_text": "Require at least this count of observations supporting an alternate allele within a single individual in order to 
evaluate the position.\n\n> Modifies freebayes parameter: `-C`", + "fa_icon": "fas fa-align-center" + }, + "genotyping_freebayes_skip_coverage": { + "type": "integer", + "default": 0, + "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified.", + "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than the specified value. Setting to 0 (the default) deactivates this behaviour.\n\n> Modifies freebayes parameter: `-g`", + "fa_icon": "fab fa-think-peaks" + }, + "genotyping_freebayes_ploidy": { + "type": "integer", + "default": 2, + "description": "Specify ploidy of sample in FreeBayes.", + "help_text": "Specify ploidy of sample in FreeBayes.\n\n> Modifies freebayes parameter: `-p`", + "fa_icon": "fas fa-pastafarianism" } }, "fa_icon": "fas fa-sliders-h", diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 4392a39d2..207e20bc9 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -176,7 +176,41 @@ workflow GENOTYPE { } if ( params.genotyping_tool == 'freebayes' ) { - // TODO Freebayes module needs updating to use the new meta format. + ch_bams_for_multimap = ch_bam_bai + .map { + // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + + // TODO Do we want to provide SNP capture bed file to Freebayes? It would then genotype only on those positions. + // NOTE: dbsnp is not used by Freebayes, but we need to provide it to the module anyway, to ensure correct cardinality of the fasta channel within the BCFTOOLS_STATS channel operations. 
+ ch_fasta_for_multimap = ch_fasta_plus + .join( ch_dbsnp_for_gatk ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + .map { + // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] + + ch_input_for_freebayes = ch_bams_for_multimap + .combine( ch_fasta_for_multimap , by:0 ) + .multiMap { + ignore_me, meta, bam, bai, ref_meta, fasta, fai, dict, dbsnp -> + bam: [ meta, bam , bai, [], [], [] ] // No second bam, second bai, or regions-BED file + fasta: [ ref_meta, fasta ] + fai: [ ref_meta, fai ] + } + + // TODO: Should the vcfs be indexed with bcftools index? VCFs from HC are indexed. + FREEBAYES( + ch_input_for_freebayes.bam, + ch_input_for_freebayes.fasta, + ch_input_for_freebayes.fai, + [ [], [] ], // No samples file + [ [], [] ], // No populations file + [ [], [] ] // No CNV file + ) + ch_versions = ch_versions.mix( FREEBAYES.out.versions.first() ) + ch_freebayes_genotypes = FREEBAYES.out.vcf } if ( params.genotyping_tool == 'angsd' ) { From 8f1aa9d0c7571a23540f2714b3139226f463ab45 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 13 Dec 2023 14:37:55 +0100 Subject: [PATCH 046/110] manual tests --- docs/development/manual_tests.md | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 1765809b6..c2786215a 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -699,9 +699,28 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ```bash ## Gatk HC on trimmed reads, with different out mode and emit confidence. Skip bcftools stats. -## Expect: One VCF + .tbi index per sample/reference combination. Also 1 bcftools_stats file per VCF. +## Expect: One VCF + .tbi index per sample/reference combination. 
## Checked .command.sh for correct args. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' ``` + +## FREEBAYES + +```bash +## Freebayes on raw reads +## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## Freebayes on trimmed reads. Different options, and skip bcftools stats. +## Expect: One VCF per sample/reference combination. +## Checked .command.sh for correct args. 
+nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ + --run_trim_bam \ + --genotyping_freebayes_skip_coverage 10 \ + --genotyping_freebayes_min_alternate_count 2 \ + --genotyping_freebayes_ploidy 1 +``` From 68386e4662749f4c43a93cab770db0d9a524cdf3 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 12 Jan 2024 16:37:30 +0100 Subject: [PATCH 047/110] add pileupcaller aux files --- conf/test_humanbam.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/test_humanbam.config b/conf/test_humanbam.config index 47ba00e93..d2badc5c1 100644 --- a/conf/test_humanbam.config +++ b/conf/test_humanbam.config @@ -38,8 +38,8 @@ params { // //Sex Determination // sexdeterrmine_bedfile = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' // // Genotyping - // pileupcaller_bedfile = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' - // pileupcaller_snpfile = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K_covered_in_JK2067_downsampled_s0.1.numeric_chromosomes.snp' + genotyping_pileupcaller_bedfile = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' + genotyping_pileupcaller_snpfile = 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K_covered_in_JK2067_downsampled_s0.1.numeric_chromosomes.snp' // BAM filtering From a963652405b43269e2edd00805f976a359187158 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 12 Jan 2024 16:37:59 +0100 Subject: [PATCH 048/110] remove old dbsnp input. 
fix genotyping swf cardinality --- workflows/eager.nf | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 897358733..661dc2c20 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -135,13 +135,6 @@ workflow EAGER { if ( params.preprocessing_tool == 'fastp' && !adapterlist.extension.matches(".*(fa|fasta|fna|fas)") ) error "[nf-core/eager] ERROR: fastp adapter list requires a `.fasta` format and extension (or fa, fas, fna). Check input: --preprocessing_adapterlist ${params.preprocessing_adapterlist}" } - - // GATK dbSNP - if ( params.genotyping_gatk_dbsnp ) { - ch_dbsnp = Channel.fromPath(params.genotyping_gatk_dbsnp, checkIfExists: true) - } else { - ch_dbsnp = Channel.empty() - } // // SUBWORKFLOW: Read in samplesheet, validate and stage input files // @@ -558,8 +551,7 @@ workflow EAGER { meta, fasta, fai, dict, mapindex, circular_target -> [ meta, fasta, fai, dict ] } - - GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, [], [], [], REFERENCE_INDEXING.out.dbsnp ) + GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, REFERENCE_INDEXING.out.pileupcaller_bed_snp, REFERENCE_INDEXING.out.dbsnp ) ch_versions = ch_versions.mix( GENOTYPE.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) From b070eea7b350630cb0b9e1306c8ce4dc6156dc4f Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 12 Jan 2024 16:58:13 +0100 Subject: [PATCH 049/110] add pileupcaller bed and snp files --- subworkflows/local/reference_indexing.nf | 17 +++---- .../local/reference_indexing_multi.nf | 44 +++++++++---------- .../local/reference_indexing_single.nf | 40 ++++++++--------- 3 files changed, 51 insertions(+), 50 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 0c2d31951..dbe0a3ba1 100644 --- a/subworkflows/local/reference_indexing.nf +++ 
b/subworkflows/local/reference_indexing.nf @@ -28,7 +28,7 @@ workflow REFERENCE_INDEXING { ch_hapmap = REFERENCE_INDEXING_MULTI.out.hapmap ch_pmd_mask = REFERENCE_INDEXING_MULTI.out.pmd_mask ch_snp_capture_bed = REFERENCE_INDEXING_MULTI.out.snp_capture_bed - ch_pileupcaller_snp = REFERENCE_INDEXING_MULTI.out.pileupcaller_snp + ch_pileupcaller_bed_snp = REFERENCE_INDEXING_MULTI.out.pileupcaller_bed_snp ch_sexdeterrmine_bed = REFERENCE_INDEXING_MULTI.out.sexdeterrmine_bed ch_bedtools_feature = REFERENCE_INDEXING_MULTI.out.bedtools_feature ch_dbsnp = REFERENCE_INDEXING_MULTI.out.dbsnp @@ -40,7 +40,7 @@ workflow REFERENCE_INDEXING { ch_hapmap = REFERENCE_INDEXING_SINGLE.out.hapmap ch_pmd_mask = REFERENCE_INDEXING_SINGLE.out.pmd_mask ch_snp_capture_bed = REFERENCE_INDEXING_SINGLE.out.snp_capture_bed - ch_pileupcaller_snp = REFERENCE_INDEXING_SINGLE.out.pileupcaller_snp + ch_pileupcaller_bed_snp = REFERENCE_INDEXING_SINGLE.out.pileupcaller_bed_snp ch_sexdeterrmine_bed = REFERENCE_INDEXING_SINGLE.out.sexdeterrmine_bed ch_bedtools_feature = REFERENCE_INDEXING_SINGLE.out.bedtools_feature ch_reference_for_mapping = REFERENCE_INDEXING_SINGLE.out.reference @@ -73,16 +73,17 @@ workflow REFERENCE_INDEXING { GUNZIP_SNPBED( ch_capture_bed_gunzip.forgunzip ) ch_capture_bed = GUNZIP_SNPBED.out.gunzip.mix( ch_capture_bed_gunzip.skip ).mix( ch_capture_bed.skip ) - ch_pileupcaller_snp = ch_pileupcaller_snp - .filter{ it[1] != "" && it[2] != "" } + ch_pileupcaller_bed_snp = ch_pileupcaller_bed_snp + .filter { it[1] != "" && it[2] != "" } ch_sexdeterrmine_bed = ch_sexdeterrmine_bed - .filter{ it[1] != "" } + .filter { it[1] != "" } ch_bedtools_feature = ch_bedtools_feature - .filter{ it[1] != "" } + .filter { it[1] != "" } - // TODO-DEV No filtering dbsnp cause we always need the ploidy value from its meta. Will probably need a reference sheet validator to fix this. 
+ ch_dbsnp = ch_dbsnp + .filter { it[1] != "" } emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] @@ -90,7 +91,7 @@ workflow REFERENCE_INDEXING { hapmap = ch_hapmap // [ meta, hapmap ] pmd_mask = ch_pmd_mask // [ meta, masked_fasta, capture_bed ] snp_capture_bed = ch_capture_bed // [ meta, capture_bed ] - pileupcaller_snp = ch_pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] + pileupcaller_bed_snp = ch_pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed = ch_sexdeterrmine_bed // [ meta, sexdet_bed ] bedtools_feature = ch_bedtools_feature // [ meta, bedtools_feature ] dbsnp = ch_dbsnp // [ meta, dbsnp ] diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index af2966c20..51f98af9f 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -33,19 +33,19 @@ workflow REFERENCE_INDEXING_MULTI { meta.id = row["reference_name"] meta.ploidy = row["genotyping_gatk_ploidy"] != "" ? row["genotyping_gatk_ploidy"] : params.genotyping_gatk_ploidy // Use default value if none is specified. This info goes in the meta def fasta = file(row["fasta"], checkIfExists: true) // mandatory parameter! - def fai = row["fai"] != "" ? file(row["fai"], checkIfExists: true) : "" - def dict = row["dict"] != "" ? file(row["dict"], checkIfExists: true) : "" - def mapper_index = row["mapper_index"] != "" ? file(row["mapper_index"], checkIfExists: true) : "" + def fai = row["fai"] != "" ? file(row["fai"] , checkIfExists: true) : "" + def dict = row["dict"] != "" ? file(row["dict"] , checkIfExists: true) : "" + def mapper_index = row["mapper_index"] != "" ? file(row["mapper_index"] , checkIfExists: true) : "" def circular_target = row["circular_target"] def mitochondrion = row["mitochondrion_header"] - def capture_bed = row["snpcapture_bed"] != "" ? 
file(row["snpcapture_bed"], checkIfExists: true) : "" - def pileupcaller_bed = row["pileupcaller_bedfile"] != "" ? file(row["pileupcaller_bedfile"], checkIfExists: true) : "" - def pileupcaller_snp = row["pileupcaller_snpfile"] != "" ? file(row["pileupcaller_snpfile"], checkIfExists: true) : "" - def hapmap = row["hapmap_file"] != "" ? file(row["hapmap_file"], checkIfExists: true) : "" - def pmd_mask = row["pmdtools_masked_fasta"] != "" ? file(row["pmdtools_masked_fasta"], checkIfExists: true) : "" - def sexdet_bed = row["sexdeterrmine_snp_bed"] != "" ? file(row["sexdeterrmine_snp_bed"], checkIfExists: true) : "" - def bedtools_feature = row["bedtools_feature_file"] != "" ? file(row["bedtools_feature_file"], checkIfExists: true) : "" - def genotyping_gatk_dbsnp = row["genotyping_gatk_dbsnp"] != "" ? file(row["genotyping_gatk_dbsnp"], checkIfExists: true) : "" + def capture_bed = row["snpcapture_bed"] != "" ? file(row["snpcapture_bed"] , checkIfExists: true) : "" + def pileupcaller_bed = row["pileupcaller_bedfile"] != "" ? file(row["pileupcaller_bedfile"] , checkIfExists: true) : "" + def pileupcaller_snp = row["pileupcaller_snpfile"] != "" ? file(row["pileupcaller_snpfile"] , checkIfExists: true) : "" + def hapmap = row["hapmap_file"] != "" ? file(row["hapmap_file"] , checkIfExists: true) : "" + def pmd_mask = row["pmdtools_masked_fasta"] != "" ? file(row["pmdtools_masked_fasta"] , checkIfExists: true) : "" + def sexdet_bed = row["sexdeterrmine_snp_bed"] != "" ? file(row["sexdeterrmine_snp_bed"] , checkIfExists: true) : "" + def bedtools_feature = row["bedtools_feature_file"] != "" ? file(row["bedtools_feature_file"] , checkIfExists: true) : "" + def genotyping_gatk_dbsnp = row["genotyping_gatk_dbsnp"] != "" ? 
file(row["genotyping_gatk_dbsnp"] , checkIfExists: true) : "" [ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion, capture_bed, pileupcaller_bed, pileupcaller_snp, hapmap, pmd_mask, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] } @@ -69,10 +69,10 @@ ch_input_from_referencesheet = ch_splitreferencesheet_for_branch angsd_hapmap: [ meta, hapmap ] pmd_mask: [ meta, pmd_mask, capture_bed ] snp_bed: [ meta, capture_bed ] - pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] + pileupcaller_bed_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] sexdeterrmine_bed: [ meta, sexdet_bed ] bedtools_feature: [ meta, bedtools_feature ] - dbsnp: [ meta, genotyping_gatk_dbsnp ] // Include ploidy of the reference in dbsnp meta. + dbsnp: [ meta, genotyping_gatk_dbsnp ] } // Detect if fasta is gzipped or not @@ -211,14 +211,14 @@ ch_input_from_referencesheet = ch_splitreferencesheet_for_branch ch_indexmapper_for_reference = ch_fasta_for_mapperindex.skip.mix(ch_indexed_formix) emit: - reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex, circular_target ] - mitochondrion_header = ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] - hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] - pmd_mask = ch_input_from_referencesheet.pmd_mask // [ meta, pmd_mask, capture_bed ] - snp_capture_bed = ch_input_from_referencesheet.snp_bed // [ meta, capture_bed ] - pileupcaller_snp = ch_input_from_referencesheet.pileupcaller_snp // [ meta, pileupcaller_snp, pileupcaller_bed ] - sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] - bedtools_feature = ch_input_from_referencesheet.bedtools_feature // [ meta, bedtools_feature ] - dbsnp = ch_input_from_referencesheet.dbsnp // [ meta, genotyping_gatk_dbsnp ] + reference = ch_indexmapper_for_reference // [ meta, fasta, fai, dict, mapindex, circular_target ] + mitochondrion_header = 
ch_input_from_referencesheet.mitochondrion_header // [ meta, mitochondrion ] + hapmap = ch_input_from_referencesheet.angsd_hapmap // [ meta, hapmap ] + pmd_mask = ch_input_from_referencesheet.pmd_mask // [ meta, pmd_mask, capture_bed ] + snp_capture_bed = ch_input_from_referencesheet.snp_bed // [ meta, capture_bed ] + pileupcaller_bed_snp = ch_input_from_referencesheet.pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed = ch_input_from_referencesheet.sexdeterrmine_bed // [ meta, sexdet_bed ] + bedtools_feature = ch_input_from_referencesheet.bedtools_feature // [ meta, bedtools_feature ] + dbsnp = ch_input_from_referencesheet.dbsnp // [ meta, genotyping_gatk_dbsnp ] versions = ch_versions } diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index 9aa19beab..97e926f20 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -83,8 +83,8 @@ workflow REFERENCE_INDEXING_SINGLE { def contamination_estimation_angsd_hapmap = params.contamination_estimation_angsd_hapmap != null ? file( params.contamination_estimation_angsd_hapmap, checkIfExists: true ) : "" def pmd_mask = params.damage_manipulation_pmdtools_reference_mask != null ? file(params.damage_manipulation_pmdtools_reference_mask, checkIfExists: true ) : "" def capture_bed = params.snpcapture_bed != null ? file(params.snpcapture_bed, checkIfExists: true ) : "" - def pileupcaller_bed = "" - def pileupcaller_snp = "" + def pileupcaller_bed = params.genotyping_pileupcaller_bedfile != null ? file(params.genotyping_pileupcaller_bedfile, checkIfExists: true ) : "" + def pileupcaller_snp = params.genotyping_pileupcaller_snpfile != null ? file(params.genotyping_pileupcaller_snpfile, checkIfExists: true ) : "" def sexdet_bed = "" def bedtools_feature = params.mapstats_bedtools_featurefile != null ? 
file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" def genotyping_gatk_ploidy = params.genotyping_gatk_ploidy @@ -95,27 +95,27 @@ workflow REFERENCE_INDEXING_SINGLE { ch_ref_index_single = ch_reference_for_mapping .multiMap{ meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> - reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] - mito_header: [ meta, mitochondrion_header ] - hapmap: [ meta, contamination_estimation_angsd_hapmap ] - pmd_mask: [ meta, pmd_mask, capture_bed ] - snp_bed: [ meta, capture_bed ] - pileupcaller_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] - sexdeterrmine_bed: [ meta, sexdet_bed ] - bedtools_feature: [ meta, bedtools_feature ] - dbsnp: [ meta, genotyping_gatk_dbsnp ] + reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] + mito_header: [ meta, mitochondrion_header ] + hapmap: [ meta, contamination_estimation_angsd_hapmap ] + pmd_mask: [ meta, pmd_mask, capture_bed ] + snp_bed: [ meta, capture_bed ] + pileupcaller_bed_snp: [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed: [ meta, sexdet_bed ] + bedtools_feature: [ meta, bedtools_feature ] + dbsnp: [ meta, genotyping_gatk_dbsnp ] } emit: - reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex, circular_target ] - mitochondrion_header = ch_ref_index_single.mito_header // [ meta, mito_header ] - hapmap = ch_ref_index_single.hapmap // [ meta, hapmap ] - pmd_mask = ch_ref_index_single.pmd_mask // [ meta, pmd_mask, capture_bed ] - snp_capture_bed = ch_ref_index_single.snp_bed // [ meta, capture_bed ] - pileupcaller_snp = ch_ref_index_single.pileupcaller_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] - sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] - bedtools_feature = 
ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] - dbsnp = ch_ref_index_single.dbsnp // [ meta, genotyping_gatk_dbsnp ] + reference = ch_ref_index_single.reference // [ meta, fasta, fai, dict, mapindex, circular_target ] + mitochondrion_header = ch_ref_index_single.mito_header // [ meta, mito_header ] + hapmap = ch_ref_index_single.hapmap // [ meta, hapmap ] + pmd_mask = ch_ref_index_single.pmd_mask // [ meta, pmd_mask, capture_bed ] + snp_capture_bed = ch_ref_index_single.snp_bed // [ meta, capture_bed ] + pileupcaller_bed_snp = ch_ref_index_single.pileupcaller_bed_snp // [ meta, pileupcaller_bed, pileupcaller_snp ] + sexdeterrmine_bed = ch_ref_index_single.sexdeterrmine_bed // [ meta, sexdet_bed ] + bedtools_feature = ch_ref_index_single.bedtools_feature // [ meta, bedtools_feature ] + dbsnp = ch_ref_index_single.dbsnp // [ meta, genotyping_gatk_dbsnp ] versions = ch_versions } From 69dae3122b5fa234abb75356f4ac1a8a4dce9ab1 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 12 Jan 2024 16:58:45 +0100 Subject: [PATCH 050/110] Add pileupcaller. simplify input channels. 
--- subworkflows/local/genotype.nf | 74 +++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 207e20bc9..0be5d6958 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -15,12 +15,10 @@ include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../module workflow GENOTYPE { take: - ch_bam_bai // [ [ meta ], bam , bai ] - ch_fasta_plus // [ [ meta ], fasta, fai, dict ] - ch_snpcapture_bed // [ [ meta ], bed ] - ch_pileupcaller_bedfile // [ [ meta ], bed ] - ch_pileupcaller_snpfile // [ [ meta ], snp ] - ch_dbsnp // [ [ meta ], dbsnp ] + ch_bam_bai // [ [ meta ], bam , bai ] + ch_fasta_plus // [ [ meta ], fasta, fai, dict ] + ch_pileupcaller_aux_files // [ [ meta ], bed, snp ] + ch_dbsnp // [ [ meta ], dbsnp ] main: ch_versions = Channel.empty() @@ -41,12 +39,66 @@ workflow GENOTYPE { } if ( params.genotyping_tool == 'pileupcaller' ) { - // SAMTOOLS_MPILEUP_PILEUPCALLER( ch_bam_bai, ch_fasta_plus ) - /* - // TODO - this is not working yet. Need snpcapture Bed and pileupcaller snp file to add here. 
- SEQUENCETOOLS_PILEUPCALLER( ch_bam_bai, ch_fasta_plus, ch_versions, ch_multiqc_files ) - */ + // Compile together all reference based files + ch_refs_for_mpileup_pileupcaller = ch_fasta_plus + .join( ch_pileupcaller_aux_files ) // [ [ref_meta], fasta, fai, dict, bed, snp ] + .map { + // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute + WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, bed, snp ] + + // Prepare collect bams for mpileup + ch_mpileup_inputs_bams = ch_bam_bai + .map { + WorkflowEager.addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) + } + .groupTuple() + .map { + combo_meta, metas, bams, bais -> + def ids = metas.collect { meta -> meta.id } + [ combo_meta + [id: ids], bams ] // Drop bais + } // Collect all IDs into a list in meta.id. Useful when running pileupCaller later + + // Combine prepped bams and references + ch_mpileup_inputs = ch_mpileup_inputs_bams + .map { + WorkflowEager.addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) + .multiMap { + ignore_me, combo_meta, bams, ref_meta, fasta, fai, dict, bed, snp -> + def bedfile = bed != "" ? bed : [] + bams: [ combo_meta, bams, bedfile ] + fasta: [ fasta ] + } + SAMTOOLS_MPILEUP_PILEUPCALLER( + ch_mpileup_inputs.bams, + ch_mpileup_inputs.fasta + ) + ch_versions = ch_versions.mix( SAMTOOLS_MPILEUP_PILEUPCALLER.out.versions.first() ) + + ch_pileupcaller_input = SAMTOOLS_MPILEUP_PILEUPCALLER.out.mpileup + .map { + WorkflowEager.addNewMetaFromAttributes( it, "reference", "reference" , false ) + } + .combine( ch_refs_for_mpileup_pileupcaller, by:0 ) + .multiMap { + ignore_me, meta, mpileup, ref_meta, fasta, fai, dict, bed, snp -> + def snpfile = snp != "" ? 
snp : [] + mpileup: [ meta, mpileup ] + snpfile: [ snpfile ] + } + + // TODO NOTE: Maybe implement a check that unmerged R2 reads have not been kept and throw a warning for ssDNA libs? See: https://github.com/stschiff/sequenceTools/issues/24 + ch_pileupcaller_input.mpileup.dump(tag:"mpileup", pretty: true) + // Run PileupCaller + SEQUENCETOOLS_PILEUPCALLER( + ch_pileupcaller_input.mpileup, + ch_pileupcaller_input.snpfile, + [] + ) + ch_versions = ch_versions.mix( SEQUENCETOOLS_PILEUPCALLER.out.versions.first() ) } if ( params.genotyping_tool == 'ug' ) { From 6371e9f292152d623f0823c9af761b366c531707 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 12 Jan 2024 16:59:04 +0100 Subject: [PATCH 051/110] add pileupcaller and samtools mpileup --- conf/modules.config | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/conf/modules.config b/conf/modules.config index 8154b9d0f..4edff681e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -858,6 +858,41 @@ process { // // GENOTYPING // + + withName: SAMTOOLS_MPILEUP_PILEUPCALLER { + tag = { "${meta.reference}|${meta.strandedness}" } + ext.args = [ + "-B", + "-q ${params.genotyping_pileupcaller_min_map_quality}", // samtools mpileup -q: minimum mapping quality + "-Q ${params.genotyping_pileupcaller_min_base_quality}", // samtools mpileup -Q: minimum base quality + // "--ignore-RG", + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // For some reason, GATK complains if its default of -1 is actually provided ?_? NOTE(review): this looks like a GATK UnifiedGenotyper option rather than a samtools mpileup flag - confirm and remove + ].join(' ').trim() + ext.prefix = { "${meta.strandedness}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + + withName: SEQUENCETOOLS_PILEUPCALLER { + tag = { "${meta.reference}|${meta.strandedness}" } + ext.args = {[ + "--${params.genotyping_pileupcaller_method}", + params.genotyping_pileupcaller_transitions_mode == "SkipTransitions" ?
"--skipTransitions" : params.genotyping_pileupcaller_transitions_mode == "TransitionsMissing" ? "--transitionsMissing" : "", + "${meta.strandedness}" == 'single' ? "--singleStrandMode" : "" , // only passed for single-stranded libraries + "--sampleNames", meta.id.join(","), + "-e pileupcaller.${meta.strandedness}.${meta.reference}" + ].join(' ').trim() } + ext.prefix = { "${meta.strandedness}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true + ] + } + withName: GATK_REALIGNERTARGETCREATOR { tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } ext.args = [ From 0f1d71baa575d9604ed096b4991239d62abf36dc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jan 2024 17:12:41 +0100 Subject: [PATCH 052/110] no mpileup output. add pattern to pileupcaller --- conf/modules.config | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 4edff681e..4b5db5657 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -870,9 +870,9 @@ process { ].join(' ').trim() ext.prefix = { "${meta.strandedness}_${meta.reference}" } publishDir = [ - path: { "${params.outdir}/genotyping/" }, - mode: params.publish_dir_mode, - enabled: true + // path: { "${params.outdir}/genotyping/" }, + // mode: params.publish_dir_mode, + enabled: false ] } @@ -889,7 +889,8 @@ process { publishDir = [ path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, - enabled: true + enabled: true, + pattern: '*.{geno,snp,ind}' ] } From c7042c05e7c0cf4ca04f081243209b5621091c06 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jan 2024 17:14:55 +0100 Subject: [PATCH 053/110] deal with optional files.
--- subworkflows/local/genotype.nf | 74 ++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 0be5d6958..175f96ead 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -30,19 +30,28 @@ workflow GENOTYPE { ch_angsd_genotypes = Channel.empty() ch_bcftools_stats = Channel.empty() - // Replace missing dbsnps with empty lists - ch_dbsnp_for_gatk = ch_dbsnp - .map { - meta, dbsnp -> - final_dbsnp = dbsnp != "" ? dbsnp : [] - [ meta, final_dbsnp ] - } - if ( params.genotyping_tool == 'pileupcaller' ) { // Compile together all reference based files - ch_refs_for_mpileup_pileupcaller = ch_fasta_plus - .join( ch_pileupcaller_aux_files ) // [ [ref_meta], fasta, fai, dict, bed, snp ] + ch_refs_prep = ch_fasta_plus + // Because aux files are optional, the channel can be [[],[],[]]. remainder:true will output both the empty list and the fasta_plus channel with an added 'null'. 
+ .join( ch_pileupcaller_aux_files, remainder: true ) // [ [ref_meta], fasta, fai, dict, bed, snp ] + // Also filter out the empty list aux_files (meta == []) + .filter { it[0] != [] } + // .branch to separate successfully joined from unsuccessfully joined elements + .branch { + it -> + no_aux: it[4] == null + has_aux: true + } + + // mix the two branches back together after fixing cardinality + ch_refs_for_mpileup_pileupcaller = ch_refs_prep.no_aux + .map { + ref_meta, fasta, fai, dict, empty -> + [ ref_meta, fasta, fai, dict, [], [] ] + } + .mix( ch_refs_prep.has_aux ) .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) @@ -66,6 +75,8 @@ workflow GENOTYPE { WorkflowEager.addNewMetaFromAttributes( it, "reference", "reference" , false ) } .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) + // do not run if no bed file is provided + .filter { it[7] != []} .multiMap { ignore_me, combo_meta, bams, ref_meta, fasta, fai, dict, bed, snp -> def bedfile = bed != "" ? bed : [] @@ -74,7 +85,7 @@ workflow GENOTYPE { } SAMTOOLS_MPILEUP_PILEUPCALLER( ch_mpileup_inputs.bams, - ch_mpileup_inputs.fasta + ch_mpileup_inputs.fasta, ) ch_versions = ch_versions.mix( SAMTOOLS_MPILEUP_PILEUPCALLER.out.versions.first() ) @@ -85,13 +96,12 @@ workflow GENOTYPE { .combine( ch_refs_for_mpileup_pileupcaller, by:0 ) .multiMap { ignore_me, meta, mpileup, ref_meta, fasta, fai, dict, bed, snp -> - def snpfile = snp != "" ? snp : [] + // def snpfile = snp != "" ? snp : [] mpileup: [ meta, mpileup ] - snpfile: [ snpfile ] + snpfile: snp } // TODO NOTE: Maybe implement a check that unmerged R2 reads have not been kept and throw a warning for ssDNA libs?
See: https://github.com/stschiff/sequenceTools/issues/24 - ch_pileupcaller_input.mpileup.dump(tag:"mpileup", pretty: true) // Run PileupCaller SEQUENCETOOLS_PILEUPCALLER( ch_pileupcaller_input.mpileup, @@ -99,6 +109,8 @@ workflow GENOTYPE { [] ) ch_versions = ch_versions.mix( SEQUENCETOOLS_PILEUPCALLER.out.versions.first() ) + + // TODO If both ds and ss data are present, merge the two datasets per reference together with paste (both have the same snp/bed file) } if ( params.genotyping_tool == 'ug' ) { @@ -110,7 +122,16 @@ workflow GENOTYPE { } ch_fasta_for_multimap = ch_fasta_plus - .join( ch_dbsnp_for_gatk ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + // Because dbsnp is optional, the channel can be [[],[]]. remainder:true will output both the empty list and the fasta_plus channel with an added 'null'. + .join( ch_dbsnp, remainder:true ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + // Also filter out the empty list dbsnp (meta == []) + .filter { it[0] != [] } + // convert added null dbsnp into an empty list + .map { + ref_meta, fasta, fai, dict, dbsnp -> + def final_dbsnp = dbsnp != null ? dbsnp : [] + [ ref_meta, fasta, fai, dict, final_dbsnp ] + } .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) @@ -198,7 +219,16 @@ workflow GENOTYPE { } ch_fasta_for_multimap = ch_fasta_plus - .join( ch_dbsnp_for_gatk ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + // Because dbsnp is optional, the channel can be [[],[]]. remainder:true will output both the empty list and the fasta_plus channel with an added 'null'. + .join( ch_dbsnp, remainder:true ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + // Also filter out the empty list dbsnp (meta == []) + .filter { it[0] != [] } + // convert added null dbsnp into an empty list + .map { + ref_meta, fasta, fai, dict, dbsnp -> + def final_dbsnp = dbsnp != null ? 
dbsnp : [] + [ ref_meta, fasta, fai, dict, final_dbsnp ] + } .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) @@ -236,8 +266,18 @@ workflow GENOTYPE { // TODO Do we want to provide SNP capture bed file to Freebayes? It would then genotype only on those positions. // NOTE: dbsnp is not used by Freebayes, but we need to provide it to the module anyway, to ensure correct cardinality of the fasta channel within the BCFTOOLS_STATS channel operations. + // i.e. to keep the definition of the ch_fasta_for_multimap channel consistent regardless of genotyper, so the `combine -> multiMap` in lines 327-328 work. ch_fasta_for_multimap = ch_fasta_plus - .join( ch_dbsnp_for_gatk ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + // Because dbsnp is optional, the channel can be [[],[]]. remainder:true will output both the empty list and the fasta_plus channel with an added 'null'. + .join( ch_dbsnp, remainder:true ) // [ [ref_meta], fasta, fai, dict, dbsnp ] + // Also filter out the empty list dbsnp (meta == []) + .filter { it[0] != [] } + // convert added null dbsnp into an empty list + .map { + ref_meta, fasta, fai, dict, dbsnp -> + def final_dbsnp = dbsnp != null ? 
dbsnp : [] + [ ref_meta, fasta, fai, dict, final_dbsnp ] + } .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) From d4dc6f9c44e9ed1a3739896194027fbb48dafabc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jan 2024 17:15:33 +0100 Subject: [PATCH 054/110] clearer formatting of Genotyping call --- workflows/eager.nf | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 661dc2c20..f01aef487 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -551,7 +551,12 @@ workflow EAGER { meta, fasta, fai, dict, mapindex, circular_target -> [ meta, fasta, fai, dict ] } - GENOTYPE( ch_bams_for_genotyping, ch_reference_for_genotyping, REFERENCE_INDEXING.out.pileupcaller_bed_snp, REFERENCE_INDEXING.out.dbsnp ) + GENOTYPE( + ch_bams_for_genotyping, // [ [meta] , bam, bai ] + ch_reference_for_genotyping, // [ [ref_meta] , fasta, fai, dict ] + REFERENCE_INDEXING.out.pileupcaller_bed_snp.ifEmpty([[],[],[]]), // [ [ref_meta] , bed, snp ] + REFERENCE_INDEXING.out.dbsnp.ifEmpty([[],[]]) // [ [ref_meta] , dbsnp ] + ) ch_versions = ch_versions.mix( GENOTYPE.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( GENOTYPE.out.mqc.collect{it[1]}.ifEmpty([]) ) From 599375cb51e2d9622037a9c57070a0056081d360 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jan 2024 17:16:47 +0100 Subject: [PATCH 055/110] add warning todo for inconsistent options --- subworkflows/local/reference_indexing.nf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index dbe0a3ba1..e20b582ea 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -85,6 +85,12 @@ workflow REFERENCE_INDEXING { ch_dbsnp = ch_dbsnp .filter { it[1] != "" } + // Parameter 
combination validation + // TODO + // If channel ch_pileupcaller_bed_snp is empty and params.genotyping_tool == 'pileupcaller', throw error + // if ( ch_pileupcaller_bed_snp.isEmpty() && params.genotyping_tool == 'pileupcaller' ) { + // error "No pileupcaller_bed_snp file provided, but genotyping_tool is set to 'pileupcaller'. Please provide a pileupcaller_bed_snp file." + // } emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] From 4e0bb0d208b1c6eddb3a09565779920e1c99559d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jan 2024 17:17:02 +0100 Subject: [PATCH 056/110] manual tests for genotyping. add multiref per block --- docs/development/manual_tests.md | 61 ++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index c2786215a..d99cf4af7 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -656,17 +656,19 @@ nextflow run . -profile test,docker \ # GENOTYPING +These tests were run before library merging was implemented. + ## GATK UG ```bash -## Gatk on raw reads +## Gatk UG on raw reads ## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels ``` ```bash -## Gatk on trimmed reads. Skip bcftools stats. -## Expect: One VCF per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No bcftools_stats file per VCF. +## Gatk UG on trimmed reads. Skip bcftools stats.
+## Expect: One VCF per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. No bcftools_stats file per VCF. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ --run_trim_bam \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ @@ -677,18 +679,25 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ``` ```bash -## Gatk on pmd-filtered reads -## Expect: One VCF per sample/reference combination, based on the pmd-filtered bams (this actually shows on the IndelRealigner step and not the UG step). Also 1 bcftools_stats file per VCF. +## Gatk UG on pmd-filtered reads +## Expect: One VCF per sample/reference combination, based on the pmd-filtered bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. Also 1 bcftools_stats file per VCF. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'pmd' -ansi-log false -dump-channels --run_pmd_filtering ## Checked that the bams had fewer reads compared to the raw bams. ``` ```bash -## Gatk on rescaled reads -## Expect: One VCF per sample/reference combination, based on the rescaled bams (this actually shows on the IndelRealigner step and not the UG step). Also 1 bcftools_stats file per VCF. +## Gatk UG on rescaled reads +## Expect: One VCF per sample/reference combination, based on the rescaled bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. Also 1 bcftools_stats file per VCF. 
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'rescaled' -ansi-log false -dump-channels --run_mapdamage_rescaling ``` +```bash +## Gatk UG on raw reads, multiple references +## NOTE: Actually fails due to header of BAM input in test_multiref not matching sequences in fasta (which was shortened to chr 21+ for brevity). Provided alternative input without a BAM input line. ( head -n 5 on https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/samplesheet_multilane_multilib.tsv ). It then worked fine. +## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. +nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels +``` + ## GATK HC ```bash @@ -706,6 +715,13 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' ``` +```bash +## Gatk HC on raw reads, multiple references +## NOTE: Actually fails due to header of BAM input in test_multiref not matching sequences in fasta (which was shortened to chr 21+ for brevity). Provided alternative input without a BAM input line. ( head -n 5 on https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/samplesheet_multilane_multilib.tsv ). It then worked fine. +## Expect: One VCF + .tbi index per sample/reference combination . Also 1 bcftools_stats file per VCF. 
+nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels +``` + ## FREEBAYES ```bash @@ -724,3 +740,34 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- --genotyping_freebayes_min_alternate_count 2 \ --genotyping_freebayes_ploidy 1 ``` + + +```bash +## Freebayes on raw reads, multiple references +## Freebayes does not complain about the BAM header not matching the reference. +## Expect: One VCF per sample/reference combination. BAM input only has 1 output for the specified reference. Also 1 bcftools_stats file per VCF. +nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels +``` + +## PILEUPCALLER + +```bash +## Pileupcaller on raw reads. No bed or snp file provided. +## Expect: NO GENOTYPING. Pileupcaller requires a bed file and a snp file. Pipeline still executes though. +## TODO Maybe we need a hard failure here? +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## Pileupcaller on raw reads. +## Expect: One geno/snp/ind combination per reference/strandedness combination (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile. ind has same number of lines as number of samples of that strandedness. 
+nextflow run main.nf -profile test_humanbam,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels +``` + +```bash +## PileupCaller on raw reads +## This currently fails because the different libraries of JK2802 never get merged, so the sample appears twice. Library merging is needed. As a workaround, I named the sample_id of the _SE and _PE libraries after the library name. The two are treated as separate samples now. (--input test/samplesheet_multilane_multilib_noDupSamples.tsv) +## Expect: One geno/snp/ind combination per reference/strandedness combination (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile. ind has same number of lines as number of samples of that strandedness. +## Specifically, no geno/snp/ind for the reference that has no bed/snp file (Mammoth). Only ds data present, so only 1 genotyping dataset.
+nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noDupSamples.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels +``` From 89478b902d1cf8c531ccdaa40048c46ab1ea1cac Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 26 Jan 2024 17:25:26 +0100 Subject: [PATCH 057/110] add small todos --- subworkflows/local/reference_indexing.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index e20b582ea..d7e3ce31d 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -74,7 +74,8 @@ workflow REFERENCE_INDEXING { ch_capture_bed = GUNZIP_SNPBED.out.gunzip.mix( ch_capture_bed_gunzip.skip ).mix( ch_capture_bed.skip ) ch_pileupcaller_bed_snp = ch_pileupcaller_bed_snp - .filter { it[1] != "" && it[2] != "" } + .filter { it[1] != "" || it[2] != "" } // They go together or not at all. 
+ // TODO add user warning if only one of the two is provided ch_sexdeterrmine_bed = ch_sexdeterrmine_bed .filter { it[1] != "" } From e732f273c2187093a2e18f791f91a53853f9348d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 14:29:29 +0100 Subject: [PATCH 058/110] remove empty defaults --- nextflow_schema.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 61629fb70..2796b2611 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -908,7 +908,6 @@ }, "genotyping_pileupcaller_bedfile": { "type": "string", - "default": "None", "fa_icon": "fas fa-bed", "help_text": "Specify a SNP panel in the form of a bed file of sites at which to generate a pileup for pileupCaller.", "format": "file-path", @@ -916,7 +915,6 @@ }, "genotyping_pileupcaller_snpfile": { "type": "string", - "default": "None", "help_text": "Specify a SNP panel in [EIGENSTRAT](https://github.com/DReichLab/EIG/blob/master/CONVERTF/README) format, pileupCaller will call these sites.", "fa_icon": "fas fa-sliders-h", "format": "file-path", From be20f06ba931f9c9bb0b69bf343d5d27364c3f29 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 14:30:03 +0100 Subject: [PATCH 059/110] Update all modules --- modules.json | 28 +++++++++---------- .../nf-core/bcftools/stats/environment.yml | 7 +++++ modules/nf-core/bcftools/stats/main.nf | 6 ++-- modules/nf-core/bcftools/stats/meta.yml | 5 ++++ modules/nf-core/bwa/mem/environment.yml | 1 + modules/nf-core/bwa/sampe/environment.yml | 1 + modules/nf-core/bwa/samse/environment.yml | 1 + .../eigenstratsnpcoverage/environment.yml | 7 +++++ .../eigenstratsnpcoverage/main.nf | 2 +- .../eigenstratsnpcoverage/meta.yml | 7 ++--- .../gatk/indelrealigner/environment.yml | 7 +++++ modules/nf-core/gatk/indelrealigner/main.nf | 2 +- modules/nf-core/gatk/indelrealigner/meta.yml | 5 ++-- .../realignertargetcreator/environment.yml | 7 +++++ 
.../gatk/realignertargetcreator/main.nf | 2 +- .../gatk/realignertargetcreator/meta.yml | 5 ++-- .../gatk/unifiedgenotyper/environment.yml | 7 +++++ modules/nf-core/gatk/unifiedgenotyper/main.nf | 2 +- .../nf-core/gatk/unifiedgenotyper/meta.yml | 6 ++-- modules/nf-core/multiqc/tests/main.nf.test | 13 +++++---- .../nf-core/multiqc/tests/main.nf.test.snap | 28 ++++++++++++++++--- .../nf-core/samtools/mpileup/environment.yml | 8 ++++++ modules/nf-core/samtools/mpileup/main.nf | 6 ++-- modules/nf-core/samtools/mpileup/meta.yml | 3 ++ .../pileupcaller/environment.yml | 7 +++++ .../sequencetools/pileupcaller/main.nf | 2 +- .../sequencetools/pileupcaller/meta.yml | 14 ++-------- 27 files changed, 128 insertions(+), 61 deletions(-) create mode 100644 modules/nf-core/bcftools/stats/environment.yml create mode 100644 modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/environment.yml create mode 100644 modules/nf-core/gatk/indelrealigner/environment.yml create mode 100644 modules/nf-core/gatk/realignertargetcreator/environment.yml create mode 100644 modules/nf-core/gatk/unifiedgenotyper/environment.yml create mode 100644 modules/nf-core/samtools/mpileup/environment.yml create mode 100644 modules/nf-core/sequencetools/pileupcaller/environment.yml diff --git a/modules.json b/modules.json index 75b4eb0b7..cf345fef8 100644 --- a/modules.json +++ b/modules.json @@ -32,7 +32,7 @@ }, "bcftools/stats": { "branch": "master", - "git_sha": "e2693a7e2d773b92e0649b25880ee22fe82bb79d", + "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", "installed_by": ["modules"] }, "bedtools/coverage": { @@ -67,17 +67,17 @@ }, "bwa/mem": { "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "git_sha": "1e2b7fb7106852388610c0360d234b0829eb980e", "installed_by": ["modules"] }, "bwa/sampe": { "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "git_sha": "1e2b7fb7106852388610c0360d234b0829eb980e", "installed_by": ["fastq_align_bwaaln"] }, 
"bwa/samse": { "branch": "master", - "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "git_sha": "1e2b7fb7106852388610c0360d234b0829eb980e", "installed_by": ["fastq_align_bwaaln"] }, "cat/fastq": { @@ -102,7 +102,7 @@ }, "eigenstratdatabasetools/eigenstratsnpcoverage": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "endorspy": { @@ -117,7 +117,7 @@ }, "fastp": { "branch": "master", - "git_sha": "c9488585ce7bd35ccd2a30faa2371454c8112fb9", + "git_sha": "003920c7f9a8ae19b69a97171922880220bedf56", "installed_by": ["modules"] }, "fastqc": { @@ -132,17 +132,17 @@ }, "gatk/indelrealigner": { "branch": "master", - "git_sha": "27902a200da5056a941cde0f15ec80878b5e837c", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk/realignertargetcreator": { "branch": "master", - "git_sha": "27902a200da5056a941cde0f15ec80878b5e837c", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk/unifiedgenotyper": { "branch": "master", - "git_sha": "27902a200da5056a941cde0f15ec80878b5e837c", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk4/haplotypecaller": { @@ -167,7 +167,7 @@ }, "multiqc": { "branch": "master", - "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", + "git_sha": "9e71d8519dfbfc328c078bba14d4bd4c99e39a94", "installed_by": ["modules"] }, "picard/createsequencedictionary": { @@ -237,7 +237,7 @@ }, "samtools/mpileup": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "ce0b1aed7d504883061e748f492a31bf44c5777c", "installed_by": ["modules"] }, "samtools/sort": { @@ -250,12 +250,12 @@ "git_sha": "ce0b1aed7d504883061e748f492a31bf44c5777c", "installed_by": ["bam_split_by_region", "modules"] }, - "sequencetools/pileupcaller": { + "seqkit/split2": { "branch": "master", - "git_sha": 
"911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, - "seqkit/split2": { + "sequencetools/pileupcaller": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] diff --git a/modules/nf-core/bcftools/stats/environment.yml b/modules/nf-core/bcftools/stats/environment.yml new file mode 100644 index 000000000..1a9695288 --- /dev/null +++ b/modules/nf-core/bcftools/stats/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/stats/main.nf b/modules/nf-core/bcftools/stats/main.nf index 7ccb9bf6c..ffa1df643 100644 --- a/modules/nf-core/bcftools/stats/main.nf +++ b/modules/nf-core/bcftools/stats/main.nf @@ -2,10 +2,10 @@ process BCFTOOLS_STATS { tag "$meta.id" label 'process_single' - conda "bioconda::bcftools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': - 'biocontainers/bcftools:1.17--haef29d1_0' }" + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'biocontainers/bcftools:1.18--h8b25389_0' }" input: tuple val(meta), path(vcf), path(tbi) diff --git a/modules/nf-core/bcftools/stats/meta.yml b/modules/nf-core/bcftools/stats/meta.yml index 5850d25f7..7ea2103e3 100644 --- a/modules/nf-core/bcftools/stats/meta.yml +++ b/modules/nf-core/bcftools/stats/meta.yml @@ -70,3 +70,8 @@ authors: - "@drpatelh" - "@SusiJo" - "@TCLamnidis" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@SusiJo" + - "@TCLamnidis" diff --git a/modules/nf-core/bwa/mem/environment.yml b/modules/nf-core/bwa/mem/environment.yml index c5b2a9ce5..1818cea3f 100644 --- a/modules/nf-core/bwa/mem/environment.yml +++ b/modules/nf-core/bwa/mem/environment.yml @@ -7,3 +7,4 @@ dependencies: - bwa=0.7.17 # renovate: datasource=conda depName=bioconda/samtools - samtools=1.18 + - htslib=1.18 diff --git a/modules/nf-core/bwa/sampe/environment.yml b/modules/nf-core/bwa/sampe/environment.yml index 84eee3486..63bbd0b5d 100644 --- a/modules/nf-core/bwa/sampe/environment.yml +++ b/modules/nf-core/bwa/sampe/environment.yml @@ -6,3 +6,4 @@ channels: dependencies: - bioconda::bwa=0.7.17 - bioconda::samtools=1.18 + - bioconda::htslib=1.18 diff --git a/modules/nf-core/bwa/samse/environment.yml b/modules/nf-core/bwa/samse/environment.yml index a3b33d5df..d427a735b 100644 --- a/modules/nf-core/bwa/samse/environment.yml +++ b/modules/nf-core/bwa/samse/environment.yml @@ -6,3 +6,4 @@ channels: dependencies: - bioconda::bwa=0.7.17 - bioconda::samtools=1.18 + - bioconda::htslib=1.18 diff --git a/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/environment.yml b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/environment.yml new file mode 100644 index 000000000..9ac9ee9fe --- /dev/null +++ 
b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/environment.yml @@ -0,0 +1,7 @@ +name: eigenstratdatabasetools_eigenstratsnpcoverage +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::eigenstratdatabasetools=1.1.0 diff --git a/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf index 66d439a47..aa25553be 100644 --- a/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf +++ b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main.nf @@ -2,7 +2,7 @@ process EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE { tag "$meta.id" label 'process_single' - conda "bioconda::eigenstratdatabasetools=1.1.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/eigenstratdatabasetools:1.1.0--hdfd78af_0': 'biocontainers/eigenstratdatabasetools:1.1.0--hdfd78af_0' }" diff --git a/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml index 87eaab00c..cf48d5ce2 100644 --- a/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml +++ b/modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/meta.yml @@ -9,12 +9,9 @@ keywords: tools: - "eigenstratdatabasetools": description: "A set of tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases." 
- documentation: "https://github.com/TCLamnidis/EigenStratDatabaseTools/README.md" tool_dev_url: "https://github.com/TCLamnidis/EigenStratDatabaseTools" - licence: "['GPL v3']" - input: - meta: type: map @@ -33,7 +30,6 @@ input: type: file description: An Eigenstrat formatted individual file pattern: "*.{ind}" - output: - meta: type: map @@ -52,6 +48,7 @@ output: type: file description: A json table with the number of covered SNPs per individual. pattern: "*.{json}" - authors: - "@TCLamnidis" +maintainers: + - "@TCLamnidis" diff --git a/modules/nf-core/gatk/indelrealigner/environment.yml b/modules/nf-core/gatk/indelrealigner/environment.yml new file mode 100644 index 000000000..5a30ceb74 --- /dev/null +++ b/modules/nf-core/gatk/indelrealigner/environment.yml @@ -0,0 +1,7 @@ +name: gatk_indelrealigner +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk=3.5 diff --git a/modules/nf-core/gatk/indelrealigner/main.nf b/modules/nf-core/gatk/indelrealigner/main.nf index abcb245ca..c4b7338bf 100644 --- a/modules/nf-core/gatk/indelrealigner/main.nf +++ b/modules/nf-core/gatk/indelrealigner/main.nf @@ -2,7 +2,7 @@ process GATK_INDELREALIGNER { tag "$meta.id" label 'process_single' - conda "bioconda::gatk=3.5" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gatk:3.5--hdfd78af_11': 'biocontainers/gatk:3.5--hdfd78af_11' }" diff --git a/modules/nf-core/gatk/indelrealigner/meta.yml b/modules/nf-core/gatk/indelrealigner/meta.yml index 6751f1117..15d9aec41 100644 --- a/modules/nf-core/gatk/indelrealigner/meta.yml +++ b/modules/nf-core/gatk/indelrealigner/meta.yml @@ -12,7 +12,6 @@ tools: homepage: "https://gatk.broadinstitute.org/hc/en-us" documentation: "https://github.com/broadinstitute/gatk-docs" licence: "['https://software.broadinstitute.org/gatk/download/licensing', 'BSD', 'https://www.broadinstitute.org/gatk/about/#licensing']" - input: - meta: type: map @@ -67,7 +66,6 @@ input: type: file description: Optional input VCF file(s) with known indels pattern: ".vcf" - output: - meta: type: map @@ -86,6 +84,7 @@ output: type: file description: Output BAM Index file pattern: "*.bai" - authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gatk/realignertargetcreator/environment.yml b/modules/nf-core/gatk/realignertargetcreator/environment.yml new file mode 100644 index 000000000..68ca7b543 --- /dev/null +++ b/modules/nf-core/gatk/realignertargetcreator/environment.yml @@ -0,0 +1,7 @@ +name: gatk_realignertargetcreator +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk=3.5 diff --git a/modules/nf-core/gatk/realignertargetcreator/main.nf b/modules/nf-core/gatk/realignertargetcreator/main.nf index 623ac468f..73a807e0f 100644 --- a/modules/nf-core/gatk/realignertargetcreator/main.nf +++ b/modules/nf-core/gatk/realignertargetcreator/main.nf @@ -2,7 +2,7 @@ process GATK_REALIGNERTARGETCREATOR { tag "$meta.id" label 'process_low' - conda "bioconda::gatk=3.5" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gatk:3.5--hdfd78af_11': 'biocontainers/gatk:3.5--hdfd78af_11' }" diff --git a/modules/nf-core/gatk/realignertargetcreator/meta.yml b/modules/nf-core/gatk/realignertargetcreator/meta.yml index 384c93e14..f7cf3c60f 100644 --- a/modules/nf-core/gatk/realignertargetcreator/meta.yml +++ b/modules/nf-core/gatk/realignertargetcreator/meta.yml @@ -13,7 +13,6 @@ tools: homepage: "https://gatk.broadinstitute.org/hc/en-us" documentation: "https://github.com/broadinstitute/gatk-docs" licence: "['https://software.broadinstitute.org/gatk/download/licensing', 'BSD', 'https://www.broadinstitute.org/gatk/about/#licensing']" - input: - meta: type: map @@ -64,7 +63,6 @@ input: type: file description: Optional input VCF file(s) with known indels pattern: ".vcf" - output: - meta: type: map @@ -79,6 +77,7 @@ output: type: file description: File containg intervals that represent sites of extant and potential indels. pattern: "*.intervals" - authors: - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gatk/unifiedgenotyper/environment.yml b/modules/nf-core/gatk/unifiedgenotyper/environment.yml new file mode 100644 index 000000000..7201ece73 --- /dev/null +++ b/modules/nf-core/gatk/unifiedgenotyper/environment.yml @@ -0,0 +1,7 @@ +name: gatk_unifiedgenotyper +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk=3.5 diff --git a/modules/nf-core/gatk/unifiedgenotyper/main.nf b/modules/nf-core/gatk/unifiedgenotyper/main.nf index 99e700a33..dffa1d5a0 100644 --- a/modules/nf-core/gatk/unifiedgenotyper/main.nf +++ b/modules/nf-core/gatk/unifiedgenotyper/main.nf @@ -2,7 +2,7 @@ process GATK_UNIFIEDGENOTYPER { tag "$meta.id" label 'process_medium' - conda "bioconda::gatk=3.5" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gatk:3.5--hdfd78af_11': 'biocontainers/gatk:3.5--hdfd78af_11' }" diff --git a/modules/nf-core/gatk/unifiedgenotyper/meta.yml b/modules/nf-core/gatk/unifiedgenotyper/meta.yml index f946411df..708815eec 100644 --- a/modules/nf-core/gatk/unifiedgenotyper/meta.yml +++ b/modules/nf-core/gatk/unifiedgenotyper/meta.yml @@ -10,7 +10,6 @@ tools: homepage: "https://gatk.broadinstitute.org/hc/en-us" documentation: "https://github.com/broadinstitute/gatk-docs" licence: "['https://software.broadinstitute.org/gatk/download/licensing', 'BSD', 'https://www.broadinstitute.org/gatk/about/#licensing']" - input: - meta: type: map @@ -88,7 +87,6 @@ input: type: file description: Comparison VCF file (optional) pattern: "*" - output: - meta: type: map @@ -103,7 +101,9 @@ output: type: file description: VCF file containing called variants pattern: "*.vcf.gz" - authors: - "@ilight1542" - "@jfy133" +maintainers: + - "@ilight1542" + - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index d0438eda6..f1c4242ef 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -3,6 +3,7 @@ nextflow_process { name "Test Process MULTIQC" script "../main.nf" process "MULTIQC" + tag "modules" tag "modules_nfcore" tag "multiqc" @@ -12,7 +13,7 @@ nextflow_process { when { process { """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) input[1] = [] input[2] = [] input[3] = [] @@ -25,7 +26,7 @@ nextflow_process { { assert process.success }, { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("versions") } + { assert 
snapshot(process.out.versions).match("multiqc_versions_single") } ) } @@ -36,7 +37,7 @@ nextflow_process { when { process { """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) input[2] = [] input[3] = [] @@ -49,7 +50,7 @@ nextflow_process { { assert process.success }, { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("versions") } + { assert snapshot(process.out.versions).match("multiqc_versions_config") } ) } } @@ -61,7 +62,7 @@ nextflow_process { when { process { """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) input[1] = [] input[2] = [] input[3] = [] @@ -75,7 +76,7 @@ nextflow_process { { assert snapshot(process.out.report.collect { file(it).getName() } + process.out.data.collect { file(it).getName() } + process.out.plots.collect { file(it).getName() } + - process.out.versions ).match() } + process.out.versions ).match("multiqc_stub") } ) } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index d37e73040..549ba79c0 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -1,13 +1,17 @@ { - "versions": { + "multiqc_versions_single": { "content": [ [ "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" ] ], - "timestamp": "2024-01-09T23:02:49.911994" + "meta": 
{ + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:43:40.529579" }, - "sarscov2 single-end [fastqc] - stub": { + "multiqc_stub": { "content": [ [ "multiqc_report.html", @@ -16,6 +20,22 @@ "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" ] ], - "timestamp": "2024-01-09T23:03:14.524346" + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:45:09.605359" + }, + "multiqc_versions_config": { + "content": [ + [ + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:44:53.535994" } } \ No newline at end of file diff --git a/modules/nf-core/samtools/mpileup/environment.yml b/modules/nf-core/samtools/mpileup/environment.yml new file mode 100644 index 000000000..ae6e31d35 --- /dev/null +++ b/modules/nf-core/samtools/mpileup/environment.yml @@ -0,0 +1,8 @@ +name: samtools_mpileup +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 + - bioconda::htslib=1.18 diff --git a/modules/nf-core/samtools/mpileup/main.nf b/modules/nf-core/samtools/mpileup/main.nf index d77249841..3e4cc409b 100644 --- a/modules/nf-core/samtools/mpileup/main.nf +++ b/modules/nf-core/samtools/mpileup/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_MPILEUP { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input), path(intervals) path fasta diff --git a/modules/nf-core/samtools/mpileup/meta.yml b/modules/nf-core/samtools/mpileup/meta.yml index 7597ef41a..13038fbc9 100644 --- a/modules/nf-core/samtools/mpileup/meta.yml +++ b/modules/nf-core/samtools/mpileup/meta.yml @@ -50,3 +50,6 @@ output: authors: - "@drpatelh" - "@joseespinosa" +maintainers: + - "@drpatelh" + - "@joseespinosa" diff --git a/modules/nf-core/sequencetools/pileupcaller/environment.yml b/modules/nf-core/sequencetools/pileupcaller/environment.yml new file mode 100644 index 000000000..98ac1fa58 --- /dev/null +++ b/modules/nf-core/sequencetools/pileupcaller/environment.yml @@ -0,0 +1,7 @@ +name: sequencetools_pileupcaller +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sequencetools=1.5.2 diff --git a/modules/nf-core/sequencetools/pileupcaller/main.nf b/modules/nf-core/sequencetools/pileupcaller/main.nf index d09dfc1b6..2b8b5f698 100644 --- a/modules/nf-core/sequencetools/pileupcaller/main.nf +++ b/modules/nf-core/sequencetools/pileupcaller/main.nf @@ -2,7 +2,7 @@ process SEQUENCETOOLS_PILEUPCALLER { tag "$meta.id" label 'process_low' - conda "bioconda::sequencetools=1.5.2" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/sequencetools:1.5.2--hec16e2b_1': 'biocontainers/sequencetools:1.5.2--hec16e2b_1' }" diff --git a/modules/nf-core/sequencetools/pileupcaller/meta.yml b/modules/nf-core/sequencetools/pileupcaller/meta.yml index 7ab10cde2..54ed15b0f 100644 --- a/modules/nf-core/sequencetools/pileupcaller/meta.yml +++ b/modules/nf-core/sequencetools/pileupcaller/meta.yml @@ -16,9 +16,7 @@ tools: homepage: "https://github.com/stschiff/sequenceTools" documentation: "https://github.com/stschiff/sequenceTools#readme" tool_dev_url: "https://github.com/stschiff/sequenceTools" - licence: "['MIT']" - input: # Only when we have meta - meta: @@ -26,25 +24,20 @@ input: description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - mpileup: type: file description: samtools mpileup output. - - snpfile: type: file description: | Eigenstrat format .snp file of the sites in the mpileup file to call genotypes on. Only alleles matching the Ref and Alt alleles of the provided snp file will be called. - - calling_method: type: value description: The desired calling method for pileupcaller. One of 'randomHaploid', 'randomDiploid', or 'majorityCall'. - - output_format: type: value description: The desired output format. One of 'PLINK', 'EIGENSTRAT', or 'FREQSUM'. - output: #Only when we have meta - meta: @@ -52,26 +45,23 @@ output: description: | Groovy Map containing sample information e.g. [ id:'test', single_end:false ] - - versions: type: file description: File containing software versions pattern: "versions.yml" - - eigenstrat: type: file description: A tuple containing the output Eigenstrat-formatted geno, snp and ind files. pattern: "*.{geno,snp,ind}.txt" - - plink: type: file description: A tuple containing the output Plink-formatted bed, bim and fam files. pattern: "*.{bed,bim,fam}" - - freqsum: type: file description: The output freqsum-formatted file. 
pattern: "*.freqsum.gz" - authors: - "@TCLamnidis" +maintainers: + - "@TCLamnidis" From 5d4eaf6a76f9eed56e4cf1b587140b74f3e8d850 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 14:50:13 +0100 Subject: [PATCH 060/110] fix linting warnings --- subworkflows/local/manipulate_damage.nf | 1 + subworkflows/local/map.nf | 18 ++++++++++-------- subworkflows/local/reference_indexing.nf | 8 +++++--- workflows/eager.nf | 6 ++++-- 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/manipulate_damage.nf b/subworkflows/local/manipulate_damage.nf index b54be730e..149b8e555 100644 --- a/subworkflows/local/manipulate_damage.nf +++ b/subworkflows/local/manipulate_damage.nf @@ -131,6 +131,7 @@ workflow MANIPULATE_DAMAGE { SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED( ch_pmd_filtered_bams ) ch_pmd_filtered_flagstat = SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.flagstat + ch_versions = ch_versions.mix( SAMTOOLS_FLAGSTAT_DAMAGE_FILTERED.out.versions.first() ) } if ( params.run_trim_bam ) { diff --git a/subworkflows/local/map.nf b/subworkflows/local/map.nf index cd2371c1f..023814d8e 100644 --- a/subworkflows/local/map.nf +++ b/subworkflows/local/map.nf @@ -2,14 +2,16 @@ // Prepare reference indexing for downstream // -include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main' -include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' -include { BWA_MEM } from '../../modules/nf-core/bwa/mem/main' -include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' -include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES } from '../../modules/nf-core/samtools/merge/main' -include { SAMTOOLS_SORT as SAMTOOLS_SORT_MERGED_LANES } from '../../modules/nf-core/samtools/sort/main' -include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MEM; SAMTOOLS_INDEX as SAMTOOLS_INDEX_BT2; SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LANES } from '../../modules/nf-core/samtools/index/main' -include { SAMTOOLS_FLAGSTAT as 
SAMTOOLS_FLAGSTAT_MAPPED } from '../../modules/nf-core/samtools/flagstat/main' +include { SEQKIT_SPLIT2 } from '../../modules/nf-core/seqkit/split2/main' +include { FASTQ_ALIGN_BWAALN } from '../../subworkflows/nf-core/fastq_align_bwaaln/main' +include { BWA_MEM } from '../../modules/nf-core/bwa/mem/main' +include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' +include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_LANES } from '../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_MERGED_LANES } from '../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MEM } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BT2 } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_MERGED_LANES } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTAT_MAPPED } from '../../modules/nf-core/samtools/flagstat/main' workflow MAP { take: diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 362c21fa6..010f1cfe7 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -2,9 +2,11 @@ // Prepare reference indexing for downstream // -include { REFERENCE_INDEXING_SINGLE } from '../../subworkflows/local/reference_indexing_single.nf' -include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/reference_indexing_multi.nf' -include { GUNZIP as GUNZIP_PMDBED; GUNZIP as GUNZIP_PMDFASTA; GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' +include { REFERENCE_INDEXING_SINGLE } from '../../subworkflows/local/reference_indexing_single.nf' +include { REFERENCE_INDEXING_MULTI } from '../../subworkflows/local/reference_indexing_multi.nf' +include { GUNZIP as GUNZIP_PMDBED } from '../../modules/nf-core/gunzip/main.nf' +include { GUNZIP as GUNZIP_PMDFASTA } from 
'../../modules/nf-core/gunzip/main.nf' +include { GUNZIP as GUNZIP_SNPBED } from '../../modules/nf-core/gunzip/main.nf' workflow REFERENCE_INDEXING { take: diff --git a/workflows/eager.nf b/workflows/eager.nf index 2c5e47481..4abe9849d 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -97,9 +97,11 @@ include { MTNUCRATIO } from '../modules/n include { HOST_REMOVAL } from '../modules/local/host_removal' include { ENDORSPY } from '../modules/nf-core/endorspy/main' include { SAMTOOLS_FLAGSTAT as SAMTOOLS_FLAGSTATS_BAM_INPUT } from '../modules/nf-core/samtools/flagstat/main' -include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_DEPTH ; BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_BREADTH } from '../modules/nf-core/bedtools/coverage/main' +include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_DEPTH } from '../modules/nf-core/bedtools/coverage/main' +include { BEDTOOLS_COVERAGE as BEDTOOLS_COVERAGE_BREADTH } from '../modules/nf-core/bedtools/coverage/main' include { SAMTOOLS_VIEW_GENOME } from '../modules/local/samtools_view_genome.nf' -include { QUALIMAP_BAMQC as QUALIMAP_BAMQC_NOBED ; QUALIMAP_BAMQC as QUALIMAP_BAMQC_WITHBED } from '../modules/nf-core/qualimap/bamqc/main' +include { QUALIMAP_BAMQC as QUALIMAP_BAMQC_NOBED } from '../modules/nf-core/qualimap/bamqc/main' +include { QUALIMAP_BAMQC as QUALIMAP_BAMQC_WITHBED } from '../modules/nf-core/qualimap/bamqc/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 2e38e63b2ae64be8159aec45cda5d7d7072bbdce Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 23:12:44 +0100 Subject: [PATCH 061/110] add collect_genotypes --- bin/collect_genotypes.py | 102 +++++++++++++++++++++++++++++ modules/local/collect_genotypes.nf | 53 +++++++++++++++ 2 files changed, 155 insertions(+) create mode 100755 bin/collect_genotypes.py create mode 100644 modules/local/collect_genotypes.nf diff --git a/bin/collect_genotypes.py b/bin/collect_genotypes.py new file mode 100755 
index 000000000..6746f499a --- /dev/null +++ b/bin/collect_genotypes.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# MIT License (c) Thiseas C. Lamnidis (@TCLamnidis) + +import argparse +import filecmp + +def file_len(fname): + with open(fname) as f: + for i, l in enumerate(f): + pass + return i + 1 + +## A function to return the number of genotypes per line in a .geno file. +def file_width(fname): + with open(fname) as f: + for i in f: + return(len(i.strip())) + break + +## A function to check that there are no duplicate individual IDs across ind files. +def check_for_duplicate_ids(indf1, indf2): + with open(indf1) as f: + inds1 = [x.strip().split()[0] for x in f.readlines()] + with open(indf2) as f: + inds2 = [x.strip().split()[0] for x in f.readlines()] + intersection = set(inds1).intersection(inds2) + if len(intersection) > 0: + raise IOError("Input .ind files contain duplicate individual IDs. Duplicates: {}".format(intersection)) + +## Function to check that the snp files are identical +def check_snp_files(snpf1, snpf2): + if not filecmp.cmp(snpf1, snpf2): + raise IOError("Input .snp files are not identical.") + +## Function to check the consistency of an eigenstrat database +def validate_eigenstrat(genof, snpf, indf): + dimsGeno = [file_len(genof), file_width(genof)] + linesSnp = file_len(snpf) + linesInd = file_len(indf) + + # print(dimsGeno,linesSnp,linesInd) + ##Check geno and snp compatibility + if dimsGeno[0] != linesSnp: + raise IOError("Input .snp and .geno files do not match.") + + ##Check geno and ind compatibility + if dimsGeno[1] != linesInd: + raise IOError("Input .ind and .geno files do not match.") + +VERSION = "1.0.0" + +parser = argparse.ArgumentParser(usage="%(prog)s (-i ) (-c | -R | -E) [-L | -S Ind [-S Ind2]] [-o ]" , description="A tool to check two different EingenStrat databses for shared individuals, and extract or remove individuals from an EigenStrat database.") +parser._optionals.title = "Available options" 
+parser.add_argument("-g1", "--genoFn1", type = str, metavar = "", required = True, help = "The path to the input geno file of the first dataset.") +parser.add_argument("-s1", "--snpFn1", type = str, metavar = "", required = True, help = "The path to the input snp file of the first dataset.") +parser.add_argument("-i1", "--indFn1", type = str, metavar = "", required = True, help = "The path to the input ind file of the first dataset.") +parser.add_argument("-g2", "--genoFn2", type = str, metavar = "", required = True, help = "The path to the input geno file of the second dataset.") +parser.add_argument("-s2", "--snpFn2", type = str, metavar = "", required = True, help = "The path to the input snp file of the second dataset.") +parser.add_argument("-i2", "--indFn2", type = str, metavar = "", required = True, help = "The path to the input ind file of the second dataset.") +parser.add_argument("-o", "--output", type = str, metavar = "", required = True, help = "The desired output file prefix. 
Three output files are created, .geno , .snp and .ind .") +parser.add_argument("-v", "--version", action='version', version="{}".format(VERSION), help="Print the version and exit.") +args = parser.parse_args() + +## Open input files +GenoFile1 = open(args.genoFn1, "r") +SnpFile1 = open(args.snpFn1, "r") +IndFile1 = open(args.indFn1, "r") + +GenoFile2 = open(args.genoFn2, "r") +# SnpFile2 = open(args.snpFn2, "r") ## Never actually read in line by line +IndFile2 = open(args.indFn2, "r") + +## open output files +GenoFileOut = open(args.output + ".geno", "w") +SnpFileOut = open(args.output + ".snp", "w") +IndFileOut = open(args.output + ".ind", "w") + +## Perform basic validation on inputs +validate_eigenstrat(args.genoFn1, args.snpFn1, args.indFn1) +validate_eigenstrat(args.genoFn2, args.snpFn2, args.indFn2) +check_for_duplicate_ids(args.indFn1, args.indFn2) +check_snp_files(args.snpFn1, args.snpFn2) + +## Now actually merge the data +## Geno +for line1, line2 in zip(GenoFile1, GenoFile2): + geno_line="{}{}".format(line1.strip(),line2.strip()) + print(geno_line, file=GenoFileOut) + +## Snp +## Copying the file would be faster, but this way we do not rely on the os or external packages. +## We already checked that the snp files are byte-identical, so we can just copy one of them. +for line in SnpFile1: + print(line.strip(), file=SnpFileOut) + +## Ind +## The indfiles are simply concatenated in the same order as the geno file. +for line in IndFile1: + print(line.strip(), file=IndFileOut) +for line in IndFile2: + print(line.strip(), file=IndFileOut) diff --git a/modules/local/collect_genotypes.nf b/modules/local/collect_genotypes.nf new file mode 100644 index 000000000..9a1715268 --- /dev/null +++ b/modules/local/collect_genotypes.nf @@ -0,0 +1,53 @@ +process COLLECT_GENOTYPES { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + tuple val(meta), path(geno), path(snp), path(ind) + + output: + tuple val(meta), path("*.geno"), path("*.snp"), path("*.ind") , emit: collected + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + prefix = task.ext.prefix ?: "${meta.id}" + // If there are multiple genotype datasets, then merge them, else just rename the output for consistency. + println "geno = ${geno.toList().size()}" + println "${geno.toList()}" + if ( geno.toList().size() == 1 ) { + """ + mv ${geno[0]} ${prefix}.geno + mv ${snp[0]} ${prefix}.snp + mv ${ind[0]} ${prefix}.ind + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + collect_genotypes.py: \$(collect_genotypes.py -v) + END_VERSIONS + """ + } else { + """ + collect_genotypes.py \ + --genoFn1 ${geno[0]} \ + --snpFn1 ${snp[0]} \ + --indFn1 ${ind[0]} \ + --genoFn2 ${geno[1]} \ + --snpFn2 ${snp[1]} \ + --indFn2 ${ind[1]} \ + --output ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + collect_genotypes.py: \$(collect_genotypes.py -v) + END_VERSIONS + """ + } +} From 0c9f4ac1feee522b8acf305feac627c41901510b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 23:14:09 +0100 Subject: [PATCH 062/110] add genotype collection --- conf/modules.config | 11 +++++++++++ subworkflows/local/genotype.nf | 16 +++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 8f1e3b565..64e98fd42 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -958,6 +958,17 @@ process { "-e pileupcaller.${meta.strandedness}.${meta.reference}" ].join(' ').trim() } ext.prefix = { "${meta.strandedness}_${meta.reference}" } + publishDir = [ + // path: { "${params.outdir}/genotyping/" }, + // mode: params.publish_dir_mode, + // pattern: '*.{geno,snp,ind}', + enabled: false + ] + } + + 
withName: COLLECT_GENOTYPES { + tag = { "${meta.reference}" } + ext.prefix = { "pileupcaller_genotypes_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 175f96ead..653a845ee 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -5,6 +5,7 @@ include { SAMTOOLS_MPILEUP as SAMTOOLS_MPILEUP_PILEUPCALLER } from '../../modules/nf-core/samtools/mpileup/main' include { EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE } from '../../modules/nf-core/eigenstratdatabasetools/eigenstratsnpcoverage/main' include { SEQUENCETOOLS_PILEUPCALLER } from '../../modules/nf-core/sequencetools/pileupcaller/main' +include { COLLECT_GENOTYPES } from '../../modules/local/collect_genotypes' include { GATK_REALIGNERTARGETCREATOR } from '../../modules/nf-core/gatk/realignertargetcreator/main' include { GATK_INDELREALIGNER } from '../../modules/nf-core/gatk/indelrealigner/main' include { GATK_UNIFIEDGENOTYPER } from '../../modules/nf-core/gatk/unifiedgenotyper/main' @@ -110,7 +111,20 @@ workflow GENOTYPE { ) ch_versions = ch_versions.mix( SEQUENCETOOLS_PILEUPCALLER.out.versions.first() ) - // TODO If both ds and ss data are present, merge the two datasets per reference together with paste (both have the same snp/bed file) + // Merge/rename genotyping datasets + ch_final_genotypes = SEQUENCETOOLS_PILEUPCALLER.out.eigenstrat + .map { + WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + } + .groupTuple() + .map { + combo_meta, metas, geno, snp, ind -> + [ combo_meta, geno, snp, ind ] + } + + COLLECT_GENOTYPES( ch_final_genotypes ) + ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) + } if ( params.genotyping_tool == 'ug' ) { From 7d0fbc44a4894d512138a49f91d73b0e27cb8af2 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 23:16:32 +0100 Subject: [PATCH 
063/110] update manual tests --- docs/development/manual_tests.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index a06fb9a31..91c3bb019 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -837,8 +837,8 @@ nextflow run main.nf -profile test_humanbam,docker --outdir ./results -w work/ - ```bash ## PileupCaller on raw reads -## This currently fails because the different libraries of JK2802 never get merged, so the sample appears twice. Library merging is needed. As a workaround, I named the sample_id of the _SE and _PE libraries aafter the library name. The two are treated as separate samples now. (--input test/samplesheet_multilane_multilib_noDupSamples.tsv) -## Expect: One geno/snp/ind combination per reference/strandedness combination (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile. ind has same number of lines as number of samples of that strandedness. -## Specifically, no geno/snp/ind for the reference that has no bed/snp file (Mammoth). Only ds data present, so only 1 genotyping dataset. -nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noDupSamples.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels +## Something is wrong with the test input BAM, that makes samtools mpileup fail. samtools quickcheck does not identify a problem, but empty mpileups are generated when the BAM input is included in as an input. +## Expect: One geno/snp/ind combination per reference (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile (977). ind has same number of lines as number of samples (2). 
+## Specifically, no geno/snp/ind for the reference that has no bed/snp file (Mammoth). Only data for "human" reference. +nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels ``` From 12798917946028eb4f406a4db79c781aa692fc73 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 23:17:58 +0100 Subject: [PATCH 064/110] linting --- docs/development/manual_tests.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 91c3bb019..affd9c246 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -812,7 +812,6 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- --genotyping_freebayes_ploidy 1 ``` - ```bash ## Freebayes on raw reads, multiple references ## Freebayes does not complain about the BAM header not matching the reference. 
From 687a2d05ed0c4abb2c22227187d8a460acc3716d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 2 Feb 2024 23:24:11 +0100 Subject: [PATCH 065/110] oopsie bugfix --- subworkflows/local/reference_indexing_single.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index c3b37dc84..a46c80424 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -95,7 +95,7 @@ workflow REFERENCE_INDEXING_SINGLE { ch_ref_index_single = ch_reference_for_mapping .multiMap{ - meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_mask, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> + meta, fasta, fai, dict, mapper_index, circular_target, mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp -> reference: [ meta, fasta, fai, dict, mapper_index, circular_target ] mito_header: [ meta, mitochondrion_header ] hapmap: [ meta, contamination_estimation_angsd_hapmap ] From 5f6d4661ed44fdc727579763621ea0c10c0b9611 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 13:14:44 +0100 Subject: [PATCH 066/110] add test for each genotyper. 
--- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 55eb2f1e0..62c04347b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,11 +28,11 @@ jobs: - "latest-everything" PARAMS: - "-profile test,docker --preprocessing_tool fastp --preprocessing_adapterlist 'https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/fastp/adapters.fasta'" - - "-profile test,docker --preprocessing_tool adapterremoval --preprocessing_adapterlist 'https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/adapterremoval/adapterremoval_adapterlist.txt' --sequencing_qc_tool falco" - - "-profile test,docker --mapping_tool bwamem --run_mapdamage_rescaling --run_pmd_filtering --run_trim_bam" - - "-profile test,docker --mapping_tool bowtie2" + - "-profile test,docker --preprocessing_tool adapterremoval --preprocessing_adapterlist 'https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/adapterremoval/adapterremoval_adapterlist.txt' --sequencing_qc_tool falco --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw'" + - "-profile test,docker --mapping_tool bwamem --run_mapdamage_rescaling --run_pmd_filtering --run_trim_bam --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed'" + - "-profile test,docker --mapping_tool bowtie2 --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw'" - "-profile test,docker --skip_preprocessing" - - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz'" + - "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' --run_genotyping 
--genotyping_tool 'pileupcaller' --genotyping_source 'raw'" - "-profile test_multiref,docker" ## TODO add damage manipulation here instead once it goes multiref steps: - name: Check out pipeline code From 4a0366fdbb588404e6b32a7d406d32d1838d9680 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 13:33:41 +0100 Subject: [PATCH 067/110] Add errors when pileupcaller is used without bed or snp file --- subworkflows/local/reference_indexing.nf | 4 +++- workflows/eager.nf | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 010f1cfe7..4b78e0368 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -112,7 +112,9 @@ workflow REFERENCE_INDEXING { ch_pileupcaller_bed_snp = ch_pileupcaller_bed_snp .filter { it[1] != "" || it[2] != "" } // They go together or not at all. - // TODO add user warning if only one of the two is provided + // Check if the channel is empty, and throw an error. Will only trigger for tsv fasta input. Single reference gets validated immediately. + .ifEmpty { if(params.run_genotyping && params.genotyping_tool == 'pileupcaller') { error "[nf-core/eager] ERROR: Genotyping with pileupcaller requires that both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' are provided for at least one reference genome." } } + .filter{ it != null } // Remove null channel which arises if empty cause error returns null. ch_sexdeterrmine_bed = ch_sexdeterrmine_bed .filter { it[1] != "" } diff --git a/workflows/eager.nf b/workflows/eager.nf index 4abe9849d..a00409eaa 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -34,6 +34,7 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. 
Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } +if ( params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_snpfile || params.genotyping_pileupcaller_bedfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } // TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} From 056e0f61808360e4904283e7409cf5763d0b50b0 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 13:33:53 +0100 Subject: [PATCH 068/110] small tweaks --- docs/development/manual_tests.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index affd9c246..b1211b939 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -823,7 +823,7 @@ nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ - ```bash ## Pileupcaller on raw reads. No bed or snp file provided. -## Expect: NO GENOTYPING. Pileupcaller requires a bed file and a snp file. Pipeline still executes though. +## Expect: NO GENOTYPING. Pileupcaller requires a bed file and a snp file. Throws an error. ## TODO Maybe we need a hard failure here? 
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels ``` From 80d28cb3d7a3d686a298b711d39f870b7ea7d89b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 13:35:06 +0100 Subject: [PATCH 069/110] small changes --- workflows/eager.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index a00409eaa..865a4b6ce 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -34,7 +34,7 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } -if ( params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_snpfile || params.genotyping_pileupcaller_bedfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } +if ( params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } // TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. 
For all other cases, please set --deduplication_tool to 'markduplicates'."} From 2cab201f443ed621b75032bb622409f82d427fa5 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 13:35:29 +0100 Subject: [PATCH 070/110] reposition a line --- workflows/eager.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 865a4b6ce..3c4d8eda8 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -22,6 +22,7 @@ if ( params.run_genotyping && ! params.genotyping_source if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } +if ( params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { // entropy score was set but dust method picked. If no dust-score provided, assume it was an error and fail if (params.metagenomics_prinseq_dustscore == 0.5) { @@ -34,7 +35,6 @@ if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_pri exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. 
Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } -if ( params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } // TODO What to do when params.preprocessing_excludeunmerged is provided but the data is SE? if ( params.deduplication_tool == 'dedup' && ! params.preprocessing_excludeunmerged ) { exit 1, "[nf-core/eager] ERROR: Dedup can only be used on collapsed (i.e. merged) PE reads. For all other cases, please set --deduplication_tool to 'markduplicates'."} From 6e6ea1cc73605868e66027ba8aa433db6ff5d0cd Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 13:39:39 +0100 Subject: [PATCH 071/110] fix error condition --- workflows/eager.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 3c4d8eda8..35cee37fa 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -22,7 +22,7 @@ if ( params.run_genotyping && ! params.genotyping_source if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } -if ( params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! 
(params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } +if ( ! ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { // entropy score was set but dust method picked. If no dust-score provided, assume it was an error and fail if (params.metagenomics_prinseq_dustscore == 0.5) { From bd063faa12cd0f212531be923082099ebda569b6 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 14:04:57 +0100 Subject: [PATCH 072/110] fix error conditional --- workflows/eager.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/eager.nf b/workflows/eager.nf index 35cee37fa..8d42d8091 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -22,7 +22,7 @@ if ( params.run_genotyping && ! params.genotyping_source if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } if ( params.genotyping_source == 'rescaled' && ! 
params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } -if ( ! ( fasta.extension == 'csv' || fasta.extension == 'tsv' ) && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } +if ( ! ( params.fasta.endsWith('csv') || params.fasta.endsWith('tsv') ) && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'dust' && params.metagenomics_complexity_entropy != 0.3 ) { // entropy score was set but dust method picked. If no dust-score provided, assume it was an error and fail if (params.metagenomics_prinseq_dustscore == 0.5) { From 73b7d0a073894043ebc23d43dc3d6caa42d63619 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 16:46:43 +0100 Subject: [PATCH 073/110] remove library ids from genotyping configs (libs merged) --- conf/modules.config | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 64e98fd42..4b8e41c3d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -978,22 +978,22 @@ process { } withName: GATK_REALIGNERTARGETCREATOR { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? 
"--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // For some reason, GATK complains if its default of -1 is actually provided ?_? ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ enabled: false ] } withName: GATK_INDELREALIGNER { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/IR" }, mode: params.publish_dir_mode, @@ -1003,7 +1003,7 @@ process { } withName: GATK_UNIFIEDGENOTYPER { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ "--sample_ploidy ${params.genotyping_gatk_ploidy}", "-stand_call_conf ${params.genotyping_gatk_call_conf}", @@ -1012,7 +1012,7 @@ process { "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", params.genotyping_gatk_ug_defaultbasequalities > 0 ? 
"--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, @@ -1022,7 +1022,7 @@ process { } withName: GATK4_HAPLOTYPECALLER { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ // Option names have changed from underscore_separated to hyphen-separated in GATK4 "--sample-ploidy ${params.genotyping_gatk_ploidy}", @@ -1030,7 +1030,7 @@ process { "--output-mode ${params.genotyping_gatk_hc_out_mode}", "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, @@ -1040,13 +1040,13 @@ process { } withName: FREEBAYES { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } + tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ "-C ${params.genotyping_freebayes_min_alternate_count}", "-p ${params.genotyping_freebayes_ploidy}", params.genotyping_freebayes_skip_coverage != 0 ? 
"-g ${params.genotyping_freebayes_skip_coverage}" : "", ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, @@ -1056,8 +1056,8 @@ process { } withName: BCFTOOLS_STATS_GENOTYPING { - tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" } - ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" } + tag = { "${meta.reference}|${meta.sample_id}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, mode: params.publish_dir_mode, From 0b193357a196054936478066812a4c8069cd4054 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 6 Feb 2024 17:16:35 +0100 Subject: [PATCH 074/110] fix file name collision in GATK RTC --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 4b8e41c3d..7fa258990 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -982,7 +982,7 @@ process { ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // For some reason, GATK complains if its default of -1 is actually provided ?_? ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_realigntarget" } publishDir = [ enabled: false ] From 7a3ea2aea30081fe0717828321fbfdad68d528cc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 7 Feb 2024 16:27:20 +0100 Subject: [PATCH 075/110] remove debug statements. 
add python version to version yml --- modules/local/collect_genotypes.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modules/local/collect_genotypes.nf b/modules/local/collect_genotypes.nf index 9a1715268..00b732dbb 100644 --- a/modules/local/collect_genotypes.nf +++ b/modules/local/collect_genotypes.nf @@ -20,8 +20,6 @@ process COLLECT_GENOTYPES { script: prefix = task.ext.prefix ?: "${meta.id}" // If there are multiple genotype datasets, then merge them, else just rename the output for consistency. - println "geno = ${geno.toList().size()}" - println "${geno.toList()}" if ( geno.toList().size() == 1 ) { """ mv ${geno[0]} ${prefix}.geno @@ -46,6 +44,7 @@ process COLLECT_GENOTYPES { cat <<-END_VERSIONS > versions.yml "${task.process}": + python: \$(python --version | sed 's/Python //g') collect_genotypes.py: \$(collect_genotypes.py -v) END_VERSIONS """ From a3b16a289730091fe87e64fe4c2fdf8897f17a47 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 7 Feb 2024 16:30:17 +0100 Subject: [PATCH 076/110] add coverage stats. add mqc files to mqc channel --- conf/modules.config | 15 ++++++++++++- subworkflows/local/genotype.nf | 40 ++++++++++++++++++++++------------ 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 7fa258990..bb145acdf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -977,6 +977,19 @@ process { ] } + withName: EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE { + tag = { "${meta.reference}" } + ext.args = { "-j ${prefix}.json" } + ext.prefix = { "pileupcaller_genotypes_${meta.reference}_coverage" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.{tsv}' + ] + + } + withName: GATK_REALIGNERTARGETCREATOR { tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ @@ -993,7 +1006,7 @@ process { ext.args = [ params.genotyping_gatk_ug_defaultbasequalities > 0 ? 
"--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", ].join(' ').trim() - ext.prefix = { "${meta.sample_id}_${meta.reference}" } + ext.prefix = { "${meta.sample_id}_${meta.reference}_realigned" } publishDir = [ path: { "${params.outdir}/genotyping/IR" }, mode: params.publish_dir_mode, diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 653a845ee..9c032d8ea 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -25,6 +25,7 @@ workflow GENOTYPE { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() ch_pileupcaller_genotypes = Channel.empty() + ch_eigenstrat_coverage_stats = Channel.empty() ch_gatk_haplotypecaller_genotypes = Channel.empty() ch_gatk_unifiedgenotyper_genotypes = Channel.empty() ch_freebayes_genotypes = Channel.empty() @@ -123,8 +124,16 @@ workflow GENOTYPE { } COLLECT_GENOTYPES( ch_final_genotypes ) - ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) + ch_pileupcaller_genotypes = COLLECT_GENOTYPES.out.collected + ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) + // Calcualte coverage stats for collected eigenstrat dataset + EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE( + ch_pileupcaller_genotypes + ) + ch_eigenstrat_coverage_stats = EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.tsv + ch_versions = ch_versions.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE.out.json ) } if ( params.genotyping_tool == 'ug' ) { @@ -221,8 +230,8 @@ workflow GENOTYPE { ch_bams_for_ug.dbsnp, [[], []] // No comp ) - ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) ch_gatk_unifiedgenotyper_genotypes = GATK_UNIFIEDGENOTYPER.out.vcf + ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) } if ( params.genotyping_tool == 'hc' ) { @@ -267,8 +276,8 @@ workflow GENOTYPE { 
ch_input_for_hc.dbsnp, [[], []] // No dbsnp_tbi ) - ch_versions = ch_versions.mix( GATK4_HAPLOTYPECALLER.out.versions.first() ) - ch_gatk_unifiedgenotyper_genotypes = GATK4_HAPLOTYPECALLER.out.vcf + ch_gatk_haplotypecaller_genotypes = GATK4_HAPLOTYPECALLER.out.vcf + ch_versions = ch_versions.mix( GATK4_HAPLOTYPECALLER.out.versions.first() ) } if ( params.genotyping_tool == 'freebayes' ) { @@ -315,8 +324,8 @@ workflow GENOTYPE { [ [], [] ], // No populations file [ [], [] ] // No CNV file ) - ch_versions = ch_versions.mix( FREEBAYES.out.versions.first() ) ch_freebayes_genotypes = FREEBAYES.out.vcf + ch_versions = ch_versions.mix( FREEBAYES.out.versions.first() ) } if ( params.genotyping_tool == 'angsd' ) { @@ -346,16 +355,19 @@ workflow GENOTYPE { [ [], [] ], // exons ch_bcftools_input.fasta // fasta ) - ch_versions = ch_versions.mix( BCFTOOLS_STATS_GENOTYPING.out.versions.first() ) + ch_bcftools_stats = BCFTOOLS_STATS_GENOTYPING.out.stats + ch_multiqc_files = ch_multiqc_files.mix(BCFTOOLS_STATS_GENOTYPING.out.stats) + ch_versions = ch_versions.mix( BCFTOOLS_STATS_GENOTYPING.out.versions.first() ) } emit: - geno_pileupcaller = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] - geno_gatk_hc = ch_gatk_haplotypecaller_genotypes // [ [ meta ], vcf ] ] - geno_gatk_ug = ch_gatk_unifiedgenotyper_genotypes // [ [ meta ], vcf ] ] - geno_freebayes = ch_freebayes_genotypes // [ [ meta ], vcf ] ] - geno_angsd = ch_angsd_genotypes // [ [ meta ], glf ] ] - versions = ch_versions - mqc = ch_multiqc_files - + geno_pileupcaller = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] + geno_gatk_hc = ch_gatk_haplotypecaller_genotypes // [ [ meta ], vcf ] ] + geno_gatk_ug = ch_gatk_unifiedgenotyper_genotypes // [ [ meta ], vcf ] ] + geno_freebayes = ch_freebayes_genotypes // [ [ meta ], vcf ] ] + geno_angsd = ch_angsd_genotypes // [ [ meta ], glf ] ] + vcf_stats = ch_bcftools_stats // [ [ meta ], stats ] + eigenstrat_coverage_stats = ch_eigenstrat_coverage_stats // [ [ 
meta ], stats ] + versions = ch_versions + mqc = ch_multiqc_files } From 97bdc0bf8c2ac7c44b2e870407fcce4c4b1f589d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Wed, 7 Feb 2024 16:30:31 +0100 Subject: [PATCH 077/110] update manual tests --- docs/development/manual_tests.md | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index b1211b939..39354307f 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -740,13 +740,13 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ```bash ## Gatk UG on trimmed reads. Skip bcftools stats. ## Expect: One VCF per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. No bcftools_stats file per VCF. +## Checked that the input bam for the UG jobs indeed had trimmed reads. (The full UDG sample has untrimmed bams.) nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ --run_trim_bam \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_left 5 \ --damage_manipulation_bamutils_trim_double_stranded_none_udg_right 7 \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_left 1 \ --damage_manipulation_bamutils_trim_double_stranded_half_udg_right 2 -## Checked that the input bam for the UG jobs indeed had trimmed reads. (The full UDG sample has untrimmed bams.) 
``` ```bash @@ -777,11 +777,19 @@ nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_mult nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw' -ansi-log false -dump-channels ``` +```bash +## Attempt to run Gatk HC on trimmed reads, without activating trimming. +## Expect: FAILURE. Cannot set genotyping source to ;trimmed' without trimming. +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ + --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ + --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' +``` + ```bash ## Gatk HC on trimmed reads, with different out mode and emit confidence. Skip bcftools stats. ## Expect: One VCF + .tbi index per sample/reference combination. ## Checked .command.sh for correct args. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'trimmed' --run_trim_bam -ansi-log false -dump-channels --skip_bcftools_stats \ --genotyping_gatk_hc_emitrefconf 'BP_RESOLUTION' \ --genotyping_gatk_hc_out_mode 'EMIT_ALL_ACTIVE_SITES' ``` @@ -823,21 +831,21 @@ nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ - ```bash ## Pileupcaller on raw reads. No bed or snp file provided. -## Expect: NO GENOTYPING. Pileupcaller requires a bed file and a snp file. Throws an error. -## TODO Maybe we need a hard failure here? +## Expect: NO GENOTYPING. PileupCaller requires a bed file and a snp file. Throws an error and stops. 
+## NOTE: if --fasta is a tsv/csv, then the error is deferred to AFTER parsing and indexing the references. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels ``` ```bash ## Pileupcaller on raw reads. -## Expect: One geno/snp/ind combination per reference/strandedness combination (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile. ind has same number of lines as number of samples of that strandedness. +## Expect: One geno/snp/ind/coverage tsv combination per reference (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile (977). ind has same number of lines as number of samples (2). Coverage tsv has same lines as ind + 1 for header (3). nextflow run main.nf -profile test_humanbam,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels ``` ```bash ## PileupCaller on raw reads ## Something is wrong with the test input BAM, that makes samtools mpileup fail. samtools quickcheck does not identify a problem, but empty mpileups are generated when the BAM input is included in as an input. -## Expect: One geno/snp/ind combination per reference (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile (977). ind has same number of lines as number of samples (2). +## Expect: One geno/snp/ind/coverage tsv combination per reference (provided that a bed and snp file are present for the reference). geno and snp have same number of lines as SNPs in provided snpfile (977). ind has have same number of lines as number of samples (2). Coverage tsv has same lines as ind + 1 for header (3). 
## Specifically, no geno/snp/ind for the reference that has no bed/snp file (Mammoth). Only data for "human" reference. nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw' -ansi-log false -dump-channels ``` From 6d4ac28076a6588788fdb28f9b28ac639769afc8 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 23 Feb 2024 10:35:53 +0100 Subject: [PATCH 078/110] Apply suggestions from code review to modules.conf Co-authored-by: Selina Carlhoff <73653549+scarlhoff@users.noreply.github.com> --- conf/modules.config | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index fc380c1f1..ea84743c6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -942,7 +942,6 @@ process { "-q ${params.genotyping_pileupcaller_min_base_quality}", "-Q ${params.genotyping_pileupcaller_min_map_quality}", // "--ignore-RG", - params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // For some reason, GATK complains if its default of -1 is actually provided ?_? ].join(' ').trim() ext.prefix = { "${meta.strandedness}_${meta.reference}" } publishDir = [ @@ -957,7 +956,7 @@ process { ext.args = {[ "--${params.genotyping_pileupcaller_method}", params.genotyping_pileupcaller_transitions_mode == "SkipTransitions" ? "--skipTransitions" : params.genotyping_pileupcaller_transitions_mode == "TransitionsMissing" ? "--transitionsMissing" : "", - "${meta.strandedness}" == 'single' ? "--singleStrangMode" : "" , + "${meta.strandedness}" == 'single' ? 
"--singleStrandMode" : "" , "--sampleNames", meta.id.join(","), "-e pileupcaller.${meta.strandedness}.${meta.reference}" ].join(' ').trim() } @@ -991,7 +990,6 @@ process { enabled: true, pattern: '*.{tsv}' ] - } withName: GATK_REALIGNERTARGETCREATOR { @@ -1012,7 +1010,7 @@ process { ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.reference}_realigned" } publishDir = [ - path: { "${params.outdir}/genotyping/IR" }, + path: { "${params.outdir}/genotyping/IndelRealigner" }, mode: params.publish_dir_mode, enabled: params.genotyping_gatk_ug_keep_realign_bam, pattern: '*.{bam,bai}' From 156e132f2f1c47f0893bf0b60eaad66b06b78b8d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Feb 2024 10:37:32 +0100 Subject: [PATCH 079/110] remove commented lines, update comments --- conf/modules.config | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ea84743c6..ff6c7baea 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -941,12 +941,9 @@ process { "-B", "-q ${params.genotyping_pileupcaller_min_base_quality}", "-Q ${params.genotyping_pileupcaller_min_map_quality}", - // "--ignore-RG", ].join(' ').trim() ext.prefix = { "${meta.strandedness}_${meta.reference}" } publishDir = [ - // path: { "${params.outdir}/genotyping/" }, - // mode: params.publish_dir_mode, enabled: false ] } @@ -962,10 +959,7 @@ process { ].join(' ').trim() } ext.prefix = { "${meta.strandedness}_${meta.reference}" } publishDir = [ - // path: { "${params.outdir}/genotyping/" }, - // mode: params.publish_dir_mode, - // pattern: '*.{geno,snp,ind}', - enabled: false + enabled: false // Not published because the output goes through COLLECT_GENOTYPES ] } @@ -995,7 +989,7 @@ process { withName: GATK_REALIGNERTARGETCREATOR { tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ - params.genotyping_gatk_ug_defaultbasequalities > 0 ? 
"--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // For some reason, GATK complains if its default of -1 is actually provided ?_? + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.reference}_realigntarget" } publishDir = [ @@ -1006,7 +1000,7 @@ process { withName: GATK_INDELREALIGNER { tag = { "${meta.reference}|${meta.sample_id}" } ext.args = [ - params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. ].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.reference}_realigned" } publishDir = [ @@ -1025,7 +1019,7 @@ process { "-dcov ${params.genotyping_gatk_ug_downsample}", "--output_mode ${params.genotyping_gatk_ug_out_mode}", "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", - params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", + params.genotyping_gatk_ug_defaultbasequalities > 0 ? "--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. 
].join(' ').trim() ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ From 0c2d9eea14eb882fc0ee458190c3bd0233670c87 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Feb 2024 10:45:41 +0100 Subject: [PATCH 080/110] Update parameter name for keeping realigned bam --- conf/modules.config | 2 +- nextflow.config | 2 +- nextflow_schema.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ff6c7baea..231647759 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1006,7 +1006,7 @@ process { publishDir = [ path: { "${params.outdir}/genotyping/IndelRealigner" }, mode: params.publish_dir_mode, - enabled: params.genotyping_gatk_ug_keep_realign_bam, + enabled: params.genotyping_gatk_ug_keeprealignbam, pattern: '*.{bam,bai}' ] } diff --git a/nextflow.config b/nextflow.config index 9e3c24bd2..7089d1bb5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -217,7 +217,7 @@ params { genotyping_gatk_ug_out_mode = 'EMIT_VARIANTS_ONLY' genotyping_gatk_ug_genotype_mode = 'SNP' genotyping_gatk_ug_defaultbasequalities = -1 - genotyping_gatk_ug_keep_realign_bam = false + genotyping_gatk_ug_keeprealignbam = false genotyping_gatk_hc_out_mode = 'EMIT_VARIANTS_ONLY' genotyping_gatk_hc_emitrefconf = 'GVCF' genotyping_freebayes_min_alternate_count = 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index 2796b2611..c9f547482 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -982,7 +982,7 @@ "fa_icon": "fas fa-project-diagram", "help_text": "If GATK UnifiedGenotyper is selected as the genotyping tool, this sets which likelihood model to follow, i.e. 
whether to call only SNPs or INDELS etc.\n\n> Modifies GATK UnifiedGenotyper parameter: `--genotype_likelihoods_model`" }, - "genotyping_gatk_ug_keep_realign_bam": { + "genotyping_gatk_ug_keeprealignbam": { "type": "boolean", "fa_icon": "far fa-save", "description": "Specify to keep the BAM output of re-alignment around variants from GATK UnifiedGenotyper.", From d74de2365d48e3f8e9a098b2b542c4ec6f6ff2d4 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Feb 2024 10:47:01 +0100 Subject: [PATCH 081/110] rename parameter --- docs/development/manual_tests.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 39354307f..73adc7de7 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -734,7 +734,7 @@ These tests were ran before library merging was implemented. ```bash ## Gatk UG on raw reads ## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. -nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels +nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels ``` ```bash @@ -766,7 +766,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Gatk UG on raw reads, multiple references ## NOTE: Actually fails due to header of BAM input in test_multiref not matching sequences in fasta (which was shortened to chr 21+ for brevity). Provided alternative input without a BAM input line. 
( head -n 5 on https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/samplesheet_multilane_multilib.tsv ). It then worked fine. ## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. -nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels +nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels ``` ## GATK HC @@ -798,7 +798,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Gatk HC on raw reads, multiple references ## NOTE: Actually fails due to header of BAM input in test_multiref not matching sequences in fasta (which was shortened to chr 21+ for brevity). Provided alternative input without a BAM input line. ( head -n 5 on https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/samplesheet_multilane_multilib.tsv ). It then worked fine. ## Expect: One VCF + .tbi index per sample/reference combination . Also 1 bcftools_stats file per VCF. 
-nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels +nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels ``` ## FREEBAYES @@ -824,7 +824,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ## Freebayes on raw reads, multiple references ## Freebayes does not complain about the BAM header not matching the reference. ## Expect: One VCF per sample/reference combination. BAM input only has 1 output for the specified reference. Also 1 bcftools_stats file per VCF. -nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw' --genotyping_gatk_ug_keep_realign_bam -ansi-log false -dump-channels +nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels ``` ## PILEUPCALLER From 5046dfe37622255b0493425097886d806ca99f8e Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 23 Feb 2024 11:24:39 +0100 Subject: [PATCH 082/110] Apply suggestions from code review to schema wording Co-authored-by: James A. 
Fellows Yates --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index c9f547482..b0b38febe 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -883,7 +883,7 @@ "type": "string", "fa_icon": "fas fa-tools", "enum": ["ug", "hc", "freebayes", "pileupcaller", "angsd"], - "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware GATK 3.5 it is officially deprecated by the Broad Institute.", + "help_text": "Specifies which genotyper to use. Current options are: GATK (v3.5) UnifiedGenotyper or GATK Haplotype Caller (v4); and the FreeBayes Caller.\n\n> Note that while UnifiedGenotyper is more suitable for low-coverage ancient DNA (HaplotypeCaller does de novo assembly around each variant site), be aware that GATK 3.5 is officially deprecated by the Broad Institute (but is used here for compatibility with MultiVCFAnalyzer).", "description": "Specify which genotyper to use between: GATK UnifiedGenotyper, GATK HaplotypeCaller, Freebayes, or pileupCaller." }, "skip_bcftools_stats": { @@ -948,7 +948,7 @@ "default": 2, "fa_icon": "fas fa-pastafarianism", "description": "Specify GATK organism ploidy.", - "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: --sample-ploidy" + "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms (e.g. 
for multi-allele frequency reporting in MultiVCFAnalyzer).\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: --sample-ploidy" }, "genotyping_gatk_dbsnp": { "type": "string", From ea27287d39789468dc9199ec95c9609c64263d7e Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Feb 2024 11:25:23 +0100 Subject: [PATCH 083/110] standardise mpileup helptext wording --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b0b38febe..226dd793f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -896,7 +896,7 @@ "type": "integer", "default": 30, "description": "The base mapping quality to be used for genotyping with pileupcaller.", - "help_text": "The minimum mapping quality to be used for genotyping with pileupCaller. Affects the `samtools mpileup` output that is used by `pileupCaller`. \n\n> Affects `-Q` parameter of `samtools mpileup`.", + "help_text": "The minimum base quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller. \n\n> Modifies samtools mpileup parameter: `-Q`.", "fa_icon": "fas fa-filter" }, "genotyping_pileupcaller_min_map_quality": { @@ -904,7 +904,7 @@ "default": 30, "fa_icon": "fas fa-filter", "description": "The minimum mapping quality to be used for genotyping with pileupcaller.", - "help_text": "The minimum mapping quality to be used for genotyping with pileupCaller. Affects the `samtools mpileup` output that is used by `pileupCaller`. \n\n> Affects `-q` parameter of `samtools mpileup`." + "help_text": "The minimum mapping quality to be used when generating the samtools mpileup used as input for genotyping with pileupCaller. \n\n> Modifies samtools mpileup parameter: `-q`." 
}, "genotyping_pileupcaller_bedfile": { "type": "string", From 756c1706cb417729bd15eb02be12017826b047f3 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Feb 2024 11:34:02 +0100 Subject: [PATCH 084/110] Update genotyping_pileupcaller_method helptext --- nextflow_schema.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 226dd793f..7941290cc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -924,8 +924,8 @@ "type": "string", "default": "randomHaploid", "fa_icon": "fas fa-toolbox", - "description": "Specify the calling method to use.", - "help_text": "Specify the calling method to use.\n\n> Modifies pileupCaller parameter: `--randomHaploid` `--randomDiploid` `--majorityCall`", + "description": "Specify the SNP calling method to use for genotyping.", + "help_text": "Specify the SNP calling method to use for genotyping. 'randomHaploid' will randomly sample a read overlapping the SNP, and produce a homozygous genotype with the allele supported by that read (often called 'pseudohaploid' or 'pseudodiploid'). 'randomDiploid' will randomly sample two reads overlapping the SNP and produce a genotype comprised of the two alleles supported by the two reads. 'majorityCall' will produce a genotype that is homozygous for the allele that appears in the majority of reads overlapping the SNP.\n\n> Modifies pileupCaller parameters: `--randomHaploid` `--randomDiploid` `--majorityCall`", "enum": ["randomHaploid", "randomDiploid", "majorityCall"] }, "genotyping_pileupcaller_transitions_mode": { From 51400cd76c5645b05a2b6cfbbac003e74428c443 Mon Sep 17 00:00:00 2001 From: "Thiseas C. 
Lamnidis" Date: Fri, 23 Feb 2024 11:35:44 +0100 Subject: [PATCH 085/110] Apply suggestions from code review to schema Co-authored-by: Selina Carlhoff <73653549+scarlhoff@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 7941290cc..b1dca9668 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1034,7 +1034,7 @@ } }, "fa_icon": "fas fa-sliders-h", - "help_text": "There are options for different genotypers (or genotype likelihood calculators) to be used. We suggest you read the documentation of each tool to find the ones that suit your needs.\n\nDocumentation for each tool:\n\n- GATK UnifiedGenotyper\n- GATK HaplotypeCaller\n- FreeBayes\n- ANGSD\n- sequenceTools pileupCaller\n\nGenotyping is performed per sample (i.e. after all types of libraries are merged), except for pileupCaller which gathers all double-stranded and single-stranded (same-type merged) libraries respectively.\nSome genotypers require additional files to be specified in the reference sheet, or using command line parameters. When using a reference sheet, only references with the required filed specified in the respective columns will be used for genotyping. " + "help_text": "There are options for different genotypers (or genotype likelihood calculators) to be used. We suggest you read the documentation of each tool to find the ones that suit your needs.\n\nDocumentation for each tool:\n\n- GATK UnifiedGenotyper\n- GATK HaplotypeCaller\n- FreeBayes\n- sequenceTools pileupCaller\n\nGenotyping is performed per sample (i.e. after all types of libraries are merged), except for pileupCaller which gathers all double-stranded and single-stranded (same-type merged) libraries respectively.\nSome genotypers require additional files to be specified in the reference sheet, or using command line parameters. 
When using a reference sheet, only references with the required files specified in the respective columns will be used for genotyping. " }, "mitochondrial_to_nuclear_ratio": { "title": "Mitochondrial to Nuclear Ratio", From b21e0f722fe08daabd9220073dfc8511dbcbbf7b Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Feb 2024 11:41:40 +0100 Subject: [PATCH 086/110] Remove TODO about parameter validation --- subworkflows/local/reference_indexing.nf | 6 ------ 1 file changed, 6 deletions(-) diff --git a/subworkflows/local/reference_indexing.nf b/subworkflows/local/reference_indexing.nf index 4b78e0368..7a4a2c4db 100644 --- a/subworkflows/local/reference_indexing.nf +++ b/subworkflows/local/reference_indexing.nf @@ -125,12 +125,6 @@ workflow REFERENCE_INDEXING { ch_dbsnp = ch_dbsnp .filter { it[1] != "" } - // Parameter combination validation - // TODO - // If channel ch_pileupcaller_bed_snp is empty and params.genotyping_tool == 'pileupcaller', throw error - // if ( ch_pileupcaller_bed_snp.isEmpty() && params.genotyping_tool == 'pileupcaller' ) { - // error "No pileupcaller_bed_snp file provided, but genotyping_tool is set to 'pileupcaller'. Please provide a pileupcaller_bed_snp file." - // } emit: reference = ch_reference_for_mapping // [ meta, fasta, fai, dict, mapindex, circular_target ] mitochondrion_header = ch_mitochondrion_header // [ meta, mitochondrion_header ] From 902003718befd105cb619764c67fa4d91adbc5c4 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Fri, 23 Feb 2024 11:50:38 +0100 Subject: [PATCH 087/110] Apply suggestions from code review to genotype swf Co-authored-by: James A. 
Fellows Yates Co-authored-by: Selina Carlhoff <73653549+scarlhoff@users.noreply.github.com> --- subworkflows/local/genotype.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 9c032d8ea..3e52bf6fe 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -85,6 +85,7 @@ workflow GENOTYPE { bams: [ combo_meta, bams, bedfile ] fasta: [ fasta ] } + SAMTOOLS_MPILEUP_PILEUPCALLER( ch_mpileup_inputs.bams, ch_mpileup_inputs.fasta, @@ -127,7 +128,7 @@ workflow GENOTYPE { ch_pileupcaller_genotypes = COLLECT_GENOTYPES.out.collected ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) - // Calcualte coverage stats for collected eigenstrat dataset + // Calculate coverage stats for collected eigenstrat dataset EIGENSTRATDATABASETOOLS_EIGENSTRATSNPCOVERAGE( ch_pileupcaller_genotypes ) @@ -202,7 +203,7 @@ workflow GENOTYPE { ch_input_for_indelrealigner.dict, [[], []] // No known_vcf ) - ch_versions = ch_versions.mix( GATK_INDELREALIGNER.out.versions.first() ) // TODO is this actually needed, since all GATK modules have the same version? + ch_versions = ch_versions.mix( GATK_INDELREALIGNER.out.versions.first() ) // Use realigned bams as input for UG. combine with reference info to get correct ordering. 
ch_bams_for_ug = GATK_INDELREALIGNER.out.bam From 61df8b9079511b411f68714377be42f74cb6f91d Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 23 Feb 2024 11:51:12 +0100 Subject: [PATCH 088/110] remove todo about issue #1054 --- subworkflows/local/genotype.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 3e52bf6fe..73315ed5f 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -104,7 +104,6 @@ workflow GENOTYPE { snpfile: snp } - // TODO NOTE: Maybe implement a check that unmerged R2 reads have not been kept and throw a warning for ssDNA libs? See: https://github.com/stschiff/sequenceTools/issues/24 // Run PileupCaller SEQUENCETOOLS_PILEUPCALLER( ch_pileupcaller_input.mpileup, From 3f95223393a9e227d37d5a6c413e8c8dba7605f2 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 1 Mar 2024 10:53:03 +0100 Subject: [PATCH 089/110] merge both ploidy parameters into one genotyping_reference_ploidy param --- conf/modules.config | 18 ++++++++-------- nextflow.config | 3 +-- nextflow_schema.json | 21 +++++++------------ .../local/reference_indexing_multi.nf | 2 +- .../local/reference_indexing_single.nf | 4 ++-- 5 files changed, 20 insertions(+), 28 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 231647759..6711ebf1d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1013,14 +1013,14 @@ process { withName: GATK_UNIFIEDGENOTYPER { tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = [ - "--sample_ploidy ${params.genotyping_gatk_ploidy}", + ext.args = {[ + "--sample_ploidy ${meta2.ploidy}", "-stand_call_conf ${params.genotyping_gatk_call_conf}", "-dcov ${params.genotyping_gatk_ug_downsample}", "--output_mode ${params.genotyping_gatk_ug_out_mode}", "--genotype_likelihoods_model ${params.genotyping_gatk_ug_genotype_mode}", params.genotyping_gatk_ug_defaultbasequalities > 0 ? 
"--defaultBaseQualities ${params.genotyping_gatk_ug_defaultbasequalities}" : "", // Empty string since GATK complains if its default of -1 is provided. - ].join(' ').trim() + ].join(' ').trim() } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, @@ -1032,13 +1032,13 @@ process { withName: GATK4_HAPLOTYPECALLER { tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = [ + ext.args = {[ // Option names have changed from underscore_separated to hyphen-separated in GATK4 - "--sample-ploidy ${params.genotyping_gatk_ploidy}", + "--sample-ploidy ${meta2.ploidy}", "-stand-call-conf ${params.genotyping_gatk_call_conf}", "--output-mode ${params.genotyping_gatk_hc_out_mode}", "--emit-ref-confidence ${params.genotyping_gatk_hc_emitrefconf}", - ].join(' ').trim() + ].join(' ').trim() } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, @@ -1050,11 +1050,11 @@ process { withName: FREEBAYES { tag = { "${meta.reference}|${meta.sample_id}" } - ext.args = [ + ext.args = {[ + "-p ${ref_meta.ploidy}", "-C ${params.genotyping_freebayes_min_alternate_count}", - "-p ${params.genotyping_freebayes_ploidy}", params.genotyping_freebayes_skip_coverage != 0 ? 
"-g ${params.genotyping_freebayes_skip_coverage}" : "", - ].join(' ').trim() + ].join(' ').trim() } ext.prefix = { "${meta.sample_id}_${meta.reference}" } publishDir = [ path: { "${params.outdir}/genotyping/" }, diff --git a/nextflow.config b/nextflow.config index 7089d1bb5..8d79ddfe3 100644 --- a/nextflow.config +++ b/nextflow.config @@ -204,6 +204,7 @@ params { genotyping_tool = null genotyping_source = null skip_bcftools_stats = false + genotyping_reference_ploidy = 2 genotyping_pileupcaller_min_base_quality = 30 genotyping_pileupcaller_min_map_quality = 30 genotyping_pileupcaller_bedfile = null @@ -211,7 +212,6 @@ params { genotyping_pileupcaller_method = 'randomHaploid' genotyping_pileupcaller_transitions_mode = 'AllSites' genotyping_gatk_call_conf = 30 - genotyping_gatk_ploidy = 2 genotyping_gatk_dbsnp = null genotyping_gatk_ug_downsample = 250 genotyping_gatk_ug_out_mode = 'EMIT_VARIANTS_ONLY' @@ -221,7 +221,6 @@ params { genotyping_gatk_hc_out_mode = 'EMIT_VARIANTS_ONLY' genotyping_gatk_hc_emitrefconf = 'GVCF' genotyping_freebayes_min_alternate_count = 1 - genotyping_freebayes_ploidy = 2 genotyping_freebayes_skip_coverage = 0 } diff --git a/nextflow_schema.json b/nextflow_schema.json index b1dca9668..38024f4db 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -892,6 +892,13 @@ "description": "Skip bcftools stats generation for VCF based variant calling statistics", "help_text": "Disables running of `bcftools stats` against VCF files from GATK and FreeBayes genotypers.\n\nIf ran, `bcftools stats` will automatically include the FASTA reference for INDEL-related statistics." }, + "genotyping_reference_ploidy": { + "type": "integer", + "default": 2, + "description": "Specify the ploidy of the reference organism.", + "help_text": "Specify the desired ploidy value of your reference organism for genotyping with GATK or FreeBayes. E.g. 
if you want to allow heterozygous calls this value should be >= 2.\n\n> Modifies GATK UnifiedGenotyper parameter: `--sample_ploidy`\n> Modifies GATK HaplotypeCaller parameter: `--sample-ploidy`\n> Modifies FreeBayes parameter: `-p`", + "fa_icon": "fas fa-pastafarianism" + }, "genotyping_pileupcaller_min_base_quality": { "type": "integer", "default": 30, @@ -943,13 +950,6 @@ "description": "Specify GATK phred-scaled confidence threshold.", "help_text": "If selected, specify a GATK genotyper phred-scaled confidence threshold of a given SNP/INDEL call.\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: `-stand_call_conf`" }, - "genotyping_gatk_ploidy": { - "type": "integer", - "default": 2, - "fa_icon": "fas fa-pastafarianism", - "description": "Specify GATK organism ploidy.", - "help_text": "If selected, specify a GATK genotyper ploidy value of your reference organism. E.g. if you want to allow heterozygous calls from >= diploid organisms (e.g. for multi-allele frequency reporting in MultiVCFAnalyzer).\n\n> Modifies GATK UnifiedGenotyper or HaplotypeCaller parameter: --sample-ploidy" - }, "genotyping_gatk_dbsnp": { "type": "string", "help_text": "(Optional) Specify VCF file for output VCF SNP annotation e.g. if you want to annotate your VCF file with 'rs' SNP IDs. Check GATK documentation for more information. Gzip not accepted.", @@ -1024,13 +1024,6 @@ "description": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than specified.", "help_text": "Specify to skip over regions of high depth by discarding alignments overlapping positions where total read depth is greater than the specified value. 
Setting to 0 (the default) deactivates this behaviour.\n\n> Modifies freebayes parameter: `-g`", "fa_icon": "fab fa-think-peaks" - }, - "genotyping_freebayes_ploidy": { - "type": "integer", - "default": 2, - "description": "Specify ploidy of sample in FreeBayes.", - "help_text": "Specify ploidy of sample in FreeBayes.\n\n> Modifies freebayes parameter: `-p`", - "fa_icon": "fas fa-pastafarianism" } }, "fa_icon": "fas fa-sliders-h", diff --git a/subworkflows/local/reference_indexing_multi.nf b/subworkflows/local/reference_indexing_multi.nf index 316cfc194..0f6bade30 100644 --- a/subworkflows/local/reference_indexing_multi.nf +++ b/subworkflows/local/reference_indexing_multi.nf @@ -31,7 +31,7 @@ workflow REFERENCE_INDEXING_MULTI { row -> def meta = [:] meta.id = row["reference_name"] - meta.ploidy = row["genotyping_gatk_ploidy"] != "" ? row["genotyping_gatk_ploidy"] : params.genotyping_gatk_ploidy // Use default value if none is specified. This info goes in the meta + meta.ploidy = row["genotyping_reference_ploidy"] != "" ? row["genotyping_reference_ploidy"] : params.genotyping_reference_ploidy // Use default value if none is specified. This info goes in the meta def fasta = file(row["fasta"], checkIfExists: true) // mandatory parameter! def fai = row["fai"] != "" ? file(row["fai"] , checkIfExists: true) : "" def dict = row["dict"] != "" ? file(row["dict"] , checkIfExists: true) : "" diff --git a/subworkflows/local/reference_indexing_single.nf b/subworkflows/local/reference_indexing_single.nf index a46c80424..060b3e406 100644 --- a/subworkflows/local/reference_indexing_single.nf +++ b/subworkflows/local/reference_indexing_single.nf @@ -88,9 +88,9 @@ workflow REFERENCE_INDEXING_SINGLE { def pileupcaller_snp = params.genotyping_pileupcaller_snpfile != null ? file(params.genotyping_pileupcaller_snpfile, checkIfExists: true ) : "" def sexdet_bed = "" def bedtools_feature = params.mapstats_bedtools_featurefile != null ? 
file(params.mapstats_bedtools_featurefile, checkIfExists: true ) : "" - def genotyping_gatk_ploidy = params.genotyping_gatk_ploidy + def genotyping_reference_ploidy = params.genotyping_reference_ploidy def genotyping_gatk_dbsnp = params.genotyping_gatk_dbsnp != null ? file(params.genotyping_gatk_dbsnp, checkIfExists: true ) : "" - [ meta + [ ploidy: genotyping_gatk_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] + [ meta + [ ploidy: genotyping_reference_ploidy ], fasta, fai, dict, mapper_index, params.fasta_circular_target, params.mitochondrion_header, contamination_estimation_angsd_hapmap, pmd_masked_fasta, pmd_bed_for_masking, capture_bed, pileupcaller_bed, pileupcaller_snp, sexdet_bed, bedtools_feature, genotyping_gatk_dbsnp ] } ch_ref_index_single = ch_reference_for_mapping From bc1c9245941700036d3ced7e9bab1570fa671663 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 1 Mar 2024 10:53:52 +0100 Subject: [PATCH 090/110] Install BCFTOOLS_INDEX --- modules.json | 5 ++ .../nf-core/bcftools/index/environment.yml | 7 +++ modules/nf-core/bcftools/index/main.nf | 51 +++++++++++++++++++ modules/nf-core/bcftools/index/meta.yml | 48 +++++++++++++++++ 4 files changed, 111 insertions(+) create mode 100644 modules/nf-core/bcftools/index/environment.yml create mode 100644 modules/nf-core/bcftools/index/main.nf create mode 100644 modules/nf-core/bcftools/index/meta.yml diff --git a/modules.json b/modules.json index cf345fef8..00b7648f7 100644 --- a/modules.json +++ b/modules.json @@ -30,6 +30,11 @@ "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, + "bcftools/index": { + "branch": "master", + "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", + "installed_by": ["modules"] + }, 
"bcftools/stats": { "branch": "master", "git_sha": "44096c08ffdbc694f5f92ae174ea0f7ba0f37e09", diff --git a/modules/nf-core/bcftools/index/environment.yml b/modules/nf-core/bcftools/index/environment.yml new file mode 100644 index 000000000..bbee37ad5 --- /dev/null +++ b/modules/nf-core/bcftools/index/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.18 diff --git a/modules/nf-core/bcftools/index/main.nf b/modules/nf-core/bcftools/index/main.nf new file mode 100644 index 000000000..4cd0dcbb5 --- /dev/null +++ b/modules/nf-core/bcftools/index/main.nf @@ -0,0 +1,51 @@ +process BCFTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0': + 'biocontainers/bcftools:1.18--h8b25389_0' }" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("*.csi"), optional:true, emit: csi + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + bcftools \\ + index \\ + $args \\ + --threads $task.cpus \\ + $vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--tsi") || args.contains("-t") ? 
"tbi" : + "csi" + """ + touch ${vcf}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/index/meta.yml b/modules/nf-core/bcftools/index/meta.yml new file mode 100644 index 000000000..fc340cbc8 --- /dev/null +++ b/modules/nf-core/bcftools/index/meta.yml @@ -0,0 +1,48 @@ +name: bcftools_index +description: Index VCF tools +keywords: + - vcf + - index + - bcftools + - csi + - tbi +tools: + - bcftools: + description: BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed. Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF will work in all situations. Un-indexed VCF and BCF and streams will work in most, but not all situations. + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/howtos/index.html + tool_dev_url: https://github.com/samtools/bcftools + doi: "10.1093/gigascience/giab008" + licence: ["MIT", "GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - VCF: + type: file + description: VCF file (optionally GZIPPED) + pattern: "*.{vcf,vcf.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - version: + type: file + description: File containing software version + pattern: "versions.yml" + - csi: + type: file + description: Default VCF file index file + pattern: "*.csi" + - tbi: + type: file + description: Alternative VCF file index file for larger files (activated with -t parameter) + pattern: "*.tbi" +authors: + - "@jfy133" +maintainers: + - "@jfy133" From b9ef51f100547e94fa9fbaabc6c356ef90d92d79 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 1 Mar 2024 11:35:13 +0100 Subject: [PATCH 091/110] update gatk_HC module --- modules.json | 2 +- modules/nf-core/gatk4/haplotypecaller/environment.yml | 2 +- modules/nf-core/gatk4/haplotypecaller/main.nf | 4 ++-- modules/nf-core/gatk4/haplotypecaller/meta.yml | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modules.json b/modules.json index 00b7648f7..7a0daffc4 100644 --- a/modules.json +++ b/modules.json @@ -152,7 +152,7 @@ }, "gatk4/haplotypecaller": { "branch": "master", - "git_sha": "eab2bd29e589bd05da2b47c9bf95ef10b9508699", + "git_sha": "d742e3143f2ccb8853c29b35cfcf50b5e5026980", "installed_by": ["modules"] }, "gunzip": { diff --git a/modules/nf-core/gatk4/haplotypecaller/environment.yml b/modules/nf-core/gatk4/haplotypecaller/environment.yml index 0c8f32fa6..d4e8d3602 100644 --- a/modules/nf-core/gatk4/haplotypecaller/environment.yml +++ b/modules/nf-core/gatk4/haplotypecaller/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::gatk4=4.4.0.0 + - bioconda::gatk4=4.5.0.0 diff --git a/modules/nf-core/gatk4/haplotypecaller/main.nf b/modules/nf-core/gatk4/haplotypecaller/main.nf index a6a71d562..3043ee07a 100644 --- a/modules/nf-core/gatk4/haplotypecaller/main.nf +++ b/modules/nf-core/gatk4/haplotypecaller/main.nf @@ -4,8 +4,8 @@ process GATK4_HAPLOTYPECALLER { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': - 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/gatk4:4.5.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.5.0.0--py36hdfd78af_0' }" input: tuple val(meta), path(input), path(input_index), path(intervals), path(dragstr_model) diff --git a/modules/nf-core/gatk4/haplotypecaller/meta.yml b/modules/nf-core/gatk4/haplotypecaller/meta.yml index 2085c2db2..703b99a09 100644 --- a/modules/nf-core/gatk4/haplotypecaller/meta.yml +++ b/modules/nf-core/gatk4/haplotypecaller/meta.yml @@ -39,7 +39,7 @@ input: type: map description: | Groovy Map containing reference information - e.g. [ id:'test', single_end:false ] + e.g. [ id:'test_reference' ] - fasta: type: file description: The reference fasta file @@ -48,7 +48,7 @@ input: type: map description: | Groovy Map containing reference information - e.g. [ id:'test', single_end:false ] + e.g. [ id:'test_reference' ] - fai: type: file description: Index of reference fasta file @@ -57,7 +57,7 @@ input: type: map description: | Groovy Map containing reference information - e.g. [ id:'test', single_end:false ] + e.g. [ id:'test_reference' ] - dict: type: file description: GATK sequence dictionary @@ -66,7 +66,7 @@ input: type: map description: | Groovy Map containing dbsnp information - e.g. [ id:'test', single_end:false ] + e.g. [ id:'test_dbsnp' ] - dbsnp: type: file description: VCF file containing known sites (optional) @@ -74,7 +74,7 @@ input: type: map description: | Groovy Map containing dbsnp information - e.g. [ id:'test', single_end:false ] + e.g. 
[ id:'test_dbsnp' ] - dbsnp_tbi: type: file description: VCF index of dbsnp (optional) From 24525b17b57330a9710776d546a204bd74569eda Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 1 Mar 2024 11:35:35 +0100 Subject: [PATCH 092/110] index VCF files --- conf/modules.config | 24 ++++++++++++++++++++++++ subworkflows/local/genotype.nf | 33 ++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6711ebf1d..8e31a61ae 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1030,6 +1030,18 @@ process { ] } + withName: BCFTOOLS_INDEX_UG { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.args = "--tbi" //tbi indices for consistency with GATK HC + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.vcf.gz.tbi' + ] + } + withName: GATK4_HAPLOTYPECALLER { tag = { "${meta.reference}|${meta.sample_id}" } ext.args = {[ @@ -1064,6 +1076,18 @@ process { ] } + withName: BCFTOOLS_INDEX_FREEBAYES { + tag = { "${meta.reference}|${meta.sample_id}" } + ext.args = "--tbi" //tbi indices for consistency with GATK HC + ext.prefix = { "${meta.sample_id}_${meta.reference}" } + publishDir = [ + path: { "${params.outdir}/genotyping/" }, + mode: params.publish_dir_mode, + enabled: true, + pattern: '*.vcf.gz.tbi' + ] + } + withName: BCFTOOLS_STATS_GENOTYPING { tag = { "${meta.reference}|${meta.sample_id}" } ext.prefix = { "${meta.sample_id}_${meta.reference}" } diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 73315ed5f..e89edd93c 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -12,6 +12,8 @@ include { GATK_UNIFIEDGENOTYPER } from '../../module include { GATK4_HAPLOTYPECALLER } from '../../modules/nf-core/gatk4/haplotypecaller/main' include { FREEBAYES } from 
'../../modules/nf-core/freebayes/main' include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_UG } from '../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_FREEBAYES } from '../../modules/nf-core/bcftools/index/main' // TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. workflow GENOTYPE { @@ -26,6 +28,7 @@ workflow GENOTYPE { ch_multiqc_files = Channel.empty() ch_pileupcaller_genotypes = Channel.empty() ch_eigenstrat_coverage_stats = Channel.empty() + ch_genotypes_vcf = Channel.empty() ch_gatk_haplotypecaller_genotypes = Channel.empty() ch_gatk_unifiedgenotyper_genotypes = Channel.empty() ch_freebayes_genotypes = Channel.empty() @@ -219,7 +222,6 @@ workflow GENOTYPE { dbsnp: [ ref_meta, dbsnp ] } - // TODO: Should the vcfs be indexed with bcftools index? VCFs from HC are indexed. GATK_UNIFIEDGENOTYPER( ch_bams_for_ug.bam, ch_bams_for_ug.fasta, @@ -230,8 +232,14 @@ workflow GENOTYPE { ch_bams_for_ug.dbsnp, [[], []] // No comp ) - ch_gatk_unifiedgenotyper_genotypes = GATK_UNIFIEDGENOTYPER.out.vcf + ch_gatk_ug_vcf = GATK_UNIFIEDGENOTYPER.out.vcf ch_versions = ch_versions.mix( GATK_UNIFIEDGENOTYPER.out.versions.first() ) + + // Index the VCFs + BCFTOOLS_INDEX_UG( ch_gatk_ug_vcf ) + ch_versions = ch_versions.mix( BCFTOOLS_INDEX_UG.out.versions.first() ) + + ch_genotypes_vcf = ch_gatk_ug_vcf.join(BCFTOOLS_INDEX_UG.out.tbi) // [ [ meta ], vcf, tbi ] } if ( params.genotyping_tool == 'hc' ) { @@ -276,7 +284,7 @@ workflow GENOTYPE { ch_input_for_hc.dbsnp, [[], []] // No dbsnp_tbi ) - ch_gatk_haplotypecaller_genotypes = GATK4_HAPLOTYPECALLER.out.vcf + ch_genotypes_vcf = GATK4_HAPLOTYPECALLER.out.vcf.join( GATK4_HAPLOTYPECALLER.out.tbi ) // [ [ meta ], vcf, tbi ] ch_versions = ch_versions.mix( GATK4_HAPLOTYPECALLER.out.versions.first() ) } @@ -315,7 +323,6 @@ workflow GENOTYPE { fai: [ ref_meta, fai ] } - // 
TODO: Should the vcfs be indexed with bcftools index? VCFs from HC are indexed. FREEBAYES( ch_input_for_freebayes.bam, ch_input_for_freebayes.fasta, @@ -326,6 +333,12 @@ workflow GENOTYPE { ) ch_freebayes_genotypes = FREEBAYES.out.vcf ch_versions = ch_versions.mix( FREEBAYES.out.versions.first() ) + + // Index the VCFs + BCFTOOLS_INDEX_FREEBAYES( ch_freebayes_genotypes ) + ch_versions = ch_versions.mix( BCFTOOLS_INDEX_FREEBAYES.out.versions.first() ) + + ch_genotypes_vcf = ch_freebayes_genotypes.join(BCFTOOLS_INDEX_FREEBAYES.out.tbi) // [ [ meta ], vcf, tbi ] } if ( params.genotyping_tool == 'angsd' ) { @@ -334,16 +347,14 @@ workflow GENOTYPE { // Run BCFTOOLS_STATS on output from GATK UG, HC and Freebayes if ( !params.skip_bcftools_stats && ( params.genotyping_tool == 'hc' || params.genotyping_tool == 'ug' || params.genotyping_tool == 'freebayes' ) ) { - ch_bcftools_input= ch_gatk_unifiedgenotyper_genotypes - .mix( ch_gatk_haplotypecaller_genotypes ) - .mix( ch_freebayes_genotypes ) + ch_bcftools_input= ch_genotypes_vcf .map { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) - } - .combine( ch_fasta_for_multimap , by:0 ) + }.dump(tag:"ch_bcftools_input") + .combine( ch_fasta_for_multimap , by:0 ).dump(tag:"ch_bcftools_combined") .multiMap { - ignore_me, meta, vcf, ref_meta, fasta, fai, dict, dbsnp -> - vcf: [ meta, vcf, [] ] // bcftools stats module expects a tbi file with the vcf. + ignore_me, meta, vcf, tbi, ref_meta, fasta, fai, dict, dbsnp -> + vcf: [ meta, vcf, tbi ] // bcftools stats module expects a tbi file with the vcf. 
fasta: [ ref_meta, fasta ] } From 8edd468f0fa3a2564da08d7fc3bbc922129a4ddc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Mar 2024 10:07:48 +0100 Subject: [PATCH 093/110] add warning about angsd --- subworkflows/local/genotype.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index e89edd93c..6dabb530f 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -342,7 +342,7 @@ workflow GENOTYPE { } if ( params.genotyping_tool == 'angsd' ) { - // TODO no module for angsd genotyping yet + log.warn("[nf-core/eager] Genotyping with ANGSD is not yet implemented .") } // Run BCFTOOLS_STATS on output from GATK UG, HC and Freebayes From f34045e24189c965b5b90026994be6af3ddb0f1c Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Mar 2024 11:51:41 +0100 Subject: [PATCH 094/110] rename meta attribute id -> sample_id for consistency --- conf/modules.config | 2 +- subworkflows/local/genotype.nf | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 8e31a61ae..f3bd302ab 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -954,7 +954,7 @@ process { "--${params.genotyping_pileupcaller_method}", params.genotyping_pileupcaller_transitions_mode == "SkipTransitions" ? "--skipTransitions" : params.genotyping_pileupcaller_transitions_mode == "TransitionsMissing" ? "--transitionsMissing" : "", "${meta.strandedness}" == 'single' ? 
"--singleStrandMode" : "" , - "--sampleNames", meta.id.join(","), + "--sampleNames", meta.sample_id.join(","), "-e pileupcaller.${meta.strandedness}.${meta.reference}" ].join(' ').trim() } ext.prefix = { "${meta.strandedness}_${meta.reference}" } diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 6dabb530f..2a88a11b3 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -70,9 +70,9 @@ workflow GENOTYPE { .groupTuple() .map { combo_meta, metas, bams, bais -> - def ids = metas.collect { meta -> meta.id } - [ combo_meta + [id: ids], bams ] // Drop bais - } // Collect all IDs into a list in meta.id. Useful when running pileupCaller later + def ids = metas.collect { meta -> meta.sample_id } + [ combo_meta + [sample_id: ids], bams ] // Drop bais + } // Collect all IDs into a list in meta.sample_id. Useful when running pileupCaller later // Combine prepped bams and references ch_mpileup_inputs = ch_mpileup_inputs_bams From c4614786d064b11689b948f26dce6979ff5c2c7e Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 8 Mar 2024 11:57:02 +0100 Subject: [PATCH 095/110] simplify output channels --- subworkflows/local/genotype.nf | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 2a88a11b3..c5a881030 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -29,10 +29,6 @@ workflow GENOTYPE { ch_pileupcaller_genotypes = Channel.empty() ch_eigenstrat_coverage_stats = Channel.empty() ch_genotypes_vcf = Channel.empty() - ch_gatk_haplotypecaller_genotypes = Channel.empty() - ch_gatk_unifiedgenotyper_genotypes = Channel.empty() - ch_freebayes_genotypes = Channel.empty() - ch_angsd_genotypes = Channel.empty() ch_bcftools_stats = Channel.empty() if ( params.genotyping_tool == 'pileupcaller' ) { @@ -372,13 +368,10 @@ workflow GENOTYPE { } emit: - geno_pileupcaller = 
ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] - geno_gatk_hc = ch_gatk_haplotypecaller_genotypes // [ [ meta ], vcf ] ] - geno_gatk_ug = ch_gatk_unifiedgenotyper_genotypes // [ [ meta ], vcf ] ] - geno_freebayes = ch_freebayes_genotypes // [ [ meta ], vcf ] ] - geno_angsd = ch_angsd_genotypes // [ [ meta ], glf ] ] - vcf_stats = ch_bcftools_stats // [ [ meta ], stats ] - eigenstrat_coverage_stats = ch_eigenstrat_coverage_stats // [ [ meta ], stats ] - versions = ch_versions - mqc = ch_multiqc_files + eigenstrat = ch_pileupcaller_genotypes // [ [ meta ], geno, snp, ind ] + vcf = ch_genotypes_vcf // [ [ meta ], vcf ] ] + vcf_stats = ch_bcftools_stats // [ [ meta ], stats ] + eigenstrat_coverage = ch_eigenstrat_coverage_stats // [ [ meta ], stats ] + versions = ch_versions + mqc = ch_multiqc_files } From 322ffa1f8cd3e32c23fa640219d0c34abbf43c42 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 14 Mar 2024 14:03:47 +0100 Subject: [PATCH 096/110] Update GATK_UG --- modules.json | 2 +- modules/nf-core/gatk/unifiedgenotyper/environment.yml | 1 + modules/nf-core/gatk/unifiedgenotyper/main.nf | 6 +++--- modules/nf-core/gatk/unifiedgenotyper/meta.yml | 9 +++++++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/modules.json b/modules.json index 7a0daffc4..d4218b15b 100644 --- a/modules.json +++ b/modules.json @@ -147,7 +147,7 @@ }, "gatk/unifiedgenotyper": { "branch": "master", - "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "git_sha": "6fa52bdc450257d9dd0c149ef2f6467fcebeca5d", "installed_by": ["modules"] }, "gatk4/haplotypecaller": { diff --git a/modules/nf-core/gatk/unifiedgenotyper/environment.yml b/modules/nf-core/gatk/unifiedgenotyper/environment.yml index 7201ece73..4137c3489 100644 --- a/modules/nf-core/gatk/unifiedgenotyper/environment.yml +++ b/modules/nf-core/gatk/unifiedgenotyper/environment.yml @@ -5,3 +5,4 @@ channels: - defaults dependencies: - bioconda::gatk=3.5 + - bioconda::tabix=1.11 ## Needed for 
bgzip diff --git a/modules/nf-core/gatk/unifiedgenotyper/main.nf b/modules/nf-core/gatk/unifiedgenotyper/main.nf index dffa1d5a0..ecc132970 100644 --- a/modules/nf-core/gatk/unifiedgenotyper/main.nf +++ b/modules/nf-core/gatk/unifiedgenotyper/main.nf @@ -4,8 +4,8 @@ process GATK_UNIFIEDGENOTYPER { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/gatk:3.5--hdfd78af_11': - 'biocontainers/gatk:3.5--hdfd78af_11' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-5e3fd88c6b8af48bb5982d5721ca5e36da94029b:c496eeb8cc9067e0720d35121dbff7732a7ebdb0-0': + 'biocontainers/mulled-v2-5e3fd88c6b8af48bb5982d5721ca5e36da94029b:c496eeb8cc9067e0720d35121dbff7732a7ebdb0-0' }" input: tuple val(meta), path(bam), path(bai) @@ -53,7 +53,7 @@ process GATK_UNIFIEDGENOTYPER { -o ${prefix}.vcf \\ $args - gzip -n *.vcf + bgzip ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/gatk/unifiedgenotyper/meta.yml b/modules/nf-core/gatk/unifiedgenotyper/meta.yml index 708815eec..95ac33375 100644 --- a/modules/nf-core/gatk/unifiedgenotyper/meta.yml +++ b/modules/nf-core/gatk/unifiedgenotyper/meta.yml @@ -9,7 +9,12 @@ tools: description: "The full Genome Analysis Toolkit (GATK) framework, license restricted." homepage: "https://gatk.broadinstitute.org/hc/en-us" documentation: "https://github.com/broadinstitute/gatk-docs" - licence: "['https://software.broadinstitute.org/gatk/download/licensing', 'BSD', 'https://www.broadinstitute.org/gatk/about/#licensing']" + licence: + [ + "https://software.broadinstitute.org/gatk/download/licensing", + "BSD", + "https://www.broadinstitute.org/gatk/about/#licensing", + ] input: - meta: type: map @@ -74,7 +79,7 @@ input: description: | Groovy Map containing file meta-information for the dbsnps file. e.g. 
[ id:'test', single_end:false ] - - dbsnps: + - dbsnp: type: file description: VCF file containing known sites (optional) pattern: "*" From 83808edf66588257c69b320568c2c5ac7bac3c05 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 14 Mar 2024 14:15:34 +0100 Subject: [PATCH 097/110] remove dumps --- subworkflows/local/genotype.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index c5a881030..f89134763 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -346,8 +346,8 @@ workflow GENOTYPE { ch_bcftools_input= ch_genotypes_vcf .map { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) - }.dump(tag:"ch_bcftools_input") - .combine( ch_fasta_for_multimap , by:0 ).dump(tag:"ch_bcftools_combined") + } + .combine( ch_fasta_for_multimap , by:0 ) .multiMap { ignore_me, meta, vcf, tbi, ref_meta, fasta, fai, dict, dbsnp -> vcf: [ meta, vcf, tbi ] // bcftools stats module expects a tbi file with the vcf. From 9b4a484ce6c3de202ece790b7323cb6996e560d9 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Thu, 14 Mar 2024 14:15:58 +0100 Subject: [PATCH 098/110] update manual_tests.md --- docs/development/manual_tests.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/development/manual_tests.md b/docs/development/manual_tests.md index 73adc7de7..23a7d8232 100644 --- a/docs/development/manual_tests.md +++ b/docs/development/manual_tests.md @@ -733,13 +733,13 @@ These tests were ran before library merging was implemented. ```bash ## Gatk UG on raw reads -## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. +## Expect: One VCF + .tbi index per sample/reference combination. Also 1 bcftools_stats file per VCF. 
Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels ``` ```bash ## Gatk UG on trimmed reads. Skip bcftools stats. -## Expect: One VCF per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. No bcftools_stats file per VCF. +## Expect: One VCF + .tbi index per sample/reference combination, based on the trimmed bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. No bcftools_stats file per VCF. ## Checked that the input bam for the UG jobs indeed had trimmed reads. (The full UDG sample has untrimmed bams.) nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ --run_trim_bam \ @@ -751,21 +751,21 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ```bash ## Gatk UG on pmd-filtered reads -## Expect: One VCF per sample/reference combination, based on the pmd-filtered bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. Also 1 bcftools_stats file per VCF. +## Expect: One VCF + .tbi index per sample/reference combination, based on the pmd-filtered bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. Also 1 bcftools_stats file per VCF. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'pmd' -ansi-log false -dump-channels --run_pmd_filtering ## Checked that the bams had fewer reads compared to the raw bams. 
``` ```bash ## Gatk UG on rescaled reads -## Expect: One VCF per sample/reference combination, based on the rescaled bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. Also 1 bcftools_stats file per VCF. +## Expect: One VCF + .tbi index per sample/reference combination, based on the rescaled bams (this actually shows on the IndelRealigner step and not the UG step). No IR directory. Also 1 bcftools_stats file per VCF. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'rescaled' -ansi-log false -dump-channels --run_mapdamage_rescaling ``` ```bash ## Gatk UG on raw reads, multiple references ## NOTE: Actually fails due to header of BAM input in test_multiref not matching sequences in fasta (which was shortened to chr 21+ for brevity). Provided alternative input without a BAM input line. ( head -n 5 on https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/samplesheet_multilane_multilib.tsv ). It then worked fine. -## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. +## Expect: One VCF + .tbi index per sample/reference combination. Also 1 bcftools_stats file per VCF. Additional IR/ subdirectory with 1 bam and 1 bai per sample/reference combination. nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_multilane_multilib_noBAM.tsv --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'ug' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels ``` @@ -805,13 +805,13 @@ nextflow run main.nf -profile test_multiref,docker --input test/samplesheet_mult ```bash ## Freebayes on raw reads -## Expect: One VCF per sample/reference combination. Also 1 bcftools_stats file per VCF. 
+## Expect: One VCF + .tbi index per sample/reference combination. Also 1 bcftools_stats file per VCF. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw' -ansi-log false -dump-channels ``` ```bash ## Freebayes on trimmed reads. Different options, and skip bcftools stats. -## Expect: One VCF per sample/reference combination. +## Expect: One VCF + .tbi index per sample/reference combination. ## Checked .command.sh for correct args. nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'trimmed' -ansi-log false -dump-channels --skip_bcftools_stats \ --run_trim_bam \ @@ -823,7 +823,7 @@ nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -- ```bash ## Freebayes on raw reads, multiple references ## Freebayes does not complain about the BAM header not matching the reference. -## Expect: One VCF per sample/reference combination. BAM input only has 1 output for the specified reference. Also 1 bcftools_stats file per VCF. +## Expect: One VCF + .tbi index per sample/reference combination. BAM input only has 1 output for the specified reference. Also 1 bcftools_stats file per VCF. 
nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw' --genotyping_gatk_ug_keeprealignbam -ansi-log false -dump-channels ``` From 9b448f32a52db449a8d22cd50958ad800e543f5a Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 15 Mar 2024 10:54:46 +0100 Subject: [PATCH 099/110] add genotyper to meta of genotypes --- subworkflows/local/genotype.nf | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index f89134763..e60c8a716 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -123,7 +123,12 @@ workflow GENOTYPE { } COLLECT_GENOTYPES( ch_final_genotypes ) + // Add genotyper info to the meta ch_pileupcaller_genotypes = COLLECT_GENOTYPES.out.collected + .map { + meta, geno, snp, ind -> + [ meta + [ genotyper: "pileupcaller" ], geno , snp, ind ] + } ch_versions = ch_versions.mix( COLLECT_GENOTYPES.out.versions.first() ) // Calculate coverage stats for collected eigenstrat dataset @@ -235,7 +240,12 @@ workflow GENOTYPE { BCFTOOLS_INDEX_UG( ch_gatk_ug_vcf ) ch_versions = ch_versions.mix( BCFTOOLS_INDEX_UG.out.versions.first() ) - ch_genotypes_vcf = ch_gatk_ug_vcf.join(BCFTOOLS_INDEX_UG.out.tbi) // [ [ meta ], vcf, tbi ] + // Add genotyper info to the meta + ch_genotypes_vcf = ch_gatk_ug_vcf.join(BCFTOOLS_INDEX_UG.out.tbi) + .map { + meta, vcf, tbi -> + [ meta + [ genotyper: "ug" ], vcf , tbi ] + } } if ( params.genotyping_tool == 'hc' ) { @@ -280,7 +290,12 @@ workflow GENOTYPE { ch_input_for_hc.dbsnp, [[], []] // No dbsnp_tbi ) - ch_genotypes_vcf = GATK4_HAPLOTYPECALLER.out.vcf.join( GATK4_HAPLOTYPECALLER.out.tbi ) // [ [ meta ], vcf, tbi ] + // Add genotyper info to the meta + ch_genotypes_vcf = GATK4_HAPLOTYPECALLER.out.vcf.join( GATK4_HAPLOTYPECALLER.out.tbi ) + .map { + meta, vcf, tbi -> + [ meta + [ genotyper: "hc" 
], vcf , tbi ] + } ch_versions = ch_versions.mix( GATK4_HAPLOTYPECALLER.out.versions.first() ) } @@ -334,7 +349,12 @@ workflow GENOTYPE { BCFTOOLS_INDEX_FREEBAYES( ch_freebayes_genotypes ) ch_versions = ch_versions.mix( BCFTOOLS_INDEX_FREEBAYES.out.versions.first() ) - ch_genotypes_vcf = ch_freebayes_genotypes.join(BCFTOOLS_INDEX_FREEBAYES.out.tbi) // [ [ meta ], vcf, tbi ] + // Add genotyper info to the meta + ch_genotypes_vcf = ch_freebayes_genotypes.join(BCFTOOLS_INDEX_FREEBAYES.out.tbi) + .map { + meta, vcf, tbi -> + [ meta + [ genotyper: "freebayes" ], vcf , tbi ] + } } if ( params.genotyping_tool == 'angsd' ) { From 8df94c484d5ad8be3043c5419c3b91cf9be91702 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Fri, 15 Mar 2024 11:02:39 +0100 Subject: [PATCH 100/110] remove todos --- subworkflows/local/genotype.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index e60c8a716..860da78d3 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -14,7 +14,6 @@ include { FREEBAYES } from '../../module include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_UG } from '../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_FREEBAYES } from '../../modules/nf-core/bcftools/index/main' -// TODO Add ANGSD GTL module. The current module does not pick up the .glf.gz output files. workflow GENOTYPE { take: @@ -306,7 +305,6 @@ workflow GENOTYPE { WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) } - // TODO Do we want to provide SNP capture bed file to Freebayes? It would then genotype only on those positions. // NOTE: dbsnp is not used by Freebayes, but we need to provide it to the module anyway, to ensure correct cardinality of the fasta channel within the BCFTOOLS_STATS channel operations. // i.e. 
to keep the definition of the ch_fasta_for_multimap channel consistent regardless of genotyper, so the `combine -> multiMap` in lines 327-328 work. ch_fasta_for_multimap = ch_fasta_plus From d20d2ce2158c801f8df8a0553003d2b733bc43f2 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 18 Mar 2024 10:09:07 +0100 Subject: [PATCH 101/110] Add output information on genotypers --- docs/output.md | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/docs/output.md b/docs/output.md index c4b78de53..627743373 100644 --- a/docs/output.md +++ b/docs/output.md @@ -540,3 +540,67 @@ is a tool which calculates a variety of standard 'aDNA' metrics from a BAM file. [ANGSD](http://www.popgen.dk/angsd/index.php/ANGSD) is a software for analyzing next generation sequencing data. Among other functions, ANGSD can estimate contamination for chromosomes for which one copy exists, i.e. X-chromosome for humans with karyotype XY. To do this, we first generate a binary count file for the X-chromosome (`angsd`) and then perform a Fisher's exact test for finding a p-value and jackknife to get an estimate of contamination (`contamination`). Contamination is estimated with Method of Moments (MOM) and Maximum Likelihood (ML) for both Method1 and Method2. Method1 compares the total number of minor and major reads at SNP sites with the number of minor and major reads at adjacent sites, assuming independent errors between reads and sites, while Method2 only samples one read at each site to remove the previous assumption. The results of all methods for each library, as well as respective standard errors are summarised in `nuclear_contamination.txt` and `nuclear_contamination_mqc.json`. + +### Genotyping + +### pileupCaller + +
+Output files + +- `genotyping/` + + - `*.geno`: Eigenstrat-formatted file containing the table of genotype calls. + - `*.snp`: Eigenstrat-formatted file containing the SNP annotation of the genotype table. + - `*.ind`: Eigenstrat-formatted file containing the individual annotation of the genotype table. + - `*_coverage.tsv`: Tab-separated file containing the number of covered SNPs and total number of SNPs for each individual in the eigenstrat dataset. + +
+ +[sequencetools pileupCaller](https://github.com/stschiff/sequenceTools) is a tool to create genotype calls from bam files using read-sampling methods. It can call genotypes from `samtools mpileup` output, and is often used for ancient DNA. The output files are in Eigenstrat format, in which the gentype dataset is split across 3 different files (similar to PLINK). The `.geno` file contains the genotype table. Each number represents the genotype call of an individual at a specific position, with each column represents an individual, while each row represents a SNP. The `.snp` file contains the SNP annotation of the genotype table, and the `.ind` file contains the individual annotation of the genotype table. +We provide an additional file named `*_coverage.tsv` which contains the number of covered SNPs and total number of SNPs for each individual in the eigenstrat dataset. + +### GATK UnifiedGenotyper + +
+Output files + +- `genotyping/` + + - `*.vcf.gz`: VCF file containing the genotype calls for each sample. + - `*.vcf.gz.tbi`: Tabix index file for the VCF file. + - `*.stats.txt`: Statistics of the VCF file from bcftools stats. + +
+ +[GATK's UnifiedGenotyper](https://github.com/broadinstitute/gatk-docs/blob/master/gatk3-tooldocs/3.5-0/org_broadinstitute_gatk_tools_walkers_genotyper_UnifiedGenotyper.html) uses a Bayesian genotype likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples, emitting a genotype for each sample. The system can either emit just the variant sites or complete genotypes (which includes homozygous reference calls) satisfying some phred-scaled confidence value. This tool has been deprecated by the GATK developers in favour of HaplotypeCaller, but is still considered a preferable genotyper for ancient DNA, given its ability to handle low coverage data. The output provided is a bgzipped VCF file containing the genotype calls of each sample, its index file, as well as the statistics of the VCF file generated by `bcftools stats`. + +### GATK HaplotypeCaller + +
+Output files + +- `genotyping/` + + - `*.vcf.gz`: VCF file containing the genotype calls for each sample. + - `*.vcf.gz.tbi`: Tabix index file for the VCF file. + - `*.stats.txt`: Statistics of the VCF file from bcftools stats. + +
+ +[GATK's HaplotypeCaller](https://gatk.broadinstitute.org/hc/en-us/articles/13832687299739-HaplotypeCaller) is capable of calling SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. In addition, HaplotypeCaller is able to handle non-diploid organisms as well as pooled experiment data. This is the preferred genotyper for modern DNA. The output provided is a bgzipped VCF file containing the genotype calls of each sample, its index file, as well as the statistics of the VCF file generated by `bcftools stats`. + +### FreeBayes + +
+Output files + +- `genotyping/` + + - `*.vcf.gz`: VCF file containing the genotype calls for each sample. + - `*.vcf.gz.tbi`: Tabix index file for the VCF file. + - `*.stats.txt`: Statistics of the VCF file from bcftools stats. + +
+ +[FreeBayes](https://github.com/freebayes/freebayes) is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs (single-nucleotide polymorphisms), indels (insertions and deletions), MNPs (multi-nucleotide polymorphisms), and complex events (composite insertion and substitution events) smaller than the length of a short-read sequencing alignment. It calls variants based on the literal sequences of reads aligned to a particular target, not their precise alignment. This model is a straightforward generalization of previous ones (e.g. PolyBayes, samtools, GATK) which detect or report variants based on alignments. This method avoids one of the core problems with alignment-based variant detection - that identical sequences may have multiple possible alignments. The output provided is a bgzipped VCF file containing the genotype calls of each sample, its index file, as well as the statistics of the VCF file generated by `bcftools stats`. From 0e359a406d29d4c44285ea59f00906752a7eb024 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Mon, 18 Mar 2024 10:37:28 +0100 Subject: [PATCH 102/110] Clarify pileupcaller --- docs/output.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index 627743373..144c7bc21 100644 --- a/docs/output.md +++ b/docs/output.md @@ -557,8 +557,9 @@ is a tool which calculates a variety of standard 'aDNA' metrics from a BAM file. -[sequencetools pileupCaller](https://github.com/stschiff/sequenceTools) is a tool to create genotype calls from bam files using read-sampling methods. It can call genotypes from `samtools mpileup` output, and is often used for ancient DNA. The output files are in Eigenstrat format, in which the gentype dataset is split across 3 different files (similar to PLINK). The `.geno` file contains the genotype table.
Each number represents the genotype call of an individual at a specific position, with each column represents an individual, while each row represents a SNP. The `.snp` file contains the SNP annotation of the genotype table, and the `.ind` file contains the individual annotation of the genotype table. -We provide an additional file named `*_coverage.tsv` which contains the number of covered SNPs and total number of SNPs for each individual in the eigenstrat dataset. +[sequencetools pileupCaller](https://github.com/stschiff/sequenceTools) is a tool to create genotype calls from bam files using read-sampling methods. It can call genotypes from `samtools mpileup` output, and is often used for ancient DNA. The output files are in Eigenstrat format, in which the genotype dataset is split across 3 different files (similar to PLINK). The `.geno` file contains the genotype table. Each number represents the genotype call of an individual at a specific position, with each column representing an individual, while each row represents a SNP. The `.snp` file contains the SNP annotation of the genotype table, and the `.ind` file contains the individual annotation of the genotype table. We provide an additional file named `*_coverage.tsv` which contains the number of covered SNPs and total number of SNPs for each individual in the eigenstrat dataset. One dataset is generated per reference genome, which includes genotypes from both single- and double-stranded libraries. + +When using pileupCaller for genotyping, single-stranded and double-stranded libraries are genotyped separately. Single-stranded libraries are genotyped with the additional option `--singleStrandMode`, which ensures that deamination damage artefacts cannot affect the genotype calls, by only using the forward- or reverse-mapping reads when genotyping on transitions (depending on the alleles of the transition).
### GATK UnifiedGenotyper From 2fa3b73966fea4f2a21ca80a0865823727e087c3 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 09:11:30 +0100 Subject: [PATCH 103/110] add citations --- CITATIONS.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/CITATIONS.md b/CITATIONS.md index c0f3d40ca..a0fe01b80 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -103,7 +103,19 @@ > QualiMap Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. Download: http://qualimap.bioinfo.cipf.es/ - [DamageProfiler](https://doi.org/10.1093/bioinformatics/btab190) - > DamageProfiler Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). doi: [10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). Download: https://github.com/Integrative-Transcriptomics/DamageProfiler + > DamageProfiler Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). doi: [10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). + +- [GATK 3.5](https://console.cloud.google.com/storage/browser/gatk) + + > DePristo M, Banks E, Poplin R, Garimella K, Maguire J, Hartl C, Philippakis A, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell T, Kernytsky A, Sivachenko A, Cibulskis K, Gabriel S, Altshuler D, Daly M. (2011). A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nature Genetics, 43(5), 491–498. doi: [10.1038/ng.806](https://doi.org/10.1038/ng.806). 
+ +- [GATK 4.X](https://github.com/broadinstitute/gatk/releases) + + > Poplin R, Ruano-Rubio V, DePristo MA, Fennell TJ, Carneiro MO, Van der Auwera GA, Kling DE, Gauthier LD, Levy-Moonshine A, Roazen D, Shakir K, Thibault J, Chandran S, Whelan C, Lek M, Gabriel S, Daly MJ, Neale B, MacArthur DG, Banks E. (2017). Scaling accurate genetic variant discovery to tens of thousands of samples bioRxiv, 201178. doi: [10.1101/201178](https://doi.org/10.1101/201178). + +- [FreeBayes](https://github.com/freebayes/freebayes) + + > Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing. arXiv preprint arXiv:1207.3907 \[q-bio.GN] 2012. doi: [10.48550/arXiv.1207.3907](https://doi.org/10.48550/arXiv.1207.3907). ## Software packaging/containerisation tools From 780ebaf2b0c3b2577a5b7c867e624fc03723e5bc Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 09:14:32 +0100 Subject: [PATCH 104/110] add bcftools citation --- CITATIONS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index a0fe01b80..177388df0 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -117,6 +117,10 @@ > Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing. arXiv preprint arXiv:1207.3907 \[q-bio.GN] 2012. doi: [10.48550/arXiv.1207.3907](https://doi.org/10.48550/arXiv.1207.3907). +- [BCFtools](https://github.com/samtools/bcftools) + + > Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. 
Bioinformatics (2011) 27(21) 2987-93.doi: [10.1093/bioinformatics/btr509](https://doi.org/10.1093/bioinformatics/btr509) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) From 309b8b8e3ae6f7ac65caa09e521cabfeab4146e7 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 12:22:26 +0100 Subject: [PATCH 105/110] update modules.json (remove dumpSV) --- modules.json | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/modules.json b/modules.json index bac49b14e..914c5cb3a 100644 --- a/modules.json +++ b/modules.json @@ -90,11 +90,6 @@ "git_sha": "02fd5bd7275abad27aad32d5c852e0a9b1b98882", "installed_by": ["modules"] }, - "custom/dumpsoftwareversions": { - "branch": "master", - "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", - "installed_by": ["modules"] - }, "damageprofiler": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", @@ -269,38 +264,34 @@ }, "subworkflows": { "nf-core": { - "bam_docounts_contamination_angsd": { + "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", "installed_by": ["subworkflows"] }, - "bam_split_by_region": { + "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", "installed_by": ["subworkflows"] }, - "fastq_align_bwaaln": { + "utils_nfvalidation_plugin": { "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", "installed_by": ["subworkflows"] - } - } - }, - "subworkflows": { - "nf-core": { - "utils_nextflow_pipeline": { + }, + "bam_docounts_contamination_angsd": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", "installed_by": 
["subworkflows"] }, - "utils_nfcore_pipeline": { + "bam_split_by_region": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", "installed_by": ["subworkflows"] }, - "utils_nfvalidation_plugin": { + "fastq_align_bwaaln": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", "installed_by": ["subworkflows"] } } From b78c3f17ab7ef0877c211b3b62c7d18ab6b791a0 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 12:48:34 +0100 Subject: [PATCH 106/110] validate parameter combinations --- subworkflows/local/utils_nfcore_eager_pipeline/main.nf | 6 ++++++ workflows/eager.nf | 7 ------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf index bbfb95eff..b5c48df27 100644 --- a/subworkflows/local/utils_nfcore_eager_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_eager_pipeline/main.nf @@ -192,6 +192,12 @@ def validateInputParameters() { if ( params.metagenomics_complexity_tool == 'prinseq' && params.metagenomics_prinseq_mode == 'entropy' && params.metagenomics_prinseq_dustscore != 0.5 ) { if (params.metagenomics_complexity_entropy == 0.3) { exit 1, ("[nf-core/eager] ERROR: Metagenomics: You picked PRINSEQ++ with 'entropy' mode but provided a dust score. Please specify an entropy filter threshold using the --metagenomics_complexity_entropy flag") } } + if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } + if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } + if ( params.genotyping_source == 'trimmed' && ! 
params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } + if ( params.genotyping_source == 'pmd' && ! params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } + if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } + if ( ! ( params.fasta.endsWith('csv') || params.fasta.endsWith('tsv') ) && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } } diff --git a/workflows/eager.nf b/workflows/eager.nf index f17965d48..bf56f48bd 100644 --- a/workflows/eager.nf +++ b/workflows/eager.nf @@ -10,13 +10,6 @@ include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_eager_pipeline' include { addNewMetaFromAttributes } from '../subworkflows/local/utils_nfcore_eager_pipeline/main' -// if ( params.run_genotyping && ! params.genotyping_tool ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_tool was specified.") } -// if ( params.run_genotyping && ! params.genotyping_source ) { exit 1, ("[nf-core/eager] ERROR: --run_genotyping was specified, but no --genotyping_source was specified.") } -// if ( params.genotyping_source == 'trimmed' && ! params.run_trim_bam ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'trimmed' unless BAM trimming is turned on with `--run_trim_bam`.") } -// if ( params.genotyping_source == 'pmd' && ! 
params.run_pmd_filtering ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'pmd' unless PMD-filtering is ran.") } -// if ( params.genotyping_source == 'rescaled' && ! params.run_mapdamage_rescaling ) { exit 1, ("[nf-core/eager] ERROR: --genotyping_source cannot be 'rescaled' unless aDNA damage rescaling is ran.") } -// if ( ! ( params.fasta.endsWith('csv') || params.fasta.endsWith('tsv') ) && params.run_genotyping && params.genotyping_tool == 'pileupcaller' && ! (params.genotyping_pileupcaller_bedfile || params.genotyping_pileupcaller_snpfile ) ) { exit 1, ("[nf-core/eager] ERROR: Genotyping with pileupcaller requires both '--genotyping_pileupcaller_bedfile' AND '--genotyping_pileupcaller_snpfile' to be provided.") } - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS From e8b27df344374e55511cc5045171c0671388ce07 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 12:50:38 +0100 Subject: [PATCH 107/110] linting --- modules.json | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/modules.json b/modules.json index 914c5cb3a..f966fd082 100644 --- a/modules.json +++ b/modules.json @@ -264,34 +264,34 @@ }, "subworkflows": { "nf-core": { - "utils_nextflow_pipeline": { + "bam_docounts_contamination_angsd": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", "installed_by": ["subworkflows"] }, - "utils_nfcore_pipeline": { + "bam_split_by_region": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", "installed_by": ["subworkflows"] }, - "utils_nfvalidation_plugin": { + "fastq_align_bwaaln": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", "installed_by": 
["subworkflows"] }, - "bam_docounts_contamination_angsd": { + "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", "installed_by": ["subworkflows"] }, - "bam_split_by_region": { + "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", "installed_by": ["subworkflows"] }, - "fastq_align_bwaaln": { + "utils_nfvalidation_plugin": { "branch": "master", - "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", "installed_by": ["subworkflows"] } } From 0b93d81a1ce307c69786aafbd91d22be895a0167 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 12:58:23 +0100 Subject: [PATCH 108/110] remove lib dependency --- subworkflows/local/genotype.nf | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index 860da78d3..e97d4020d 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -14,6 +14,7 @@ include { FREEBAYES } from '../../module include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_UG } from '../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_FREEBAYES } from '../../modules/nf-core/bcftools/index/main' +include { addNewMetaFromAttributes } from '../subworkflows/local/utils_nfcore_eager_pipeline/main' workflow GENOTYPE { take: @@ -54,13 +55,13 @@ workflow GENOTYPE { .mix( ch_refs_prep.has_aux ) .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + addNewMetaFromAttributes( it, "id" , "reference" , 
false ) } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, bed, snp ] // Prepare collect bams for mpileup ch_mpileup_inputs_bams = ch_bam_bai .map { - WorkflowEager.addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) + addNewMetaFromAttributes( it, ["reference", "strandedness"] , ["reference", "strandedness"] , false ) } .groupTuple() .map { @@ -72,7 +73,7 @@ workflow GENOTYPE { // Combine prepped bams and references ch_mpileup_inputs = ch_mpileup_inputs_bams .map { - WorkflowEager.addNewMetaFromAttributes( it, "reference", "reference" , false ) + addNewMetaFromAttributes( it, "reference", "reference" , false ) } .combine( ch_refs_for_mpileup_pileupcaller , by:0 ) // do not run if no bed file is provided @@ -92,7 +93,7 @@ workflow GENOTYPE { ch_pileupcaller_input = SAMTOOLS_MPILEUP_PILEUPCALLER.out.mpileup .map { - WorkflowEager.addNewMetaFromAttributes( it, "reference", "reference" , false ) + addNewMetaFromAttributes( it, "reference", "reference" , false ) } .combine( ch_refs_for_mpileup_pileupcaller, by:0 ) .multiMap { @@ -113,7 +114,7 @@ workflow GENOTYPE { // Merge/rename genotyping datasets ch_final_genotypes = SEQUENCETOOLS_PILEUPCALLER.out.eigenstrat .map { - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + addNewMetaFromAttributes( it, "reference" , "reference" , false ) } .groupTuple() .map { @@ -144,7 +145,7 @@ workflow GENOTYPE { ch_bams_for_multimap = ch_bam_bai .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + addNewMetaFromAttributes( it, "reference" , "reference" , false ) } ch_fasta_for_multimap = ch_fasta_plus @@ -160,7 +161,7 @@ workflow GENOTYPE { } .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , 
false ) + addNewMetaFromAttributes( it, "id" , "reference" , false ) } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] ch_input_for_targetcreator = ch_bams_for_multimap @@ -187,7 +188,7 @@ workflow GENOTYPE { .join( GATK_REALIGNERTARGETCREATOR.out.intervals ) .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + addNewMetaFromAttributes( it, "reference" , "reference" , false ) } .combine( ch_fasta_for_multimap , by:0 ) .multiMap { @@ -210,7 +211,7 @@ workflow GENOTYPE { // Use realigned bams as input for UG. combine with reference info to get correct ordering. ch_bams_for_ug = GATK_INDELREALIGNER.out.bam .map { - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + addNewMetaFromAttributes( it, "reference" , "reference" , false ) } .combine( ch_fasta_for_multimap , by:0 ) .multiMap { @@ -251,7 +252,7 @@ workflow GENOTYPE { ch_bams_for_multimap = ch_bam_bai .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + addNewMetaFromAttributes( it, "reference" , "reference" , false ) } ch_fasta_for_multimap = ch_fasta_plus @@ -267,7 +268,7 @@ workflow GENOTYPE { } .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + addNewMetaFromAttributes( it, "id" , "reference" , false ) } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] ch_input_for_hc = ch_bams_for_multimap @@ -302,7 +303,7 @@ workflow GENOTYPE { ch_bams_for_multimap = ch_bam_bai .map { // Prepend a new meta that contains the meta.reference value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + 
addNewMetaFromAttributes( it, "reference" , "reference" , false ) } // NOTE: dbsnp is not used by Freebayes, but we need to provide it to the module anyway, to ensure correct cardinality of the fasta channel within the BCFTOOLS_STATS channel operations. @@ -320,7 +321,7 @@ workflow GENOTYPE { } .map { // Prepend a new meta that contains the meta.id value as the new_meta.reference attribute - WorkflowEager.addNewMetaFromAttributes( it, "id" , "reference" , false ) + addNewMetaFromAttributes( it, "id" , "reference" , false ) } // RESULT: [ [combination_meta], [ref_meta], fasta, fai, dict, dbsnp ] ch_input_for_freebayes = ch_bams_for_multimap @@ -363,7 +364,7 @@ workflow GENOTYPE { if ( !params.skip_bcftools_stats && ( params.genotyping_tool == 'hc' || params.genotyping_tool == 'ug' || params.genotyping_tool == 'freebayes' ) ) { ch_bcftools_input= ch_genotypes_vcf .map { - WorkflowEager.addNewMetaFromAttributes( it, "reference" , "reference" , false ) + addNewMetaFromAttributes( it, "reference" , "reference" , false ) } .combine( ch_fasta_for_multimap , by:0 ) .multiMap { From 47636013f292c155424e8c17ebede29ab2d161d1 Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 12:59:39 +0100 Subject: [PATCH 109/110] typo --- subworkflows/local/genotype.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/genotype.nf b/subworkflows/local/genotype.nf index e97d4020d..58bebf64c 100644 --- a/subworkflows/local/genotype.nf +++ b/subworkflows/local/genotype.nf @@ -14,7 +14,7 @@ include { FREEBAYES } from '../../module include { BCFTOOLS_STATS as BCFTOOLS_STATS_GENOTYPING } from '../../modules/nf-core/bcftools/stats/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_UG } from '../../modules/nf-core/bcftools/index/main' include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_FREEBAYES } from '../../modules/nf-core/bcftools/index/main' -include { addNewMetaFromAttributes } from '../subworkflows/local/utils_nfcore_eager_pipeline/main' 
+include { addNewMetaFromAttributes } from '../../subworkflows/local/utils_nfcore_eager_pipeline/main' workflow GENOTYPE { take: From ccd118c51f76255afeeec56c5864b05a8486378a Mon Sep 17 00:00:00 2001 From: Thiseas Christos Lamnidis Date: Tue, 19 Mar 2024 15:35:29 +0100 Subject: [PATCH 110/110] minor edits and linting --- CITATIONS.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CITATIONS.md b/CITATIONS.md index 177388df0..c6e0b8b68 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -100,9 +100,10 @@ - [QualiMap](https://doi.org/10.1093/bioinformatics/btv566) - > QualiMap Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. Download: http://qualimap.bioinfo.cipf.es/ + > QualiMap Okonechnikov, K., Conesa, A., & García-Alcalde, F. (2016). Qualimap 2: advanced multi-sample quality control for high-throughput sequencing data. Bioinformatics , 32(2), 292–294. doi: [10.1093/bioinformatics/btv566](https://doi.org/10.1093/bioinformatics/btv566). - [DamageProfiler](https://doi.org/10.1093/bioinformatics/btab190) + > DamageProfiler Neukamm, J., Peltzer, A., & Nieselt, K. (2020). DamageProfiler: Fast damage pattern calculation for ancient DNA. In Bioinformatics (btab190). doi: [10.1093/bioinformatics/btab190](https://doi.org/10.1093/bioinformatics/btab190). - [GATK 3.5](https://console.cloud.google.com/storage/browser/gatk) @@ -119,7 +120,7 @@ - [BCFtools](https://github.com/samtools/bcftools) - > Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics (2011) 27(21) 2987-93.doi: [10.1093/bioinformatics/btr509](https://doi.org/10.1093/bioinformatics/btr509) + > Li H. A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. 
Bioinformatics (2011) 27(21) 2987-93. doi: [10.1093/bioinformatics/btr509](https://doi.org/10.1093/bioinformatics/btr509). ## Software packaging/containerisation tools