Skip to content

Commit

Permalink
feat: full gatk pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
edotau committed Feb 1, 2024
1 parent 5af2b04 commit acc009d
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 0 deletions.
50 changes: 50 additions & 0 deletions gatk/src/baseRecalibratorBQSR.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash
set -eo pipefail

# Ensure tools are available
export PATH=$PATH:$GATK_PATH

# Assign arguments to variables
BAM=$1
REFERENCE=$2
SNP_VCF=$3
INDEL_VCF=$4

# Check if the correct number of arguments is provided
if [ "$#" -ne 4 ]; then
echo "Usage: $(basename "$0") <reference.fa> <sample.bam> <snp.vcf> <indel.vcf>"
exit 1
fi

# Directories and file names
DIR="bqsr-genotype-vcfs"
RECALIBRATED_BAMS_DIR="recalibrated-bams"
mkdir -p "$DIR" "$RECALIBRATED_BAMS_DIR"

PREFIX=$(basename "$BAM" .bam)
BAM_RECALIBRATED="$RECALIBRATED_BAMS_DIR/${PREFIX}.recal.bam"
RECALIBRATED_TABLE="$RECALIBRATED_BAMS_DIR/${PREFIX}.recal_data.table"
GENOTYPE_VCF="$DIR/${PREFIX}.g.vcf.gz"

# Step 1: Base quality score recalibration
gatk BaseRecalibrator -R "$REFERENCE" \
--known-sites "$SNP_VCF" \
--known-sites "$INDEL_VCF" \
-I "$BAM" \
-O "$RECALIBRATED_TABLE"

# Step 2: Apply BQSR
gatk ApplyBQSR -R "$REFERENCE" \
-I "$BAM" \
--bqsr-recal-file "$RECALIBRATED_TABLE" \
-O "$BAM_RECALIBRATED"

# Step 3: Index recalibrated BAM
samtools index "$BAM_RECALIBRATED"

# Step 4: Variant calling with HaplotypeCaller
gatk HaplotypeCaller \
--input "$BAM_RECALIBRATED" \
--output "$GENOTYPE_VCF" \
--reference "$REFERENCE" \
-ERC GVCF
62 changes: 62 additions & 0 deletions gatk/src/gatkGenotypeVariants.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash
set -eo pipefail

# Check the number of arguments
if [ "$#" -ne 2 ]; then
echo "
Usage:
$(basename "$0") <reference.fa> <raw.genotypes.vcf.gz>
"
exit 1
fi

# Assign arguments to variables
REFERENCE=$1
COHORT_VCF=$2

# Ensure tools are available
export PATH=$PATH:$GATK_PATH

# Define file paths and prefixes
PREFIX=$(basename "$COHORT_VCF" | sed 's/\.vcf\.gz$//;s/\.vcf$//')
SNPS_DB=/path/to/trainingSnpDb.vcf.gz # Update this path to your SNP database
RAW_SNP="${PREFIX}.raw.SNPs.vcf.gz"

BQSR_SNP="${PREFIX}.bqsr"
TRACE_FILE="${BQSR_SNP}.output.tranches"

# Select SNPs from the cohort VCF
gatk --java-options "-Xmx16g" SelectVariants \
-R "$REFERENCE" \
-V "$COHORT_VCF" \
-select-type SNP \
-O "$RAW_SNP"

# Recalibrate variant quality scores for SNPs
gatk --java-options "-Xmx30g" VariantRecalibrator \
-R "$REFERENCE" \
-V "$RAW_SNP" \
-resource:trainingSnpDb,known=false,training=true,truth=true,prior=8.0 "$SNPS_DB" \
-an DP -an QD -an FS -an SOR -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff \
-mode SNP \
-O "$BQSR_SNP" \
--tranches-file "$TRACE_FILE" \
--rscript-file "${BQSR_SNP}.output.plots.R"

# Apply the recalibration to the SNP set
gatk --java-options "-Xmx30g" ApplyVQSR \
-R "$REFERENCE" \
-V "$RAW_SNP" \
-ts-filter-level 99.9 \
-recal-file "$BQSR_SNP" \
--tranches-file "$TRACE_FILE" \
-O "${PREFIX}.vqsr.SNPs.vcf.gz"

# Filter the final set of SNPs
gatk --java-options "-Xmx8g" VariantFiltration \
-V "${PREFIX}.vqsr.SNPs.vcf.gz" \
-O "${PREFIX}.vqsr.ase.genotype.SNPs.vcf.gz" \
--cluster-window-size 35 \
--cluster-size 3 \
--filter-name "FS" --filter-expression "FS > 30.0" \
--filter-name "QD" --filter-expression "QD < 2.0"
45 changes: 45 additions & 0 deletions gatk/src/snpIndelDiscovery.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
set -eo pipefail

REFERENCE=$1
COHORT_VCF=$2

# Ensure tools are available
export PATH=$PATH:$GATK_PATH

if [ "$#" -ne 2 ]; then
echo "Usage: $(basename "$0") <reference.fa> <cohort.genotypes.vcf.gz>"
exit 1
fi

PREFIX=$(basename "$COHORT_VCF" | sed 's/\.vcf\.gz$//;s/\.vcf$//')

# Example for SNP part, similar logic can be applied for INDELs
RAW_SNP="${PREFIX}.raw.SNPs.vcf.gz"
FILTERED_SNP="${PREFIX}.filtered.SNPs.vcf.gz"

# Using named pipe for intermediate steps
mkfifo raw.snps.tmp default.snps.tmp frequency.snps.tmp

gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V "$COHORT_VCF" -select-type SNP -O raw.snps.tmp &
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V raw.snps.tmp --filter-expression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "default_snp_filter" -O default.snps.tmp &
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V default.snps.tmp --filter-expression "AC < 4" --filter-name "frequency_filter" -O frequency.snps.tmp &
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V frequency.snps.tmp --exclude-filtered -O "$FILTERED_SNP"

# INDEL processing variables
RAW_INDEL="${PREFIX}.raw.INDELs.vcf.gz"

FILTERED_INDEL="${PREFIX}.filtered.INDELs.vcf.gz"

# Create named pipes for intermediate INDEL steps
mkfifo raw.indels.tmp default.indels.tmp frequency.indels.tmp

# Process INDEL variants
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V "$COHORT_VCF" -select-type INDEL -O raw.indels.tmp &
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V raw.indels.tmp --filter-expression "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" --filter-name "default_indel_filter" -O default.indels.tmp &
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V default.indels.tmp --filter-expression "AC < 4" --filter-name "frequency_filter" -O frequency.indels.tmp &
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V frequency.indels.tmp --exclude-filtered -O "$FILTERED_INDEL"

# Cleanup named pipes
rm raw.snps.tmp default.snps.tmp frequency.snps.tmp
rm raw.indels.tmp default.indels.tmp frequency.indels.tmp

0 comments on commit acc009d

Please sign in to comment.