-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
157 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/bin/bash | ||
set -eo pipefail | ||
|
||
# Ensure tools are available | ||
export PATH=$PATH:$GATK_PATH | ||
|
||
# Assign arguments to variables | ||
BAM=$1 | ||
REFERENCE=$2 | ||
SNP_VCF=$3 | ||
INDEL_VCF=$4 | ||
|
||
# Check if the correct number of arguments is provided | ||
if [ "$#" -ne 4 ]; then | ||
echo "Usage: $(basename "$0") <reference.fa> <sample.bam> <snp.vcf> <indel.vcf>" | ||
exit 1 | ||
fi | ||
|
||
# Directories and file names | ||
DIR="bqsr-genotype-vcfs" | ||
RECALIBRATED_BAMS_DIR="recalibrated-bams" | ||
mkdir -p "$DIR" "$RECALIBRATED_BAMS_DIR" | ||
|
||
PREFIX=$(basename "$BAM" .bam) | ||
BAM_RECALIBRATED="$RECALIBRATED_BAMS_DIR/${PREFIX}.recal.bam" | ||
RECALIBRATED_TABLE="$RECALIBRATED_BAMS_DIR/${PREFIX}.recal_data.table" | ||
GENOTYPE_VCF="$DIR/${PREFIX}.g.vcf.gz" | ||
|
||
# Step 1: Base quality score recalibration | ||
gatk BaseRecalibrator -R "$REFERENCE" \ | ||
--known-sites "$SNP_VCF" \ | ||
--known-sites "$INDEL_VCF" \ | ||
-I "$BAM" \ | ||
-O "$RECALIBRATED_TABLE" | ||
|
||
# Step 2: Apply BQSR | ||
gatk ApplyBQSR -R "$REFERENCE" \ | ||
-I "$BAM" \ | ||
--bqsr-recal-file "$RECALIBRATED_TABLE" \ | ||
-O "$BAM_RECALIBRATED" | ||
|
||
# Step 3: Index recalibrated BAM | ||
samtools index "$BAM_RECALIBRATED" | ||
|
||
# Step 4: Variant calling with HaplotypeCaller | ||
gatk HaplotypeCaller \ | ||
--input "$BAM_RECALIBRATED" \ | ||
--output "$GENOTYPE_VCF" \ | ||
--reference "$REFERENCE" \ | ||
-ERC GVCF |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/bash | ||
set -eo pipefail | ||
|
||
# Check the number of arguments | ||
if [ "$#" -ne 2 ]; then | ||
echo " | ||
Usage: | ||
$(basename "$0") <reference.fa> <raw.genotypes.vcf.gz> | ||
" | ||
exit 1 | ||
fi | ||
|
||
# Assign arguments to variables | ||
REFERENCE=$1 | ||
COHORT_VCF=$2 | ||
|
||
# Ensure tools are available | ||
export PATH=$PATH:$GATK_PATH | ||
|
||
# Define file paths and prefixes | ||
PREFIX=$(basename "$COHORT_VCF" | sed 's/\.vcf\.gz$//;s/\.vcf$//') | ||
SNPS_DB=/path/to/trainingSnpDb.vcf.gz # Update this path to your SNP database | ||
RAW_SNP="${PREFIX}.raw.SNPs.vcf.gz" | ||
|
||
BQSR_SNP="${PREFIX}.bqsr" | ||
TRACE_FILE="${BQSR_SNP}.output.tranches" | ||
|
||
# Select SNPs from the cohort VCF | ||
gatk --java-options "-Xmx16g" SelectVariants \ | ||
-R "$REFERENCE" \ | ||
-V "$COHORT_VCF" \ | ||
-select-type SNP \ | ||
-O "$RAW_SNP" | ||
|
||
# Recalibrate variant quality scores for SNPs | ||
gatk --java-options "-Xmx30g" VariantRecalibrator \ | ||
-R "$REFERENCE" \ | ||
-V "$RAW_SNP" \ | ||
-resource:trainingSnpDb,known=false,training=true,truth=true,prior=8.0 "$SNPS_DB" \ | ||
-an DP -an QD -an FS -an SOR -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff \ | ||
-mode SNP \ | ||
-O "$BQSR_SNP" \ | ||
--tranches-file "$TRACE_FILE" \ | ||
--rscript-file "${BQSR_SNP}.output.plots.R" | ||
|
||
# Apply the recalibration to the SNP set | ||
gatk --java-options "-Xmx30g" ApplyVQSR \ | ||
-R "$REFERENCE" \ | ||
-V "$RAW_SNP" \ | ||
-ts-filter-level 99.9 \ | ||
-recal-file "$BQSR_SNP" \ | ||
--tranches-file "$TRACE_FILE" \ | ||
-O "${PREFIX}.vqsr.SNPs.vcf.gz" | ||
|
||
# Filter the final set of SNPs | ||
gatk --java-options "-Xmx8g" VariantFiltration \ | ||
-V "${PREFIX}.vqsr.SNPs.vcf.gz" \ | ||
-O "${PREFIX}.vqsr.ase.genotype.SNPs.vcf.gz" \ | ||
--cluster-window-size 35 \ | ||
--cluster-size 3 \ | ||
--filter-name "FS" --filter-expression "FS > 30.0" \ | ||
--filter-name "QD" --filter-expression "QD < 2.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/bin/bash | ||
set -eo pipefail | ||
|
||
REFERENCE=$1 | ||
COHORT_VCF=$2 | ||
|
||
# Ensure tools are available | ||
export PATH=$PATH:$GATK_PATH | ||
|
||
if [ "$#" -ne 2 ]; then | ||
echo "Usage: $(basename "$0") <reference.fa> <cohort.genotypes.vcf.gz>" | ||
exit 1 | ||
fi | ||
|
||
PREFIX=$(basename "$COHORT_VCF" | sed 's/\.vcf\.gz$//;s/\.vcf$//') | ||
|
||
# Example for SNP part, similar logic can be applied for INDELs | ||
RAW_SNP="${PREFIX}.raw.SNPs.vcf.gz" | ||
FILTERED_SNP="${PREFIX}.filtered.SNPs.vcf.gz" | ||
|
||
# Using named pipe for intermediate steps | ||
mkfifo raw.snps.tmp default.snps.tmp frequency.snps.tmp | ||
|
||
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V "$COHORT_VCF" -select-type SNP -O raw.snps.tmp & | ||
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V raw.snps.tmp --filter-expression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "default_snp_filter" -O default.snps.tmp & | ||
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V default.snps.tmp --filter-expression "AC < 4" --filter-name "frequency_filter" -O frequency.snps.tmp & | ||
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V frequency.snps.tmp --exclude-filtered -O "$FILTERED_SNP" | ||
|
||
# INDEL processing variables | ||
RAW_INDEL="${PREFIX}.raw.INDELs.vcf.gz" | ||
|
||
FILTERED_INDEL="${PREFIX}.filtered.INDELs.vcf.gz" | ||
|
||
# Create named pipes for intermediate INDEL steps | ||
mkfifo raw.indels.tmp default.indels.tmp frequency.indels.tmp | ||
|
||
# Process INDEL variants | ||
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V "$COHORT_VCF" -select-type INDEL -O raw.indels.tmp & | ||
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V raw.indels.tmp --filter-expression "QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0" --filter-name "default_indel_filter" -O default.indels.tmp & | ||
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V default.indels.tmp --filter-expression "AC < 4" --filter-name "frequency_filter" -O frequency.indels.tmp & | ||
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V frequency.indels.tmp --exclude-filtered -O "$FILTERED_INDEL" | ||
|
||
# Cleanup named pipes | ||
rm raw.snps.tmp default.snps.tmp frequency.snps.tmp | ||
rm raw.indels.tmp default.indels.tmp frequency.indels.tmp |