Marine Assembly: From Pacbio, Nanopore, 10x Genomics, to HiC Lastz workflow: axt to chain, to netted chains
sticklebackCipher/gatk/Dockerfile
Lines 1 to 18 in db6adda
FROM ubuntu:20.04 | |
ENV DEBIAN_FRONTEND=noninteractive | |
# Install dependencies | |
RUN apt-get update && apt-get install -y \ | |
coreutils \ | |
samtools \ | |
bwa \ | |
openjdk-17-jdk \ | |
openjdk-17-jre \ | |
fastqc \ | |
python3 \ | |
python3-pip \ | |
&& pip3 install --upgrade pip \ | |
&& pip3 install cutadapt \ | |
&& apt-get clean \ | |
&& rm -rf /var/lib/apt/lists/* |
sticklebackCipher/gatk/src/alignReads.bash
Lines 43 to 66 in 8906a84
trim_galore --output_dir "$OUTPUT_DIR" --basename "$PREFIX" --trim-n --max_n 0 --cores $(( THREADS > 4 ? 4 : THREADS )) --paired "$READ_ONE" "$READ_TWO" | |
# Define paths for Trim Galore output | |
TRIMMED_FWD="${OUTPUT_DIR}/${PREFIX}_val_1.fq.gz" | |
TRIMMED_REV="${OUTPUT_DIR}/${PREFIX}_val_2.fq.gz" | |
# Adapter trimming and quality filtering with BBDuk | |
echo "["$(date)"]\tRemoving adapters with BBDuk..." | |
./bbduk.sh \ | |
in1="$TRIMMED_FWD" in2="$TRIMMED_REV" \ | |
out1="${OUTPUT_DIR}/${PREFIX}_R1.fastq.gz" out2="${OUTPUT_DIR}/${PREFIX}_R2.fastq.gz" \ | |
minlen=25 qtrim=rl trimq=10 ktrim=r k=25 mink=11 hdist=1 \ | |
ref=$ADAPTERS | |
# MAPPING with bwa and sorting with samtools | |
echo -e "["$(date)"]\tAligning and sorting..." | |
BAM="${OUTPUT_DIR}/${PREFIX}.bam" | |
SORT_THREADS=$(( $THREADS / 2 )) # Adjust based on your system's capabilities | |
bwa mem -t "$THREADS" "$REFERENCE" "${OUTPUT_DIR}/${PREFIX}_R1.fastq.gz" "${OUTPUT_DIR}/${PREFIX}_R2.fastq.gz" | \ | |
samtools sort -@ "$SORT_THREADS" -T "${OUTPUT_DIR}/${PREFIX}.tmp" -o "$BAM" | |
# Index the final BAM file | |
samtools index "$BAM" |
sticklebackCipher/gatk/src/gatkfmtBam.bash
Lines 37 to 49 in 8906a84
gatk --java-options "-Xmx$MEM" MarkDuplicates -I "$BAM" -O /dev/stdout -M /dev/null -VALIDATION_STRINGENCY SILENT -CREATE_INDEX true | \ | |
samtools view -b -@ "$THREADS" -f 1 - | \ | |
gatk --java-options "-Xmx$MEM" AddOrReplaceReadGroups -I /dev/stdin -O /dev/stdout -LB "WGS" -PL "$PL" -PU "ILUMINA" -SM "$PREFIX" | \ | |
gatk --java-options "-Xmx$MEM" FixMateInformation -I /dev/stdin -O "$PROCESSED_BAM" -ADD_MATE_CIGAR true -IGNORE_MISSING_MATES true -TMP_DIR "${PREFIX}.fixmate.tmp" | |
# Index the final output BAM | |
samtools index "$PROCESSED_BAM" | |
# Variant calling | |
gatk --java-options "-Xmx$MEM" HaplotypeCaller -I "$PROCESSED_BAM" -O "${PREFIX}.g.vcf.gz" -R "$REFERENCE" -ERC GVCF --native-pair-hmm-threads "$THREADS" | |
# Assuming $gVCF should be ${PREFIX}.g.vcf.gz based on previous command | |
gatk --java-options "-Xmx$(echo $MEM | sed 's/G/g/')" GenotypeGVCFs -R "$REFERENCE" -V "${PREFIX}.g.vcf.gz" -O "${PREFIX}.vcf.gz" |
sticklebackCipher/gatk/src/mergeGvcf.bash
Lines 27 to 41 in 8906a84
SAMPLES="${DIR}/${PREFIX}.list" | |
: > "$SAMPLES" | |
for i in "$DIR"/*.g.vcf.gz; do | |
echo "$i" >> "$SAMPLES" | |
done | |
MERGED="${DIR}/${PREFIX}.merged.g.vcf.gz" | |
VCF="${DIR}/${PREFIX}.cohort.genotyped.vcf.gz" | |
# Combine GVCFs | |
gatk --java-options "-Xmx32g" CombineGVCFs -R "$REFERENCE" --variant "$SAMPLES" -O "$MERGED" | |
# Genotype GVCFs | |
gatk --java-options "-Xmx32g" GenotypeGVCFs -R "$REFERENCE" -V "$MERGED" -O "$VCF" |
sticklebackCipher/gatk/src/snpIndelDiscovery.bash
Lines 18 to 27 in acc009d
RAW_SNP="${PREFIX}.raw.SNPs.vcf.gz" | |
FILTERED_SNP="${PREFIX}.filtered.SNPs.vcf.gz" | |
# Using named pipe for intermediate steps | |
mkfifo raw.snps.tmp default.snps.tmp frequency.snps.tmp | |
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V "$COHORT_VCF" -select-type SNP -O raw.snps.tmp & | |
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V raw.snps.tmp --filter-expression "QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" --filter-name "default_snp_filter" -O default.snps.tmp & | |
gatk --java-options "-Xmx4g" VariantFiltration -R "$REFERENCE" -V default.snps.tmp --filter-expression "AC < 4" --filter-name "frequency_filter" -O frequency.snps.tmp & | |
gatk --java-options "-Xmx4g" SelectVariants -R "$REFERENCE" -V frequency.snps.tmp --exclude-filtered -O "$FILTERED_SNP" |
sticklebackCipher/gatk/src/baseRecalibratorBQSR.bash
Lines 29 to 50 in acc009d
# Step 1: Base quality score recalibration | |
gatk BaseRecalibrator -R "$REFERENCE" \ | |
--known-sites "$SNP_VCF" \ | |
--known-sites "$INDEL_VCF" \ | |
-I "$BAM" \ | |
-O "$RECALIBRATED_TABLE" | |
# Step 2: Apply BQSR | |
gatk ApplyBQSR -R "$REFERENCE" \ | |
-I "$BAM" \ | |
--bqsr-recal-file "$RECALIBRATED_TABLE" \ | |
-O "$BAM_RECALIBRATED" | |
# Step 3: Index recalibrated BAM | |
samtools index "$BAM_RECALIBRATED" | |
# Step 4: Variant calling with HaplotypeCaller | |
gatk HaplotypeCaller \ | |
--input "$BAM_RECALIBRATED" \ | |
--output "$GENOTYPE_VCF" \ | |
--reference "$REFERENCE" \ | |
-ERC GVCF |
sticklebackCipher/gatk/src/mergeGvcf.bash
Lines 27 to 41 in acc009d
SAMPLES="${DIR}/${PREFIX}.list" | |
: > "$SAMPLES" | |
for i in "$DIR"/*.g.vcf.gz; do | |
echo "$i" >> "$SAMPLES" | |
done | |
MERGED="${DIR}/${PREFIX}.merged.g.vcf.gz" | |
VCF="${DIR}/${PREFIX}.cohort.genotyped.vcf.gz" | |
# Combine GVCFs | |
gatk --java-options "-Xmx32g" CombineGVCFs -R "$REFERENCE" --variant "$SAMPLES" -O "$MERGED" | |
# Genotype GVCFs | |
gatk --java-options "-Xmx32g" GenotypeGVCFs -R "$REFERENCE" -V "$MERGED" -O "$VCF" |
Step 7: The variant recalibration step fits a Gaussian mixture model to the contextual annotations given to each variant
sticklebackCipher/gatk/src/gatkGenotypeVariants.bash
Lines 35 to 62 in acc009d
# Recalibrate variant quality scores for SNPs | |
gatk --java-options "-Xmx30g" VariantRecalibrator \ | |
-R "$REFERENCE" \ | |
-V "$RAW_SNP" \ | |
-resource:trainingSnpDb,known=false,training=true,truth=true,prior=8.0 "$SNPS_DB" \ | |
-an DP -an QD -an FS -an SOR -an MQ -an MQRankSum -an ReadPosRankSum -an InbreedingCoeff \ | |
-mode SNP \ | |
-O "$BQSR_SNP" \ | |
--tranches-file "$TRACE_FILE" \ | |
--rscript-file "${BQSR_SNP}.output.plots.R" | |
# Apply the recalibration to the SNP set | |
gatk --java-options "-Xmx30g" ApplyVQSR \ | |
-R "$REFERENCE" \ | |
-V "$RAW_SNP" \ | |
-ts-filter-level 99.9 \ | |
-recal-file "$BQSR_SNP" \ | |
--tranches-file "$TRACE_FILE" \ | |
-O "${PREFIX}.vqsr.SNPs.vcf.gz" | |
# Filter the final set of SNPs | |
gatk --java-options "-Xmx8g" VariantFiltration \ | |
-V "${PREFIX}.vqsr.SNPs.vcf.gz" \ | |
-O "${PREFIX}.vqsr.ase.genotype.SNPs.vcf.gz" \ | |
--cluster-window-size 35 \ | |
--cluster-size 3 \ | |
--filter-name "FS" --filter-expression "FS > 30.0" \ | |
--filter-name "QD" --filter-expression "QD < 2.0" |