-
Notifications
You must be signed in to change notification settings - Fork 1
/
09-qual_score_recal02.py
executable file
·153 lines (132 loc) · 6.01 KB
/
09-qual_score_recal02.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#! /usr/bin/env python
# PBS cluster job submission in Python
# Uses GATK-3.3.0 BaseRecalibrator to recalibrate quality scores
# By Jean P. Elbers
# Last modified 22 Jan 2015
###############################################################################
Usage = """
09-qual_score_recal02.py - version 1.0
Command:
cd InDir = /work/jelber2/immunome_2014/combined/call-SNPs-recal01
1.Analyze patterns of covariation in the sequence dataset
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T BaseRecalibrator \
-R RefDir/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-I Sample-recal01.bam \
-L /work/jelber2/reference/immunome_baits_C_picta-3.0.3.interval.list \
-knownSites ALL-samples-recal01-Q30-SNPs.vcf \
-o ../call-SNPs-recal02/Sample-recal-data.table
2.Do a second pass to analyze covariation remaining after recalibration
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T BaseRecalibrator \
-R RefDir/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-I Sample-recal01.bam \
-L /work/jelber2/reference/immunome_baits_C_picta-3.0.3.interval.list \
-knownSites ../call-SNPs-recal01/ALL-samples-recal01-Q30-SNPs.vcf \
-BQSR ../call-SNPs-recal02/Sample-recal-data.table \
-o ../call-SNPs-recal02/Sample-post-recal-data.table
3.Generate before/after plots
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T AnalyzeCovariates \
-R RefDir/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-before ../call-SNPs-recal02/Sample-recal-data.table \
-after ../call-SNPs-recal02/Sample-post-recal-data.table \
-plots ../call-SNPs-recal02/Sample-recalibration_plots.pdf
4.Apply the recalibration to your sequence data
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T PrintReads \
-R RefDir/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-I Sample-recal01.bam \
-L /work/jelber2/reference/immunome_baits_C_picta-3.0.3.interval.list \
-BQSR ../call-SNPs-recal02/Sample-recal-data.table \
-o ../call-SNPs-recal02/Sample-recal02.bam
File Info
InDir = /work/jelber2/immunome_2014/combined/call-SNPs-recal01
Input Files =
Sample-recal01.bam
../call-SNPs-recal01/ALL-samples-recal01-Q30-SNPs.vcf
OutDir = /work/jelber2/immunome_2014/combined/call-SNPs-recal02
Output Files = Sample-recal02.bam
Usage (execute following code in InDir):
find . -name '*-recal01.bam' -not -name 'ALL-samples-*' -exec ~/scripts/immunome_2014/09-qual_score_recal02.py {} \;
"""
###############################################################################
import os, sys, subprocess #imports os, sys, subprocess modules
if len(sys.argv)<2:
print Usage
else:
FileList = sys.argv[1:]
RefDir = "/work/jelber2/reference"
InDir = "/work/jelber2/immunome_2014/combined/call-SNPs-recal01"
OutDir1 = "call-SNPs-recal02"
os.chdir(InDir)
os.chdir("..") # go up one directory
if not os.path.exists(OutDir1):
os.mkdir(OutDir1) # if OutDir1 does not exist, then make it
os.chdir(InDir)
for InFileName in FileList: # do the following steps for each file in the inputstream
FileSuffix = "-recal01.bam"
FilePrefix = "./"
Samplepre = InFileName.replace(FileSuffix,'') # creates Samplepre string
Sample = Samplepre.replace(FilePrefix,'') # creates Sample string
# Customize your options here
Queue = "single"
Allocation = "hpc_gopo02"
Processors = "nodes=1:ppn=4"
WallTime = "01:00:00"
LogOut = "/work/jelber2/immunome_2014/combined/call-SNPs-recal02"
LogMerge = "oe"
JobName = "qual_score_recal02-%s" % (Sample)
Command ="""
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T BaseRecalibrator \
-R %s/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-I %s-recal01.bam \
-L /work/jelber2/reference/immunome_baits_C_picta-3.0.3.interval.list \
-knownSites ALL-samples-recal01-Q30-SNPs.vcf \
-o ../call-SNPs-recal02/%s-recal-data.table
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T BaseRecalibrator \
-R %s/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-I %s-recal01.bam \
-L /work/jelber2/reference/immunome_baits_C_picta-3.0.3.interval.list \
-knownSites ../call-SNPs-recal01/ALL-samples-recal01-Q30-SNPs.vcf \
-BQSR ../call-SNPs-recal02/%s-recal-data.table \
-o ../call-SNPs-recal02/%s-post-recal-data.table
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T AnalyzeCovariates \
-R %s/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-before ../call-SNPs-recal02/%s-recal-data.table \
-after ../call-SNPs-recal02/%s-post-recal-data.table \
-plots ../call-SNPs-recal02/%s-recalibration_plots.pdf
java -Xmx8g -jar ~/bin/GATK-3.3.0/GenomeAnalysisTK.jar \
-T PrintReads \
-R %s/GCF_000241765.3_Chrysemys_picta_bellii-3.0.3_genomic.fna \
-I %s-recal01.bam \
-L /work/jelber2/reference/immunome_baits_C_picta-3.0.3.interval.list \
-BQSR ../call-SNPs-recal02/%s-recal-data.table \
-o ../call-SNPs-recal02/%s-recal02.bam""" % \
(RefDir, Sample, Sample,
RefDir, Sample, Sample, Sample,
RefDir, Sample, Sample, Sample,
RefDir, Sample, Sample, Sample)
JobString = """
#!/bin/bash
#PBS -q %s
#PBS -A %s
#PBS -l %s
#PBS -l walltime=%s
#PBS -o %s
#PBS -j %s
#PBS -N %s
cd %s
%s\n""" % (Queue, Allocation, Processors, WallTime, LogOut, LogMerge, JobName, InDir, Command)
#Create pipe to qsub
proc = subprocess.Popen(['qsub'], shell=True,
stdin=subprocess.PIPE, stdout=subprocess.PIPE, close_fds=True)
(child_stdout, child_stdin) = (proc.stdout, proc.stdin)
#Print JobString
JobName = proc.communicate(JobString)[0]
print JobString
print JobName