-
Notifications
You must be signed in to change notification settings - Fork 11
/
parseBlast.py
executable file
·384 lines (312 loc) · 13.3 KB
/
parseBlast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#!/usr/bin/env python
# Copyright (c) 2011-2014, Stephen Fisher and Junhyong Kim, University of
# Pennsylvania. All Rights Reserved.
#
# You may not use this file except in compliance with the Kim Lab License
# located at
#
# http://kim.bio.upenn.edu/software/LICENSE
#
# Unless required by applicable law or agreed to in writing, this
# software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License
# for the specific language governing permissions and limitations
# under the License.
"""
by: S. Fisher, 2011,2014
Usage: python parseBlast.py targetSpecies readsFastaFile blastFile
"""
#------------------------------------------------------------------------------------------
# Initializations
#------------------------------------------------------------------------------------------
import sys, os
DEBUG = False
if DEBUG: print 'DEBUG MODE: ON'
VERSION = '1.1'
# expect 3 args
if len(sys.argv) < 4:
print 'Usage: python parseBlast.py targetSpecies readsFastaFile blastFile'
print '\ttargetSpecies - reads mapped to this species will not be counted for other species. Target species must be one of the following species: bact, fish, fly, human, mouse, rat, yeast, ERCC, phiX'
print '\tblast.csv - list of each hit per read and count of total number of hits per read'
print '\treads.counted.txt - all alignments for reads that mapped to at least one of the target species'
print '\treads.notCounted.txt - all alignments for reads that did not map to any target species'
print '\tfailed.fa - fasta file with reads that did not align'
print '\tfailed.tsv - blast results for read that did not map to any of the counted species'
print '\tspeciesCounts.txt - species counts'
print '\ttargetSpecies.tsv - blast results that mapped to target species'
print '\ttargetSpecies.fa - fasta file with all target species reads'
sys.exit()
TARGET = sys.argv[1]
RAW_FILE = sys.argv[2]
BLAST_FILE = sys.argv[3]
BLAST_PATH = os.path.dirname(RAW_FILE)
if len(BLAST_PATH) > 0:
BLAST_PATH = BLAST_PATH + '/'
rawFile = open(RAW_FILE, 'r')
blastFile = open(BLAST_FILE, 'r')
csvFile = open(BLAST_PATH+'blast.csv', 'w')
hitFile = open(BLAST_PATH+'reads.counted.txt', 'w')
missFile = open(BLAST_PATH+'reads.notCounted.txt', 'w')
failedFile = open(BLAST_PATH+'failed.fa', 'w')
uncountedFile = open(BLAST_PATH+'uncounted.tsv', 'w')
speciesFile = open(BLAST_PATH+'speciesCounts.txt', 'w')
# convert from repo species name to names used here.
TARGET = TARGET.lower()
targetSpecies = TARGET
if 'mm9' in TARGET or 'mm10' in TARGET: targetSpecies = 'mouse'
elif 'rn5' in TARGET or 'rn6' in TARGET: targetSpecies = 'rat'
elif 'drosophila' in TARGET or 'dmel5' in TARGET or 'dmel6' in TARGET: targetSpecies = 'fly'
elif 'hg19' in TARGET: targetSpecies = 'human'
elif 'hg38' in TARGET: targetSpecies = 'human'
elif 'saccer3' in TARGET: targetSpecies = 'yeast'
elif 'zebrafish' in TARGET or 'zv' in TARGET or 'grcz' in TARGET: targetSpecies = 'fish'
elif 'ercc' in TARGET: targetSpecies = 'ercc'
elif 'phix' in TARGET or 'phi-x' in TARGET: targetSpecies = 'phix'
elif 'smed' in TARGET or 'planaria' in TARGET: targetSpecies = 'smed'
elif 'cel' in TARGET or 'elegans' in TARGET: targetSpecies = 'cel'
else: raise NotImplementedError(targetSpecies + " not a currently recognized genome")
targetFile = open(BLAST_PATH+targetSpecies+'.tsv', 'w')
targetFileFa = open(BLAST_PATH+targetSpecies+'.fa', 'w')
#------------------------------------------------------------------------------------------
# Functions
#------------------------------------------------------------------------------------------
def error(line):
# this should never happen
print 'Got to this line in error: ', line, '\n'
rawFile.close()
blastFile.close()
csvFile.close()
hitFile.close()
missFile.close()
failedFile.close()
uncountedFile.close()
speciesFile.close()
targetFile.close()
targetFileFa.close()
sys.exit()
def getLine():
line = blastFile.readline()
if not line: error(line)
return line.rstrip()
def getRaw():
# do this twice because first read is header info and we want sequence
line0 = rawFile.readline()
if not line0: error(line0)
line1 = rawFile.readline()
if not line1: error(line1)
return line1.rstrip(), line0.rstrip()
def writeAlignments(outputFile):
# output alignment info to outputFile
outputFile.write('------------------------------------------------------------------------------------\n' )
outputFile.write(query)
for hit in range(0, hits):
# step through until we find an alignment
while True:
line = getLine()
# stop when we get to the alignment
if '>' in line: break
# if we don't find any alignments, then there's a problem with the blast file
if 'Effective search space used' in line: error(line)
# get alignement descriptor. It might be multiple lines and
# will always begin with a '>' and end with a line with the length
desc = ''
while not 'Length=' in line:
desc += line + ' '
line = getLine()
if len(desc) > 80: desc = desc[:77] + '...'
outputFile.write('\n ' + desc + '\n')
# skip blank line
getLine()
# print line with E value
line = getLine()
outputFile.write(' ' + line + '\n')
# print line with Identities and Gaps
line = getLine()
line = line.replace('(', '')
line = line.replace('%)', ' ')
line = line.replace('/', ' ')
vals = line.split()
# skip next 2 lines
for cnt in range(0, 2): getLine()
# an alignment has 3 lines
for cnt in range(0, 3):
line = getLine()
outputFile.write(' ' + line + '\n')
# skip blank line
line = getLine()
# test if the alignment continues
line = getLine()
if 'Query' in line:
outputFile.write('\n')
for cnt in range(0, 3):
outputFile.write(' ' + line + '\n')
line = getLine()
outputFile.write('\n')
#------------------------------------------------------------------------------------------
# Do counts
#------------------------------------------------------------------------------------------
numReads = 0
numFails = 0
numHits = 0
numNotCounted = 0
# we count each time a read is aligned to one of the following. Since
# we only want to count the species once per read, we use a flag to
# denote whether or not the read has already been counted.
counts = {}
counts['bact'] = 0
counts['fish'] = 0
counts['fly'] = 0
counts['human'] = 0
counts['mouse'] = 0
counts['rat'] = 0
counts['yeast'] = 0
counts['ercc'] = 0
counts['phix'] = 0
counts['chicken'] = 0
counts['quail'] = 0
counts['smed'] = 0
counts['cel'] = 0
while True:
line = blastFile.readline()
if not line: break
if not 'Query=' in line:
# keep going until fine a 'Query='
continue
###############################################
# starting a new read
query = line
if DEBUG: print '\n--------------------------------------------------------------------------\n', line,
rawSeq, seqHeader = getRaw()
# count the read
numReads += 1
while line is not "": line = getLine()
# skip next 5 lines
for cnt in range(0, 4): line = getLine()
# test if no alignments
if 'No hits' in line:
if DEBUG: print ' Hits: 0'
csvFile.write('0, ' + rawSeq + ', ' + seqHeader + '\n')
# generate fasta file with sequences that didn't align
failedFile.write(seqHeader + '\n' + rawSeq + '\n')
# didn't align, so count the no hit and continue to next read
numFails += 1
# this will skip to next 'Query=' statement
continue
###############################################
# process hits
# count number of hits for the read
hits = 0
# flag if we didn't count the read mapping
hitNotCounted = False
# when species is found then save the line so we can output to
# targetFile (in the case of the target species). This also
# servers to flag when a species has already been counted, so we
# only count a species once per read.
found = {}
# flag if none of the hits were our species of interest
notFound = ''
while True:
line = getLine()
# if empty line then ran out of alignments
if line == '': break
hits += 1
lLine = line.lower() # test lowercase version of line
# keep first mapping as that will be the best mapping
if ('phix' in lLine) or ('phi-x' in lLine):
if 'phix' not in found: found['phix'] = line
elif ('bacter' in lLine) or ('coli' in lLine):
if 'bact' not in found: found['bact'] = line
elif ('zebrafish' in lLine) or ('danio' in lLine) or ('rerio' in lLine):
if 'fish' not in found: found['fish'] = line
elif ('drosophila' in lLine) or ('melanogaster' in lLine):
if 'fly' not in found: found['fly'] = line
elif ('human' in lLine) or ('sapiens' in lLine):
if 'human' not in found: found['human'] = line
elif ('mouse' in lLine) or ('musculus' in lLine):
if 'mouse' not in found: found['mouse'] = line
elif ('rattus' in lLine) or ('norvegicus' in lLine):
if 'rat' not in found: found['rat'] = line
elif ('yeast' in lLine) or ('cerevisiae' in lLine) or ('carlsbergensis' in lLine):
if 'yeast' not in found: found['yeast'] = line
elif ('Gallus' in lLine) or ('chicken' in lLine):
if 'rat' not in found: found['rat'] = line
elif ('ercc' in lLine):
if 'ercc' not in found: found['ercc'] = line
else:
# count how many reads only mapped to species we are not tracking
if len(notFound) == 0: notFound = line
# print alignment
if DEBUG: print ' ', line
# finished processing read's hits so now figure out if target was mapped
if targetSpecies in found:
counts[targetSpecies] += 1
targetFile.write(seqHeader + '\t' + rawSeq + '\t' + found[targetSpecies] + '\n')
targetFileFa.write(seqHeader + '\n' + rawSeq + '\n')
else:
if found:
for species in found.keys():
counts[species] += 1
else:
numNotCounted += 1
hitNotCounted = True
uncountedFile.write(line + '\t' + rawSeq + '\t' + seqHeader + '\n')
if DEBUG: print ' Hits:', hits
csvFile.write(str(hits) + ', ' + rawSeq + ', ' + seqHeader + '\n')
# count number of hits
if hits > 0: numHits += 1
###############################################
# process multiple alignments
# XXX If this is set to '<2' then only multiple alignments will be included in hits file
# if no alignments, then start on next query
if hits == 0: continue
if hitNotCounted:
writeAlignments(missFile)
else:
writeAlignments(hitFile)
#------------------------------------------------------------------------------------------
# Output counts to species file
#------------------------------------------------------------------------------------------
speciesFile.write('parseBlast.py version: ' + VERSION + '\n')
speciesFile.write('Target Species: ' + targetSpecies + '\n')
speciesFile.write('Num reads: ' + str(numReads) + '\n')
speciesFile.write('Num reads that did not align: ' + str(numFails) + '\n')
speciesFile.write('Number hits: %d ( %.1f%% )\n' % (numHits, (100*float(numHits)/float(numReads))))
speciesFile.write('Hits not accounted for: ' + str(numNotCounted) + '\n')
# need to include ERCC in target counts when computing hits not target species.
targetCounts = counts[targetSpecies]
if 'ercc' not in targetSpecies: targetCounts += counts['ercc']
if numHits > 0:
hitsNotTargetOrERCC = 100.0 * float(numHits - targetCounts)/float(numHits)
else:
hitsNotTargetOrERCC = 0
speciesFile.write('Perc Hits not target species or ERCC: %.1f\n\n' % (hitsNotTargetOrERCC))
# print out species counts
keyList = sorted(counts.keys())
for species in keyList:
speciesFile.write(species + ' hits\t\t' + str(counts[species]) + '\n')
speciesFile.write('\nTotal Hits\tHits Not Counted\tPerc Hits Not Target or SpikeIn\tBacteria\tFish\tFly\tHuman\tMouse\tRat\tYeast\tERCC\tPhiX\n')
speciesFile.write(str(numHits) + '\t' + str(numNotCounted))
speciesFile.write('\t%.1f' % (hitsNotTargetOrERCC))
speciesFile.write('\t' + str(counts['bact']))
speciesFile.write('\t' + str(counts['fish']))
speciesFile.write('\t' + str(counts['fly']))
speciesFile.write('\t' + str(counts['human']))
speciesFile.write('\t' + str(counts['mouse']))
speciesFile.write('\t' + str(counts['rat']))
speciesFile.write('\t' + str(counts['yeast']))
speciesFile.write('\t' + str(counts['ercc']))
speciesFile.write('\t' + str(counts['phix']))
speciesFile.write('\n')
#------------------------------------------------------------------------------------------
# Clean up
#------------------------------------------------------------------------------------------
rawFile.close()
blastFile.close()
csvFile.close()
hitFile.close()
missFile.close()
failedFile.close()
uncountedFile.close()
speciesFile.close()
targetFile.close()
targetFileFa.close()