#!/usr/bin/env python
# Copyright (c) 2014, Stephen Fisher and Junhyong Kim, University of
# Pennsylvania. All Rights Reserved.
#
# You may not use this file except in compliance with the Kim Lab License
# located at
#
# http://kim.bio.upenn.edu/software/LICENSE
#
# Unless required by applicable law or agreed to in writing, this
# software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License
# for the specific language governing permissions and limitations
# under the License.
"""
by: S. Fisher, 2014
usage: removeDuplicates.py [-h] -f FIRST_FQ [-r SECOND_FQ] -o OUTPUT_PREFIX
This will return a file that contains the specified number of randomly
sampled lines from the original file. If 'lines grouped' is greater
than 1, then each time a line is selected, the specified number of
lines (grouping size) will also be include. For example a line
grouping of 4 means 4 lines will be included every time a line is
selected, as in the case of a FASTQ file.
"""
#------------------------------------------------------------------------------------------
# INITIALIZATIONS
#------------------------------------------------------------------------------------------
import sys, os, argparse
DEBUG = False
if DEBUG: print 'DEBUG MODE: ON'
VERSION = '0.1'
# dictionary keys for the read set
HEADER = 'header'
SEQUENCE = 'seq'
QUALS = 'quals'
argParser = argparse.ArgumentParser(version=VERSION,
                                    description='Remove duplicate reads.',
                                    formatter_class=argparse.RawDescriptionHelpFormatter,
                                    epilog='' +
                                    'Remove duplicate reads that exactly match.\n')
argParser.add_argument( '-f', dest='first_fq', action='store', required=True,
                        help='fastq file with reads to be processed' )
argParser.add_argument( '-r', dest='second_fq',
                        help='fastq file with mate pairs for paired-end reads. This file is not present for single-end reads.' )
argParser.add_argument( '-o', dest='output_prefix', action='store', required=True,
                        help='prefix for output file(s). A \'_1\' will be appended to the first reads output file and, if the reads are paired, a \'_2\' will be appended to the second reads file. Output files similarly named and with the suffix \'.dupl.fq\' will be created to store one copy of every duplicate read. The number of duplicates for a read is stored in the read header as "::D:XX" where XX is the duplicate count.' )
clArgs = argParser.parse_args()
if DEBUG: print clArgs
# flag if paired-end reads
PAIRED = False
if clArgs.second_fq:
    PAIRED = True
# print error and quit
def quitOnError(msg):
    sys.stderr.write('ERROR: ' + msg + '\n')
    sys.exit(1)
# track stats
nInitialReadPairs = 0 # number of input read pairs
nUniqueReadPairs = 0 # number of output read pairs (ie after duplicates removed)
nDuplicatesFound = 0 # number of duplicate read pairs discarded
nReadsWithDuplicates = 0 # number of read pairs that contained at least 1 duplicate
maxDuplicateCount = 0 # max number of duplicates of a single read pair
uniqReads = {} # dictionary containing all uniq reads
duplReads = {} # dictionary containing a single copy of all duplicate reads
cntDupl = {} # dictionary containing a duplicates count for all reads
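# NOTE: all unique reads (and one copy of each duplicated read) are kept in
# the dictionaries above until the input is fully processed, so memory use
# grows with the number of unique reads in the fastq file(s).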
#------------------------------------------------------------------------------------------
# FUNCTIONS FOR HANDLING READING AND WRITING OF READS
#------------------------------------------------------------------------------------------
def nextRead(inFile):
    """
    Returns a dict containing the header, sequence, and quality lines of
    the next read. Returns an empty dict if there are no more reads.
    """
    line = inFile.readline().rstrip()
    if len(line) == 0:
        return {}

    read = {}
    try:
        # load read values from fastq file
        read[HEADER] = line
        # make sure sequence is upper case for comparisons
        read[SEQUENCE] = inFile.readline().rstrip().upper()
        # ignore the '+' line in the fastq file
        inFile.readline()
        read[QUALS] = inFile.readline().rstrip()
        return read
    except:
        quitOnError('Problem loading read: ' + line)
def writeRead(read, count, outFile):
    """
    Writes a read to the read file. Read is expected to be a dict as
    returned by nextRead(). Any spaces in the header are replaced with
    a '-' because STAR truncates header strings at the first space.
    """
    header = read[HEADER].replace(' ', '-') + "::D:" + str(count)
    outFile.write(header + '\n')          # header
    outFile.write(read[SEQUENCE] + '\n')  # sequence
    outFile.write('+\n')                  # +
    outFile.write(read[QUALS] + '\n')     # quals
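# For example (illustrative header, not taken from real data), a read with
# header '@HWI-ST570:97:5:1101 1:N:0:' and 3 discarded duplicates would be
# written with the header '@HWI-ST570:97:5:1101-1:N:0:::D:3'.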
#------------------------------------------------------------------------------------------
# OPEN READ INPUT AND OUTPUT FILES
#------------------------------------------------------------------------------------------
# open input fastq file
firstReadIn = ''
try:
    firstReadIn = open(clArgs.first_fq, 'r')
except:
    msg = 'Unable to load reads file ' + clArgs.first_fq
    quitOnError(msg)
# open output fastq file
firstReadOut = ''
try:
    firstReadOut = open(clArgs.output_prefix + "_1.fq", 'w')
except:
    msg = 'Unable to open output file ' + clArgs.output_prefix + "_1.fq"
    quitOnError(msg)
# open file that will store one copy of all duplicate reads
firstReadDuplicateOut = ''
try:
    firstReadDuplicateOut = open(clArgs.output_prefix + "_1.dupl.fq", 'w')
except:
    msg = 'Unable to open output file ' + clArgs.output_prefix + "_1.dupl.fq"
    quitOnError(msg)
if PAIRED:
    secondReadIn = ''
    try:
        secondReadIn = open(clArgs.second_fq, 'r')
    except:
        msg = 'Unable to load reads file ' + clArgs.second_fq
        quitOnError(msg)

    secondReadOut = ''
    try:
        secondReadOut = open(clArgs.output_prefix + "_2.fq", 'w')
    except:
        msg = 'Unable to open output file ' + clArgs.output_prefix + "_2.fq"
        quitOnError(msg)

    secondReadDuplicateOut = ''
    try:
        secondReadDuplicateOut = open(clArgs.output_prefix + "_2.dupl.fq", 'w')
    except:
        msg = 'Unable to open output file ' + clArgs.output_prefix + "_2.dupl.fq"
        quitOnError(msg)
# generate tsv file with number of duplicates per read.
if DEBUG:
    readCountsOut = ''
    try:
        readCountsOut = open(clArgs.output_prefix + ".debug.tsv", 'w')
    except:
        msg = 'Unable to open output file ' + clArgs.output_prefix + '.debug.tsv'
        quitOnError(msg)
#------------------------------------------------------------------------------------------
# PROCESS READS. LOAD ONE READ FROM BOTH FQ FILES AT SAME
# TIME. PROCESS READ PAIR TOGETHER.
# ------------------------------------------------------------------------------------------
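# Duplicate detection works on the read sequence alone: for paired-end data
# the two mate sequences are concatenated into a single dictionary key, so a
# pair is flagged as a duplicate only when both mates exactly match a pair
# seen earlier. Headers and quality scores are not considered.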
while 1:
    firstRead = nextRead(firstReadIn)
    if firstRead == {}: break
    readSeq = firstRead[SEQUENCE]

    if PAIRED:
        secondRead = nextRead(secondReadIn)
        readSeq += secondRead[SEQUENCE]
    else:
        # create empty dict so we don't have to keep testing PAIRED below
        secondRead = {}

    nInitialReadPairs += 1

    if readSeq in uniqReads:
        # found a duplicate
        if readSeq not in duplReads:
            # new duplicate, so add to duplReads and cntDupl
            duplReads[readSeq] = [firstRead, secondRead]
            cntDupl[readSeq] = 1
        else:
            # already found a duplicate so just increment cntDupl
            cntDupl[readSeq] += 1
    else:
        uniqReads[readSeq] = [firstRead, secondRead]
# we've completed the hash tables at this point so now we need to write the data
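# In the output, the '::D:XX' header tag written by writeRead() records how
# many extra copies of that read (pair) were discarded: '::D:0' means the
# read was unique, '::D:3' means three duplicate copies were removed.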
if PAIRED:
    for key in uniqReads:
        if key in cntDupl:
            writeRead(uniqReads[key][0], cntDupl[key], firstReadOut)
            writeRead(uniqReads[key][1], cntDupl[key], secondReadOut)
        else:
            writeRead(uniqReads[key][0], 0, firstReadOut)
            writeRead(uniqReads[key][1], 0, secondReadOut)
    for key in duplReads:
        writeRead(duplReads[key][0], cntDupl[key], firstReadDuplicateOut)
        writeRead(duplReads[key][1], cntDupl[key], secondReadDuplicateOut)
else:
    for key in uniqReads:
        if key in cntDupl:
            writeRead(uniqReads[key][0], cntDupl[key], firstReadOut)
        else:
            writeRead(uniqReads[key][0], 0, firstReadOut)
    for key in duplReads:
        writeRead(duplReads[key][0], cntDupl[key], firstReadDuplicateOut)
firstReadIn.close()
firstReadOut.close()
firstReadDuplicateOut.close()
if PAIRED:
    secondReadIn.close()
    secondReadOut.close()
    secondReadDuplicateOut.close()
if DEBUG:
    readCountsOut.write('numDupl\theader\tmate1\tmate2\n')
    for key in cntDupl:
        # second mate is an empty dict for single-end reads
        mate2Seq = duplReads[key][1].get(SEQUENCE, '')
        readCountsOut.write(str(cntDupl[key]) + '\t' + duplReads[key][0][HEADER] + '\t' + duplReads[key][0][SEQUENCE] + '\t' + mate2Seq + '\n')
    readCountsOut.close()
# compute stats
nUniqueReadPairs = len(uniqReads)
nReadsWithDuplicates = len(duplReads)
nDuplicatesFound = sum(cntDupl.values())
if cntDupl != {}:
    maxDuplicateCount = max(cntDupl.values())
#------------------------------------------------------------------------------------------
# OUTPUT STATS
#------------------------------------------------------------------------------------------
print 'Reads processed:', nInitialReadPairs
print 'Unique reads:', nUniqueReadPairs
print 'Duplicate reads discarded:', nDuplicatesFound
print 'Number of reads with at least 1 duplicate:', nReadsWithDuplicates
print 'Max duplicates of a single read:', maxDuplicateCount
# tab delimited output to facilitate adding stats to compilation file
fields = '\nnInitialReadPairs\tnUniqueReadPairs\tnDuplicatesFound\tnReadsWithDuplicates\tmaxDuplicateCount'
counts = '%d\t%d\t%d\t%d\t%d' % (nInitialReadPairs, nUniqueReadPairs, nDuplicatesFound, nReadsWithDuplicates, maxDuplicateCount)
print fields
print counts