-
Notifications
You must be signed in to change notification settings - Fork 1
/
2_AssembleProbes.py
345 lines (298 loc) · 11.2 KB
/
2_AssembleProbes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
from Bio import SeqIO
from Bio.SeqUtils import GC
from Bio.SeqUtils import MeltingTemp as mt
from io import StringIO
import io
import subprocess
import os
import sys
import random
import glob
import argparse
def main(mismatch_cutoff):
#Need to specify these values.
adapts = open('adapters.txt', 'r')
adapt_dict = {}
for adapt in adapts:
adapt = adapt.strip()
adapt_name = adapt.split('\t')[0]
adapt_seq = adapt.split('\t')[1]
adapt_dict[adapt_name] = adapt_seq
initial_barcode_file = 'barcodes.txt'
# Increasing the below number increases the number of probes available, but
# at the cost of probes with stronger predicted secondary structure
# NOTE: This number is usually produced as a negative value by RNAfold, but
# is used as an absolute value in this script. So 7.6 in this code behaves
# like -7.6 in RNAfold
allowed_ss_score = 9.7
summary_file_name = 'summary_file_{}off.txt'.format(mismatch_cutoff)
summary_file = open(summary_file_name, 'w')
try:
barcode_file = open(initial_barcode_file, 'r')
except (FileNotFoundError, OSError) as e:
raise e
except Exception as e:
print("Unexpected error:")
raise e
# Create a folder named 'probes' within the current working directory, this
# will be where probes are saved.
working_dir = os.getcwd()
if os.path.exists(os.path.dirname('./probes/')):
pass
else:
os.makedirs(os.path.dirname('./probes/'))
# Assign barcodes. Use 'running_barcode_list.txt' to make sure barcodes are
# conserved between runs
unused_barcodes = []
used_barcodes = {}
for barcode in barcode_file:
barcode = str(barcode).strip()
if '\t' in barcode:
code = barcode.split('\t')
# barcodes are used in lower case for easier visualizing of final
# probes
used_barcodes[code[1]] = code[0].lower()
else:
barcode = barcode.lower()
unused_barcodes.append(barcode)
barcode_file.close()
# use sub_sam files generated by the first script
target_files = glob.glob('./part1/*.sub_sam')
for target_file in target_files:
target_name = str(target_file).replace('./part1/',
'').replace('.sub_sam','')
adapter_seq = adapt_dict[target_name]
bowtie_output = open(target_file, 'r')
print('Running probe design for {}'.format(target_name))
# Open file with all probes discovered from first script
probe_seqs = SeqIO.parse('{}_AllProbes.fasta'
''.format(target_file.replace('.sub_sam', '')),
'fasta')
# if paralogs exist, open file
paralog_list = []
print('target_name={}'.format(target_name))
try:
paralog_file = open('./paralogs/{}_paralogs.txt'
''.format(target_name), 'r')
for line in paralog_file:
line = line.strip()
paralog_list.append(line)
paralog_file.close()
except FileNotFoundError:
print('Warning: {} has no paralogs'.format(target_name))
# initialize a dictionary of the probes
probe_dict = {}
for probe in probe_seqs:
probe_dict[probe.name] = str(probe.seq)
# use correct barcodes or assign new ones if not present
if target_name in used_barcodes.keys():
target_barcode = used_barcodes[target_name]
else:
target_barcode = random.choice(unused_barcodes)
unused_barcodes.remove(target_barcode)
used_barcodes[target_name] = target_barcode
# dict for storing all probe names, and bowtie results
bowtie_results_dict = {}
for probe in probe_dict.keys():
bowtie_results_dict[probe] = []
for result in bowtie_output:
result = result.split('\t')
probe_name = result[0]
hit_name = result[1]
mismatches = int(result[3])
# only keep hits with an edit distance of less than
if mismatches < mismatch_cutoff:
bowtie_results_dict[probe_name].append(hit_name)
# list to sort by Tm after parsing bowtie results
post_bowtie_list = []
bowtie_output.close()
for bowtie_probe in bowtie_results_dict.keys():
comparison = False
if len(bowtie_results_dict[bowtie_probe]) == 0:
post_bowtie_list.append(genPostBowtieItem(bowtie_probe,
bowtie_probe,
probe_dict))
# post_bowtie_list.append({'Name' : bowtie_probe,
# 'Tm' : bowtie_probe.split('_')[2],
# 'Sequence' : probe_dict[bowtie_probe]})
elif len(bowtie_results_dict[bowtie_probe]) == 1:
if (bowtie_probe.split('_')[0] ==
bowtie_results_dict[bowtie_probe][0]):
post_bowtie_list.append(genPostBowtieItem(bowtie_probe,
bowtie_probe,
probe_dict))
# post_bowtie_list.append({'Name' : bowtie_probe,
# 'Tm' : bowtie_probe.split('_')[2],
# 'Sequence' : probe_dict[bowtie_probe]})
elif len(bowtie_results_dict[bowtie_probe]) > 1:
# keep track of number of bowtie hits that are itself or paralog
required_score = len(bowtie_results_dict[bowtie_probe])
score = 0
for item in bowtie_results_dict[bowtie_probe]:
if bowtie_probe.split('_')[0] == item:
probe_name = bowtie_probe
score += 1
elif item in paralog_list:
probe_name = bowtie_probe
score += 1
else:
# breaks out of loop if non-paralog found
break
if score == required_score:
post_bowtie_list.append(genPostBowtieItem(probe_name,
bowtie_probe,
probe_dict))
# post_bowtie_list.append({'Name' : probe_name,
# 'Tm' : bowtie_probe.split('_')[2],
# 'Sequence' : probe_dict[bowtie_probe]})
else:
print('Could not find any bowtie hits for {}'
''.format(target_name))
print('{} probes passed offtarget prediction'
''.format(len(post_bowtie_list)))
# save all good probes to file and add barcodes and adapter
post_bowtie_results = open('./probes/{}_{}off_PostBowtie.fasta'
''.format(target_name, mismatch_cutoff), 'w')
for a in post_bowtie_list:
name = a['Name']
f_arm = a['Sequence'][7:]
t_arm = a['Sequence'][:7]
final_sequence = "{}{}{}{}{}".format(f_arm, target_barcode[4:],
adapter_seq, target_barcode,
t_arm)
post_bowtie_results.write('>{}_{}off\n{}\n'.format(name,
mismatch_cutoff,
final_sequence))
post_bowtie_results.close()
# Use RNAfold to predict secondary structure
RNAfold_infile = ('./probes/{}_{}off_PostBowtie.fasta'
''.format(target_name, mismatch_cutoff))
# If no probes passed bowtie2 analysis, don't bother going through
# RNAfold analysis. This actually throws an error if file is empty
if len(post_bowtie_list) == 0:
print('number of good probes\t{}'.format(len(post_bowtie_list)))
summary_file.write('{}\t{}\t{}\n'
''.format(target_name, len(post_bowtie_list),
target_barcode))
else:
try:
# remove RNAfold outfile because it automatically appends
# results and does not overwright.
os.remove('tmp1.rna_out')
except FileNotFoundError:
pass
buildcmd = ['RNAfold', '-p', '-d2', '--noLP', '--noPS',
'-P', 'dna_mathews2004.par',
'--noconv', '--MEA',
'--infile={}'.format(RNAfold_infile),
'--outfile=tmp1.rna_out']
subprocess.call(buildcmd)
# remove unused ps_files that RNAfold produced even with ps files
# are suppressed
ps_files = glob.glob('*.ps')
for a in ps_files:
os.remove(a)
# Open the temporary RNAfold file
ss_result_file = open('tmp1.rna_out', 'r')
ss = ss_result_file.read()
ss_result_file.close()
ss_results = str(ss).split('>')
ss_results_list = []
# parse RNAfold output
for ss_result in ss_results:
b = ss_result.split('\n')
if len(b) > 1:
probe_name = b[0]
probe_seq = b[1]
data = str(b[2]).split(' ', 1)[1]
data = float(data[1:7].strip(' ').strip('-'))
if data <= allowed_ss_score:
ss_results_list.append({'Name' : probe_name,
'Sequence' : probe_seq,
'Result' : data})
# Sort probes with least secondary structure to most.
sorted_probes = sorted(ss_results_list,
key=lambda k: k['Result'],
reverse=False)
# post_RNAfold_results = open('./probes/{}_{}off_PostRNAfold.fasta'.format(target_name, mismatch_cutoff), 'w')
# make sure there is no probe overlap
# Use position_list to store all base positions that are already covered
# by a probe
position_list = []
good_probes = []
for probe in sorted_probes:
probe_name = probe['Name'] + '_' + str(probe['Result']) + 'ss'
probe_seq = probe['Sequence']
# post_RNAfold_results.write('>{}\n{}\n'.format(probe_name, probe_seq))
position = probe_name.split('_')[1]
position = int(position.split('-')[0])
# Ensure there is a 4 bp space between each probe
if len(position_list) == 0:
start = position - 2
if start < 0:
start = 0
end = position + 27
position_list.extend(range(start,end))
good_probes.append(genGoodProbesItem(probe_name, probe_seq))
else:
if ((position - 2 not in position_list)
and (position + 27 not in position_list)):
start = position - 2
if start < 0:
start = 0
end = position + 27
position_list.extend(range(start,end))
good_probes.append(genGoodProbesItem(probe_name,
probe_seq))
else:
pass
# Save entire probe and only the template binding region seperately
# for each analysis.
final_results = open('./probes/{}_{}off_FinalProbes.fasta'
''.format(target_name, mismatch_cutoff), 'w')
final_results2 = open('./probes/{}_{}off_FinalTargets.fasta'
''.format( target_name, mismatch_cutoff), 'w')
for item in good_probes:
target_sequence = item['Sequence'][53:] + item['Sequence'][:18]
final_results.write('>{}\n/5phos/{}\n'.format(item['Name'],
item['Sequence']))
final_results2.write('>{}\n{}\n'.format(item['Name'],
target_sequence))
print('number of good probes for {}\t{}'.format(target_name,
len(good_probes)))
summary_file.write('{}\t{}\t{}\n'.format(target_name,
len(good_probes),
target_barcode))
# post_RNAfold_results.close()
final_results.close()
final_results2.close()
# Recreate barcode list with new additions
barcode_results = open('barcodes.txt', 'w')
for key in used_barcodes.keys():
barcode_results.write('{}\t{}\n'.format(used_barcodes[key], key))
for leftover_code in unused_barcodes:
barcode_results.write('{}\n'.format(leftover_code))
summary_file.close()
barcode_results.close()
def genPostBowtieItem(probe_name, bowtie_probe, probe_dict):
d = {}
d['Name'] = probe_name
d['Tm'] = bowtie_probe.split('_')[2],
d['Sequence'] = probe_dict[bowtie_probe]
return d
def genGoodProbesItem(probe_name, probe_seq):
d = {}
d['Name'] = probe_name
d['Sequence'] = probe_seq
return d
if __name__ == '__main__':
# set up argument parsing
parser = argparse.ArgumentParser()
# mismatches allowed
parser.add_argument("mm", metavar="MM", nargs='?', default=6, type=int,
help='Number of mismatches allowed in offtarget '
'alignments (default: 6).')
# parse args
args = parser.parse_args()
# run main script
main(args.mm)