final_minicode.py
# import all of the packages
import os
import csv
from Bio import SeqIO
from Bio import Entrez
'''
Problem 1
First: retrieve the transcriptomes of given SRR numbers and convert them to paired-end fastq files.
You can use wget (by constructing the path based on the SRR numbers for each of these samples).
'''
# initialize the log file (will be updated along the way)
# 'a' = append mode, so earlier log entries are kept rather than overwritten
log_file = open('MiniProject.log', 'a')
# initialize SRR list with all of the accession numbers listed
SRR = ['SRR5660030', 'SRR5660033', 'SRR5660044', 'SRR5660045']
# initialize path variable to the current working directory
path = os.getcwd()
# get the input files and download them as paired-end FASTQ files
def inputFiles(SRR):
    # fetch the run through SRA and download the file
    # wget = uses the specified path (link) to download the file into the working directory
    getFiles = 'wget https://sra-downloadb.be-md.ncbi.nlm.nih.gov/sos2/sra-pub-run-11/' + SRR + '/' + SRR + '.1'
    # split the SRA file into paired-end reads
    splitFiles = 'fastq-dump -I --split-files ' + SRR + '.1'
    # os.system() = executes the command in a subshell
    os.system(getFiles)
    os.system(splitFiles)
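
# With -I --split-files, fastq-dump writes the two mates of each pair to separate
# files named after its input, here <SRR>.1_1.fastq and <SRR>.1_2.fastq; those are
# exactly the filenames the kallisto and Bowtie2 steps below expect to find.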
'''
Problem 2
First: Build a transcriptome index for HCMV (NCBI accession EF999921)
Second: Extract the CDS features from the GenBank format.
Third: Write the output to your log file
'''
# extract the CDS features and make a transcriptome index for HCMV
def Transcriptome_Index():
    out_Fasta = open("EF999921.fasta", "w")  # FASTA outfile for the whole HCMV sequence
    out_CDS = open("EF999921_CDS.fasta", "w")  # FASTA outfile for the CDS sequences
    Entrez.email = '[email protected]'  # use school email to access Entrez
    # get the nucleotide sequence in FASTA format from NCBI's nucleotide database
    HCMV_file = Entrez.efetch(db='nucleotide', id=['EF999921'], rettype='fasta')
    # parse the fetched data into a list of records
    records = list(SeqIO.parse(HCMV_file, "fasta"))
    # use the .seq object to write the whole sequence out to the file,
    # with a > header line and a newline separator for FASTA format
    out_Fasta.write('>' + str(records[0].description) + '\n' + str(records[0].seq))
    out_Fasta.close()  # all the data has been added, so close the file
    # get the GenBank record from NCBI's nucleotide database
    GB_file = Entrez.efetch(db='nucleotide', id=['EF999921'], rettype='gb', retmode='text')
    # initialize a tracker variable to count the number of CDS features the genome has
    track = 0
    # write out each CDS sequence with a > identifier to the new file,
    # counting the CDS features along the way
    for record in SeqIO.parse(GB_file, 'genbank'):
        for feature in record.features:
            # find the CDS regions, then add them to the transcriptome index file
            if feature.type == "CDS":
                track += 1  # add to the counter
                protein_id = str(feature.qualifiers['protein_id']).replace('[', '').replace(']', '').replace("'", "")
                out_CDS.write('>' + protein_id + '\n' + str(feature.location.extract(record).seq) + '\n')  # write to the CDS fasta file
    out_CDS.close()  # done writing to the file, so close it
    log_file.write('The HCMV genome (EF999921) has ' + str(track) + ' CDS.' + '\n')  # record how many CDS features the EF999921 GenBank file has
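
# Each record in EF999921_CDS.fasta is headed by just the CDS's protein_id
# qualifier (Biopython returns it as a one-element list, hence the bracket and
# quote stripping), so the transcript IDs kallisto reports are protein accessions.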
'''
Problem 3
First: Quantify the TPM of each CDS in each transcriptome using kallisto
Second: Use kallisto results as input to find differentially expressed genes between the two timepoints (2dpi and 6dpi) using the R package sleuth
Third: Write out the information to the log file using a header row and tab-delimit for each item:
target_id test_stat pval qval
'''
# run kallisto with the SRR numbers and files
def kallisto(SRR):
    # build the kallisto index from the CDS file created in Transcriptome_Index()
    # (re-running this per sample rebuilds the same index, which is redundant but harmless)
    kallisto_index = 'time kallisto index -i HCMV_index.idx EF999921_CDS.fasta'
    os.system(kallisto_index)
    # quantify the sample against the index (30 bootstraps, 4 threads)
    run_kallisto = 'time kallisto quant -i HCMV_index.idx -o ' + path + '/results_' + SRR + ' -b 30 -t 4 ' + SRR + '.1_1.fastq ' + SRR + '.1_2.fastq'
    os.system(run_kallisto)
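
# kallisto quant leaves abundance.h5, abundance.tsv, and run_info.json in each
# results_<SRR> directory; the tpm column of abundance.tsv holds the per-CDS TPM
# asked for in Problem 3, and the .h5 file is what sleuth loads downstream.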
# code to make the sleuth input file
def SleuthInput(SRR):
    # covariates file for sleuth
    covFile = open('sleuth_input.txt', 'w')
    # header line of the file
    covFile.write('sample' + '\t' + 'condition' + '\t' + 'path' + '\n')
    # based on the SRR number, write the condition and path to the output file
    for i in SRR:
        paths = path + '/' + 'results_' + i
        if int(i[3:]) % 2 == 0:  # even accessions are the 2dpi samples
            covFile.write(str(i) + '\t' + '2dpi' + '\t' + paths + '\n')
        else:  # odd accessions are the 6dpi samples
            covFile.write(str(i) + '\t' + '6dpi' + '\t' + paths + '\n')
    covFile.close()
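
# For the four accessions above, sleuth_input.txt comes out as (tab-delimited):
#   sample       condition  path
#   SRR5660030   2dpi       <cwd>/results_SRR5660030
#   SRR5660033   6dpi       <cwd>/results_SRR5660033
#   SRR5660044   2dpi       <cwd>/results_SRR5660044
#   SRR5660045   6dpi       <cwd>/results_SRR5660045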
# code to run the sleuth script in R and copy its output into the log
def Sleuth():
    runSleuth = 'Rscript Sleuth_Rscript.R'
    os.system(runSleuth)
    # each line already ends in '\n', so write the lines through unchanged
    with open('out_sleuth.txt', 'r') as output:
        for i in output:
            log_file.write(i)
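
# Sleuth_Rscript.R itself is not part of this file; per Problem 3 it is assumed
# to read sleuth_input.txt, fit sleuth's full and reduced models over the
# 2dpi/6dpi conditions, run a likelihood ratio test, and write target_id,
# test_stat, pval, and qval for each significant transcript to out_sleuth.txt.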
'''
Problem 4
First: Use Bowtie2 to create an index for HCMV (NCBI accession EF999921).
Second: Save the reads that map to the HCMV index for use in assembly.
Third: Write to your log file the number of reads in each transcriptome before and after the Bowtie2 mapping.
'''
# builds a Bowtie2 index for HCMV
def build_Bowtie(SRR):
    # builds the index from the full EF999921 reference genome written by Transcriptome_Index()
    bowtie_command = 'bowtie2-build ./EF999921.fasta EF999921_1'
    os.system(bowtie_command)
    # maps the transcriptome reads to the index we just built, generating a SAM file;
    # --al-conc is needed to also write the concordantly-mapped pairs out as FASTQ files
    bowtie_command2 = 'bowtie2 --quiet --no-unal --al-conc EF999921_' + SRR + '.fastq -x EF999921_1 -1 ' + SRR + '.1_1.fastq -2 ' + SRR + '.1_2.fastq -S EF999921_' + SRR + '.sam'
    os.system(bowtie_command2)
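
# Bowtie2 splits the --al-conc output by mate, inserting .1/.2 before the file
# extension, so the mapped pairs land in EF999921_<SRR>.1.fastq and
# EF999921_<SRR>.2.fastq, the filenames that bowtie_original() and Spades() read.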
# count the number of reads in each transcriptome before and after the Bowtie2 mapping
# (the mapped reads themselves were already saved by --al-conc in build_Bowtie)
def bowtie_original(SRR, number):
    bowtie_SRR1 = open('EF999921_' + SRR + '.1.fastq').readlines()  # mate-1 reads that mapped
    bowtie_SRR2 = open('EF999921_' + SRR + '.2.fastq').readlines()  # mate-2 reads that mapped
    donor = ''
    if number == 1:
        donor += 'Donor 1 (2dpi)'
    elif number == 2:
        donor += 'Donor 1 (6dpi)'
    elif number == 3:
        donor += 'Donor 3 (2dpi)'
    elif number == 4:
        donor += 'Donor 3 (6dpi)'
    len_bowtie = (len(bowtie_SRR1) + len(bowtie_SRR2)) // 8  # number of mapped read pairs
    original1 = open(SRR + '.1_1.fastq').readlines()  # mate-1 reads before filtering
    original2 = open(SRR + '.1_2.fastq').readlines()  # mate-2 reads before filtering
    original = (len(original1) + len(original2)) // 8  # number of original read pairs
    # write out to the log file using the given format
    with open('MiniProject.log', 'a') as output:
        output.write(donor + ' had ' + str(original) + ' read pairs before Bowtie2 filtering and ' + str(len_bowtie) + ' read pairs after' + '\n')
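
# Sanity check on the arithmetic above: a FASTQ record is 4 lines, so one file
# holds len(lines)/4 reads, and summing the two mate files and dividing by 8
# gives read *pairs*. E.g. two files of 9,000,000 lines each contain 2,250,000
# reads apiece, which is 2,250,000 pairs.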
'''
Problem 5
First: Use the Bowtie2 output reads to assemble all four transcriptomes together to produce 1 assembly via SPAdes.
Second: Write the SPAdes command used to the log file.
'''
# code to run SPAdes using the command line
def Spades(SRR1, SRR2, SRR3, SRR4):
    spades_command = ('spades -k 55,77,99,127 --only-assembler -t 2 '
                      '--pe1-1 EF999921_' + SRR1 + '.1.fastq --pe1-2 EF999921_' + SRR1 + '.2.fastq '
                      '--pe2-1 EF999921_' + SRR2 + '.1.fastq --pe2-2 EF999921_' + SRR2 + '.2.fastq '
                      '--pe3-1 EF999921_' + SRR3 + '.1.fastq --pe3-2 EF999921_' + SRR3 + '.2.fastq '
                      '--pe4-1 EF999921_' + SRR4 + '.1.fastq --pe4-2 EF999921_' + SRR4 + '.2.fastq '
                      '-o Spades/')
    with open('MiniProject.log', 'a') as output:
        output.write(spades_command + '\n')  # write the exact command used to the log file
    os.system(spades_command)  # execute the command on the command line
'''
Problem 6
First: Write Python code to calculate the number of contigs with a length > 1000
Second: Write the # to the log file as follows (replace # with the correct integer):
There are # contigs > 1000 bp in the assembly.
'''
# count the number of contigs that are longer than 1000 bp
def count_Contigs():
    newFile = open('ContigsNum.fasta', 'w')  # open a new file for the long contigs
    track = 0  # initialize the tracking variable to 0
    inFile = SeqIO.parse('./Spades/contigs.fasta', 'fasta')  # parse the SPAdes output as a FASTA file
    # if a sequence is longer than 1000 bp, count it
    # and copy it to the outfile
    for record in inFile:
        len_seq = len(record.seq)  # get the length of each sequence
        if len_seq > 1000:  # if the sequence length is greater than 1000...
            track += 1  # ...add that sequence to the counter
            newFile.write('>' + str(record.id) + '\n' + str(record.seq) + '\n')  # add the record ID and sequence to the file
    newFile.close()  # only one condition writes to the file, so close after the end of the loop
    log_file.write('There are ' + str(track) + ' contigs > 1000 bp in the assembly.' + '\n')  # write out to the log file
'''
Problem 7
First: Write Python code to calculate the length of the assembly (the total number of bp in all of the contigs > 1000 bp in length) and write this # to the log file as follows (replace # with the correct integer):
There are # bp in the assembly.
'''
# gets the total number of base pairs across all contigs in the assembly
def length_Contigs():
    contig_file = SeqIO.parse('ContigsNum.fasta', 'fasta')  # parse the file holding all sequences longer than 1000 bp
    total_Len = []  # initialize an empty list for the lengths
    # add each sequence length to the list
    for record in contig_file:
        seq_len = len(record.seq)  # collect each individual length
        total_Len.append(int(seq_len))  # add the length, as an int, to the list
    # sum the list
    total = sum(total_Len)  # the sum of all of the lengths
    log_file.write('There are ' + str(total) + ' bp in the assembly.' + '\n')  # write the total length (in bp) to the log file
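
# The same total could be computed in one pass, e.g.
#   sum(len(record.seq) for record in SeqIO.parse('ContigsNum.fasta', 'fasta'))
# but the explicit loop mirrors the style used elsewhere in this script.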
'''
Problem 8
First: Write Python code to retrieve the longest contig from your SPAdes assembly.
Second: Use the longest contig as blast+ input to query the nr nucleotide database limited to members of the Betaherpesvirinae subfamily. You will need to make a local database of just sequences from the Betaherpesvirinae subfamily.
Third: Identify the top 10 hits. For the top 10 hits, write the following to your log file: Subject accession, Percent identity, Alignment length, Start of alignment in query, End of alignment in query, Start of alignment in subject, End of alignment in subject, Bit score, E-value, and Subject Title.
Include the following header row in the log file, followed by the top 10 hits, and tab-delimit each item:
sacc pident length qstart qend sstart send bitscore evalue stitle
'''
# function to get the longest contig from the SPAdes assembly
# SPAdes orders the assembled sequences from longest to shortest
def longest_Contig():
    long_count = next(SeqIO.parse('ContigsNum.fasta', 'fasta'))  # next() pulls only the first (longest) record from the FASTA file
    longest = open('longest_contig.fasta', 'w')  # make a new file to store the longest contig
    SeqIO.write(long_count, longest, 'fasta')  # write it to the file
    longest.close()  # close so the contig is flushed to disk before blastn reads it
# split into three separate functions (database build, blast call, and blast parse)
# function to make the local BLAST database
def blast_db():
    os.system("makeblastdb -in Beta_seqs.fasta -out Betaherpesvirinae_db -title Betaherpesvirinae_db -dbtype nucl")
# function to run blastn, reporting the ten requested fields in comma-delimited format
def blast():
    os.system('blastn -query longest_contig.fasta -db Betaherpesvirinae_db -out BLAST_results.csv -outfmt "10 sacc pident length qstart qend sstart send bitscore evalue stitle"')
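
# In BLAST+, -outfmt "10 ..." means comma-separated output with exactly the
# listed columns in the listed order, which is why blast_parse() below can hand
# the same ten names to csv.DictReader as its fieldnames.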
# parse the csv file, pull out the top 10 hits, and write them to the log file
def blast_parse():
    headers = ["sacc", "pident", "length", "qstart", "qend", "sstart", "send", "bitscore", "evalue", "stitle"]  # column names, matching the -outfmt string
    blast_results = open("BLAST_results.csv", "r")  # open the csv file
    csv_rows = csv.DictReader(blast_results, headers, delimiter=",")  # read each row of the csv file as a dictionary
    track = 0  # tracker for the number of hits written
    with open("MiniProject.log", "a") as out:  # add the header row and top 10 hits to the log file
        out.write('\t'.join(headers) + '\n')  # tab-delimited header row
        for row in csv_rows:
            if track >= 10:  # stop once the top 10 hits have been written
                break
            # pull the value for each key and write the hit in tab-delimited format
            out.write('\t'.join(str(row[h]) for h in headers) + '\n')
            track += 1
    blast_results.close()
# ---- ALL FUNCTION CALLS ---- #
Transcriptome_Index()  # run first: it creates EF999921.fasta and EF999921_CDS.fasta, which kallisto and Bowtie2 need
number = 1
for i in SRR:  # per-sample steps for each of the given SRR accessions
    inputFiles(i)
    kallisto(i)
    build_Bowtie(i)
    bowtie_original(i, number)
    number += 1
SleuthInput(SRR)  # the covariates table must exist before the R script runs
Sleuth()
Spades(SRR[0], SRR[1], SRR[2], SRR[3])
count_Contigs()
length_Contigs()
longest_Contig()
blast_db()
blast()
blast_parse()
log_file.close()  # flush and close the log now that every step has written to it