-
Notifications
You must be signed in to change notification settings - Fork 10
/
digest_mutant_protein.py
95 lines (77 loc) · 3.13 KB
/
digest_mutant_protein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import sys
import re
from Bio import SeqIO
from collections import OrderedDict
def TRYPSIN(proseq,miss_cleavage):
peptides=[]
cut_sites=[0]
for i in range(0,len(proseq)-1):
if proseq[i]=='K' and proseq[i+1]!='P':
cut_sites.append(i+1)
elif proseq[i]=='R' and proseq[i+1]!='P':
cut_sites.append(i+1)
if cut_sites[-1]!=len(proseq):
cut_sites.append(len(proseq))
if miss_cleavage==0:
for j in range(0,len(cut_sites)-1):
peptides.append(proseq[cut_sites[j]:cut_sites[j+1]])
elif miss_cleavage==1:
for j in range(0,len(cut_sites)-2):
peptides.append(proseq[cut_sites[j]:cut_sites[j+1]])
peptides.append(proseq[cut_sites[j]:cut_sites[j+2]])
peptides.append(proseq[cut_sites[-2]:cut_sites[-1]])
elif miss_cleavage==2:
for j in range(0,len(cut_sites)-3):
peptides.append(proseq[cut_sites[j]:cut_sites[j+1]])
peptides.append(proseq[cut_sites[j]:cut_sites[j+2]])
peptides.append(proseq[cut_sites[j]:cut_sites[j+3]])
peptides.append(proseq[cut_sites[-3]:cut_sites[-2]])
peptides.append(proseq[cut_sites[-3]:cut_sites[-1]])
peptides.append(proseq[cut_sites[-2]:cut_sites[-1]])
return peptides
handle1=SeqIO.parse(sys.argv[1],'fasta') # All_COSMIC_Genes.fasta
handle2=SeqIO.index(sys.argv[2],'fasta') # Cosmic_mutant_proteins.fasta
output=open(sys.argv[3],'w')
peptidome={}
for record in handle1:
cds_seq = record.seq
aa_seq = cds_seq.translate(to_stop=True,stop_symbol="")
peptide_list=TRYPSIN(str(aa_seq),0)
for peptide in peptide_list:
if len(peptide) in range(6,41):
if peptide not in peptidome:
peptidome[peptide.replace("I","L")]=1
print len(peptidome)
var_peptidome=OrderedDict()
record_counter=1
for record in handle2:
if record_counter%100000==0:
print record_counter
record_counter+=1
proseq=handle2[record].seq
descrip=handle2[record].description
if len(proseq)>5:
peptide_list=TRYPSIN(str(proseq),0)
for peptide in peptide_list:
if len(peptide) in range(6,41):
peptide1=peptide.replace("I","L")
if peptide1 not in peptidome:
des_list=descrip.split(":")
type=des_list[-1]
snp=des_list[4]
if type=="Substitution-Missense":
try:
mut_pos=int(re.findall(r'\d+',snp)[0])
index=str(proseq).index(peptide)
pos=mut_pos-index
des_list.append(str(pos))
except IndexError:
continue;
des_list.pop(3)
new_description=":".join(des_list)
if new_description not in var_peptidome:
var_peptidome[new_description]=peptide
print len(var_peptidome)
for acc in var_peptidome.keys():
output.write(">%s\n%s\n" % (acc,var_peptidome[acc]))
output.close()