-
Notifications
You must be signed in to change notification settings - Fork 4
/
assemble_peptides.py
127 lines (117 loc) · 4.74 KB
/
assemble_peptides.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
import argparse
import re
import skbio
from copy import deepcopy
import skbio
from skbio.alignment import local_pairwise_align_ssw, make_identity_substitution_matrix
from skbio.sequence import Protein
ident = make_identity_substitution_matrix(match_score=1, mismatch_score=0, alphabet=skbio.sequence.Protein.alphabet)
def assembleOverlappingPeptides(pepArr,overlap=11):
"""This is a work in progress, but the idea was
to be able to rebuild the sequence from the set of
overlapping 15mers..."""
assembled = [pep for pep in pepArr]
while len(assembled)>1:
for pepi1, pepi2 in itertools.combinations(arange(len(assembled)), 2):
pep1, pep2 = assembled[pepi1], assembled[pepi2]
res = pairwise2.align.globalxs(pep2, pep1, -4, 0)[0]
#print res[2]
if res[2]>=overlap-8:
#print res[0]
#print res[1]
_ = assembled.pop(pepi2)
assembled[pepi1] = ''.join([aa1 if not aa1=='-' else aa2 for aa1, aa2 in zip(res[0], res[1])])
#print assembled[pepi1]
#print
break
return assembled[0]
def _assembleTwo(seq1, seq2):
"""This only works if two sequences share a significant identical overlap"""
if len(seq2) <= len(seq1) and re.search(seq2, seq1):
return seq1
elif len(seq1) <= len(seq2) and re.search(seq1, seq2):
return seq2
else:
msa = local_pairwise_align_ssw(Protein(seq1),
Protein(seq2),
substitution_matrix=ident)
if msa[1] >= 8:
try:
(s1, e1), (s2, e2) = msa[-1]
except:
print(msa)
if s1 >= s2:
return seq1 + seq2[e2+1:]
else:
return seq2 + seq1[e1+1:]
return out
else:
print('No significant overlap')
raise
def _extendForward(seq, peptides, overlapL=8):
ol = seq[-overlapL:]
for pep in peptides:
match = re.search(pep, ol)
if match and match.start() > 0:
return pep[:match.start()] + seq, pep
def _extendBackward(seq, peptides, overlapL=8):
ol = seq[:overlapL]
for pep in peptides:
match = re.search(pep, ol)
if match and match.end() < mat.endpos():
return seq + pep[match.start():], pep
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Assemble overlapping 15mers')
parser.add_argument('filename', metavar='FILENAME', type=str, help='Peptide set from Peptide DB')
parser.add_argument('--useInds', action='store_true', help='Use the indices provided.')
parser.add_argument('--useOverlap', action='store_true', help='Use the overlap to assemble, assuming 8 AA are unique')
args = parser.parse_args()
outFn = args.filename[:-4] + '.fasta'
df = pd.read_excel(args.filename)
if args.useInds:
out = list(range(1, df['AA End'].max() + 1))
for i, row in df.iterrows():
pep = row['Peptide Sequence']
errors = 0
for aai, aa in zip(list(range(row['AA Start'], row['AA Start'] + len(pep))), pep):
# print '(%s)%d%s' % (out[aai - 1], aai, aa)
if out[aai - 1] == aai:
out[aai - 1] = aa
elif not out[aai - 1] == aa:
print('Error in assembly at %s%d%s' % (out[aai - 1], aai, aa))
errors += 1
if errors > 0:
print(row['Peptide Id'], pep)
for i, aa in enumerate(out):
if isinstance(aa, int):
out[i] = '-'
fullSeq = ''.join(out)
elif args.useOverlap:
"""Start with one peptide and add to it."""
peptides = df['Peptide Sequence'].tolist()
seq = peptides[0]
peptides.remove(seq)
while len(peptides) > 0 or len(peptides) < len(oldPeptides):
oldPeptides = deepcopy(peptides)
startOL = seq[:8]
endOL = seq[-8:]
middleOL = seq[8:-8]
for pep in oldPeptides:
if re.search(pep, middleOL):
peptides.remove(pep)
print('Found %s in middle of %s' % (pep, seq))
break
elif re.search(pep, startOL) or re.search(pep, endOL):
print('Found %s in edge of %s' % (pep, seq))
seq = _assembleTwo(seq, pep)
print(seq)
peptides.remove(seq)
break
fullSeq = seq
else:
pass
fastaStr = '>%s\n%s\n' % (df['Peptide Group Name'].unique()[0], fullSeq)
print(fastaStr)
with open(outFn, 'w') as fh:
fh.write(fastaStr)