# epitope_mapping_df.py
import seaborn as sns
import pandas as pd
import numpy as np
from copy import deepcopy
import argparse
import multiprocessing
import skbio
from fg_shared import *
import itertools
import operator
from functools import partial
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
from objhist import objhist
"""Rewrite of epitope_mapping.py that avoids the use of objects,
thereby permitting easy writing/reading of epitope mapping
results to/from files.
response : pd.Series
A single row from the response DataFrame
Row is indexed by attributes of the object
island : pd.DataFrame
A subset of the whole response DataFrame
containing responses from one PTID that overlap
epitope : pd.Series
A row from the DataFrame of epitopes
Has an epitope ID that is specific to the PTID
Can be merged with the respons DataFrame using the epitopeID"""
# from HLAPredCache import *
import sys
from os.path import join as opj
__all__ = [ 'hamming',
'overlap',
'sharedCoords',
'assignResponseIslands',
'findEpitopes',
'identicalRule',
'overlapRule',
'hlaRule',
'findpeptide',
'plotIsland',
'encodeVariants',
'decodeVariants',
'sliceRespSeq',
'groupby_parallel',
'realignPeptides',
'coord_overlap',
'plotEpitopeMap',
'plotEpitopeChiclets',
'consensusPeptide',
'alignPeptides',
'unionCoords',
'respCoords']
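# A minimal end-to-end sketch (comments only, so nothing runs on import).
# It assumes a hypothetical `responses` DataFrame with one row per positive
# peptide response and columns ptid, protein, start, end and seq; real
# column names may differ:
#
#     islands = responses.groupby('ptid', group_keys=False).apply(assignResponseIslands)
#     responses = responses.join(islands)
#     epDf = responses.groupby(['ptid', 'IslandID']).apply(
#                 findEpitopes, sharedRule=overlapRule, minOverlap=8, minSharedAA=6)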
def hamming(str1, str2):
    """Hamming distance between two equal-length strings"""
    return sum(map(operator.__ne__, str1, str2))
def coord_overlap(start1, end1, start2, end2):
    """Coordinates shared by [start1, end1) and [start2, end2)"""
    coords1 = set(range(int(start1), int(end1)))
    coords2 = set(range(int(start2), int(end2)))
    return sorted(coords1 & coords2)
def _coords(r, plot=False):
    """Return coordinates of the response peptide.
    The plot option returns coordinates based on the start and length of the
    peptide, as opposed to the end coordinate, which is subject to
    insertions/deletions.
    Use end like a stop in range(start, stop)"""
    if plot:
        return list(range(int(r.start), int(r.start) + len(r.seq)))
    else:
        return list(range(int(r.start), int(r.end)))
def respCoords(r, plot=False):
    """Public alias of _coords: return coordinates of the response peptide."""
    return _coords(r, plot=plot)
def _epcoords(r, plot=False):
    """Like _coords, but using the epitope columns EpStart/EpEnd/EpSeq"""
if plot:
return list(range(int(r.EpStart), int(r.EpStart) + len(r.EpSeq)))
else:
return list(range(int(r.EpStart), int(r.EpEnd)))
def overlap(response1, response2):
"""Any overlap between two responses?"""
if response1.protein == response2.protein:
coord2 = _coords(response2)
for i in _coords(response1):
if i in coord2:
return True
return False
def sharedCoords(island):
"""Find the coordinates that are shared by ALL responses in the island"""
if island.shape[0] > 0:
sc = set(_coords(island.iloc[0]))
for ri in range(island.shape[0]):
response = island.iloc[ri]
sc.intersection_update(_coords(response))
return sorted(sc)
else:
return []
def unionCoords(island):
"""Find the coordinates that are in ANY of the responses"""
if island.shape[0] > 0:
sc = set(_coords(island.iloc[0]))
for ri in range(island.shape[0]):
response = island.iloc[ri]
sc = sc.union(_coords(response))
return sorted(sc)
else:
return []
def alignPeptides(island):
    """Pad each response peptide with gaps so that all peptides in the
    island are aligned to the union of their coordinates."""
    coords = np.array(unionCoords(island))
    # assert (coords == np.arange(coords[0], coords[-1]+1)).all()
    out = ['-'*(coords < r.start).sum() + r.seq + '-'*(coords > (r.end-1)).sum() for i, r in island.iterrows()]
    return out
def consensusPeptide(island, ignoreGaps=True):
    """Aligns the island's peptides and returns a consensus sequence.
    Gaps are ignored at each position, unless all AAs there are gaps."""
    align = alignPeptides(island)
    L = len(align[0])
    cons = ''
    for aai in np.arange(L):
        counts = objhist([seq[aai] for seq in align])
        if ignoreGaps and len(counts) > 1:
            counts.pop('-', 0)
        cons += max(list(counts.keys()), key=counts.get)
    return cons
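# A worked sketch of the two functions above, with two hypothetical
# overlapping peptides:
#     island = pd.DataFrame({'seq': ['ABCDE', 'CDEFG'],
#                            'start': [0, 2], 'end': [5, 7]})
# alignPeptides(island) returns ['ABCDE--', '--CDEFG'] and
# consensusPeptide(island) returns 'ABCDEFG' (the gaps at each end are
# ignored because a non-gap AA is available at those positions).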
def assignResponseIslands(responses):
    """Return a one-column pd.DataFrame with an island ID ('IslandID')
    that can be joined with the responses pd.DataFrame"""
    """islandList contains lists of row indices of overlapping responses"""
    islandList = []
    for ri in range(responses.shape[0]):
        r = responses.iloc[ri]
        """Find every existing island that this response overlaps"""
        hits = [islandi for islandi, island in enumerate(islandList)
                if any(overlap(r, responses.iloc[i]) for i in island)]
        if len(hits) == 0:
            """Start a new island"""
            islandList.append([ri])
        else:
            """Add the response to the first overlapping island and merge in
            any other islands it bridges (previously a bridging response was
            added to every overlapping island, leaving the islands split)"""
            islandList[hits[0]].append(ri)
            for islandi in hits[:0:-1]:
                islandList[hits[0]].extend(islandList.pop(islandi))
    """Return an island column indexed like responses"""
    islandColumn = [''] * responses.shape[0]
    for islandi, island in enumerate(islandList):
        for i in island:
            islandColumn[i] = 'I%d' % islandi
    out = pd.DataFrame({'IslandID': islandColumn})
    out.index = responses.index
    return out
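# Sketch of the merging behavior: responses at starts 0 and 20 (each 15 AA,
# same protein) do not overlap and would form two islands; a third response
# at start 10 overlaps both, merging all three into a single island 'I0'.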
def findEpitopes(responses, sharedRule, reduceResponses=True, returnResponseIndexed=True, **kwargs):
"""Given a list of responses find the minimum
set of epitopes that "explain" all responses."""
if reduceResponses:
"""Reduce responses to those with unique start and end positions"""
coords = {} # keeps track of unique coordinates: (st, en): i
keepInds = [] # indices we want to keep in the analysis for now
"""Keeps track of redundant coords so we can correctly assign epitopes later"""
redundantD = {} # key: i, value: list of redundant i's
for i in range(responses.shape[0]):
st = responses.start.iloc[i]
# en = responses.start.iloc[i] + len(responses.seq.iloc[i])
en = responses.end.iloc[i]
if not (st, en) in coords:
coords[(st, en)] = i
keepInds.append(i)
else:
if coords[(st, en)] in redundantD:
redundantD[coords[(st, en)]].append(i)
else:
redundantD[coords[(st, en)]] = [i]
redResp = responses.iloc[keepInds]
else:
redResp = responses
"""Start with all possible sets of responses (i.e. combinations)"""
sharedSets = []
for N in range(1, redResp.shape[0] + 1):
for inds in itertools.combinations(list(range(redResp.shape[0])), N):
sharedSets.append(list(inds))
"""Remove any sets that don't have one unique shared response/epitope"""
sharedSets = [ss for ss in sharedSets if sharedRule(redResp.iloc[ss], **kwargs)[0]]
"""Remove any sets that are a subset of another set"""
sharedSetsCopy = deepcopy(sharedSets)
for s1, s2 in itertools.product(sharedSetsCopy, sharedSetsCopy):
if (not s1 is s2) and set(s1).issubset(set(s2)):
try:
sharedSets.remove(s1)
except ValueError:
"""It has already been removed"""
pass
"""Remove any sets whose members are all members of other sets, starting with shorter sets"""
"""Could help remove a few redundant responses, but may not be neccesary if we use the HLA rule"""
anyRemoved = True
while anyRemoved:
anyRemoved = False
newSharedSets = deepcopy(sharedSets)
for s1 in sorted(newSharedSets, key=len):
"""For each set of responses"""
eachExplained = []
for respInd in s1:
"""For each response index in the set"""
explained = False
for s2 in newSharedSets:
"""For each set of responses"""
if (not s2 is s1) and respInd in s2:
"""If respInd is explained by another, larger set"""
explained = True
eachExplained.append(explained)
if all(eachExplained):
sharedSets.remove(s1)
anyRemoved = True
"""Start for s1 again with updated set of responses"""
break
if reduceResponses:
expSharedSets = []
for ss in sharedSets:
tmp = []
for reducedi in ss:
i = keepInds[reducedi]
tmp.append(i)
if i in redundantD:
tmp.extend(redundantD[i])
expSharedSets.append(tmp)
else:
expSharedSets = sharedSets
    if returnResponseIndexed:
        """Return epitope columns indexed like responses.
        Deprecated: the issue is that a response may contain multiple epitopes,
        so indexing by epitope ID is more natural. Since every epitope has at
        least one response that can only be explained by that epitope,
        the response-indexed result is still OK for the breadth calc"""
epitopes = [None] * responses.shape[0]
for epitopei, inds in enumerate(expSharedSets):
"""Some responses may be in multiple shared sets:
do not define/restrict epitope location based on these peptides"""
uinds = []
for i in inds:
notUnique = False
for ssi, ss in enumerate(expSharedSets):
if not ssi == epitopei:
if i in ss:
notUnique = True
if not notUnique:
uinds.append(i)
ep = sharedRule(responses.iloc[uinds], **kwargs)[1]
ep['EpID'] = 'E%d' % epitopei
"""This will assign responses an EpID multiple times when it
could contain either epitope"""
for i in inds:
epitopes[i] = ep
epitopesDf = pd.DataFrame(epitopes)
epitopesDf.index = responses.index
else:
"""Return DataFrame that is indexed by epitope and response,
with potentially more than one epitope per response"""
epitopes = []
for epitopei, inds in enumerate(expSharedSets):
"""Some responses may be in multiple shared sets:
do not define/restrict epitope location based on these peptides"""
uinds = []
for i in inds:
notUnique = False
for ssi, ss in enumerate(expSharedSets):
if not ssi == epitopei:
if i in ss:
notUnique = True
if not notUnique:
uinds.append(i)
ep = sharedRule(responses.iloc[uinds], **kwargs)[1]
ep['EpID'] = 'E%d' % epitopei
for i in inds:
tmp = deepcopy(ep)
tmp['ptid'] = responses['ptid'].iloc[i]
tmp['IslandID'] = responses['IslandID'].iloc[i]
tmp['RespID'] = responses['RespID'].iloc[i]
tmp['Floater'] = 0 if i in uinds else 1
epitopes.append(tmp)
epitopesDf = pd.DataFrame(epitopes)
return epitopesDf
"""
Each rule returns True or False,
if True it also returns the unique epitope representative of the response island
Simple "shared rule" implementation
(1) Define a rule to determine if a set of responses share a single response in common
(8 AA overlap, 6 AA matching rule or HLA-based rule)
(2) Begin with all possible sets of responses (including sets of size N, N-1, N-2, ... 1)
(3) Eliminate sets that do not all share a common response
(i.e. not all pairwise responses share a response)
(4) Eliminate sets that are a subset of another set
(5) Eliminate sets whose members are all members of other sets (starting with shorter sets)
(5) Remaining subsets are the true sets of responses (i.e. breadth)
"""
def identicalRule(island):
    """Responses are shared only if they all have the identical peptide sequence"""
    s = set(island.seq)
    if len(s) == 1:
        """There is just one unique sequence"""
        ep = dict(EpSeq=island.iloc[0].seq,
                  EpStart=island.iloc[0].start,
                  EpEnd=island.iloc[0].start + len(island.iloc[0].seq))
        return True, ep
    else:
        return False, {}
def overlapRule(island, minOverlap=8, minSharedAA=6):
    """If all responses overlap by at least minOverlap positions and
    at least minSharedAA amino acid residues are matched across all
    responses, then the responses are considered 'shared'"""
    """Find the shared peptide for the responses"""
sharedInds = sharedCoords(island)
nOverlap = 0
nMatching = 0
seq = ''
for si in sharedInds:
aas = {respSeq[si - int(respStart)] for respSeq, respStart in zip(island.seq, island.start) if (si - int(respStart)) < len(respSeq)}
if '-' in aas:
seq += '-'
else:
nOverlap += 1
if len(aas) == 1:
nMatching += 1
seq += aas.pop()
else:
seq += 'X'
if nOverlap >= minOverlap and nMatching >= minSharedAA:
ep = dict(EpSeq=seq, EpStart=sharedInds[0], EpEnd=sharedInds[-1]+1)
return True, ep
else:
return False, {}
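# Sketch of overlapRule with two hypothetical 15-mers taken from a common
# parent sequence 'ABCDEFGHIJKLMNOPQRST' at starts 0 and 5: the shared
# coordinates are 5..14, giving nOverlap=10 and nMatching=10, so with the
# defaults the responses are shared, with EpSeq='FGHIJKLMNO', EpStart=5
# and EpEnd=15.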
def overlapEpitope(island, useX=True):
"""Find the shared peptide for responses"""
sharedInds = sharedCoords(island)
seq = ''
for si in sharedInds:
aas = {respSeq[si - int(respStart)] for respSeq, respStart in zip(island.seq, island.start) if (si - int(respStart)) < len(respSeq)}
if len(aas) == 1:
seq += aas.pop()
else:
if useX:
seq += 'X'
else:
seq += '[' + ''.join(aas) + ']'
return dict(EpSeq=seq, EpStart=sharedInds[0], EpEnd=sharedInds[-1]+1)
def hlaRule(island, hlaList, ba, topPct=0.1, nmer=[8, 9, 10], useX=True):
    """Determine the overlap region common amongst all responses in the island.
    Predict HLA binding for all mers within the overlap region, for all response sequences (putative epitopes).
    For each mer (defined by start position and length):
    if there is a mer within the topPct for every response, then the responses are shared.
    Return the mer whose avg rank is best amongst all the responses.
    If none of the mers are ranked within topPct in all responses, then return None.
    TODO: untested and will need to handle gaps; also depends on getMers and
    rankEpitopes (e.g. from HLAPredCache, whose import is commented out above)"""
"""Find the shared peptide for responses"""
sharedInds = sharedCoords(island)
"""Indices of all the possible epitopes in the island"""
possibleEpitopeInds = getMers(sharedInds, nmer=nmer)
rankPctMat = np.ones((len(island), len(possibleEpitopeInds), len(hlaList)))
for ri, (rSeq, rL) in enumerate(zip(island.seq, island.L)):
"""Get the predictions for all mers in this response"""
ranks, sorti, kmers, ic50, hla = rankEpitopes(ba, hlaList, rSeq, nmer=nmer, peptideLength=None)
for ii, inds in enumerate(possibleEpitopeInds):
"""Find the part of each epitope that overlaps with this response"""
curMer = ''.join([rSeq[si-rL] for si in inds])
if len(curMer) in nmer:
for hi, h in enumerate(hlaList):
curIC50 = ba[(h, curMer)]
rankPctMat[ri, ii, hi] = (ic50 < curIC50).sum() / len(ic50)
"""Are any of the possible HLA:mer pairs in the topPct for all responses?"""
if (rankPctMat<=topPct).any(axis=2).any(axis=1).all():
"""Among HLA:mer pairs that are in the topPct, which one is the lowest on avg across responses?"""
rankPctMat[rankPctMat > topPct] = np.nan
tmp = np.nanmean(rankPctMat, axis=0)
avgRank = np.nanmin(tmp)
        ii, hi = np.unravel_index(np.nanargmin(tmp), tmp.shape)
epitopeInds = possibleEpitopeInds[ii]
hla = hlaList[hi]
seq = ''
for si in epitopeInds:
aas = {respSeq[si - respStart] for respSeq, respStart in zip(island.seq, island.start)}
if len(aas) == 1:
seq += list(aas)[0]
else:
if useX:
seq += 'X'
else:
seq += '[' + ''.join(aas) + ']'
ep = dict(EpSeq=seq, EpStart=epitopeInds[0], EpEnd=epitopeInds[-1]+1, hlas=[hla])
return True, ep
else:
return False, {}
def findpeptide(pep, seq, returnGapped=False):
    """Find pep in seq ignoring gaps, but returning a start position that counts gaps.
    pep must match seq exactly (otherwise you should be using pairwise alignment)
    seq[startPos:endPos] = pep
    Parameters
    ----------
    pep : str
        Peptide to be found in seq.
    seq : str
        Sequence to be searched.
    returnGapped : bool
        If True, also return the gapped peptide as it appears in seq.
    Returns
    -------
    startPos : int
        Start position (zero-indexed) of pep in seq or -1 if not found
    endPos : int
        Stop + 1 position (zero-indexed) of pep in seq or -1 if not found
        Use as stop in range(start, stop) or slicing [start:stop]"""
ng = seq.replace('-', '')
ngInd = ng.find(pep)
ngCount = 0
pos = 0
"""Count the number of gaps prior to the non-gapped position.
Add them to it to get the gapped position"""
while ngCount < ngInd or seq[pos] == '-':
if not seq[pos] == '-':
ngCount += 1
pos += 1
startPos = ngInd + (pos - ngCount)
pepOut = ''
if startPos == -1:
endPos = -1
else:
count = 0
endPos = startPos
while count < len(pep):
if not seq[endPos] == '-':
count += 1
pepOut += seq[endPos]
else:
pepOut += '-'
endPos += 1
if returnGapped:
return startPos, endPos, pepOut
else:
return startPos, endPos
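# e.g. findpeptide('ABC', 'A-BC-D', returnGapped=True) returns (0, 4, 'A-BC')
# for these hypothetical sequences: the start position counts the gap, and
# seq[0:4] spans the gapped match.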
def sliceRespSeq(r):
"""After the epitope has been identified, slice the response sequence to the appropriate size"""
start, stop = int(r['EpStart'] - r['start']), int(r['EpStart'] - r['start'] + len(r['EpSeq']))
if 'align seq' in r:
s = r['align seq']
else:
s = r['seq']
if start < 0 or stop > len(s):
out = ''
else:
out = s[start:stop]
return out
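# Sketch: for a response row with start=10, seq='ABCDEFGHIJ' and an assigned
# epitope with EpStart=12, EpSeq='CDE', sliceRespSeq returns seq[2:5] = 'CDE';
# if the epitope window falls outside the response peptide, it returns ''.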
def encodeVariants(peps):
"""Encode a list of aligned peptides as a single string
indicating all variants within brackets.
Example
-------
peps = ['KMQKEYALL', 'KKQKEYALL','KMVKEYHAL']
print(encodeVariants(peps))
'K[MK][QV]KEY[HA][LA]L'
"""
peps = [p for p in peps if len(p) > 0]
Ls = np.array([len(s) for s in peps])
assert np.unique(Ls).shape[0] == 1
L = Ls[0]
vs = [list(set([p[i] for p in peps])) for i in range(L)]
out = ''
for v in vs:
if len(v) == 1:
out += v[0]
else:
out += '[' + ''.join(v) + ']'
return out
def decodeVariants(s):
"""Decode a variant string containing brackets,
into a list of all possible peptides. Note that
there are many more decoded variants possible than
may have been used to generate the variant string.
(i.e. it is a lossy compression)"""
vs = []
inGroup = False
i = 0
cur = ''
while i < len(s):
if not s[i] in '[]' and inGroup == False:
vs.append(s[i])
elif not s[i] in '[]':
cur += s[i]
elif s[i] == '[':
inGroup = True
elif s[i] == ']':
inGroup = False
vs.append(cur)
cur = ''
i += 1
out = [''.join(aas) for aas in itertools.product(*vs)]
return out
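# e.g. decodeVariants('K[MK][QV]KEY[HA][LA]L') yields all 16 combinations,
# including peptides such as 'KMQKEYHAL' that were not necessarily among
# the peptides used to build the variant string (hence lossy).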
def plotIsland(island):
    """Plot one island's responses and assigned epitopes. Assumes the island
    DataFrame already includes EpID, EpStart, EpEnd and EpSeq columns."""
    """Build an x-vector and AA vector"""
sitex = []
immunogens = []
for i,resp in island.iterrows():
sitex.extend(_coords(resp, plot=False))
sitex.extend(_epcoords(resp, plot=False))
sitex = np.array(sorted(set(sitex)))
xx = np.arange(len(sitex))
sitex2xx = {i:j for i, j in zip(sitex, xx)}
# figh = plt.gcf()
# figh.clf()
# axh = figh.add_axes([0.05, 0.05, 0.9, 0.9])
# axh.patch.set_facecolor('white')
plt.clf()
uEpIDs = np.unique(island.EpID)
colors = {i:c for i,c in zip(uEpIDs, sns.color_palette('Set1', n_colors=uEpIDs.shape[0]))}
    ss = []
    y = 1
for i,r in island.iterrows():
col = colors[r.EpID]
plt.plot([sitex2xx[r.start], sitex2xx[r.end - 1]], [y, y], '-s', lw=2, mec='gray', color=col)
ann = []
if 'PeptideSet' in r:
ann.append(r['PeptideSet'])
if 'align seqs' in r:
ann.append(r['align seqs'])
if 'SFC/M' in r:
ann.append('{:1.0f} SFC/M'.format(r['SFC/M']))
if 'LANL start' in r and not pd.isnull(r['LANL start']):
if pd.isnull(r['LANL HLA']):
s = 'LANL {}'.format(r['LANL Epitope'])
else:
s = 'LANL {} {}'.format(r['LANL HLA'], r['LANL Epitope'])
ann.append(s)
if len(ann) > 0:
s = ' | '.join(ann)
plt.annotate(s,
xy=(sitex2xx[r.start], y),
xytext=(5, 5),
textcoords='offset points',
ha='left', va='bottom', size='x-small')
for xoffset, aa in enumerate(r.seq):
plt.annotate(aa, xy=(sitex2xx[r.start] + xoffset, y - 0.1), ha='center', va='top', size='small')
"""if not hlaList is None and not ba is None:
ranks,sorti,kmers,ic50,hla=rankKmers(ba,hlaList,r[2],nmer=[8,9,10,11],peptideLength=None)
rankList=ranks.tolist()
mers,inds=getMerInds(r[2],nmer=[8,9,10,11])
notes={}
for curRank in arange(5):
curi=where(ranks==curRank)[0][0]
xcoord=sitex2xx[inds[mers.index(kmers[curi])][0]+r[3]]
if not notes.has_key(xcoord) or notes[xcoord][1] > ic50[curi]:
if AList.issuperset(kmers[curi]):
notes.update({xcoord:(hla[curi][:4],ic50[curi],len(kmers[curi]),'%d|A' % (curRank+1))})
elif BList.issuperset(kmers[curi]):
notes.update({xcoord:(hla[curi][:4],ic50[curi],len(kmers[curi]),'%d|B' % (curRank+1))})
else:
notes.update({xcoord:(hla[curi][:4],ic50[curi],len(kmers[curi]),'%d' % (curRank+1))})
for k,v in notes.items():
annotate('%s\n%1.1f\nL%d\nR%s' % v,
xy=(k,y+0.08),
xytext=(-1,0),textcoords='offset points',
ha='left',va='bottom',size='x-small')"""
ss += [r.start, r.end - 1]
y += 1
y = 0
for e in uEpIDs[::-1]:
r = island.loc[island.EpID == e].iloc[0]
xvec = [sitex2xx[r.EpStart] - 0.3, sitex2xx[r.EpEnd - 1] + 0.3]
plt.fill_between(xvec, [y, y], island.shape[0], color=colors[e], edgecolor='None', alpha=0.2)
for xoffset, aa in enumerate(r.EpSeq):
plt.annotate(aa, xy=(sitex2xx[r.EpStart] + xoffset, y + 0.1),
ha='center', va='bottom', size='small', weight='bold')
ss += [r.EpStart, r.EpEnd - 1]
y -= 1
plt.yticks([])
ss = np.unique(ss)
plt.xlabel('%s coordinate' % r.protein.title(), fontsize='x-large')
plt.xticks([sitex2xx[sx] for sx in ss], ss.astype(int), size='x-large')
plt.xlim((-1, len(xx)))
plt.ylim((-len(uEpIDs), len(island)+1))
handles = [mpl.patches.Patch(facecolor=colors[e], edgecolor='k') for e in uEpIDs]
plt.legend(handles, uEpIDs, loc='best', title='Epitope')
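# Sketch: after merging the epitope columns back onto the responses, a single
# island can be plotted with, e.g.:
#     plotIsland(respDf.loc[(respDf.ptid == 'P1') & (respDf.IslandID == 'I0')])
# ('P1' and 'I0' are hypothetical ptid and island IDs).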
def plotEpitopeMap(respDf, groupDf, uRespCol, groupCol, startCol, endCol, groupOrder=None, groupColors=None, lw=2):
"""Plot map of all responses in respDf (could be epitopes or 15mers"""
nPTIDs = groupDf.ptid.unique().shape[0]
if not groupOrder is None:
uArms = np.asarray(groupOrder)
else:
uArms = groupDf[groupCol].unique()
tmp = respDf.drop_duplicates(['ptid', uRespCol])[['ptid', uRespCol]]
breadthDf = tmp.groupby('ptid')[[uRespCol]].count().reset_index()
breadthDf = pd.merge(groupDf[['ptid', groupCol]].drop_duplicates(), breadthDf, left_on='ptid', right_on='ptid', how='left').fillna(0)
if groupColors is None:
groupColors = sns.color_palette('Set1', n_colors=uArms.shape[0])[::-1]
"""Plot map of epitope responses by PTID"""
plt.clf()
axh = plt.axes([0.05, 0.1, 0.9, 0.8])
np.random.seed(110820)
lasty = 0
yt = []
for a, color in zip(uArms[::-1], groupColors[::-1]):
"""Sort PTIDs by breadth"""
sortedPtids = breadthDf.loc[breadthDf[groupCol] == a].sort_values(by=uRespCol).ptid.tolist()
# sortedPtids = np.random.permutation(respDf.ptid.unique())
for y, ptid in enumerate(sortedPtids):
plotDf = respDf.loc[respDf['ptid'] == ptid].drop_duplicates(subset=['ptid', uRespCol])
yt.append(y + lasty + 0.5)
yyvec = np.random.permutation(np.linspace(0, 1, plotDf.shape[0]))
for yy, st, en in zip(yyvec, plotDf[startCol], plotDf[endCol]):
plt.plot([st, en], [y + yy + lasty, y + yy + lasty],
'-',
color=color,
linewidth=lw)
plt.plot([-5, -5], [lasty, lasty + y + 1], '-', color=color, linewidth=7)
plt.annotate(a,
xy=(-5, np.mean([lasty, lasty + y + 1])),
xycoords='data',
textcoords='offset points',
xytext=(-10, 0),
ha='right',
va='center',
rotation=90,
size=16, weight='heavy',
color='black')
lasty += y + 3
if respDf.protein.str.lower().isin(['gp120', 'env', 'gp160']).all():
keepRegions = ['Signal peptide', 'V1 loop', 'V2 loop',
'Loop D', 'V3 loop', 'V4 loop', 'V5 loop', 'gp41']
reg = pd.read_csv(opj(_git, 'HIV_alignments', 'env_locations.csv'))
reg = reg.loc[reg.region.isin(keepRegions)]
reg.loc[:, 'region'] = reg.region.str.replace(' loop', '').str.replace(' ', '\n')
for i, (region, st, en) in reg.iterrows():
plt.plot([st, en], [lasty + 1, lasty + 1], 'k-', linewidth=6)
plt.annotate(region,
xy=(st + (en-st)/2., lasty+1),
xycoords='data',
textcoords='offset points',
xytext=(0, 10),
ha='center',
va='bottom',
size='medium',
color='black')
#plt.ylabel('Participants', fontsize=16)
plt.yticks(yt, ['' for i in range(len(yt))])
plt.ylim((-1, nPTIDs + 15))
plt.xlabel('HIV %s HXB2 Position' % respDf.protein.unique()[0])
#lims = [respDf['start'].min(), (respDf['start'] + respDf['seq'].map(len)).max()]
lims = [respDf[startCol].min(), respDf[endCol].max()]
xt = np.arange(0, lims[1] + 50, 50)
xt[0] += 1
plt.xticks(xt, xt.astype(int), size=12)
plt.xlim((-10, lims[1] + 50))
for s in ['right', 'top', 'left']:
axh.spines[s].set_visible(False)
handles = [mpl.patches.Patch(facecolor=c, edgecolor='k') for c in groupColors]
#plt.legend(handles, uArms, loc='upper left', fontsize=15, bbox_to_anchor=[1.02, 1.05])
#plt.legend(handles, uArms, loc='upper right', fontsize=15)
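# Sketch (hypothetical column names): plot one line per unique epitope per
# participant, grouped by treatment arm:
#     plotEpitopeMap(epDf, ptidDf, uRespCol='EpID', groupCol='arm',
#                    startCol='EpStart', endCol='EpEnd')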
def plotEpitopeChiclets(respDf, groupDf, uRespCol, groupCol, startCol, endCol,
uPTIDs=None, groupOrder=None, groupColors=None, epColorCol=None,
chiclet_height=0.6, xtick_width=50, fontsize=16, dropNonresponders=False):
    if uPTIDs is None:
        uPTIDs = groupDf.ptid.unique().tolist()
    nPTIDs = len(uPTIDs)
if not groupOrder is None:
uArms = np.asarray(groupOrder)
else:
uArms = groupDf[groupCol].unique()
if groupColors is None:
groupColors = sns.color_palette('Set1', n_colors=uArms.shape[0])[::-1]
if epColorCol is None:
respDf.loc[:, 'EpColor'] = 'None'
else:
respDf.loc[:, 'EpColor'] = respDf[epColorCol]
#lims = [respDf['start'].min(), (respDf['start'] + respDf['seq'].map(len)).max()]
lims = [respDf[startCol].min(), respDf[endCol].max()]
tmp = respDf.drop_duplicates(['ptid', uRespCol])[['ptid', uRespCol]]
breadthDf = tmp.groupby('ptid')[[uRespCol]].count().reset_index()
if dropNonresponders:
breadthDf = pd.merge(groupDf[['ptid', groupCol]].drop_duplicates(), breadthDf,
left_on='ptid', right_on='ptid', how='right').fillna(0)
else:
breadthDf = pd.merge(groupDf[['ptid', groupCol]].drop_duplicates(), breadthDf,
left_on='ptid', right_on='ptid', how='left').fillna(0)
"""Plot map of epitope responses by PTID"""
plt.clf()
axh = plt.axes([0.05, 0.1, 0.85, 0.8])
lasty = 0
yt = []
for a, color in zip(uArms[::-1], groupColors[::-1]):
"""Sort PTIDs by breadth"""
sortedPtids = breadthDf.loc[breadthDf[groupCol] == a].sort_values(by=uRespCol).ptid.tolist()
# sortedPtids = np.random.permutation(respDf.ptid.unique())
for y, ptid in enumerate(sortedPtids):
plotDf = respDf.loc[respDf['ptid'] == ptid].drop_duplicates(subset=['ptid', uRespCol])
yt.append(y + lasty)
rects = []
for st, en, c in zip(plotDf[startCol], plotDf[endCol], plotDf['EpColor']):
L = 5
xloc = np.mean([st, en])
xy = (xloc - L/2, y + lasty - chiclet_height/2)
if c == 'None':
c = color
rects.append(Rectangle(xy, width=L, height=chiclet_height, facecolor=c, alpha=1.0, edgecolor='black', lw=0.5))
# Create patch collection with specified colour/alpha
pc = PatchCollection(rects, match_original=True)
# Add collection to axes
axh.add_collection(pc)
plt.plot([0, lims[1] + 50], [y + lasty]*2, '-', color='gray', lw=0.5, zorder=-10)
plt.plot([-5, -5], [lasty, lasty + y + 1], '-', color=color, linewidth=7)
plt.annotate(a,
xy=(-5, np.mean([lasty, lasty + y + 1])),
xycoords='data',
textcoords='offset points',
xytext=(-3, 0),
ha='right',
va='center',
rotation=90,
size=fontsize, weight='heavy',
color='black')
lasty += y + 3
if (respDf.protein.str.lower().isin(['env', 'gp120', 'gp160'])).all():
keepRegions = ['Signal peptide', 'V1 loop', 'V2 loop',
'Loop D', 'V3 loop', 'V4 loop', 'V5 loop', 'gp41']
reg = pd.read_csv(opj(_git, 'HIV_alignments', 'env_locations.csv'))
reg = reg.loc[reg.region.isin(keepRegions)]
reg.loc[:, 'region'] = reg.region.str.replace(' loop', '').str.replace(' ', '\n')
for i, (region, st, en) in reg.iterrows():
plt.plot([st, en], [lasty + 1, lasty + 1], 'k-', linewidth=6)
plt.annotate(region,
xy=(st + (en-st)/2., lasty+1),
xycoords='data',
textcoords='offset points',
xytext=(0, 10),
ha='center',
va='bottom',
size=fontsize,
color='black')
# plt.ylabel('Participants', fontsize=16)
# plt.yticks(yt, ['' for i in range(nPTIDs)])
plt.yticks(())
plt.ylim((-1, nPTIDs + 10))
plt.xlabel('HIV %s HXB2 Position' % respDf.protein.unique()[0], size=fontsize)
xt = np.arange(0, lims[1] + xtick_width, xtick_width)
xt[0] += 1
plt.xticks(xt, xt.astype(int), size=fontsize)
plt.xlim((-10, lims[1] + 50))
for s in ['right', 'top', 'left']:
axh.spines[s].set_visible(False)
handles = [mpl.patches.Patch(facecolor=c, edgecolor='k') for c in groupColors]
#plt.legend(handles, uArms, loc='upper left', fontsize=15, bbox_to_anchor=[1.02, 0.8])
#plt.legend(handles, uArms, loc='upper right', fontsize=15)
def AlgnFreeOverlapRule(island, overlapD, minOverlap=7, minSharedAA=5, maxMismatches=3):
    """PROBLEM: This is working partly because it still depends on the shared coords.
    Using shared coords greatly reduces the complexity of the problem. Will try to go back
    and get correct coords for each peptide.
    If all responses share an overlap of at least minOverlap positions and
    at least minSharedAA amino acid residues are matched in all responses,
    then the responses are considered 'shared'"""
oParams = dict(overlapD=overlapD,
minOverlap=minOverlap,
minSharedAA=minSharedAA,
maxMismatches=maxMismatches)
"""Find the shared peptide for responses"""
sharedInds = sharedCoords(island)
nOverlap = 0
nMatching = 0
seq = ''
for si in sharedInds:
aas = {respSeq[si - int(respStart)] for respSeq, respStart in zip(island.seq, island.start)}
if '-' in aas:
seq += '-'
else:
nOverlap += 1
if len(aas) == 1:
nMatching += 1
seq += aas.pop()
else:
seq += 'X'
allOverlap = np.all([isoverlapping(a, b, **oParams) for a,b in itertools.product(island.seq, island.seq)])
if allOverlap:
# if nOverlap >= minOverlap and nMatching >= minSharedAA:
        ep = dict(EpSeq=seq, EpStart=sharedInds[0], EpEnd=sharedInds[-1] + 1)  # exclusive stop, consistent with overlapRule
return True, ep
else:
return False, {}
def groupby_parallel(gb, func, ncpus=4):
    """Performs a pandas groupby().apply() operation in parallel.
    func is called as func(name, group) for each group and must return a
    DataFrame; the results are concatenated. Because groups are sent to
    worker processes, func must be picklable (i.e. defined at module level).
    Example
    -------
    def _findEps(name, group):
        return findEpitopes(group, overlapRule, minSharedAA=5, minOverlap=7)
    ep = groupby_parallel(posDf.groupby(['ptid', 'IslandID']), _findEps)"""
    with multiprocessing.Pool(ncpus) as pool:
        result = pool.starmap_async(func, [(name, group) for name, group in gb])
        got = result.get()
    return pd.concat(got)
def realignPeptides(peptides, alignFn, minL=4):
    """Use an alignment to assign peptide locations in alignment coordinates.
    The algorithm first tries to find the whole peptide, reporting all sequence IDs
    that match in the alignment. On successive attempts it tries to match kmers
    of decreasing length, down to a minimum of minL.
    Parameters
    ----------
    peptides : iterator
        Collection of peptides (e.g. list or pd.Series)
    alignFn : str
        Filename of a FASTA formatted multiple sequence alignment
    minL : None or int
        A value of None requires that exact matches be found in the alignment;
        otherwise a minimum of minL AAs must match
    Returns
    -------
    resDf : pd.DataFrame, one row per peptide
        Columns include: align start, align end, align subseq, align seq,
        realign seq, matched subseq, seq, subpos, subL, L, align seqs"""
seqs = skbio.read(alignFn, format='fasta')
seqs = {s.metadata['id']:str(s) for s in seqs}
def _findOne(pep, pos=None, L=None):
origPep = pep
if pos is None:
pos = 0
if L is None:
L = len(pep)
pep = pep[pos:pos + L]
names = []
startPos, endPos = -1, -1
gapped = ''
for name, s in seqs.items():
st, en, g = findpeptide(pep, s, returnGapped=True)
if st >= 0:
names.append(name)
startPos, endPos = st, en
gapped = g
core = gapped
afterCore = origPep[pos + L:]
beforeCore = origPep[:pos]
endAdd = len(origPep) - (pos + L)
add = 0
afteri = 0
while add < endAdd:
if not s[endPos] == '-':
add += 1
core = core + afterCore[afteri]
afteri += 1
else:
core = core + '-'
endPos += 1
add = 0
beforei = 0
while add < pos:
if not s[startPos-1] == '-':
add += 1
core = beforeCore[pos - beforei - 1] + core
beforei += 1
else:
core = '-' + core
startPos -= 1
"""For L < len(origPep) this is filling AAs from the alignment"""
matched = s[startPos:endPos]
"""while core fills from the original peptide sequence"""
if startPos == -1:
pep = ''
firstN = None
lastN = None
matched = ''
core = ''
out = pd.Series({'align start':startPos,
'align end':endPos,
'align subseq':gapped,
'align seq':matched,
'realign seq':core,
'matched subseq':pep,
'seq':origPep,
'subpos':pos,
'subL':L,
'L':len(origPep),
'align seqs':'|'.join(names)})
return out
resL = []
for pep in peptides:
out = _findOne(pep)
if not minL is None:
for k in range(len(pep), minL, -1):