-
Notifications
You must be signed in to change notification settings - Fork 0
/
idXML2df.py
78 lines (69 loc) · 2.33 KB
/
idXML2df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
from pyopenms import IdXMLFile
# convert every column whose values all parse as numbers into a float column
def strToFloat(df: pd.DataFrame) -> pd.DataFrame:
    """Convert each column of *df* to float where every value parses.

    Columns are processed one at a time and replaced in place. A column is
    left untouched when any of its values cannot be passed to ``float()``:
    either the text is not numeric (``ValueError``) or the value is not a
    string or number at all, e.g. ``None`` (``TypeError``).

    Args:
        df: DataFrame to convert; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in df:
        try:
            df[col] = [float(value) for value in df[col]]
        except (ValueError, TypeError):
            # Not a purely numeric column: keep the original values.
            continue
    return df
def _decode_if_bytes(value):
    """Return *value* decoded to ``str`` if it is ``bytes``, else unchanged."""
    return value.decode() if isinstance(value, bytes) else value


def readAndProcessIdXML(input_file, top=1):
    """Convert an identification file (.idXML) to a pandas DataFrame.

    Every peptide identification in the file contributes up to ``top`` rows,
    one per hit, taken in the order ``getHits()`` returns them. Each row holds
    the fixed columns listed in ``base_columns`` below, followed by one column
    per meta value key found on the first hit that is parsed.

    Args:
        input_file: Path of the idXML file, passed to ``IdXMLFile().load``.
        top: Maximum number of hits to keep per peptide identification.
            Values below 1 keep no hits.

    Returns:
        A DataFrame with one row per retained hit. ``SpecId`` is cast to
        ``str``, ``PSMId``, ``Label``, ``ScanNr`` and ``peplen`` to ``int``
        and ``Score`` to ``float``. When the file yields no hits the frame is
        empty but still carries the fixed columns.
    """
    base_columns = ['SpecId', 'PSMId', 'Label', 'Score', 'ScanNr', 'Peptide',
                    'peplen', 'ExpMass', 'charge2', 'charge3', 'charge4',
                    'charge5', 'accessions']

    prot_ids = []
    pep_ids = []
    IdXMLFile().load(input_file, prot_ids, pep_ids)

    # None means "not read yet"; an empty list is a valid result, so it
    # cannot double as the first-run marker.
    meta_value_keys = None
    rows = []
    for peptide_id in pep_ids:
        spectrum_id = _decode_if_bytes(
            peptide_id.getMetaValue("spectrum_reference"))
        # The scan number is whatever follows the last '=' in the reference.
        scan_nr = spectrum_id[spectrum_id.rfind('=') + 1:]

        for psm_index, hit in enumerate(peptide_id.getHits(), start=1):
            if psm_index > top:
                break

            charge = hit.getCharge()
            # One indicator column per charge state 2..5.
            charge_flags = [int(charge == z) for z in (2, 3, 4, 5)]

            target_decoy = _decode_if_bytes(hit.getMetaValue("target_decoy"))
            # Substring test: any value containing "target" is labelled 1.
            label = int("target" in target_decoy)

            sequence = hit.getSequence().toString()

            if meta_value_keys is None:
                # The first parsed hit defines the meta value columns.
                raw_keys = []
                hit.getKeys(raw_keys)
                meta_value_keys = [_decode_if_bytes(k) for k in raw_keys]

            accessions = ';'.join(
                _decode_if_bytes(acc)
                for acc in hit.extractProteinAccessionsSet())

            row = [spectrum_id, psm_index, label, hit.getScore(), scan_nr,
                   sequence, len(sequence), peptide_id.getMZ(),
                   *charge_flags, accessions]
            # Meta values follow the fixed columns, in key order.
            row.extend(_decode_if_bytes(hit.getMetaValue(key))
                       for key in meta_value_keys)
            rows.append(row)

    all_columns = base_columns + (meta_value_keys or [])
    df = pd.DataFrame(rows, columns=all_columns)
    convert_dict = {
        'SpecId': str,
        'PSMId': int,
        'Label': int,
        'Score': float,
        'ScanNr': int,
        'peplen': int,
    }
    return df.astype(convert_dict)