-
Notifications
You must be signed in to change notification settings - Fork 0
/
ms2pip_features.py
238 lines (208 loc) · 8.95 KB
/
ms2pip_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
from psm_utils.io.peptide_record import PeptideRecordReader
from collections import defaultdict
from psm_utils.io import peptide_record
from psm_utils.io import write_file
from tqdm import tqdm
import pandas as pd
from argparser import args
from psm_utils.io import convert
from Data_parser import read_pin_file, read_features_config
# Default settings shared by the functions in this module.
# initilize_CONFIG() rebinds this name with the paths of the current run.
CONFIG = {
    "ms2rescore": {
        "tmp_path": "",
        "spectrum_path": "",
        "output_path": "",
        "psm_file": "",
        "psm_id_pattern": None,
        # The capture group keeps the
        # "controllerType=0 controllerNumber=1 scan=<n>" part of the id;
        # used to take the predictions of all rank PSMs.
        "spectrum_id_pattern": ".*_(controllerType=0 controllerNumber=1 scan=[0-9]+)_.*",
        "processes": 32,
        "num_cpu": 4,
    },
    "ms2pip": {
        "model": "HCD",
        "frag_error": 0.02,
    },
}
def initilize_CONFIG(mgf_file: str, out_pin_file: str, psm_file: str):
    """
    Rebuild the module-level CONFIG for one run and return it.

    mgf_file is stored as the 'spectrum_path', out_pin_file as the
    'output_path' and psm_file as the 'psm_file' entry of the
    'ms2rescore' section. All remaining entries are fixed values.
    The global CONFIG is replaced by a brand-new dict on every call.
    """
    global CONFIG

    rescore_section = {
        "tmp_path": "",
        "spectrum_path": mgf_file,
        "output_path": out_pin_file,
        "psm_file": psm_file,
        "psm_id_pattern": None,
        "spectrum_id_pattern": ".*_(controllerType=0 controllerNumber=1 scan=[0-9]+)_.*",
        "processes": 32,
        "num_cpu": 32,
    }
    ms2pip_section = {
        "model": "HCD",
        "frag_error": 0.02,
    }

    CONFIG = {"ms2rescore": rescore_section, "ms2pip": ms2pip_section}
    return CONFIG
def get_psm_list(inputfile):
    """
    Load the PSMs stored in a peprec file.

    A PeptideRecordReader is created for inputfile and whatever its
    read_file() call yields is returned unchanged.
    """
    reader = PeptideRecordReader(inputfile)
    # The reader's filename attribute is explicitly pointed at the input
    # path before reading.
    reader.filename = inputfile
    psms = reader.read_file()
    return psms
def Take_ms2pip_rescore_features(psm_list):
    """
    Add MS2PIP rescoring features to the PSMs of psm_list.

    The PSMs are handled in batches: batch 1 holds the first PSM seen
    for every spectrum_id, batch 2 the second one, and so on. For each
    batch a new MS2PIPFeatureGenerator is built from CONFIG and its
    add_features() is called on that slice of psm_list. Nothing is
    returned.
    """
    print("Ms2PIP-rescore features gathering")
    from ms2rescore.feature_generators import ms2pip

    # occurrence[k] == 1 for the first PSM of its spectrum, 2 for the
    # second PSM of the same spectrum, ...
    seen = defaultdict(int)
    occurrence = []
    for spec_id in psm_list["spectrum_id"]:
        seen[spec_id] += 1
        occurrence.append(seen[spec_id])

    # At least one batch is always processed, even for an empty list.
    deepest = max(occurrence, default=1)

    for batch_no in range(1, deepest + 1):
        generator = ms2pip.MS2PIPFeatureGenerator(
            CONFIG,
            processes=CONFIG["ms2rescore"]["processes"],
            spectrum_path=CONFIG["ms2rescore"]["spectrum_path"],
            spectrum_id_pattern=CONFIG["ms2rescore"]["spectrum_id_pattern"],
        )
        in_batch = [idx == batch_no for idx in occurrence]
        generator.add_features(psm_list[in_batch])
def write_pin(psm_list):
    """
    Write psm_list as a Percolator pin file.

    The destination is CONFIG['ms2rescore']['output_path']; the feature
    columns are the rescoring_features keys of the first PSM in the list.
    """
    destination = CONFIG['ms2rescore']["output_path"]
    # The first PSM decides which feature names are written.
    feature_names = psm_list[0].rescoring_features.keys()
    write_file(
        psm_list,
        filename=destination,
        filetype="percolator",
        style="pin",
        feature_names=feature_names,
    )
def unique(list1):
    """
    Return the distinct items of list1 as a list, in first-seen order.

    Hashable items are de-duplicated in a single pass through a dict,
    which keeps insertion order. If any item is unhashable (e.g. a
    list), the function falls back to a linear membership scan so such
    inputs still work.
    """
    # Materialise once so that iterators survive the fallback path.
    items = list(list1)
    try:
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items cannot be dict keys: compare by equality.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list
def Take_ms2pip_features(psm_list, out_file):
    """
    Run MS2PIP on the PSMs and write one row of ion lists per spectrum to CSV.

    The PSMs are processed in batches: batch 1 holds the first PSM seen
    for every spectrum_id, batch 2 the second one, and so on. For every
    batch, MS2PIP is run and its result is collapsed to one row per
    spec_id with the columns
    spec_id, rank, ions_series, ions_charge, ions_mz, ions_pred, ions_targ.
    The rows of all batches are then regrouped in the order in which the
    spectrum ids first appear in psm_list and written to
    args.out + <last '/'-component of out_file> + "_MSPIP.csv".

    Returns the path of the written CSV file.
    """
    print("update CONFIG for MS2PIP feature-----")
    from ms2pip.ms2pipC import MS2PIP

    # read_features_config yields eight values; only the last one (the
    # MS2PIP modification settings) is used in this function.
    _, _, _, _, _, _, _, ms2pip_modifications = read_features_config(args.feat_config)
    CONFIG["ms2pip"].update({"ptm": ms2pip_modifications, "sptm": [], "gptm": []})
    print("Extracting_ms2pip_features------")

    # Spectrum ids in first-seen order; used at the end to regroup rows.
    ordered_spec_ids = unique(psm_list["spectrum_id"])

    # occurrence[k] == 1 for the first PSM of its spectrum, 2 for the
    # second PSM of the same spectrum, ...
    seen = defaultdict(int)
    occurrence = []
    for spec_id in psm_list["spectrum_id"]:
        seen[spec_id] += 1
        occurrence.append(seen[spec_id])
    deepest = max(occurrence, default=1)

    columns = ["spec_id", "rank", "ions_series", "ions_charge", "ions_mz", "ions_pred", "ions_targ"]
    per_batch_tables = []
    for batch_no in range(1, deepest + 1):
        batch = psm_list[[idx == batch_no for idx in occurrence]]
        runner = MS2PIP(
            peptide_record.to_dataframe(batch),
            spec_file=CONFIG["ms2rescore"]["spectrum_path"],
            spectrum_id_pattern=CONFIG["ms2rescore"]["spectrum_id_pattern"],
            params=CONFIG,
            return_results=True,
            num_cpu=CONFIG["ms2rescore"]["num_cpu"],
        )
        pred_and_emp = runner.run()

        # NOTE(review): the rank label advances once per batch ("0" for
        # the first-occurrence batch); confirm this is the intended nesting.
        rank_label = str(batch_no - 1)

        table = {name: [] for name in columns}
        for spec_id in list(pred_and_emp["spec_id"].unique()):
            ions = pred_and_emp.loc[pred_and_emp["spec_id"] == spec_id]
            table["spec_id"].append(spec_id)
            table["rank"].append(rank_label)
            table["ions_charge"].append(list(ions["charge"].unique()))
            # Ion label = ion type followed by its number.
            table["ions_series"].append(list(ions["ion"].astype(str) + ions["ionnumber"].astype(str)))
            table["ions_mz"].append(list(ions["mz"]))
            table["ions_pred"].append(list(ions["prediction"]))
            table["ions_targ"].append(list(ions["target"]))
        per_batch_tables.append(pd.DataFrame(table, columns=columns))

    combined = pd.concat(per_batch_tables, join="inner")

    # Put the rows of each spectrum next to each other, following the
    # order in which the spectra first occur in psm_list.
    rows_per_spectrum = [
        combined.loc[combined["spec_id"] == str(spec)] for spec in ordered_spec_ids
    ]
    final_psms_df = pd.concat(rows_per_spectrum, join="inner")

    csv_path = args.out + out_file.split('/')[-1] + "_MSPIP.csv"
    final_psms_df.to_csv(csv_path)
    print("MSPIP features written at: ", csv_path)
    return csv_path
def Take_MS2PIP_features():
    """
    Extract the MS2PIP intensity features for the files named in args.

    Steps:
    * derive the output stem from args.id (file name without directories,
      cut at its first '.');
    * if args.peprec is None, convert args.id to
      args.out + stem + '.peprec' and store that path in args.peprec;
    * if args.mgf is given, initialise CONFIG, read the PSMs and run
      Take_ms2pip_features on them.

    Returns the CSV path returned by Take_ms2pip_features, or None (after
    printing an error message) when args.mgf is None.
    """
    # Drop the directory part BEFORE cutting at the first '.': splitting
    # the whole path on '.' gives an empty or wrong stem for inputs such
    # as './data/x.idXML' or '/home/user.name/x.idXML'.
    stem = args.id.split('/')[-1].split('.')[0]
    out_pin_file = args.out + stem + '.pin'

    if args.peprec is None:
        print("converting .idXML to .peprec format")
        peprec_file = args.out + stem + '.peprec'
        convert(args.id, peprec_file)
        print(".peprec written at: ", peprec_file)
        args.peprec = peprec_file

    if args.mgf is None:
        print("Error: unable_initialized please provide (.mgf) file")
        return None

    # initilize_CONFIG also rebinds the module-level CONFIG that the
    # helper functions read.
    config = initilize_CONFIG(args.mgf, out_pin_file, args.peprec)
    print("Initialized MS2PIP CONFIG----\n", config)
    psm_list = get_psm_list(config["ms2rescore"]["psm_file"])
    return Take_ms2pip_features(psm_list, stem)
def Take_MS2PIP_rescore_features():
    """
    Extract the MS2PIP rescore features (.pin file) for the files named in args.

    Steps:
    * derive the output stem from args.id (file name without directories,
      cut at its first '.'); the pin file is args.out + stem + '.pin';
    * if args.peprec is None, convert args.id to
      args.out + stem + '.peprec' and store that path in args.peprec;
    * if args.mgf is given, initialise CONFIG, read the PSMs, add the
      MS2PIP rescoring features, write the pin file and read it back
      with read_pin_file.

    Returns the value of read_pin_file(out_pin_file), or None (after
    printing an error message) when args.mgf is None.
    """
    # Drop the directory part BEFORE cutting at the first '.': splitting
    # the whole path on '.' gives an empty or wrong stem for inputs such
    # as './data/x.idXML' or '/home/user.name/x.idXML'.
    stem = args.id.split('/')[-1].split('.')[0]
    out_pin_file = args.out + stem + '.pin'

    if args.peprec is None:
        print("converting .idXML to .peprec format")
        peprec_file = args.out + stem + '.peprec'
        convert(args.id, peprec_file)
        print(".peprec written at: ", peprec_file)
        args.peprec = peprec_file

    if args.mgf is None:
        print("Error: unable_initialized please provide .mgf file")
        return None

    # initilize_CONFIG also rebinds the module-level CONFIG that the
    # helper functions read.
    config = initilize_CONFIG(args.mgf, out_pin_file, args.peprec)
    print("Initialized MS2PIP CONFIG----\n", config)
    psm_list = get_psm_list(config["ms2rescore"]["psm_file"])

    # Give every PSM its own empty feature dict before features are added.
    psm_list["rescoring_features"] = [{} for _ in range(len(psm_list))]
    Take_ms2pip_rescore_features(psm_list)
    write_pin(psm_list)

    ms2pip_rescore_feat = read_pin_file(out_pin_file)
    print("MSPIP rescore features written at: ", out_pin_file)
    return ms2pip_rescore_feat