forked from xiaojunxu/dnn-binary-code-similarity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
264 lines (218 loc) · 7.85 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve
from graphnnSiamese import graphnn
import json
def get_f_name(DATA, SF, CM, OP, VS):
    """Build the list of JSON file names for every option combination.

    Each name is the plain concatenation ``DATA + sf + cm + op + vs + ".json"``
    where ``sf``, ``cm``, ``op`` and ``vs`` range over ``SF``, ``CM``, ``OP``
    and ``VS`` respectively.  The result is ordered with ``SF`` varying
    slowest and ``VS`` varying fastest.

    Args:
        DATA: prefix prepended to every name.
        SF, CM, OP, VS: iterables of name fragments.

    Returns:
        A list of file-name strings (empty if any input iterable is empty).
    """
    # Local import keeps this block self-contained.
    from itertools import product

    # product() materialises each iterable once, so one-shot iterators work
    # too; the hand-nested loops would exhaust them after the first pass.
    return [
        DATA + sf + cm + op + vs + ".json"
        for sf, cm, op, vs in product(SF, CM, OP, VS)
    ]
def get_f_dict(F_NAME):
    """Map every distinct ``'fname'`` found in the given files to an integer id.

    Each file is read line by line; every non-blank line is parsed as one
    JSON object and its ``'fname'`` entry is looked up.  Ids are assigned
    consecutively from 0 in order of first appearance across all files.

    Args:
        F_NAME: iterable of paths to JSON-lines files.

    Returns:
        dict mapping each ``'fname'`` value to its integer id.
    """
    name_dict = {}
    for f_name in F_NAME:
        with open(f_name) as inf:
            for line in inf:
                line = line.strip()
                if not line:
                    # A blank line is not a JSON document; skip it instead of
                    # letting json.loads('') abort the whole scan.
                    continue
                g_info = json.loads(line)
                # The next free id is always the current dict size.
                name_dict.setdefault(g_info['fname'], len(name_dict))
    return name_dict
class graph(object):
    """Directed graph with one feature vector per node.

    Attributes:
        node_num: number of nodes.
        label: label passed to the constructor (may be reassigned by callers).
        name: name passed to the constructor.
        features: list with one feature container per node.
        succs: ``succs[u]`` lists the targets of edges leaving node ``u``.
        preds: ``preds[v]`` lists the sources of edges entering node ``v``.
    """

    def __init__(self, node_num = 0, label = None, name = None):
        """Create a graph with ``node_num`` nodes, no edges and empty features."""
        self.node_num = node_num
        self.label = label
        self.name = name
        count = node_num if node_num > 0 else 0
        # Every node gets its own fresh list; nothing is shared between nodes.
        self.features = [[] for _ in range(count)]
        self.succs = [[] for _ in range(count)]
        self.preds = [[] for _ in range(count)]

    def add_node(self, feature = None):
        """Append a new node.

        Args:
            feature: feature container for the node.  When omitted a new empty
                list is created for this node (a literal ``[]`` default would
                be one list object shared by every node added without an
                explicit feature).
        """
        if feature is None:
            feature = []
        self.node_num += 1
        self.features.append(feature)
        self.succs.append([])
        self.preds.append([])

    def add_edge(self, u, v):
        """Add the directed edge ``u -> v`` to both adjacency lists."""
        self.succs[u].append(v)
        self.preds[v].append(u)

    def toString(self):
        """Serialise the graph as text.

        The first line is ``"<node_num> <label>"``.  Each following line
        describes one node: its feature values (each followed by a space),
        the number of successors, then the successor indices separated by
        spaces.  Every line ends with a newline.
        """
        parts = ['{} {}\n'.format(self.node_num, self.label)]
        for u in range(self.node_num):
            parts.extend('{} '.format(fea) for fea in self.features[u])
            parts.append(str(len(self.succs[u])))
            parts.extend(' {}'.format(succ) for succ in self.succs[u])
            parts.append('\n')
        # One join instead of repeated string concatenation.
        return ''.join(parts)
def read_graph(F_NAME, FUNC_NAME_DICT, FEATURE_DIM):
    """Load graphs from JSON-lines files and group them by function label.

    Every non-blank line is parsed as a JSON object from which the keys
    ``'fname'``, ``'n_num'``, ``'src'``, ``'features'`` and ``'succs'`` are
    read.  The graph's label is ``FUNC_NAME_DICT[g_info['fname']]``.

    Args:
        F_NAME: iterable of file paths.
        FUNC_NAME_DICT: mapping from ``'fname'`` to an integer label used as
            an index into ``classes``.  NOTE(review): labels are presumably
            ``0 .. len(FUNC_NAME_DICT) - 1`` (as produced by ``get_f_dict``);
            a larger label would index past the end of ``classes``.  If this
            is ``None`` no class buckets are created and the first parsed
            line fails on the label lookup.
        FEATURE_DIM: not used by this function; kept for interface
            compatibility.

    Returns:
        ``(graphs, classes)`` where ``graphs`` is the list of loaded graph
        objects and ``classes[label]`` lists the indices (into ``graphs``) of
        all graphs carrying that label.
    """
    graphs = []
    classes = []
    if FUNC_NAME_DICT is not None:
        classes = [[] for _ in range(len(FUNC_NAME_DICT))]

    for f_name in F_NAME:
        with open(f_name) as inf:
            for line in inf:
                line = line.strip()
                if not line:
                    # Blank lines are not JSON documents; skip them.
                    continue
                g_info = json.loads(line)
                label = FUNC_NAME_DICT[g_info['fname']]
                # Record the index this graph is about to receive.
                classes[label].append(len(graphs))
                n_num = g_info['n_num']
                cur_graph = graph(n_num, label, g_info['src'])
                for u in range(n_num):
                    cur_graph.features[u] = np.array(g_info['features'][u])
                    for v in g_info['succs'][u]:
                        cur_graph.add_edge(u, v)
                graphs.append(cur_graph)

    return graphs, classes
def partition_data(Gs, classes, partitions, perm):
    """Split the classes (and their graphs) into consecutive partitions.

    Classes are visited in the order given by ``perm``.  Each entry of
    ``partitions`` is a fraction of the total class count; partition ``k``
    receives the classes at positions ``int(start) .. int(end) - 1`` of that
    order, where ``start``/``end`` are running sums of ``fraction * len(classes)``.

    Within a partition, classes are renumbered from 0 and graphs are
    re-indexed from 0.  Note that the ``label`` attribute of the graph
    objects in ``Gs`` is overwritten in place with the new class number.

    Returns:
        A flat list ``[graphs_0, classes_0, graphs_1, classes_1, ...]`` with
        two entries per partition.
    """
    num_classes = len(classes)
    result = []
    lower = 0.0
    for fraction in partitions:
        upper = lower + fraction * num_classes
        subset_graphs = []
        subset_classes = []
        for new_label, slot in enumerate(range(int(lower), int(upper))):
            members = []
            for g_idx in classes[perm[slot]]:
                g = Gs[g_idx]
                g.label = new_label
                # Position this graph will occupy inside subset_graphs.
                members.append(len(subset_graphs))
                subset_graphs.append(g)
            subset_classes.append(members)
        result.extend((subset_graphs, subset_classes))
        lower = upper
    return result
def generate_epoch_pair(Gs, classes, M, output_id = False, load_id = None):
    """Build all mini-batches for one epoch via ``get_pair``.

    Args:
        Gs: list of graphs.
        classes: per-class lists of graph indices.
        M: number of graphs consumed per batch when sampling new pairs.
        output_id: when true, also return the sampled pair ids.
        load_id: previously returned id data; when given, batches are rebuilt
            from these ids instead of being sampled (``M`` is then only
            forwarded to ``get_pair``).

    Returns:
        ``epoch_data``, a list of ``(X1, X2, m1, m2, y)`` tuples, or
        ``(epoch_data, id_data)`` when ``output_id`` is true.  ``id_data`` is
        a list with one ``(pos_ids, neg_ids)`` entry per batch, i.e.
        ``[([(G0, G1), ...], [(G0, H0), ...]), ...]``.

    Raises:
        ValueError: if new pairs must be sampled from a non-empty ``Gs`` with
            ``M <= 0`` (the batch start would never advance).
    """
    epoch_data = []

    if load_id is None:
        id_data = []
        total = len(Gs)
        if total > 0 and M <= 0:
            # Without this guard the loop below never terminates.
            raise ValueError(
                'batch size M must be positive, got {}'.format(M))
        st = 0
        while st < total:
            batch = get_pair(Gs, classes, M, st=st, output_id=output_id)
            if output_id:
                # The last two entries are the positive and negative id lists.
                id_data.append( (batch[5], batch[6]) )
            epoch_data.append( tuple(batch[:5]) )
            st += M
    else:   # Rebuild the batches from previously stored ids.
        id_data = load_id
        for id_pair in id_data:
            X1, X2, m1, m2, y = get_pair(Gs, classes, M, load_id=id_pair)
            epoch_data.append( (X1, X2, m1, m2, y) )

    if output_id:
        return epoch_data, id_data
    return epoch_data
def _fill_graph_slot(X, mask, row, g):
    """Copy graph ``g`` into row ``row`` of a padded feature/adjacency pair."""
    for u in range(g.node_num):
        X[row, u, :] = np.array(g.features[u])
        for v in g.succs[u]:
            mask[row, u, v] = 1


def get_pair(Gs, classes, M, st = -1, output_id = False, load_id = None):
    """Build one batch of positive and negative graph pairs as padded arrays.

    When ``load_id`` is ``None`` pairs are sampled for the graphs
    ``Gs[st : st + M]`` (clipped to ``len(Gs)``): each graph whose class has
    at least two members gets one positive partner drawn from the same class
    (never itself), and every graph gets one negative partner drawn from a
    different, non-empty class.  Otherwise ``load_id`` is taken as
    ``(pos_ids, neg_ids)`` and nothing is sampled.

    Args:
        Gs: list of graphs exposing ``node_num``, ``label``, ``features`` and
            ``succs``.
        classes: ``classes[c]`` lists the indices in ``Gs`` of class ``c``.
        M: number of anchor graphs to sample pairs for.
        st: index of the first anchor graph; pass it explicitly when
            ``load_id`` is ``None``.
        output_id: when true, also return the pair id lists.
        load_id: optional ``(pos_ids, neg_ids)`` to rebuild a batch from.

    Returns:
        ``(X1, X2, mask1, mask2, y)`` and, if ``output_id`` is true,
        additionally ``pos_ids, neg_ids``.  With ``B`` the total number of
        pairs, ``X1``/``X2`` have shape ``(B, maxN, feature_dim)`` and
        ``mask1``/``mask2`` shape ``(B, maxN, maxN)``, zero-padded to the
        largest graph on each side; ``mask[i, u, v]`` is 1 for every edge
        ``u -> v``.  ``y`` is 1 for the positive pairs (listed first) and -1
        for the negative pairs.  ``feature_dim`` is taken from the first node
        of ``Gs[0]``.

    Raises:
        ValueError: if a negative partner is needed but no other non-empty
            class exists (sampling would otherwise retry forever).
    """
    if load_id is None:
        C = len(classes)

        if st + M > len(Gs):
            M = len(Gs) - st
        ed = st + M

        nonempty = sum(1 for members in classes if len(members) > 0)

        pos_ids = []    # [(G_0, G_1)]
        neg_ids = []    # [(G_0, H_0)]

        for g_id in range(st, ed):
            cls = Gs[g_id].label
            same_cls = classes[cls]
            tot_g = len(same_cls)

            # Checked before drawing so the random sequence is unchanged for
            # every input that could be sampled successfully.
            if nonempty - (1 if tot_g > 0 else 0) < 1:
                raise ValueError(
                    'no non-empty class other than {} to draw a negative '
                    'sample from'.format(cls))

            if tot_g >= 2:
                g1_id = same_cls[np.random.randint(tot_g)]
                while g_id == g1_id:
                    g1_id = same_cls[np.random.randint(tot_g)]
                pos_ids.append( (g_id, g1_id) )

            cls2 = np.random.randint(C)
            while (len(classes[cls2]) == 0) or (cls2 == cls):
                cls2 = np.random.randint(C)

            other_cls = classes[cls2]
            h_id = other_cls[np.random.randint(len(other_cls))]
            neg_ids.append( (g_id, h_id) )
    else:
        pos_ids = load_id[0]
        neg_ids = load_id[1]

    M_pos = len(pos_ids)
    M_neg = len(neg_ids)
    M = M_pos + M_neg

    # Positives occupy the first M_pos rows, negatives the remaining ones.
    all_pairs = list(pos_ids) + list(neg_ids)
    maxN1 = max([0] + [Gs[pair[0]].node_num for pair in all_pairs])
    maxN2 = max([0] + [Gs[pair[1]].node_num for pair in all_pairs])

    feature_dim = len(Gs[0].features[0])
    X1_input = np.zeros((M, maxN1, feature_dim))
    X2_input = np.zeros((M, maxN2, feature_dim))
    node1_mask = np.zeros((M, maxN1, maxN1))
    node2_mask = np.zeros((M, maxN2, maxN2))
    y_input = np.zeros((M))
    y_input[:M_pos] = 1
    y_input[M_pos:] = -1

    for row, pair in enumerate(all_pairs):
        _fill_graph_slot(X1_input, node1_mask, row, Gs[pair[0]])
        _fill_graph_slot(X2_input, node2_mask, row, Gs[pair[1]])

    if output_id:
        return X1_input,X2_input,node1_mask,node2_mask,y_input,pos_ids,neg_ids
    return X1_input,X2_input,node1_mask,node2_mask,y_input
def train_epoch(model, graphs, classes, batch_size, load_data=None):
    """Train ``model`` for one epoch and return the mean batch loss.

    Batches come from ``load_data`` when it is given, otherwise they are
    generated with ``generate_epoch_pair(graphs, classes, batch_size)``.
    They are visited in a random order and each one is passed to
    ``model.train(X1, X2, mask1, mask2, y)``, whose return value is
    accumulated as the batch loss.

    Returns:
        The accumulated loss divided by the number of batches.
    """
    batches = load_data
    if batches is None:
        batches = generate_epoch_pair(graphs, classes, batch_size)

    # Visit the batches in a freshly shuffled order.
    order = np.random.permutation(len(batches))

    total_loss = 0.0
    for batch_idx in order:
        X1, X2, mask1, mask2, y = batches[batch_idx]
        total_loss += model.train(X1, X2, mask1, mask2, y)

    return total_loss / len(order)
def get_auc_epoch(model, graphs, classes, batch_size, load_data=None):
    """Compute the ROC curve and its AUC for ``model`` over one epoch of pairs.

    Batches come from ``load_data`` when it is given, otherwise they are
    generated with ``generate_epoch_pair(graphs, classes, batch_size)``.
    For every batch ``model.calc_diff(X1, X2, m1, m2)`` is evaluated; a pair
    counts as a true match when its ``y`` entry is greater than 0.  The score
    handed to ``roc_curve`` is ``(1 - diff) / 2``.
    NOTE(review): this presumably assumes ``calc_diff`` yields one value per
    pair, with smaller values meaning "more similar" -- confirm against the
    model implementation.

    Returns:
        ``(model_auc, fpr, tpr, thres)`` as produced by ``roc_curve``/``auc``.
    """
    if load_data is not None:
        batches = load_data
    else:
        batches = generate_epoch_pair(graphs, classes, batch_size)

    all_diff = []
    all_truth = []
    for X1, X2, m1, m2, y in batches:
        all_diff.extend(model.calc_diff(X1, X2, m1, m2))
        all_truth.extend(y > 0)

    scores = (1 - np.array(all_diff)) / 2
    fpr, tpr, thres = roc_curve(np.array(all_truth), scores)
    return auc(fpr, tpr), fpr, tpr, thres