tokenizer.py
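"""Tokenize disassembled executables into sequences of integer ids.

Each instruction line is reduced to an (opcode, operand_1, operand_2) triple:
memory operands are abstracted to ADDR or *_PTR placeholders, and each element
is mapped to an id from a frequency-sorted vocabulary of size H.vocab_size
(id H.vocab_size + 1 is reserved for out-of-vocabulary tokens, written as SPCL).
The input is assumed to be objdump-style Intel-syntax disassembly, one
instruction per indented line.
"""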
import os
import json

from paths import Paths as P
from hyperparams import Hyperparams as H


def check_for_addr(operand):
    # memory/address operands: hex literals, brackets, segment-style ':' or <symbol> references
    if operand.find('0x') != -1 or operand.find('[') != -1 \
            or operand[2:3] == ':' or operand.find('<') != -1:
        return True
    return False


def get_ptr(operand):
    # map an explicit size specifier (e.g. "QWORD PTR [...]") to a placeholder token
    if operand.find('Q') != -1:
        return 'QWORD_PTR'
    if operand.find('D') != -1:
        return 'DWORD_PTR'
    if operand.find('W') != -1:
        return 'WORD_PTR'
    if operand.find('B') != -1:
        return 'BYTE_PTR'
    if operand.find('F') != -1:
        return 'FWORD_PTR'
    if operand.find('X') != -1:
        return 'XMMWORD_PTR'
    return None  # caller treats None as EMPTY


def check_line(line):
    # skip empty lines, directives/labels starting with '.' and lines starting with '(' (e.g. "(bad)")
    if line == '' or line[0] == '.' or line[0] == '(':
        return True
    return False


def remove_prefix(inp_line):
    # strip instruction prefixes (rep variants, lock, segment overrides, ...);
    # also report whether anything was removed
    line = inp_line
    for prefix in ['repz', 'repnz', 'rep', 'lock', 'rex', 'cs ', 'fs ', 'ds ', 'es ', 'gs ', 'ss ', 'bnd ']:
        line = line.replace(prefix, '')
    return line, line != inp_line


benign_files = os.listdir(P.benign_exe_disassembled)
malicious_files = os.listdir(P.malicious_exe_disassembled)

# load token frequencies and keep the H.vocab_size most frequent tokens
with open(P.count_dict_json, 'r') as jf:
    count_dict = json.load(jf)

sorted_tokens = sorted(count_dict, key=lambda x: count_dict[x], reverse=True)
sorted_tokens = sorted_tokens[:H.vocab_size]
tokens = set(sorted_tokens)  # for efficient lookup

# ids 1..H.vocab_size, assigned by descending frequency (id 0 is left unused, presumably for padding)
token_dict = {}
for idx, token in enumerate(sorted_tokens):
    token_dict[token] = idx + 1
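

# A hypothetical objdump-style input line such as
#   "  401000:\tmov    eax,DWORD PTR [rbp-0x4]"
# is reduced by parser() to the triple ("mov", "eax", "DWORD_PTR") and then
# written out as three space-separated token ids.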
def parser(file):
    tokenized_file = ""
    for line in file:
        # instruction lines are indented; section/function headers are not
        if line.startswith(' ') and len(line) > 2:
            line = line.strip()
            line = line[line.find('\t') + 1:]  # remove instruction address
            line, prefix_bool = remove_prefix(line)
            line = line.strip()
            if check_line(line):
                continue
            # the opcode occupies a fixed-width field of 7 characters, so operands start at index 7;
            # if a prefix was stripped the field is no longer fixed-width, so split at the first space
            op_idx = 7
            if prefix_bool:
                op_idx = line.find(' ')
            opcode = line[:op_idx].rstrip()
            operands = line[op_idx:].split(',') if line[op_idx:] != '' else []
            operand_1 = "EMPTY"
            operand_2 = "EMPTY"
            if len(operands) != 0:
                if operands[0][0:1].isupper():
                    operand_1 = get_ptr(operands[0])
                elif check_for_addr(operands[0]):
                    operand_1 = "ADDR"
                else:
                    operand_1 = operands[0]
                if len(operands) == 2:
                    if operands[1][0:1].isupper():
                        operand_2 = get_ptr(operands[1])
                    elif check_for_addr(operands[1]):
                        operand_2 = "ADDR"
                    else:
                        operand_2 = operands[1]
            if operand_1 is not None:  # get_ptr can return None
                operand_1 = operand_1.strip()
            if operand_2 is not None:
                sharp_idx = operand_2.find('#')
                if sharp_idx != -1:
                    operand_2 = operand_2[:sharp_idx].rstrip()  # drop trailing '#' comment
            write_line = ""
            for op in [opcode, operand_1, operand_2]:
                if op is None:
                    op = "EMPTY"
                if op not in tokens:
                    op = "SPCL"  # out-of-vocabulary token
                    str_token = str(H.vocab_size + 1)
                else:
                    str_token = str(token_dict[op])
                write_line = write_line + str_token + " "
            write_line = write_line[:-1] + "\n"
            tokenized_file += write_line
    return tokenized_file[:-1]  # drop the trailing newline
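

# Tokenize every benign file and the first H.num_malicious_files malicious files;
# files that cannot be read or parsed are skipped.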
for idx, file in enumerate(benign_files):
    print(idx, file)
    try:
        with open(os.path.join(P.benign_exe_disassembled, file)) as f:
            tokenized_file = parser(f)
        with open(os.path.join(P.benign_exe_tokenized, file), 'w') as f:
            f.write(tokenized_file)
    except Exception:
        continue

for idx, file in enumerate(malicious_files[:H.num_malicious_files]):
    print(idx, file)
    try:
        with open(os.path.join(P.malicious_exe_disassembled, file)) as f:
            tokenized_file = parser(f)
        with open(os.path.join(P.malicious_exe_tokenized, file), 'w') as f:
            f.write(tokenized_file)
    except Exception:
        continue