-
Notifications
You must be signed in to change notification settings - Fork 0
/
encode_input.py
143 lines (124 loc) · 5.03 KB
/
encode_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from typing import Any, Union
import pandas as pd
import hashlib
import numpy as np
import uuid
def ip_encode(ip):
    """Map a dotted-quad IPv4 string onto a float in the range [0, 1].

    The four octets are packed big-endian into a single 32-bit integer
    (first octet most significant) and divided by 4294967295 (2**32 - 1),
    so "0.0.0.0" encodes to 0.0 and "255.255.255.255" to 1.0.

    Args:
        ip: IPv4 address in dotted-decimal form, e.g. "192.168.1.1".

    Returns:
        The address as a float between 0.0 and 1.0 inclusive.

    Raises:
        ValueError: if the string does not hold exactly four octets, an
            octet is not an integer, or an octet lies outside 0..255.
    """
    octets = ip.split(".")
    if len(octets) != 4:
        raise ValueError(
            f"expected 4 dot-separated octets, got {len(octets)} in {ip!r}"
        )

    packed = 0
    for octet in octets:
        value = int(octet)  # raises ValueError on non-numeric text
        # An out-of-range octet would spill into the neighbouring octet's
        # bits and silently produce a wrong (possibly > 1) encoding.
        if not 0 <= value <= 255:
            raise ValueError(f"octet {value} out of range 0..255 in {ip!r}")
        packed = (packed << 8) | value

    return packed / 4294967295
# Integer labels for the HTTP verbs the encoder recognises; any other verb
# is labelled 0. Labels are divided by the largest label (4).
_HTTP_METHOD_CODES = {"GET": 1, "POST": 2, "DELETE": 3, "PUT": 4}
_HTTP_METHOD_SCALE = 4

# Integer value of 'ffffffff-ffff-ffff-ffff-ffffffffffff'
# (340282366920938463463374607431768211455).
_UUID_MAX = (1 << 128) - 1

# Number of bits kept from the path digest and the largest value they can
# hold (1152921504606846975, i.e. sixty '1' bits).
_PATH_BITS = 60
_PATH_MAX = (1 << _PATH_BITS) - 1

# Divisor applied to response codes.
# NOTE(review): the codes listed by the original author
# (200,201,400,401,403,404,405,409,500,503) all exceed 303 except 200/201,
# so results are not confined to [0, 1]. Looks like (code - 200) / 303 may
# have been intended -- confirm before changing, since it alters the data.
_RESPONSE_CODE_SCALE = 303

# (browser tag, platform tag) -> label; labels are divided by 6.
_USER_AGENT_CODES = {
    ("m", "w"): 1,
    ("m", "l"): 2,
    ("o", "w"): 3,
    ("o", "l"): 4,
    ("f", "w"): 5,
    ("f", "l"): 6,
}
_USER_AGENT_SCALE = 6

# Columns written back to the CSV, in this order; all others are dropped.
_KEEP_COLUMNS = [
    "ip_address",
    "access_token",
    "http_method",
    "invoke_path",
    "user_agent",
    "response_code",
]


def _encode_http_method(value):
    """Return the HTTP verb's label divided by 4 (unknown verbs give 0.0)."""
    return _HTTP_METHOD_CODES.get(value, 0) / _HTTP_METHOD_SCALE


def _encode_access_token(value):
    """Return the UUID token's 128-bit integer value scaled into [0, 1]."""
    return uuid.UUID(value).int / _UUID_MAX


def _encode_invoke_path(value, path_prefix=""):
    """Return a 60-bit fingerprint of the path's MD5 digest scaled to [0, 1]."""
    if path_prefix:
        value = value.replace(path_prefix, "")
    digest = hashlib.md5(value.encode("utf-8")).hexdigest()
    # Each of the first 10 hex characters contributes the binary form of its
    # character code (6 bits for digits, 7 for a-f); the first 60 bits of the
    # concatenation are kept.
    bits = "".join(format(ord(ch), "b") for ch in digest[:10])[:_PATH_BITS]
    return int(bits, 2) / _PATH_MAX


def _encode_user_agent(value):
    """Return the browser/platform label of a user-agent string divided by 6."""
    # Text before the first "/" names the browser; the remainder is searched
    # for the platform. partition() yields an empty remainder when there is
    # no "/", instead of failing on a missing second element.
    product, _, details = value.partition("/")

    if "Mozilla" in product:
        browser = "m"
    elif "Opera" in product:
        browser = "o"
    elif "Firefox" in product:
        browser = "f"
    else:
        browser = None
        print("Error with browser categorization!")

    if "Windows" in details:
        platform = "w"
    elif "Linux" in details:
        platform = "l"
    else:
        platform = None
        print("Error with platform categorization!")

    # Looking the pair up (rather than concatenating the tags) lets an
    # unrecognised browser or platform fall through to label 0.
    label = _USER_AGENT_CODES.get((browser, platform), 0)
    if label == 0:
        print("Error with final label encoding!")
    return label / _USER_AGENT_SCALE


def encode(f_name, path_prefix=""):
    """Numerically encode an API-request CSV in place.

    Reads ``f_name`` with pandas, converts the columns below to numbers,
    replaces every remaining missing value with 0, and overwrites ``f_name``
    with only those six columns (all other columns are dropped):

    * ``http_method``   -- GET/POST/DELETE/PUT labelled 1..4 (else 0), / 4.
    * ``access_token``  -- UUID integer value / (2**128 - 1).
    * ``response_code`` -- value / 303.
    * ``invoke_path``   -- 60 bits derived from the MD5 hex digest,
      / (2**60 - 1).
    * ``user_agent``    -- browser (Mozilla/Opera/Firefox) and platform
      (Windows/Linux) pair labelled 1..6 (else 0), / 6.
    * ``ip_address``    -- cells that are blank or equal to the first
      non-blank address are replaced by ``ip_encode`` of that address;
      other cells are left as they are.

    Blank cells are skipped by the per-column encoders and end up as 0.
    The file is written once, after every column has been encoded, so a
    failure part-way through leaves the input file untouched.

    Args:
        f_name: path of the CSV file to read and overwrite.
        path_prefix: optional text removed from each ``invoke_path`` value
            before hashing (for datasets whose paths carry a gateway URL
            such as "https://172.17.0.1:8243/"). Empty by default, which
            hashes the path unchanged.

    Raises:
        KeyError: if one of the six expected columns is missing.
        ValueError: if an ``access_token`` value is not a valid UUID string.
    """
    df = pd.read_csv(f_name)

    # na_action="ignore" leaves blank cells as NaN without calling the encoder.
    df["http_method"] = df["http_method"].map(
        _encode_http_method, na_action="ignore"
    )
    df["access_token"] = df["access_token"].map(
        _encode_access_token, na_action="ignore"
    )
    df["response_code"] = df["response_code"] / _RESPONSE_CODE_SCALE
    df["invoke_path"] = df["invoke_path"].map(
        lambda path: _encode_invoke_path(path, path_prefix),
        na_action="ignore",
    )
    df["user_agent"] = df["user_agent"].map(
        _encode_user_agent, na_action="ignore"
    )

    # Address the IP column by name rather than by position so the result
    # does not depend on the column order of the input file.
    ip_column = df["ip_address"]
    known_ips = ip_column.dropna()
    if not known_ips.empty:
        first_ip = known_ips.iloc[0]
        encoded_ip = ip_encode(first_ip)
        same_or_blank = ip_column.isna() | (ip_column == first_ip)
        # Object dtype lets the float sit beside any untouched string cells.
        df["ip_address"] = ip_column.astype(object).where(
            ~same_or_blank, encoded_ip
        )

    # Replace all remaining NaN values with zero.
    df = df.replace(np.nan, 0)

    df.to_csv(f_name, columns=_KEEP_COLUMNS, index=False)