# extreme learning machine
# Jarvis Xu
import numpy as np
import pandas as pd # mainly used for reading data from excel
import nltk  # used for tokenization and eliminating stop words
from nltk.corpus import stopwords  # English stop word list
from sklearn.metrics import confusion_matrix, recall_score, f1_score, precision_score
# used to transform text data into numeric feature vectors
from sklearn.feature_extraction.text import HashingVectorizer
# preprocessing produces:
#   X_train, y_train
#   X_test, y_test
# load data from excel
def load(s):
    data = pd.read_excel('EMAILDATASET.xlsx')
    lst = list(data[s])
    return lst
# eliminate stop words from a sentence
def removeStopWords(s, all_stopwords):
    text_tokens = nltk.word_tokenize(s)
    tokens_without_sw = [word for word in text_tokens if word not in all_stopwords]
    newString = " ".join(tokens_without_sw)
    return newString
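# For example (illustrative only, the exact result depends on the NLTK stop word list):
#   removeStopWords("this is an email about the printer", all_stopwords)
# would return something like "email printer".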
# convert text input into numbers
def X_convert(lst):
    # download the NLTK resources needed for stop word removal and tokenization
    nltk.download('stopwords')
    nltk.download('punkt')  # tokenizer models required by nltk.word_tokenize
    all_stopwords = stopwords.words('english')
    X = []
    num_inputNodes = 1000  # this number controls the number of nodes in the input layer
    vectorizer = HashingVectorizer(n_features=num_inputNodes)
    print("The number of input nodes:", num_inputNodes)
    for i in range(len(lst)):
        text = lst[i]
        text = [removeStopWords(text, all_stopwords)]
        vector = vectorizer.transform(text)
        X.append((vector.toarray())[0])
    return X
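# Each email is thus turned into a fixed-length 1000-dimensional vector by feature
# hashing, so no vocabulary needs to be stored; different words can collide into the
# same hash bucket, which is the usual trade-off of HashingVectorizer.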
# convert text desired output into numbers
def desiredOutput(lst):
    # map each category code to an integer class label; unknown codes default to 0
    mapping = {'WFO': 0, 'LOD': 1, 'CLK': 2, 'CLE': 3, 'IDC': 4, 'ADC': 5, 'DPC': 6,
               'CTM': 7, 'COF': 8, 'CDT': 9, 'RBK': 10, 'BCL': 11, 'MSD': 12, 'RMB': 13}
    output = [mapping.get(i, 0) for i in lst]
    return output
# convert text desired output into numbers (subcategories)
def desiredOutput1(lst):
    # map each sub-category code to an integer class label; unknown codes default to 0
    mapping = {'ITD': 0, 'OAA': 1, 'OSL': 2}
    output = [mapping.get(i, 0) for i in lst]
    return output
# extract the test data from the complete set of data
def testSet(lst):
    # rows 240 to 254 (15 samples) are held out as the test set
    test = []
    for i in range(240, 255):
        test.append(lst[i])
    return test
'''
# print the actual results
def print_Result(D, A):
    D1 = 0
    A1 = 0
    for i in range(len(D)):
        print()
'''
class ELM():
    # define input X, label y, number of hidden neurons m,
    # control parameter L = 0.2, and call the training function TRAIN_beta
    def __init__(self, X, y, m, L):
        self.X = X
        self.y = y
        self.m = m
        self.L = L
        self.TRAIN_beta()

    # use the sigmoid function for feature mapping:
    # transform input data into the ELM feature space
    def sigmoid(self, x):
        return 1.0 / (1 + np.exp(-x))

    # training function: random input weights w and biases b,
    # hidden-layer output matrix H, output weights beta
    def TRAIN_beta(self):
        n, d = self.X.shape
        self.w = np.random.rand(d, self.m)
        self.b = np.random.rand(1, self.m)
        H = self.sigmoid(np.dot(self.X, self.w) + self.b)  # feature mapping gives the hidden-layer output matrix
        self.beta = np.dot(np.linalg.inv(np.identity(self.m) / self.L + np.dot(H.T, H)),
                           np.dot(H.T, self.y))
        print('Train Finish', self.beta.shape, "(# hidden nodes, # output nodes)")

    # testing function
    def TEST(self, x):
        H = self.sigmoid(np.dot(x, self.w) + self.b)  # map the test inputs into the same feature space
        result = np.dot(H, self.beta)
        # print('result= ', result)
        return result
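# The closed-form solution computed in TRAIN_beta is the ridge-regularised
# least-squares estimate that is standard for ELMs:
#     beta = (I / L + H^T H)^(-1) H^T Y
# where H is the (n_samples x m) hidden-layer output matrix, Y the one-hot target
# matrix and L the control parameter. Only beta is learned; w and b keep their
# random initial values.
# Minimal usage sketch (disabled, illustrative only; the toy_* names and numbers are
# made up and not part of the original experiment):
'''
toy_X = np.random.rand(10, 5)                      # 10 samples, 5 features
toy_Y = np.eye(2)[np.random.randint(0, 2, 10)]     # one-hot labels for 2 classes
toy_elm = ELM(toy_X, toy_Y, 20, 0.2)               # 20 hidden nodes, control parameter 0.2
toy_pred = np.argmax(toy_elm.TEST(toy_X), axis=1)  # predicted class indices
'''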
X_name = 'Text_Of_Email'
y_name = 'Category'
yy_name = 'Sub-Category'
X = load(X_name)
y = load(yy_name)
# preprocessing for input data
X_train = X_convert(X)
# preprocessing for desired output
y_train = desiredOutput(y)
X_test = testSet(X_train)
y_test = testSet(y_train)
X_train = X_train[0:241]
y_train = y_train[0:241]
X_test1 = X_train[100:114]
y_test1 = y_train[100:114]
X_train = np.array(X_train)
y_train = np.array(y_train)
X_train1 = np.array(X_train)
y_train1 = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
# note: the two lines below overwrite the slices taken above, so X_test1 / y_test1
# end up identical to X_test / y_test
X_test1 = np.array(X_test)
y_test1 = np.array(y_test)
# training process
# one-hot encoding is used for the training targets
Y_onehot = np.eye(14)[y_train]
elm = ELM(X_train, Y_onehot, 8000, 0.2)
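# Note: with 8000 hidden nodes, TRAIN_beta inverts an 8000 x 8000 matrix, which is
# memory- and time-intensive; the averaging experiment further below uses 2000 nodes.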
'''
# testing process
predict = elm.TEST(X_test)
predict = np.argmax(predict, axis=1)  # classify by the index with the greatest value
y_test = np.eye(3)[y_test]
acc = np.sum(predict == y_test)
print('acc :', acc)
for i in range(len(y_test)):
    print(y_test[i])
    print(predict[i])
'''
# evaluation over the training set
predict = elm.TEST(X_train)
predict = np.argmax(predict, axis=1)  # decode the one-hot outputs: classify by the index of the largest value
y_train1 = np.eye(14)[y_train]  # one-hot version of the training labels (not used below)
c_m = confusion_matrix(y_train, predict)
print("confusion matrix for training set:")
print(c_m)
print("precision score = ", precision_score(y_train, predict, average='micro'))
print("recall score = ", recall_score(y_train, predict, average='micro'))
print("f1 score = ", f1_score(y_train, predict, average='micro'))
# evaluation over the testing set
predict = elm.TEST(X_test1)
predict = np.argmax(predict, axis=1)  # decode the one-hot outputs: classify by the index of the largest value
y_test1 = np.eye(14)[y_test]  # one-hot version of the test labels (not used below)
# confusion matrix for the testing set
c_m = confusion_matrix(y_test, predict)
print("confusion matrix for testing set:")
print(c_m)
print("precision score = ", precision_score(y_test, predict, average='micro'))
print("recall score = ", recall_score(y_test, predict, average='micro'))
print("f1 score = ", f1_score(y_test, predict, average='micro'))
# print(y_test)
# print(predict)
# repeat training and evaluation to average out the effect of the random w and b
precision = 0
recall = 0
f1 = 0
n_runs = 100
for i in range(n_runs):
    Y_onehot = np.eye(14)[y_train]
    elm = ELM(X_train, Y_onehot, 2000, 0.2)
    # evaluation over the testing set
    predict = elm.TEST(X_test1)
    predict = np.argmax(predict, axis=1)  # decode the one-hot outputs: classify by the index of the largest value
    y_test1 = np.eye(14)[y_test]  # one-hot version of the test labels (not used below)
    precision = precision + precision_score(y_test, predict, average='micro')
    recall = recall + recall_score(y_test, predict, average='micro')
    f1 = f1 + f1_score(y_test, predict, average='micro')
print("average precision:", precision / n_runs)
print("average recall:", recall / n_runs)
print("average f1:", f1 / n_runs)
'''
# now the raw data is ready to be used-----------------------------
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
'''