# POS feature: create a fixed-length array (30) filled with spaCy POS tags
# Entity feature: same as POS, then sum the two arrays
# Mean similarity score of the headline with each sentence in the article body
# Demonstrative pronouns: this, that, these, those
# Personal pronouns: I, we, you, he, she, they, me, us, him, her, them, mine, yours, his, hers, theirs
# Article: the
import os
import glob
import numpy as np
import pandas as pd
import spacy
from text_process import Text_process
from helpers import create_folder_path, load_pk_file, save_pk_file
from sklearn.feature_extraction import DictVectorizer
from gensim.models.doc2vec import Doc2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")
tqdm.pandas()
def fix_len_vec(vec, length=30):
    """
    Resize a vector to a fixed length. If the original vector is shorter than the
    desired length, pad it with zeros at the end; if it is longer, truncate it to
    the first `length` elements.
    vec: the vector to resize
    length: the desired length
    Return a vector of the desired length.
    """
    if len(vec) < length:
        new_vec = np.pad(vec, (0, length - len(vec)), mode='constant')
    else:
        new_vec = vec[:length]
    return new_vec
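
# A quick illustration of fix_len_vec (a hedged sketch, not part of the pipeline):
#   >>> fix_len_vec(np.array([1, 2, 3]), length=5)
#   array([1, 2, 3, 0, 0])
#   >>> fix_len_vec(np.arange(40), length=30).shape
#   (30,)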
def extract_sty_feat(df, length=30):
    """
    Extract stylometric features and transform them into feature vectors.
    df: dataframe that contains the data
    length: the desired length of the sentence feature vectors
    Return a matrix of stylometric feature vectors.
    """
    docs = df["targetTitle"].progress_map(lambda text: Text_process(text))
    # turn the sentence features into vectors
    pos_vector = docs.progress_map(lambda doc: Text_process.tag_vectoriser(doc))
    # resize the sentence feature vectors to a common length
    fixed_vec = pos_vector.progress_map(lambda vec: fix_len_vec(vec, length=length))
    # stack the resized vectors into a feature matrix
    sent_feat = np.asarray(list(fixed_vec))
#extract count features
df["num_arg"] = df["arg"].progress_map(lambda arg: len(arg))
df["num_root"] = df["root"].progress_map(lambda root: len(root))
df["num_det"] = df["det"].progress_map(lambda det: len(det))
df["num_advmod"] = df["advmod"].progress_map(lambda advmod: len(advmod))
df["num_verb"] = df["verb"].progress_map(lambda verb: len(verb))
df["num_nn"] = df["nn"].progress_map(lambda nn:len(nn))
df["num_adj"] = df["adj"].progress_map(lambda adj: len(adj))
df["num_pron"] = df["pron"].progress_map(lambda pron: len(pron))
df["num_adv"] = df["adv"].progress_map(lambda adv: len(adv))
features = []
for i, row in df.iterrows():
feat = dict()
feat["num_token"] = row["num_token"]
feat["avr_token_len"] = row["avr_token_len"]
feat["num_contr"] = row["num_contr"]
feat["max_dep_path"] = row["max_dep_path"]
feat["num_arg"] = row["num_arg"]
feat["num_root"] = row["num_root"]
feat["num_det"] = row["num_det"]
feat["num_advmod"] = row["num_advmod"]
feat["num_verb"] = row["num_verb"]
feat["num_nn"] = row["num_nn"]
feat["num_adj"] = row["num_adj"]
feat["num_pron"] = row["num_pron"]
feat["num_adv"] = row["num_adv"]
feat["senti_score"] = row["senti_score"]
feat["use_question"] = row["use_question"]
feat["use_list"] = row["use_modal"]
features.append(feat)
    dict_vtrz = DictVectorizer(sparse=False)
    # transform the extracted feature dicts into vectors
    dict_vect = dict_vtrz.fit_transform(features)
    # concatenate the sentence feature vectors and the count feature vectors
    # into the stylometric feature matrix
    X_sty = np.concatenate((sent_feat, dict_vect), axis=1)
    return X_sty
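
# Usage sketch (hedged): assumes a dataframe from the preprocessing step with a
# "targetTitle" column plus the count/sentiment columns read above; the file
# name is hypothetical.
#   df = load_pk_file("Train/batch_0.pk")
#   X_sty = extract_sty_feat(df, length=30)
#   # one row per headline: 30 POS-tag slots + the DictVectorizer features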
def load_d2v(d2v_model_file):
    """Load a saved gensim Doc2Vec model from disk."""
    dv_model = Doc2Vec.load(d2v_model_file)
    return dv_model
def create_d2v(dv_model, text):
    """
    Create a sentence representation using document embeddings.
    dv_model: a loaded document embedding (Doc2Vec) model
    text: a tokenised sentence (a list of tokens)
    Return: the inferred document vector
    """
    d2v_vec = dv_model.infer_vector(text)
    return d2v_vec
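
# A minimal sketch of inferring a vector for one tokenised sentence (the token
# list is illustrative; d2v_model is a loaded Doc2Vec model):
#   vec = create_d2v(d2v_model, ["you", "wont", "believe", "what", "happened"])
#   vec.shape  # (vector_size of the trained model,)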
def load_w2v(w2v_model_file):
    """Load saved word embeddings (e.g. a Stanford GloVe model converted to word2vec format)."""
    wv_model = KeyedVectors.load(w2v_model_file)
    return wv_model
def create_w2v(w2v_model, texts, size=100):
    """
    Create document representations using word embeddings.
    w2v_model: a loaded word embedding model
    texts: a list of untokenised documents from the data file
    size: the dimensionality of the word embeddings
    Return: a matrix with one document vector per input text
    """
    print('Create document representation using word embedding')
    word_embs = []
    # each token contributes its embedding, its POS id, its dependency id and
    # its head's embedding, hence size*2 + 2 dimensions per document
    wmb_size = size * 2 + 2
    for text in tqdm(list(texts)):
        text_vec = np.zeros(wmb_size)
        doc = nlp(text.lower())
        for token in doc:
            word = str(token.text)
            head = str(token.head)
            # note: `.wv.vocab` is the gensim 3.x API; gensim 4+ uses `key_to_index`
            if word in w2v_model.wv.vocab and head in w2v_model.wv.vocab:
                token_vec = w2v_model.wv[word]
                pos_vec = np.array([token.pos])
                dep_vec = np.array([token.dep])
                head_vec = w2v_model.wv[head]
                word_vec = np.concatenate((token_vec, pos_vec, dep_vec, head_vec))
                text_vec += word_vec
        word_embs.append(text_vec)
    X_w2v = np.asarray(word_embs)
    return X_w2v
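
# Usage sketch (hedged; assumes a loaded 100-dimensional embedding model):
#   X = create_w2v(w2v_model, ["Headline one", "Another headline"], size=100)
#   X.shape  # (2, 202): 100 (token) + 1 (POS id) + 1 (dep id) + 100 (head)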
def headline_features(in_folder, w2v_model):
    """
    Extract features from headlines.
    in_folder: path to the folder containing the data files
    w2v_model: a loaded word embedding model
    """
sty_vecs = []
w2v_vecs = []
labels = []
for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
print(f"\nReading {file_name}\n")
df = load_pk_file(file_name)
sty_vec = extract_sty_feat(df)
sty_vecs.append(sty_vec)
w2v_vec = create_w2v(w2v_model,df["targetTitle"])
w2v_vecs.append(w2v_vec)
labels.append(df["truthClass"])
print("Concatenating feature vectors")
X_sty = np.concatenate(sty_vecs, axis = 0)
print(f"Stylometric: {X_sty.shape}")
X_w2v = np.concatenate( w2v_vecs, axis = 0)
print(f"Word2vec: {X_w2v.shape}")
X_cmb = np.concatenate((X_w2v,X_sty), axis = 1)
print(f"Combined: {X_cmb.shape}")
y = np.asarray(pd.concat(labels))
print("Splitting data")
print("Stylometric")
sty_file = "Vector/sty"
save_pk_file((X_sty,y), sty_file)
print("Word2vec")
w2v_file = "Vector/w2v"
save_pk_file((X_w2v,y),w2v_file)
print("Combined")
cmb_file = "Vector/cmb"
save_pk_file( (X_cmb,y), cmb_file)
print("Done")
def content_features(in_folder, w2v_model, d2v_model):
    """
    Extract features from article contents.
    in_folder: path to the folder containing the data files
    w2v_model: a loaded word embedding model
    d2v_model: a loaded document embedding model
    """
sty_vecs = []
w2v_vecs = []
d2v_vecs = []
labels = []
for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
print(f"\nReading {file_name}\n")
df = load_pk_file(file_name)
if "targetParagraphs" in df.columns:
filtered_df = df[df["targetParagraphs"].apply(lambda x: len(x)>0)]
sty_vec = extract_sty_feat(filtered_df)
sty_vecs.append(sty_vec)
w2v_vec = create_w2v(w2v_model,filtered_df["targetTitle"])
w2v_vecs.append(w2v_vec)
labels.append(filtered_df["truthClass"])
            # infer a vector for each content sentence, then average them into
            # one document vector
            sent_vec = filtered_df["cont_sent"].progress_map(lambda x: np.asarray([create_d2v(d2v_model, i) for i in x]))
            avr_sent_vec = sent_vec.progress_map(lambda x: np.mean(x, axis=0))
            d2v_vec = np.asarray(list(avr_sent_vec))
features = []
for i, row in filtered_df.iterrows():
feat = dict()
feat["cont_num_token"] = row["cont_num_token"]
feat["cont_avr_token_len"] = row["cont_avr_token_len"]
feat["cont_senti_score"] = row["cont_senti_score"]
feat["avr_sim_score"] = row["avr_sim_score"]
feat["sim_pct"] = row["sim_pct"]
features.append(feat)
dict_vtrz = DictVectorizer(sparse=False)
dict_vect = dict_vtrz.fit_transform(features)
d2v_vecs.append(np.concatenate((d2v_vec,dict_vect), axis = 1))
print("Concatenating feature vectors")
X_sty = np.concatenate(sty_vecs, axis = 0)
print(f"Stylometry: {X_sty.shape}")
X_w2v = np.concatenate(w2v_vecs, axis = 0)
print(f"Word2vec: {X_w2v.shape}")
X_d2v = np.concatenate(d2v_vecs, axis = 0)
print(f"Doc2vec: {X_d2v.shape}")
X_cmb = np.concatenate((X_w2v,X_sty,X_d2v), axis = 1)
print(f"Combined: {X_cmb.shape}")
y = list(pd.concat(labels))
print("Splitting data")
print("Doc2vec")
cmb_file = "Vector/d2v"
save_pk_file((X_cmb,y), cmb_file)
print("Done")
if __name__ == "__main__":
in_folder = "Train"
vector_folder = create_folder_path("Vector")
print("\n----------------------------- Loading word2vec and doc2vec models -----------------------\n")
    w2v_model_file = "".join([f for f in glob.glob(os.path.join("Model", "*_model")) if "w2v" in f])
    d2v_model_file = "".join([f for f in glob.glob(os.path.join("Model", "*_model")) if "d2v" in f])
    w2v_model = load_w2v(w2v_model_file)
    d2v_model = load_d2v(d2v_model_file)
print("\n----------------------------- Creating headline feature vectors -----------------------\n")
headline_features(in_folder,w2v_model)
print("\n----------------------------- Creating content feature vectors -----------------------\n")
content_features(in_folder,w2v_model,d2v_model)