# tagDistance_BagOfWords.py
# forked from weichenzhao/CS544_Project
import sys
import json
import string
import nltk
import pickle
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
# Module-level stores populated by readFile(); all keyed by the integer post id
# except tag_title/tag_list, which are keyed/ordered by tag.
title = dict()      # post id -> title string
body = dict()       # post id -> body text
code = dict()       # post id -> code section
tag = dict()        # post id -> list of tag strings
tag_title = dict()  # tag -> space-joined, lowercased titles of posts with that tag
tag_list = list()   # tags in first-seen order (row order used by the vectorizer)


def readFile(inputFile):
    """Load the segmented-post JSON file and populate the module-level dicts.

    The JSON maps a post id (string) to a 4-element list:
    [title, body, code, space-separated-tags].

    Side effects: fills title/body/code/tag per post, accumulates each tag's
    lowercased titles into tag_title, and appends newly seen tags to tag_list.
    Prints the resulting tag counts.
    """
    with open(inputFile) as json_data:
        seg_dict = json.load(json_data)
    # Accumulate title fragments per tag in lists and join once at the end:
    # repeated `s = s + ' ' + t` concatenation in a loop is quadratic.
    parts = {t: [v] for t, v in tag_title.items()}
    for key, record in seg_dict.items():
        pid = int(key)
        title[pid] = record[0]
        body[pid] = record[1]
        code[pid] = record[2]
        ts = record[3].split(" ")
        tag[pid] = ts
        lowered = record[0].lower()
        for t in ts:
            parts.setdefault(t, []).append(lowered)
    for t, chunks in parts.items():
        tag_title[t] = ' '.join(chunks)
    # tag_list mirrors tag_title's insertion (first-seen) order; only add
    # tags not already present so repeated calls don't duplicate entries.
    tag_list.extend(t for t in tag_title if t not in tag_list)
    print(len(tag_list))
    print(len(tag_title))
def distance(bag, tags=None):
    """Compute pairwise L1 (Manhattan) distances between tag word-bags.

    Parameters:
        bag: dict mapping tag -> {feature_index: count} (zero counts omitted).
        tags: optional list of tags to compare; defaults to the module-level
              tag_list populated by readFile() (backward compatible).

    Returns:
        dict mapping (tag_i, tag_j) tuples, i < j in list order, to the
        summed absolute count differences over the union of their features.

    Prints each outer tag as a progress marker and the final distance dict,
    matching the original behavior.
    """
    if tags is None:
        tags = tag_list
    tag_distance = dict()
    for i, t1 in enumerate(tags):
        print(t1)
        b1 = bag[t1]
        for t2 in tags[i + 1:]:
            b2 = bag[t2]
            # One pass over the feature union; a missing feature counts as 0.
            tag_distance[(t1, t2)] = sum(
                abs(b1.get(w, 0) - b2.get(w, 0)) for w in b1.keys() | b2.keys()
            )
    print(tag_distance)
    return tag_distance
def bag_of_words(out_path='tag_distance_bag_0.pickle'):
    """Vectorize each tag's concatenated titles into word counts, compute
    pairwise tag distances, and pickle the result.

    Parameters:
        out_path: destination for the pickled distance dict; the default
                  preserves the original hard-coded filename.

    NOTE(review): row r of the count matrix is assumed to correspond to
    tag_list[r]; readFile() builds tag_list in tag_title's insertion order,
    which dict iteration preserves on Python 3.7+ — confirm if either
    structure is modified elsewhere.
    """
    vec = CountVectorizer(stop_words='english', tokenizer=nltk.word_tokenize)
    counts = vec.fit_transform(tag_title.values())
    # Sparse matrix -> {tag: {feature_index: count}} keeping nonzeros only.
    bag_matrix = dict()
    rows, cols = counts.nonzero()
    for r, c in zip(rows, cols):
        bag_matrix.setdefault(tag_list[r], dict())[c] = counts[r, c]
    dis = distance(bag_matrix)
    with open(out_path, 'wb') as f:
        pickle.dump(dis, f)
def main(argv):
    """Entry point: argv[1] is the path to the segmented-post JSON file.

    Reads the posts, builds the per-tag bag-of-words distances, and pickles
    them (see bag_of_words()). Exits with a usage message when no input file
    is given instead of raising an opaque IndexError.
    """
    if len(argv) < 2:
        raise SystemExit("usage: tagDistance_BagOfWords.py <input.json>")
    readFile(argv[1])
    bag_of_words()
    # tf_idf()  # alternative weighting, currently disabled


if __name__ == "__main__":
    main(sys.argv)