-
Notifications
You must be signed in to change notification settings - Fork 0
/
merger.py
115 lines (84 loc) · 2.83 KB
/
merger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import math
from collections import defaultdict
# Corpus size N used as the numerator of the idf term, log(N / df), in
# tf_idf_score. NOTE(review): presumably the number of unique documents that
# were indexed -- confirm and update whenever the corpus is re-crawled.
TOTAL_UNIQUE_DOC = 55393
def tf_idf_score(tokens, total_docs=None):
    """Replace each posting's raw term frequency with its tf-idf weight.

    The weight is ``(1 + ln(tf)) * ln(N / df)`` where ``df`` is the number
    of postings stored for the token and ``N`` is ``total_docs``.

    Args:
        tokens: Mapping of token -> list of postings. Each posting must be a
            mutable sequence whose element at index 1 is the raw term
            frequency; that element is overwritten in place.
        total_docs: Corpus size ``N``. Defaults to the module-level
            ``TOTAL_UNIQUE_DOC`` when omitted.

    Returns:
        The same ``tokens`` mapping (mutated in place), for convenience.
    """
    if total_docs is None:
        total_docs = TOTAL_UNIQUE_DOC

    for postings in tokens.values():
        df = len(postings)
        if df == 0:
            # No postings: nothing to score, and N / df would divide by zero.
            continue
        idf = math.log(total_docs / df)
        for posting in postings:
            tf = posting[1]
            # Log-frequency weighting is defined as 0 for tf <= 0; calling
            # math.log on such a value would raise ValueError.
            posting[1] = (1 + math.log(tf)) * idf if tf > 0 else 0.0
    return tokens
def write_full_index(tokens, path="full_index.txt"):
    """Append one merged single-word index entry to the full index file.

    Args:
        tokens: JSON-serialisable object (merge passes a one-token dict of
            postings) written as a single line.
        path: Destination file; defaults to ``full_index.txt`` in the current
            working directory. The file is opened in append mode, so existing
            content is preserved.
    """
    with open(path, "a", encoding="utf-8") as f:
        # One JSON document per line so the index can be read back line by line.
        f.write(json.dumps(tokens) + "\n")
def write_full_index_biword(tokens, path="full_biword_index.txt"):
    """Append one merged biword index entry to the full biword index file.

    Args:
        tokens: JSON-serialisable object (merge passes a one-token dict of
            postings) written as a single line.
        path: Destination file; defaults to ``full_biword_index.txt`` in the
            current working directory. The file is opened in append mode, so
            existing content is preserved.
    """
    # Plain "a" is enough: the file is only ever written, never read back here.
    with open(path, "a", encoding="utf-8") as f:
        # One JSON document per line so the index can be read back line by line.
        f.write(json.dumps(tokens) + "\n")
def write_full_index_triword(tokens, path="full_triword_index.txt"):
    """Append one merged triword index entry to the full triword index file.

    Args:
        tokens: JSON-serialisable object (merge passes a one-token dict of
            postings) written as a single line.
        path: Destination file; defaults to ``full_triword_index.txt`` in the
            current working directory. The file is opened in append mode, so
            existing content is preserved.
    """
    # Plain "a" is enough: the file is only ever written, never read back here.
    with open(path, "a", encoding="utf-8") as f:
        # One JSON document per line so the index can be read back line by line.
        f.write(json.dumps(tokens) + "\n")
def _read_next_entry(fh):
    """Return the next ``(token, postings)`` pair from *fh*, or None at EOF.

    Each non-blank line is parsed as a JSON object and its first key is taken
    as the token. Blank lines are skipped instead of crashing ``json.loads``.
    """
    while True:
        line = fh.readline()
        if not line:
            return None
        if line.strip():
            entry = json.loads(line)
            token = next(iter(entry))
            return token, entry[token]


def merge(filename, mode, file_num=12):
    """Merge the partial inverted index files into one full index file.

    Reads ``<filename>0.txt`` .. ``<filename><file_num - 1>.txt`` in lockstep.
    On every round the smallest token currently at the head of any file is
    selected, the postings of every file whose head is that token are
    concatenated (in file order), scored with ``tf_idf_score`` and written
    out as one line. Each partial file is assumed to list its tokens in
    ascending order; the merge relies on that to see all postings of a token
    in the same round.

    Args:
        filename: Path prefix of the partial index files.
        mode: ``'reg'``, ``'biword'`` or ``'triword'``; selects which full
            index writer receives the merged entries. Any other value prints
            a message and returns without reading anything.
        file_num: Number of partial index files to merge (default 12).
    """
    writers = {
        "biword": write_full_index_biword,
        "reg": write_full_index,
        "triword": write_full_index_triword,
    }
    writer = writers.get(mode)
    if writer is None:
        # Reject a bad mode up front rather than after opening every file.
        print("please enter a valid mode")
        return

    handles = []
    try:
        for i in range(file_num):
            handles.append(open("%s%s.txt" % (filename, i), "r"))

        # heads[i] is the entry currently at the front of partial file i;
        # exhausted (or empty) files simply have no key here.
        heads = {}
        for i, fh in enumerate(handles):
            head = _read_next_entry(fh)
            if head is not None:
                heads[i] = head

        while heads:
            token = min(tok for tok, _ in heads.values())
            merged = []
            # Iterate over a snapshot of the keys: entries are deleted from
            # `heads` inside the loop, and mutating the collection being
            # iterated would skip the file that follows an exhausted one.
            for i in sorted(heads):
                tok, postings = heads[i]
                if tok != token:
                    continue
                merged.extend(postings)
                # Advance this file to its next entry.
                following = _read_next_entry(handles[i])
                if following is None:
                    del heads[i]
                else:
                    heads[i] = following
            writer(tf_idf_score({token: merged}))
    finally:
        # Always release the file handles, including on errors.
        for fh in handles:
            fh.close()
if __name__ == "__main__":
    # Build the three full indexes from their partial index files.
    # NOTE: the writers open their output files in append mode, so output
    # left over from a previous run should be removed before re-running,
    # otherwise the new entries are appended after the old ones.
    merge("inverted_biword_index_file/inverted_biword_index_", "biword")
    merge("inverted_index_file/inverted_index_", "reg")
    merge("inverted_triword_index_file/inverted_triword_index_", "triword")