# inv_ind.py
# Author: Rohan Choudhari
# Last Updated: 4/11/2018
# NOTES:
# 1. Handle the case where none of the query terms appear in the indexed documents
# 2. Handle the case where the query vector is 0 because a term appears in all the docs (idf = 0)
import glob
import math
import pickle
class inverted_index:
    # Getting the total number of docs
    total_docs = 0
    target_folder = 'Crawled_preprocessed/*.txt'
    #target_folder = 'Test_Preprocessed/*.txt'
    for filename in glob.glob(target_folder):
        total_docs = total_docs + 1
    print('Folder Location: ', target_folder)
    print('Total # of Docs: ', repr(total_docs))
    print('Creating the Dictionary')
    # *************************** CREATING THE DICTIONARY ***************************
    term = []
    term_data = []
    filenames = []
    # Reading the files
    doc_no = 0
    #for filename in glob.glob('Test_Preprocessed/*.txt'):
    for filename in glob.glob(target_folder):
        with open(filename, 'r') as f:
            filenames.append(filename)
            doc_no = doc_no + 1
            # Reading the file word by word
            for line in f:
                for word in line.split():
                    # For each word, checking if an entry for it already exists
                    # Word exists in 'term'
                    if word in term:
                        pos = term.index(word)  # position of the term in the list
                        word_data = term_data[pos]  # entry for that word
                        term_freq = word_data[2] + 1  # word_data[2] is the term frequency
                        word_data[2] = term_freq  # updating the term frequency
                        posting_list = word_data[3]  # word_data[3] is the posting list, a list of lists
                        last_posting_item = posting_list[-1]  # last element in the posting list
                        # Checking if we found another instance of the word in the same document
                        if last_posting_item[0] == doc_no:  # last_posting_item[0] is the doc_no
                            # No new posting item needed; just increment its frequency
                            last_posting_item[1] = last_posting_item[1] + 1
                        else:
                            # New document containing a previously recorded word, so incrementing doc_freq
                            word_data[1] = word_data[1] + 1
                            # A new document containing the word needs its own posting-list entry
                            posting_list.append([doc_no, 1])
                        term_data[pos] = word_data
                    # Word doesn't yet exist in 'term'
                    else:
                        # posting_list is made of one or more posting items of the form [doc_no, frequency]
                        posting_list = [[doc_no, 1]]
                        node = [word, 1, 1, posting_list]  # [word, doc_freq, term_freq, posting_list]
                        # Adding the entry for that word in term and term_data
                        term.append(word)
                        term_data.append(node)
    # Saving all the filenames in filenames.p
    pickle.dump(filenames, open("filenames.p", "wb"))
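    # Illustration (added for clarity; the values are hypothetical): after indexing,
    # each term_data node has the shape [word, doc_freq, term_freq, posting_list].
    # For a word occurring twice in doc 1 and once in doc 3, the node would be:
    #     ['apple', 2, 3, [[1, 2], [3, 1]]]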
    # *************************** CREATING THE IDF TABLE ***************************
    print('Creating the IDF Table')
    dict_posting_list = sorted(term_data, key=lambda x: x[0])  # sorting term_data alphabetically by word
    #dict_posting_list = term_data
    idf_table = []
    for node in dict_posting_list:
        if type(node) is list:
            # Calculating the idf
            df = node[1]
            frac = total_docs / df
            idf = math.log10(frac)
            # Appending the word and the idf
            idf_table_row = []
            idf_table_row.append(node[0])
            idf_table_row.append(idf)
            # weight_table holds one weight per document: 0 where the term is
            # absent, tf*idf where it occurs
            weight_table = []
            for x in range(total_docs):
                weight_table.append(0)  # initializing all positions to 0
            posting_list = node[3]  # getting the posting list for that word
            for posting_list_element in posting_list:
                pos = posting_list_element[0]
                tf = posting_list_element[1]
                weight_table[pos - 1] = tf * idf  # storing the tf*idf weight wherever the term occurs
            idf_table_row.append(weight_table)
            idf_table.append(idf_table_row)
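    # Worked example (added; the numbers are hypothetical): with total_docs = 100
    # and a term appearing in df = 10 documents, idf = log10(100/10) = 1.0; if that
    # term occurs tf = 3 times in some document, its weight there is 3 * 1.0 = 3.0.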
    # *************************** CALCULATING DOCUMENT VECTORS ***************************
    print('Calculating Document Vectors')
    # Declaring and initializing the document_vectors list
    document_vectors = []
    for i in range(total_docs):
        document_vectors.append(0)
    for node in idf_table:
        weight_table = node[2]  # getting the weight_table for each word
        # Accumulating the sum of squares of each weight
        for i in range(total_docs):
            weight_per_doc = weight_table[i]
            document_vectors[i] = document_vectors[i] + weight_per_doc * weight_per_doc
    # Taking the square root of the sum of squared weights for each document
    # (Psst: getting the length of the vector)
    for i in range(len(document_vectors)):
        document_vectors[i] = math.sqrt(document_vectors[i])
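    # In formula terms (added for clarity): for document d with weights w_{t,d},
    # document_vectors[d] = sqrt(sum over all terms t of w_{t,d}^2), i.e. the
    # Euclidean length used below to normalize each document's weights.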
    # *************************** CALCULATING NORMALIZED WEIGHTS ***************************
    print('Calculating Normalized Weights')
    normalized_idf_table = []
    # Dividing the weights by the length of the document vector to get the normalized values
    for row in idf_table:
        weight_table = row[2]
        normalized_weight_table = []
        for i in range(len(weight_table)):
            # Guarding against a zero-length vector (e.g., an empty document)
            if document_vectors[i] != 0:
                normalized_weight = weight_table[i] / document_vectors[i]
            else:
                normalized_weight = 0
            normalized_weight_table.append(normalized_weight)
        normalized_row = []
        normalized_row.append(row[0])
        normalized_row.append(row[1])
        normalized_row.append(normalized_weight_table)
        normalized_idf_table.append(normalized_row)
    pickle.dump(normalized_idf_table, open("normalized_idf.p", "wb"))
    test_normalized_idf_table = pickle.load(open("normalized_idf.p", "rb"))
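
# --- Usage sketch (added; a minimal illustration, not the author's query code) ---
# Assuming the pickles written above exist, this shows how the normalized table
# could rank documents for a single-word query by cosine similarity. Because each
# document's weights are unit-normalized, the cosine score for a one-term query
# reduces to the document's normalized weight for that term. 'query_term' is a
# hypothetical input.
if __name__ == '__main__':
    table = pickle.load(open("normalized_idf.p", "rb"))
    names = pickle.load(open("filenames.p", "rb"))
    query_term = 'example'  # hypothetical query term
    scores = [0.0] * len(names)
    for word, idf, weights in table:
        if word == query_term:
            for i, w in enumerate(weights):
                scores[i] += w
    # Documents with the highest scores are the best matches
    for score, name in sorted(zip(scores, names), reverse=True)[:5]:
        print(round(score, 4), name)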