-
Notifications
You must be signed in to change notification settings - Fork 1
/
BSBIndex.py
67 lines (52 loc) · 1.77 KB
/
BSBIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import logging
from collections import Counter
from itertools import groupby
from operator import itemgetter

from CACMIndex import CACMIndex
from helpers import textProcessing
# Configure the root logger once at import time: every record is rendered as
# "<timestamp> - <level> : <message>" and records below INFO are dropped.
logging.basicConfig(format='%(asctime)s - %(levelname)s : %(message)s', level=logging.INFO)
def get_key(item):
    """
    Return the leading element of ``item``.

    Intended as a ``key=`` callable for sorting or grouping records whose
    first field is the grouping key (for example term_id-first tuples).

    :param item: any indexable object with at least one element
    :return: ``item[0]``
    :raises IndexError: if ``item`` is an empty sequence
    """
    return item[0]
class BSBIndex:
    """
    Build a Block Sort-based index.

    The index maps each term_id to the list of ``(doc_id, frequency)``
    postings of the documents that contain the term.

    :param collection: collection identifier handed to the text processor
    :param terms: mapping from term to term_id
    :param documents: mapping from doc_id to the document to be processed
    """

    def __init__(self, collection, terms, documents):
        self.collection = collection
        self.terms = terms
        self.documents = documents
        # term_id -> [(doc_id, frequency), ...]; filled by invert()
        self.index = {}
        # flat list of (term_id, (doc_id, frequency)); filled by parse_documents()
        self.parsed = []

    def build(self):
        """
        Build the index from scratch: parse every document, then invert.

        State left over from a previous run is discarded first, so calling
        build() again does not duplicate postings.
        """
        self.parsed = []
        self.index = {}
        self.parse_documents()
        self.invert()

    def parse_documents(self):
        """
        Parse each document into (term_id, (doc_id, frequency)) pairs.

        One pair is appended to ``self.parsed`` per distinct term of each
        document; ``frequency`` is the number of occurrences of that term
        in the processed document.

        :raises KeyError: if a processed term is missing from ``self.terms``
        """
        for doc_id, doc in self.documents.items():
            # A fresh processor per document, as before. Hoisting it out of
            # the loop is only safe if TextProcessor keeps no per-document
            # state -- TODO confirm before changing.
            processor = textProcessing.TextProcessor(self.collection)
            doc_terms = processor.process(doc)[3]  # vocabulary_full
            for term, frequency in Counter(doc_terms).items():
                self.parsed.append((self.terms[term], (doc_id, frequency)))

    def invert(self):
        """
        Sort the parsed pairs by term_id and group them into postings lists.

        ``self.parsed`` is left untouched, so the method can be called again.
        The sort is stable, so postings of a term keep the order in which the
        documents were parsed.
        """
        by_term_id = itemgetter(0)
        # groupby only merges adjacent items, hence the sort on the same key.
        ordered = sorted(self.parsed, key=by_term_id)
        for term_id, pairs in groupby(ordered, key=by_term_id):
            self.index[term_id] = [pair[1] for pair in pairs]

    def get_index(self):
        """Return the term_id -> postings mapping."""
        return self.index

    def get_terms(self):
        """Return the term -> term_id mapping given at construction."""
        return self.terms

    def get_documents(self):
        """Return the doc_id -> document mapping given at construction."""
        return self.documents
if __name__ == "__main__":
    # Bind the instance to its own name: reusing "CACMIndex" would overwrite
    # the imported class at module level.
    cacm_index = CACMIndex()
    cacm_index.build()
    # Feed the term and document dictionaries of the CACM index into the
    # block sort-based index, build it and dump the result.
    index = BSBIndex('CACM', cacm_index.get_term_dict(), cacm_index.get_document_dict())
    index.build()
    print(index.get_index())