-
Notifications
You must be signed in to change notification settings - Fork 1
/
CACMIndex.py
88 lines (72 loc) · 2.43 KB
/
CACMIndex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import logging
import time
from config import CACM_path
from helpers import indexBuilder, CACMParser
from frequencyRankGraph import *
logging.basicConfig(format='%(asctime)s - %(levelname)s : %(message)s', level=logging.INFO)
class CACMIndex:
    """
    Build the CACM collection's index:
    - parse the cacm.all file
    - build index

    ``self.index`` stays ``None`` until ``build()`` has been called; every
    accessor except ``get_document_dict`` delegates to that index and will
    therefore fail if it is used before ``build()``.
    """

    def __init__(self):
        self.parser = CACMParser.CACMParser()
        # Set by build(); None means "not built yet".
        self.index = None

    def build(self, half=False):
        """
        Parse the collection and (re)build the index from its text.

        Each call replaces any previously built index.
        :param half: bool, forwarded to get_document_dict
        """
        data = self.get_document_dict(half)
        # The index builder receives a single string: all parsed document
        # values joined with single spaces.
        all_words = " ".join(data.values())
        self.index = indexBuilder.IndexBuilder('CACM', all_words)
        self.index.build()

    def get_tokens(self):
        """
        Print the tokens reported by the index (nothing is returned).
        """
        print(self.index.get_tokens())

    def get_vocabulary(self):
        """
        Print the vocabulary reported by the index (nothing is returned).
        """
        print(self.index.get_vocabulary())

    def get_size(self):
        """
        Return the size reported by the index.

        NOTE(review): the script at the bottom of this file unpacks the
        result into two values (tokens count, vocabulary count) - confirm
        against IndexBuilder.get_size.
        :return: whatever IndexBuilder.get_size returns
        """
        return self.index.get_size()

    def get_term_dict(self):
        """
        Build the term dictionary (term, termID)

        Terms are taken from the index vocabulary in sorted order and
        numbered consecutively starting at 1.
        :return: terms dictionary
        """
        vocabulary = sorted(self.index.get_vocabulary())
        return {term: term_id
                for term_id, term in enumerate(vocabulary, start=1)}

    def get_document_dict(self, half=False):
        """
        Build the document dictionary (documentID, document)

        Reads the whole ``cacm.all`` file located under ``CACM_path`` and
        hands its text to the parser.
        :param half: bool, when True only the first half of the file's
                     characters is parsed
        :return: document dictionary
        """
        with open(CACM_path + "/cacm.all") as f:
            read_data = f.read()
        if half:
            # Cut on a character count, not on a document boundary.
            read_data = read_data[:len(read_data)//2]
        dic = self.parser.parse_documents(read_data)
        data = self.parser.parse_all(dic)
        return data

    def get_freq(self):
        """
        Return the frequency data reported by the index.
        :return: whatever IndexBuilder.get_freq returns
        """
        return self.index.get_freq()
if __name__ == "__main__":
    # Script entry point: index the CACM collection, fetch the index size
    # for the full text and for half of it, then draw the frequency/rank
    # graphs.
    cacm_index = CACMIndex()

    # Size of the index built from the full text.
    cacm_index.build()
    nb_tokens, nb_vocab = cacm_index.get_size()

    # Size of the index built from half of the text.
    print()
    print("For half of the text:")
    cacm_index.build(half=True)
    nb_tokens_half, nb_vocab_half = cacm_index.get_size()

    # Frequency graphs. The index was last rebuilt with half=True, so the
    # frequencies plotted here come from the half-text index.
    freq_graph = FrequencyRankGraph(cacm_index.get_freq())
    freq_graph.draw_graph()
    freq_graph.draw_log_graph()