from collections import defaultdict
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
import hashlib

ps = PorterStemmer()

class DataFetcher():
    """Fetches words from HTML and builds dictionaries
    to store the resulting information.
    """

    def __init__(self, html):
        self.html = html
        self.word_dict = defaultdict(int)
        self.biword_dict = defaultdict(int)
        self.triword_dict = defaultdict(int)
        self.position_dict = defaultdict(list)
        self.checksum = 0
        self.fetch()

    def fetch(self):
        """Parse the HTML and store the resulting words in:
        word_dict, formatted as "word": count
        position_dict, formatted as "word": [list of positions]
        """
        # ===== beautiful soup =====
        important = []
        soup = BeautifulSoup(self.html, 'html.parser')
        # drop non-visible content before extracting the text
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        # checksum of the visible text, used to detect duplicate pages
        self.checksum = hashlib.md5(text.encode('utf-8')).hexdigest()
        text = text.split(" ")
        important = self.get_important_words(soup, important)
        # =========================
        # == building the one-, two-, and three-word index dicts ==
        position = 0
        for line in text:
            if line != '\n':
                line = self._decode_line(line.lower())
                valid_line = []
                for word in line.split():
                    if self._is_valid_word(word) and ps.stem(word).isalnum():
                        valid_line.append(ps.stem(word))
                for w in range(len(valid_line)):
                    word = valid_line[w]
                    self.word_dict[word] += 1
                    # record the word's ordinal position in the page
                    self.position_dict[word].append(position)
                    position += 1
                    # words that also appear in the title or headings
                    # get their counts doubled
                    if word in important:
                        self.word_dict[word] *= 2
                    # biword index
                    if w < len(valid_line) - 1:
                        next_word = valid_line[w + 1]
                        biword = word + " " + next_word
                        self.biword_dict[biword] += 1
                        if word in important:
                            self.biword_dict[biword] *= 2
                        if next_word in important:
                            self.biword_dict[biword] *= 2
                    # tri-word index
                    if w < len(valid_line) - 2:
                        next_word = valid_line[w + 1]
                        next_next_word = valid_line[w + 2]
                        tri_word = word + " " + next_word + " " + next_next_word
                        self.triword_dict[tri_word] += 1
                        if word in important:
                            self.triword_dict[tri_word] *= 2
                        if next_word in important:
                            self.triword_dict[tri_word] *= 2
                        if next_next_word in important:
                            self.triword_dict[tri_word] *= 2
        # ============================================

    def get_important_words(self, soup, important):
        """Collect stemmed words from the page title and from
        heading and bold tags.
        """
        try:
            for word in soup.title.string.split():
                word = word.lower()
                if ps.stem(word) not in important and self._is_valid_word(word):
                    important.append(ps.stem(word))
        except AttributeError:
            pass
        for tags in soup.find_all(["h1", "h2", "h3", "b"]):
            try:
                for word in tags.string.split():
                    word = word.lower()
                    if ps.stem(word) not in important and self._is_valid_word(word):
                        important.append(ps.stem(word))
            except AttributeError:
                pass
        return important

    def get_words(self):
        """Return all the words in this page as a dict."""
        return self.word_dict

    def get_biwords(self):
        """Return all the word pairs in this page as a dict."""
        return self.biword_dict

    def get_triwords(self):
        """Return all the word triples in this page as a dict."""
        return self.triword_dict

    def get_position(self):
        """Return the position information of all words as a dict."""
        return self.position_dict

    def get_checksum(self):
        """Return the checksum value."""
        return self.checksum

    def _decode_line(self, line):
        """Replace non-ASCII characters and non-letter printable
        ASCII (punctuation such as #$%^, digits, symbols) with spaces.
        """
        for c in line:
            if not c.isascii():
                line = line.replace(c, " ")
            elif 32 < ord(c) < 65 or 90 < ord(c) < 97 or 122 < ord(c) < 127:
                # everything printable that is not a letter or a space
                line = line.replace(c, " ")
        return line

    def _is_valid_word(self, word):
        """A single-letter word is valid only if it is "i" or "a"."""
        one_letter = ["i", "a"]
        if len(word) == 1 and word not in one_letter:
            return False
        return True
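

# A minimal usage sketch, assuming bs4 and nltk are installed; the sample
# HTML below is illustrative only, not taken from any real crawl.
if __name__ == "__main__":
    sample_html = """
    <html>
      <head><title>Search Engines</title></head>
      <body>
        <h1>Inverted Index</h1>
        <p>A search engine builds an inverted index of words.</p>
        <script>console.log('ignored');</script>
      </body>
    </html>
    """
    fetcher = DataFetcher(sample_html)
    # stems from the <title> and <h1> (e.g. "search", "invert") are
    # counted with extra weight; the <script> content is dropped
    print(fetcher.get_words())
    print(fetcher.get_biwords())
    print(fetcher.get_position())
    print(fetcher.get_checksum())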