-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
executable file
·56 lines (52 loc) · 1.54 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from bs4 import BeautifulSoup
import requests
import re
import xlwt
def writeExcel(x, keywords, sheet):
    """Write one column of keyword counts into the worksheet.

    The counts are written in alphabetical order of their keyword, starting
    at row 1 (row 0 is left free for a column header).

    Args:
        x: Zero-based column index to write into.
        keywords: Mapping of keyword -> count.
        sheet: Worksheet-like object exposing ``write(row, col, value)``.
    """
    # Sorting the keys keeps every column in the same row order, regardless
    # of the insertion order of the mapping passed in.
    for row, word in enumerate(sorted(keywords), start=1):
        sheet.write(row, x, keywords[word])
def _read_nonblank_lines(path, lower=False, encoding="utf-8"):
    """Return the stripped, non-empty lines of *path*, optionally lower-cased."""
    entries = []
    with open(path, encoding=encoding) as f:
        for line in f:
            # strip() also removes a stray "\r" and surrounding spaces, which
            # rstrip("\n") alone would leave attached to the entry.
            line = line.strip()
            if not line:
                # Blank lines (typically a trailing newline) are not entries.
                continue
            entries.append(line.lower() if lower else line)
    return entries


def readFiles(keywords_path="keywords.txt", urls_path="urllist.txt"):
    """Load the keyword list and the URL list from two text files.

    Each file is expected to hold one entry per line. Surrounding whitespace
    is removed and blank lines are skipped. Keywords are lower-cased; URLs
    keep their original case.

    Args:
        keywords_path: Path of the keyword file (default ``keywords.txt``).
        urls_path: Path of the URL file (default ``urllist.txt``).

    Returns:
        A ``(wordlist, urllist)`` tuple of two lists of strings, each in
        file order.

    Raises:
        OSError: If either file cannot be opened.
    """
    print("reading text files...")
    wordlist = _read_nonblank_lines(keywords_path, lower=True)
    urllist = _read_nonblank_lines(urls_path)
    return wordlist, urllist
# Script body: build a keyword-frequency table with one row per keyword and
# one column per URL, then save it as an .xls workbook.
wordlist, urllist = readFiles()
wb = xlwt.Workbook()
sheet = wb.add_sheet("frequency_table")

# Row labels go in column 0. They are de-duplicated so that they line up with
# the per-URL counts, which are written from the (unique) keys of a dict.
for y, word in enumerate(sorted(set(wordlist)), start=1):
    sheet.write(y, 0, word)

# Column headers go in row 0, one per URL, starting at column 1.
for x, url in enumerate(urllist, start=1):
    sheet.write(0, x, url)

# Loop-invariant request settings. The User-Agent mimics a desktop browser.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.90 Safari/537.36'}
timeout_seconds = 30  # without a timeout, requests.get may block indefinitely

for count, url in enumerate(urllist, start=1):
    print("reading url No.", count, "...")
    try:
        req = requests.get(url, headers=headers, timeout=timeout_seconds)
    except requests.RequestException as err:
        # Covers connection errors, timeouts and malformed URLs. Skip this
        # URL (its column stays empty) instead of aborting the whole run and
        # losing the results gathered so far.
        print("could not fetch", url, "-", err)
        continue

    keywords = dict.fromkeys(wordlist, 0)
    soup = BeautifulSoup(req.text, "html.parser")
    rawtext = soup.get_text()
    # The text is split on runs of non-word characters, so only single-token
    # keywords can match; a keyword containing a space is never counted.
    for word in re.split(r'\W+', rawtext):
        word = word.lower()
        if word in keywords:
            keywords[word] += 1
    writeExcel(count, keywords, sheet)

wb.save("frequency_table.xls")
print("All done! Output can be found in frequency_table.xls.")