website_crawler.py
#!/usr/bin/env python
# coding=utf-8
# Python 2 crawler: BeautifulSoup 3 extracts links, html2text saves pages as plain text.
import os
import threading
import urllib
import html2text
from BeautifulSoup import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 hack so utf-8 text can be written without explicit encoding

g_mutex = threading.Condition()  # guards the shared lists below
g_pages = []      # downloaded page contents, parsed for new URL links
g_queueURL = []   # URLs waiting to be crawled
g_existURL = []   # URLs already crawled
g_failedURL = []  # URLs that failed to download
g_totalcount = 0  # number of pages downloaded
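# Crawler drives the breadth-first crawl: it keeps the per-depth URL queue,
# starts up to `threadnum` CrawlerThread workers per batch, and logs the
# queue for each depth to log.txt.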
class Crawler:
    def __init__(self, crawlername, domain, threadnum):
        self.crawlername = crawlername
        self.url = 'http://' + domain
        print self.url
        self.domain = domain
        self.threadnum = threadnum
        self.threadpool = []
        self.logfile = open("log.txt", 'w')
    def craw(self):
        global g_queueURL
        g_queueURL.append(self.url)
        depth = 0
        print self.crawlername + " starting..."
        while len(g_queueURL) != 0:
            depth += 1
            print 'Searching depth', depth, '...\n\n'
            self.logfile.write("URL:" + g_queueURL[0] + "........")
            self.downloadAll()
            self.updateQueueURL()
            content = '\n>>>Depth ' + str(depth) + ':\n'
            self.logfile.write(content)
            i = 0
            while i < len(g_queueURL):
                content = str(g_totalcount + i) + '->' + g_queueURL[i] + '\n'
                self.logfile.write(content)
                i += 1
    def downloadAll(self):
        # Download the queued URLs in batches of at most `threadnum` threads.
        global g_queueURL
        global g_totalcount
        i = 0
        while i < len(g_queueURL):
            j = 0
            while j < self.threadnum and i + j < len(g_queueURL):
                g_totalcount += 1
                print g_queueURL[i + j]
                threadresult = self.download(g_queueURL[i + j], str(g_totalcount) + '.txt', j)
                if threadresult is not None:
                    print 'Thread started:', i + j, '--File number =', g_totalcount
                j += 1
            i += j
            for thread in self.threadpool:
                thread.join(30)
            self.threadpool = []  # reset the pool for the next batch (was `threadpool = []`, which had no effect)
        g_queueURL = []
    def download(self, url, filename, tid):
        crawthread = CrawlerThread(url, filename, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
        return crawthread  # return the thread so downloadAll can report it (the original returned None)
    def updateQueueURL(self):
        # Rebuild the queue from the pages downloaded in this batch,
        # dropping anything that has already been crawled.
        global g_queueURL
        global g_existURL
        global g_pages
        newUrlList = []
        for content in g_pages:
            newUrlList += self.getUrl(content)
        g_pages = []  # clear so the next depth does not re-parse old pages
        g_queueURL = list(set(newUrlList) - set(g_existURL))
    def getUrl(self, content):
        try:
            soup = BeautifulSoup(content)
            links = []
            for link in soup.findAll('a'):
                href = link.get('href')  # .get avoids a KeyError on anchors without href
                if href is None:
                    continue
                if self.domain in href:
                    links.append(href)
                elif len(href) > 10 and 'http://' not in href:
                    # relative link: prepend the site root
                    links.append(self.url + href)
            return links
        except Exception, e:
            print 'getUrl exception:', e
            return []
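# CrawlerThread fetches a single URL, converts the HTML to plain text with
# html2text and writes it to bagdax/<n>.txt; results and failures are
# recorded in the shared globals under g_mutex.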
class CrawlerThread(threading.Thread):
    def __init__(self, url, filename, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.tid = tid
    def run(self):
        global g_mutex
        global g_pages
        global g_existURL
        global g_failedURL
        try:
            page = urllib.urlopen(self.url)
            html = page.read()
            h = html2text.HTML2Text()
            h.ignore_links = True
            text = h.handle(html.decode('utf-8'))  # convert HTML to plain text (the original overwrote h and ignored ignore_links)
            fout = open('bagdax/' + self.filename, 'w')
            fout.write(text)
            fout.close()
        except Exception, e:
            g_mutex.acquire()
            g_existURL.append(self.url)
            g_failedURL.append(self.url)
            g_mutex.release()
            print 'Failed downloading and saving', self.url
            print e
            return None
        g_mutex.acquire()
        g_pages.append(html)
        g_existURL.append(self.url)
        g_mutex.release()
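# Entry point: crawls bagdax.cn with 30 worker threads. The original
# interactive raw_input prompts are kept as comments beside the hard-coded values.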
if __name__ == "__main__":
    domain = 'bagdax.cn'   # raw_input("Enter the start URL:\n")
    threadnum = 30         # int(raw_input("Number of threads: "))
    crawlername = "azim crawler"
    if not os.path.exists('bagdax'):
        os.makedirs('bagdax')  # output directory used by CrawlerThread
    crawler = Crawler(crawlername, domain, threadnum)
    crawler.craw()