tiki_2.0.py
import requests
import bs4
import time
import multiprocessing
import multiprocessing.pool
import re
import logging
import pymongo
class NoDaemonProcess(multiprocessing.Process):
    # Worker process that always reports itself as non-daemonic, so it is
    # allowed to spawn child processes of its own.
    def _get_daemon(self):
        return False

    def _set_daemon(self, value):
        pass

    daemon = property(_get_daemon, _set_daemon)


class MyPool(multiprocessing.pool.Pool):
    # Pool whose workers are NoDaemonProcess instances and may therefore open
    # nested pools.
    Process = NoDaemonProcess
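
# Why the custom pool: a standard multiprocessing.Pool marks its workers as
# daemon processes, and daemonic processes are not allowed to have children,
# so a worker that tries to open its own Pool fails with "daemonic processes
# are not allowed to have children". MyPool's non-daemonic workers let
# ItemScrapper.main() run scrapItemInfo in an outer pool while scrapItemInfo
# opens an inner multiprocessing.Pool for the items on each page.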

class DataPipeline:
    database_name = 'Tiki_2'
    client = pymongo.MongoClient('localhost', 27017)
    database = client[database_name]

    @classmethod
    def importData(cls, collection_name):
        try:
            collection = DataPipeline.database[collection_name]
            complete.logger.debug('Import data from collection "{}"'.format(collection_name))
            return collection.find()
        except Exception as e:
            error.logger.exception(e)

    @classmethod
    def exportData(cls, collection_name, data):
        try:
            collection = DataPipeline.database[collection_name]
            for x in data:
                # only insert documents whose 'link' is not already stored
                if collection.find_one({'link': {"$eq": x['link']}}) is None:
                    collection.insert_one(x)
            complete.logger.debug('Export data to collection "{}"'.format(collection_name))
        except Exception as e:
            error.logger.exception(e)

    @classmethod
    def update(cls, collection_name, key_name, key_value, field_name, field_value):
        try:
            collection = DataPipeline.database[collection_name]
            collection.update_one({key_name: key_value}, {'$set': {field_name: field_value}})
        except Exception as e:
            print(e)
            error.logger.exception(e)
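
# Usage sketch (assumes a MongoDB server on localhost:27017; the document
# values are hypothetical, not part of the original flow):
#
#   item = {'name': 'Kindle Paperwhite', 'price': '2.000.000', 'rating': '5',
#           'link': 'https://tiki.vn/some-item'}
#   DataPipeline.exportData('Item', [item])   # inserted once, deduplicated on 'link'
#   DataPipeline.update('Item', 'link', item['link'], 'price', '1.900.000')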

class Logger:
    def __init__(self, name, log_file, level=logging.INFO):
        self.name = name
        self.log_file = log_file
        self.level = level
        # silence urllib3's connection messages so they do not flood the log files
        urllib3_log = logging.getLogger("urllib3")
        urllib3_log.setLevel(logging.CRITICAL)
        formatter = logging.Formatter('%(levelname)s:%(asctime)s: %(message)s')
        handler = logging.FileHandler(self.log_file, encoding='utf-8')
        handler.setFormatter(formatter)
        self.logger = logging.getLogger(self.name)
        self.logger.setLevel(self.level)
        self.logger.addHandler(handler)

class ItemScrapper:
    '''
    Scrape flow:
    1. Crawl the display-menu page and collect the item URLs from it
    2. Open each item's source page and scrape the item info
    '''
    max_attemp = 5  # maximum number of retries per item

    @staticmethod
    def getItemInfo(url):
        '''
        get an item's information (name, price, rating)
        :param url: the item's source page
        :return: dict containing the item information (name, price, rating, url)
        '''
        attempt = 0
        while attempt < ItemScrapper.max_attemp:
            try:
                response = requests.get(url)
                soup = bs4.BeautifulSoup(response.text, 'lxml')
                # check for a redirect stub page
                try:
                    if int(response.headers['Content-Length']) < 1000:
                        # find the redirect URL inside the page's first <script> tag
                        error.logger.info('Redirect: ' + url)
                        regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
                        redirect_url = soup.find_all('script')[0].text
                        redirect_url = re.findall(regex, redirect_url)[0]
                        # re-request the redirect URL
                        response = requests.get(redirect_url)
                        soup = bs4.BeautifulSoup(response.text, 'lxml')
                except Exception:
                    pass
                name = soup.select_one('.icon-tikinow-26+ span').text
                price = soup.select_one('#span-price').text
                rating = soup.find("meta", attrs={"itemprop": "ratingValue"})['content']
                response.close()
                soup.decompose()
                complete.logger.info('Complete getting item: ' + name)
                return {'name': name, 'price': price, 'rating': rating, 'link': url}
            except Exception as e:
                error.logger.exception(e)
                attempt += 1
        missing.logger.info('Missing item: ' + url)
        return {'name': None, 'price': None, 'rating': None, 'link': url}
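
    # Illustration of the redirect-extraction step above (the script body is a
    # hypothetical example; the exact markup of Tiki's stub pages is an
    # assumption):
    #
    #   body = 'window.location.href = "https://tiki.vn/some-item?src=tree";'
    #   re.findall(regex, body)[0]  # -> 'https://tiki.vn/some-item?src=tree'
    #
    # i.e. the pattern grabs the first http(s) URL embedded in the stub page's
    # first <script> tag and the scraper simply re-requests it.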

    @staticmethod
    def getItemUrls(url):
        '''
        scrape the item URLs from a display-menu page
        :param url: page URL
        :return: item_url_list: list of item URLs
        '''
        try:
            response = requests.get(url)
            soup = bs4.BeautifulSoup(response.text, 'lxml')
            item_url_list = [x['href'] for x in soup.select('.product-box-list .product-item > a')]
            soup.decompose()
            response.close()
            return item_url_list
        except Exception as e:
            print(e)
            error.logger.exception(e)
            return []
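
    # Note: the CSS selectors used here and in getItemInfo ('.product-box-list
    # .product-item > a', '.icon-tikinow-26+ span', '#span-price') are tied to
    # Tiki's page markup at the time the script was written; if the site layout
    # changes, getItemUrls silently returns an empty list and getItemInfo falls
    # through to the missing.log path.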

    @staticmethod
    def scrapItemInfo(url):
        '''
        get the information of every item on a display-menu page
        :param url: the URL of a display-menu page
        :return: item_info: list of item information dicts
        '''
        print('Getting items from {}...'.format(url))
        url_list = ItemScrapper.getItemUrls(url)
        item_info = []
        try:
            # inner pool: one worker per item on the page (possible because the
            # outer pool's workers are non-daemonic MyPool processes)
            pool = multiprocessing.Pool()
            item_info = pool.map(ItemScrapper.getItemInfo, url_list)
            pool.close()
            pool.join()
            complete.logger.debug('Complete get page: ' + url)
        except Exception as e:
            print(e)
            error.logger.exception(e)
            missing.logger.debug('Missing page: ' + url)
        return item_info

    @staticmethod
    def main(url_list=None):
        '''
        crawl and scrape item information from a list of URLs
        :param url_list: list of URLs
        :return: result: item information
        '''
        if not url_list:
            url_list = [x for x in DataPipeline.importData('leaf-categories') if x['item-status']]
            last_page = []
            a = []
            for x in url_list:
                a.extend(
                    [{'link': '{}page={}'.format(x['link'][:-8], y), 'category_id': x['_id'], 'status': True}
                     for y in range(1, x['length'] + 1)])
                last_page.append({'link': '{}page={}'.format(x['link'][:-8], x['length']), 'category_id': x['_id']})
            for i in range(0, len(a), batch_num):
                print('batch: {}'.format(i // batch_num))
                batch = [x['link'] for x in a[i:i + batch_num]]
                try:
                    p = MyPool()
                    result = p.map(ItemScrapper.scrapItemInfo, batch)
                    for x in result:
                        DataPipeline.exportData('Item', x)
                except Exception as e:
                    error.logger.exception(e)
                # mark a category as done once its last page has been processed
                for x in batch:
                    try:
                        end_page = next(i for i in last_page if i['link'] == x)
                        DataPipeline.update('leaf-categories', '_id', end_page['category_id'], 'item-status', False)
                    except StopIteration:
                        pass
                p.close()
                p.join()
        else:
            result = []
            for i in range(0, len(url_list), batch_num):
                print('batch: {}'.format(i // batch_num))
                batch = url_list[i:i + batch_num]
                p = MyPool()
                result.append(p.map(ItemScrapper.scrapItemInfo, batch))
                p.close()
                p.join()
            return result
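
    # The batching above is resumable: importData('leaf-categories') only yields
    # categories whose 'item-status' is still True, every sub-page of those
    # categories is queued, and once the batch containing a category's final
    # page (last_page) has been processed, its 'item-status' is flipped to False
    # so a later run of main() skips it.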

class Category:
    @staticmethod
    def getSubCategory(data_tuple):
        '''
        get the sub-categories of a sub-category
        if the sub-category is a leaf, store its data (name, quantity, link) in the queue
        and return []
        else return sub_list
        :param data_tuple: (url, queue)
            url: sub-category page
            queue: data queue
        :return: sub_list: list of sub-category URLs
        '''
        url = data_tuple[0]
        queue = data_tuple[1]
        try:
            print('Getting {}...'.format(url))
            current_url = url
            response = requests.get(url, allow_redirects=False)
            # check for a redirect page
            if response.status_code != 200:
                current_url = 'https' + response.headers['Location'][4:]  # force the HTTPS protocol
                response = requests.get(url, allow_redirects=True)
            soup = bs4.BeautifulSoup(response.text, 'lxml')
            # get information
            sub_list = ['https://tiki.vn' + x['href'] for x in soup.select('.is-child a')]
            name = soup.select_one('h1').text.strip()
            quantity = int(soup.select_one('.filter-list-box h4').text.strip()[:-8])
            response.close()
            soup.decompose()
            # a category is a leaf when it has no children, or its first listed
            # child links back to the current page
            if len(sub_list) == 0 or sub_list[0] == current_url:
                queue.put({'name': name, 'quantity': quantity, 'link': url, 'status': True, 'item-status': True})
                return []
            else:
                return sub_list
        except Exception as e:
            error.logger.exception('Exception occurred at: ' + url)
            error.logger.exception(e)
            print(e)
            return []

    @staticmethod
    def traverseCategoryTree(url_list, queue):
        '''
        recursively crawl through the category tree and collect the leaves
        :param url_list: list of node URLs
        :param queue: data queue that stores the leaves
        :return:
        '''
        if len(url_list) != 0:
            pool = MyPool()
            result = pool.map(Category.getSubCategory, [(x, queue) for x in url_list])
            sub_categories = []
            for sub_category in result:
                sub_categories.extend(sub_category)
            pool.close()
            pool.join()
            Category.traverseCategoryTree(sub_categories, queue)
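
    # The traversal is breadth-first, one recursive call per tree level: every
    # node URL in the current level is expanded in parallel, leaves push their
    # data onto the shared queue and contribute no children, and the recursion
    # stops when a level produces no child categories at all.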

    @staticmethod
    def main(url_list=None):
        '''
        get the leaf categories of a page
        :param url_list: list of page URLs whose leaf categories are needed
        :return: leaf_category_list: information of the leaf categories
        '''
        # default to Tiki's main category menu
        if not url_list:
            try:
                url = 'https://tiki.vn'
                response = requests.get(url)
                soup = bs4.BeautifulSoup(response.text, 'lxml')
                url_list = [x['href'] for x in soup.select('.efuIbv')]
            except Exception as e:
                error.logger.error(e)
        queue = multiprocessing.Manager().Queue()
        Category.traverseCategoryTree(url_list, queue)
        leaf_category_list = []
        while not queue.empty():
            leaf_category_list.append(queue.get())
        complete.logger.debug('Complete getting leaf-categories')
        return leaf_category_list

class EndPage:
    @staticmethod
    def checkEnd(link):
        '''
        check whether the page still has the 'next page' button
        :param link: the page's URL
        :return: True if the page doesn't have the button
                 False if it has the button or something goes wrong
        '''
        try:
            res = requests.get(link)
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            # look for the 'next page' button
            if soup.find('a', class_="next") is None:
                return True
            else:
                return False
        except Exception as e:
            error.logger.exception(e)
            return False

    @staticmethod
    def getEndPage(url):
        '''
        find the last page of the category that still contains items,
        hopping through the pages with a shrinking step instead of walking them one by one
        :param url: the category URL sample
        :return: count: the number of the category's last item page
        '''
        if not url:
            return 0
        count = 0
        print('Getting sub category of {}...'.format(url))
        k = 100
        # find the last page that still contains items
        while True:
            new_count = count + k
            new_link = url[:-8] + 'page=' + str(new_count)
            end = EndPage.checkEnd(new_link)
            if end and k == 1:
                count = new_count
                break
            if end:
                # overshot the end: shrink the hop size
                k = k // 10
            else:
                count = new_count
        return count
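
    # Worked example of the hop search above (assuming pages past the end also
    # lack the 'next' button): for a category whose items span 137 pages, the
    # probes are 100, 200 (past the end, so k drops to 10), then 110, 120, 130,
    # 140 (past the end, k drops to 1), then 131 ... 137, and 137 is returned;
    # that is 13 requests instead of 137 when walking page by page.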

    @staticmethod
    def main(url=None):
        if not url:
            url = [x for x in DataPipeline.importData('leaf-categories') if x['status']]
            print(url)
            for i in range(0, len(url), batch_num):
                print('batch: {}'.format(i // batch_num))
                batch = [x['link'] if x['status'] else None for x in url[i:i + batch_num]]
                p = multiprocessing.Pool()
                result = p.map(EndPage.getEndPage, batch)
                for index, value in enumerate(batch):
                    if value:
                        DataPipeline.update('leaf-categories', 'link', value, 'length', result[index])
                        DataPipeline.update('leaf-categories', 'link', value, 'status', False)
                p.close()
                p.join()
        else:
            result = []
            for i in range(0, len(url), batch_num):
                print('batch: {}'.format(i // batch_num))
                batch = url[i:i + batch_num]
                p = multiprocessing.Pool()
                result.append(p.map(EndPage.getEndPage, batch))
                p.close()
                p.join()
            return result
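
    # Like ItemScrapper.main, this stage is resumable: only categories whose
    # 'status' flag is still True are probed, and once a category's page count
    # has been written to its 'length' field the flag is set to False so the
    # category is skipped on the next run.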

batch_num = 12  # number of pages handled per worker-pool batch

# module-level loggers (referenced by the classes above at call time)
error = Logger('error', 'error.log')
missing = Logger('missing', 'missing.log')
complete = Logger('complete', 'complete.log', logging.DEBUG)

if __name__ == "__main__":
    t1 = time.time()
    # sample category URLs; the call below ignores them and reads the
    # 'leaf-categories' collection instead
    url = [
        'https://tiki.vn/may-doc-sach/c28856?src=tree',
        'https://tiki.vn/may-tinh-bang/c1794?src=tree',
        'https://tiki.vn/dien-thoai-pho-thong/c1796?src=tree',
        'https://tiki.vn/dien-thoai-smartphone/c1795?src=tree',
        'https://tiki.vn/dien-thoai-smartphone/c1795?src=tree',
        'https://tiki.vn/noi/c891?src=tree',
        'https://tiki.vn/toeic/c896?src=tree',
        'https://tiki.vn/nghe/c890?src=tree'
    ]
    ItemScrapper.main()
    t2 = time.time()
    print(t2 - t1)
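
    # Presumed end-to-end run (a sketch, not shown in this file): the three
    # stages would be chained roughly as
    #
    #   DataPipeline.exportData('leaf-categories', Category.main())  # discover leaf categories
    #   EndPage.main()                                                # record each category's page count
    #   ItemScrapper.main()                                           # scrape the items page by page
    #
    # with only the last stage being executed here.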