forked from xm9304/Taobao_spider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
taobao_selenium.py
125 lines (111 loc) · 5.36 KB
/
taobao_selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from selenium import webdriver
from loguru import logger
import time
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, WebDriverException
from lxml import etree
from retrying import retry
from selenium.webdriver import ActionChains
class taobao(object):
    """Selenium-driven Taobao crawler.

    Opens a Firefox session, logs in with a username/password, searches for
    a product keyword and logs title, sales, price, shop name and location
    for every item on every result page.
    """

    # XPath of the captcha slider knob.
    _SLIDER_XPATH = "//span[contains(@class, 'btn_slide')]"
    # Horizontal distance, in pixels, the slider knob is dragged.
    _SLIDER_OFFSET = 258

    def __init__(self):
        """Start Firefox and set the crawler defaults."""
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(5)
        self.domain = 'http://www.taobao.com'
        # Kept so existing callers that read this attribute keep working.
        # The class itself builds a fresh chain per gesture, because an
        # ActionChains object keeps its queued actions after perform() and
        # would replay them on the next perform().
        self.action_chains = ActionChains(self.browser)

    @staticmethod
    def handleNone(x):
        """Return ``x`` when it is truthy, otherwise a single-space placeholder.

        Used to normalise fields that were scraped as empty.
        """
        return x if x else ' '

    def _drag_slider(self, slider):
        """Drag the captcha slider knob to the right, then release the button."""
        ActionChains(self.browser).drag_and_drop_by_offset(
            slider, self._SLIDER_OFFSET, 0).perform()
        time.sleep(0.5)
        ActionChains(self.browser).release().perform()

    def login(self, username, password):
        """Log in with ``username`` / ``password``, retrying until it succeeds.

        Each attempt reloads the home page, fills in the login form, drags
        the captcha slider when one is displayed, and submits. The attempt
        counts as successful when ``get_nickname`` returns a non-empty
        string; otherwise the method waits 5 seconds and tries again.
        """
        while True:
            self.browser.get(self.domain)
            time.sleep(1)
            self.browser.find_element_by_class_name('h').click()
            self.browser.find_element_by_id('fm-login-id').send_keys(username)
            self.browser.find_element_by_id('fm-login-password').send_keys(password)
            time.sleep(2)
            try:
                # A captcha may appear; solve it by sliding the knob.
                slider = self.browser.find_element_by_xpath(self._SLIDER_XPATH)
                if slider.is_displayed():
                    self._drag_slider(slider)
            except (NoSuchElementException, WebDriverException):
                logger.info('未出现登录验证码')
            self.browser.find_element_by_class_name('password-login').click()
            nickname = self.get_nickname()
            if nickname:
                logger.info('登录成功,呢称为:' + nickname)
                break
            logger.debug('登录出错,5s后继续登录')
            time.sleep(5)

    def get_product(self, product_name):
        """Search the site for ``product_name`` and crawl all result pages."""
        self.browser.get(self.domain)
        self.browser.find_element_by_class_name('search-combobox-input').send_keys(product_name)
        self.browser.find_element_by_xpath(
            "(//button[contains(@class, 'submit')]|//button[contains(@class,'btn-search')])").click()
        # Give the result page time to load.
        time.sleep(1)
        self.get_product_detail()

    # NOTE: a @retry(stop_max_attempt_number=3, wait_fixed=1000) decorator was
    # disabled here; the method loops on errors by itself instead.
    def get_product_detail(self):
        """Log every item of the current result page, then walk to the next one.

        Loops until the "next page" control carries the ``next-disabled``
        class. When the click on "next" is intercepted, the captcha slider
        is dragged and the page is processed again. Any other exception is
        logged, the page is refreshed and the loop continues.
        """
        while True:
            try:
                # Scroll down step by step before reading the page source.
                self.drop_down()
                selector = etree.HTML(self.browser.page_source)
                page = ''.join(selector.xpath("//li[@class='item active']//text()")).strip('\n ')
                items = selector.xpath("//div[@id='mainsrp-itemlist']/div[contains(@class,'m-itemlist')]"
                                       "/div[contains(@class,'grid g-clearfix')]/div[contains(@class,'items')]"
                                       "/div[@class='item J_MouserOnverReq ']")
                for item in items:
                    price = self.handleNone(
                        ''.join(item.xpath(".//div[contains(@class, 'price')]//text()"))).strip()
                    # handleNone turns an empty result list into ' ', so [0]
                    # is safe whether or not the node was found.
                    sales = self.handleNone(
                        item.xpath(".//div[@class='deal-cnt']//text()"))[0].replace('人付款', '')
                    title = self.handleNone(
                        ''.join(item.xpath(".//div[contains(@class,'row-2')]//text()"))).strip('\n ')
                    shop_name = self.handleNone(
                        ''.join(item.xpath(".//div[contains(@class, 'shop')]//text()"))).strip()
                    location = self.handleNone(
                        ''.join(item.xpath(".//div[@class='location']//text()")))
                    logger.info(f"标题:{title}|销量:{sales}|价格:{price}|店名:{shop_name}|商铺地址:{location}")
                logger.info(f'抓取第{page}页完成')
                # Move on to the next page, or stop on the last one.
                next_button = self.browser.find_element_by_xpath("//li[contains(@class, 'item next')]")
                if 'next-disabled' in next_button.get_attribute('class'):
                    logger.info('没有下一页,抓取完成')
                    break
                next_button.click()
            except ElementClickInterceptedException:
                # The click was blocked: a slider captcha is expected here.
                slider = self.browser.find_element_by_xpath(self._SLIDER_XPATH)
                self._drag_slider(slider)
            except Exception as e:
                # Format the exception into the message; concatenating it to
                # a str would raise TypeError inside this handler.
                logger.error(f'出现未知错误:{e}')
                self.browser.refresh()
                time.sleep(1)

    def drop_down(self):
        """Scroll the page down in eight steps (10% .. 80% of its height) via JS."""
        for step in range(1, 9):
            time.sleep(0.3)
            fraction = step / 10
            js = f"document.documentElement.scrollTop = document.documentElement.scrollHeight * {fraction}"
            self.browser.execute_script(js)
        # Pause afterwards: going too fast tends to trigger the captcha.
        time.sleep(2)

    def get_nickname(self):
        """Return the text of the ``site-nav-user`` element, or '' if it is absent."""
        self.browser.get(self.domain)
        time.sleep(0.5)
        try:
            return self.browser.find_element_by_class_name('site-nav-user').text
        except NoSuchElementException:
            return ''
if __name__ == '__main__':
    # Fill in your own account name and password.
    username, password = 'username', 'password'
    # Search keyword; change it to whatever product you want to crawl.
    product_name = '零食'

    tb = taobao()
    tb.login(username, password)
    tb.get_product(product_name)