Standardizing all actions (#26)
* Deleted attributes search

* Changed tests

* New test for RecaptchaSolver and Updated Structure of MockServer

* RecaptchaSolverResponse

* Black formatting

* Deprecation warning and RecaptchaSolverSpider (see the generic sketch after this list)

* Formatting

* Updated test for RecaptchaSolver

* Fixed RecaptchaSolverResponse

* Fix: DeprecationWarning

* Python version update in GitHub Actions
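
Two bullets above mention deprecation warnings. The relevant diff is not among the files shown below, so the following is a generic illustration only: the standard Python pattern for deprecating a class. The class name echoes the bullet and the warning message is invented.

```python
import warnings


class RecaptchaSolverSpider:
    """Hypothetical stand-in for the spider class named in the bullets above."""

    def __init__(self, *args, **kwargs):
        # warnings.warn with DeprecationWarning is the idiomatic way to flag
        # soon-to-be-removed API; stacklevel=2 points the report at the caller.
        warnings.warn(
            "RecaptchaSolverSpider is deprecated.",
            DeprecationWarning,
            stacklevel=2,
        )
```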
MatthewZMSU authored May 29, 2024
1 parent 769c8f4 commit b23cf9a
Showing 17 changed files with 711 additions and 529 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/python-test.yml
@@ -8,10 +8,12 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: "3.7.x" # Min Python version (No 3.6 version)
+          - python-version: "3.7.x" # Min Python version (No 3.6 version in GitHub repository)
           - python-version: "3.8.x"
           - python-version: "3.9.x"
           - python-version: "3.10.x"
+          - python-version: "3.11.x"
+          - python-version: "3.12.x"
           - python-version: "3.x" # Last Python version
     steps:
       - uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion README.md
@@ -46,7 +46,7 @@ class MySpider(scrapy.Spider):
 ## Advanced usage

 `PuppeteerRequest`'s first argument is a browser action.
-Avalable actions are defined in `scrapypuppeteer.actions` module as subclasses of `PuppeteerServiceAction`.
+Available actions are defined in `scrapypuppeteer.actions` module as subclasses of `PuppeteerServiceAction`.
 Passing a URL into request is a shortcut for `GoTo(url)` action.

 Here is the list of available actions:
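
The action list itself is collapsed in this diff view. For orientation, here is a minimal sketch of the API the paragraph above describes; the import paths follow the README and the example spiders in this commit, while the spider name and URLs are purely illustrative:

```python
import scrapy

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        # Passing a URL is a shortcut for the GoTo(url) action...
        yield PuppeteerRequest("https://example.com", callback=self.parse)

    def parse(self, response, **kwargs):
        # ...and is equivalent to passing the action object explicitly.
        yield response.follow(GoTo("https://example.com/page2"), callback=self.parse)
```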
10 changes: 5 additions & 5 deletions examples/settings.py
@@ -1,12 +1,12 @@
-BOT_NAME = 'scrapypuppeteer'
+BOT_NAME = "scrapypuppeteer"

-SPIDER_MODULES = ['examples.spiders']
-NEWSPIDER_MODULE = 'examples.spiders'
+SPIDER_MODULES = ["examples.spiders"]
+NEWSPIDER_MODULE = "examples.spiders"

 CONCURRENT_REQUESTS = 1

 DOWNLOADER_MIDDLEWARES = {
-    'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
+    "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042
 }

-PUPPETEER_SERVICE_URL = 'http://localhost:3000'
+PUPPETEER_SERVICE_URL = "http://localhost:3000"
47 changes: 27 additions & 20 deletions examples/spiders/auto_recaptcha.py
@@ -14,37 +14,44 @@ class AutoRecaptchaSpider(scrapy.Spider):
     start_urls = ["https://www.google.com/recaptcha/api2/demo"]

     custom_settings = {
-        'DOWNLOADER_MIDDLEWARES': {
-            'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1041,
-            'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
+        "DOWNLOADER_MIDDLEWARES": {
+            "scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware": 1041,
+            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
         },
-        'PUPPETEER_INCLUDE_META': True,
-
-        'RECAPTCHA_ACTIVATION': True,
-        'RECAPTCHA_SOLVING': True,
-        'RECAPTCHA_SUBMIT_SELECTORS': {
-            'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit',
-        }
+        "PUPPETEER_INCLUDE_META": True,
+        "RECAPTCHA_ACTIVATION": True,
+        "RECAPTCHA_SOLVING": True,
+        "RECAPTCHA_SUBMIT_SELECTORS": {
+            "www.google.com/recaptcha/api2/demo": "#recaptcha-demo-submit",
+        },
     }

     def start_requests(self):
         for url in self.start_urls:
             action = GoTo(url=url)
-            yield PuppeteerRequest(action=action, callback=self.parse_html, errback=self.error, close_page=False)
+            yield PuppeteerRequest(
+                action=action,
+                callback=self.parse_html,
+                errback=self.error,
+                close_page=False,
+            )

     def parse_html(self, response: PuppeteerResponse, **kwargs):
-        with open(f"recaptcha_page.html", 'wb') as f:
+        with open(f"recaptcha_page.html", "wb") as f:
             f.write(response.body)
-        action = Screenshot(options={
-            'full_page': True,
-        })
-        yield response.follow(action,
-                              callback=self.make_screenshot,
-                              errback=self.error,
-                              close_page=True)
+        action = Screenshot(
+            options={
+                "full_page": True,
+            }
+        )
+        yield response.follow(
+            action, callback=self.make_screenshot, errback=self.error, close_page=True
+        )

     def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs):
-        data = response.screenshot  # Note that data is string containing bytes, don't forget to decode them!
+        data = (
+            response.screenshot
+        )  # Note that data is string containing bytes, don't forget to decode them!
         with open("imageToSave.png", "wb") as fh:
             fh.write(base64.b64decode(data))
44 changes: 31 additions & 13 deletions examples/spiders/manual_recaptcha.py
@@ -16,29 +16,47 @@ class ManualRecaptchaSpider(scrapy.Spider):
     def start_requests(self):
         for url in self.start_urls:
             action = GoTo(url=url)
-            yield PuppeteerRequest(action=action, callback=self.solve_recaptcha, errback=self.error, close_page=False)
+            yield PuppeteerRequest(
+                action=action,
+                callback=self.solve_recaptcha,
+                errback=self.error,
+                close_page=False,
+            )

     def solve_recaptcha(self, response: PuppeteerResponse, **kwargs):
         action = RecaptchaSolver()
-        yield response.follow(action=action, callback=self.submit_recaptcha, errback=self.error, close_page=False)
+        yield response.follow(
+            action=action,
+            callback=self.submit_recaptcha,
+            errback=self.error,
+            close_page=False,
+        )

     def submit_recaptcha(self, response, **kwargs):
-        action = Click('#recaptcha-demo-submit')
-        yield response.follow(action=action, callback=self.parse_html, errback=self.error, close_page=False)
+        action = Click("#recaptcha-demo-submit")
+        yield response.follow(
+            action=action,
+            callback=self.parse_html,
+            errback=self.error,
+            close_page=False,
+        )

     def parse_html(self, response: PuppeteerResponse, **kwargs):
-        with open(f"recaptcha_page.html", 'wb') as f:
+        with open(f"recaptcha_page.html", "wb") as f:
             f.write(response.body)
-        action = Screenshot(options={
-            'full_page': True,
-        })
-        yield response.follow(action,
-                              callback=self.make_screenshot,
-                              errback=self.error,
-                              close_page=True)
+        action = Screenshot(
+            options={
+                "full_page": True,
+            }
+        )
+        yield response.follow(
+            action, callback=self.make_screenshot, errback=self.error, close_page=True
+        )

     def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs):
-        data = response.screenshot  # Note that data is string containing bytes, don't forget to decode them!
+        data = (
+            response.screenshot
+        )  # Note that data is string containing bytes, don't forget to decode them!
         with open("imageToSave.png", "wb") as fh:
             fh.write(base64.b64decode(data))
12 changes: 6 additions & 6 deletions examples/spiders/meduza.py
@@ -4,18 +4,18 @@


 class MeduzaSpider(scrapy.Spider):
-    name = 'meduza'
+    name = "meduza"

     def start_requests(self):
-        yield PuppeteerRequest('https://meduza.io', callback=self.parse_main_page)
+        yield PuppeteerRequest("https://meduza.io", callback=self.parse_main_page)

     def parse_main_page(self, response: PuppeteerHtmlResponse):
-        for article_url in response.css('a.Link-isInBlockTitle::attr(href)').getall():
+        for article_url in response.css("a.Link-isInBlockTitle::attr(href)").getall():
             yield response.follow(article_url, callback=self.parse_article)

     def parse_article(self, response: PuppeteerHtmlResponse):
         yield {
-            'url': response.url,
-            'title': response.css('h1::text').get(),
-            'text': '\n'.join(response.css('p.SimpleBlock-p::text').getall())
+            "url": response.url,
+            "title": response.css("h1::text").get(),
+            "text": "\n".join(response.css("p.SimpleBlock-p::text").getall()),
         }
110 changes: 61 additions & 49 deletions examples/spiders/webscraperio.py
@@ -8,103 +8,115 @@ class EcommerceSiteSpider(scrapy.Spider):
     @staticmethod
     def extract_items(list_page_response):
-        for item_selector in list_page_response.css('div.row div.thumbnail'):
+        for item_selector in list_page_response.css("div.row div.thumbnail"):
             yield {
-                'link': item_selector.css('a.title::attr(href)').get(),
-                'title': item_selector.css('a.title::attr(title)').get(),
-                'price': item_selector.css('h4.price::text').get(),
-                'description': item_selector.css('p.description::text').get(),
-                'rating': len(item_selector.css('span.glyphicon-star')),
-                'reviews_count': int(item_selector
-                                     .css('.ratings p.pull-right::text')
-                                     .re_first(r'\d+'))
+                "link": item_selector.css("a.title::attr(href)").get(),
+                "title": item_selector.css("a.title::attr(title)").get(),
+                "price": item_selector.css("h4.price::text").get(),
+                "description": item_selector.css("p.description::text").get(),
+                "rating": len(item_selector.css("span.glyphicon-star")),
+                "reviews_count": int(
+                    item_selector.css(".ratings p.pull-right::text").re_first(r"\d+")
+                ),
             }

     @staticmethod
     def extract_item(detail_page_response):
         yield {
-            'link': detail_page_response.url,
-            'title': detail_page_response.css('h4.price + h4::text').get(),
-            'price': detail_page_response.css('h4.price::text').get(),
-            'description': detail_page_response.css('p.description::text').get(),
-            'rating': len(detail_page_response.css('span.glyphicon-star')),
-            'reviews_count': int(detail_page_response
-                                 .css('.ratings::text')
-                                 .re_first('\d+'))
+            "link": detail_page_response.url,
+            "title": detail_page_response.css("h4.price + h4::text").get(),
+            "price": detail_page_response.css("h4.price::text").get(),
+            "description": detail_page_response.css("p.description::text").get(),
+            "rating": len(detail_page_response.css("span.glyphicon-star")),
+            "reviews_count": int(
+                detail_page_response.css(".ratings::text").re_first("\d+")
+            ),
         }


 class AjaxPaginationSpider(EcommerceSiteSpider):
-    name = 'e-commerce-ajax'
+    name = "e-commerce-ajax"

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.start_url = 'https://webscraper.io/test-sites/e-commerce/ajax/computers/laptops'
+        self.start_url = (
+            "https://webscraper.io/test-sites/e-commerce/ajax/computers/laptops"
+        )
         self.next_page_ix = 1

     def start_requests(self):
-        yield PuppeteerRequest(GoTo(self.start_url),
-                               close_page=False,
-                               callback=self.process_list_page)
+        yield PuppeteerRequest(
+            GoTo(self.start_url), close_page=False, callback=self.process_list_page
+        )

     def process_list_page(self, response):
         yield from self.extract_items(response)
         self.next_page_ix += 1
         next_page_selector = f'button[data-id="{self.next_page_ix}"]'
         if response.css(next_page_selector):
-            yield response.follow(Click(next_page_selector,
-                                        wait_options={'selectorOrTimeout': 3000}),
-                                  close_page=False,
-                                  callback=self.process_list_page)
+            yield response.follow(
+                Click(next_page_selector, wait_options={"selectorOrTimeout": 3000}),
+                close_page=False,
+                callback=self.process_list_page,
+            )


 class MoreSpider(EcommerceSiteSpider):
-    name = 'e-commerce-more'
+    name = "e-commerce-more"

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.start_url = 'https://webscraper.io/test-sites/e-commerce/more/computers/laptops'
+        self.start_url = (
+            "https://webscraper.io/test-sites/e-commerce/more/computers/laptops"
+        )
         self.seen_item_links = set()

     def start_requests(self):
-        yield PuppeteerRequest(GoTo(self.start_url, wait_options={'selectorOrTimeout': 10000}),
-                               close_page=False,
-                               callback=self.process_list_page)
+        yield PuppeteerRequest(
+            GoTo(self.start_url, wait_options={"selectorOrTimeout": 10000}),
+            close_page=False,
+            callback=self.process_list_page,
+        )

     def process_list_page(self, response):
         for item in self.extract_items(response):
-            if item['link'] not in self.seen_item_links:
-                self.seen_item_links.add(item['link'])
+            if item["link"] not in self.seen_item_links:
+                self.seen_item_links.add(item["link"])
                 yield item
-        more_selector = '.ecomerce-items-scroll-more'
+        more_selector = ".ecomerce-items-scroll-more"
         more_button = response.css(more_selector)
-        if 'style' not in more_button.attrib:
-            yield response.follow(Click(more_selector,
-                                        wait_options={'selectorOrTimeout': 1000}),
-                                  close_page=False,
-                                  callback=self.process_list_page)
+        if "style" not in more_button.attrib:
+            yield response.follow(
+                Click(more_selector, wait_options={"selectorOrTimeout": 1000}),
+                close_page=False,
+                callback=self.process_list_page,
+            )


 class ScrollSpider(EcommerceSiteSpider):
-    name = 'e-commerce-scroll'
+    name = "e-commerce-scroll"

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.start_url = 'https://webscraper.io/test-sites/e-commerce/scroll/computers/laptops'
+        self.start_url = (
+            "https://webscraper.io/test-sites/e-commerce/scroll/computers/laptops"
+        )
         self.seen_item_links = set()

     def start_requests(self):
-        yield PuppeteerRequest(GoTo(self.start_url),
-                               close_page=False,
-                               callback=self.process_list_page)
+        yield PuppeteerRequest(
+            GoTo(self.start_url), close_page=False, callback=self.process_list_page
+        )

     def process_list_page(self, response):
         items = self.extract_items(response)
-        new_items = [i for i in items if i['link'] not in self.seen_item_links]
+        new_items = [i for i in items if i["link"] not in self.seen_item_links]
         if new_items:
             for item in new_items:
-                self.seen_item_links.add(item['link'])
+                self.seen_item_links.add(item["link"])
                 yield item
-            yield response.follow(Scroll(wait_options={'selectorOrTimeout': 1000}),
-                                  close_page=False,
-                                  callback=self.process_list_page)
+            yield response.follow(
+                Scroll(wait_options={"selectorOrTimeout": 1000}),
+                close_page=False,
+                callback=self.process_list_page,
+            )