Customize which headers will be sent to remote websites (#12)
* customize which headers will be sent to remote websites

* update readme
mxsnq authored May 2, 2023
1 parent 416aa1c commit e7f7c48
Showing 4 changed files with 50 additions and 10 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -93,6 +93,11 @@ You may also specify request context and page ids directly.

Once your spider is closed, the middleware will take care of closing all used browser contexts.

You may customize which of a `PuppeteerRequest`'s headers the service sends to the remote website,
either per request via the `include_headers` request attribute or globally with the
`PUPPETEER_INCLUDE_HEADERS` setting. Available values are `True` (all headers), `False` (no headers)
or a list of header names. By default, only cookies are sent (see the usage sketch after this file's diff).

## TODO

- [x] skeleton that could handle goto, click, scroll, and actions
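
A minimal usage sketch of the option described in the README change above (the spider method, URL and header names are illustrative; `PuppeteerRequest` and `GoTo` are the package's own classes):

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo

# settings.py — global default: True, False or a list of header names
PUPPETEER_INCLUDE_HEADERS = ['Cookie', 'Referer']

# inside a Spider subclass — per-request override of the global setting
def start_requests(self):
    yield PuppeteerRequest(GoTo('https://example.com'),
                           include_headers=False,  # send no headers for this request
                           callback=self.parse)
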
43 changes: 36 additions & 7 deletions scrapypuppeteer/middleware.py
@@ -1,5 +1,6 @@
import json
from collections import defaultdict
from typing import List, Union
from urllib.parse import urljoin, urlencode

from scrapy import Request, signals
@@ -19,20 +20,44 @@ class PuppeteerServiceDownloaderMiddleware:
that spider uses and performs cleanup request to service once spider
is closed.
Puppeteer service URL may be set via PUPPETEER_SERVICE_URL setting.
Settings:
PUPPETEER_SERVICE_URL (str)
Service URL, e.g. 'http://localhost:3000'
PUPPETEER_INCLUDE_HEADERS (bool|list[str])
Determines which request headers will be sent to the remote site by the puppeteer service.
Either True (all headers), False (no headers) or a list of header names.
May be overridden per request.
By default, only cookies are sent.
"""

def __init__(self, crawler: Crawler, service_url: str):
SERVICE_URL_SETTING = 'PUPPETEER_SERVICE_URL'
INCLUDE_HEADERS_SETTING = 'PUPPETEER_INCLUDE_HEADERS'
DEFAULT_INCLUDE_HEADERS = ['Cookie'] # TODO send them separately

def __init__(self,
crawler: Crawler,
service_url: str,
include_headers: Union[bool, List[str]]):
self.service_base_url = service_url
self.include_headers = include_headers
self.crawler = crawler
self.used_contexts = defaultdict(set)

@classmethod
def from_crawler(cls, crawler):
service_url = crawler.settings.get('PUPPETEER_SERVICE_URL')
service_url = crawler.settings.get(cls.SERVICE_URL_SETTING)
if service_url is None:
raise ValueError('Puppeteer service URL must be provided')
middleware = cls(crawler, service_url)
if cls.INCLUDE_HEADERS_SETTING in crawler.settings:
try:
include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING)
except ValueError:
include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING)
else:
include_headers = cls.DEFAULT_INCLUDE_HEADERS
middleware = cls(crawler, service_url, include_headers)
crawler.signals.connect(middleware.close_used_contexts,
signal=signals.spider_closed)
return middleware
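
For reference, the getbool()/getlist() fallback above means the setting may be spelled either way; a hypothetical settings.py fragment:

# settings.py — both forms reach the middleware through from_crawler()
PUPPETEER_INCLUDE_HEADERS = True                     # parsed by getbool()
# PUPPETEER_INCLUDE_HEADERS = ['Cookie', 'Referer']  # getbool() raises ValueError, getlist() takes over
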
@@ -76,8 +101,7 @@ def _encode_service_params(request):
service_params['closePage'] = 1
return urlencode(service_params)

@staticmethod
def _serialize_body(action, request):
def _serialize_body(self, action, request):
payload = action.payload()
if action.content_type == 'application/json':
if isinstance(payload, dict):
@@ -86,7 +110,12 @@ def _serialize_body(action, request):
proxy = request.meta.get('proxy')
if proxy:
payload['proxy'] = proxy
payload['headers'] = request.headers.to_unicode_dict()
include_headers = self.include_headers if request.include_headers is None else request.include_headers
if include_headers:
headers = request.headers.to_unicode_dict()
if isinstance(include_headers, list):
headers = {h: headers[h] for h in include_headers if h in headers}
payload['headers'] = headers
return json.dumps(payload)
return str(payload)

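
What each include_headers value does to the serialized payload, as a standalone sketch (the function name and sample data are illustrative, not part of the library):

def filter_headers(headers, include_headers):
    # Mirrors _serialize_body: True keeps everything, a list keeps only
    # the named headers; the empty dict stands in for "no headers sent".
    if include_headers is True:
        return dict(headers)
    if not include_headers:
        return {}
    return {h: headers[h] for h in include_headers if h in headers}

headers = {'Cookie': 'sid=1', 'User-Agent': 'scrapy', 'Referer': 'https://example.com'}
assert filter_headers(headers, True) == headers
assert filter_headers(headers, False) == {}
assert filter_headers(headers, ['Cookie']) == {'Cookie': 'sid=1'}
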
10 changes: 8 additions & 2 deletions scrapypuppeteer/request.py
@@ -1,4 +1,4 @@
from typing import Union
from typing import List, Union

from scrapy.http import Request

@@ -15,6 +15,7 @@ def __init__(self,
context_id: str = None,
page_id: str = None,
close_page: bool = True,
include_headers: Union[bool, List[str]] = None,
**kwargs):
"""
@@ -26,6 +27,10 @@
:param close_page: whether to close page after request completion;
set to False, if you want to continue interacting
with the page
:param include_headers: determines which headers will be sent to the remote
                        site by puppeteer: either True (all headers),
                        False (no headers), a list of header names,
                        or None (default, let the middleware decide)
:param kwargs:
"""
url = kwargs.pop('url', None)
@@ -45,8 +50,9 @@ def __init__(self,
self.context_id = context_id
self.page_id = page_id
self.close_page = close_page
self.include_headers = include_headers

def replace(self, *args, **kwargs):
for x in ['action', 'context_id', 'page_id', 'close_page']:
for x in ['action', 'context_id', 'page_id', 'close_page', 'include_headers']:
kwargs.setdefault(x, getattr(self, x))
return super().replace(*args, **kwargs)
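
Because replace() now carries include_headers, copies made for retries or redirects keep the per-request override; a small sketch (URL illustrative):

from scrapypuppeteer import PuppeteerRequest
from scrapypuppeteer.actions import GoTo

req = PuppeteerRequest(GoTo('https://example.com'), include_headers=['Cookie'])
copy = req.replace(dont_filter=True)
assert copy.include_headers == ['Cookie']  # preserved by the extended replace()
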
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@

setup(
name='scrapy-puppeteer-client',
version='0.0.6',
version='0.0.7',
description='A library to use Puppeteer-managed browser in Scrapy spiders',
long_description=long_description,
long_description_content_type="text/markdown",
