From 7cc483b7d73fdb5dc342a2a85d3571d342aae51c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 29 Dec 2023 14:53:53 +0100
Subject: [PATCH 1/9] Document how to handle action failures

---
 docs/index.rst         |   1 +
 docs/usage/actions.rst | 105 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 docs/usage/actions.rst

diff --git a/docs/index.rst b/docs/index.rst
index 3834977f..72db0cb5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -28,6 +28,7 @@ either :ref:`globally ` or :ref:`per request `, or
    usage/retry
    usage/scrapy-poet
    usage/stats
+   usage/actions
    usage/fingerprint
    usage/proxy
 
diff --git a/docs/usage/actions.rst b/docs/usage/actions.rst
new file mode 100644
index 00000000..47d366f0
--- /dev/null
+++ b/docs/usage/actions.rst
@@ -0,0 +1,105 @@
+.. _actions:
+
+======================
+Handling action errors
+======================
+
+Zyte API responses are considered successful :ref:`even if some browser actions
+fail `.
+
+If you wish to retry requests whose response contains actions that failed, you
+can define a custom Scrapy middleware as follows:
+
+.. code-block:: python
+    :caption: myproject/middlewares.py
+
+    from scrapy import Request, Spider
+    from scrapy.http import Response
+    from scrapy.downloadermiddlewares.retry import get_retry_request
+    from scrapy.settings import BaseSettings
+    from scrapy_zyte_api.responses import ZyteAPIResponse, ZyteAPITextResponse
+
+    class ZyteAPIFailedActionsRetryMiddleware:
+
+        def __init__(self, settings: BaseSettings):
+            if not settings.getbool("RETRY_ENABLED"):
+                raise NotConfigured
+            self.max_retry_times = settings.getint("RETRY_TIMES")
+            self.priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
+
+        def process_response(
+            self, request: Request, response: Response, spider: Spider
+        ) -> Union[Request, Response]:
+            if not isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)):
+                return response
+            if request.meta.get("dont_retry", False):
+                return response
+            if any("error" in action for action in response.raw_api_response["actions"]):
+                reason = "An action failed"
+                new_request = self._retry(request, reason, spider)
+                if new_request:
+                    return new_request
+                else:
+                    return response
+                    # Note: If you prefer requests that exceed all retries to
+                    # be dropped, raise scrapy.exceptions.IgnoreRequest here,
+                    # instead of returning the response.
+            return response
+
+        def _retry(
+            self,
+            request: Request,
+            reason: Union[str, Exception, Type[Exception]],
+            spider: Spider,
+        ) -> Optional[Request]:
+            max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
+            priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
+            return get_retry_request(
+                request,
+                reason=reason,
+                spider=spider,
+                max_retry_times=max_retry_times,
+                priority_adjust=priority_adjust,
+            )
+
+And enable it in your settings:
+
+.. code-block:: python
+    :caption: myproject/settings.py
+
+
+    DOWNLOADER_MIDDLEWARES = {
+        "myproject.middlewares.ZyteAPIFailedActionsRetryMiddleware": 525,
+    }
+
+If you use
+:class:`~scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware`, you might
+want to use a custom :setting:`HTTPCACHE_POLICY ` to
+prevent responses with failed actions (i.e. after exceeding retries) to be
+cached:
+
+.. code-block:: python
+    :caption: myproject/extensions.py
+
+    from scrapy import Request
+    from scrapy.extensions.httpcache import DummyPolicy
+    from scrapy.http import Response
+    from scrapy_zyte_api.responses import ZyteAPIResponse, ZyteAPITextResponse
+
+    class ZyteAPIFailedActionsPolicy(DummyPolicy):
+
+        def should_cache_response(self, response: Response, request: Request):
+            if (
+                isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse))
+                and any("error" in action for action in response.raw_api_response["actions"])
+            ):
+                return False
+            return super().should_cache_response(response, request)
+
+And enable it in your settings:
+
+.. code-block:: python
+    :caption: myproject/settings.py
+
+
+    HTTPCACHE_POLICY = "myproject.extensions.ZyteAPIFailedActionsPolicy"

From 47197ee4265108a7c9dc86a59ab558c717c0fa76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 29 Dec 2023 14:55:17 +0100
Subject: [PATCH 2/9] Remove an empty line

---
 docs/usage/actions.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/usage/actions.rst b/docs/usage/actions.rst
index 47d366f0..e779adcb 100644
--- a/docs/usage/actions.rst
+++ b/docs/usage/actions.rst
@@ -101,5 +101,4 @@ And enable it in your settings:
 .. code-block:: python
     :caption: myproject/settings.py
 
-
     HTTPCACHE_POLICY = "myproject.extensions.ZyteAPIFailedActionsPolicy"

From 00b2c2a0232d9033a512804006c2949e81c2ed1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Fri, 29 Dec 2023 15:26:56 +0100
Subject: [PATCH 3/9] Fix the ZyteAPIFailedActionsRetryMiddleware example code

---
 docs/usage/actions.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docs/usage/actions.rst b/docs/usage/actions.rst
index e779adcb..66374fe9 100644
--- a/docs/usage/actions.rst
+++ b/docs/usage/actions.rst
@@ -13,7 +13,10 @@ can define a custom Scrapy middleware as follows:
 .. code-block:: python
     :caption: myproject/middlewares.py
 
+    from typing import Optional, Type, Union
+
     from scrapy import Request, Spider
+    from scrapy.crawler import Crawler
     from scrapy.http import Response
     from scrapy.downloadermiddlewares.retry import get_retry_request
     from scrapy.settings import BaseSettings
@@ -21,6 +24,10 @@ can define a custom Scrapy middleware as follows:
 
     class ZyteAPIFailedActionsRetryMiddleware:
 
+        @classmethod
+        def from_crawler(cls, crawler: Crawler):
+            return cls(crawler.settings)
+
         def __init__(self, settings: BaseSettings):
             if not settings.getbool("RETRY_ENABLED"):
                 raise NotConfigured

From deedfff9eae1eeaf957bc7619f3059e6ce559370 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 17 Jan 2024 15:07:00 +0100
Subject: [PATCH 4/9] Implement retries on action error

---
 docs/reference/settings.rst     |  38 ++++
 docs/usage/actions.rst          |  80 +-------
 pyproject.toml                  |   3 +
 scrapy_zyte_api/_middlewares.py |  72 ++++++-
 scrapy_zyte_api/exceptions.py   |  17 ++
 tests/test_middlewares.py       | 333 ++++++++++++++++++++++++++++++++
 6 files changed, 469 insertions(+), 74 deletions(-)
 create mode 100644 scrapy_zyte_api/exceptions.py

diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst
index e24c2a16..a472c4b3 100644
--- a/docs/reference/settings.rst
+++ b/docs/reference/settings.rst
@@ -6,6 +6,44 @@ Settings
 
 :ref:`Settings ` for scrapy-zyte-api.
 
+.. _ZYTE_API_ACTION_ERROR_RETRY_ENABLED:
+
+ZYTE_API_ACTION_ERROR_RETRY_ENABLED
+===================================
+
+Default: ``True``
+
+Enables retries of Zyte API requests if responses contain an action error.
+
+Maximum retries and priority adjustment are handled with the same settings and
+request metadata keywords as regular retries in Scrapy, see
+:setting:`RETRY_TIMES ` and
+:setting:`RETRY_PRIORITY_ADJUST `.
+
+See also :ref:`ZYTE_API_ACTION_ERROR_HANDLING`.
+
+.. _ZYTE_API_ACTION_ERROR_HANDLING:
+
+ZYTE_API_ACTION_ERROR_HANDLING
+==============================
+
+Default: ``"pass"``
+
+Determines how to handle Zyte API responses that contain an action error:
+
+- ``"pass"``: Responses are treated as valid responses, i.e. they will reach
+  your spider callback.
+
+- ``"ignore"``: Responses are dropped, i.e. they will not reach your spider.
+
+- ``"err"``: :class:`~scrapy_zyte_api.exceptions.ActionError` is raised. It
+  will be processed by downloader middlewares as an exception, and will reach
+  your spider errback if you set one.
+
+.. autoexception:: scrapy_zyte_api.exceptions.ActionError
+    :members:
+
+
 .. _ZYTE_API_AUTOMAP_PARAMS:
 
 ZYTE_API_AUTOMAP_PARAMS
diff --git a/docs/usage/actions.rst b/docs/usage/actions.rst
index 66374fe9..e4d5646d 100644
--- a/docs/usage/actions.rst
+++ b/docs/usage/actions.rst
@@ -4,80 +4,16 @@
 Handling action errors
 ======================
 
-Zyte API responses are considered successful :ref:`even if some browser actions
-fail `.
+Even though Zyte API considers a response successful :ref:`even if a browser
+action fails `, scrapy-zyte-api retries such
+responses by default. See :ref:`ZYTE_API_ACTION_ERROR_RETRY_ENABLED`.
 
-If you wish to retry requests whose response contains actions that failed, you
-can define a custom Scrapy middleware as follows:
+You can also use :ref:`ZYTE_API_ACTION_ERROR_HANDLING` to determine how such
+responses are handled when they are not retried or when retries are exceeded:
+treated as a success (default), ignored, or treated as an error.
 
-.. code-block:: python
-    :caption: myproject/middlewares.py
-
-    from typing import Optional, Type, Union
-
-    from scrapy import Request, Spider
-    from scrapy.crawler import Crawler
-    from scrapy.http import Response
-    from scrapy.downloadermiddlewares.retry import get_retry_request
-    from scrapy.settings import BaseSettings
-    from scrapy_zyte_api.responses import ZyteAPIResponse, ZyteAPITextResponse
-
-    class ZyteAPIFailedActionsRetryMiddleware:
-
-        @classmethod
-        def from_crawler(cls, crawler: Crawler):
-            return cls(crawler.settings)
-
-        def __init__(self, settings: BaseSettings):
-            if not settings.getbool("RETRY_ENABLED"):
-                raise NotConfigured
-            self.max_retry_times = settings.getint("RETRY_TIMES")
-            self.priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
-
-        def process_response(
-            self, request: Request, response: Response, spider: Spider
-        ) -> Union[Request, Response]:
-            if not isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)):
-                return response
-            if request.meta.get("dont_retry", False):
-                return response
-            if any("error" in action for action in response.raw_api_response["actions"]):
-                reason = "An action failed"
-                new_request = self._retry(request, reason, spider)
-                if new_request:
-                    return new_request
-                else:
-                    return response
-                    # Note: If you prefer requests that exceed all retries to
-                    # be dropped, raise scrapy.exceptions.IgnoreRequest here,
-                    # instead of returning the response.
-            return response
-
-        def _retry(
-            self,
-            request: Request,
-            reason: Union[str, Exception, Type[Exception]],
-            spider: Spider,
-        ) -> Optional[Request]:
-            max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
-            priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
-            return get_retry_request(
-                request,
-                reason=reason,
-                spider=spider,
-                max_retry_times=max_retry_times,
-                priority_adjust=priority_adjust,
-            )
-
-And enable it in your settings:
-
-.. code-block:: python
-    :caption: myproject/settings.py
-
-
-    DOWNLOADER_MIDDLEWARES = {
-        "myproject.middlewares.ZyteAPIFailedActionsRetryMiddleware": 525,
-    }
+Action error caching
+====================
 
 If you use
 :class:`~scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware`, you might
diff --git a/pyproject.toml b/pyproject.toml
index 75ae5cb1..b30e3711 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,9 @@ files = [
 ]
 
 [tool.pytest.ini_options]
+filterwarnings = [
+    "ignore:::twisted",
+]
 junit_family = "xunit2"
 testpaths = [
     "scrapy_zyte_api/",
diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py
index 3fb5f451..5af5d47c 100644
--- a/scrapy_zyte_api/_middlewares.py
+++ b/scrapy_zyte_api/_middlewares.py
@@ -1,11 +1,15 @@
 import logging
-from typing import cast
+from typing import Optional, Union, cast
 
-from scrapy import Request, signals
+from scrapy import Request, Spider, signals
+from scrapy.downloadermiddlewares.retry import get_retry_request
 from scrapy.exceptions import IgnoreRequest
+from scrapy.http import Response
 from zyte_api.aio.errors import RequestError
 
 from ._params import _ParamParser
+from .exceptions import ActionError
+from .responses import ZyteAPIResponse, ZyteAPITextResponse
 
 logger = logging.getLogger(__name__)
 
@@ -43,6 +47,13 @@ def __init__(self, crawler) -> None:
         self._forbidden_domain_start_request_count = 0
         self._total_start_request_count = 0
 
+        self._retry_action_errors = crawler.settings.getbool(
+            "ZYTE_API_ACTION_ERROR_RETRY_ENABLED", True
+        )
+        self._max_retry_times = crawler.settings.getint("RETRY_TIMES")
+        self._priority_adjust = crawler.settings.getint("RETRY_PRIORITY_ADJUST")
+        self._load_action_error_handling()
+
         self._max_requests = crawler.settings.getint("ZYTE_API_MAX_REQUESTS")
         if self._max_requests:
             logger.info(
@@ -56,6 +67,18 @@ def __init__(self, crawler) -> None:
             self._start_requests_processed, signal=_start_requests_processed
         )
 
+    def _load_action_error_handling(self):
+        value = self._crawler.settings.get("ZYTE_API_ACTION_ERROR_HANDLING", "pass")
+        if value in ("pass", "ignore", "err"):
+            self._action_error_handling = value
+        else:
+            fallback_value = "pass"
+            logger.error(
+                f"Setting ZYTE_API_ACTION_ERROR_HANDLING got an unexpected "
+                f"value: {value!r}. Falling back to {fallback_value!r}."
+            )
+            self._action_error_handling = fallback_value
+
     def _get_spm_mw(self):
         spm_mw_classes = []
 
@@ -164,6 +187,51 @@ def _maybe_close(self):
                 self._crawler.spider, "failed_forbidden_domain"
             )
 
+    def _handle_action_error(self, response):
+        if self._action_error_handling == "pass":
+            return response
+        elif self._action_error_handling == "ignore":
+            raise IgnoreRequest
+        else:
+            assert self._action_error_handling == "err"
+            raise ActionError(response)
+
+    def process_response(
+        self, request: Request, response: Response, spider: Spider
+    ) -> Union[Request, Response]:
+        if not isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)):
+            return response
+
+        action_error = any(
+            "error" in action for action in response.raw_api_response["actions"]
+        )
+        if not action_error:
+            return response
+
+        if not self._retry_action_errors or request.meta.get("dont_retry", False):
+            return self._handle_action_error(response)
+
+        return self._retry(
+            request, reason="action-error", spider=spider
+        ) or self._handle_action_error(response)
+
+    def _retry(
+        self,
+        request: Request,
+        *,
+        reason: str,
+        spider: Spider,
+    ) -> Optional[Request]:
+        max_retry_times = request.meta.get("max_retry_times", self._max_retry_times)
+        priority_adjust = request.meta.get("priority_adjust", self._priority_adjust)
+        return get_retry_request(
+            request,
+            reason=reason,
+            spider=spider,
+            max_retry_times=max_retry_times,
+            priority_adjust=priority_adjust,
+        )
+
 
 class ScrapyZyteAPISpiderMiddleware(_BaseMiddleware):
     def __init__(self, crawler):
diff --git a/scrapy_zyte_api/exceptions.py b/scrapy_zyte_api/exceptions.py
new file mode 100644
index 00000000..ae14da97
--- /dev/null
+++ b/scrapy_zyte_api/exceptions.py
@@ -0,0 +1,17 @@
+from typing import Union
+
+from .responses import ZyteAPIResponse, ZyteAPITextResponse
+
+
+class ActionError(ValueError):
+    """Exception raised when a Zyte API response contains an action error."""
+
+    def __init__(self, response, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        #: Offending Zyte API response.
+        #:
+        #: You can inspect the outcome of actions in the ``"actions"`` key of
+        #: :attr:`response.raw_api_response
+        #: `.
+        self.response: Union[ZyteAPIResponse, ZyteAPITextResponse] = response
diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py
index 9011a2a0..560119cc 100644
--- a/tests/test_middlewares.py
+++ b/tests/test_middlewares.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, cast
 from unittest import SkipTest
 
@@ -5,6 +6,7 @@
 from packaging.version import Version
 from pytest_twisted import ensureDeferred
 from scrapy import Request, Spider
+from scrapy.exceptions import IgnoreRequest
 from scrapy.http.response import Response
 from scrapy.item import Item
 from scrapy.utils.misc import create_instance
@@ -14,6 +16,8 @@
     ScrapyZyteAPIDownloaderMiddleware,
     ScrapyZyteAPISpiderMiddleware,
 )
+from scrapy_zyte_api.exceptions import ActionError
+from scrapy_zyte_api.responses import ZyteAPIResponse
 
 from . import SETTINGS
 from .mockserver import DelayedResource, MockServer
@@ -389,3 +393,332 @@ class CrawleraSpider(Spider):
             attribute,
             conflict,
         )
+
+
+@pytest.mark.parametrize(
+    "settings,meta,enabled",
+    [
+        # ZYTE_API_ACTION_ERROR_RETRY_ENABLED enables, RETRY_ENABLED has no
+        # effect.
+        *(
+            (
+                {
+                    "RETRY_ENABLED": scrapy,
+                    "ZYTE_API_ACTION_ERROR_RETRY_ENABLED": zyte_api,
+                },
+                {},
+                zyte_api,
+            )
+            for zyte_api in (True, False)
+            for scrapy in (True, False)
+        ),
+        *(
+            (
+                {
+                    "RETRY_ENABLED": scrapy,
+                },
+                {},
+                True,
+            )
+            for scrapy in (True, False)
+        ),
+        # dont_retry=True overrides.
+        *(
+            (
+                {"ZYTE_API_ACTION_ERROR_RETRY_ENABLED": zyte_api},
+                {"dont_retry": dont_retry},
+                zyte_api and not dont_retry,
+            )
+            for zyte_api in (True, False)
+            for dont_retry in (True, False)
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_retry_enabled(settings, meta, enabled):
+    crawler = get_crawler(settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com", meta=meta)
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+    result = middleware.process_response(request, response, crawler.spider)
+    if enabled:
+        assert isinstance(result, Request)
+        assert result.meta["retry_times"] == 1
+    else:
+        assert result is response
+
+    await crawler.stop()
+
+
+@pytest.mark.parametrize(
+    "settings,meta,max_retries",
+    [
+        (
+            {"RETRY_TIMES": 1},
+            {},
+            1,
+        ),
+        (
+            {},
+            {"max_retry_times": 1},
+            1,
+        ),
+        (
+            {"RETRY_TIMES": 1},
+            {"max_retry_times": 2},
+            2,
+        ),
+        (
+            {"RETRY_TIMES": 2},
+            {"max_retry_times": 1},
+            1,
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_retry_times(settings, meta, max_retries):
+    crawler = get_crawler(settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request(
+        "https://example.com", meta={**meta, "retry_times": max_retries - 1}
+    )
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    request2 = middleware.process_response(request, response, crawler.spider)
+    assert isinstance(request2, Request)
+    assert request2.meta["retry_times"] == max_retries
+
+    result = middleware.process_response(request2, response, crawler.spider)
+    assert result is response
+
+    await crawler.stop()
+
+
+@pytest.mark.parametrize(
+    "settings,meta,priority",
+    [
+        (
+            {"RETRY_PRIORITY_ADJUST": 1},
+            {},
+            1,
+        ),
+        (
+            {},
+            {"priority_adjust": 1},
+            1,
+        ),
+        (
+            {"RETRY_PRIORITY_ADJUST": 1},
+            {"priority_adjust": 2},
+            2,
+        ),
+        (
+            {"RETRY_PRIORITY_ADJUST": 2},
+            {"priority_adjust": 1},
+            1,
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_retry_priority_adjust(settings, meta, priority):
+    crawler = get_crawler(settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com", meta=meta)
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    request2 = middleware.process_response(request, response, crawler.spider)
+    assert isinstance(request2, Request)
+    assert request2.meta["retry_times"] == 1
+    assert request2.priority == priority
+
+    await crawler.stop()
+
+
+@pytest.mark.parametrize(
+    "settings,expected,setup_errors",
+    [
+        (
+            {},
+            Response,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "pass"},
+            Response,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "ignore"},
+            IgnoreRequest,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "err"},
+            ActionError,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "foo"},
+            Response,
+            [
+                (
+                    "Setting ZYTE_API_ACTION_ERROR_HANDLING got an unexpected "
+                    "value: 'foo'. Falling back to 'pass'."
+                )
+            ],
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_handling_no_retries(
+    settings, expected, setup_errors, caplog
+):
+    settings["ZYTE_API_ACTION_ERROR_RETRY_ENABLED"] = False
+    crawler = get_crawler(settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+    if setup_errors:
+        assert caplog.record_tuples == [
+            ("scrapy_zyte_api._middlewares", logging.ERROR, error)
+            for error in setup_errors
+        ]
+    else:
+        assert not caplog.records
+
+    request = Request("https://example.com")
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    try:
+        result = middleware.process_response(request, response, crawler.spider)
+    except (ActionError, IgnoreRequest) as e:
+        result = e
+    assert isinstance(result, expected)
+
+    await crawler.stop()
+
+
+@pytest.mark.parametrize(
+    "settings,expected,setup_errors",
+    [
+        (
+            {},
+            Response,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "pass"},
+            Response,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "ignore"},
+            IgnoreRequest,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "err"},
+            ActionError,
+            [],
+        ),
+        (
+            {"ZYTE_API_ACTION_ERROR_HANDLING": "foo"},
+            Response,
+            [
+                (
+                    "Setting ZYTE_API_ACTION_ERROR_HANDLING got an unexpected "
+                    "value: 'foo'. Falling back to 'pass'."
+                )
+            ],
+        ),
+    ],
+)
+@ensureDeferred
+async def test_action_error_handling_retries(settings, expected, setup_errors, caplog):
+    settings["RETRY_TIMES"] = 1
+    crawler = get_crawler(settings_dict=settings)
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+    if setup_errors:
+        assert caplog.record_tuples == [
+            ("scrapy_zyte_api._middlewares", logging.ERROR, error)
+            for error in setup_errors
+        ]
+    else:
+        assert not caplog.records
+
+    request = Request("https://example.com")
+    raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    request2 = middleware.process_response(request, response, crawler.spider)
+    assert isinstance(request2, Request)
+    assert request2.meta["retry_times"] == 1
+
+    try:
+        result = middleware.process_response(request2, response, crawler.spider)
+    except (ActionError, IgnoreRequest) as e:
+        result = e
+    assert isinstance(result, expected)
+
+    await crawler.stop()
+
+
+@ensureDeferred
+async def test_process_response_non_zyte_api():
+    crawler = get_crawler()
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com")
+    response = Response(request.url)
+    result = middleware.process_response(request, response, crawler.spider)
+    assert result is response
+
+    await crawler.stop()
+
+
+@ensureDeferred
+async def test_process_response_no_action_error():
+    crawler = get_crawler()
+    await crawler.crawl()
+
+    middleware = create_instance(
+        ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler
+    )
+
+    request = Request("https://example.com")
+    raw_api_response = {"url": request.url, "actions": [{"action": "foo"}]}
+    response = ZyteAPIResponse.from_api_response(raw_api_response, request=request)
+
+    result = middleware.process_response(request, response, crawler.spider)
+    assert result is response
+
+    await crawler.stop()
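
Editor's note, not part of the patches: the behavior added above is driven entirely by Scrapy settings. Below is a minimal, illustrative ``settings.py`` sketch; the values are examples rather than the defaults introduced by this patch (which are ``True`` and ``"pass"``), and ``RETRY_TIMES`` / ``RETRY_PRIORITY_ADJUST`` are Scrapy's regular retry settings that the new code reuses.

.. code-block:: python
    :caption: settings.py (illustrative sketch)

    # Retry Zyte API responses whose actions report an error (patch default: True).
    ZYTE_API_ACTION_ERROR_RETRY_ENABLED = True

    # What to do once retries are exhausted or disabled: "pass", "ignore" or "err"
    # (patch default: "pass").
    ZYTE_API_ACTION_ERROR_HANDLING = "err"

    # Action-error retries reuse Scrapy's regular retry knobs.
    RETRY_TIMES = 2
    RETRY_PRIORITY_ADJUST = -1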
+ ) + ], + ), + ], +) +@ensureDeferred +async def test_action_error_handling_retries(settings, expected, setup_errors, caplog): + settings["RETRY_TIMES"] = 1 + crawler = get_crawler(settings_dict=settings) + await crawler.crawl() + + middleware = create_instance( + ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler + ) + if setup_errors: + assert caplog.record_tuples == [ + ("scrapy_zyte_api._middlewares", logging.ERROR, error) + for error in setup_errors + ] + else: + assert not caplog.records + + request = Request("https://example.com") + raw_api_response = {"url": request.url, "actions": [{"error": "foo"}]} + response = ZyteAPIResponse.from_api_response(raw_api_response, request=request) + + request2 = middleware.process_response(request, response, crawler.spider) + assert isinstance(request2, Request) + assert request2.meta["retry_times"] == 1 + + try: + result = middleware.process_response(request2, response, crawler.spider) + except (ActionError, IgnoreRequest) as e: + result = e + assert isinstance(result, expected) + + await crawler.stop() + + +@ensureDeferred +async def test_process_response_non_zyte_api(): + crawler = get_crawler() + await crawler.crawl() + + middleware = create_instance( + ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler + ) + + request = Request("https://example.com") + response = Response(request.url) + result = middleware.process_response(request, response, crawler.spider) + assert result is response + + await crawler.stop() + + +@ensureDeferred +async def test_process_response_no_action_error(): + crawler = get_crawler() + await crawler.crawl() + + middleware = create_instance( + ScrapyZyteAPIDownloaderMiddleware, settings=crawler.settings, crawler=crawler + ) + + request = Request("https://example.com") + raw_api_response = {"url": request.url, "actions": [{"action": "foo"}]} + response = ZyteAPIResponse.from_api_response(raw_api_response, request=request) + + result = middleware.process_response(request, response, crawler.spider) + assert result is response + + await crawler.stop() From 75faf6885cd1e9a462abeb10b1c225fe82eac910 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 17 Jan 2024 15:12:05 +0100 Subject: [PATCH 5/9] Make mypy happy --- scrapy_zyte_api/_middlewares.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py index 5af5d47c..9a1db160 100644 --- a/scrapy_zyte_api/_middlewares.py +++ b/scrapy_zyte_api/_middlewares.py @@ -202,6 +202,7 @@ def process_response( if not isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)): return response + assert response.raw_api_response is not None action_error = any( "error" in action for action in response.raw_api_response["actions"] ) From cdc0aa15320b0772f5460e528171b8979c0a5ab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 17 Jan 2024 15:14:48 +0100 Subject: [PATCH 6/9] Do not assume Zyte API responses have actions --- docs/usage/actions.rst | 2 +- scrapy_zyte_api/_middlewares.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage/actions.rst b/docs/usage/actions.rst index e4d5646d..1e22c9f1 100644 --- a/docs/usage/actions.rst +++ b/docs/usage/actions.rst @@ -34,7 +34,7 @@ cached: def should_cache_response(self, response: Response, request: Request): if ( isinstance(response, (ZyteAPIResponse, ZyteAPITextResponse)) - and any("error" in action for action in response.raw_api_response["actions"]) + and 
any("error" in action for action in response.raw_api_response.get("actions", [])) ): return False return super().should_cache_response(response, request) diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py index 9a1db160..c38e7c88 100644 --- a/scrapy_zyte_api/_middlewares.py +++ b/scrapy_zyte_api/_middlewares.py @@ -204,7 +204,7 @@ def process_response( assert response.raw_api_response is not None action_error = any( - "error" in action for action in response.raw_api_response["actions"] + "error" in action for action in response.raw_api_response.get("actions", []) ) if not action_error: return response From f19e1577cdd9ab4502516753b08b2d687582d074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 17 Jan 2024 15:24:12 +0100 Subject: [PATCH 7/9] Backport get_retry_request for older Scrapy versions --- scrapy_zyte_api/_middlewares.py | 101 +++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/scrapy_zyte_api/_middlewares.py b/scrapy_zyte_api/_middlewares.py index c38e7c88..39168dda 100644 --- a/scrapy_zyte_api/_middlewares.py +++ b/scrapy_zyte_api/_middlewares.py @@ -2,15 +2,114 @@ from typing import Optional, Union, cast from scrapy import Request, Spider, signals -from scrapy.downloadermiddlewares.retry import get_retry_request from scrapy.exceptions import IgnoreRequest from scrapy.http import Response +from scrapy.utils.python import global_object_name from zyte_api.aio.errors import RequestError from ._params import _ParamParser from .exceptions import ActionError from .responses import ZyteAPIResponse, ZyteAPITextResponse +try: + from scrapy.downloadermiddlewares.retry import get_retry_request +except ImportError: + # Backport get_retry_request for Scrapy < 2.5.0 + + from logging import Logger + from typing import Type + + from scrapy.downloadermiddlewares.retry import logger as retry_logger + + def get_retry_request( + request: Request, + *, + spider: Spider, + reason: Union[str, Exception, Type[Exception]] = "unspecified", + max_retry_times: Optional[int] = None, + priority_adjust: Optional[int] = None, + logger: Logger = retry_logger, + stats_base_key: str = "retry", + ) -> Optional[Request]: + """ + Returns a new :class:`~scrapy.Request` object to retry the specified + request, or ``None`` if retries of the specified request have been + exhausted. + + For example, in a :class:`~scrapy.Spider` callback, you could use it as + follows:: + + def parse(self, response): + if not response.text: + new_request_or_none = get_retry_request( + response.request, + spider=self, + reason='empty', + ) + return new_request_or_none + + *spider* is the :class:`~scrapy.Spider` instance which is asking for the + retry request. It is used to access the :ref:`settings ` + and :ref:`stats `, and to provide extra logging context (see + :func:`logging.debug`). + + *reason* is a string or an :class:`Exception` object that indicates the + reason why the request needs to be retried. It is used to name retry stats. + + *max_retry_times* is a number that determines the maximum number of times + that *request* can be retried. If not specified or ``None``, the number is + read from the :reqmeta:`max_retry_times` meta key of the request. If the + :reqmeta:`max_retry_times` meta key is not defined or ``None``, the number + is read from the :setting:`RETRY_TIMES` setting. + + *priority_adjust* is a number that determines how the priority of the new + request changes in relation to *request*. 
If not specified, the number is + read from the :setting:`RETRY_PRIORITY_ADJUST` setting. + + *logger* is the logging.Logger object to be used when logging messages + + *stats_base_key* is a string to be used as the base key for the + retry-related job stats + """ + settings = spider.crawler.settings + assert spider.crawler.stats + stats = spider.crawler.stats + retry_times = request.meta.get("retry_times", 0) + 1 + if max_retry_times is None: + max_retry_times = request.meta.get("max_retry_times") + if max_retry_times is None: + max_retry_times = settings.getint("RETRY_TIMES") + if retry_times <= max_retry_times: + logger.debug( + "Retrying %(request)s (failed %(retry_times)d times): %(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + new_request: Request = request.copy() + new_request.meta["retry_times"] = retry_times + new_request.dont_filter = True + if priority_adjust is None: + priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST") + new_request.priority = request.priority + priority_adjust + + if callable(reason): + reason = reason() + if isinstance(reason, Exception): + reason = global_object_name(reason.__class__) + + stats.inc_value(f"{stats_base_key}/count") + stats.inc_value(f"{stats_base_key}/reason_count/{reason}") + return new_request + stats.inc_value(f"{stats_base_key}/max_reached") + logger.error( + "Gave up retrying %(request)s (failed %(retry_times)d times): " + "%(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + return None + + logger = logging.getLogger(__name__) From 62c9ae6072509e90242553ad2f953a47a6043869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 17 Jan 2024 15:26:33 +0100 Subject: [PATCH 8/9] Fix test failures due to a nameless spider class --- tests/test_middlewares.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 560119cc..0e117831 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -436,7 +436,7 @@ class CrawleraSpider(Spider): ) @ensureDeferred async def test_action_error_retry_enabled(settings, meta, enabled): - crawler = get_crawler(settings_dict=settings) + crawler = get_crawler(NamedSpider, settings_dict=settings) await crawler.crawl() middleware = create_instance( @@ -483,7 +483,7 @@ async def test_action_error_retry_enabled(settings, meta, enabled): ) @ensureDeferred async def test_action_error_retry_times(settings, meta, max_retries): - crawler = get_crawler(settings_dict=settings) + crawler = get_crawler(NamedSpider, settings_dict=settings) await crawler.crawl() middleware = create_instance( @@ -533,7 +533,7 @@ async def test_action_error_retry_times(settings, meta, max_retries): ) @ensureDeferred async def test_action_error_retry_priority_adjust(settings, meta, priority): - crawler = get_crawler(settings_dict=settings) + crawler = get_crawler(NamedSpider, settings_dict=settings) await crawler.crawl() middleware = create_instance( @@ -592,7 +592,7 @@ async def test_action_error_handling_no_retries( settings, expected, setup_errors, caplog ): settings["ZYTE_API_ACTION_ERROR_RETRY_ENABLED"] = False - crawler = get_crawler(settings_dict=settings) + crawler = get_crawler(NamedSpider, settings_dict=settings) await crawler.crawl() middleware = create_instance( @@ -657,7 +657,7 @@ async def test_action_error_handling_no_retries( @ensureDeferred async def 

From 62c9ae6072509e90242553ad2f953a47a6043869 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 17 Jan 2024 15:26:33 +0100
Subject: [PATCH 8/9] Fix test failures due to a nameless spider class

---
 tests/test_middlewares.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py
index 560119cc..0e117831 100644
--- a/tests/test_middlewares.py
+++ b/tests/test_middlewares.py
@@ -436,7 +436,7 @@ class CrawleraSpider(Spider):
 )
 @ensureDeferred
 async def test_action_error_retry_enabled(settings, meta, enabled):
-    crawler = get_crawler(settings_dict=settings)
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
     await crawler.crawl()
 
     middleware = create_instance(
@@ -483,7 +483,7 @@ async def test_action_error_retry_enabled(settings, meta, enabled):
 )
 @ensureDeferred
 async def test_action_error_retry_times(settings, meta, max_retries):
-    crawler = get_crawler(settings_dict=settings)
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
     await crawler.crawl()
 
     middleware = create_instance(
@@ -533,7 +533,7 @@ async def test_action_error_retry_times(settings, meta, max_retries):
 )
 @ensureDeferred
 async def test_action_error_retry_priority_adjust(settings, meta, priority):
-    crawler = get_crawler(settings_dict=settings)
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
    await crawler.crawl()
 
     middleware = create_instance(
@@ -592,7 +592,7 @@ async def test_action_error_handling_no_retries(
     settings, expected, setup_errors, caplog
 ):
     settings["ZYTE_API_ACTION_ERROR_RETRY_ENABLED"] = False
-    crawler = get_crawler(settings_dict=settings)
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
     await crawler.crawl()
 
     middleware = create_instance(
@@ -657,7 +657,7 @@ async def test_action_error_handling_no_retries(
 @ensureDeferred
 async def test_action_error_handling_retries(settings, expected, setup_errors, caplog):
     settings["RETRY_TIMES"] = 1
-    crawler = get_crawler(settings_dict=settings)
+    crawler = get_crawler(NamedSpider, settings_dict=settings)
     await crawler.crawl()
 
     middleware = create_instance(
@@ -690,7 +690,7 @@ async def test_action_error_handling_retries(settings, expected, setup_errors, c
 
 @ensureDeferred
 async def test_process_response_non_zyte_api():
-    crawler = get_crawler()
+    crawler = get_crawler(NamedSpider)
     await crawler.crawl()
 
     middleware = create_instance(
@@ -707,7 +707,7 @@ async def test_process_response_non_zyte_api():
 
 @ensureDeferred
 async def test_process_response_no_action_error():
-    crawler = get_crawler()
+    crawler = get_crawler(NamedSpider)
     await crawler.crawl()
 
     middleware = create_instance(

From ea2da8ff81195c4fbd7ad19a57b7347bab9e094f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Thu, 22 Feb 2024 10:47:05 +0100
Subject: [PATCH 9/9] Update docs/reference/settings.rst

---
 docs/reference/settings.rst | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst
index a472c4b3..90b8089c 100644
--- a/docs/reference/settings.rst
+++ b/docs/reference/settings.rst
@@ -36,9 +36,8 @@
 
 - ``"ignore"``: Responses are dropped, i.e. they will not reach your spider.
 
-- ``"err"``: :class:`~scrapy_zyte_api.exceptions.ActionError` is raised. It
-  will be processed by downloader middlewares as an exception, and will reach
-  your spider errback if you set one.
+- ``"err"``: :class:`~scrapy_zyte_api.exceptions.ActionError` is raised, and
+  will reach your spider errback if you set one.
 
 .. autoexception:: scrapy_zyte_api.exceptions.ActionError
     :members:
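
Editor's note, not part of the patches: to round out the settings documentation updated above, here is a minimal, illustrative sketch of a spider that relies on ``ZYTE_API_ACTION_ERROR_HANDLING = "err"`` and inspects :class:`~scrapy_zyte_api.exceptions.ActionError` in an errback. The spider name, URL, action and meta values are assumptions for illustration only; ``ActionError`` and its ``response`` attribute are what these patches introduce.

.. code-block:: python
    :caption: illustrative spider sketch (assumes ZYTE_API_ACTION_ERROR_HANDLING = "err")

    from scrapy import Request, Spider
    from scrapy_zyte_api.exceptions import ActionError


    class ActionsExampleSpider(Spider):
        # Hypothetical spider, not part of the patch series.
        name = "actions_example"

        def start_requests(self):
            yield Request(
                "https://example.com",
                meta={
                    "zyte_api_automap": {
                        "browserHtml": True,
                        # Illustrative browser action; any failing action
                        # triggers ActionError under the "err" handling mode.
                        "actions": [{"action": "scrollBottom"}],
                    },
                },
                callback=self.parse,
                errback=self.handle_error,
            )

        def parse(self, response):
            ...

        def handle_error(self, failure):
            if failure.check(ActionError):
                # The offending response is exposed on the exception.
                actions = failure.value.response.raw_api_response.get("actions", [])
                self.logger.warning("Browser actions failed: %r", actions)

With the default ``"pass"`` handling no errback is needed: the response reaches the regular callback, where action outcomes can still be inspected through ``response.raw_api_response``.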