From 1bcbe5170143cfa3db51cbb52fca06d37c62a3c1 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Thu, 21 Sep 2023 20:23:15 +0400 Subject: [PATCH 1/9] Fix typing issues for typed Scrapy. --- scrapy_zyte_api/handler.py | 5 +++-- scrapy_zyte_api/providers.py | 3 ++- scrapy_zyte_api/responses.py | 14 +++++++++----- tests/test_downloader_middleware.py | 7 +++++-- tests/test_responses.py | 5 +++-- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 8cbdec90..f7dcf033 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -67,8 +67,8 @@ def __init__( # We keep the client in the crawler object to prevent multiple, # duplicate clients with the same settings to be used. # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/58 - crawler.zyte_api_client = client - self._client: AsyncClient = crawler.zyte_api_client + crawler.zyte_api_client = client # type: ignore[attr-defined] + self._client: AsyncClient = crawler.zyte_api_client # type: ignore[attr-defined] logger.info("Using a Zyte API key starting with %r", self._client.api_key[:7]) verify_installed_reactor( "twisted.internet.asyncioreactor.AsyncioSelectorReactor" @@ -83,6 +83,7 @@ def __init__( ) self._param_parser = _ParamParser(crawler) self._retry_policy = _load_retry_policy(settings) + assert crawler.stats self._stats = crawler.stats self._session = create_session( connection_pool_size=self._client.n_conn, diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index 384ceada..d56af680 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -21,7 +21,7 @@ # requires Scrapy >= 2.8 from scrapy.http.request import NO_CALLBACK except ImportError: - NO_CALLBACK = None + NO_CALLBACK = None # type: ignore[assignment] class ZyteApiProvider(PageObjectInputProvider): @@ -86,6 +86,7 @@ async def __call__( }, callback=NO_CALLBACK, ) + assert crawler.engine api_response: ZyteAPITextResponse = await maybe_deferred_to_future( crawler.engine.download(api_request) ) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 18b63401..c77759d9 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -1,6 +1,6 @@ from base64 import b64decode from datetime import datetime -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, cast from scrapy import Request from scrapy.http import HtmlResponse, Response, TextResponse @@ -110,7 +110,9 @@ def _prepare_headers(cls, api_response: Dict[str, Any]): class ZyteAPITextResponse(ZyteAPIMixin, HtmlResponse): @classmethod - def from_api_response(cls, api_response: Dict, *, request: Request = None): + def from_api_response( + cls, api_response: Dict, *, request: Optional[Request] = None + ): """Alternative constructor to instantiate the response from the raw Zyte API response. """ @@ -141,7 +143,9 @@ def replace(self, *args, **kwargs): class ZyteAPIResponse(ZyteAPIMixin, Response): @classmethod - def from_api_response(cls, api_response: Dict, *, request: Request = None): + def from_api_response( + cls, api_response: Dict, *, request: Optional[Request] = None + ): """Alternative constructor to instantiate the response from the raw Zyte API response. 
""" @@ -188,8 +192,8 @@ def _process_response( if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"): response_cls = responsetypes.from_args( - headers=api_response["httpResponseHeaders"], - url=api_response["url"], + headers=cast(List[Dict[str, str]], api_response["httpResponseHeaders"]), + url=cast(str, api_response["url"]), # FIXME: update this when python-zyte-api supports base64 decoding body=b64decode(api_response["httpResponseBody"]), # type: ignore ) diff --git a/tests/test_downloader_middleware.py b/tests/test_downloader_middleware.py index 63ff44c6..28d0cc4b 100644 --- a/tests/test_downloader_middleware.py +++ b/tests/test_downloader_middleware.py @@ -14,6 +14,8 @@ async def test_autothrottle_handling(): crawler = get_crawler() await crawler.crawl("a") + assert crawler.engine + assert crawler.spider spider = crawler.spider middleware = create_instance( @@ -21,14 +23,14 @@ async def test_autothrottle_handling(): ) # AutoThrottle does this. - spider.download_delay = 5 + spider.download_delay = 5 # type: ignore[attr-defined] # No effect on non-Zyte-API requests request = Request("https://example.com") assert middleware.process_request(request, spider) is None assert "download_slot" not in request.meta _, slot = crawler.engine.downloader._get_slot(request, spider) - assert slot.delay == spider.download_delay + assert slot.delay == spider.download_delay # type: ignore[attr-defined] # On Zyte API requests, the download slot is changed, and its delay is set # to 0. @@ -122,6 +124,7 @@ def parse(self, response): f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}" in caplog.text ) + assert crawler.stats assert crawler.stats.get_value("scrapy-zyte-api/success") <= zapi_max_requests assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests assert crawler.stats.get_value("item_scraped_count") == zapi_max_requests + 6 diff --git a/tests/test_responses.py b/tests/test_responses.py index 9705a6fc..477b8169 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -1,6 +1,7 @@ from base64 import b64encode from collections import defaultdict from functools import partial +from typing import cast import pytest from scrapy import Request @@ -261,7 +262,7 @@ def test__process_response_no_body(): "product": {"name": "shoes"}, } - resp = _process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(cast(str, api_response["url"]))) assert isinstance(resp, Response) assert resp.body == b"" @@ -431,7 +432,7 @@ def test__process_response_non_text(): } ], } - resp = _process_response(api_response, Request(api_response["url"])) + resp = _process_response(api_response, Request(cast(str, api_response["url"]))) assert isinstance(resp, Response) with pytest.raises(NotSupported): From 5ac6d31adafbd9b1ad77b6aee208210356b03fea Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 27 May 2024 18:49:58 +0500 Subject: [PATCH 2/9] Test with Scrapy master. --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 073909e5..a75b60f3 100644 --- a/tox.ini +++ b/tox.ini @@ -111,6 +111,7 @@ extras = provider deps = mypy==1.8.0 types-setuptools + Scrapy @ git+https://github.com/scrapy/scrapy@master commands = mypy scrapy_zyte_api tests From aab7407972ef505e3593e86f5af1ec4ea0f41860 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 28 May 2024 13:17:19 +0500 Subject: [PATCH 3/9] More fixes. 
--- scrapy_zyte_api/_params.py | 24 ++++++++--------- scrapy_zyte_api/_request_fingerprinter.py | 4 +-- scrapy_zyte_api/addon.py | 4 ++- scrapy_zyte_api/handler.py | 4 +++ tests/__init__.py | 4 +-- tests/test_handler.py | 8 ++++-- tests/test_middlewares.py | 11 ++++++-- tests/test_providers.py | 32 +++++++++++++++++++++++ 8 files changed, 70 insertions(+), 21 deletions(-) diff --git a/scrapy_zyte_api/_params.py b/scrapy_zyte_api/_params.py index 6ddb7df9..720c0737 100644 --- a/scrapy_zyte_api/_params.py +++ b/scrapy_zyte_api/_params.py @@ -2,7 +2,7 @@ from copy import copy from logging import getLogger from os import environ -from typing import Any, Dict, List, Mapping, Optional, Set, Union +from typing import Any, Dict, Iterable, List, Mapping, Optional, Set, Tuple, Union from warnings import warn from scrapy import Request @@ -290,7 +290,7 @@ def _iter_headers( api_params: Dict[str, Any], request: Request, header_parameter: str, -): +) -> Iterable[Tuple[bytes, bytes, bytes]]: headers = api_params.get(header_parameter) if headers not in (None, True): logger.warning( @@ -306,8 +306,8 @@ def _iter_headers( continue decoded_k = k.decode() lowercase_k = k.strip().lower() - v = b",".join(v) - decoded_v = v.decode() + joined_v = b",".join(v) + decoded_v = joined_v.decode() if lowercase_k.startswith(b"x-crawlera-"): for spm_header_suffix, zapi_request_param in ( @@ -435,7 +435,7 @@ def _iter_headers( ) continue - yield k, lowercase_k, v + yield k, lowercase_k, joined_v def _map_custom_http_request_headers( @@ -461,7 +461,7 @@ def _map_request_headers( *, api_params: Dict[str, Any], request: Request, - browser_headers: Dict[str, str], + browser_headers: Dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, ): request_headers = {} @@ -477,7 +477,7 @@ def _map_request_headers( lowercase_k ] not in (ANY_VALUE, v): logger.warning( - f"Request {request} defines header {k}, which " + f"Request {request} defines header {k.decode()}, which " f"cannot be mapped into the Zyte API requestHeaders " f"parameter. See the ZYTE_API_BROWSER_HEADERS setting." 
) @@ -500,7 +500,7 @@ def _set_request_headers_from_request( api_params: Dict[str, Any], request: Request, skip_headers: SKIP_HEADER_T, - browser_headers: Dict[str, str], + browser_headers: Dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, ): """Updates *api_params*, in place, based on *request*.""" @@ -727,7 +727,7 @@ def _update_api_params_from_request( default_params: Dict[str, Any], meta_params: Dict[str, Any], skip_headers: SKIP_HEADER_T, - browser_headers: Dict[str, str], + browser_headers: Dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, cookies_enabled: bool, cookie_jars: Optional[Dict[Any, CookieJar]], @@ -859,7 +859,7 @@ def _get_automap_params( default_enabled: bool, default_params: Dict[str, Any], skip_headers: SKIP_HEADER_T, - browser_headers: Dict[str, str], + browser_headers: Dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, cookies_enabled: bool, cookie_jars: Optional[Dict[Any, CookieJar]], @@ -906,7 +906,7 @@ def _get_api_params( transparent_mode: bool, automap_params: Dict[str, Any], skip_headers: SKIP_HEADER_T, - browser_headers: Dict[str, str], + browser_headers: Dict[bytes, str], browser_ignore_headers: SKIP_HEADER_T, job_id: Optional[str], cookies_enabled: bool, @@ -1003,7 +1003,7 @@ def _load_mw_skip_headers(crawler): return mw_skip_headers -def _load_browser_headers(settings): +def _load_browser_headers(settings) -> Dict[bytes, str]: browser_headers = settings.getdict( "ZYTE_API_BROWSER_HEADERS", {"Referer": "referer"}, diff --git a/scrapy_zyte_api/_request_fingerprinter.py b/scrapy_zyte_api/_request_fingerprinter.py index 5cb2a048..ee65ba23 100644 --- a/scrapy_zyte_api/_request_fingerprinter.py +++ b/scrapy_zyte_api/_request_fingerprinter.py @@ -1,5 +1,5 @@ from logging import getLogger -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast logger = getLogger(__name__) @@ -49,7 +49,7 @@ def __init__(self, crawler): crawler=crawler, ) if self._has_poet and not isinstance( - self._fallback_request_fingerprinter, RequestFingerprinter + self._fallback_request_fingerprinter, cast(type, RequestFingerprinter) ): logger.warning( f"You have scrapy-poet installed, but your custom value " diff --git a/scrapy_zyte_api/addon.py b/scrapy_zyte_api/addon.py index 916d8591..b34035e0 100644 --- a/scrapy_zyte_api/addon.py +++ b/scrapy_zyte_api/addon.py @@ -1,3 +1,5 @@ +from typing import cast + from scrapy.settings import BaseSettings from scrapy.utils.misc import load_object @@ -54,7 +56,7 @@ def update_settings(self, settings: BaseSettings) -> None: settings.set( "REQUEST_FINGERPRINTER_CLASS", "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter", - settings.getpriority("REQUEST_FINGERPRINTER_CLASS"), + cast(int, settings.getpriority("REQUEST_FINGERPRINTER_CLASS")), ) else: settings.set( diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index 3983712b..e454ff84 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -106,6 +106,7 @@ async def engine_started(self): self._session = self._client.session(trust_env=self._trust_env) if not self._cookies_enabled: return + assert self._crawler.engine for middleware in self._crawler.engine.downloader.middleware.middlewares: if isinstance(middleware, self._cookie_mw_cls): self._cookie_jars = middleware.jars @@ -241,6 +242,9 @@ def _process_request_error(self, request, error): f"type={error.parsed.type!r}, request_id={error.request_id!r}) " f"while processing URL ({request.url}): {detail}" ) + assert self._crawler + assert self._crawler.engine + assert 
self._crawler.spider for status, error_type, close_reason in ( (401, "/auth/key-not-found", "zyte_api_bad_key"), (403, "/auth/account-suspended", "zyte_api_suspended_account"), diff --git a/tests/__init__.py b/tests/__init__.py index bdcdbad2..12374b6d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +1,6 @@ from contextlib import asynccontextmanager, contextmanager from os import environ -from typing import Any, Dict, Optional, Type, Union +from typing import Any, Dict, Optional from scrapy import Spider from scrapy.crawler import Crawler @@ -13,7 +13,7 @@ _API_KEY = "a" DEFAULT_CLIENT_CONCURRENCY = AsyncClient(api_key=_API_KEY).n_conn -SETTINGS_T = Dict[Union[Type, str], Any] +SETTINGS_T = Dict[str, Any] SETTINGS: SETTINGS_T = { "DOWNLOAD_HANDLERS": { "http": "scrapy_zyte_api.handler.ScrapyZyteAPIDownloadHandler", diff --git a/tests/test_handler.py b/tests/test_handler.py index 287acb32..b9d8bd23 100644 --- a/tests/test_handler.py +++ b/tests/test_handler.py @@ -470,9 +470,10 @@ async def test_trust_env(enabled): ), ) def test_user_agent_for_build_client(user_agent, expected): - settings: SETTINGS_T = Settings( + settings: Settings = Settings( { - **SETTINGS, + # see https://github.com/python/mypy/issues/16557#issuecomment-1831213673 + **SETTINGS, # type: ignore[dict-item] "_ZYTE_API_USER_AGENT": user_agent, } ) @@ -499,6 +500,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "zyte_api_bad_key" @@ -526,6 +528,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "zyte_api_suspended_account" @@ -548,6 +551,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "zyte_api_suspended_account" diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 05410108..5dc58ff0 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -189,6 +189,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "failed_forbidden_domain" @@ -215,6 +216,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "failed_forbidden_domain" @@ -240,6 +242,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "finished" @@ -264,6 +267,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "finished" @@ -294,6 +298,7 @@ def parse(self, response): crawler = get_crawler(TestSpider, settings_dict=settings) await crawler.crawl() + assert crawler.stats assert crawler.stats.get_value("finish_reason") == "failed_forbidden_domain" @@ -323,7 +328,7 @@ class SPMSpider(Spider): start_urls = ["data:,"] if attribute is not None: - SPMSpider.zyte_smartproxy_enabled = attribute + SPMSpider.zyte_smartproxy_enabled = attribute # type: ignore[attr-defined] settings = { 
"ZYTE_API_TRANSPARENT_MODE": True, @@ -340,6 +345,7 @@ class SPMSpider(Spider): crawler = get_crawler(SPMSpider, settings_dict=settings) await crawler.crawl() expected = "plugin_conflict" if conflict else "finished" + assert crawler.stats assert crawler.stats.get_value("finish_reason") == expected @@ -377,7 +383,7 @@ class CrawleraSpider(Spider): start_urls = ["data:,"] if attribute is not None: - CrawleraSpider.crawlera_enabled = attribute + CrawleraSpider.crawlera_enabled = attribute # type: ignore[attr-defined] settings = { "ZYTE_API_TRANSPARENT_MODE": True, @@ -394,6 +400,7 @@ class CrawleraSpider(Spider): crawler = get_crawler(CrawleraSpider, settings_dict=settings) await crawler.crawl() expected = "plugin_conflict" if conflict else "finished" + assert crawler.stats assert crawler.stats.get_value("finish_reason") == expected, ( setting, attribute, diff --git a/tests/test_providers.py b/tests/test_providers.py index aae2f05c..7c029fcd 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -419,6 +419,8 @@ class SomePage(BasePage): response: AnyResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -447,6 +449,8 @@ class SomePage(BasePage): product: Product class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -476,6 +480,8 @@ class SomePage(BasePage): product: Product class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -508,6 +514,8 @@ class SomePage(ItemPage[Product]): response: AnyResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -542,6 +550,8 @@ class SomePage(BasePage): product: Product class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -578,6 +588,8 @@ class SomePage(BasePage): product: Product class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -612,6 +624,8 @@ class SomePage(BasePage): product: Product class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -648,6 +662,8 @@ class SomePage(BasePage): product: Product class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -683,6 +699,8 @@ class SomePage(BasePage): html: BrowserHtml class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -709,6 +727,8 @@ class SomePage(BasePage): browser_response: BrowserResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -736,6 +756,8 @@ class SomePage(BasePage): html: BrowserHtml class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -763,6 +785,8 @@ class SomePage(BasePage): http_response: HttpResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -794,6 +818,8 @@ class SomePage(BasePage): http_response: HttpResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -832,6 +858,8 @@ class SecondPage(BasePage): response: AnyResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, 
callback=self.parse_) @@ -866,6 +894,8 @@ class SecondPage(BasePage): response: AnyResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -893,6 +923,8 @@ def parse_(self, response: DummyResponse, page1: FirstPage, page2: SecondPage): @ensureDeferred async def test_screenshot(mockserver): class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) From a8fa49f3f9318bd5bfc014094a11f8ab684a8348 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Fri, 18 Oct 2024 22:40:38 +0500 Subject: [PATCH 4/9] More typing fixes. --- scrapy_zyte_api/_session.py | 18 ++++++++++++------ scrapy_zyte_api/addon.py | 2 +- scrapy_zyte_api/providers.py | 4 +++- tests/test_providers.py | 4 ++++ tests/test_sessions.py | 4 ++-- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index eb674492..b0475584 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -73,7 +73,7 @@ class DummyResponse: # type: ignore[no-redef] from scrapy.downloadermiddlewares.retry import get_retry_request except ImportError: # pragma: no cover # https://github.com/scrapy/scrapy/blob/b1fe97dc6c8509d58b29c61cf7801eeee1b409a9/scrapy/downloadermiddlewares/retry.py#L57-L142 - def get_retry_request( + def get_retry_request( # type: ignore[misc] request, *, spider, @@ -125,7 +125,7 @@ def get_retry_request( from scrapy.http.request import NO_CALLBACK except ImportError: - def NO_CALLBACK(response): + def NO_CALLBACK(response): # type: ignore[misc] pass # pragma: no cover @@ -165,7 +165,7 @@ def _get_asyncio_event_loop(): return set_asyncio_event_loop() # https://github.com/scrapy/scrapy/blob/b1fe97dc6c8509d58b29c61cf7801eeee1b409a9/scrapy/utils/defer.py#L360-L379 - def deferred_to_future(d): + def deferred_to_future(d): # type: ignore[misc] return d.asFuture(_get_asyncio_event_loop()) @@ -177,7 +177,7 @@ def deferred_to_future(d): def build_from_crawler( objcls: Type[T], crawler: Crawler, /, *args: Any, **kwargs: Any ) -> T: - return create_instance(objcls, settings=None, crawler=crawler, *args, **kwargs) + return create_instance(objcls, settings=None, crawler=crawler, *args, **kwargs) # type: ignore[misc] class PoolError(ValueError): @@ -382,7 +382,7 @@ def check(self, response: Response, request: Request) -> bool: location = self.location(request) if not location: return True - for action in response.raw_api_response.get("actions", []): + for action in response.raw_api_response.get("actions", []): # type: ignore[attr-defined] if action.get("action", None) != "setLocation": continue if action.get("error", "").startswith("Action setLocation not supported "): @@ -647,6 +647,8 @@ def _get_pool(self, request): return pool async def _init_session(self, session_id: str, request: Request, pool: str) -> bool: + assert self._crawler.engine + assert self._crawler.stats session_config = self._get_session_config(request) if meta_params := request.meta.get("zyte_api_session_params", None): session_params = meta_params @@ -685,7 +687,7 @@ async def _init_session(self, session_id: str, request: Request, pool: str) -> b callback=NO_CALLBACK, ) if _DOWNLOAD_NEEDS_SPIDER: - deferred = self._crawler.engine.download( + deferred = self._crawler.engine.download( # type: ignore[call-arg] session_init_request, spider=spider ) else: @@ -829,6 +831,7 @@ async def check(self, response: Response, request: Request) -> bool: """Check the response for signs 
of session expiration, update the internal session pool accordingly, and return ``False`` if the session has expired or ``True`` if the session passed validation.""" + assert self._crawler.stats with self._fatal_error_handler: if self.is_init_request(request): return True @@ -860,6 +863,7 @@ async def check(self, response: Response, request: Request) -> bool: async def assign(self, request: Request): """Assign a working session to *request*.""" + assert self._crawler.stats with self._fatal_error_handler: if self.is_init_request(request): return @@ -895,6 +899,7 @@ def is_enabled(self, request: Request) -> bool: return session_config.enabled(request) def handle_error(self, request: Request): + assert self._crawler.stats with self._fatal_error_handler: pool = self._get_pool(request) self._crawler.stats.inc_value( @@ -908,6 +913,7 @@ def handle_error(self, request: Request): self._start_request_session_refresh(request, pool) def handle_expiration(self, request: Request): + assert self._crawler.stats with self._fatal_error_handler: pool = self._get_pool(request) self._crawler.stats.inc_value( diff --git a/scrapy_zyte_api/addon.py b/scrapy_zyte_api/addon.py index edd0175e..1e6f8daa 100644 --- a/scrapy_zyte_api/addon.py +++ b/scrapy_zyte_api/addon.py @@ -126,5 +126,5 @@ def update_settings(self, settings: BaseSettings) -> None: settings.set( "ZYTE_API_RETRY_POLICY", _SESSION_RETRY_POLICIES.get(loaded_retry_policy, retry_policy), - settings.getpriority("ZYTE_API_RETRY_POLICY"), + cast(int, settings.getpriority("ZYTE_API_RETRY_POLICY")), ) diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py index a0eb2f6e..1a2cd496 100644 --- a/scrapy_zyte_api/providers.py +++ b/scrapy_zyte_api/providers.py @@ -5,6 +5,7 @@ from scrapy.crawler import Crawler from scrapy.utils.defer import maybe_deferred_to_future from scrapy_poet import PageObjectInputProvider +from twisted.internet.defer import Deferred from web_poet import ( AnyResponse, BrowserHtml, @@ -103,6 +104,7 @@ def is_provided(self, type_: Callable) -> bool: return super().is_provided(strip_annotated(type_)) def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type): + assert crawler.stats if cls not in _ITEM_KEYWORDS: return if self._should_track_auto_fields is None: @@ -258,7 +260,7 @@ async def __call__( # noqa: C901 ) assert crawler.engine api_response: ZyteAPITextResponse = await maybe_deferred_to_future( - crawler.engine.download(api_request) + cast("Deferred[ZyteAPITextResponse]", crawler.engine.download(api_request)) ) assert api_response.raw_api_response diff --git a/tests/test_providers.py b/tests/test_providers.py index 9f0b5fb7..3781943f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -536,6 +536,8 @@ class SomePage(BasePage): response: AnyResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) @@ -564,6 +566,8 @@ class SomePage(BasePage): response: AnyResponse class ZyteAPISpider(Spider): + url: str + def start_requests(self): yield Request(self.url, callback=self.parse_) diff --git a/tests/test_sessions.py b/tests/test_sessions.py index cc249d3c..cd5e401b 100644 --- a/tests/test_sessions.py +++ b/tests/test_sessions.py @@ -393,7 +393,7 @@ class UseChecker(ConstantChecker): def check(self, response: Response, request: Request) -> bool: if response.meta.get(SESSION_INIT_META_KEY, False) is True: return True - return super().check(request, response) + return super().check(response, request) class 
FalseUseChecker(FalseChecker, UseChecker): @@ -2789,7 +2789,7 @@ class ExceptionRaisingDownloaderMiddleware: async def process_request(self, request: Request, spider: Spider) -> None: if request.meta.get("_is_session_init_request", False): return - raise spider.exception + raise spider.exception # type: ignore[attr-defined] @pytest.mark.parametrize( From 86c5b6762c133b35609c31721402eca6df285b7b Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 4 Nov 2024 14:06:21 +0500 Subject: [PATCH 5/9] Fix passing headers in a wrong format to scrapy.responsetypes.responsetypes. --- scrapy_zyte_api/responses.py | 5 ++++- tests/test_responses.py | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 60fdc58d..3c041bbe 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -194,8 +194,11 @@ def _process_response( return ZyteAPITextResponse.from_api_response(api_response, request=request) if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"): + scrapy_headers: Dict[bytes, bytes] = {} + for header in cast(List[Dict[str, str]], api_response["httpResponseHeaders"]): + scrapy_headers[header["name"].encode()] = header["value"].encode() response_cls = responsetypes.from_args( - headers=cast(List[Dict[str, str]], api_response["httpResponseHeaders"]), + headers=scrapy_headers, url=cast(str, api_response["url"]), # FIXME: update this when python-zyte-api supports base64 decoding body=b64decode(api_response["httpResponseBody"]), # type: ignore diff --git a/tests/test_responses.py b/tests/test_responses.py index d841a008..fa6cdae6 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -64,7 +64,7 @@ def raw_api_response_browser(): "echoData": {"some_value": "here"}, "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html"}, - {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, ], "statusCode": 200, "experimental": { @@ -80,7 +80,7 @@ def raw_api_response_body(): "echoData": {"some_value": "here"}, "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html"}, - {"name": "Content-Length", "value": len(PAGE_CONTENT)}, + {"name": "Content-Length", "value": str(len(PAGE_CONTENT))}, ], "statusCode": 200, "experimental": { @@ -97,7 +97,7 @@ def raw_api_response_mixed(): "echoData": {"some_value": "here"}, "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html"}, - {"name": "Content-Length", "value": len(PAGE_CONTENT_2)}, + {"name": "Content-Length", "value": str(len(PAGE_CONTENT_2))}, ], "statusCode": 200, "experimental": { @@ -200,7 +200,7 @@ def test_non_utf8_response(): "browserHtml": content, "httpResponseHeaders": [ {"name": "Content-Type", "value": "text/html; charset=iso-8859-1"}, - {"name": "Content-Length", "value": len(content)}, + {"name": "Content-Length", "value": str(len(content))}, ], } From e0734395ab62109d8cb07dbc89be0b742fcb9e38 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 4 Nov 2024 14:58:57 +0500 Subject: [PATCH 6/9] Fix a new typing issue. 
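
The new mypy error appears to be crawler.stats being treated as optional in
the max-requests test (that reading is inferred from the one-line fix), so
narrow it with an assert before reading stats, matching the pattern used
elsewhere in the suite. A sketch with a hypothetical stat value:

    assert crawler.stats  # narrows the optional stats collector for mypy
    assert crawler.stats.get_value("scrapy-zyte-api/success") == 5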
--- tests/test_middlewares.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_middlewares.py b/tests/test_middlewares.py index 48b64091..3959b0f6 100644 --- a/tests/test_middlewares.py +++ b/tests/test_middlewares.py @@ -216,6 +216,7 @@ def parse(self, response): f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}" in caplog.text ) + assert crawler.stats assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests assert crawler.stats.get_value("item_scraped_count") == zapi_max_requests From ac349276a17e36524a181f835bccf4679bcb1396 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 4 Nov 2024 16:11:38 +0500 Subject: [PATCH 7/9] Work around a problem in Scrapy 2.0.1. --- scrapy_zyte_api/responses.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py index 3c041bbe..941754f7 100644 --- a/scrapy_zyte_api/responses.py +++ b/scrapy_zyte_api/responses.py @@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast from scrapy import Request -from scrapy.http import HtmlResponse, Response, TextResponse +from scrapy.http import Headers, HtmlResponse, Response, TextResponse from scrapy.http.cookies import CookieJar from scrapy.responsetypes import responsetypes @@ -194,7 +194,8 @@ def _process_response( return ZyteAPITextResponse.from_api_response(api_response, request=request) if api_response.get("httpResponseHeaders") and api_response.get("httpResponseBody"): - scrapy_headers: Dict[bytes, bytes] = {} + # a plain dict here doesn't work correctly on Scrapy < 2.1 + scrapy_headers = Headers() for header in cast(List[Dict[str, str]], api_response["httpResponseHeaders"]): scrapy_headers[header["name"].encode()] = header["value"].encode() response_cls = responsetypes.from_args( From a6d8388cf3afe1e7c80d0fdcc16128d9958883bc Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 18 Nov 2024 20:04:52 +0500 Subject: [PATCH 8/9] Remove Scrapy master from tox.ini. --- tox.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/tox.ini b/tox.ini index 5757e2ba..21c19341 100644 --- a/tox.ini +++ b/tox.ini @@ -111,7 +111,6 @@ extras = provider deps = mypy==1.11.2 pytest - Scrapy @ git+https://github.com/scrapy/scrapy@master commands = mypy scrapy_zyte_api tests From 380fefd60611b84cd3fc4b7bef707cbee1d70198 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 19 Nov 2024 12:26:40 +0500 Subject: [PATCH 9/9] Set REQUEST_FINGERPRINTER_IMPLEMENTATION only on Scrapy < 2.12. 
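
REQUEST_FINGERPRINTER_IMPLEMENTATION only exists in the shared test settings
to silence a deprecation warning on older Scrapy versions; gating it on the
installed version keeps Scrapy 2.12+ from warning about the setting itself
(the exact 2.12 behaviour is an assumption here; the version gate is what the
diff adds). packaging.version.Version is used for the comparison because
plain string comparison orders versions lexicographically:

    # Why Version() rather than comparing version strings directly:
    from packaging.version import Version

    assert Version("2.9") < Version("2.12")   # semantic ordering, correct
    assert not ("2.9" < "2.12")               # lexicographic ordering, wrong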
--- tests/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/__init__.py b/tests/__init__.py index ee54efb2..12e4b861 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -2,7 +2,9 @@ from os import environ from typing import Any, Dict, Optional +from packaging.version import Version from scrapy import Spider +from scrapy import __version__ as SCRAPY_VERSION from scrapy.crawler import Crawler from scrapy.utils.misc import load_object from scrapy.utils.test import get_crawler as _get_crawler @@ -25,13 +27,16 @@ "scrapy_zyte_api.ScrapyZyteAPISessionDownloaderMiddleware": 667, }, "REQUEST_FINGERPRINTER_CLASS": "scrapy_zyte_api.ScrapyZyteAPIRequestFingerprinter", - "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", # Silence deprecation warning "SPIDER_MIDDLEWARES": { "scrapy_zyte_api.ScrapyZyteAPISpiderMiddleware": 100, }, "ZYTE_API_KEY": _API_KEY, "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", } +if Version(SCRAPY_VERSION) < Version("2.12"): + SETTINGS["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = ( + "2.7" # Silence deprecation warning + ) try: import scrapy_poet # noqa: F401 except ImportError: