From d82fa279096e8d1fd26194e6855f655e8d7acad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Fri, 3 Dec 2021 13:05:49 +0100 Subject: [PATCH 1/6] AutoExtractProvider now support the new scrapy-poet cache interface Additionaly, the preferred pageType for HTML requests (``AutoExtractProductData``) is now chosen always if listed as dependency instead of just choosing the first dependency ``pageType`` to request the HTML --- CHANGES.rst | 8 +++ scrapy_autoextract/providers.py | 99 ++++++++++++++++++++++++--------- setup.py | 2 +- tests/test_providers.py | 12 ++++ 4 files changed, 93 insertions(+), 28 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 4ab6d0c..5266ef0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,14 @@ Changes ======= +TBR +------------------ + +* The AutoExtract provider now supports the new providers API for caching +* The preferred pageType for HTML requests (``AutoExtractProductData``) + is now chosen always if listed as dependency instead of just choosing + the first dependency ``pageType`` to request the HTML + 0.7.0 (2021-08-05) ------------------ diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index e4f2716..7abac03 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -1,11 +1,13 @@ import inspect +import json import logging import os from asyncio import CancelledError -from typing import Callable, Set, ClassVar, List, Any, Hashable +from dataclasses import dataclass +from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar import aiohttp -from scrapy import Request as ScrapyRequest, signals +from scrapy import Request as ScrapyRequest, signals, Request from scrapy.crawler import Crawler from scrapy.settings import Settings from autoextract.aio import request_raw, create_session @@ -30,6 +32,9 @@ _TASK_MANAGER = "_autoextract_task_manager" +AEDataType = TypeVar('AEDataType', bound=AutoExtractData, covariant=True) + + def get_autoextract_task_manager(crawler: Crawler) -> TaskManager: """ Return the configured :class:`scrapy_autoextract.TaskManager` that controls @@ -66,11 +71,20 @@ def get_concurrent_requests_per_domain(settings: Settings): return concurrency +@dataclass +class AERequestSpec: + query: List[AutoExtractRequest] + should_request_html: bool + is_extraction_required: bool + provided_cls: Callable + + class AutoExtractProvider(PageObjectInputProvider): """Provider for AutoExtract data""" # pageType requested when only html is required page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData html_query_attribute: ClassVar[str] = "fullHtml" + name = "autoextract" @classmethod def provided_classes(cls, type_: Callable) -> bool: @@ -172,7 +186,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable: def get_filled_request(self, request: ScrapyRequest, - provided_cls: AutoExtractData, + provided_cls: Type[AEDataType], should_request_html: bool) -> AutoExtractRequest: """Return a filled request for AutoExtract""" ae_request = AutoExtractRequest( @@ -183,31 +197,55 @@ def get_filled_request(self, ae_request.extra = {self.html_query_attribute: True} return ae_request - async def __call__(self, - to_provide: Set[Callable], - request: ScrapyRequest - ) -> List: - """Make an AutoExtract request and build a Page Input of provided class - based on API response data. 
- """ - if not self.aiohttp_session: - self.aiohttp_session = await self.create_aiohttp_session() + def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest): is_html_required = AutoExtractHtml in to_provide to_provide -= {AutoExtractHtml} is_extraction_required = bool(to_provide) if is_html_required and not is_extraction_required: # At least one request is required to get html to_provide = {self.page_type_class_for_html} - - instances = [] + if not to_provide: + return [] + # Use the recommended type for html request, otherwise use the first one + class_for_html = (self.page_type_class_for_html + if self.page_type_class_for_html in to_provide + else next(iter(to_provide))) + specs = [] for idx, provided_cls in enumerate(to_provide): if not issubclass(provided_cls, AutoExtractData): raise RuntimeError( f"Unexpected {provided_cls} requested. Probably a bug in the provider " "or in scrapy-poet itself") + + # html is requested only a single time to save resources + should_request_html = is_html_required and provided_cls is class_for_html + ae_request = self.get_filled_request( + request, + provided_cls, + should_request_html + ) + specs.append(AERequestSpec( + [ae_request], + should_request_html, + is_extraction_required, + provided_cls + )) + return specs + + async def __call__(self, + to_provide: Set[Callable], + request: ScrapyRequest + ) -> List: + """Make an AutoExtract request and build a Page Input of provided class + based on API response data. + """ + if not self.aiohttp_session: + self.aiohttp_session = await self.create_aiohttp_session() + + instances = [] + for spec in self.list_required_requests(to_provide, request): request_stats = AggStats() - is_first_request = idx == 0 - page_type = provided_cls.page_type + page_type = spec.provided_cls.page_type def inc_stats(suffix, value=1, both=False): self.stats.inc_value(f"autoextract/total{suffix}", value) @@ -215,19 +253,12 @@ def inc_stats(suffix, value=1, both=False): self.stats.inc_value(f"autoextract/{page_type}{suffix}", value) try: - # html is requested only a single time to save resources - should_request_html = is_html_required and is_first_request slot = self.get_per_domain_concurrency_slot(request) - ae_request = self.get_filled_request( - request, - provided_cls, - should_request_html - ) # When providing same-name arguments in both call and `__init__` # this implementation will not cause any errors (while name=value implementation would), # so predefined `__init__` arguments would override the same arguments in the call awaitable = self.do_request_cached(**{ - 'query': [ae_request], + 'query': spec.query, 'agg_stats': request_stats, **self.common_request_kwargs, }) @@ -251,15 +282,29 @@ def inc_stats(suffix, value=1, both=False): inc_stats("/attempts/count", request_stats.n_attempts) inc_stats("/attempts/billable", request_stats.n_billable_query_responses) - if should_request_html: + if spec.should_request_html: instances.append(AutoExtractHtml(url=data[page_type]['url'], html=data['html'])) inc_stats("/pages/html", both=True) - if is_extraction_required: + if spec.is_extraction_required: without_html = {k: v for k, v in data.items() if k != "html"} - instances.append(provided_cls(data=without_html)) + instances.append(spec.provided_cls(data=without_html)) inc_stats("/pages/success", both=True) return instances + + def fingerprint(self, to_provide: Set[Callable], request: Request) -> str: + queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)] + # 
pageType is the key to sort by it + by_page_type = {query['pageType']: query for query in queries} + for query in by_page_type.values(): + query.pop("pageType") + return json.dumps(by_page_type, sort_keys=True) + + def serialize(self, result: Sequence[Any]) -> Any: + return result + + def deserialize(self, data: Any) -> Sequence[Any]: + return data diff --git a/setup.py b/setup.py index f645dc5..4524f9f 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def get_version(): install_requires=[ 'autoextract-poet>=0.3.0', 'zyte-autoextract>=0.7.0', - 'scrapy-poet>=0.2.0', + 'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@injector_record_replay_native#egg=scrapy-poet', 'aiohttp', 'tldextract', 'sqlitedict>=1.7.0', diff --git a/tests/test_providers.py b/tests/test_providers.py index 0b4bd32..382fa93 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -285,3 +285,15 @@ async def cancel_after(sleep): finally: signal.signal(SIGINT, old_handler) + + +def test_provider_cache(): + injector = get_injector_for_testing({}) + request = get_response_for_testing(None).request + provider = AutoExtractProvider(injector.crawler) + finguerprint = provider.fingerprint( + {AutoExtractHtml, AutoExtractProductData, AutoExtractArticleData}, + request) + assert finguerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" + assert provider.serialize("foo") == "foo" + assert provider.deserialize("bar") == "bar" From 332af78e98a5edd3889716ebfbeea2652b8b5493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Fri, 3 Dec 2021 13:11:20 +0100 Subject: [PATCH 2/6] Fix py36 --- scrapy_autoextract/providers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 7abac03..b116d4d 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -7,6 +7,7 @@ from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar import aiohttp +import attr from scrapy import Request as ScrapyRequest, signals, Request from scrapy.crawler import Crawler from scrapy.settings import Settings @@ -71,7 +72,7 @@ def get_concurrent_requests_per_domain(settings: Settings): return concurrency -@dataclass +@attr.define() class AERequestSpec: query: List[AutoExtractRequest] should_request_html: bool From 15a8237095c5e14dbe75c11c1453856cfea98048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Fri, 3 Dec 2021 13:14:05 +0100 Subject: [PATCH 3/6] Fix py36 --- scrapy_autoextract/providers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index b116d4d..463abd0 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -3,7 +3,6 @@ import logging import os from asyncio import CancelledError -from dataclasses import dataclass from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar import aiohttp From acc66ca6ed63badbb5719be924ad04d6ff6c9034 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 22 Dec 2021 15:57:21 +0800 Subject: [PATCH 4/6] fix annotations, typos, and dep versioning --- scrapy_autoextract/providers.py | 6 +++--- setup.py | 2 +- tests/test_providers.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scrapy_autoextract/providers.py 
b/scrapy_autoextract/providers.py index 463abd0..03ab39c 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -32,7 +32,7 @@ _TASK_MANAGER = "_autoextract_task_manager" -AEDataType = TypeVar('AEDataType', bound=AutoExtractData, covariant=True) +AEDataType_co = TypeVar('AEDataType_co', bound=AutoExtractData, covariant=True) def get_autoextract_task_manager(crawler: Crawler) -> TaskManager: @@ -186,7 +186,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable: def get_filled_request(self, request: ScrapyRequest, - provided_cls: Type[AEDataType], + provided_cls: Type[AEDataType_co], should_request_html: bool) -> AutoExtractRequest: """Return a filled request for AutoExtract""" ae_request = AutoExtractRequest( @@ -197,7 +197,7 @@ def get_filled_request(self, ae_request.extra = {self.html_query_attribute: True} return ae_request - def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest): + def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest) -> List[AERequestSpec]: is_html_required = AutoExtractHtml in to_provide to_provide -= {AutoExtractHtml} is_extraction_required = bool(to_provide) diff --git a/setup.py b/setup.py index 4524f9f..2ec7762 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def get_version(): install_requires=[ 'autoextract-poet>=0.3.0', 'zyte-autoextract>=0.7.0', - 'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@injector_record_replay_native#egg=scrapy-poet', + 'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet', 'aiohttp', 'tldextract', 'sqlitedict>=1.7.0', diff --git a/tests/test_providers.py b/tests/test_providers.py index 382fa93..e30214c 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -291,9 +291,9 @@ def test_provider_cache(): injector = get_injector_for_testing({}) request = get_response_for_testing(None).request provider = AutoExtractProvider(injector.crawler) - finguerprint = provider.fingerprint( + fingerprint = provider.fingerprint( {AutoExtractHtml, AutoExtractProductData, AutoExtractArticleData}, request) - assert finguerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" + assert fingerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" assert provider.serialize("foo") == "foo" assert provider.deserialize("bar") == "bar" From 1e2be8a2ffca6888f43cecc56c182bd55b4335da Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 23 Dec 2021 13:21:26 +0800 Subject: [PATCH 5/6] use the proper Cache Mixin from scrapy-poet --- scrapy_autoextract/providers.py | 7 +++++-- tests/test_providers.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 03ab39c..3605cdb 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -19,7 +19,10 @@ from autoextract_poet.page_inputs import ( AutoExtractProductData, AutoExtractData, AutoExtractHtml, ) -from scrapy_poet.page_input_providers import PageObjectInputProvider +from scrapy_poet.page_input_providers import ( + PageObjectInputProvider, + CacheDataProviderMixin, +) from .errors import QueryError, summarize_exception from .slot_semaphore import SlotsSemaphore from .task_manager import 
TaskManager @@ -79,7 +82,7 @@ class AERequestSpec: provided_cls: Callable -class AutoExtractProvider(PageObjectInputProvider): +class AutoExtractProvider(PageObjectInputProvider, CacheDataProviderMixin): """Provider for AutoExtract data""" # pageType requested when only html is required page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData diff --git a/tests/test_providers.py b/tests/test_providers.py index e30214c..2e8556f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -297,3 +297,4 @@ def test_provider_cache(): assert fingerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" assert provider.serialize("foo") == "foo" assert provider.deserialize("bar") == "bar" + assert provider.has_cache_support From 0f0dd5518be40759868cdfff6ca0ba3372b90ee2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 23 Dec 2021 19:30:34 +0800 Subject: [PATCH 6/6] small import fix --- scrapy_autoextract/providers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 3605cdb..d09a9b6 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -7,7 +7,7 @@ import aiohttp import attr -from scrapy import Request as ScrapyRequest, signals, Request +from scrapy import Request as ScrapyRequest, signals from scrapy.crawler import Crawler from scrapy.settings import Settings from autoextract.aio import request_raw, create_session @@ -74,7 +74,7 @@ def get_concurrent_requests_per_domain(settings: Settings): return concurrency -@attr.define() +@attr.define class AERequestSpec: query: List[AutoExtractRequest] should_request_html: bool @@ -298,7 +298,7 @@ def inc_stats(suffix, value=1, both=False): return instances - def fingerprint(self, to_provide: Set[Callable], request: Request) -> str: + def fingerprint(self, to_provide: Set[Callable], request: ScrapyRequest) -> str: queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)] # pageType is the key to sort by it by_page_type = {query['pageType']: query for query in queries}
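For illustration (not part of the diffs above): the cache fingerprint added in
PATCH 1/6 keys the AutoExtract queries by ``pageType`` and serializes them
deterministically. A minimal standalone sketch of that scheme, assuming query
dicts shaped like the output of ``AutoExtractRequest.as_dict()`` (the helper
name and the sample dicts are illustrative only)::

    import json
    from typing import Any, Dict, List

    def fingerprint_from_queries(queries: List[Dict[str, Any]]) -> str:
        # Key each query by its pageType so the order in which page inputs
        # were requested does not affect the resulting cache key.
        by_page_type = {query["pageType"]: dict(query) for query in queries}
        # pageType is already encoded in the key, so drop it from the values.
        for query in by_page_type.values():
            query.pop("pageType")
        # sort_keys=True makes the serialized fingerprint deterministic.
        return json.dumps(by_page_type, sort_keys=True)

    # Sample queries mimicking an article + product + fullHtml request:
    queries = [
        {"pageType": "article", "url": "http://example.com",
         "articleBodyRaw": False},
        {"pageType": "product", "url": "http://example.com",
         "articleBodyRaw": False, "fullHtml": True},
    ]
    # Prints the same string asserted in test_provider_cache above.
    print(fingerprint_from_queries(queries))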
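Likewise, a sketch of the "preferred pageType" behaviour described in the
changelog entry: when ``AutoExtractHtml`` is requested, the ``fullHtml`` flag
is attached to the ``AutoExtractProductData`` query whenever that class is
among the requested dependencies, and only falls back to an arbitrary (first)
dependency otherwise. Plain strings stand in for the page-input classes here::

    from typing import Set

    def pick_class_for_html(to_provide: Set[str], preferred: str = "product") -> str:
        # Use the recommended page type for the HTML request when available,
        # otherwise fall back to whichever dependency comes first.
        return preferred if preferred in to_provide else next(iter(to_provide))

    assert pick_class_for_html({"article", "product"}) == "product"
    assert pick_class_for_html({"article"}) == "article"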