AutoExtractProvider now supports the new scrapy-poet cache interface #31

Open · wants to merge 7 commits into master
4 changes: 4 additions & 0 deletions CHANGES.rst
@@ -4,6 +4,10 @@ Changes
TBR
------------------

* The AutoExtract provider now supports the new scrapy-poet providers API for caching
* The preferred pageType for HTML requests (``AutoExtractProductData``)
  is now always chosen when it is listed as a dependency, instead of simply
  using the first dependency's ``pageType`` for the HTML request
* removed support for Python 3.6
* added support for Python 3.10

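The first changelog entry above (caching through the new scrapy-poet providers API) is driven entirely by project settings. A minimal sketch of how a Scrapy project might enable it follows; the setting names are assumptions based on the scrapy-poet and scrapy-autoextract documentation, not part of this PR, so verify them against the versions you install:

# settings.py -- illustrative sketch only, not taken from this PR
DOWNLOADER_MIDDLEWARES = {
    "scrapy_poet.InjectionMiddleware": 543,
}
SCRAPY_POET_PROVIDERS = {
    # registers the provider changed in this PR (assumed setting name)
    "scrapy_autoextract.AutoExtractProvider": 500,
}
SCRAPY_POET_CACHE = "/tmp/scrapy-poet-cache"  # assumed: filesystem path for cached provider data
AUTOEXTRACT_USER = "<your Zyte AutoExtract API key>"  # assumed setting for the API key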
104 changes: 76 additions & 28 deletions scrapy_autoextract/providers.py
@@ -1,10 +1,12 @@
import inspect
import json
import logging
import os
from asyncio import CancelledError
from typing import Callable, Set, ClassVar, List, Any, Hashable
from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar

import aiohttp
import attr
from scrapy import Request as ScrapyRequest, signals
from scrapy.crawler import Crawler
from scrapy.settings import Settings
@@ -17,7 +19,10 @@
from autoextract_poet.page_inputs import (
AutoExtractProductData, AutoExtractData, AutoExtractHtml,
)
from scrapy_poet.page_input_providers import PageObjectInputProvider
from scrapy_poet.page_input_providers import (
PageObjectInputProvider,
CacheDataProviderMixin,
)
from .errors import QueryError, summarize_exception
from .slot_semaphore import SlotsSemaphore
from .task_manager import TaskManager
@@ -30,6 +35,9 @@
_TASK_MANAGER = "_autoextract_task_manager"


AEDataType_co = TypeVar('AEDataType_co', bound=AutoExtractData, covariant=True)


def get_autoextract_task_manager(crawler: Crawler) -> TaskManager:
"""
Return the configured :class:`scrapy_autoextract.TaskManager` that controls
@@ -66,11 +74,20 @@ def get_concurrent_requests_per_domain(settings: Settings):
return concurrency


class AutoExtractProvider(PageObjectInputProvider):
@attr.define
class AERequestSpec:
query: List[AutoExtractRequest]
should_request_html: bool
is_extraction_required: bool
provided_cls: Callable


class AutoExtractProvider(PageObjectInputProvider, CacheDataProviderMixin):
"""Provider for AutoExtract data"""
# pageType requested when only html is required
page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData
html_query_attribute: ClassVar[str] = "fullHtml"
name = "autoextract"

@classmethod
def provided_classes(cls, type_: Callable) -> bool:
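The ``CacheDataProviderMixin`` added to the class definition above is what tells scrapy-poet that this provider's results can be cached. Roughly, the contract the provider has to fulfil looks like the sketch below; the method names mirror the ones implemented later in this diff, while the exact base-class signatures are an assumption and should be checked against the scrapy-poet version pinned in setup.py:

# Sketch of the caching contract this provider implements; not the real mixin.
from typing import Any, Callable, Sequence, Set
from scrapy import Request

class CacheableProviderSketch:
    has_cache_support = True  # advertised to scrapy-poet, asserted in the new test below

    def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
        """Stable cache key for a (page-object classes, request) pair."""
        ...

    def serialize(self, result: Sequence[Any]) -> Any:
        """Turn provider results into something the cache backend can store."""
        ...

    def deserialize(self, data: Any) -> Sequence[Any]:
        """Rebuild provider results from previously cached data."""
        ...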
Expand Down Expand Up @@ -172,7 +189,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable:

def get_filled_request(self,
request: ScrapyRequest,
provided_cls: AutoExtractData,
provided_cls: Type[AEDataType_co],
should_request_html: bool) -> AutoExtractRequest:
"""Return a filled request for AutoExtract"""
ae_request = AutoExtractRequest(
@@ -183,51 +200,68 @@ def get_filled_request(self,
ae_request.extra = {self.html_query_attribute: True}
return ae_request

async def __call__(self,
to_provide: Set[Callable],
request: ScrapyRequest
) -> List:
"""Make an AutoExtract request and build a Page Input of provided class
based on API response data.
"""
if not self.aiohttp_session:
self.aiohttp_session = await self.create_aiohttp_session()
def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest) -> List[AERequestSpec]:
is_html_required = AutoExtractHtml in to_provide
to_provide -= {AutoExtractHtml}
is_extraction_required = bool(to_provide)
if is_html_required and not is_extraction_required:
# At least one request is required to get html
to_provide = {self.page_type_class_for_html}

instances = []
if not to_provide:
return []
# Use the recommended type for the HTML request; otherwise fall back to the first one
class_for_html = (self.page_type_class_for_html
if self.page_type_class_for_html in to_provide
else next(iter(to_provide)))
specs = []
for idx, provided_cls in enumerate(to_provide):
if not issubclass(provided_cls, AutoExtractData):
raise RuntimeError(
f"Unexpected {provided_cls} requested. Probably a bug in the provider "
"or in scrapy-poet itself")

# html is requested only a single time to save resources
should_request_html = is_html_required and provided_cls is class_for_html
ae_request = self.get_filled_request(
request,
provided_cls,
should_request_html
)
specs.append(AERequestSpec(
[ae_request],
should_request_html,
is_extraction_required,
provided_cls
))
return specs

async def __call__(self,
to_provide: Set[Callable],
request: ScrapyRequest
) -> List:
"""Make an AutoExtract request and build a Page Input of provided class
based on API response data.
"""
if not self.aiohttp_session:
self.aiohttp_session = await self.create_aiohttp_session()

instances = []
for spec in self.list_required_requests(to_provide, request):
request_stats = AggStats()
is_first_request = idx == 0
page_type = provided_cls.page_type
page_type = spec.provided_cls.page_type

def inc_stats(suffix, value=1, both=False):
self.stats.inc_value(f"autoextract/total{suffix}", value)
if both:
self.stats.inc_value(f"autoextract/{page_type}{suffix}", value)

try:
# html is requested only a single time to save resources
should_request_html = is_html_required and is_first_request
slot = self.get_per_domain_concurrency_slot(request)
ae_request = self.get_filled_request(
request,
provided_cls,
should_request_html
)
# Merging the keyword arguments via dict unpacking means that a key present
# both here and in `self.common_request_kwargs` does not raise a
# "got multiple values for keyword argument" error (as name=value syntax would);
# instead, the predefined `__init__` arguments override the per-call ones
awaitable = self.do_request_cached(**{
'query': [ae_request],
'query': spec.query,
'agg_stats': request_stats,
**self.common_request_kwargs,
})
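The dict-unpacking behaviour described in the comment above can be shown in isolation. A small, self-contained sketch; the function and variable names are illustrative only, not taken from zyte-autoextract:

def do_request(**kwargs):
    return kwargs

per_call = {"query": ["per-call query"], "agg_stats": "per-call stats"}
predefined = {"query": ["predefined query"], "api_key": "secret"}

# do_request(query=..., **predefined) would raise
# "TypeError: do_request() got multiple values for keyword argument 'query'";
# merging the dicts first resolves the collision silently, and the values
# from the dict unpacked last (the predefined ones) win.
merged = do_request(**{**per_call, **predefined})
assert merged["query"] == ["predefined query"]
assert merged["api_key"] == "secret"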
@@ -251,15 +285,29 @@ def inc_stats(suffix, value=1, both=False):
inc_stats("/attempts/count", request_stats.n_attempts)
inc_stats("/attempts/billable", request_stats.n_billable_query_responses)

if should_request_html:
if spec.should_request_html:
instances.append(AutoExtractHtml(url=data[page_type]['url'],
html=data['html']))
inc_stats("/pages/html", both=True)

if is_extraction_required:
if spec.is_extraction_required:
without_html = {k: v for k, v in data.items() if k != "html"}
instances.append(provided_cls(data=without_html))
instances.append(spec.provided_cls(data=without_html))

inc_stats("/pages/success", both=True)

return instances

def fingerprint(self, to_provide: Set[Callable], request: ScrapyRequest) -> str:
queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)]
# key each query by its pageType so sort_keys can order them deterministically
by_page_type = {query['pageType']: query for query in queries}
for query in by_page_type.values():
query.pop("pageType")
return json.dumps(by_page_type, sort_keys=True)

def serialize(self, result: Sequence[Any]) -> Any:
return result

def deserialize(self, data: Any) -> Sequence[Any]:
return data
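Because ``fingerprint`` keys the queries by ``pageType`` and serializes them with ``sort_keys=True``, the cache key does not depend on the iteration order of ``to_provide``. A short sketch of that property, using the same illustrative values as the new test below:

import json

queries_a = {
    "article": {"articleBodyRaw": False, "url": "http://example.com"},
    "product": {"articleBodyRaw": False, "fullHtml": True, "url": "http://example.com"},
}
# The same queries, inserted in the opposite order.
queries_b = dict(reversed(list(queries_a.items())))

# json.dumps(..., sort_keys=True) normalizes the ordering, so both dumps match.
assert json.dumps(queries_a, sort_keys=True) == json.dumps(queries_b, sort_keys=True)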
2 changes: 1 addition & 1 deletion setup.py
@@ -29,7 +29,7 @@ def get_version():
install_requires=[
'autoextract-poet>=0.3.0',
'zyte-autoextract>=0.7.0',
'scrapy-poet>=0.2.0',
'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet',
'aiohttp',
'tldextract',
'sqlitedict>=1.7.0',
13 changes: 13 additions & 0 deletions tests/test_providers.py
@@ -285,3 +285,16 @@ async def cancel_after(sleep):

finally:
signal.signal(SIGINT, old_handler)


def test_provider_cache():
injector = get_injector_for_testing({})
request = get_response_for_testing(None).request
provider = AutoExtractProvider(injector.crawler)
fingerprint = provider.fingerprint(
{AutoExtractHtml, AutoExtractProductData, AutoExtractArticleData},
request)
assert fingerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}"""
assert provider.serialize("foo") == "foo"
assert provider.deserialize("bar") == "bar"
assert provider.has_cache_support