From d82fa279096e8d1fd26194e6855f655e8d7acad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Fri, 3 Dec 2021 13:05:49 +0100 Subject: [PATCH 1/6] AutoExtractProvider now support the new scrapy-poet cache interface Additionaly, the preferred pageType for HTML requests (``AutoExtractProductData``) is now chosen always if listed as dependency instead of just choosing the first dependency ``pageType`` to request the HTML --- CHANGES.rst | 8 +++ scrapy_autoextract/providers.py | 99 ++++++++++++++++++++++++--------- setup.py | 2 +- tests/test_providers.py | 12 ++++ 4 files changed, 93 insertions(+), 28 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 4ab6d0c..5266ef0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,14 @@ Changes ======= +TBR +------------------ + +* The AutoExtract provider now supports the new providers API for caching +* The preferred pageType for HTML requests (``AutoExtractProductData``) + is now chosen always if listed as dependency instead of just choosing + the first dependency ``pageType`` to request the HTML + 0.7.0 (2021-08-05) ------------------ diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index e4f2716..7abac03 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -1,11 +1,13 @@ import inspect +import json import logging import os from asyncio import CancelledError -from typing import Callable, Set, ClassVar, List, Any, Hashable +from dataclasses import dataclass +from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar import aiohttp -from scrapy import Request as ScrapyRequest, signals +from scrapy import Request as ScrapyRequest, signals, Request from scrapy.crawler import Crawler from scrapy.settings import Settings from autoextract.aio import request_raw, create_session @@ -30,6 +32,9 @@ _TASK_MANAGER = "_autoextract_task_manager" +AEDataType = TypeVar('AEDataType', bound=AutoExtractData, covariant=True) + + def get_autoextract_task_manager(crawler: Crawler) -> TaskManager: """ Return the configured :class:`scrapy_autoextract.TaskManager` that controls @@ -66,11 +71,20 @@ def get_concurrent_requests_per_domain(settings: Settings): return concurrency +@dataclass +class AERequestSpec: + query: List[AutoExtractRequest] + should_request_html: bool + is_extraction_required: bool + provided_cls: Callable + + class AutoExtractProvider(PageObjectInputProvider): """Provider for AutoExtract data""" # pageType requested when only html is required page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData html_query_attribute: ClassVar[str] = "fullHtml" + name = "autoextract" @classmethod def provided_classes(cls, type_: Callable) -> bool: @@ -172,7 +186,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable: def get_filled_request(self, request: ScrapyRequest, - provided_cls: AutoExtractData, + provided_cls: Type[AEDataType], should_request_html: bool) -> AutoExtractRequest: """Return a filled request for AutoExtract""" ae_request = AutoExtractRequest( @@ -183,31 +197,55 @@ def get_filled_request(self, ae_request.extra = {self.html_query_attribute: True} return ae_request - async def __call__(self, - to_provide: Set[Callable], - request: ScrapyRequest - ) -> List: - """Make an AutoExtract request and build a Page Input of provided class - based on API response data. 
- """ - if not self.aiohttp_session: - self.aiohttp_session = await self.create_aiohttp_session() + def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest): is_html_required = AutoExtractHtml in to_provide to_provide -= {AutoExtractHtml} is_extraction_required = bool(to_provide) if is_html_required and not is_extraction_required: # At least one request is required to get html to_provide = {self.page_type_class_for_html} - - instances = [] + if not to_provide: + return [] + # Use the recommended type for html request, otherwise use the first one + class_for_html = (self.page_type_class_for_html + if self.page_type_class_for_html in to_provide + else next(iter(to_provide))) + specs = [] for idx, provided_cls in enumerate(to_provide): if not issubclass(provided_cls, AutoExtractData): raise RuntimeError( f"Unexpected {provided_cls} requested. Probably a bug in the provider " "or in scrapy-poet itself") + + # html is requested only a single time to save resources + should_request_html = is_html_required and provided_cls is class_for_html + ae_request = self.get_filled_request( + request, + provided_cls, + should_request_html + ) + specs.append(AERequestSpec( + [ae_request], + should_request_html, + is_extraction_required, + provided_cls + )) + return specs + + async def __call__(self, + to_provide: Set[Callable], + request: ScrapyRequest + ) -> List: + """Make an AutoExtract request and build a Page Input of provided class + based on API response data. + """ + if not self.aiohttp_session: + self.aiohttp_session = await self.create_aiohttp_session() + + instances = [] + for spec in self.list_required_requests(to_provide, request): request_stats = AggStats() - is_first_request = idx == 0 - page_type = provided_cls.page_type + page_type = spec.provided_cls.page_type def inc_stats(suffix, value=1, both=False): self.stats.inc_value(f"autoextract/total{suffix}", value) @@ -215,19 +253,12 @@ def inc_stats(suffix, value=1, both=False): self.stats.inc_value(f"autoextract/{page_type}{suffix}", value) try: - # html is requested only a single time to save resources - should_request_html = is_html_required and is_first_request slot = self.get_per_domain_concurrency_slot(request) - ae_request = self.get_filled_request( - request, - provided_cls, - should_request_html - ) # When providing same-name arguments in both call and `__init__` # this implementation will not cause any errors (while name=value implementation would), # so predefined `__init__` arguments would override the same arguments in the call awaitable = self.do_request_cached(**{ - 'query': [ae_request], + 'query': spec.query, 'agg_stats': request_stats, **self.common_request_kwargs, }) @@ -251,15 +282,29 @@ def inc_stats(suffix, value=1, both=False): inc_stats("/attempts/count", request_stats.n_attempts) inc_stats("/attempts/billable", request_stats.n_billable_query_responses) - if should_request_html: + if spec.should_request_html: instances.append(AutoExtractHtml(url=data[page_type]['url'], html=data['html'])) inc_stats("/pages/html", both=True) - if is_extraction_required: + if spec.is_extraction_required: without_html = {k: v for k, v in data.items() if k != "html"} - instances.append(provided_cls(data=without_html)) + instances.append(spec.provided_cls(data=without_html)) inc_stats("/pages/success", both=True) return instances + + def fingerprint(self, to_provide: Set[Callable], request: Request) -> str: + queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)] + # 
pageType is the key to sort by it + by_page_type = {query['pageType']: query for query in queries} + for query in by_page_type.values(): + query.pop("pageType") + return json.dumps(by_page_type, sort_keys=True) + + def serialize(self, result: Sequence[Any]) -> Any: + return result + + def deserialize(self, data: Any) -> Sequence[Any]: + return data diff --git a/setup.py b/setup.py index f645dc5..4524f9f 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def get_version(): install_requires=[ 'autoextract-poet>=0.3.0', 'zyte-autoextract>=0.7.0', - 'scrapy-poet>=0.2.0', + 'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@injector_record_replay_native#egg=scrapy-poet', 'aiohttp', 'tldextract', 'sqlitedict>=1.7.0', diff --git a/tests/test_providers.py b/tests/test_providers.py index 0b4bd32..382fa93 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -285,3 +285,15 @@ async def cancel_after(sleep): finally: signal.signal(SIGINT, old_handler) + + +def test_provider_cache(): + injector = get_injector_for_testing({}) + request = get_response_for_testing(None).request + provider = AutoExtractProvider(injector.crawler) + finguerprint = provider.fingerprint( + {AutoExtractHtml, AutoExtractProductData, AutoExtractArticleData}, + request) + assert finguerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" + assert provider.serialize("foo") == "foo" + assert provider.deserialize("bar") == "bar" From 332af78e98a5edd3889716ebfbeea2652b8b5493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Fri, 3 Dec 2021 13:11:20 +0100 Subject: [PATCH 2/6] Fix py36 --- scrapy_autoextract/providers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 7abac03..b116d4d 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -7,6 +7,7 @@ from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar import aiohttp +import attr from scrapy import Request as ScrapyRequest, signals, Request from scrapy.crawler import Crawler from scrapy.settings import Settings @@ -71,7 +72,7 @@ def get_concurrent_requests_per_domain(settings: Settings): return concurrency -@dataclass +@attr.define() class AERequestSpec: query: List[AutoExtractRequest] should_request_html: bool From 15a8237095c5e14dbe75c11c1453856cfea98048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Iv=C3=A1n=20de=20Prado?= Date: Fri, 3 Dec 2021 13:14:05 +0100 Subject: [PATCH 3/6] Fix py36 --- scrapy_autoextract/providers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index b116d4d..463abd0 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -3,7 +3,6 @@ import logging import os from asyncio import CancelledError -from dataclasses import dataclass from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar import aiohttp From acc66ca6ed63badbb5719be924ad04d6ff6c9034 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 22 Dec 2021 15:57:21 +0800 Subject: [PATCH 4/6] fix annotations, typos, and dep versioning --- scrapy_autoextract/providers.py | 6 +++--- setup.py | 2 +- tests/test_providers.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scrapy_autoextract/providers.py 
b/scrapy_autoextract/providers.py index 463abd0..03ab39c 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -32,7 +32,7 @@ _TASK_MANAGER = "_autoextract_task_manager" -AEDataType = TypeVar('AEDataType', bound=AutoExtractData, covariant=True) +AEDataType_co = TypeVar('AEDataType_co', bound=AutoExtractData, covariant=True) def get_autoextract_task_manager(crawler: Crawler) -> TaskManager: @@ -186,7 +186,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable: def get_filled_request(self, request: ScrapyRequest, - provided_cls: Type[AEDataType], + provided_cls: Type[AEDataType_co], should_request_html: bool) -> AutoExtractRequest: """Return a filled request for AutoExtract""" ae_request = AutoExtractRequest( @@ -197,7 +197,7 @@ def get_filled_request(self, ae_request.extra = {self.html_query_attribute: True} return ae_request - def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest): + def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest) -> List[AERequestSpec]: is_html_required = AutoExtractHtml in to_provide to_provide -= {AutoExtractHtml} is_extraction_required = bool(to_provide) diff --git a/setup.py b/setup.py index 4524f9f..2ec7762 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ def get_version(): install_requires=[ 'autoextract-poet>=0.3.0', 'zyte-autoextract>=0.7.0', - 'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@injector_record_replay_native#egg=scrapy-poet', + 'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet', 'aiohttp', 'tldextract', 'sqlitedict>=1.7.0', diff --git a/tests/test_providers.py b/tests/test_providers.py index 382fa93..e30214c 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -291,9 +291,9 @@ def test_provider_cache(): injector = get_injector_for_testing({}) request = get_response_for_testing(None).request provider = AutoExtractProvider(injector.crawler) - finguerprint = provider.fingerprint( + fingerprint = provider.fingerprint( {AutoExtractHtml, AutoExtractProductData, AutoExtractArticleData}, request) - assert finguerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" + assert fingerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" assert provider.serialize("foo") == "foo" assert provider.deserialize("bar") == "bar" From 1e2be8a2ffca6888f43cecc56c182bd55b4335da Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 23 Dec 2021 13:21:26 +0800 Subject: [PATCH 5/6] use the proper Cache Mixin from scrapy-poet --- scrapy_autoextract/providers.py | 7 +++++-- tests/test_providers.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 03ab39c..3605cdb 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -19,7 +19,10 @@ from autoextract_poet.page_inputs import ( AutoExtractProductData, AutoExtractData, AutoExtractHtml, ) -from scrapy_poet.page_input_providers import PageObjectInputProvider +from scrapy_poet.page_input_providers import ( + PageObjectInputProvider, + CacheDataProviderMixin, +) from .errors import QueryError, summarize_exception from .slot_semaphore import SlotsSemaphore from .task_manager import 
TaskManager @@ -79,7 +82,7 @@ class AERequestSpec: provided_cls: Callable -class AutoExtractProvider(PageObjectInputProvider): +class AutoExtractProvider(PageObjectInputProvider, CacheDataProviderMixin): """Provider for AutoExtract data""" # pageType requested when only html is required page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData diff --git a/tests/test_providers.py b/tests/test_providers.py index e30214c..2e8556f 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -297,3 +297,4 @@ def test_provider_cache(): assert fingerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}""" assert provider.serialize("foo") == "foo" assert provider.deserialize("bar") == "bar" + assert provider.has_cache_support From 0f0dd5518be40759868cdfff6ca0ba3372b90ee2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 23 Dec 2021 19:30:34 +0800 Subject: [PATCH 6/6] small import fix --- scrapy_autoextract/providers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py index 3605cdb..d09a9b6 100644 --- a/scrapy_autoextract/providers.py +++ b/scrapy_autoextract/providers.py @@ -7,7 +7,7 @@ import aiohttp import attr -from scrapy import Request as ScrapyRequest, signals, Request +from scrapy import Request as ScrapyRequest, signals from scrapy.crawler import Crawler from scrapy.settings import Settings from autoextract.aio import request_raw, create_session @@ -74,7 +74,7 @@ def get_concurrent_requests_per_domain(settings: Settings): return concurrency -@attr.define() +@attr.define class AERequestSpec: query: List[AutoExtractRequest] should_request_html: bool @@ -298,7 +298,7 @@ def inc_stats(suffix, value=1, both=False): return instances - def fingerprint(self, to_provide: Set[Callable], request: Request) -> str: + def fingerprint(self, to_provide: Set[Callable], request: ScrapyRequest) -> str: queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)] # pageType is the key to sort by it by_page_type = {query['pageType']: query for query in queries}
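For illustration (not part of the diffs above): the cache fingerprint added in
PATCH 1/6 keys the AutoExtract queries by ``pageType`` and serializes them
deterministically. A minimal standalone sketch of that scheme, assuming query
dicts shaped like the output of ``AutoExtractRequest.as_dict()`` (the helper
name and the sample dicts are illustrative only)::

    import json
    from typing import Any, Dict, List

    def fingerprint_from_queries(queries: List[Dict[str, Any]]) -> str:
        # Key each query by its pageType so the order in which page inputs
        # were requested does not affect the resulting cache key.
        by_page_type = {query["pageType"]: dict(query) for query in queries}
        # pageType is already encoded in the key, so drop it from the values.
        for query in by_page_type.values():
            query.pop("pageType")
        # sort_keys=True makes the serialized fingerprint deterministic.
        return json.dumps(by_page_type, sort_keys=True)

    # Sample queries mimicking an article + product + fullHtml request:
    queries = [
        {"pageType": "article", "url": "http://example.com",
         "articleBodyRaw": False},
        {"pageType": "product", "url": "http://example.com",
         "articleBodyRaw": False, "fullHtml": True},
    ]
    # Prints the same string asserted in test_provider_cache above.
    print(fingerprint_from_queries(queries))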
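Likewise, a sketch of the "preferred pageType" behaviour described in the
changelog entry: when ``AutoExtractHtml`` is requested, the ``fullHtml`` flag
is attached to the ``AutoExtractProductData`` query whenever that class is
among the requested dependencies, and only falls back to an arbitrary (first)
dependency otherwise. Plain strings stand in for the page-input classes here::

    from typing import Set

    def pick_class_for_html(to_provide: Set[str], preferred: str = "product") -> str:
        # Use the recommended page type for the HTML request when available,
        # otherwise fall back to whichever dependency comes first.
        return preferred if preferred in to_provide else next(iter(to_provide))

    assert pick_class_for_html({"article", "product"}) == "product"
    assert pick_class_for_html({"article"}) == "article"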