From 81805e4fbb20a92a995da74713e76338a84bfa66 Mon Sep 17 00:00:00 2001
From: asadurski
Date: Mon, 29 Nov 2021 10:28:37 +0100
Subject: [PATCH 1/2] add ScrapyCloudCollectionCache

---
 README.rst                      | 10 +++++++--
 scrapy_autoextract/cache.py     | 28 +++++++++++++++++++++++-
 scrapy_autoextract/providers.py | 38 ++++++++++++++++++++++-----------
 scrapy_autoextract/utils.py     | 13 +++++++++++
 setup.py                        |  1 +
 5 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/README.rst b/README.rst
index 9f91262..a48d286 100644
--- a/README.rst
+++ b/README.rst
@@ -118,7 +118,7 @@ For example::
 The examples above extract an article from the page, but you may want to
 extract a different type of item, like a product or a job posting. It is
 as easy as using the correct type annotation in the callback. This
-is how the callback looks like if we need to extract a real state
+is how the callback looks if we need to extract real estate
 data from the page::
 
     def parse(self,
@@ -245,6 +245,12 @@ Provider settings
 - ``AUTOEXTRACT_CACHE_GZIP`` [optional] when True (default), cached AutoExtract
   responses are compressed using gzip. Set this option to False to turn
   compression off.
+- ``AUTOEXTRACT_CACHE_COLLECTION`` [optional] when True, AutoExtract responses
+  are stored in a Scrapy Cloud collection named after the job id,
+  e.g. ``111_222_333_cache`` for job ``111/222/333``.
+  Using collections is mutually exclusive with the ``AUTOEXTRACT_CACHE_FILENAME`` setting.
+  If the spider is run locally, the project number should be set in the ``DEV_PROJECT`` setting.
+  The default collection name is ``dev_cache``.
 
 Limitations
 ===========
@@ -284,4 +290,4 @@ When using the AutoExtract providers, be aware that:
 .. _`Scrapy's asyncio documentation`: https://docs.scrapy.org/en/latest/topics/asyncio.html
 .. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level
 .. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level
-.. _`supported page types`: https://autoextract-poet.readthedocs.io/en/stable/_autosummary/autoextract_poet.pages.html#module-autoextract_poet.pages
\ No newline at end of file
+.. _`supported page types`: https://autoextract-poet.readthedocs.io/en/stable/_autosummary/autoextract_poet.pages.html#module-autoextract_poet.pages
diff --git a/scrapy_autoextract/cache.py b/scrapy_autoextract/cache.py
index 3f2cd4d..60ec579 100644
--- a/scrapy_autoextract/cache.py
+++ b/scrapy_autoextract/cache.py
@@ -1,11 +1,12 @@
 import abc
-import json
 import gzip
+import json
 import pickle
 import sqlite3
 
 import sqlitedict
 from autoextract.request import Request
+from scrapinghub import NotFound, ScrapinghubClient
 
 
 class _Cache(abc.ABC):
@@ -88,3 +89,28 @@ def __setitem__(self, fingerprint: str, value) -> None:
 
     def close(self):
         self.db.close()
+
+
+class ScrapyCloudCollectionCache(_Cache):
+    def __init__(self, project, collection):
+        sc = ScrapinghubClient()
+        self.collection = sc.get_project(project).collections.get_store(collection)
+
+    @classmethod
+    def fingerprint(cls, request: Request) -> str:
+        return request.url
+
+    def __getitem__(self, fingerprint: str):
+        try:
+            return self.collection.get(fingerprint)
+        except NotFound:
+            raise KeyError
+
+    def __setitem__(self, fingerprint: str, value) -> None:
+        self.collection.set(
+            {'_key': fingerprint,
+             'value': value}
+        )
+
+    def close(self):
+        pass
diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py
index e4f2716..f711692 100644
--- a/scrapy_autoextract/providers.py
+++ b/scrapy_autoextract/providers.py
@@ -2,27 +2,28 @@
 import logging
 import os
 from asyncio import CancelledError
-from typing import Callable, Set, ClassVar, List, Any, Hashable
+from typing import Any, Callable, ClassVar, Hashable, List, Set
 
 import aiohttp
-from scrapy import Request as ScrapyRequest, signals
-from scrapy.crawler import Crawler
-from scrapy.settings import Settings
-from autoextract.aio import request_raw, create_session
-from autoextract.aio.errors import RequestError, \
-    ACCOUNT_DISABLED_ERROR_TYPE
+from autoextract.aio import create_session, request_raw
+from autoextract.aio.errors import ACCOUNT_DISABLED_ERROR_TYPE, RequestError
 from autoextract.aio.retry import RetryFactory
 from autoextract.request import Request as AutoExtractRequest
 from autoextract.stats import AggStats
-from autoextract_poet.page_inputs import (
-    AutoExtractProductData, AutoExtractData, AutoExtractHtml,
-)
+from autoextract_poet.page_inputs import (AutoExtractData, AutoExtractHtml,
+                                          AutoExtractProductData)
+from scrapy import Request as ScrapyRequest
+from scrapy import signals
+from scrapy.crawler import Crawler
+from scrapy.settings import Settings
 from scrapy_poet.page_input_providers import PageObjectInputProvider
+
+from .cache import AutoExtractCache, DummyCache, ScrapyCloudCollectionCache
 from .errors import QueryError, summarize_exception
 from .slot_semaphore import SlotsSemaphore
 from .task_manager import TaskManager
-from .utils import get_domain, get_scrapy_data_path
-from .cache import AutoExtractCache, DummyCache
+from .utils import (get_collection_name, get_domain, get_project_from_job,
+                    get_scrapy_data_path)
 
 
 logger = logging.getLogger(__name__)
@@ -93,11 +94,16 @@ def __init__(self, crawler: Crawler):
         self.per_domain_semaphore = SlotsSemaphore(per_domain_concurrency)
 
         cache_filename = self.settings.get('AUTOEXTRACT_CACHE_FILENAME')
+        cache_collection = self.settings.get('AUTOEXTRACT_CACHE_COLLECTION')
+        check_configuration(cache_filename, cache_collection)
         if cache_filename:
            cache_filename = os.path.join(get_scrapy_data_path(createdir=True),
                                           cache_filename)
             compressed = self.settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
             self.cache = AutoExtractCache(cache_filename, compressed=compressed)
+        elif cache_collection:
+            project = get_project_from_job() or self.settings.get('DEV_PROJECT')
+            self.cache = ScrapyCloudCollectionCache(project, get_collection_name())
         else:
             self.cache = DummyCache()
 
@@ -263,3 +269,11 @@ def inc_stats(suffix, value=1, both=False):
                     inc_stats("/pages/success", both=True)
 
         return instances
+
+
+def check_configuration(cache_filename, cache_collection):
+    if all([cache_filename, cache_collection]):
+        raise ValueError(
+            "Configuration error. "
+            "Both AUTOEXTRACT_CACHE_FILENAME and AUTOEXTRACT_CACHE_COLLECTION defined in settings."
+        )
diff --git a/scrapy_autoextract/utils.py b/scrapy_autoextract/utils.py
index aea051b..429cf0c 100644
--- a/scrapy_autoextract/utils.py
+++ b/scrapy_autoextract/utils.py
@@ -28,3 +28,16 @@ def get_scrapy_data_path(createdir=True):
     if createdir:
         os.makedirs(path, exist_ok=True)
     return path
+
+
+def get_collection_name():
+    scrapy_job = os.environ.get('SCRAPY_JOB')
+    if scrapy_job:
+        return f"{scrapy_job.replace('/', '_')}_cache"
+    return 'dev_cache'
+
+
+def get_project_from_job():
+    scrapy_job = os.environ.get('SCRAPY_JOB')
+    if scrapy_job:
+        return scrapy_job.split('/')[0]
diff --git a/setup.py b/setup.py
index f645dc5..6604d70 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@ def get_version():
         'aiohttp',
         'tldextract',
         'sqlitedict>=1.7.0',
+        'scrapinghub',
     ],
     keywords='scrapy autoextract middleware',
     classifiers=[

From e913128c6000547ffaf364ecbdb4e326965dce82 Mon Sep 17 00:00:00 2001
From: asadurski
Date: Thu, 23 Dec 2021 18:00:19 +0100
Subject: [PATCH 2/2] implement suggestions from PR comments

---
 README.rst                      |  1 +
 scrapy_autoextract/cache.py     | 12 ++++++++----
 scrapy_autoextract/providers.py |  5 ++++-
 scrapy_autoextract/utils.py     |  7 +++++--
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/README.rst b/README.rst
index a48d286..883cf6b 100644
--- a/README.rst
+++ b/README.rst
@@ -251,6 +251,7 @@ Provider settings
   Using collections is mutually exclusive with the ``AUTOEXTRACT_CACHE_FILENAME`` setting.
   If the spider is run locally, the project number should be set in the ``DEV_PROJECT`` setting.
   The default collection name is ``dev_cache``.
+  The collection name can be customised with the ``AUTOEXTRACT_CACHE_COLLECTION_NAME`` setting.
 
 Limitations
 ===========
diff --git a/scrapy_autoextract/cache.py b/scrapy_autoextract/cache.py
index 60ec579..88d1411 100644
--- a/scrapy_autoextract/cache.py
+++ b/scrapy_autoextract/cache.py
@@ -93,12 +93,16 @@ def close(self):
 
 class ScrapyCloudCollectionCache(_Cache):
     def __init__(self, project, collection):
-        sc = ScrapinghubClient()
-        self.collection = sc.get_project(project).collections.get_store(collection)
+        self.sc = ScrapinghubClient()
+        self.collection = self.sc.get_project(project).collections.get_store(collection)
 
     @classmethod
     def fingerprint(cls, request: Request) -> str:
-        return request.url
+        return json.dumps(
+            request.as_dict(),
+            ensure_ascii=False,
+            sort_keys=True
+        )
 
     def __getitem__(self, fingerprint: str):
         try:
@@ -113,4 +117,4 @@ def __setitem__(self, fingerprint: str, value) -> None:
         )
 
     def close(self):
-        pass
+        self.sc.close()
diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py
index f711692..0f6a4f4 100644
--- a/scrapy_autoextract/providers.py
+++ b/scrapy_autoextract/providers.py
@@ -103,7 +103,10 @@ def __init__(self, crawler: Crawler):
             self.cache = AutoExtractCache(cache_filename, compressed=compressed)
         elif cache_collection:
             project = get_project_from_job() or self.settings.get('DEV_PROJECT')
-            self.cache = ScrapyCloudCollectionCache(project, get_collection_name())
+            self.cache = ScrapyCloudCollectionCache(
+                project,
+                get_collection_name(self)
+            )
         else:
             self.cache = DummyCache()
 
diff --git a/scrapy_autoextract/utils.py b/scrapy_autoextract/utils.py
index 429cf0c..ac7e136 100644
--- a/scrapy_autoextract/utils.py
+++ b/scrapy_autoextract/utils.py
@@ -30,9 +30,12 @@ def get_scrapy_data_path(createdir=True):
     return path
 
 
-def get_collection_name():
+def get_collection_name(provider):
+    from_settings = provider.settings.get('AUTOEXTRACT_CACHE_COLLECTION_NAME')
     scrapy_job = os.environ.get('SCRAPY_JOB')
-    if scrapy_job:
+    if from_settings:
+        return from_settings
+    elif scrapy_job:
         return f"{scrapy_job.replace('/', '_')}_cache"
     return 'dev_cache'
 
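
A minimal sketch (not part of either patch) of how the collection-backed cache added above could be exercised on its own, outside the provider. The project id ``12345``, the example URL, and the stored payload are made up; it also assumes that ``ScrapinghubClient()`` can pick up a Scrapy Cloud API key from the environment (e.g. ``SH_APIKEY``) and that ``autoextract.request.Request`` accepts ``url`` and ``pageType`` keyword arguments::

    from autoextract.request import Request
    from scrapy_autoextract.cache import ScrapyCloudCollectionCache

    # Hypothetical project id and collection name; in the provider these come
    # from SCRAPY_JOB / DEV_PROJECT and the settings described in the README.
    cache = ScrapyCloudCollectionCache(project=12345, collection='dev_cache')

    request = Request(url='https://example.com/article', pageType='article')
    key = ScrapyCloudCollectionCache.fingerprint(request)  # JSON of request.as_dict()

    try:
        item = cache[key]                     # raises KeyError if the key is not stored yet
    except KeyError:
        cache[key] = {'article': {'url': request.url}}  # store a fake AutoExtract response
        item = cache[key]

    cache.close()                             # closes the underlying ScrapinghubClient

In the provider itself the stored value is the AutoExtract response for the fingerprinted query; the sketch only shows the storage round trip.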