add ScrapyCloudCollectionCache #30

Open · wants to merge 3 commits into master
Changes from 1 commit
10 changes: 8 additions & 2 deletions README.rst
@@ -118,7 +118,7 @@ For example::
The examples above extract an article from the page, but you may want to
extract a different type of item, like a product or a job posting. It is
as easy as using the correct type annotation in the callback. This
is how the callback looks like if we need to extract a real state
is how the callback looks like if we need to extract real estate data
from the page::

def parse(self,
@@ -245,6 +245,12 @@ Provider settings
- ``AUTOEXTRACT_CACHE_GZIP`` [optional] when True (default), cached AutoExtract
responses are compressed using gzip. Set this option to False to turn
compression off.
- ``AUTOEXTRACT_CACHE_COLLECTION`` [optional] when True, AutoExtract responses
are stored in a Scrapy Cloud collection named after the job ID,
e.g. ``111_222_333_cache`` for job ``111/222/333``.
Using collections is mutually exclusive with the ``AUTOEXTRACT_CACHE_FILENAME`` setting.
If the spider is run locally, the project number should be set in the ``DEV_PROJECT`` setting;
the collection name then defaults to ``dev_cache``.
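
For example, a minimal configuration for a local run could look like the following sketch (``12345`` is a placeholder project number, not a value taken from this change)::

    # settings.py
    AUTOEXTRACT_CACHE_COLLECTION = True
    DEV_PROJECT = 12345  # only needed when running outside Scrapy Cloud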

Limitations
===========
@@ -284,4 +290,4 @@ When using the AutoExtract providers, be aware that:
.. _`Scrapy's asyncio documentation`: https://docs.scrapy.org/en/latest/topics/asyncio.html
.. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level
.. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level
.. _`supported page types`: https://autoextract-poet.readthedocs.io/en/stable/_autosummary/autoextract_poet.pages.html#module-autoextract_poet.pages
.. _`supported page types`: https://autoextract-poet.readthedocs.io/en/stable/_autosummary/autoextract_poet.pages.html#module-autoextract_poet.pages
28 changes: 27 additions & 1 deletion scrapy_autoextract/cache.py
@@ -1,11 +1,12 @@
import abc
import json
import gzip
import json
import pickle
import sqlite3

import sqlitedict
from autoextract.request import Request
from scrapinghub import NotFound, ScrapinghubClient


class _Cache(abc.ABC):
@@ -88,3 +89,28 @@ def __setitem__(self, fingerprint: str, value) -> None:

def close(self):
self.db.close()


class ScrapyCloudCollectionCache(_Cache):
def __init__(self, project, collection):
sc = ScrapinghubClient()
self.collection = sc.get_project(project).collections.get_store(collection)

@classmethod
def fingerprint(cls, request: Request) -> str:
return request.url
Member:

Hey! What's the reason just a URL is used as a fingerprint?

Member Author:

Agreed, that probably wasn't optimal. I introduced the same fingerprint mechanism as in AutoExtractCache.
It was like this for simplicity: the cached items are later retrieved in the QA tool, MATT, and compared with job items. Now I just have to modify the retrieval in MATT to read the item contents instead of the collection key.
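
A content-based fingerprint along those lines might look like this sketch (an illustration of the idea, not the exact code that was introduced; it assumes ``Request.as_dict()`` serialization):

    import hashlib
    import json

    from autoextract.request import Request

    def fingerprint(request: Request) -> str:
        # Hash the whole query payload, so requests for the same URL but a
        # different page type or extra parameters get distinct cache keys.
        payload = json.dumps(request.as_dict(), sort_keys=True)
        return hashlib.sha256(payload.encode('utf-8')).hexdigest()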


def __getitem__(self, fingerprint: str):
try:
return self.collection.get(fingerprint)
except NotFound:
raise KeyError

def __setitem__(self, fingerprint: str, value) -> None:
self.collection.set(
{'_key': fingerprint,
'value': value}
)

def close(self):
pass
Contributor:

Shouldn't we be closing the sc client here?

Member Author:

That's right. Thanks!
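
One possible shape for the agreed change is to hold on to the client and shut it down in ``close()``; a sketch, not the actual follow-up commit:

    # Within scrapy_autoextract/cache.py, where _Cache is already defined.
    from scrapinghub import ScrapinghubClient

    class ScrapyCloudCollectionCache(_Cache):
        def __init__(self, project, collection):
            # Keep a reference to the client so it can be shut down later.
            self.sc = ScrapinghubClient()
            self.collection = self.sc.get_project(project).collections.get_store(collection)

        def close(self):
            # Release the underlying HTTP session held by the client.
            self.sc.close()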

38 changes: 26 additions & 12 deletions scrapy_autoextract/providers.py
@@ -2,27 +2,28 @@
import logging
import os
from asyncio import CancelledError
from typing import Callable, Set, ClassVar, List, Any, Hashable
from typing import Any, Callable, ClassVar, Hashable, List, Set

import aiohttp
from scrapy import Request as ScrapyRequest, signals
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from autoextract.aio import request_raw, create_session
from autoextract.aio.errors import RequestError, \
ACCOUNT_DISABLED_ERROR_TYPE
from autoextract.aio import create_session, request_raw
from autoextract.aio.errors import ACCOUNT_DISABLED_ERROR_TYPE, RequestError
from autoextract.aio.retry import RetryFactory
from autoextract.request import Request as AutoExtractRequest
from autoextract.stats import AggStats
from autoextract_poet.page_inputs import (
AutoExtractProductData, AutoExtractData, AutoExtractHtml,
)
from autoextract_poet.page_inputs import (AutoExtractData, AutoExtractHtml,
AutoExtractProductData)
from scrapy import Request as ScrapyRequest
from scrapy import signals
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy_poet.page_input_providers import PageObjectInputProvider

from .cache import AutoExtractCache, DummyCache, ScrapyCloudCollectionCache
from .errors import QueryError, summarize_exception
from .slot_semaphore import SlotsSemaphore
from .task_manager import TaskManager
from .utils import get_domain, get_scrapy_data_path
from .cache import AutoExtractCache, DummyCache
from .utils import (get_collection_name, get_domain, get_project_from_job,
get_scrapy_data_path)

logger = logging.getLogger(__name__)

@@ -93,11 +94,16 @@ def __init__(self, crawler: Crawler):
self.per_domain_semaphore = SlotsSemaphore(per_domain_concurrency)

cache_filename = self.settings.get('AUTOEXTRACT_CACHE_FILENAME')
cache_collection = self.settings.get('AUTOEXTRACT_CACHE_COLLECTION')
check_configuration(cache_filename, cache_collection)
if cache_filename:
cache_filename = os.path.join(get_scrapy_data_path(createdir=True),
cache_filename)
compressed = self.settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
self.cache = AutoExtractCache(cache_filename, compressed=compressed)
elif cache_collection:
project = get_project_from_job() or self.settings.get('DEV_PROJECT')
self.cache = ScrapyCloudCollectionCache(project, get_collection_name())
else:
self.cache = DummyCache()

@@ -263,3 +269,11 @@ def inc_stats(suffix, value=1, both=False):
inc_stats("/pages/success", both=True)

return instances


def check_configuration(cache_filename, cache_collection):
if all([cache_filename, cache_collection]):
raise ValueError(
"Configuration error. "
"Both AUTOEXTRACT_CACHE_FILENAME and AUTOEXTRACT_CACHE_COLLECTION defined in settings."
)
13 changes: 13 additions & 0 deletions scrapy_autoextract/utils.py
@@ -28,3 +28,16 @@ def get_scrapy_data_path(createdir=True):
if createdir:
os.makedirs(path, exist_ok=True)
return path


def get_collection_name():
scrapy_job = os.environ.get('SCRAPY_JOB')
if scrapy_job:
return f"{scrapy_job.replace('/', '_')}_cache"
return 'dev_cache'


def get_project_from_job():
scrapy_job = os.environ.get('SCRAPY_JOB')
if scrapy_job:
return scrapy_job.split('/')[0]
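
As a quick check of the naming scheme described in the README, these helpers behave roughly like this (an illustrative sketch using a made-up job ID):

    import os

    from scrapy_autoextract.utils import get_collection_name, get_project_from_job

    # On Scrapy Cloud, SCRAPY_JOB is set to "project/spider/job".
    os.environ['SCRAPY_JOB'] = '111/222/333'
    assert get_collection_name() == '111_222_333_cache'
    assert get_project_from_job() == '111'

    # Locally there is no SCRAPY_JOB, so the defaults apply.
    del os.environ['SCRAPY_JOB']
    assert get_collection_name() == 'dev_cache'
    assert get_project_from_job() is None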
1 change: 1 addition & 0 deletions setup.py
@@ -33,6 +33,7 @@ def get_version():
'aiohttp',
'tldextract',
'sqlitedict>=1.7.0',
'scrapinghub',
],
keywords='scrapy autoextract middleware',
classifiers=[