diff --git a/README.rst b/README.rst
index cf905bc..3785126 100644
--- a/README.rst
+++ b/README.rst
@@ -118,7 +118,7 @@ For example::
 The examples above extract an article from the page, but you may want
 to extract a different type of item, like a product or a job posting.
 It is as easy as using the correct type annotation in the callback. This
-is how the callback looks like if we need to extract a real state
+is what the callback looks like if we need to extract real estate
 data from the page::
 
     def parse(self,
@@ -245,6 +245,13 @@ Provider settings
 - ``AUTOEXTRACT_CACHE_GZIP`` [optional] when True (default), cached AutoExtract
   responses are compressed using gzip. Set this option to False to turn
   compression off.
+- ``AUTOEXTRACT_CACHE_COLLECTION`` [optional] when True, AutoExtract responses
+  are stored in a Scrapy Cloud collection named after the job ID,
+  e.g. ``111_222_333_cache`` for job ``111/222/333``.
+  Using collections is mutually exclusive with the ``AUTOEXTRACT_CACHE_FILENAME`` setting.
+  When the spider is run locally, the project number should be set in the
+  ``DEV_PROJECT`` setting, and the collection name defaults to ``dev_cache``.
+  The collection name can be customised with the ``AUTOEXTRACT_CACHE_COLLECTION_NAME`` setting.
 
 Limitations
 ===========
diff --git a/scrapy_autoextract/cache.py b/scrapy_autoextract/cache.py
index 3f2cd4d..88d1411 100644
--- a/scrapy_autoextract/cache.py
+++ b/scrapy_autoextract/cache.py
@@ -1,11 +1,12 @@
 import abc
-import json
 import gzip
+import json
 import pickle
 import sqlite3
 
 import sqlitedict
 from autoextract.request import Request
+from scrapinghub import NotFound, ScrapinghubClient
 
 
 class _Cache(abc.ABC):
@@ -88,3 +89,32 @@ def __setitem__(self, fingerprint: str, value) -> None:
 
     def close(self):
         self.db.close()
+
+
+class ScrapyCloudCollectionCache(_Cache):
+    def __init__(self, project, collection):
+        self.sc = ScrapinghubClient()
+        self.collection = self.sc.get_project(project).collections.get_store(collection)
+
+    @classmethod
+    def fingerprint(cls, request: Request) -> str:
+        return json.dumps(
+            request.as_dict(),
+            ensure_ascii=False,
+            sort_keys=True
+        )
+
+    def __getitem__(self, fingerprint: str):
+        try:
+            return self.collection.get(fingerprint)
+        except NotFound:
+            raise KeyError
+
+    def __setitem__(self, fingerprint: str, value) -> None:
+        self.collection.set(
+            {'_key': fingerprint,
+             'value': value}
+        )
+
+    def close(self):
+        self.sc.close()
diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py
index e4f2716..0f6a4f4 100644
--- a/scrapy_autoextract/providers.py
+++ b/scrapy_autoextract/providers.py
@@ -2,27 +2,28 @@
 import logging
 import os
 from asyncio import CancelledError
-from typing import Callable, Set, ClassVar, List, Any, Hashable
+from typing import Any, Callable, ClassVar, Hashable, List, Set
 
 import aiohttp
-from scrapy import Request as ScrapyRequest, signals
-from scrapy.crawler import Crawler
-from scrapy.settings import Settings
-from autoextract.aio import request_raw, create_session
-from autoextract.aio.errors import RequestError, \
-    ACCOUNT_DISABLED_ERROR_TYPE
+from autoextract.aio import create_session, request_raw
+from autoextract.aio.errors import ACCOUNT_DISABLED_ERROR_TYPE, RequestError
 from autoextract.aio.retry import RetryFactory
 from autoextract.request import Request as AutoExtractRequest
 from autoextract.stats import AggStats
-from autoextract_poet.page_inputs import (
-    AutoExtractProductData, AutoExtractData, AutoExtractHtml,
-)
+from autoextract_poet.page_inputs import (AutoExtractData, AutoExtractHtml,
+                                          AutoExtractProductData)
+from scrapy import Request as ScrapyRequest
+from scrapy import signals
+from scrapy.crawler import Crawler
+from scrapy.settings import Settings
 from scrapy_poet.page_input_providers import PageObjectInputProvider
+
+from .cache import AutoExtractCache, DummyCache, ScrapyCloudCollectionCache
 from .errors import QueryError, summarize_exception
 from .slot_semaphore import SlotsSemaphore
 from .task_manager import TaskManager
-from .utils import get_domain, get_scrapy_data_path
-from .cache import AutoExtractCache, DummyCache
+from .utils import (get_collection_name, get_domain, get_project_from_job,
+                    get_scrapy_data_path)
 
 logger = logging.getLogger(__name__)
 
@@ -93,11 +94,19 @@ def __init__(self, crawler: Crawler):
         self.per_domain_semaphore = SlotsSemaphore(per_domain_concurrency)
 
         cache_filename = self.settings.get('AUTOEXTRACT_CACHE_FILENAME')
+        cache_collection = self.settings.get('AUTOEXTRACT_CACHE_COLLECTION')
+        check_configuration(cache_filename, cache_collection)
         if cache_filename:
             cache_filename = os.path.join(get_scrapy_data_path(createdir=True),
                                           cache_filename)
             compressed = self.settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
             self.cache = AutoExtractCache(cache_filename, compressed=compressed)
+        elif cache_collection:
+            project = get_project_from_job() or self.settings.get('DEV_PROJECT')
+            self.cache = ScrapyCloudCollectionCache(
+                project,
+                get_collection_name(self)
+            )
         else:
             self.cache = DummyCache()
 
@@ -263,3 +272,11 @@ def inc_stats(suffix, value=1, both=False):
             inc_stats("/pages/success", both=True)
 
         return instances
+
+
+def check_configuration(cache_filename, cache_collection):
+    if all([cache_filename, cache_collection]):
+        raise ValueError(
+            "Configuration error. "
+            "Both AUTOEXTRACT_CACHE_FILENAME and AUTOEXTRACT_CACHE_COLLECTION defined in settings."
+        )
diff --git a/scrapy_autoextract/utils.py b/scrapy_autoextract/utils.py
index aea051b..ac7e136 100644
--- a/scrapy_autoextract/utils.py
+++ b/scrapy_autoextract/utils.py
@@ -28,3 +28,19 @@ def get_scrapy_data_path(createdir=True):
     if createdir:
         os.makedirs(path, exist_ok=True)
     return path
+
+
+def get_collection_name(provider):
+    from_settings = provider.settings.get('AUTOEXTRACT_CACHE_COLLECTION_NAME')
+    scrapy_job = os.environ.get('SCRAPY_JOB')
+    if from_settings:
+        return from_settings
+    elif scrapy_job:
+        return f"{scrapy_job.replace('/', '_')}_cache"
+    return 'dev_cache'
+
+
+def get_project_from_job():
+    scrapy_job = os.environ.get('SCRAPY_JOB')
+    if scrapy_job:
+        return scrapy_job.split('/')[0]
diff --git a/setup.py b/setup.py
index d440309..5912df7 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@ def get_version():
         'aiohttp',
         'tldextract',
         'sqlitedict>=1.7.0',
+        'scrapinghub',
     ],
     keywords='scrapy autoextract middleware',
     classifiers=[
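
Usage sketch: a minimal ``settings.py`` fragment showing how the collection-backed cache
introduced above could be enabled. The project number ``123456`` and the collection name
``my_cache`` are placeholder values; since ``ScrapinghubClient()`` is created without
arguments by ``ScrapyCloudCollectionCache``, API credentials are assumed to come from the
environment (for example ``SH_APIKEY``)::

    # settings.py (sketch): cache AutoExtract responses in a Scrapy Cloud collection
    AUTOEXTRACT_CACHE_COLLECTION = True

    # Optional: override the job-based collection name ("<job>_cache", or "dev_cache" locally)
    AUTOEXTRACT_CACHE_COLLECTION_NAME = "my_cache"

    # Only needed for local runs, where SCRAPY_JOB is not set (placeholder project number)
    DEV_PROJECT = "123456"

    # Must not be set together with AUTOEXTRACT_CACHE_COLLECTION;
    # check_configuration() raises ValueError if both are defined.
    # AUTOEXTRACT_CACHE_FILENAME = "autoextract_cache.sqlite"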