From 81805e4fbb20a92a995da74713e76338a84bfa66 Mon Sep 17 00:00:00 2001
From: asadurski
Date: Mon, 29 Nov 2021 10:28:37 +0100
Subject: [PATCH 1/2] add ScrapyCloudCollectionCache

---
 README.rst                      | 10 +++++++--
 scrapy_autoextract/cache.py     | 28 +++++++++++++++++++++++-
 scrapy_autoextract/providers.py | 38 ++++++++++++++++++++++-----------
 scrapy_autoextract/utils.py     | 13 +++++++++++
 setup.py                        |  1 +
 5 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/README.rst b/README.rst
index 9f91262..a48d286 100644
--- a/README.rst
+++ b/README.rst
@@ -118,7 +118,7 @@ For example::
 The examples above extract an article from the page, but you may want to
 extract a different type of item, like a product or a job posting. It is
 as easy as using the correct type annotation in the callback. This
-is how the callback looks like if we need to extract a real state
+is how the callback looks if we need to extract real estate
 data from the page::
 
     def parse(self,
@@ -245,6 +245,12 @@ Provider settings
 - ``AUTOEXTRACT_CACHE_GZIP`` [optional] when True (default), cached AutoExtract
   responses are compressed using gzip. Set this option to False to turn
   compression off.
+- ``AUTOEXTRACT_CACHE_COLLECTION`` [optional] when True, AutoExtract responses
+  are stored in a Scrapy Cloud collection named after the job id,
+  e.g. ``111_222_333_cache`` for job ``111/222/333``.
+  Using collections is mutually exclusive with the ``AUTOEXTRACT_CACHE_FILENAME`` setting.
+  If the spider is run locally, the project number should be set in the ``DEV_PROJECT`` setting.
+  The default collection name is ``dev_cache``.
 
 Limitations
 ===========
@@ -284,4 +290,4 @@ When using the AutoExtract providers, be aware that:
 .. _`Scrapy's asyncio documentation`: https://docs.scrapy.org/en/latest/topics/asyncio.html
 .. _`Request-level error`: https://doc.scrapinghub.com/autoextract.html#request-level
 .. _`Query-level error`: https://doc.scrapinghub.com/autoextract.html#query-level
-.. _`supported page types`: https://autoextract-poet.readthedocs.io/en/stable/_autosummary/autoextract_poet.pages.html#module-autoextract_poet.pages
\ No newline at end of file
+.. _`supported page types`: https://autoextract-poet.readthedocs.io/en/stable/_autosummary/autoextract_poet.pages.html#module-autoextract_poet.pages
diff --git a/scrapy_autoextract/cache.py b/scrapy_autoextract/cache.py
index 3f2cd4d..60ec579 100644
--- a/scrapy_autoextract/cache.py
+++ b/scrapy_autoextract/cache.py
@@ -1,11 +1,12 @@
 import abc
-import json
 import gzip
+import json
 import pickle
 import sqlite3
 
 import sqlitedict
 from autoextract.request import Request
+from scrapinghub import NotFound, ScrapinghubClient
 
 
 class _Cache(abc.ABC):
@@ -88,3 +89,28 @@ def __setitem__(self, fingerprint: str, value) -> None:
 
     def close(self):
         self.db.close()
+
+
+class ScrapyCloudCollectionCache(_Cache):
+    def __init__(self, project, collection):
+        sc = ScrapinghubClient()
+        self.collection = sc.get_project(project).collections.get_store(collection)
+
+    @classmethod
+    def fingerprint(cls, request: Request) -> str:
+        return request.url
+
+    def __getitem__(self, fingerprint: str):
+        try:
+            return self.collection.get(fingerprint)
+        except NotFound:
+            raise KeyError
+
+    def __setitem__(self, fingerprint: str, value) -> None:
+        self.collection.set(
+            {'_key': fingerprint,
+             'value': value}
+        )
+
+    def close(self):
+        pass
diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py
index e4f2716..f711692 100644
--- a/scrapy_autoextract/providers.py
+++ b/scrapy_autoextract/providers.py
@@ -2,27 +2,28 @@
 import logging
 import os
 from asyncio import CancelledError
-from typing import Callable, Set, ClassVar, List, Any, Hashable
+from typing import Any, Callable, ClassVar, Hashable, List, Set
 
 import aiohttp
-from scrapy import Request as ScrapyRequest, signals
-from scrapy.crawler import Crawler
-from scrapy.settings import Settings
-from autoextract.aio import request_raw, create_session
-from autoextract.aio.errors import RequestError, \
-    ACCOUNT_DISABLED_ERROR_TYPE
+from autoextract.aio import create_session, request_raw
+from autoextract.aio.errors import ACCOUNT_DISABLED_ERROR_TYPE, RequestError
 from autoextract.aio.retry import RetryFactory
 from autoextract.request import Request as AutoExtractRequest
 from autoextract.stats import AggStats
-from autoextract_poet.page_inputs import (
-    AutoExtractProductData, AutoExtractData, AutoExtractHtml,
-)
+from autoextract_poet.page_inputs import (AutoExtractData, AutoExtractHtml,
+                                          AutoExtractProductData)
+from scrapy import Request as ScrapyRequest
+from scrapy import signals
+from scrapy.crawler import Crawler
+from scrapy.settings import Settings
 from scrapy_poet.page_input_providers import PageObjectInputProvider
+
+from .cache import AutoExtractCache, DummyCache, ScrapyCloudCollectionCache
 from .errors import QueryError, summarize_exception
 from .slot_semaphore import SlotsSemaphore
 from .task_manager import TaskManager
-from .utils import get_domain, get_scrapy_data_path
-from .cache import AutoExtractCache, DummyCache
+from .utils import (get_collection_name, get_domain, get_project_from_job,
+                    get_scrapy_data_path)
 
 
 logger = logging.getLogger(__name__)
@@ -93,11 +94,16 @@ def __init__(self, crawler: Crawler):
         self.per_domain_semaphore = SlotsSemaphore(per_domain_concurrency)
 
         cache_filename = self.settings.get('AUTOEXTRACT_CACHE_FILENAME')
+        cache_collection = self.settings.get('AUTOEXTRACT_CACHE_COLLECTION')
+        check_configuration(cache_filename, cache_collection)
         if cache_filename:
            cache_filename = os.path.join(get_scrapy_data_path(createdir=True),
                                           cache_filename)
             compressed = self.settings.getbool('AUTOEXTRACT_CACHE_GZIP', True)
             self.cache = AutoExtractCache(cache_filename, compressed=compressed)
+        elif cache_collection:
+            project = get_project_from_job() or self.settings.get('DEV_PROJECT')
+            self.cache = ScrapyCloudCollectionCache(project, get_collection_name())
         else:
             self.cache = DummyCache()
 
@@ -263,3 +269,11 @@ def inc_stats(suffix, value=1, both=False):
                     inc_stats("/pages/success", both=True)
 
         return instances
+
+
+def check_configuration(cache_filename, cache_collection):
+    if all([cache_filename, cache_collection]):
+        raise ValueError(
+            "Configuration error. "
+            "Both AUTOEXTRACT_CACHE_FILENAME and AUTOEXTRACT_CACHE_COLLECTION defined in settings."
+        )
diff --git a/scrapy_autoextract/utils.py b/scrapy_autoextract/utils.py
index aea051b..429cf0c 100644
--- a/scrapy_autoextract/utils.py
+++ b/scrapy_autoextract/utils.py
@@ -28,3 +28,16 @@ def get_scrapy_data_path(createdir=True):
     if createdir:
         os.makedirs(path, exist_ok=True)
     return path
+
+
+def get_collection_name():
+    scrapy_job = os.environ.get('SCRAPY_JOB')
+    if scrapy_job:
+        return f"{scrapy_job.replace('/', '_')}_cache"
+    return 'dev_cache'
+
+
+def get_project_from_job():
+    scrapy_job = os.environ.get('SCRAPY_JOB')
+    if scrapy_job:
+        return scrapy_job.split('/')[0]
diff --git a/setup.py b/setup.py
index f645dc5..6604d70 100644
--- a/setup.py
+++ b/setup.py
@@ -33,6 +33,7 @@ def get_version():
         'aiohttp',
         'tldextract',
         'sqlitedict>=1.7.0',
+        'scrapinghub',
     ],
     keywords='scrapy autoextract middleware',
     classifiers=[

From e913128c6000547ffaf364ecbdb4e326965dce82 Mon Sep 17 00:00:00 2001
From: asadurski
Date: Thu, 23 Dec 2021 18:00:19 +0100
Subject: [PATCH 2/2] implement suggestions from PR comments

---
 README.rst                      |  1 +
 scrapy_autoextract/cache.py     | 12 ++++++++----
 scrapy_autoextract/providers.py |  5 ++++-
 scrapy_autoextract/utils.py     |  7 +++++--
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/README.rst b/README.rst
index a48d286..883cf6b 100644
--- a/README.rst
+++ b/README.rst
@@ -251,6 +251,7 @@ Provider settings
   Using collections is mutually exclusive with the ``AUTOEXTRACT_CACHE_FILENAME`` setting.
   If the spider is run locally, the project number should be set in the ``DEV_PROJECT`` setting.
   The default collection name is ``dev_cache``.
+  The collection name can be customised with the ``AUTOEXTRACT_CACHE_COLLECTION_NAME`` setting.
 
 Limitations
 ===========
diff --git a/scrapy_autoextract/cache.py b/scrapy_autoextract/cache.py
index 60ec579..88d1411 100644
--- a/scrapy_autoextract/cache.py
+++ b/scrapy_autoextract/cache.py
@@ -93,12 +93,16 @@ def close(self):
 
 class ScrapyCloudCollectionCache(_Cache):
     def __init__(self, project, collection):
-        sc = ScrapinghubClient()
-        self.collection = sc.get_project(project).collections.get_store(collection)
+        self.sc = ScrapinghubClient()
+        self.collection = self.sc.get_project(project).collections.get_store(collection)
 
     @classmethod
     def fingerprint(cls, request: Request) -> str:
-        return request.url
+        return json.dumps(
+            request.as_dict(),
+            ensure_ascii=False,
+            sort_keys=True
+        )
 
     def __getitem__(self, fingerprint: str):
         try:
@@ -113,4 +117,4 @@ def __setitem__(self, fingerprint: str, value) -> None:
         )
 
     def close(self):
-        pass
+        self.sc.close()
diff --git a/scrapy_autoextract/providers.py b/scrapy_autoextract/providers.py
index f711692..0f6a4f4 100644
--- a/scrapy_autoextract/providers.py
+++ b/scrapy_autoextract/providers.py
@@ -103,7 +103,10 @@ def __init__(self, crawler: Crawler):
             self.cache = AutoExtractCache(cache_filename, compressed=compressed)
         elif cache_collection:
             project = get_project_from_job() or self.settings.get('DEV_PROJECT')
-            self.cache = ScrapyCloudCollectionCache(project, get_collection_name())
+            self.cache = ScrapyCloudCollectionCache(
+                project,
+                get_collection_name(self)
+            )
         else:
             self.cache = DummyCache()
 
diff --git a/scrapy_autoextract/utils.py b/scrapy_autoextract/utils.py
index 429cf0c..ac7e136 100644
--- a/scrapy_autoextract/utils.py
+++ b/scrapy_autoextract/utils.py
@@ -30,9 +30,12 @@ def get_scrapy_data_path(createdir=True):
     return path
 
 
-def get_collection_name():
+def get_collection_name(provider):
+    from_settings = provider.settings.get('AUTOEXTRACT_CACHE_COLLECTION_NAME')
     scrapy_job = os.environ.get('SCRAPY_JOB')
-    if scrapy_job:
+    if from_settings:
+        return from_settings
+    elif scrapy_job:
         return f"{scrapy_job.replace('/', '_')}_cache"
     return 'dev_cache'
 
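
A minimal sketch (not part of either patch) of how the collection-backed cache added above could be exercised on its own, outside the provider. The project id ``12345``, the example URL, and the stored payload are made up; it also assumes that ``ScrapinghubClient()`` can pick up a Scrapy Cloud API key from the environment (e.g. ``SH_APIKEY``) and that ``autoextract.request.Request`` accepts ``url`` and ``pageType`` keyword arguments::

    from autoextract.request import Request
    from scrapy_autoextract.cache import ScrapyCloudCollectionCache

    # Hypothetical project id and collection name; in the provider these come
    # from SCRAPY_JOB / DEV_PROJECT and the settings described in the README.
    cache = ScrapyCloudCollectionCache(project=12345, collection='dev_cache')

    request = Request(url='https://example.com/article', pageType='article')
    key = ScrapyCloudCollectionCache.fingerprint(request)  # JSON of request.as_dict()

    try:
        item = cache[key]                     # raises KeyError if the key is not stored yet
    except KeyError:
        cache[key] = {'article': {'url': request.url}}  # store a fake AutoExtract response
        item = cache[key]

    cache.close()                             # closes the underlying ScrapinghubClient

In the provider itself the stored value is the AutoExtract response for the fingerprinted query; the sketch only shows the storage round trip.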