AutoExtractProvider now supports the new scrapy-poet cache interface #31

Open · wants to merge 7 commits into master
4 changes: 4 additions & 0 deletions CHANGES.rst
@@ -4,6 +4,10 @@ Changes
TBR
------------------

* The AutoExtract provider now supports the new scrapy-poet providers API for caching
* The preferred pageType for HTML requests (``AutoExtractProductData``)
  is now always chosen when it is listed as a dependency, instead of simply
  using the first dependency's ``pageType`` for the HTML request
* removed support for Python 3.6
* added support for Python 3.10

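The first changelog entry above (caching through the new scrapy-poet providers API) is driven entirely by project settings. A minimal sketch of how a Scrapy project might enable it follows; the setting names are assumptions based on the scrapy-poet and scrapy-autoextract documentation, not part of this PR, so verify them against the versions you install:

# settings.py -- illustrative sketch only, not taken from this PR
DOWNLOADER_MIDDLEWARES = {
    "scrapy_poet.InjectionMiddleware": 543,
}
SCRAPY_POET_PROVIDERS = {
    # registers the provider changed in this PR (assumed setting name)
    "scrapy_autoextract.AutoExtractProvider": 500,
}
SCRAPY_POET_CACHE = "/tmp/scrapy-poet-cache"  # assumed: filesystem path for cached provider data
AUTOEXTRACT_USER = "<your Zyte AutoExtract API key>"  # assumed setting for the API key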
104 changes: 76 additions & 28 deletions scrapy_autoextract/providers.py
@@ -1,10 +1,12 @@
import inspect
import json
import logging
import os
from asyncio import CancelledError
from typing import Callable, Set, ClassVar, List, Any, Hashable
from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar

import aiohttp
import attr
from scrapy import Request as ScrapyRequest, signals
from scrapy.crawler import Crawler
from scrapy.settings import Settings
@@ -17,7 +19,10 @@
from autoextract_poet.page_inputs import (
AutoExtractProductData, AutoExtractData, AutoExtractHtml,
)
from scrapy_poet.page_input_providers import PageObjectInputProvider
from scrapy_poet.page_input_providers import (
PageObjectInputProvider,
CacheDataProviderMixin,
)
from .errors import QueryError, summarize_exception
from .slot_semaphore import SlotsSemaphore
from .task_manager import TaskManager
@@ -30,6 +35,9 @@
_TASK_MANAGER = "_autoextract_task_manager"


AEDataType_co = TypeVar('AEDataType_co', bound=AutoExtractData, covariant=True)


def get_autoextract_task_manager(crawler: Crawler) -> TaskManager:
"""
Return the configured :class:`scrapy_autoextract.TaskManager` that controls
@@ -66,11 +74,20 @@ def get_concurrent_requests_per_domain(settings: Settings):
return concurrency


class AutoExtractProvider(PageObjectInputProvider):
@attr.define
class AERequestSpec:
query: List[AutoExtractRequest]
should_request_html: bool
is_extraction_required: bool
provided_cls: Callable


class AutoExtractProvider(PageObjectInputProvider, CacheDataProviderMixin):
"""Provider for AutoExtract data"""
# pageType requested when only html is required
page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData
html_query_attribute: ClassVar[str] = "fullHtml"
name = "autoextract"

@classmethod
def provided_classes(cls, type_: Callable) -> bool:
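The ``CacheDataProviderMixin`` added to the class definition above is what tells scrapy-poet that this provider's results can be cached. Roughly, the contract the provider has to fulfil looks like the sketch below; the method names mirror the ones implemented later in this diff, while the exact base-class signatures are an assumption and should be checked against the scrapy-poet version pinned in setup.py:

# Sketch of the caching contract this provider implements; not the real mixin.
from typing import Any, Callable, Sequence, Set
from scrapy import Request

class CacheableProviderSketch:
    has_cache_support = True  # advertised to scrapy-poet, asserted in the new test below

    def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
        """Stable cache key for a (page-object classes, request) pair."""
        ...

    def serialize(self, result: Sequence[Any]) -> Any:
        """Turn provider results into something the cache backend can store."""
        ...

    def deserialize(self, data: Any) -> Sequence[Any]:
        """Rebuild provider results from previously cached data."""
        ...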
Expand Down Expand Up @@ -172,7 +189,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable:

def get_filled_request(self,
request: ScrapyRequest,
provided_cls: AutoExtractData,
provided_cls: Type[AEDataType_co],
should_request_html: bool) -> AutoExtractRequest:
"""Return a filled request for AutoExtract"""
ae_request = AutoExtractRequest(
@@ -183,51 +200,68 @@ def get_filled_request(self,
ae_request.extra = {self.html_query_attribute: True}
return ae_request

async def __call__(self,
to_provide: Set[Callable],
request: ScrapyRequest
) -> List:
"""Make an AutoExtract request and build a Page Input of provided class
based on API response data.
"""
if not self.aiohttp_session:
self.aiohttp_session = await self.create_aiohttp_session()
def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest) -> List[AERequestSpec]:
is_html_required = AutoExtractHtml in to_provide
to_provide -= {AutoExtractHtml}
is_extraction_required = bool(to_provide)
if is_html_required and not is_extraction_required:
# At least one request is required to get html
to_provide = {self.page_type_class_for_html}

instances = []
if not to_provide:
return []
# Use the recommended type for the HTML request; otherwise fall back to the first one
class_for_html = (self.page_type_class_for_html
if self.page_type_class_for_html in to_provide
else next(iter(to_provide)))
specs = []
for idx, provided_cls in enumerate(to_provide):
if not issubclass(provided_cls, AutoExtractData):
raise RuntimeError(
f"Unexpected {provided_cls} requested. Probably a bug in the provider "
"or in scrapy-poet itself")

# html is requested only a single time to save resources
should_request_html = is_html_required and provided_cls is class_for_html
ae_request = self.get_filled_request(
request,
provided_cls,
should_request_html
)
specs.append(AERequestSpec(
[ae_request],
should_request_html,
is_extraction_required,
provided_cls
))
return specs

async def __call__(self,
to_provide: Set[Callable],
request: ScrapyRequest
) -> List:
"""Make an AutoExtract request and build a Page Input of provided class
based on API response data.
"""
if not self.aiohttp_session:
self.aiohttp_session = await self.create_aiohttp_session()

instances = []
for spec in self.list_required_requests(to_provide, request):
request_stats = AggStats()
is_first_request = idx == 0
page_type = provided_cls.page_type
page_type = spec.provided_cls.page_type

def inc_stats(suffix, value=1, both=False):
self.stats.inc_value(f"autoextract/total{suffix}", value)
if both:
self.stats.inc_value(f"autoextract/{page_type}{suffix}", value)

try:
# html is requested only a single time to save resources
should_request_html = is_html_required and is_first_request
slot = self.get_per_domain_concurrency_slot(request)
ae_request = self.get_filled_request(
request,
provided_cls,
should_request_html
)
# Merging the keyword arguments via dict unpacking means that a key present
# both here and in `self.common_request_kwargs` does not raise a
# "got multiple values for keyword argument" error (as name=value syntax would);
# instead, the predefined `__init__` arguments override the per-call ones
awaitable = self.do_request_cached(**{
'query': [ae_request],
'query': spec.query,
'agg_stats': request_stats,
**self.common_request_kwargs,
})
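The dict-unpacking behaviour described in the comment above can be shown in isolation. A small, self-contained sketch; the function and variable names are illustrative only, not taken from zyte-autoextract:

def do_request(**kwargs):
    return kwargs

per_call = {"query": ["per-call query"], "agg_stats": "per-call stats"}
predefined = {"query": ["predefined query"], "api_key": "secret"}

# do_request(query=..., **predefined) would raise
# "TypeError: do_request() got multiple values for keyword argument 'query'";
# merging the dicts first resolves the collision silently, and the values
# from the dict unpacked last (the predefined ones) win.
merged = do_request(**{**per_call, **predefined})
assert merged["query"] == ["predefined query"]
assert merged["api_key"] == "secret"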
@@ -251,15 +285,29 @@ def inc_stats(suffix, value=1, both=False):
inc_stats("/attempts/count", request_stats.n_attempts)
inc_stats("/attempts/billable", request_stats.n_billable_query_responses)

if should_request_html:
if spec.should_request_html:
instances.append(AutoExtractHtml(url=data[page_type]['url'],
html=data['html']))
inc_stats("/pages/html", both=True)

if is_extraction_required:
if spec.is_extraction_required:
without_html = {k: v for k, v in data.items() if k != "html"}
instances.append(provided_cls(data=without_html))
instances.append(spec.provided_cls(data=without_html))

inc_stats("/pages/success", both=True)

return instances

def fingerprint(self, to_provide: Set[Callable], request: ScrapyRequest) -> str:
queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)]
# key each query by its pageType so sort_keys can order them deterministically
by_page_type = {query['pageType']: query for query in queries}
for query in by_page_type.values():
query.pop("pageType")
return json.dumps(by_page_type, sort_keys=True)

def serialize(self, result: Sequence[Any]) -> Any:
return result

def deserialize(self, data: Any) -> Sequence[Any]:
return data
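Because ``fingerprint`` keys the queries by ``pageType`` and serializes them with ``sort_keys=True``, the cache key does not depend on the iteration order of ``to_provide``. A short sketch of that property, using the same illustrative values as the new test below:

import json

queries_a = {
    "article": {"articleBodyRaw": False, "url": "http://example.com"},
    "product": {"articleBodyRaw": False, "fullHtml": True, "url": "http://example.com"},
}
# The same queries, inserted in the opposite order.
queries_b = dict(reversed(list(queries_a.items())))

# json.dumps(..., sort_keys=True) normalizes the ordering, so both dumps match.
assert json.dumps(queries_a, sort_keys=True) == json.dumps(queries_b, sort_keys=True)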
2 changes: 1 addition & 1 deletion setup.py
@@ -29,7 +29,7 @@ def get_version():
install_requires=[
'autoextract-poet>=0.3.0',
'zyte-autoextract>=0.7.0',
'scrapy-poet>=0.2.0',
'scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@master#egg=scrapy-poet',
'aiohttp',
'tldextract',
'sqlitedict>=1.7.0',
13 changes: 13 additions & 0 deletions tests/test_providers.py
@@ -285,3 +285,16 @@ async def cancel_after(sleep):

finally:
signal.signal(SIGINT, old_handler)


def test_provider_cache():
injector = get_injector_for_testing({})
request = get_response_for_testing(None).request
provider = AutoExtractProvider(injector.crawler)
fingerprint = provider.fingerprint(
{AutoExtractHtml, AutoExtractProductData, AutoExtractArticleData},
request)
assert fingerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}"""
assert provider.serialize("foo") == "foo"
assert provider.deserialize("bar") == "bar"
assert provider.has_cache_support