
AutoExtractProvider now supports the new scrapy-poet cache interface #31

Open · wants to merge 7 commits into master · showing changes from 3 commits
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,14 @@
Changes
=======

TBR
------------------

* The AutoExtract provider now supports the new providers API for caching
* The preferred pageType for HTML requests (``AutoExtractProductData``)
is now always chosen when it is listed as a dependency, instead of just
using the first dependency's ``pageType`` to request the HTML

0.7.0 (2021-08-05)
------------------

99 changes: 72 additions & 27 deletions scrapy_autoextract/providers.py
@@ -1,11 +1,13 @@
import inspect
import json
import logging
import os
from asyncio import CancelledError
from typing import Callable, Set, ClassVar, List, Any, Hashable
from typing import Callable, Set, ClassVar, List, Any, Hashable, Sequence, Type, TypeVar

import aiohttp
from scrapy import Request as ScrapyRequest, signals
import attr
from scrapy import Request as ScrapyRequest, signals, Request
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from autoextract.aio import request_raw, create_session
@@ -30,6 +32,9 @@
_TASK_MANAGER = "_autoextract_task_manager"


AEDataType = TypeVar('AEDataType', bound=AutoExtractData, covariant=True)
Contributor:
Could you elaborate a little bit, please, on why you created a covariant TypeVar here instead of just using AutoExtractData in the typing? Is it because it could be product data, article data, and so on, so it won't be invariant?

BurnzZ (Contributor), Dec 22, 2021:
Yep, that's precisely right. Using covariant=True here would cover any subtypes derived from AutoExtractData.
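
As an aside (not part of the diff), a minimal sketch of the idea, using stand-in classes rather than the real autoextract-poet types:

from typing import Type, TypeVar

class AutoExtractData:                          # stand-in for the real base class
    def __init__(self, data: dict):
        self.data = data

class AutoExtractProductData(AutoExtractData):  # stand-in subtype
    page_type = "product"

# bound=AutoExtractData restricts the TypeVar to page-input classes, and
# covariant=True lets it range over any subtype, so a type checker knows that
# passing AutoExtractProductData yields AutoExtractProductData, not the base.
AEDataType = TypeVar("AEDataType", bound=AutoExtractData, covariant=True)

def build(provided_cls: Type[AEDataType], payload: dict) -> AEDataType:
    return provided_cls(data=payload)

product = build(AutoExtractProductData, {"product": {"url": "http://example.com"}})
assert isinstance(product, AutoExtractProductData)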



def get_autoextract_task_manager(crawler: Crawler) -> TaskManager:
"""
Return the configured :class:`scrapy_autoextract.TaskManager` that controls
@@ -66,11 +71,20 @@ def get_concurrent_requests_per_domain(settings: Settings):
return concurrency


@attr.define()
class AERequestSpec:
query: List[AutoExtractRequest]
should_request_html: bool
is_extraction_required: bool
provided_cls: Callable


class AutoExtractProvider(PageObjectInputProvider):
"""Provider for AutoExtract data"""
# pageType requested when only html is required
page_type_class_for_html: ClassVar[AutoExtractData] = AutoExtractProductData
html_query_attribute: ClassVar[str] = "fullHtml"
name = "autoextract"

@classmethod
def provided_classes(cls, type_: Callable) -> bool:
@@ -172,7 +186,7 @@ def get_per_domain_concurrency_slot(self, request: ScrapyRequest) -> Hashable:

def get_filled_request(self,
request: ScrapyRequest,
provided_cls: AutoExtractData,
provided_cls: Type[AEDataType],
should_request_html: bool) -> AutoExtractRequest:
"""Return a filled request for AutoExtract"""
ae_request = AutoExtractRequest(
@@ -183,51 +197,68 @@ def inc_stats(suffix, value=1, both=False):
ae_request.extra = {self.html_query_attribute: True}
return ae_request

async def __call__(self,
to_provide: Set[Callable],
request: ScrapyRequest
) -> List:
"""Make an AutoExtract request and build a Page Input of provided class
based on API response data.
"""
if not self.aiohttp_session:
self.aiohttp_session = await self.create_aiohttp_session()
def list_required_requests(self, to_provide: Set[Callable], request: ScrapyRequest):
is_html_required = AutoExtractHtml in to_provide
to_provide -= {AutoExtractHtml}
is_extraction_required = bool(to_provide)
if is_html_required and not is_extraction_required:
# At least one request is required to get html
to_provide = {self.page_type_class_for_html}

instances = []
if not to_provide:
return []
# Use the recommended type for html request, otherwise use the first one
class_for_html = (self.page_type_class_for_html
if self.page_type_class_for_html in to_provide
else next(iter(to_provide)))
specs = []
for idx, provided_cls in enumerate(to_provide):
if not issubclass(provided_cls, AutoExtractData):
raise RuntimeError(
f"Unexpected {provided_cls} requested. Probably a bug in the provider "
"or in scrapy-poet itself")

# html is requested only a single time to save resources
should_request_html = is_html_required and provided_cls is class_for_html
ae_request = self.get_filled_request(
request,
provided_cls,
should_request_html
)
specs.append(AERequestSpec(
[ae_request],
should_request_html,
is_extraction_required,
provided_cls
))
return specs

async def __call__(self,
to_provide: Set[Callable],
request: ScrapyRequest
) -> List:
"""Make an AutoExtract request and build a Page Input of provided class
based on API response data.
"""
if not self.aiohttp_session:
self.aiohttp_session = await self.create_aiohttp_session()

instances = []
for spec in self.list_required_requests(to_provide, request):
request_stats = AggStats()
is_first_request = idx == 0
page_type = provided_cls.page_type
page_type = spec.provided_cls.page_type

def inc_stats(suffix, value=1, both=False):
self.stats.inc_value(f"autoextract/total{suffix}", value)
if both:
self.stats.inc_value(f"autoextract/{page_type}{suffix}", value)

try:
# html is requested only a single time to save resources
should_request_html = is_html_required and is_first_request
slot = self.get_per_domain_concurrency_slot(request)
ae_request = self.get_filled_request(
request,
provided_cls,
should_request_html
)
# When the same-named arguments are given both here and in `__init__`,
# this dict-unpacking form does not raise (a name=value call would),
# so the predefined `__init__` arguments override the ones passed in the call
awaitable = self.do_request_cached(**{
'query': [ae_request],
'query': spec.query,
'agg_stats': request_stats,
**self.common_request_kwargs,
})
@@ -251,15 +282,29 @@ def inc_stats(suffix, value=1, both=False):
inc_stats("/attempts/count", request_stats.n_attempts)
inc_stats("/attempts/billable", request_stats.n_billable_query_responses)

if should_request_html:
if spec.should_request_html:
instances.append(AutoExtractHtml(url=data[page_type]['url'],
html=data['html']))
inc_stats("/pages/html", both=True)

if is_extraction_required:
if spec.is_extraction_required:
without_html = {k: v for k, v in data.items() if k != "html"}
instances.append(provided_cls(data=without_html))
instances.append(spec.provided_cls(data=without_html))

inc_stats("/pages/success", both=True)

return instances

def fingerprint(self, to_provide: Set[Callable], request: Request) -> str:
queries = [spec.query[0].as_dict() for spec in self.list_required_requests(to_provide, request)]
# key the queries by pageType so the JSON dump below is deterministic
by_page_type = {query['pageType']: query for query in queries}
for query in by_page_type.values():
query.pop("pageType")
return json.dumps(by_page_type, sort_keys=True)

def serialize(self, result: Sequence[Any]) -> Any:
return result

def deserialize(self, data: Any) -> Sequence[Any]:
return data
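
For orientation (not part of the diff), a rough sketch of the caching contract these three hooks satisfy. The SimpleCache wrapper below is hypothetical and only illustrates how a scrapy-poet-style cache could combine them: fingerprint() builds the cache key, while serialize()/deserialize() convert the provider result to and from its stored form.

# Hypothetical cache wrapper; scrapy-poet's actual cache interface may differ.
class SimpleCache:
    def __init__(self):
        self._store = {}

    async def provide(self, provider, to_provide, request):
        key = provider.fingerprint(set(to_provide), request)  # stable cache key
        if key in self._store:
            return provider.deserialize(self._store[key])     # cache hit
        result = await provider(set(to_provide), request)     # cache miss: call the provider
        self._store[key] = provider.serialize(result)
        return result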
2 changes: 1 addition & 1 deletion setup.py
@@ -29,7 +29,7 @@ def get_version():
install_requires=[
'autoextract-poet>=0.3.0',
'zyte-autoextract>=0.7.0',
'scrapy-poet>=0.2.0',
'scrapy-poet @ git+https://[email protected]/scrapinghub/scrapy-poet@injector_record_replay_native#egg=scrapy-poet',
BurnzZ (Contributor), Dec 22, 2021:
Reminder to remove this after a new release of scrapy-poet to PyPI is done.

'aiohttp',
'tldextract',
'sqlitedict>=1.7.0',
12 changes: 12 additions & 0 deletions tests/test_providers.py
@@ -285,3 +285,15 @@ async def cancel_after(sleep):

finally:
signal.signal(SIGINT, old_handler)


def test_provider_cache():
injector = get_injector_for_testing({})
request = get_response_for_testing(None).request
provider = AutoExtractProvider(injector.crawler)
fingerprint = provider.fingerprint(
{AutoExtractHtml, AutoExtractProductData, AutoExtractArticleData},
request)
assert fingerprint == """{"article": {"articleBodyRaw": false, "url": "http://example.com"}, "product": {"articleBodyRaw": false, "fullHtml": true, "url": "http://example.com"}}"""
assert provider.serialize("foo") == "foo"
assert provider.deserialize("bar") == "bar"
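
For reference (not part of the diff), a standalone sketch of the key construction the test above asserts; cache_key is an illustrative helper mirroring AutoExtractProvider.fingerprint:

import json

def cache_key(queries):
    # Mirror fingerprint(): key each query by pageType, drop pageType from the
    # payload, and dump with sort_keys=True, so the key is deterministic and
    # independent of the order in which the queries were built.
    by_page_type = {q["pageType"]: {k: v for k, v in q.items() if k != "pageType"}
                    for q in queries}
    return json.dumps(by_page_type, sort_keys=True)

product = {"pageType": "product", "articleBodyRaw": False, "fullHtml": True, "url": "http://example.com"}
article = {"pageType": "article", "articleBodyRaw": False, "url": "http://example.com"}

assert cache_key([product, article]) == cache_key([article, product])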