diff --git a/.flake8 b/.flake8 index 939f4afa..09d66a0b 100644 --- a/.flake8 +++ b/.flake8 @@ -44,8 +44,7 @@ per-file-ignores = web_poet/serialization/__init__.py:F401,F403 web_poet/testing/__init__.py:F401,F403 web_poet/testing/pytest.py:D102 - tests/po_lib_to_return/__init__.py:D102 - tests/test_testing.py:D102 + tests/*:D102 # the suggestion makes the code worse tests/test_serialization.py:B028 diff --git a/docs/frameworks/additional-requests.rst b/docs/frameworks/additional-requests.rst index ebb77a57..99930633 100644 --- a/docs/frameworks/additional-requests.rst +++ b/docs/frameworks/additional-requests.rst @@ -39,6 +39,7 @@ This can be set using: import attrs import web_poet + from web_poet import validates_input async def request_implementation(req: web_poet.HttpRequest) -> web_poet.HttpResponse: ... @@ -52,6 +53,7 @@ This can be set using: class SomePage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): ... @@ -89,6 +91,7 @@ when creating an :class:`~.HttpClient` instance: import attrs import web_poet + from web_poet import validates_input async def request_implementation(req: web_poet.HttpRequest) -> web_poet.HttpResponse: ... @@ -101,6 +104,7 @@ when creating an :class:`~.HttpClient` instance: class SomePage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): ... @@ -161,18 +165,20 @@ like the ones above, then it would cause the code to look like: .. code-block:: python - import attrs - import web_poet + import urllib import aiohttp + import attrs import requests - import urllib + import web_poet + from web_poet import validates_input @attrs.define class SomePage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): try: response = await self.http.get("...") @@ -196,12 +202,14 @@ This makes the code simpler: import attrs import web_poet + from web_poet import validates_input @attrs.define class SomePage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): try: response = await self.http.get("...") diff --git a/docs/index.rst b/docs/index.rst index 535b9565..ed6a1ab9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,7 +25,7 @@ web-poet page-objects/additional-requests page-objects/fields page-objects/rules - page-objects/retries + page-objects/input-validation page-objects/page-params page-objects/testing diff --git a/docs/page-objects/additional-requests.rst b/docs/page-objects/additional-requests.rst index 0057e3ca..d800c184 100644 --- a/docs/page-objects/additional-requests.rst +++ b/docs/page-objects/additional-requests.rst @@ -289,12 +289,14 @@ Executing a HttpRequest instance import attrs import web_poet + from web_poet import validates_input @attrs.define class ProductPage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): item = { "url": self.url, @@ -345,12 +347,14 @@ method on it. import attrs import web_poet + from web_poet import validates_input @attrs.define class ProductPage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): item = { "url": self.url, @@ -390,12 +394,14 @@ Thus, additional requests inside the Page Object are typically needed for it: import attrs import web_poet + from web_poet import validates_input @attrs.define class ProductPage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): item = { "url": self.url, @@ -479,6 +485,7 @@ list of :class:`~.HttpRequest` to be executed in batch using the import attrs import web_poet + from web_poet import validates_input @attrs.define @@ -487,6 +494,7 @@ list of :class:`~.HttpRequest` to be executed in batch using the default_pagination_limit = 10 + @validates_input async def to_item(self): item = { "url": self.url, @@ -578,6 +586,7 @@ from the previous subsection named: :ref:`httpclient-get-example`. import attrs import web_poet + from web_poet import validates_input logger = logging.getLogger(__name__) @@ -586,6 +595,7 @@ from the previous subsection named: :ref:`httpclient-get-example`. class ProductPage(web_poet.WebPage): http: web_poet.HttpClient + @validates_input async def to_item(self): item = { "url": self.url, @@ -658,6 +668,7 @@ For this example, let's improve the code snippet from the previous subsection na import attrs import web_poet + from web_poet import validates_input @attrs.define @@ -666,6 +677,7 @@ For this example, let's improve the code snippet from the previous subsection na default_pagination_limit = 10 + @validates_input async def to_item(self): item = { "url": self.url, @@ -773,3 +785,54 @@ Here's an example: From the example above, we're now checking the list of responses to see if any exceptions are included in it. If so, we're simply logging it down and ignoring it. In this way, perfectly good responses can still be processed through. + + +.. _retries-additional-requests: + +Retrying Additional Requests +============================ + +When the bad response data comes from :ref:`additional requests +`, you must handle retries on your own. + +The page object code is responsible for retrying additional requests until good +response data is received, or until some maximum number of retries is exceeded. + +It is up to you to decide what the maximum number of retries should be for a +given additional request, based on your experience with the target website. + +It is also up to you to decide how to implement retries of additional requests. + +One option would be tenacity_. For example, to try an additional request 3 +times before giving up: + +.. _tenacity: https://tenacity.readthedocs.io/en/latest/index.html + +.. code-block:: python + + import attrs + from tenacity import retry, stop_after_attempt + from web_poet import HttpClient, WebPage, validates_input + + @attrs.define + class MyPage(WebPage): + http: HttpClient + + @retry(stop=stop_after_attempt(3)) + async def get_data(self): + response = await self.http.get("https://toscrape.com/") + if not response.css(".expected"): + raise ValueError + return response.css(".data").get() + + @validates_input + async def to_item(self) -> dict: + try: + data = await self.get_data() + except ValueError: + return {} + return {"data": data} + +If the reason your additional request fails is outdated or missing data from +page object input, do not try to reproduce the request for that input as an +additional request. :ref:`Request fresh input instead `. \ No newline at end of file diff --git a/docs/page-objects/fields.rst b/docs/page-objects/fields.rst index 80dbe870..74dcb9bb 100644 --- a/docs/page-objects/fields.rst +++ b/docs/page-objects/fields.rst @@ -206,7 +206,7 @@ attrs instances) instead of unstructured dicts to hold the data: .. code-block:: python import attrs - from web_poet import ItemPage, HttpResponse + from web_poet import ItemPage, HttpResponse, validates_input @attrs.define class Product: @@ -217,6 +217,7 @@ attrs instances) instead of unstructured dicts to hold the data: @attrs.define class ProductPage(ItemPage): # ... + @validates_input def to_item(self) -> Product: return Product( name=self.name, @@ -394,6 +395,8 @@ To recap: to contain more ``@fields`` than defined in the item class, e.g. because Page Object is inherited from some other base Page Object. +.. _field-caching: + Caching ------- @@ -404,12 +407,13 @@ attributes from this response: .. code-block:: python - from web_poet import ItemPage, HttpResponse, HttpClient + from web_poet import ItemPage, HttpResponse, HttpClient, validates_input class MyPage(ItemPage): response: HttpResponse http: HttpClient + @validates_input async def to_item(self): api_url = self.response.css("...").get() api_response = await self.http.get(api_url).json() @@ -541,3 +545,27 @@ returns a dictionary, where keys are field names, and values are print(field_names) # dict_keys(['my_field']) print(my_field_meta) # {'expensive': True} + + +Input validation +---------------- + +:ref:`Input validation `, if used, happens before field +evaluation, and it may override the values of fields, preventing field +evaluation from ever happening. For example: + +.. code-block:: python + + class Page(ItemPage[Item]): + def validate_input(self): + return Item(foo="bar") + + @field + def foo(self): + raise RuntimeError("This exception is never raised") + + assert Page().foo == "bar" + +Field evaluation may still happen for a field if the field is used in the +implementation of the ``validate_input`` method. Note, however, that only +synchronous fields can be used from the ``validate_input`` method. diff --git a/docs/page-objects/input-validation.rst b/docs/page-objects/input-validation.rst new file mode 100644 index 00000000..dba39651 --- /dev/null +++ b/docs/page-objects/input-validation.rst @@ -0,0 +1,118 @@ +.. _input-validation: + +================ +Input validation +================ + +Sometimes the data that your page object receives as input may be invalid. + +You can define a ``validate_input`` method in a page object class to check its +input data and determine how to handle invalid input. + +``validate_input`` is called on the first execution of ``ItemPage.to_item()`` +or the first access to a :ref:`field `. In both cases validation +happens early; in the case of fields, it happens before field evaluation. + +``validate_input`` is a synchronous method that expects no parameters, and its +outcome may be any of the following: + +- Return ``None``, indicating that the input is valid. + +.. _retries-input: + +- Raise :exc:`~web_poet.exceptions.Retry`, indicating that the input + looks like the result of a temporary issue, and that trying to fetch + similar input again may result in valid input. + + See also :ref:`retries-additional-requests`. + +- Raise :exc:`~web_poet.exceptions.UseFallback`, indicating that the + page object does not support the input, and that an alternative parsing + implementation should be tried instead. + + For example, imagine you have a page object for website commerce.example, + and that commerce.example is built with a popular e-commerce web framework. + You could have a generic page object for products of websites using that + framework, ``FrameworkProductPage``, and a more specific page object for + commerce.example, ``EcommerceExampleProductPage``. If + ``EcommerceExampleProductPage`` cannot parse a product page, but it looks + like it might be a valid product page, you would raise + :exc:`~web_poet.exceptions.UseFallback` to try to parse the same product + page with ``FrameworkProductPage``, in case it works. + + .. note:: web-poet does not dictate how to define or use an alternative + parsing implementation as fallback. It is up to web-poet + frameworks to choose how they implement fallback handling. + +- Return an item to override the output of the ``to_item`` method and of + fields. + + For input not matching the expected type of data, returning an item that + indicates so is recommended. + + For example, if your page object parses an e-commerce product, and the + input data corresponds to a list of products rather than a single product, + you could return a product item that somehow indicates that it is not a + valid product item, such as ``Product(is_valid=False)``. + +For example: + +.. code-block:: python + + def validate_input(self): + if self.css('.product-id::text') is not None: + return + if self.css('.http-503-error'): + raise Retry() + if self.css('.product'): + raise UseFallback() + if self.css('.product-list'): + return Product(is_valid=False) + +You may use fields in your implementation of the ``validate_input`` method, but +only synchronous fields are supported. For example: + +.. code-block:: python + + class Page(WebPage[Item]): + def validate_input(self): + if not self.name: + raise UseFallback() + + @field(cached=True) + def name(self): + return self.css(".product-name ::text") + +.. tip:: :ref:`Cache fields ` used in the ``validate_input`` + method, so that when they are used from ``to_item`` they are not + evaluated again. + +If you implement a custom ``to_item`` method, as long as you are inheriting +from :class:`~web_poet.pages.ItemPage`, you can enable input validation +decorating your custom ``to_item`` method with +:func:`~web_poet.util.validates_input`: + +.. code-block:: python + + from web_poet import validates_input + + class Page(ItemPage[Item]): + @validates_input + async def to_item(self): + ... + +:exc:`~web_poet.exceptions.Retry` and :exc:`~web_poet.exceptions.UseFallback` +may also be raised from the ``to_item`` method. This could come in handy, for +example, if after you execute some asynchronous code, such as an +:ref:`additional request `, you find out that you need to +retry the original request or use a fallback. + + +Input Validation Exceptions +=========================== + +.. autoexception:: web_poet.exceptions.PageObjectAction + +.. autoexception:: web_poet.exceptions.Retry + +.. autoexception:: web_poet.exceptions.UseFallback diff --git a/docs/page-objects/page-params.rst b/docs/page-objects/page-params.rst index c8d9a923..16279ac7 100644 --- a/docs/page-objects/page-params.rst +++ b/docs/page-objects/page-params.rst @@ -48,6 +48,7 @@ Controlling item values import attrs import web_poet + from web_poet import validates_input @attrs.define @@ -56,6 +57,7 @@ Controlling item values default_tax_rate = 0.10 + @validates_input def to_item(self): item = { "url": self.url, @@ -91,6 +93,7 @@ Let's try an example wherein :class:`~.PageParams` is able to control how import attrs import web_poet + from web_poet import validates_input @attrs.define @@ -100,6 +103,7 @@ Let's try an example wherein :class:`~.PageParams` is able to control how default_max_pages = 5 + @validates_input async def to_item(self): return {"product_urls": await self.get_product_urls()} diff --git a/docs/page-objects/retries.rst b/docs/page-objects/retries.rst deleted file mode 100644 index cd9453d8..00000000 --- a/docs/page-objects/retries.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. _retries: - -======= -Retries -======= - -The responses of some websites can be unreliable. For example, sometimes -a request can get a response that may only include a part of the data to be -extracted, no data at all, or even data unrelated to your request, but sending -a follow-up, identical request can get you the expected data. - -Pages objects are responsible for handling these scenarios, where issues with -response data can only be detected during extraction. - -.. _retries-input: - -Retrying Page Object Input -========================== - -When the bad response data comes from the inputs that your web-poet framework -supplies to your page object, your page object must raise -:exc:`~web_poet.exceptions.core.Retry`: - -.. code-block:: python - - from web_poet import WebPage - from web_poet.exceptions import Retry - - class MyPage(WebPage): - - def to_item(self) -> dict: - if not self.css(".expected"): - raise Retry - return {} - -As a result, your web-poet framework will retry the source requests and create -a new instance of your page object with the new inputs. - - -.. _retries-additional-requests: - -Retrying Additional Requests -============================ - -When the bad response data comes from :ref:`additional requests -`, you must handle retries on your own. - -The page object code is responsible for retrying additional requests until good -response data is received, or until some maximum number of retries is exceeded. - -It is up to you to decide what the maximum number of retries should be for a -given additional request, based on your experience with the target website. - -It is also up to you to decide how to implement retries of additional requests. - -One option would be tenacity_. For example, to try an additional request 3 -times before giving up: - -.. _tenacity: https://tenacity.readthedocs.io/en/latest/index.html - -.. code-block:: python - - import attrs - from tenacity import retry, stop_after_attempt - from web_poet import HttpClient, HttpRequest, WebPage - - @attrs.define - class MyPage(WebPage): - http: HttpClient - - @retry(stop=stop_after_attempt(3)) - async def get_data(self): - request = HttpRequest("https://toscrape.com/") - response = await self.http.execute(request) - if not response.css(".expected"): - raise ValueError - return response.css(".data").get() - - async def to_item(self) -> dict: - try: - data = await self.get_data() - except ValueError: - return {} - return {"data": data} - -If the reason your additional request fails is outdated or missing data from -page object input, do not try to reproduce the request for that input as an -additional request. :ref:`Request fresh input instead `. diff --git a/docs/page-objects/rules.rst b/docs/page-objects/rules.rst index 6fcb4fcd..63f6a277 100644 --- a/docs/page-objects/rules.rst +++ b/docs/page-objects/rules.rst @@ -178,10 +178,11 @@ Let's take a look at how the following code is structured: .. code-block:: python - from web_poet import handle_urls, WebPage + from web_poet import handle_urls, WebPage, validates_input class GenericProductPage(WebPage): + @validates_input def to_item(self) -> Product: return Product(product_title=self.css("title::text").get()) @@ -241,10 +242,11 @@ the following: .. code-block:: python - from web_poet import handle_urls, WebPage + from web_poet import handle_urls, WebPage, validates_input class GenericProductPage(WebPage[Product]): + @validates_input def to_item(self) -> Product: return Product(product_title=self.css("title::text").get()) @@ -312,10 +314,11 @@ either contexts of Page Objects and item classes. .. code-block:: python - from web_poet import handle_urls, WebPage + from web_poet import handle_urls, WebPage, validates_input class GenericProductPage(WebPage[Product]): + @validates_input def to_item(self) -> Product: return Product(product_title=self.css("title::text").get()) @@ -577,7 +580,7 @@ have the first approach as an example: .. code-block:: python - from web_poet import default_registry, consume_modules, handle_urls + from web_poet import default_registry, consume_modules, handle_urls, validates_input import ecommerce_page_objects, gadget_sites_page_objects consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") @@ -592,6 +595,7 @@ have the first approach as an example: @handle_urls("site_1.example", instead_of=ecommerce_page_objects.EcomGenericPage, priority=1000) class ImprovedEcomSite1(ecommerce_page_objects.site_1.EcomSite1): + @validates_input def to_item(self): ... # call super().to_item() and improve on the item's shortcomings @@ -692,11 +696,12 @@ Here's an example: .. code-block:: python - from web_poet import default_registry, consume_modules, handle_urls + from web_poet import default_registry, consume_modules, handle_urls, validates_input import ecommerce_page_objects, gadget_sites_page_objects, common_items @handle_urls("site_2.example", instead_of=common_items.ProductGenericPage, priority=1000) class EcomSite2Copy(ecommerce_page_objects.site_1.EcomSite1): + @validates_input def to_item(self): return super().to_item() diff --git a/docs/page-objects/testing.rst b/docs/page-objects/testing.rst index 36cb9a92..f0eee037 100644 --- a/docs/page-objects/testing.rst +++ b/docs/page-objects/testing.rst @@ -208,9 +208,10 @@ and such fields will containt wrong data if these timezones don't match. Consider an example item:: import datetime - from web_poet import WebPage + from web_poet import WebPage, validates_input class DateItemPage(WebPage): + @validates_input async def to_item(self) -> dict: # e.g. 2001-01-01 11:00:00 +00 now = datetime.datetime.now(datetime.timezone.utc) diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index 935e974e..2efaf893 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -1,7 +1,7 @@ import attrs from url_matcher import Patterns -from web_poet import Injectable, ItemPage, Returns, field, handle_urls, item_from_fields +from web_poet import Injectable, ItemPage, Returns, field, handle_urls @attrs.define @@ -184,13 +184,13 @@ class CustomProductPageDataTypeOnly(Injectable): expected_to_return = Product expected_meta = {} - @field + @property def name(self) -> str: return "name" - @field + @property def price(self) -> float: return 12.99 async def to_item(self) -> Product: - return await item_from_fields(self, item_cls=Product) + return Product(name=self.name, price=self.price) diff --git a/tests/test_fields.py b/tests/test_fields.py index 6f932828..8c4e867d 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -20,7 +20,6 @@ ) from web_poet import ( HttpResponse, - Injectable, ItemPage, field, item_from_fields, @@ -40,11 +39,11 @@ class Page(ItemPage[Item]): response: HttpResponse @field - def name(self): # noqa: D102 + def name(self): return self.response.css("title ::text").get() @field - async def price(self): # noqa: D102 + async def price(self): await asyncio.sleep(0.01) return "$123" @@ -54,11 +53,11 @@ class InvalidPage(ItemPage[Item]): response: HttpResponse @field - def name(self): # noqa: D102 + def name(self): return self.response.css("title ::text").get() @field - def unknown_attribute(self): # noqa: D102 + def unknown_attribute(self): return "foo" @@ -94,10 +93,10 @@ def test_item_from_fields_sync() -> None: @attrs.define class Page(ItemPage): @field - def name(self): # noqa: D102 + def name(self): return "name" - def to_item(self): # noqa: D102 + def to_item(self): return item_from_fields_sync(self, dict) page = Page() @@ -112,10 +111,10 @@ class Page(ItemPage): # https://github.com/python/mypy/issues/1362#issuecomment-438246775 @field # type: ignore @property - def name(self): # noqa: D102 + def name(self): return "name" - def to_item(self): # noqa: D102 + def to_item(self): return item_from_fields_sync(self, dict) @@ -126,10 +125,10 @@ def test_field_classmethod() -> None: class Page(ItemPage): @field @classmethod - def name(cls): # noqa: D102 + def name(cls): return "name" - def to_item(self): # noqa: D102 + def to_item(self): return item_from_fields_sync(self, dict) @@ -159,7 +158,7 @@ def to_item(self): def test_field_cache_sync() -> None: - class Page: + class Page(ItemPage): _n_called_1 = 0 _n_called_2 = 0 @@ -187,7 +186,7 @@ def n_called_2(self): @pytest.mark.asyncio async def test_field_cache_async() -> None: - class Page: + class Page(ItemPage): _n_called_1 = 0 _n_called_2 = 0 @@ -215,7 +214,7 @@ async def n_called_2(self): @pytest.mark.asyncio async def test_field_cache_async_locked() -> None: - class Page: + class Page(ItemPage): _n_called = 0 @field(cached=True) @@ -257,18 +256,18 @@ async def to_item(self) -> Item: def test_skip_nonitem_fields() -> None: @attrs.define - class SyncPage(Injectable): + class SyncPage(ItemPage): response: HttpResponse @field - def name(self): # noqa: D102 + def name(self): return self.response.css("title ::text").get() @field - def price(self): # noqa: D102 + def price(self): return "$123" - def to_item(self): # noqa: D102 + def to_item(self) -> Item: # type: ignore[override] return item_from_fields_sync(self, Item) class ExtendedPage(SyncPage): @@ -281,7 +280,7 @@ def new_attribute(self): page.to_item() class ExtendedPage2(ExtendedPage): - def to_item(self) -> Item: + def to_item(self) -> Item: # type: ignore[override] return item_from_fields_sync(self, Item, skip_nonitem_fields=True) page = ExtendedPage2(response=EXAMPLE_RESPONSE) @@ -483,7 +482,7 @@ def proc1(s): @attrs.define class Page(ItemPage): @field(out=[str.strip, proc1]) - def name(self): # noqa: D102 + def name(self): return " name\t " page = Page() @@ -498,7 +497,7 @@ def proc1(s): @attrs.define class Page(ItemPage): @field(out=[str.strip, proc1]) - async def name(self): # noqa: D102 + async def name(self): return " name\t " page = Page() diff --git a/tests/test_input_validation.py b/tests/test_input_validation.py new file mode 100644 index 00000000..8d662fb3 --- /dev/null +++ b/tests/test_input_validation.py @@ -0,0 +1,311 @@ +"""Test page object input validation scenarios.""" +from typing import Optional + +import attrs +import pytest + +from web_poet import ItemPage, Returns, field, validates_input +from web_poet.exceptions import Retry, UseFallback + + +@attrs.define +class Item: + a: str + is_valid: bool = True + + +EXPECTED_ITEM = Item(a="a", is_valid=True) + + +class BasePage(ItemPage[Item]): + @field + def a(self): + return "a" + + +# Valid input + + +class BaseValidInputPage(BasePage): + def validate_input(self): + pass + + +def test_valid_input_sync_to_item(): + class Page(BaseValidInputPage): + def to_item(self): + return Item(a=self.a) + + assert Page().to_item() == EXPECTED_ITEM + + +@pytest.mark.asyncio +async def test_valid_input_async_to_item(): + assert await BaseValidInputPage().to_item() == EXPECTED_ITEM + + +def test_valid_input_sync_field(): + assert BaseValidInputPage().a == "a" + + +@pytest.mark.asyncio +async def test_valid_input_async_field(): + class Page(BaseValidInputPage): + @field + async def a(self): + return "a" + + assert await Page().a == "a" + + +# Retry + + +class BaseRetryPage(BasePage): + def validate_input(self): + raise Retry() + + +def test_retry_sync_to_item(): + class Page(BaseRetryPage): + def to_item(self): + return Item(a=self.a) + + page = Page() + with pytest.raises(Retry): + page.to_item() + + +@pytest.mark.asyncio +async def test_retry_async_to_item(): + page = BaseRetryPage() + with pytest.raises(Retry): + await page.to_item() + + +def test_retry_sync_field(): + page = BaseRetryPage() + with pytest.raises(Retry): + page.a + + +@pytest.mark.asyncio +async def test_retry_async_field(): + class Page(BaseRetryPage): + @field + async def a(self): + return "a" + + page = Page() + with pytest.raises(Retry): + await page.a + + +# Use fallback + + +class BaseUseFallbackPage(BasePage): + def validate_input(self): + if self.a is None: + raise UseFallback() + + @field + def a(self): + return None + + +def test_use_fallback_sync_to_item(): + class Page(BaseUseFallbackPage): + def to_item(self): + return Item(a=self.a) + + page = Page() + with pytest.raises(UseFallback): + page.to_item() + + +@pytest.mark.asyncio +async def test_use_fallback_async_to_item(): + page = BaseUseFallbackPage() + with pytest.raises(UseFallback): + await page.to_item() + + +def test_use_fallback_sync_field(): + page = BaseUseFallbackPage() + with pytest.raises(UseFallback): + page.a + + +@pytest.mark.asyncio +async def test_use_fallback_async_field(): + class Page(BaseUseFallbackPage): + def validate_input(self): + # Cannot use async self.a + raise UseFallback() + + @field + async def a(self): + return "a" + + page = Page() + with pytest.raises(UseFallback): + await page.a + + +# Invalid input + + +INVALID_ITEM = Item(a="invalid", is_valid=False) + + +class BaseInvalidInputPage(ItemPage[Item]): + def validate_input(self): + return INVALID_ITEM + + @field + def a(self): + raise RuntimeError("This exception should never be raised") + + +def test_invalid_input_sync_to_item(): + class Page(BaseInvalidInputPage): + @validates_input + def to_item(self): + return Item(a=self.a) + + assert Page().to_item() == INVALID_ITEM + + +@pytest.mark.asyncio +async def test_invalid_input_async_to_item(): + assert await BaseInvalidInputPage().to_item() == INVALID_ITEM + + +def test_invalid_input_sync_field(): + assert BaseInvalidInputPage().a == "invalid" + + +@pytest.mark.asyncio +async def test_invalid_input_async_field(): + class Page(BaseInvalidInputPage): + @field + async def a(self): + raise RuntimeError("This exception should never be raised") + + assert await Page().a == "invalid" + + +# Unvalidated input + + +def test_unvalidated_input_sync_to_item(): + class Page(BasePage): + def to_item(self): + return Item(a=self.a) + + assert Page().to_item() == EXPECTED_ITEM + + +@pytest.mark.asyncio +async def test_unvalidated_input_async_to_item(): + assert await BasePage().to_item() == EXPECTED_ITEM + + +def test_unvalidated_input_sync_field(): + assert BasePage().a == "a" + + +@pytest.mark.asyncio +async def test_unvalidated_input_async_field(): + class Page(BasePage): + @field + async def a(self): + return "a" + + assert await Page().a == "a" + + +# Caching + + +class BaseCachingPage(BasePage): + _raise = False + + def validate_input(self): + if self._raise: + raise UseFallback() + self._raise = True + + +def test_invalid_input_sync_to_item_caching(): + class Page(BaseCachingPage): + def to_item(self): + return Item(a=self.a) + + page = Page() + page.to_item() + page.to_item() + + +@pytest.mark.asyncio +async def test_invalid_input_async_to_item_caching(): + page = BaseCachingPage() + await page.to_item() + await page.to_item() + + +def test_invalid_input_sync_field_caching(): + page = BaseCachingPage() + page.a + page.a + + +@pytest.mark.asyncio +async def test_invalid_input_async_field_caching(): + class Page(BaseCachingPage): + @field + async def a(self): + return "a" + + page = Page() + await page.a + await page.a + + +@pytest.mark.asyncio +async def test_invalid_input_cross_api_caching(): + @attrs.define + class _Item(Item): + b: Optional[str] = None + + class Page(BaseCachingPage, Returns[_Item]): + @field + async def b(self): + return "b" + + page = Page() + page.a + await page.b + await page.to_item() + + +# Recursion + + +@pytest.mark.asyncio +async def test_recursion(): + """Make sure that using fields within the validate_input method does not + result in a recursive call to the validate_input method.""" + + class Page(BasePage): + _raise = False + + def validate_input(self): + if self._raise: + raise UseFallback() + self._raise = True + assert self.a == "a" + + page = Page() + assert page.a == "a" diff --git a/tests/test_testing.py b/tests/test_testing.py index 3dadb29b..c54edbbb 100644 --- a/tests/test_testing.py +++ b/tests/test_testing.py @@ -56,12 +56,12 @@ def _assert_fixture_files( class MyItemPage(WebPage): - async def to_item(self) -> dict: # noqa: D102 + async def to_item(self) -> dict: return {"foo": "bar"} class MyItemPage2(WebPage): - async def to_item(self) -> dict: # noqa: D102 + async def to_item(self) -> dict: return {"foo": None} @@ -200,7 +200,7 @@ def _get_product_item(date: datetime.datetime) -> Product: class DateItemPage(WebPage): - async def to_item(self) -> Item: # noqa: D102 + async def to_item(self) -> Item: date = datetime.datetime.now().astimezone() return _get_product_item(date) @@ -275,7 +275,7 @@ def test_pytest_frozen_time_tz_windows_pass(pytester, book_list_html_response) - class ClientPage(WebPage): client: HttpClient - async def to_item(self) -> dict: # noqa: D102 + async def to_item(self) -> dict: resp1 = await self.client.get("http://books.toscrape.com/1.html") resp2 = await self.client.post("http://books.toscrape.com/2.html", body=b"post") return {"foo": "bar", "additional": [resp1.body.decode(), resp2.body.decode()]} @@ -339,7 +339,7 @@ def test_httpclient_no_response(pytester, book_list_html_response) -> None: class ClientExceptionPage(WebPage): client: HttpClient - async def to_item(self) -> dict: # noqa: D102 + async def to_item(self) -> dict: msg = "" try: await self.client.get("http://books.toscrape.com/1.html") diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 3c8b0112..a30cb186 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -12,7 +12,7 @@ RequestUrl, ResponseUrl, ) -from .pages import Injectable, ItemPage, ItemWebPage, Returns, WebPage +from .pages import Injectable, ItemPage, ItemWebPage, Returns, WebPage, validates_input from .requests import request_downloader_var from .rules import ( ApplyRule, diff --git a/web_poet/exceptions/core.py b/web_poet/exceptions/core.py index 9f3691a6..307cabad 100644 --- a/web_poet/exceptions/core.py +++ b/web_poet/exceptions/core.py @@ -13,7 +13,10 @@ __all__ = [ "RequestDownloaderVarError", + "PageObjectAction", "Retry", + "UseFallback", + "NoSavedHttpResponse", ] @@ -25,17 +28,21 @@ class RequestDownloaderVarError(Exception): to learn more about this. """ - pass +class PageObjectAction(ValueError): + """Base class for exceptions that can be raised from a page object to + indicate something to be done about that page object.""" -class Retry(ValueError): + +class Retry(PageObjectAction): """The page object found that the input data is partial or empty, and a - request retry may provide better input. + request retry may provide better input.""" - See :ref:`retries`. - """ - pass +class UseFallback(PageObjectAction): + """The page object cannot extract data from the input, but the input seems + valid, so an alternative data extraction implementation for the same item + type may succeed.""" class NoSavedHttpResponse(AssertionError): diff --git a/web_poet/fields.py b/web_poet/fields.py index e2d70d4f..12313270 100644 --- a/web_poet/fields.py +++ b/web_poet/fields.py @@ -114,16 +114,20 @@ def _process(self, value, page): def _processed(self, method, page): """Returns a wrapper for method that calls processors on its result""" - if not self.processors: - return method if inspect.iscoroutinefunction(method): async def processed(*args, **kwargs): + validation_item = args[0]._validate_input() + if validation_item is not None: + return getattr(validation_item, method.__name__) return self._process(await method(*args, **kwargs), page) else: def processed(*args, **kwargs): + validation_item = args[0]._validate_input() + if validation_item is not None: + return getattr(validation_item, method.__name__) return self._process(method(*args, **kwargs), page) return wraps(method)(processed) diff --git a/web_poet/pages.py b/web_poet/pages.py index fed2fae4..3161a5e8 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -1,5 +1,8 @@ import abc +import inspect import typing +from contextlib import suppress +from functools import wraps import attr @@ -7,7 +10,7 @@ from web_poet.fields import FieldsMixin, item_from_fields from web_poet.mixins import ResponseShortcutsMixin from web_poet.page_inputs import HttpResponse -from web_poet.utils import _create_deprecated_class +from web_poet.utils import CallableT, _create_deprecated_class, cached_method class Injectable(abc.ABC, FieldsMixin): @@ -53,6 +56,31 @@ def item_cls(self) -> typing.Type[ItemT]: _NOT_SET = object() +def validates_input(to_item: CallableT) -> CallableT: + """Decorator to apply input validation to custom to_item method + implementations in :class:`~web_poet.pages.ItemPage` subclasses.""" + + if inspect.iscoroutinefunction(to_item): + + @wraps(to_item) + async def _to_item(self, *args, **kwargs): + validation_item = self._validate_input() + if validation_item is not None: + return validation_item + return await to_item(self, *args, **kwargs) + + else: + + @wraps(to_item) + def _to_item(self, *args, **kwargs): + validation_item = self._validate_input() + if validation_item is not None: + return validation_item + return to_item(self, *args, **kwargs) + + return _to_item # type: ignore[return-value] + + class ItemPage(Injectable, Returns[ItemT]): """Base Page Object, with a default :meth:`to_item` implementation which supports web-poet fields. @@ -72,6 +100,7 @@ def __init_subclass__(cls, skip_nonitem_fields=_NOT_SET, **kwargs): return cls._skip_nonitem_fields = skip_nonitem_fields + @validates_input async def to_item(self) -> ItemT: """Extract an item from a web page""" return await item_from_fields( @@ -80,6 +109,23 @@ async def to_item(self) -> ItemT: skip_nonitem_fields=self._get_skip_nonitem_fields(), ) + @cached_method + def _validate_input(self) -> None: + """Run self.validate_input if defined.""" + if not hasattr(self, "validate_input"): + return + with suppress(AttributeError): + if self.__validating_input: + # We are in a recursive call, i.e. _validate_input is being + # called from _validate_input itself (likely through a @field + # method). + return + + self.__validating_input: bool = True + validation_item = self.validate_input() # type: ignore[attr-defined] + self.__validating_input = False + return validation_item + @attr.s(auto_attribs=True) class WebPage(ItemPage[ItemT], ResponseShortcutsMixin):