Implement invalid input handling (scrapinghub#113)

Co-authored-by: Mikhail Korobov <[email protected]> Co-authored-by: Andrey Rakhmatullin <[email protected]>
ogiaquino · Mar 28, 2023 · 409d2e6 · 409d2e6
1 parent 0bd1ba5
commit 409d2e6
Show file tree

Hide file tree

Showing 18 changed files with 647 additions and 142 deletions.
diff --git a/.flake8 b/.flake8
@@ -44,8 +44,7 @@ per-file-ignores =
     web_poet/serialization/__init__.py:F401,F403
     web_poet/testing/__init__.py:F401,F403
     web_poet/testing/pytest.py:D102
-    tests/po_lib_to_return/__init__.py:D102
-    tests/test_testing.py:D102
+    tests/*:D102
 
     # the suggestion makes the code worse
     tests/test_serialization.py:B028
diff --git a/docs/frameworks/additional-requests.rst b/docs/frameworks/additional-requests.rst
@@ -39,6 +39,7 @@ This can be set using:
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
     async def request_implementation(req: web_poet.HttpRequest) -> web_poet.HttpResponse:
         ...
@@ -52,6 +53,7 @@ This can be set using:
     class SomePage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             ...
 
@@ -89,6 +91,7 @@ when creating an :class:`~.HttpClient` instance:
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
     async def request_implementation(req: web_poet.HttpRequest) -> web_poet.HttpResponse:
         ...
@@ -101,6 +104,7 @@ when creating an :class:`~.HttpClient` instance:
     class SomePage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             ...
 
@@ -161,18 +165,20 @@ like the ones above, then it would cause the code to look like:
 
 .. code-block:: python
 
-    import attrs
-    import web_poet
+    import urllib
 
     import aiohttp
+    import attrs
     import requests
-    import urllib
+    import web_poet
+    from web_poet import validates_input
 
 
     @attrs.define
     class SomePage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             try:
                 response = await self.http.get("...")
@@ -196,12 +202,14 @@ This makes the code simpler:
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
 
     @attrs.define
     class SomePage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             try:
                 response = await self.http.get("...")

diff --git a/docs/index.rst b/docs/index.rst
@@ -25,7 +25,7 @@ web-poet
    page-objects/additional-requests
    page-objects/fields
    page-objects/rules
-   page-objects/retries
+   page-objects/input-validation
    page-objects/page-params
    page-objects/testing
 

diff --git a/docs/page-objects/additional-requests.rst b/docs/page-objects/additional-requests.rst
@@ -289,12 +289,14 @@ Executing a HttpRequest instance
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
 
     @attrs.define
     class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             item = {
                 "url": self.url,
@@ -345,12 +347,14 @@ method on it.
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
 
     @attrs.define
     class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             item = {
                 "url": self.url,
@@ -390,12 +394,14 @@ Thus, additional requests inside the Page Object are typically needed for it:
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
 
     @attrs.define
     class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             item = {
                 "url": self.url,
@@ -479,6 +485,7 @@ list of :class:`~.HttpRequest` to be executed in batch using the
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
 
     @attrs.define
@@ -487,6 +494,7 @@ list of :class:`~.HttpRequest` to be executed in batch using the
 
         default_pagination_limit = 10
 
+        @validates_input
         async def to_item(self):
             item = {
                 "url": self.url,
@@ -578,6 +586,7 @@ from the previous subsection named: :ref:`httpclient-get-example`.
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
     logger = logging.getLogger(__name__)
 
@@ -586,6 +595,7 @@ from the previous subsection named: :ref:`httpclient-get-example`.
     class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
 
+        @validates_input
         async def to_item(self):
             item = {
                 "url": self.url,
@@ -658,6 +668,7 @@ For this example, let's improve the code snippet from the previous subsection na
 
     import attrs
     import web_poet
+    from web_poet import validates_input
 
 
     @attrs.define
@@ -666,6 +677,7 @@ For this example, let's improve the code snippet from the previous subsection na
 
         default_pagination_limit = 10
 
+        @validates_input
         async def to_item(self):
             item = {
                 "url": self.url,
@@ -773,3 +785,54 @@ Here's an example:
 From the example above, we're now checking the list of responses to see if any
 exceptions are included in it. If so, we're simply logging it down and ignoring
 it. In this way, perfectly good responses can still be processed through.
+
+
+.. _retries-additional-requests:
+
+Retrying Additional Requests
+============================
+
+When the bad response data comes from :ref:`additional requests
+<additional-requests>`, you must handle retries on your own.
+
+The page object code is responsible for retrying additional requests until good
+response data is received, or until some maximum number of retries is exceeded.
+
+It is up to you to decide what the maximum number of retries should be for a
+given additional request, based on your experience with the target website.
+
+It is also up to you to decide how to implement retries of additional requests.
+
+One option would be tenacity_. For example, to try an additional request 3
+times before giving up:
+
+.. _tenacity: https://tenacity.readthedocs.io/en/latest/index.html
+
+.. code-block:: python
+
+    import attrs
+    from tenacity import retry, stop_after_attempt
+    from web_poet import HttpClient, WebPage, validates_input
+
+    @attrs.define
+    class MyPage(WebPage):
+        http: HttpClient
+
+        @retry(stop=stop_after_attempt(3), reraise=True)
+        async def get_data(self):
+            response = await self.http.get("https://toscrape.com/")
+            if not response.css(".expected"):
+                raise ValueError
+            return response.css(".data").get()
+
+        @validates_input
+        async def to_item(self) -> dict:
+            try:
+                data = await self.get_data()
+            except ValueError:
+                return {}
+            return {"data": data}
+
+If the reason your additional request fails is outdated or missing data from
+page object input, do not try to reproduce the request for that input as an
+additional request. :ref:`Request fresh input instead <retries-input>`.
diff --git a/docs/page-objects/fields.rst b/docs/page-objects/fields.rst
@@ -206,7 +206,7 @@ attrs instances) instead of unstructured dicts to hold the data:
 .. code-block:: python
 
     import attrs
-    from web_poet import ItemPage, HttpResponse
+    from web_poet import ItemPage, HttpResponse, validates_input
 
     @attrs.define
     class Product:
@@ -217,6 +217,7 @@ attrs instances) instead of unstructured dicts to hold the data:
     @attrs.define
     class ProductPage(ItemPage):
         # ...
+        @validates_input
         def to_item(self) -> Product:
             return Product(
                 name=self.name,
@@ -394,6 +395,8 @@ To recap:
   to contain more ``@fields`` than defined in the item class, e.g. because
   Page Object is inherited from some other base Page Object.
 
+.. _field-caching:
+
 Caching
 -------
 
@@ -404,12 +407,13 @@ attributes from this response:
 
 .. code-block:: python
 
-    from web_poet import ItemPage, HttpResponse, HttpClient
+    from web_poet import ItemPage, HttpResponse, HttpClient, validates_input
 
     class MyPage(ItemPage):
         response: HttpResponse
         http: HttpClient
 
+        @validates_input
         async def to_item(self):
             api_url = self.response.css("...").get()
             api_response = await self.http.get(api_url).json()
@@ -541,3 +545,27 @@ returns a dictionary, where keys are field names, and values are
 
     print(field_names)  # dict_keys(['my_field'])
     print(my_field_meta)  # {'expensive': True}
+
+
+Input validation
+----------------
+
+:ref:`Input validation <input-validation>`, if used, happens before field
+evaluation, and it may override the values of fields, preventing field
+evaluation from ever happening. For example:
+
+.. code-block:: python
+
+   class Page(ItemPage[Item]):
+       def validate_input(self):
+           return Item(foo="bar")
+
+       @field
+       def foo(self):
+           raise RuntimeError("This exception is never raised")
+
+   assert Page().foo == "bar"
+
+Field evaluation may still happen for a field if the field is used in the
+implementation of the ``validate_input`` method. Note, however, that only
+synchronous fields can be used from the ``validate_input`` method.