Skip to content

Commit

Permalink
Adding DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE (#227)
Browse files Browse the repository at this point in the history
* initial version for limiting response size

* fix doc

* add title for DOWNLOAD_MAXSIZE in doc

* fix formatting DOWNLOAD_WARNSIZE in doc

* remove extra doc

* restore original doc

* keep original number of encoding/decoding

* fix logic for _body_max_size_exceeded

* rewording messages (we don't have expected now)

* make args in _process_response are named.

* revert original responses.py

* move the main logic to handler

* merge main

* fix params order

* add tests for _body_max_size_exceeded

* formatting

* fix mypy warn

* fix mypy warn for settings

* tune warn message for dropping a response

* Update scrapy_zyte_api/handler.py

Co-authored-by: Mikhail Korobov <[email protected]>

* Update scrapy_zyte_api/handler.py

Co-authored-by: Mikhail Korobov <[email protected]>

---------

Co-authored-by: Mikhail Korobov <[email protected]>
  • Loading branch information
PyExplorer and kmike authored Nov 12, 2024
1 parent 1269ebb commit 0d18fb0
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 2 deletions.
37 changes: 36 additions & 1 deletion scrapy_zyte_api/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,27 @@
logger = logging.getLogger(__name__)


def _body_max_size_exceeded(
body_size: int,
warnsize: Optional[int],
maxsize: Optional[int],
request_url: str,
) -> bool:
if warnsize and body_size > warnsize:
logger.warning(
f"Actual response size {body_size} larger than "
f"download warn size {warnsize} in request {request_url}."
)

if maxsize and body_size > maxsize:
logger.warning(
f"Dropping the response for {request_url}: actual response size "
f"{body_size} larger than download max size {maxsize}."
)
return True
return False


def _truncate_str(obj, index, text, limit):
if len(text) <= limit:
return
Expand Down Expand Up @@ -92,6 +113,9 @@ def __init__(
f"({self._truncate_limit}) is invalid. It must be 0 or a "
f"positive integer."
)
self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")

crawler.signals.connect(self.engine_started, signal=signals.engine_started)
self._crawler = crawler
self._fallback_handler = None
Expand Down Expand Up @@ -231,7 +255,18 @@ async def _download_request(
finally:
self._update_stats(api_params)

return _process_response(api_response, request, self._cookie_jars)
response = _process_response(
api_response=api_response, request=request, cookie_jars=self._cookie_jars
)
if response and _body_max_size_exceeded(
len(response.body),
self._default_warnsize,
self._default_maxsize,
request.url,
):
return None

return response

def _process_request_error(self, request, error):
detail = (error.parsed.data or {}).get("detail", error.message)
Expand Down
104 changes: 103 additions & 1 deletion tests/test_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
from zyte_api.aio.retry import RetryFactory
from zyte_api.constants import API_URL

from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler
from scrapy_zyte_api.handler import (
ScrapyZyteAPIDownloadHandler,
_body_max_size_exceeded,
)
from scrapy_zyte_api.responses import ZyteAPITextResponse
from scrapy_zyte_api.utils import USER_AGENT

from . import DEFAULT_CLIENT_CONCURRENCY, SETTINGS, SETTINGS_T, UNSET
Expand Down Expand Up @@ -557,3 +561,101 @@ async def test_fallback_setting():
handler = get_download_handler(crawler, "https")
assert isinstance(handler, ScrapyZyteAPIDownloadHandler)
assert isinstance(handler._fallback_handler, HTTPDownloadHandler)


@pytest.mark.parametrize(
"body_size, warnsize, maxsize, expected_result, expected_warnings",
[
# Warning only (exceeds warnsize but not maxsize)
(
1200,
1000,
1500,
False,
[
"Actual response size 1200 larger than download warn size 1000 in request http://example.com."
],
),
# Cancel download (exceeds both warnsize and maxsize)
(
1600,
1000,
1500,
True,
[
"Actual response size 1600 larger than download warn size 1000 in request http://example.com.",
"Dropping the response for http://example.com: actual response size 1600 larger than download max size 1500.",
],
),
# No limits - no warnings expected
(500, None, None, False, []),
],
)
def test_body_max_size_exceeded(
body_size, warnsize, maxsize, expected_result, expected_warnings
):
with mock.patch("scrapy_zyte_api.handler.logger") as logger:
result = _body_max_size_exceeded(
body_size=body_size,
warnsize=warnsize,
maxsize=maxsize,
request_url="http://example.com",
)

assert result == expected_result

if expected_warnings:
for call, expected_warning in zip(
logger.warning.call_args_list, expected_warnings
):
assert call[0][0] == expected_warning
else:
logger.warning.assert_not_called()


@ensureDeferred
@pytest.mark.parametrize(
"body_size, warnsize, maxsize, expect_null",
[
(500, None, None, False), # No limits, should return response
(
1500,
1000,
None,
False,
), # Exceeds warnsize, should log warning but return response
(2500, 1000, 2000, True), # Exceeds maxsize, should return None
(500, 1000, 2000, False), # Within limits, should return response
(
1500,
None,
1000,
True,
), # Exceeds maxsize with no warnsize, should return None
],
)
async def test_download_request_limits(
body_size, warnsize, maxsize, expect_null, mockserver
):
settings: SETTINGS_T = {"DOWNLOAD_WARNSIZE": warnsize, "DOWNLOAD_MAXSIZE": maxsize}
async with make_handler(settings, mockserver.urljoin("/")) as handler:
handler._session = mock.AsyncMock()
handler._session.get.return_value = mock.Mock(body=b"x" * body_size)

mock_api_response = mock.Mock(body=b"x" * body_size)

# Patch the `from_api_response` method of ZyteAPITextResponse only for the test
with mock.patch.object(
ZyteAPITextResponse, "from_api_response", return_value=mock_api_response
):
with mock.patch(
"scrapy_zyte_api.responses._process_response",
return_value=mock_api_response,
):
request = Request("https://example.com")
result = await handler._download_request({}, request, None)

if expect_null:
assert result is None
else:
assert result is not None

0 comments on commit 0d18fb0

Please sign in to comment.