From 0d18fb000b7f396e733542090d84f99ad5c29469 Mon Sep 17 00:00:00 2001 From: Shevchenko Taras Date: Tue, 12 Nov 2024 13:12:05 +0300 Subject: [PATCH] Adding DOWNLOAD_MAXSIZE and DOWNLOAD_WARNSIZE (#227) * initial version for limiting response size * fix doc * add title for DOWNLOAD_MAXSIZE in doc * fix formatting DOWNLOAD_WARNSIZE in doc * remove extra doc * restore original doc * keep original number of encoding/decoding * fix logic for _body_max_size_exceeded * rewording messages (we don't have expected now) * make args in _process_response are named. * revert original responses.py * move the main logic to handler * merge main * fix params order * add tests for _body_max_size_exceeded * formatting * fix mypy warn * fix mypy warn for settings * tune warn message for dropping a response * Update scrapy_zyte_api/handler.py Co-authored-by: Mikhail Korobov * Update scrapy_zyte_api/handler.py Co-authored-by: Mikhail Korobov --------- Co-authored-by: Mikhail Korobov --- scrapy_zyte_api/handler.py | 37 ++++++++++++- tests/test_handler.py | 104 ++++++++++++++++++++++++++++++++++++- 2 files changed, 139 insertions(+), 2 deletions(-) diff --git a/scrapy_zyte_api/handler.py b/scrapy_zyte_api/handler.py index cc6b3391..85294a7e 100644 --- a/scrapy_zyte_api/handler.py +++ b/scrapy_zyte_api/handler.py @@ -23,6 +23,27 @@ logger = logging.getLogger(__name__) +def _body_max_size_exceeded( + body_size: int, + warnsize: Optional[int], + maxsize: Optional[int], + request_url: str, +) -> bool: + if warnsize and body_size > warnsize: + logger.warning( + f"Actual response size {body_size} larger than " + f"download warn size {warnsize} in request {request_url}." + ) + + if maxsize and body_size > maxsize: + logger.warning( + f"Dropping the response for {request_url}: actual response size " + f"{body_size} larger than download max size {maxsize}." + ) + return True + return False + + def _truncate_str(obj, index, text, limit): if len(text) <= limit: return @@ -92,6 +113,9 @@ def __init__( f"({self._truncate_limit}) is invalid. It must be 0 or a " f"positive integer." ) + self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE") + self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE") + crawler.signals.connect(self.engine_started, signal=signals.engine_started) self._crawler = crawler self._fallback_handler = None @@ -231,7 +255,18 @@ async def _download_request( finally: self._update_stats(api_params) - return _process_response(api_response, request, self._cookie_jars) + response = _process_response( + api_response=api_response, request=request, cookie_jars=self._cookie_jars + ) + if response and _body_max_size_exceeded( + len(response.body), + self._default_warnsize, + self._default_maxsize, + request.url, + ): + return None + + return response def _process_request_error(self, request, error): detail = (error.parsed.data or {}).get("detail", error.message) diff --git a/tests/test_handler.py b/tests/test_handler.py index 287acb32..42c1d20c 100644 --- a/tests/test_handler.py +++ b/tests/test_handler.py @@ -17,7 +17,11 @@ from zyte_api.aio.retry import RetryFactory from zyte_api.constants import API_URL -from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler +from scrapy_zyte_api.handler import ( + ScrapyZyteAPIDownloadHandler, + _body_max_size_exceeded, +) +from scrapy_zyte_api.responses import ZyteAPITextResponse from scrapy_zyte_api.utils import USER_AGENT from . import DEFAULT_CLIENT_CONCURRENCY, SETTINGS, SETTINGS_T, UNSET @@ -557,3 +561,101 @@ async def test_fallback_setting(): handler = get_download_handler(crawler, "https") assert isinstance(handler, ScrapyZyteAPIDownloadHandler) assert isinstance(handler._fallback_handler, HTTPDownloadHandler) + + +@pytest.mark.parametrize( + "body_size, warnsize, maxsize, expected_result, expected_warnings", + [ + # Warning only (exceeds warnsize but not maxsize) + ( + 1200, + 1000, + 1500, + False, + [ + "Actual response size 1200 larger than download warn size 1000 in request http://example.com." + ], + ), + # Cancel download (exceeds both warnsize and maxsize) + ( + 1600, + 1000, + 1500, + True, + [ + "Actual response size 1600 larger than download warn size 1000 in request http://example.com.", + "Dropping the response for http://example.com: actual response size 1600 larger than download max size 1500.", + ], + ), + # No limits - no warnings expected + (500, None, None, False, []), + ], +) +def test_body_max_size_exceeded( + body_size, warnsize, maxsize, expected_result, expected_warnings +): + with mock.patch("scrapy_zyte_api.handler.logger") as logger: + result = _body_max_size_exceeded( + body_size=body_size, + warnsize=warnsize, + maxsize=maxsize, + request_url="http://example.com", + ) + + assert result == expected_result + + if expected_warnings: + for call, expected_warning in zip( + logger.warning.call_args_list, expected_warnings + ): + assert call[0][0] == expected_warning + else: + logger.warning.assert_not_called() + + +@ensureDeferred +@pytest.mark.parametrize( + "body_size, warnsize, maxsize, expect_null", + [ + (500, None, None, False), # No limits, should return response + ( + 1500, + 1000, + None, + False, + ), # Exceeds warnsize, should log warning but return response + (2500, 1000, 2000, True), # Exceeds maxsize, should return None + (500, 1000, 2000, False), # Within limits, should return response + ( + 1500, + None, + 1000, + True, + ), # Exceeds maxsize with no warnsize, should return None + ], +) +async def test_download_request_limits( + body_size, warnsize, maxsize, expect_null, mockserver +): + settings: SETTINGS_T = {"DOWNLOAD_WARNSIZE": warnsize, "DOWNLOAD_MAXSIZE": maxsize} + async with make_handler(settings, mockserver.urljoin("/")) as handler: + handler._session = mock.AsyncMock() + handler._session.get.return_value = mock.Mock(body=b"x" * body_size) + + mock_api_response = mock.Mock(body=b"x" * body_size) + + # Patch the `from_api_response` method of ZyteAPITextResponse only for the test + with mock.patch.object( + ZyteAPITextResponse, "from_api_response", return_value=mock_api_response + ): + with mock.patch( + "scrapy_zyte_api.responses._process_response", + return_value=mock_api_response, + ): + request = Request("https://example.com") + result = await handler._download_request({}, request, None) + + if expect_null: + assert result is None + else: + assert result is not None