Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into scrapy-typing
Browse files Browse the repository at this point in the history
  • Loading branch information
wRAR committed Nov 18, 2024
2 parents ac34927 + 64ad254 commit 16c2e81
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.25.0
current_version = 0.25.1
commit = True
tag = True
tag_name = {new_version}
Expand Down
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changes
=======

0.25.1 (2024-11-12)
-------------------

* :setting:`DOWNLOAD_MAXSIZE` and :setting:`DOWNLOAD_WARNSIZE` are now also
enforced on requests sent through Zyte API.

0.25.0 (2024-10-22)
-------------------

Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
project = "scrapy-zyte-api"
copyright = "2023, Zyte Group Ltd"
author = "Zyte Group Ltd"
release = "0.25.0"
release = "0.25.1"

sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext
extensions = [
Expand Down
2 changes: 1 addition & 1 deletion scrapy_zyte_api/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.25.0"
__version__ = "0.25.1"
37 changes: 36 additions & 1 deletion scrapy_zyte_api/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,27 @@
logger = logging.getLogger(__name__)


def _body_max_size_exceeded(
body_size: int,
warnsize: Optional[int],
maxsize: Optional[int],
request_url: str,
) -> bool:
if warnsize and body_size > warnsize:
logger.warning(
f"Actual response size {body_size} larger than "
f"download warn size {warnsize} in request {request_url}."
)

if maxsize and body_size > maxsize:
logger.warning(
f"Dropping the response for {request_url}: actual response size "
f"{body_size} larger than download max size {maxsize}."
)
return True
return False


def _truncate_str(obj, index, text, limit):
if len(text) <= limit:
return
Expand Down Expand Up @@ -93,6 +114,9 @@ def __init__(
f"({self._truncate_limit}) is invalid. It must be 0 or a "
f"positive integer."
)
self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")

crawler.signals.connect(self.engine_started, signal=signals.engine_started)
self._crawler = crawler
self._fallback_handler = None
Expand Down Expand Up @@ -233,7 +257,18 @@ async def _download_request(
finally:
self._update_stats(api_params)

return _process_response(api_response, request, self._cookie_jars)
response = _process_response(
api_response=api_response, request=request, cookie_jars=self._cookie_jars
)
if response and _body_max_size_exceeded(
len(response.body),
self._default_warnsize,
self._default_maxsize,
request.url,
):
return None

return response

def _process_request_error(self, request, error):
detail = (error.parsed.data or {}).get("detail", error.message)
Expand Down
104 changes: 103 additions & 1 deletion tests/test_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
from zyte_api.aio.retry import RetryFactory
from zyte_api.constants import API_URL

from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler
from scrapy_zyte_api.handler import (
ScrapyZyteAPIDownloadHandler,
_body_max_size_exceeded,
)
from scrapy_zyte_api.responses import ZyteAPITextResponse
from scrapy_zyte_api.utils import USER_AGENT

from . import DEFAULT_CLIENT_CONCURRENCY, SETTINGS, SETTINGS_T, UNSET
Expand Down Expand Up @@ -561,3 +565,101 @@ async def test_fallback_setting():
handler = get_download_handler(crawler, "https")
assert isinstance(handler, ScrapyZyteAPIDownloadHandler)
assert isinstance(handler._fallback_handler, HTTPDownloadHandler)


@pytest.mark.parametrize(
    "body_size, warnsize, maxsize, expected_result, expected_warnings",
    [
        # Warning only (exceeds warnsize but not maxsize)
        (
            1200,
            1000,
            1500,
            False,
            [
                "Actual response size 1200 larger than download warn size 1000 in request http://example.com."
            ],
        ),
        # Cancel download (exceeds both warnsize and maxsize)
        (
            1600,
            1000,
            1500,
            True,
            [
                "Actual response size 1600 larger than download warn size 1000 in request http://example.com.",
                "Dropping the response for http://example.com: actual response size 1600 larger than download max size 1500.",
            ],
        ),
        # No limits - no warnings expected
        (500, None, None, False, []),
    ],
)
def test_body_max_size_exceeded(
    body_size, warnsize, maxsize, expected_result, expected_warnings
):
    """Check the return value and the exact warnings logged by
    ``_body_max_size_exceeded`` for each combination of size and limits."""
    with mock.patch("scrapy_zyte_api.handler.logger") as logger:
        result = _body_max_size_exceeded(
            body_size=body_size,
            warnsize=warnsize,
            maxsize=maxsize,
            request_url="http://example.com",
        )

    assert result == expected_result

    # Compare the complete list of logged messages instead of zipping it with
    # the expectations: zip() stops at the shorter input, so missing or extra
    # warnings would otherwise go unnoticed. An empty expectation list also
    # covers the "nothing logged" case.
    logged_warnings = [call[0][0] for call in logger.warning.call_args_list]
    assert logged_warnings == expected_warnings


@ensureDeferred
@pytest.mark.parametrize(
    "body_size, warnsize, maxsize, expect_null",
    [
        (500, None, None, False),  # No limits, should return response
        (
            1500,
            1000,
            None,
            False,
        ),  # Exceeds warnsize, should log warning but return response
        (2500, 1000, 2000, True),  # Exceeds maxsize, should return None
        (500, 1000, 2000, False),  # Within limits, should return response
        (
            1500,
            None,
            1000,
            True,
        ),  # Exceeds maxsize with no warnsize, should return None
    ],
)
async def test_download_request_limits(
    body_size, warnsize, maxsize, expect_null, mockserver
):
    """Check that ``_download_request`` returns ``None`` when the response
    body exceeds ``DOWNLOAD_MAXSIZE`` and returns a response otherwise."""
    settings: SETTINGS_T = {"DOWNLOAD_WARNSIZE": warnsize, "DOWNLOAD_MAXSIZE": maxsize}
    async with make_handler(settings, mockserver.urljoin("/")) as handler:
        handler._session = mock.AsyncMock()
        handler._session.get.return_value = mock.Mock(body=b"x" * body_size)

        mock_api_response = mock.Mock(body=b"x" * body_size)

        # Patch the `from_api_response` method of ZyteAPITextResponse only for
        # the test, so the handler gets a response with a body of the wanted
        # size. Patching `scrapy_zyte_api.responses._process_response` would
        # have no effect here: the handler module calls `_process_response`
        # through its own module-level name, which such a patch does not
        # rebind.
        with mock.patch.object(
            ZyteAPITextResponse, "from_api_response", return_value=mock_api_response
        ):
            request = Request("https://example.com")
            result = await handler._download_request({}, request, None)

        if expect_null:
            assert result is None
        else:
            assert result is not None

0 comments on commit 16c2e81

Please sign in to comment.