Merge remote-tracking branch 'origin/main' into scrapy-typing
wRAR committed Nov 4, 2024
2 parents 86c5b67 + 1269ebb commit 6635ea2
Showing 6 changed files with 66 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.24.0
+current_version = 0.25.0
 commit = True
 tag = True
 tag_name = {new_version}
8 changes: 8 additions & 0 deletions CHANGES.rst
@@ -1,6 +1,14 @@
 Changes
 =======
 
+0.25.0 (2024-10-22)
+-------------------
+
+* Added official Python 3.13 support, removed official Python 3.8 support.
+
+* Fixed a race condition that could allow more Zyte API requests than the
+  limit configured in the :setting:`ZYTE_API_MAX_REQUESTS` setting.
+
 0.24.0 (2024-10-07)
 -------------------
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -4,7 +4,7 @@
 project = "scrapy-zyte-api"
 copyright = "2023, Zyte Group Ltd"
 author = "Zyte Group Ltd"
-release = "0.24.0"
+release = "0.25.0"
 
 sys.path.insert(0, str(Path(__file__).parent.absolute()))  # _ext
 extensions = [
2 changes: 1 addition & 1 deletion scrapy_zyte_api/__version__.py
@@ -1 +1 @@
__version__ = "0.24.0"
__version__ = "0.25.0"
21 changes: 4 additions & 17 deletions scrapy_zyte_api/_middlewares.py
@@ -53,6 +53,7 @@ def __init__(self, crawler) -> None:
f"{self._max_requests}. The spider will close when it's "
f"reached."
)
self._request_count = 0

crawler.signals.connect(
self._start_requests_processed, signal=_start_requests_processed
@@ -124,29 +125,15 @@ def process_request(self, request, spider):
         if self._param_parser.parse(request) is None:
             return
 
-        self.slot_request(request, spider, force=True)
-
-        if self._max_requests_reached(self._crawler.engine.downloader):
+        self._request_count += 1
+        if self._max_requests and self._request_count > self._max_requests:
             self._crawler.engine.close_spider(spider, "closespider_max_zapi_requests")
             raise IgnoreRequest(
                 f"The request {request} is skipped as {self._max_requests} max "
                 f"Zyte API requests have been reached."
             )
 
-    def _max_requests_reached(self, downloader) -> bool:
-        if not self._max_requests:
-            return False
-
-        zapi_req_count = self._crawler.stats.get_value("scrapy-zyte-api/processed", 0)
-        download_req_count = sum(
-            [
-                len(slot.transferring)
-                for slot_id, slot in downloader.slots.items()
-                if slot_id.startswith(self._slot_prefix)
-            ]
-        )
-        total_requests = zapi_req_count + download_req_count
-        return total_requests >= self._max_requests
+        self.slot_request(request, spider, force=True)
 
     def process_exception(self, request, exception, spider):
         if (
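Note: the change above replaces a limit check derived from crawler stats and downloader slot state with a private counter incremented synchronously in process_request. A minimal sketch of the difference, using simplified, hypothetical names (RacyLimiter, CountingLimiter, allow) rather than the actual middleware classes:

class RacyLimiter:
    """Old approach (sketch): derive the count from external state.

    "processed" is updated only once a response finishes, and "in_flight"
    only once a download starts, so requests that were already approved
    but are still queued are invisible to the check, and several requests
    can pass it before any of them is counted.
    """

    def __init__(self, max_requests: int) -> None:
        self.max_requests = max_requests
        self.processed = 0  # stands in for the "scrapy-zyte-api/processed" stat
        self.in_flight = 0  # stands in for downloader slot "transferring" sizes

    def allow(self) -> bool:
        return self.processed + self.in_flight < self.max_requests


class CountingLimiter:
    """New approach (sketch): count at approval time.

    The counter is incremented before the request is dispatched, so every
    later call observes every earlier call and the limit cannot be
    overshot.
    """

    def __init__(self, max_requests: int) -> None:
        self.max_requests = max_requests
        self._request_count = 0

    def allow(self) -> bool:
        self._request_count += 1
        return self._request_count <= self.max_requests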
55 changes: 51 additions & 4 deletions tests/test_middlewares.py
@@ -138,8 +138,8 @@ def start_requests(self):
             for i in range(spider_requests):
                 meta = {"zyte_api": {"browserHtml": True}}
 
-                # Alternating requests between ZAPI and non-ZAPI tests if
-                # ZYTE_API_MAX_REQUESTS solely limits ZAPI Requests.
+                # Alternating requests between ZAPI and non-ZAPI verifies
+                # that ZYTE_API_MAX_REQUESTS solely limits ZAPI requests.
 
                 if i % 2:
                     yield Request(
@@ -169,8 +169,8 @@ def parse(self, response):
         in caplog.text
     )
     assert crawler.stats
-    assert crawler.stats.get_value("scrapy-zyte-api/success") <= zapi_max_requests
-    assert crawler.stats.get_value("scrapy-zyte-api/processed") <= zapi_max_requests
+    assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
+    assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
     assert crawler.stats.get_value("item_scraped_count") <= zapi_max_requests + 6
     assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
assert (
Expand All @@ -181,6 +181,53 @@ def parse(self, response):
)


@ensureDeferred
async def test_max_requests_race_condition(caplog):
spider_requests = 8
zapi_max_requests = 1

with MockServer(DelayedResource) as server:

class TestSpider(Spider):
name = "test_spider"

def start_requests(self):
for i in range(spider_requests):
meta = {"zyte_api": {"browserHtml": True}}
yield Request("https://example.com", meta=meta, dont_filter=True)

def parse(self, response):
yield Item()

settings = {
"DOWNLOADER_MIDDLEWARES": {
"scrapy_zyte_api.ScrapyZyteAPIDownloaderMiddleware": 633
},
"ZYTE_API_MAX_REQUESTS": zapi_max_requests,
"ZYTE_API_URL": server.urljoin("/"),
**SETTINGS,
}

crawler = get_crawler(TestSpider, settings_dict=settings)
with caplog.at_level("INFO"):
await crawler.crawl()

assert (
f"Maximum Zyte API requests for this crawl is set at {zapi_max_requests}"
in caplog.text
)
assert crawler.stats.get_value("scrapy-zyte-api/success") == zapi_max_requests
assert crawler.stats.get_value("scrapy-zyte-api/processed") == zapi_max_requests
assert crawler.stats.get_value("item_scraped_count") == zapi_max_requests
assert crawler.stats.get_value("finish_reason") == "closespider_max_zapi_requests"
assert (
crawler.stats.get_value(
"downloader/exception_type_count/scrapy.exceptions.IgnoreRequest"
)
> 0
)


@ensureDeferred
async def test_forbidden_domain_start_url():
class TestSpider(Spider):
Expand Down
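Note: the new test uses DelayedResource so that responses stay in flight long enough for several requests to be approved concurrently, which is exactly the window the old check was vulnerable in. The race itself can be reproduced outside Scrapy; a self-contained asyncio sketch (all names hypothetical, single-threaded just like the Twisted reactor):

import asyncio

MAX_REQUESTS = 1


async def racy_fetch(state: dict) -> None:
    # Old-style check: sees only finished and actively-downloading
    # requests, not requests already approved but still queued.
    if state["processed"] + state["in_flight"] >= MAX_REQUESTS:
        return
    await asyncio.sleep(0)  # queue hop between approval and download start
    state["in_flight"] += 1
    await asyncio.sleep(0.01)  # simulated slow download
    state["in_flight"] -= 1
    state["processed"] += 1
    state["sent"] += 1


async def counted_fetch(state: dict) -> None:
    # New-style check: count synchronously at approval time, so every
    # later call observes this one.
    state["count"] += 1
    if state["count"] > MAX_REQUESTS:
        return
    await asyncio.sleep(0.01)
    state["sent"] += 1


async def main() -> None:
    racy = {"processed": 0, "in_flight": 0, "sent": 0}
    await asyncio.gather(*(racy_fetch(racy) for _ in range(8)))
    counted = {"count": 0, "sent": 0}
    await asyncio.gather(*(counted_fetch(counted) for _ in range(8)))
    print("racy sent:", racy["sent"])  # 8: the limit of 1 is overshot
    print("counted sent:", counted["sent"])  # 1: the limit holds


asyncio.run(main())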
