From bdbdc06bafb07f3809cdce2074df2e6a204c8f87 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Tue, 26 Dec 2023 09:33:16 -0500 Subject: [PATCH 01/17] Create new `_http` module, move hacks to it --- src/wayback/_client.py | 67 +--------------------------------------- src/wayback/_http.py | 70 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 66 deletions(-) create mode 100644 src/wayback/_http.py diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 0593e8f..819ecd6 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -28,13 +28,13 @@ Timeout) import time from urllib.parse import urljoin, urlparse -from urllib3.connectionpool import HTTPConnectionPool from urllib3.exceptions import (ConnectTimeoutError, MaxRetryError, ReadTimeoutError) from warnings import warn from . import _utils, __version__ from ._models import CdxRecord, Memento +from ._http import * # noqa from .exceptions import (WaybackException, UnexpectedResponseFormat, BlockedByRobotsError, @@ -271,71 +271,6 @@ def clean_memento_links(links, mode): return result -##################################################################### -# HACK: handle malformed Content-Encoding headers from Wayback. -# When you send `Accept-Encoding: gzip` on a request for a memento, Wayback -# will faithfully gzip the response body. However, if the original response -# from the web server that was snapshotted was gzipped, Wayback screws up the -# `Content-Encoding` header on the memento response, leading any HTTP client to -# *not* decompress the gzipped body. Wayback folks have no clear timeline for -# a fix, hence the workaround here. 
-# -# More info in this issue: -# https://github.com/edgi-govdata-archiving/web-monitoring-processing/issues/309 -# -# Example broken Wayback URL: -# http://web.archive.org/web/20181023233237id_/http://cwcgom.aoml.noaa.gov/erddap/griddap/miamiacidification.graph -# -if hasattr(HTTPConnectionPool, 'ResponseCls'): - # urllib3 v1.x: - # - # This subclass of urllib3's response class identifies the malformed headers - # and repairs them before instantiating the actual response object, so when - # it reads the body, it knows to decode it correctly. - # - # See what we're overriding from urllib3: - # https://github.com/urllib3/urllib3/blob/a6ec68a5c5c5743c59fe5c62c635c929586c429b/src/urllib3/response.py#L499-L526 - class WaybackResponse(HTTPConnectionPool.ResponseCls): - @classmethod - def from_httplib(cls, httplib_response, **response_kwargs): - headers = httplib_response.msg - pairs = headers.items() - if ('content-encoding', '') in pairs and ('Content-Encoding', 'gzip') in pairs: - del headers['content-encoding'] - headers['Content-Encoding'] = 'gzip' - return super().from_httplib(httplib_response, **response_kwargs) - - HTTPConnectionPool.ResponseCls = WaybackResponse -else: - # urllib3 v2.x: - # - # Unfortunately, we can't wrap the `HTTPConnection.getresponse` method in - # urllib3 v2, since it may have read the response body before returning. So - # we patch the HTTPHeaderDict class instead. 
- from urllib3._collections import HTTPHeaderDict as Urllib3HTTPHeaderDict - _urllib3_header_init = Urllib3HTTPHeaderDict.__init__ - - def _new_header_init(self, headers=None, **kwargs): - if headers: - if isinstance(headers, (Urllib3HTTPHeaderDict, dict)): - pairs = list(headers.items()) - else: - pairs = list(headers) - if ( - ('content-encoding', '') in pairs and - ('Content-Encoding', 'gzip') in pairs - ): - headers = [pair for pair in pairs - if pair[0].lower() != 'content-encoding'] - headers.append(('Content-Encoding', 'gzip')) - - return _urllib3_header_init(self, headers, **kwargs) - - Urllib3HTTPHeaderDict.__init__ = _new_header_init -# END HACK -##################################################################### - - class WaybackSession(_utils.DisableAfterCloseSession, requests.Session): """ A custom session object that pools network connections and resources for diff --git a/src/wayback/_http.py b/src/wayback/_http.py new file mode 100644 index 0000000..0c67a03 --- /dev/null +++ b/src/wayback/_http.py @@ -0,0 +1,70 @@ +""" +HTTP tooling used by Wayback when making requests to and handling responses +from Wayback Machine servers. +""" + +from urllib3.connectionpool import HTTPConnectionPool + +##################################################################### +# HACK: handle malformed Content-Encoding headers from Wayback. +# When you send `Accept-Encoding: gzip` on a request for a memento, Wayback +# will faithfully gzip the response body. However, if the original response +# from the web server that was snapshotted was gzipped, Wayback screws up the +# `Content-Encoding` header on the memento response, leading any HTTP client to +# *not* decompress the gzipped body. Wayback folks have no clear timeline for +# a fix, hence the workaround here. 
+# +# More info in this issue: +# https://github.com/edgi-govdata-archiving/web-monitoring-processing/issues/309 +# +# Example broken Wayback URL: +# http://web.archive.org/web/20181023233237id_/http://cwcgom.aoml.noaa.gov/erddap/griddap/miamiacidification.graph +# +if hasattr(HTTPConnectionPool, 'ResponseCls'): + # urllib3 v1.x: + # + # This subclass of urllib3's response class identifies the malformed headers + # and repairs them before instantiating the actual response object, so when + # it reads the body, it knows to decode it correctly. + # + # See what we're overriding from urllib3: + # https://github.com/urllib3/urllib3/blob/a6ec68a5c5c5743c59fe5c62c635c929586c429b/src/urllib3/response.py#L499-L526 + class WaybackUrllib3Response(HTTPConnectionPool.ResponseCls): + @classmethod + def from_httplib(cls, httplib_response, **response_kwargs): + headers = httplib_response.msg + pairs = headers.items() + if ('content-encoding', '') in pairs and ('Content-Encoding', 'gzip') in pairs: + del headers['content-encoding'] + headers['Content-Encoding'] = 'gzip' + return super().from_httplib(httplib_response, **response_kwargs) + + HTTPConnectionPool.ResponseCls = WaybackUrllib3Response +else: + # urllib3 v2.x: + # + # Unfortunately, we can't wrap the `HTTPConnection.getresponse` method in + # urllib3 v2, since it may have read the response body before returning. So + # we patch the HTTPHeaderDict class instead. 
+ from urllib3._collections import HTTPHeaderDict as Urllib3HTTPHeaderDict + _urllib3_header_init = Urllib3HTTPHeaderDict.__init__ + + def _new_header_init(self, headers=None, **kwargs): + if headers: + if isinstance(headers, (Urllib3HTTPHeaderDict, dict)): + pairs = list(headers.items()) + else: + pairs = list(headers) + if ( + ('content-encoding', '') in pairs and + ('Content-Encoding', 'gzip') in pairs + ): + headers = [pair for pair in pairs + if pair[0].lower() != 'content-encoding'] + headers.append(('Content-Encoding', 'gzip')) + + return _urllib3_header_init(self, headers, **kwargs) + + Urllib3HTTPHeaderDict.__init__ = _new_header_init +# END HACK +##################################################################### From 18527aa6a5ce2570477acb362893fcd31e4f5916 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Tue, 26 Dec 2023 16:22:51 -0500 Subject: [PATCH 02/17] Move all *requests* stuff (except errors) to new adapter --- src/wayback/_client.py | 80 +++++++++++++++--------------------------- src/wayback/_http.py | 35 ++++++++++++++++++ src/wayback/_utils.py | 7 ++-- 3 files changed, 65 insertions(+), 57 deletions(-) diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 819ecd6..e7025a5 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -19,7 +19,6 @@ import hashlib import logging import re -import requests from requests.exceptions import (ChunkedEncodingError, ConnectionError, ContentDecodingError, @@ -34,7 +33,7 @@ from warnings import warn from . 
import _utils, __version__ from ._models import CdxRecord, Memento -from ._http import * # noqa +from ._http import WaybackHttpAdapter from .exceptions import (WaybackException, UnexpectedResponseFormat, BlockedByRobotsError, @@ -271,7 +270,7 @@ def clean_memento_links(links, mode): return result -class WaybackSession(_utils.DisableAfterCloseSession, requests.Session): +class WaybackSession(_utils.DisableAfterCloseSession): """ A custom session object that pools network connections and resources for requests to the Wayback Machine. @@ -355,6 +354,7 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, # The memento limit is actually a generic Wayback limit. '/': _utils.RateLimit.make_limit(memento_calls_per_second), } + self.adapter = WaybackHttpAdapter() # NOTE: the nice way to accomplish retry/backoff is with a urllib3: # adapter = requests.adapters.HTTPAdapter( # max_retries=Retry(total=5, backoff_factor=2, @@ -368,15 +368,16 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, # with Wayback's APIs, but urllib3 logs a warning on every retry: # https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737 - # Customize the built-in `send()` with retryability and rate limiting. 
- def send(self, request: requests.PreparedRequest, **kwargs): + def request(self, method, url, timeout=-1, **kwargs): + super().request() start_time = time.time() + timeout = self.timeout if timeout is -1 else timeout maximum = self.retries retries = 0 - url = urlparse(request.url) + parsed_url = urlparse(url) for path, limit in self.rate_limts.items(): - if url.path.startswith(path): + if parsed_url.path.startswith(path): rate_limit = limit break else: @@ -385,9 +386,9 @@ def send(self, request: requests.PreparedRequest, **kwargs): while True: retry_delay = 0 try: - logger.debug('sending HTTP request %s "%s", %s', request.method, request.url, kwargs) + logger.debug('sending HTTP request %s "%s", %s', method, url, kwargs) rate_limit.wait() - response = super().send(request, **kwargs) + response = self.adapter.request(method, url, timeout=timeout, **kwargs) retry_delay = self.get_retry_delay(retries, response) if retries >= maximum or not self.should_retry(response): @@ -415,23 +416,6 @@ def send(self, request: requests.PreparedRequest, **kwargs): time.sleep(retry_delay) retries += 1 - # Customize `request` in order to set a default timeout from the session. - # We can't do this in `send` because `request` always passes a `timeout` - # keyword to `send`. Inside `send`, we can't tell the difference between a - # user explicitly requesting no timeout and not setting one at all. - def request(self, method, url, **kwargs): - """ - Perform an HTTP request using this session. For arguments and return - values, see: - https://requests.readthedocs.io/en/latest/api/#requests.Session.request - - If the ``timeout`` keyword argument is not set, it will default to the - session's ``timeout`` attribute. 
- """ - if 'timeout' not in kwargs: - kwargs['timeout'] = self.timeout - return super().request(method, url, **kwargs) - def should_retry(self, response): # A memento may actually be a capture of an error, so don't retry it :P if is_memento_response(response): @@ -474,14 +458,8 @@ def get_retry_delay(self, retries, response=None): def reset(self): "Reset any network connections the session is using." - # Close really just closes all the adapters in `self.adapters`. We - # could do that directly, but `self.adapters` is not documented/public, - # so might be somewhat risky. self.close(disable=False) - # Re-build the standard adapters. See: - # https://github.com/kennethreitz/requests/blob/v2.22.0/requests/sessions.py#L415-L418 - self.mount('https://', requests.adapters.HTTPAdapter()) - self.mount('http://', requests.adapters.HTTPAdapter()) + self.adapter.close() # TODO: add retry, backoff, cross_thread_backoff, and rate_limit options that @@ -720,22 +698,21 @@ def search(self, url, *, match_type=None, limit=1000, offset=None, sent_query, next_query = next_query, None response = self.session.request('GET', CDX_SEARCH_URL, params=sent_query) - try: - # Read/cache the response and close straightaway. If we need - # to raise for status, we want to pre-emptively close the - # response so a user handling the error doesn't need to - # worry about it. If we don't raise here, we still want to - # close the connection so it doesn't leak when we move onto - # the next of results or when this iterator ends. - read_and_close(response) - response.raise_for_status() - except requests.exceptions.HTTPError as error: + + # Read/cache the response and close straightaway. If we need + # to raise for status, we want to pre-emptively close the + # response so a user handling the error doesn't need to + # worry about it. If we don't raise here, we still want to + # close the connection so it doesn't leak when we move onto + # the next of results or when this iterator ends. 
+ read_and_close(response) + if response.status_code >= 400: if 'AdministrativeAccessControlException' in response.text: raise BlockedSiteError(query['url']) elif 'RobotAccessControlException' in response.text: raise BlockedByRobotsError(query['url']) else: - raise WaybackException(str(error)) + raise WaybackException(f'HTTP {response.status_code} error for CDX search: "{query}"') lines = iter(response.content.splitlines()) @@ -953,12 +930,11 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, current_url, current_date, current_mode = _utils.memento_url_data(response.url) # In view mode, redirects need special handling. - if current_mode == Mode.view.value: + redirect_url = response.next and response.next.url + if current_mode == Mode.view.value and not redirect_url: redirect_url = detect_view_mode_redirect(response, current_date) if redirect_url: # Fix up response properties to be like other modes. - redirect = requests.Request('GET', redirect_url) - response._next = self.session.prepare_request(redirect) response.headers['Memento-Datetime'] = current_date.strftime( '%a, %d %b %Y %H:%M:%S %Z' ) @@ -997,11 +973,11 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, # rarely have been captured at the same time as the # redirect itself. (See 2b) playable = False - if response.next and ( + if redirect_url and ( (len(history) == 0 and not exact) or (len(history) > 0 and (previous_was_memento or not exact_redirects)) ): - target_url, target_date, _ = _utils.memento_url_data(response.next.url) + target_url, target_date, _ = _utils.memento_url_data(redirect_url) # A non-memento redirect is generally taking us to the # closest-in-time capture of the same URL. Note that is # NOT the next capture -- i.e. 
the one that would have @@ -1059,13 +1035,13 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, raise MementoPlaybackError(f'{response.status_code} error while loading ' f'memento at {url}') - if response.next: + if redirect_url: previous_was_memento = is_memento read_and_close(response) # Wayback sometimes has circular memento redirects ¯\_(ツ)_/¯ urls.add(response.url) - if response.next.url in urls: + if redirect_url in urls: raise MementoPlaybackError(f'Memento at {url} is circular') # All requests are included in `debug_history`, but @@ -1073,7 +1049,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, debug_history.append(response.url) if is_memento: history.append(memento) - response = self.session.send(response.next, allow_redirects=False) + response = self.session.request('GET', redirect_url, allow_redirects=False) else: break diff --git a/src/wayback/_http.py b/src/wayback/_http.py index 0c67a03..e19b63e 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -3,8 +3,12 @@ from Wayback Machine servers. """ +import logging +import requests from urllib3.connectionpool import HTTPConnectionPool +logger = logging.getLogger(__name__) + ##################################################################### # HACK: handle malformed Content-Encoding headers from Wayback. 
# When you send `Accept-Encoding: gzip` on a request for a memento, Wayback @@ -68,3 +72,34 @@ def _new_header_init(self, headers=None, **kwargs): Urllib3HTTPHeaderDict.__init__ = _new_header_init # END HACK ##################################################################### + + +class WaybackHttpAdapter: + """ + TODO + """ + + def __init__(self) -> None: + self._session = requests.Session() + + def request( + self, + method, + url, + *, + params=None, + headers=None, + allow_redirects=True, + timeout=None + ) -> requests.Response: + return self._session.request( + method=method, + url=url, + params=params, + headers=headers, + allow_redirects=allow_redirects, + timeout=timeout + ) + + def close(self): + self._session.close() diff --git a/src/wayback/_utils.py b/src/wayback/_utils.py index 4d8fa05..7284aa7 100644 --- a/src/wayback/_utils.py +++ b/src/wayback/_utils.py @@ -322,7 +322,7 @@ def __exit_all__(self, type, value, traceback): pass -class DisableAfterCloseSession(requests.Session): +class DisableAfterCloseSession: """ A custom session object raises a :class:`SessionClosedError` if you try to use it after closing it, to help identify and avoid potentially dangerous @@ -332,17 +332,14 @@ class DisableAfterCloseSession(requests.Session): _closed = False def close(self, disable=True): - super().close() if disable: self._closed = True - def send(self, *args, **kwargs): + def request(self, *args, **kwargs): if self._closed: raise SessionClosedError('This session has already been closed ' 'and cannot send new HTTP requests.') - return super().send(*args, **kwargs) - class CaseInsensitiveDict(MutableMapping): """ From 01e6d83f061f369f7f391d2fb8a568083c61d169 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Fri, 29 Dec 2023 11:57:24 -0800 Subject: [PATCH 03/17] Update docs for WaybackSession --- src/wayback/_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wayback/_client.py b/src/wayback/_client.py index e7025a5..d2648dd 
100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -272,8 +272,8 @@ def clean_memento_links(links, mode): class WaybackSession(_utils.DisableAfterCloseSession): """ - A custom session object that pools network connections and resources for - requests to the Wayback Machine. + Sessions manage HTTP requests and network traffic to web servers, adding + functionality like retries and rate limiting. Parameters ---------- @@ -463,7 +463,7 @@ def reset(self): # TODO: add retry, backoff, cross_thread_backoff, and rate_limit options that -# create a custom instance of urllib3.utils.Retry +# create a custom instance of urllib3.utils.Retry? class WaybackClient(_utils.DepthCountedContext): """ A client for retrieving data from the Internet Archive's Wayback Machine. From 1eb3e17657017567002ae3662a4991bd02a49e2c Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Fri, 29 Dec 2023 13:32:53 -0800 Subject: [PATCH 04/17] Add `InternalHttpResponse` to provide a non-requests Response interface This also provides some basic (but not yet fully implemented) thread-safety for response objects. --- src/wayback/_client.py | 40 ++++------- src/wayback/_http.py | 112 ++++++++++++++++++++++++++++++- src/wayback/_utils.py | 2 - src/wayback/tests/test_client.py | 2 +- 4 files changed, 125 insertions(+), 31 deletions(-) diff --git a/src/wayback/_client.py b/src/wayback/_client.py index d2648dd..8c69427 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -19,9 +19,7 @@ import hashlib import logging import re -from requests.exceptions import (ChunkedEncodingError, - ConnectionError, - ContentDecodingError, +from requests.exceptions import (ConnectionError, ProxyError, RetryError, Timeout) @@ -33,7 +31,7 @@ from warnings import warn from . 
import _utils, __version__ from ._models import CdxRecord, Memento -from ._http import WaybackHttpAdapter +from ._http import InternalHttpResponse, WaybackHttpAdapter from .exceptions import (WaybackException, UnexpectedResponseFormat, BlockedByRobotsError, @@ -147,7 +145,7 @@ def is_malformed_url(url): return False -def is_memento_response(response): +def is_memento_response(response: InternalHttpResponse) -> bool: return 'Memento-Datetime' in response.headers @@ -157,18 +155,6 @@ def cdx_hash(content): return b32encode(hashlib.sha1(content).digest()).decode() -def read_and_close(response): - # Read content so it gets cached and close the response so - # we can release the connection for reuse. See: - # https://github.com/psf/requests/blob/eedd67462819f8dbf8c1c32e77f9070606605231/requests/sessions.py#L160-L163 - try: - response.content - except (ChunkedEncodingError, ContentDecodingError, RuntimeError): - response.raw.read(decode_content=False) - finally: - response.close() - - REDIRECT_PAGE_PATTERN = re.compile(r'Got an? 
HTTP 3\d\d response at crawl time', re.IGNORECASE) @@ -368,7 +354,7 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, # with Wayback's APIs, but urllib3 logs a warning on every retry: # https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737 - def request(self, method, url, timeout=-1, **kwargs): + def request(self, method, url, timeout=-1, **kwargs) -> InternalHttpResponse: super().request() start_time = time.time() timeout = self.timeout if timeout is -1 else timeout @@ -393,16 +379,16 @@ def request(self, method, url, timeout=-1, **kwargs): if retries >= maximum or not self.should_retry(response): if response.status_code == 429: - read_and_close(response) + response.close() raise RateLimitError(response, retry_delay) return response else: logger.debug('Received error response (status: %s), will retry', response.status_code) - read_and_close(response) + response.close(cache=False) except WaybackSession.handleable_errors as error: response = getattr(error, 'response', None) if response is not None: - read_and_close(response) + response.close() if retries >= maximum: raise WaybackRetryError(retries, time.time() - start_time, error) from error @@ -477,6 +463,8 @@ class WaybackClient(_utils.DepthCountedContext): ---------- session : WaybackSession, optional """ + session: WaybackSession + def __init__(self, session=None): self.session = session or WaybackSession() @@ -705,7 +693,7 @@ def search(self, url, *, match_type=None, limit=1000, offset=None, # worry about it. If we don't raise here, we still want to # close the connection so it doesn't leak when we move onto # the next of results or when this iterator ends. 
- read_and_close(response) + response.close() if response.status_code >= 400: if 'AdministrativeAccessControlException' in response.text: raise BlockedSiteError(query['url']) @@ -930,7 +918,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, current_url, current_date, current_mode = _utils.memento_url_data(response.url) # In view mode, redirects need special handling. - redirect_url = response.next and response.next.url + redirect_url = response.redirect_url if current_mode == Mode.view.value and not redirect_url: redirect_url = detect_view_mode_redirect(response, current_date) if redirect_url: @@ -1006,7 +994,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, playable = True if not playable: - read_and_close(response) + response.close() message = response.headers.get('X-Archive-Wayback-Runtime-Error', '') if ( ('AdministrativeAccessControlException' in message) or @@ -1020,7 +1008,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, raise BlockedByRobotsError(f'{url} is blocked by robots.txt') elif message: raise MementoPlaybackError(f'Memento at {url} could not be played: {message}') - elif response.ok: + elif response.is_success: # TODO: Raise more specific errors for the possible # cases here. 
We *should* only arrive here when # there's a redirect and: @@ -1037,7 +1025,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, if redirect_url: previous_was_memento = is_memento - read_and_close(response) + response.close() # Wayback sometimes has circular memento redirects ¯\_(ツ)_/¯ urls.add(response.url) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index e19b63e..6c733e6 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -4,7 +4,12 @@ """ import logging +import threading +from typing import Optional +from urllib.parse import urljoin import requests +from requests.exceptions import (ChunkedEncodingError, + ContentDecodingError) from urllib3.connectionpool import HTTPConnectionPool logger = logging.getLogger(__name__) @@ -74,6 +79,108 @@ def _new_header_init(self, headers=None, **kwargs): ##################################################################### +# FIXME: This implementation is tied to requests. It should probably be an +# abstract base class and we should have a requests-specific implementation, +# which would make `WaybackHttpAdapter` customizable/pluggable. +class InternalHttpResponse: + """ + Internal wrapper class for HTTP responses. _This should never be exposed to + user code_; it's job is to insulate the rest of the Wayback package from + the particulars of the underlying HTTP tooling (e.g. requests, httpx, etc). + This is *similar* to response objects from httpx and requests, although it + lacks facilities from those libraries that we don't need or use, and takes + shortcuts that are specific to our use cases. 
+ """ + status_code: int + headers: dict + encoding: Optional[str] = None + url: str + links: dict + _read_lock: threading.Lock + _raw: object + _content: Optional[bytes] = None + _redirect_url: Optional[str] = None + + def __init__(self, raw: requests.Response, request_url: str) -> None: + self._read_lock = threading.Lock() + self._raw = raw + self.status_code = raw.status_code + self.headers = raw.headers + self.url = urljoin(request_url, getattr(raw, 'url', '')) + self.encoding = raw.encoding + self.links = raw.links + + @property + def content(self) -> bytes: + with self._read_lock: + if self._content is None: + # TODO: This is designed around the requests library and is not + # generic enough. A better version would either: + # 1. Leave this for subclasses to implement. + # 2. Read iteratively from a `raw` object with a `read(n)` method. + self._content = self._raw.content + + return self._content + + @property + def text(self) -> str: + encoding = self.encoding or self.sniff_encoding() or 'utf-8' + try: + return str(self.content, encoding, errors="replace") + except (LookupError, TypeError): + return str(self.content, errors="replace") + + def sniff_encoding(self) -> None: + # XXX: requests uses chardet here. Consider what we want to use. + ... + + @property + def redirect_url(self) -> str: + """ + The URL this response redirects to. If the response is not a redirect, + this returns an empty string. + """ + if self.status_code >= 300 and self.status_code < 400: + location = self.headers.get('location') + if location: + return urljoin(self.url, location) + + return '' + + @property + def is_success(self) -> bool: + return self.status_code >= 200 and self.status_code < 300 + + # XXX: This needs wrapping with a lock! (Maybe `_read_lock` should be an + # RLock so it can be used both here and in `content`). + def close(self, cache: bool = True) -> None: + """ + Read the rest of the response off the wire and release the connection. 
+ If the full response is not read, the connection can hang and programs + will leak memory (and cause a bad time for the server as well). + + Parameters + ---------- + cache : bool, default: True + Whether to cache the response body so it can still be used via the + ``content`` and ``text`` properties. + """ + if self._raw: + try: + # TODO: if cache is false, it would be better not to try and + # read content at all. + self.content + if not cache: + self._content = '' + except (ChunkedEncodingError, ContentDecodingError, RuntimeError): + with self._read_lock: + self._raw.read(decode_content=False) + finally: + with self._read_lock: + self._raw.close() + self._raw = None + + class WaybackHttpAdapter: """ TODO @@ -91,8 +198,8 @@ def request( headers=None, allow_redirects=True, timeout=None - ) -> requests.Response: - return self._session.request( + ) -> InternalHttpResponse: + response = self._session.request( method=method, url=url, params=params, @@ -100,6 +207,7 @@ def request( allow_redirects=allow_redirects, timeout=timeout ) + return InternalHttpResponse(response, url) def close(self): self._session.close() diff --git a/src/wayback/_utils.py b/src/wayback/_utils.py index 7284aa7..2e3d50f 100644 --- a/src/wayback/_utils.py +++ b/src/wayback/_utils.py @@ -4,8 +4,6 @@ import email.utils import logging import re -import requests -import requests.adapters import threading import time from typing import Union diff --git a/src/wayback/tests/test_client.py b/src/wayback/tests/test_client.py index f236a76..2b9e90b 100644 --- a/src/wayback/tests/test_client.py +++ b/src/wayback/tests/test_client.py @@ -694,7 +694,7 @@ def test_request_retries(self, requests_mock): {'text': 'good', 'status_code': 200}]) session = WaybackSession(retries=2, backoff=0.1) response = session.request('GET', 'http://test.com') - assert response.ok + assert response.is_success session.close() From ccc8f401b9d8c98999522d3e1db44ff4ef4cf08d Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Fri, 29 
Dec 2023 14:21:24 -0800 Subject: [PATCH 05/17] Delegate to requests for encoding sniffing Also re-organize methods a bit --- src/wayback/_http.py | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index 6c733e6..83443a5 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -97,7 +97,7 @@ class InternalHttpResponse: url: str links: dict _read_lock: threading.Lock - _raw: object + _raw: requests.Response _content: Optional[bytes] = None _redirect_url: Optional[str] = None @@ -110,6 +110,23 @@ def __init__(self, raw: requests.Response, request_url: str) -> None: self.encoding = raw.encoding self.links = raw.links + @property + def redirect_url(self) -> str: + """ + The URL this response redirects to. If the response is not a redirect, + this returns an empty string. + """ + if self.status_code >= 300 and self.status_code < 400: + location = self.headers.get('location') + if location: + return urljoin(self.url, location) + + return '' + + @property + def is_success(self) -> bool: + return self.status_code >= 200 and self.status_code < 300 + @property def content(self) -> bytes: with self._read_lock: @@ -131,25 +148,8 @@ def text(self) -> str: return str(self.content, errors="replace") def sniff_encoding(self) -> None: - # XXX: requests uses chardet here. Consider what we want to use. - ... - - @property - def redirect_url(self) -> str: - """ - The URL this response redirects to. If the response is not a redirect, - this returns an empty string. - """ - if self.status_code >= 300 and self.status_code < 400: - location = self.headers.get('location') - if location: - return urljoin(self.url, location) - - return '' - - @property - def is_success(self) -> bool: - return self.status_code >= 200 and self.status_code < 300 + self.content + return self._raw.apparent_encoding # XXX: This needs wrapping with a lock! 
(Maybe `_read_lock` should be an # RLock so it can be used both here and in `content`). @@ -178,7 +178,6 @@ def close(self, cache: bool = True) -> None: finally: with self._read_lock: self._raw.close() - self._raw = None class WaybackHttpAdapter: From 0f3ef224bfc3ca08fb8fd06939189b76891fbd83 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Fri, 29 Dec 2023 14:46:26 -0800 Subject: [PATCH 06/17] Fix locking and caching issues in `close()` --- src/wayback/_http.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index 83443a5..c7b0c17 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -96,13 +96,13 @@ class InternalHttpResponse: encoding: Optional[str] = None url: str links: dict - _read_lock: threading.Lock + _read_lock: threading.RLock _raw: requests.Response _content: Optional[bytes] = None _redirect_url: Optional[str] = None def __init__(self, raw: requests.Response, request_url: str) -> None: - self._read_lock = threading.Lock() + self._read_lock = threading.RLock() self._raw = raw self.status_code = raw.status_code self.headers = raw.headers @@ -151,13 +151,11 @@ def sniff_encoding(self) -> None: self.content return self._raw.apparent_encoding - # XXX: This needs wrapping with a lock! (Maybe `_read_lock` should be an - # RLock so it can be used both here and in `content`). def close(self, cache: bool = True) -> None: """ Read the rest of the response off the wire and release the connection. - If the full response is not read, the connection can hang and programs - will leak memory (and cause a bad time for the server as well). + If the full response is not read, the connection can hang and waste + both local and server resources. Parameters ---------- @@ -165,19 +163,25 @@ def close(self, cache: bool = True) -> None: Whether to cache the response body so it can still be used via the ``content`` and ``text`` properties. 
""" - if self._raw: + with self._read_lock: + # Reading bytes potentially involves decoding data from compressed + # gzip/brotli/etc. responses, so we need to handle those errors by + # continuing to just read the raw data off the socket instead. + # + # This fallback behavior is inspired by requests: + # https://github.com/psf/requests/blob/eedd67462819f8dbf8c1c32e77f9070606605231/requests/sessions.py#L160-L163 + # For urllib3, the appropriate errors to handle would be: + # `(DecodeError, ProtocolError, RuntimeError)` try: - # TODO: if cache is false, it would be better not to try and - # read content at all. - self.content - if not cache: - self._content = '' - except (ChunkedEncodingError, ContentDecodingError, RuntimeError): - with self._read_lock: + if cache: + try: + self.content + except (ChunkedEncodingError, ContentDecodingError, RuntimeError): + self._raw.read(decode_content=False) + else: self._raw.read(decode_content=False) finally: - with self._read_lock: - self._raw.close() + self._raw.close() class WaybackHttpAdapter: From 172e6a6dbdb51f4662fd9298786a4c34e5cdcbe5 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Fri, 29 Dec 2023 15:16:18 -0800 Subject: [PATCH 07/17] Split response API from reqeusts-specific implementation --- src/wayback/_client.py | 6 +-- src/wayback/_http.py | 110 +++++++++++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 41 deletions(-) diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 8c69427..63bb777 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -31,7 +31,7 @@ from warnings import warn from . 
import _utils, __version__ from ._models import CdxRecord, Memento -from ._http import InternalHttpResponse, WaybackHttpAdapter +from ._http import WaybackHttpResponse, WaybackHttpAdapter from .exceptions import (WaybackException, UnexpectedResponseFormat, BlockedByRobotsError, @@ -145,7 +145,7 @@ def is_malformed_url(url): return False -def is_memento_response(response: InternalHttpResponse) -> bool: +def is_memento_response(response: WaybackHttpResponse) -> bool: return 'Memento-Datetime' in response.headers @@ -354,7 +354,7 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, # with Wayback's APIs, but urllib3 logs a warning on every retry: # https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737 - def request(self, method, url, timeout=-1, **kwargs) -> InternalHttpResponse: + def request(self, method, url, timeout=-1, **kwargs) -> WaybackHttpResponse: super().request() start_time = time.time() timeout = self.timeout if timeout is -1 else timeout diff --git a/src/wayback/_http.py b/src/wayback/_http.py index c7b0c17..b1443ff 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -79,36 +79,26 @@ def _new_header_init(self, headers=None, **kwargs): ##################################################################### -# FIXME: This implementation is tied to requests. It should probably be an -# abstract base class and we should have a requests-specific implementation, -# which would make `WaybackHttpAdapter` customizable/pluggable. -class InternalHttpResponse: +class WaybackHttpResponse: """ - Internal wrapper class for HTTP responses. _This should never be exposed to - user code_; it's job is to insulate the rest of the Wayback package from - the particulars of the underlying HTTP tooling (e.g. requests, httpx, etc). 
- This is *similar* to response objects from httpx and requests, although it - lacks facilities from those libraries that we don't need or use, and takes - shortcuts that are specific to our use cases. + Represents an HTTP response from a server. This might be included as an + attribute of an exception, but should otherwise not be exposed to user + code in normal circumstances. It's meant to wrap to provide a standard, + thread-safe interface to response objects from whatever underlying HTTP + tool is being used (e.g. requests, httpx, etc.). """ status_code: int headers: dict encoding: Optional[str] = None url: str links: dict - _read_lock: threading.RLock - _raw: requests.Response - _content: Optional[bytes] = None - _redirect_url: Optional[str] = None - def __init__(self, raw: requests.Response, request_url: str) -> None: - self._read_lock = threading.RLock() - self._raw = raw - self.status_code = raw.status_code - self.headers = raw.headers - self.url = urljoin(request_url, getattr(raw, 'url', '')) - self.encoding = raw.encoding - self.links = raw.links + def __init__(self, url: str, status_code: int, headers: dict, links: dict = None, encoding: str = None): + self.url = url + self.status_code = status_code + self.headers = headers + self.links = links or {} + self.encoding = encoding @property def redirect_url(self) -> str: @@ -125,31 +115,36 @@ def redirect_url(self) -> str: @property def is_success(self) -> bool: + """Whether the status code indicated success (2xx) or an error.""" return self.status_code >= 200 and self.status_code < 300 @property def content(self) -> bytes: - with self._read_lock: - if self._content is None: - # TODO: This is designed around the requests library and is not - # generic enough. A better version would either: - # 1. Leave this for subclasses to implement. - # 2. Read iteratively from a `raw` object with a `read(n)` method. - self._content = self._raw.content - - return self._content + """ + The response body as bytes. 
This is the *decompressed* bytes, so + responses with `Content-Encoding: gzip` will be unzipped here. + """ + raise NotImplementedError() @property def text(self) -> str: + """ + Gets the response body as a text string. it will try to decode the raw + bytes of the response based on response's declared encoding (i.e. from + the ``Content-Type`` header), falling back to sniffing the encoding or + using UTF-8. + """ encoding = self.encoding or self.sniff_encoding() or 'utf-8' try: return str(self.content, encoding, errors="replace") except (LookupError, TypeError): return str(self.content, errors="replace") - def sniff_encoding(self) -> None: - self.content - return self._raw.apparent_encoding + def sniff_encoding(self) -> Optional[str]: + """ + Sniff the text encoding from the raw bytes of the content, if possible. + """ + return None def close(self, cache: bool = True) -> None: """ @@ -163,6 +158,45 @@ def close(self, cache: bool = True) -> None: Whether to cache the response body so it can still be used via the ``content`` and ``text`` properties. """ + raise NotImplementedError() + + +class WaybackRequestsResponse(WaybackHttpResponse): + """ + Wraps an HTTP response from the requests library. + """ + _read_lock: threading.RLock + _raw: requests.Response + _content: Optional[bytes] = None + + def __init__(self, raw: requests.Response, request_url: str) -> None: + # NOTE: if we drop down to urllib3, we need the requested URL to be + # passed in so we can join it with the response's URL (in urllib3, + # `response.url` does not include the scheme/host/etc data that belongs + # to the connection pool). 
+ super().__init__( + url=urljoin(request_url, getattr(raw, 'url', '')), + status_code=raw.status_code, + headers=raw.headers, + links=raw.links, + encoding=raw.encoding + ) + self._read_lock = threading.RLock() + self._raw = raw + + @property + def content(self) -> bytes: + with self._read_lock: + if self._content is None: + self._content = self._raw.content + + return self._content + + def sniff_encoding(self) -> None: + self.content + return self._raw.apparent_encoding + + def close(self, cache: bool = True) -> None: with self._read_lock: # Reading bytes potentially involves decoding data from compressed # gzip/brotli/etc. responses, so we need to handle those errors by @@ -177,9 +211,9 @@ def close(self, cache: bool = True) -> None: try: self.content except (ChunkedEncodingError, ContentDecodingError, RuntimeError): - self._raw.read(decode_content=False) + self._raw.raw.read(decode_content=False) else: - self._raw.read(decode_content=False) + self._raw.raw.read(decode_content=False) finally: self._raw.close() @@ -201,7 +235,7 @@ def request( headers=None, allow_redirects=True, timeout=None - ) -> InternalHttpResponse: + ) -> WaybackHttpResponse: response = self._session.request( method=method, url=url, @@ -210,7 +244,7 @@ def request( allow_redirects=allow_redirects, timeout=timeout ) - return InternalHttpResponse(response, url) + return WaybackRequestsResponse(response, url) def close(self): self._session.close() From 4843ed4e368c529b4b8dc4c33c3b30cadc1993e9 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Fri, 29 Dec 2023 15:35:30 -0800 Subject: [PATCH 08/17] Add docs and type annotations to adapters --- src/wayback/_http.py | 54 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index b1443ff..b8d208f 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -5,7 +5,7 @@ import logging import threading -from typing import Optional +from typing import 
Optional, Tuple, Union
 from urllib.parse import urljoin
 import requests
 from requests.exceptions import (ChunkedEncodingError,
@@ -169,13 +169,13 @@ class WaybackRequestsResponse(WaybackHttpResponse):
     _raw: requests.Response
     _content: Optional[bytes] = None

-    def __init__(self, raw: requests.Response, request_url: str) -> None:
+    def __init__(self, raw: requests.Response) -> None:
         # NOTE: if we drop down to urllib3, we need the requested URL to be
         # passed in so we can join it with the response's URL (in urllib3,
         # `response.url` does not include the scheme/host/etc data that belongs
         # to the connection pool).
         super().__init__(
-            url=urljoin(request_url, getattr(raw, 'url', '')),
+            url=raw.url,
             status_code=raw.status_code,
             headers=raw.headers,
             links=raw.links,
@@ -220,22 +220,54 @@ def close(self, cache: bool = True) -> None:

 class WaybackHttpAdapter:
     """
-    TODO
+    Handles making actual HTTP requests over the network. For now, this is an
+    implementation detail, but it may be a way for users to customize behavior
+    in the future.
     """

     def __init__(self) -> None:
         self._session = requests.Session()

+    # FIXME: remove `allow_redirects`! Redirection needs to be handled by
+    # whatever does throttling, which is currently not here.
     def request(
         self,
-        method,
-        url,
+        method: str,
+        url: str,
         *,
-        params=None,
-        headers=None,
-        allow_redirects=True,
-        timeout=None
+        params: dict = None,
+        headers: dict = None,
+        allow_redirects: bool = True,
+        timeout: Union[int, Tuple[int, int]] = None
     ) -> WaybackHttpResponse:
+        """
+        Send an HTTP request and return a :class:`WaybackHttpResponse` object
+        representing the response.
+
+        Parameters
+        ----------
+        method : str
+            Method to use for the request, e.g. ``'GET'``, ``'POST'``.
+        url : str
+            The URL to request.
+        params : dict, optional
+            For POST/PUT requests, data to be encoded in the request body. For
+            other methods, this should be encoded as the querystring.
+ headers : dict, optional + The HTTP headers to send with the request. + allow_redirects : bool, default: True + Whether to follow redirects before returning a response. + timeout : int or float or tuple of (int or float, int or float), optional + How long to wait, in seconds, before timing out. If this is a single + number, it will be used as both a connect timeout (how long to wait + for the first bit of response data) and a read timeout (how long + to wait between bits of response data). If a tuple, the values will + be used as the connect and read timeouts, respectively. + + Returns + ------- + WaybackHttpResponse + """ response = self._session.request( method=method, url=url, @@ -244,7 +276,7 @@ def request( allow_redirects=allow_redirects, timeout=timeout ) - return WaybackRequestsResponse(response, url) + return WaybackRequestsResponse(response) def close(self): self._session.close() From 9d9a16c8b78bb2738a66324259a3504183736792 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Tue, 2 Jan 2024 11:32:08 -0800 Subject: [PATCH 09/17] Break Session functionality into separate methods Sessions currently handle rate limiting and retries (and *need* to handle redirects so those get rate limited; it's a bug that they do not) and their implementation is a bit complicated. This separates out each of those jobs into a separate method to make management of those separate (but related) tasks a bit clearer. This should make the code a little easier to change, which will probably happen next because I think I want to remove `WaybackSession` entirely. 
--- src/wayback/_client.py | 73 ++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 63bb777..11440e4 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -341,40 +341,43 @@ def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, '/': _utils.RateLimit.make_limit(memento_calls_per_second), } self.adapter = WaybackHttpAdapter() - # NOTE: the nice way to accomplish retry/backoff is with a urllib3: - # adapter = requests.adapters.HTTPAdapter( - # max_retries=Retry(total=5, backoff_factor=2, - # status_forcelist=(503, 504))) - # self.mount('http://', adapter) - # But Wayback mementos can have errors, which complicates things. See: - # https://github.com/urllib3/urllib3/issues/1445#issuecomment-422950868 - # - # Also note that, if we are ever able to switch to that, we may need to - # get more fancy with log filtering, since we *expect* lots of retries - # with Wayback's APIs, but urllib3 logs a warning on every retry: - # https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737 def request(self, method, url, timeout=-1, **kwargs) -> WaybackHttpResponse: super().request() - start_time = time.time() timeout = self.timeout if timeout is -1 else timeout + + # We add a lot of customization around rate limits, retries, etc, so + # we need to call down through that stack of tooling: + # + # _request_redirectable (handles redirects) + # └> _request_retryable (handles retries for errors) + # └> _request_rate_limited (handles rate limiting/throttling) + # └> _request_send (handles actual HTTP) + # + return self._request_redirectable(method, url, timeout=timeout, **kwargs) + + def _request_redirectable(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: + # FIXME: this method should implement redirect following (if + # `kwargs['allow_redirects']` is true), rather than 
passing it to the + # underlying implementation, since redirects need to be rate limited. + return self._request_retryable(method, url, timeout=timeout, **kwargs) + + # NOTE: the nice way to accomplish retry/backoff is with a urllib3: + # adapter = requests.adapters.HTTPAdapter( + # max_retries=Retry(total=5, backoff_factor=2, + # status_forcelist=(503, 504))) + # self.mount('http://', adapter) + # But Wayback mementos (which shouldn't be retries) can be errors, which + # complicates things. See: https://github.com/urllib3/urllib3/issues/1445 + def _request_retryable(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: + start_time = time.time() maximum = self.retries retries = 0 - parsed_url = urlparse(url) - for path, limit in self.rate_limts.items(): - if parsed_url.path.startswith(path): - rate_limit = limit - break - else: - rate_limit = DEFAULT_MEMENTO_RATE_LIMIT - while True: retry_delay = 0 try: - logger.debug('sending HTTP request %s "%s", %s', method, url, kwargs) - rate_limit.wait() - response = self.adapter.request(method, url, timeout=timeout, **kwargs) + response = self._request_rate_limited(method, url, timeout=timeout, **kwargs) retry_delay = self.get_retry_delay(retries, response) if retries >= maximum or not self.should_retry(response): @@ -402,6 +405,28 @@ def request(self, method, url, timeout=-1, **kwargs) -> WaybackHttpResponse: time.sleep(retry_delay) retries += 1 + # Ideally, this would be implemented as a custom `requests.HTTPAdapter` so + # we could take advantage of requests/urllib3 built-in functionality for + # redirects, retries, etc. However, retries can't be implemented correctly + # using the built-in tools (https://github.com/urllib3/urllib3/issues/1445). + # If we have to implement retries at this level, it's easier to also + # implement rate limiting here, too. 
+ def _request_rate_limited(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: + parsed_url = urlparse(url) + for path, limit in self.rate_limts.items(): + if parsed_url.path.startswith(path): + rate_limit = limit + break + else: + rate_limit = DEFAULT_MEMENTO_RATE_LIMIT + + rate_limit.wait() + return self._request_send(method, url, timeout=timeout, **kwargs) + + def _request_send(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: + logger.debug('sending HTTP request %s "%s", %s', method, url, kwargs) + return self.adapter.request(method, url, timeout=timeout, **kwargs) + def should_retry(self, response): # A memento may actually be a capture of an error, so don't retry it :P if is_memento_response(response): From de289813eda523b7781b1fde3f9bdf22fa2fb5c5 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Tue, 2 Jan 2024 12:04:57 -0800 Subject: [PATCH 10/17] Clarify redirect_url docs --- src/wayback/_http.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index b8d208f..5632096 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -103,7 +103,8 @@ def __init__(self, url: str, status_code: int, headers: dict, links: dict = None @property def redirect_url(self) -> str: """ - The URL this response redirects to. If the response is not a redirect, + The absolute URL this response redirects to. It will always be a + complete URL with a scheme and host. If the response is not a redirect, this returns an empty string. 
""" if self.status_code >= 300 and self.status_code < 400: From f2eb7e583b048c4671fe75df6a833aca1d15796a Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Tue, 2 Jan 2024 12:16:25 -0800 Subject: [PATCH 11/17] Rename allow_redirects -> follow_redirects --- src/wayback/_client.py | 4 ++-- src/wayback/_http.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 11440e4..3c11996 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -936,7 +936,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, urls = set() previous_was_memento = False - response = self.session.request('GET', url, allow_redirects=False) + response = self.session.request('GET', url, follow_redirects=False) protocol_and_www = re.compile(r'^https?://(www\d?\.)?') memento = None while True: @@ -1062,7 +1062,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, debug_history.append(response.url) if is_memento: history.append(memento) - response = self.session.request('GET', redirect_url, allow_redirects=False) + response = self.session.request('GET', redirect_url, follow_redirects=False) else: break diff --git a/src/wayback/_http.py b/src/wayback/_http.py index 5632096..3c8e10d 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -238,7 +238,7 @@ def request( *, params: dict = None, headers: dict = None, - allow_redirects: bool = True, + follow_redirects: bool = True, timeout: Union[int, Tuple[int, int]] = None ) -> WaybackHttpResponse: """ @@ -256,7 +256,7 @@ def request( other methods, this should be encoded as the querystring. headers : dict, optional The HTTP headers to send with the request. - allow_redirects : bool, default: True + follow_redirects : bool, default: True Whether to follow redirects before returning a response. timeout : int or float or tuple of (int or float, int or float), optional How long to wait, in seconds, before timing out. 
If this is a single @@ -274,7 +274,7 @@ def request( url=url, params=params, headers=headers, - allow_redirects=allow_redirects, + allow_redirects=follow_redirects, timeout=timeout ) return WaybackRequestsResponse(response) From 086e034d3545788cf02611bfe24e38fb9ec80100 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 4 Jan 2024 14:32:10 -0800 Subject: [PATCH 12/17] Limit `noqa` to just the unused import rule --- src/wayback/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/wayback/__init__.py b/src/wayback/__init__.py index 1a8b5f2..d443eec 100644 --- a/src/wayback/__init__.py +++ b/src/wayback/__init__.py @@ -1,12 +1,12 @@ from ._version import __version__, __version_tuple__ # noqa: F401 -from ._utils import memento_url_data, RateLimit # noqa +from ._utils import memento_url_data, RateLimit # noqa: F401 -from ._models import ( # noqa +from ._models import ( # noqa: F401 CdxRecord, Memento) -from ._client import ( # noqa +from ._client import ( # noqa: F401 Mode, WaybackClient, WaybackSession) From 2f48476b99acee71c07141de5e3587db8e6f0aa4 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 4 Jan 2024 15:44:25 -0800 Subject: [PATCH 13/17] Add user agent test I think we actually broke this functionality somewhere, so it needs a test --- src/wayback/tests/test_client.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/wayback/tests/test_client.py b/src/wayback/tests/test_client.py index 2b9e90b..b95b685 100644 --- a/src/wayback/tests/test_client.py +++ b/src/wayback/tests/test_client.py @@ -671,7 +671,7 @@ def test_get_memento_returns_memento_with_accurate_url(): assert memento.url == 'https://www.fws.gov/' -def return_timeout(self, *args, **kwargs) -> requests.Response: +def return_timeout(request, **kwargs) -> requests.Response: """ Patch requests.Session.send with this in order to return a response with the provided timeout value as the response body. 
@@ -687,6 +687,23 @@ def return_timeout(self, *args, **kwargs) -> requests.Response: return res +def return_user_agent(request, **kwargs) -> requests.Response: + """ + Patch requests.Session.send with this in order to return a response with + the provided ``User-Agent`` header value as the response body. + + Usage: + >>> @mock.patch('requests.Session.send', side_effect=return_user_agent) + >>> def test_timeout(self, mock_class): + >>> assert requests.get('http://test.com', + >>> headers={'User-Agent': 'x'}).text == 'x' + """ + response = requests.Response() + response.status_code = 200 + response._content = str(request.headers.get('User-Agent', None)).encode() + return response + + class TestWaybackSession: def test_request_retries(self, requests_mock): requests_mock.get('http://test.com', [{'text': 'bad1', 'status_code': 503}, @@ -762,6 +779,12 @@ def test_timeout_empty(self, mock_class): res = session.request('GET', 'http://test.com', timeout=1) assert res.text == '1' + @mock.patch('requests.Session.send', side_effect=return_user_agent) + def test_user_agent(self, mock_class): + adapter = WaybackSession() + agent = adapter.request('GET', 'http://test.com').text + assert agent.startswith('wayback/') + @ia_vcr.use_cassette() def test_search_rate_limits(self): # The timing relies on the cassettes being present, From f43ba0a24b2cfb07eb92d663a5cda0432522b55c Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 4 Jan 2024 15:54:34 -0800 Subject: [PATCH 14/17] Move all WaybackSession functionality to adapters --- docs/source/usage.rst | 7 +- src/wayback/__init__.py | 5 +- src/wayback/_client.py | 280 ++------------------------ src/wayback/_http.py | 332 +++++++++++++++++++++++++++++-- src/wayback/_utils.py | 10 +- src/wayback/exceptions.py | 4 +- src/wayback/tests/test_client.py | 49 ++--- 7 files changed, 378 insertions(+), 309 deletions(-) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index e4eb9bf..900f0b5 100644 --- a/docs/source/usage.rst +++ 
b/docs/source/usage.rst @@ -118,9 +118,10 @@ Memento API. We implement a Python client that can speak both. .. automethod:: close .. automethod:: parse_memento_headers -.. autoclass:: wayback.WaybackSession +.. autoclass:: wayback.WaybackHttpAdapter - .. automethod:: reset + .. automethod:: request + .. automethod:: close Utilities --------- @@ -150,4 +151,4 @@ Exception Classes .. autoclass:: wayback.exceptions.WaybackRetryError -.. autoclass:: wayback.exceptions.SessionClosedError +.. autoclass:: wayback.exceptions.AlreadyClosedError diff --git a/src/wayback/__init__.py b/src/wayback/__init__.py index d443eec..125d50d 100644 --- a/src/wayback/__init__.py +++ b/src/wayback/__init__.py @@ -8,5 +8,6 @@ from ._client import ( # noqa: F401 Mode, - WaybackClient, - WaybackSession) + WaybackClient) + +from ._http import WaybackHttpAdapter # noqa: F401 diff --git a/src/wayback/_client.py b/src/wayback/_client.py index 3c11996..e971ad6 100644 --- a/src/wayback/_client.py +++ b/src/wayback/_client.py @@ -19,27 +19,18 @@ import hashlib import logging import re -from requests.exceptions import (ConnectionError, - ProxyError, - RetryError, - Timeout) -import time -from urllib.parse import urljoin, urlparse -from urllib3.exceptions import (ConnectTimeoutError, - MaxRetryError, - ReadTimeoutError) +from urllib.parse import urljoin from warnings import warn -from . import _utils, __version__ +from . import _utils from ._models import CdxRecord, Memento -from ._http import WaybackHttpResponse, WaybackHttpAdapter +from ._http import (WaybackRequestsAdapter, + WaybackHttpAdapter) from .exceptions import (WaybackException, UnexpectedResponseFormat, BlockedByRobotsError, BlockedSiteError, MementoPlaybackError, - NoMementoError, - WaybackRetryError, - RateLimitError) + NoMementoError) logger = logging.getLogger(__name__) @@ -67,16 +58,6 @@ # Make sure it roughly starts with a valid protocol + domain + port? 
URL_ISH = re.compile(r'^[\w+\-]+://[^/?=&]+\.\w\w+(:\d+)?(/|$)') -# Global default rate limits for various endpoints. Internet Archive folks have -# asked us to set the defaults at 80% of the hard limits. -DEFAULT_CDX_RATE_LIMIT = _utils.RateLimit(0.8 * 60 / 60) -DEFAULT_TIMEMAP_RATE_LIMIT = _utils.RateLimit(0.8 * 100 / 60) -DEFAULT_MEMENTO_RATE_LIMIT = _utils.RateLimit(0.8 * 600 / 60) - -# If a rate limit response (i.e. a response with status == 429) does not -# include a `Retry-After` header, recommend pausing for this long. -DEFAULT_RATE_LIMIT_DELAY = 60 - class Mode(Enum): """ @@ -145,10 +126,6 @@ def is_malformed_url(url): return False -def is_memento_response(response: WaybackHttpResponse) -> bool: - return 'Memento-Datetime' in response.headers - - def cdx_hash(content): if isinstance(content, str): content = content.encode() @@ -256,249 +233,30 @@ def clean_memento_links(links, mode): return result -class WaybackSession(_utils.DisableAfterCloseSession): - """ - Sessions manage HTTP requests and network traffic to web servers, adding - functionality like retries and rate limiting. - - Parameters - ---------- - retries : int, default: 6 - The maximum number of retries for requests. - backoff : int or float, default: 2 - Number of seconds from which to calculate how long to back off and wait - when retrying requests. The first retry is always immediate, but - subsequent retries increase by powers of 2: - - seconds = backoff * 2 ^ (retry number - 1) - - So if this was `4`, retries would happen after the following delays: - 0 seconds, 4 seconds, 8 seconds, 16 seconds, ... - timeout : int or float or tuple of (int or float, int or float), default: 60 - A timeout to use for all requests. - See the Requests docs for more: - https://docs.python-requests.org/en/master/user/advanced/#timeouts - user_agent : str, optional - A custom user-agent string to use in all requests. 
Defaults to: - `wayback/{version} (+https://github.com/edgi-govdata-archiving/wayback)` - search_calls_per_second : wayback.RateLimit or int or float, default: 0.8 - The maximum number of calls per second made to the CDX search API. - To disable the rate limit, set this to 0. - - To have multiple sessions share a rate limit (so requests made by one - session count towards the limit of the other session), use a - single :class:`wayback.RateLimit` instance and pass it to each - ``WaybackSession`` instance. If you do not set a limit, the default - limit is shared globally across all sessions. - memento_calls_per_second : wayback.RateLimit or int or float, default: 8 - The maximum number of calls per second made to the memento API. - To disable the rate limit, set this to 0. - - To have multiple sessions share a rate limit (so requests made by one - session count towards the limit of the other session), use a - single :class:`wayback.RateLimit` instance and pass it to each - ``WaybackSession`` instance. If you do not set a limit, the default - limit is shared globally across all sessions. - timemap_calls_per_second : wayback.RateLimit or int or float, default: 1.33 - The maximum number of calls per second made to the timemap API (the - Wayback Machine's new, beta CDX search is part of the timemap API). - To disable the rate limit, set this to 0. - - To have multiple sessions share a rate limit (so requests made by one - session count towards the limit of the other session), use a - single :class:`wayback.RateLimit` instance and pass it to each - ``WaybackSession`` instance. If you do not set a limit, the default - limit is shared globally across all sessions. - """ - - # It seems Wayback sometimes produces 500 errors for transient issues, so - # they make sense to retry here. Usually not in other contexts, though. 
- retryable_statuses = frozenset((413, 421, 500, 502, 503, 504, 599)) - - retryable_errors = (ConnectTimeoutError, MaxRetryError, ReadTimeoutError, - ProxyError, RetryError, Timeout) - # Handleable errors *may* be retryable, but need additional logic beyond - # just the error type. See `should_retry_error()`. - handleable_errors = (ConnectionError,) + retryable_errors - - def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, - search_calls_per_second=DEFAULT_CDX_RATE_LIMIT, - memento_calls_per_second=DEFAULT_MEMENTO_RATE_LIMIT, - timemap_calls_per_second=DEFAULT_TIMEMAP_RATE_LIMIT): - super().__init__() - self.retries = retries - self.backoff = backoff - self.timeout = timeout - self.headers = { - 'User-Agent': (user_agent or - f'wayback/{__version__} (+https://github.com/edgi-govdata-archiving/wayback)'), - 'Accept-Encoding': 'gzip, deflate' - } - self.rate_limts = { - '/web/timemap': _utils.RateLimit.make_limit(timemap_calls_per_second), - '/cdx': _utils.RateLimit.make_limit(search_calls_per_second), - # The memento limit is actually a generic Wayback limit. 
- '/': _utils.RateLimit.make_limit(memento_calls_per_second), - } - self.adapter = WaybackHttpAdapter() - - def request(self, method, url, timeout=-1, **kwargs) -> WaybackHttpResponse: - super().request() - timeout = self.timeout if timeout is -1 else timeout - - # We add a lot of customization around rate limits, retries, etc, so - # we need to call down through that stack of tooling: - # - # _request_redirectable (handles redirects) - # └> _request_retryable (handles retries for errors) - # └> _request_rate_limited (handles rate limiting/throttling) - # └> _request_send (handles actual HTTP) - # - return self._request_redirectable(method, url, timeout=timeout, **kwargs) - - def _request_redirectable(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: - # FIXME: this method should implement redirect following (if - # `kwargs['allow_redirects']` is true), rather than passing it to the - # underlying implementation, since redirects need to be rate limited. - return self._request_retryable(method, url, timeout=timeout, **kwargs) - - # NOTE: the nice way to accomplish retry/backoff is with a urllib3: - # adapter = requests.adapters.HTTPAdapter( - # max_retries=Retry(total=5, backoff_factor=2, - # status_forcelist=(503, 504))) - # self.mount('http://', adapter) - # But Wayback mementos (which shouldn't be retries) can be errors, which - # complicates things. 
See: https://github.com/urllib3/urllib3/issues/1445 - def _request_retryable(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: - start_time = time.time() - maximum = self.retries - retries = 0 - - while True: - retry_delay = 0 - try: - response = self._request_rate_limited(method, url, timeout=timeout, **kwargs) - retry_delay = self.get_retry_delay(retries, response) - - if retries >= maximum or not self.should_retry(response): - if response.status_code == 429: - response.close() - raise RateLimitError(response, retry_delay) - return response - else: - logger.debug('Received error response (status: %s), will retry', response.status_code) - response.close(cache=False) - except WaybackSession.handleable_errors as error: - response = getattr(error, 'response', None) - if response is not None: - response.close() - - if retries >= maximum: - raise WaybackRetryError(retries, time.time() - start_time, error) from error - elif self.should_retry_error(error): - retry_delay = self.get_retry_delay(retries, response) - logger.info('Caught exception during request, will retry: %s', error) - else: - raise - - logger.debug('Will retry after sleeping for %s seconds...', retry_delay) - time.sleep(retry_delay) - retries += 1 - - # Ideally, this would be implemented as a custom `requests.HTTPAdapter` so - # we could take advantage of requests/urllib3 built-in functionality for - # redirects, retries, etc. However, retries can't be implemented correctly - # using the built-in tools (https://github.com/urllib3/urllib3/issues/1445). - # If we have to implement retries at this level, it's easier to also - # implement rate limiting here, too. 
- def _request_rate_limited(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: - parsed_url = urlparse(url) - for path, limit in self.rate_limts.items(): - if parsed_url.path.startswith(path): - rate_limit = limit - break - else: - rate_limit = DEFAULT_MEMENTO_RATE_LIMIT - - rate_limit.wait() - return self._request_send(method, url, timeout=timeout, **kwargs) - - def _request_send(self, method, url, timeout, **kwargs) -> WaybackHttpResponse: - logger.debug('sending HTTP request %s "%s", %s', method, url, kwargs) - return self.adapter.request(method, url, timeout=timeout, **kwargs) - - def should_retry(self, response): - # A memento may actually be a capture of an error, so don't retry it :P - if is_memento_response(response): - return False - - return response.status_code in self.retryable_statuses - - def should_retry_error(self, error): - if isinstance(error, WaybackSession.retryable_errors): - return True - elif isinstance(error, ConnectionError): - # ConnectionErrors from requests actually wrap a whole family of - # more detailed errors from urllib3, so we need to do some string - # checking to determine whether the error is retryable. - text = str(error) - # NOTE: we have also seen this, which may warrant retrying: - # `requests.exceptions.ConnectionError: ('Connection aborted.', - # RemoteDisconnected('Remote end closed connection without - # response'))` - if 'NewConnectionError' in text or 'Max retries' in text: - return True - - return False - - def get_retry_delay(self, retries, response=None): - delay = 0 - - # As of 2023-11-27, the Wayback Machine does not set a `Retry-After` - # header, so this parsing is just future-proofing. - if response is not None: - delay = _utils.parse_retry_after(response.headers.get('Retry-After')) or delay - if response.status_code == 429 and delay == 0: - delay = DEFAULT_RATE_LIMIT_DELAY - - # No default backoff on the first retry. 
- if retries > 0: - delay = max(self.backoff * 2 ** (retries - 1), delay) - - return delay - - def reset(self): - "Reset any network connections the session is using." - self.close(disable=False) - self.adapter.close() - - -# TODO: add retry, backoff, cross_thread_backoff, and rate_limit options that -# create a custom instance of urllib3.utils.Retry? class WaybackClient(_utils.DepthCountedContext): """ A client for retrieving data from the Internet Archive's Wayback Machine. You can use a WaybackClient as a context manager. When exiting, it will - close the session it's using (if you've passed in a custom session, make + close the adapter it's using (if you've passed in a custom adapter, make sure not to use the context manager functionality unless you want to live dangerously). Parameters ---------- - session : WaybackSession, optional + adapter : WaybackHttpAdapter, optional """ - session: WaybackSession + adapter: WaybackHttpAdapter - def __init__(self, session=None): - self.session = session or WaybackSession() + def __init__(self, adapter=None): + self.adapter = adapter or WaybackRequestsAdapter() def __exit_all__(self, type, value, traceback): self.close() def close(self): - "Close the client's session." - self.session.close() + "Close the client's adapter." + self.adapter.close() def search(self, url, *, match_type=None, limit=1000, offset=None, fast_latest=None, from_date=None, to_date=None, @@ -709,7 +467,7 @@ def search(self, url, *, match_type=None, limit=1000, offset=None, previous_result = None while next_query: sent_query, next_query = next_query, None - response = self.session.request('GET', CDX_SEARCH_URL, + response = self.adapter.request('GET', CDX_SEARCH_URL, params=sent_query) # Read/cache the response and close straightaway. 
If we need @@ -936,7 +694,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, urls = set() previous_was_memento = False - response = self.session.request('GET', url, follow_redirects=False) + response = self.adapter.request('GET', url, follow_redirects=False) protocol_and_www = re.compile(r'^https?://(www\d?\.)?') memento = None while True: @@ -952,8 +710,6 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, '%a, %d %b %Y %H:%M:%S %Z' ) - is_memento = is_memento_response(response) - # A memento URL will match possible captures based on its SURT # form, which means we might be getting back a memento captured # from a different URL than the one specified in the request. @@ -961,7 +717,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, if response.links and ('original' in response.links): current_url = response.links['original']['url'] - if is_memento: + if response.is_memento: links = clean_memento_links(response.links, mode) memento = Memento(url=current_url, timestamp=current_date, @@ -1049,7 +805,7 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, f'memento at {url}') if redirect_url: - previous_was_memento = is_memento + previous_was_memento = response.is_memento response.close() # Wayback sometimes has circular memento redirects ¯\_(ツ)_/¯ @@ -1060,9 +816,9 @@ def get_memento(self, url, timestamp=None, mode=Mode.original, *, # All requests are included in `debug_history`, but # `history` only shows redirects that were mementos. 
debug_history.append(response.url) - if is_memento: + if response.is_memento: history.append(memento) - response = self.session.request('GET', redirect_url, follow_redirects=False) + response = self.adapter.request('GET', redirect_url, follow_redirects=False) else: break diff --git a/src/wayback/_http.py b/src/wayback/_http.py index 3c8e10d..c0a7a8e 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -4,16 +4,39 @@ """ import logging +from numbers import Real import threading -from typing import Optional, Tuple, Union -from urllib.parse import urljoin +import time +from typing import Dict, Optional, Tuple, Union +from urllib.parse import urljoin, urlparse import requests from requests.exceptions import (ChunkedEncodingError, - ContentDecodingError) + ConnectionError, + ContentDecodingError, + ProxyError, + RetryError, + Timeout) from urllib3.connectionpool import HTTPConnectionPool +from urllib3.exceptions import (ConnectTimeoutError, + MaxRetryError, + ReadTimeoutError) +from . import __version__ +from ._utils import DisableAfterCloseAdapter, RateLimit, parse_retry_after +from .exceptions import RateLimitError, WaybackRetryError logger = logging.getLogger(__name__) +# Global default rate limits for various endpoints. Internet Archive folks have +# asked us to set the defaults at 80% of the hard limits. +DEFAULT_CDX_RATE_LIMIT = RateLimit(0.8 * 60 / 60) +DEFAULT_TIMEMAP_RATE_LIMIT = RateLimit(0.8 * 100 / 60) +DEFAULT_MEMENTO_RATE_LIMIT = RateLimit(0.8 * 600 / 60) + +# If a rate limit response (i.e. a response with status == 429) does not +# include a `Retry-After` header, recommend pausing for this long. +DEFAULT_RATE_LIMIT_DELAY = 60 + + ##################################################################### # HACK: handle malformed Content-Encoding headers from Wayback. 
# When you send `Accept-Encoding: gzip` on a request for a memento, Wayback @@ -119,6 +142,10 @@ def is_success(self) -> bool: """Whether the status code indicated success (2xx) or an error.""" return self.status_code >= 200 and self.status_code < 300 + @property + def is_memento(self) -> bool: + return 'Memento-Datetime' in self.headers + @property def content(self) -> bytes: """ @@ -221,16 +248,10 @@ def close(self, cache: bool = True) -> None: class WaybackHttpAdapter: """ - Handles making actual HTTP requests over the network. For now, this is an - implementation detail, but it may be a way for users to customize behavior - in the future. + Handles making actual HTTP requests over the network. This is an abstract + base class that defines the API an adapter must implement. """ - def __init__(self) -> None: - self._session = requests.Session() - - # FIXME: remove `allow_redirects`! Redirection needs to be handled by - # whatever does throttling, which is currently not here. def request( self, method: str, @@ -269,6 +290,35 @@ def request( ------- WaybackHttpResponse """ + raise NotImplementedError() + + def close(self) -> None: + """ + Close the adapter and release any long-lived resources, like pooled + HTTP connections. + """ + ... + + +class RequestsAdapter(WaybackHttpAdapter): + """ + Wrap the requests library for making HTTP requests. 
+ """ + + def __init__(self) -> None: + self._session = requests.Session() + + def request( + self, + method: str, + url: str, + *, + params: dict = None, + headers: dict = None, + follow_redirects: bool = True, + timeout: Union[int, Tuple[int, int]] = None + ) -> WaybackHttpResponse: + logger.debug('Sending HTTP request <%s "%s">', method, url) response = self._session.request( method=method, url=url, @@ -279,5 +329,263 @@ def request( ) return WaybackRequestsResponse(response) - def close(self): + def close(self) -> None: self._session.close() + + +class RetryAndRateLimitAdapter(WaybackHttpAdapter): + """ + Adds rate limiting and retry functionality to an HTTP adapter. This class + can't actually make HTTP requests and should usually be used in a + multiple-inheritance situation. Alternatively, you can override the + ``request_raw()`` method. + + Parameters + ---------- + retries : int, default: 6 + The maximum number of retries for failed HTTP requests. + backoff : int or float, default: 2 + Number of seconds from which to calculate how long to back off and wait + when retrying requests. The first retry is always immediate, but + subsequent retries increase by powers of 2: + + seconds = backoff * 2 ^ (retry number - 1) + + So if this was `4`, retries would happen after the following delays: + 0 seconds, 4 seconds, 8 seconds, 16 seconds, ... + timeout : int or float or tuple of (int or float, int or float), default: 60 + A timeout to use for all requests. + See the Requests docs for more: + https://docs.python-requests.org/en/master/user/advanced/#timeouts + rate_limits : dict of (str, RateLimit) + The rate limits that should be applied to different URL paths. The keys + are URL path prefixes, e.g. ``"/cdx/search/"``, and the values are + :class:`wayback.RateLimit` objects. When requests are made, the rate + limit from the most specific matching path is used and execution will + pause to ensure the rate limit is not exceeded. 
+
+
+    Examples
+    --------
+
+    Usage via multiple inheritance:
+
+    >>> class CombinedAdapter(RetryAndRateLimitAdapter, SomeActualHttpAdapterWithoutRateLimits):
+    >>>     ...
+
+    Usage via override:
+
+    >>> class MyHttpAdapter(RetryAndRateLimitAdapter):
+    >>>     def request_raw(
+    >>>         self,
+    >>>         method: str,
+    >>>         url: str,
+    >>>         *,
+    >>>         params: dict = None,
+    >>>         headers: dict = None,
+    >>>         follow_redirects: bool = True,
+    >>>         timeout: Union[int, Tuple[int, int]] = None
+    >>>     ) -> WaybackHttpResponse:
+    >>>         response = urllib.urlopen(...)
+    >>>         return make_response_from_urllib(response)
+    """
+
+    rate_limits: Dict[str, RateLimit]
+
+    # It seems Wayback sometimes produces 500 errors for transient issues, so
+    # they make sense to retry here. Usually not in other contexts, though.
+    retryable_statuses = frozenset((413, 421, 500, 502, 503, 504, 599))
+
+    # XXX: Some of these are requests-specific and should move to WaybackRequestsAdapter.
+    retryable_errors = (ConnectTimeoutError, MaxRetryError, ReadTimeoutError,
+                        ProxyError, RetryError, Timeout)
+    # Handleable errors *may* be retryable, but need additional logic beyond
+    # just the error type. See `should_retry_error()`.
+    handleable_errors = (ConnectionError,) + retryable_errors
+
+    def __init__(
+        self,
+        retries: int = 6,
+        backoff: Real = 2,
+        rate_limits: Dict[str, RateLimit] = {}
+    ) -> None:
+        super().__init__()
+        self.retries = retries
+        self.backoff = backoff
+        # Sort rate limits by longest path first, so we always match the most
+        # specific path when looking for the right rate limit on any given URL.
+        self.rate_limits = {path: rate_limits[path]
+                            for path in sorted(rate_limits.keys(),
+                                               key=lambda k: len(k),
+                                               reverse=True)}
+
+    # The implementation of different features is split up by method here, so
+    # `request()` calls down through a stack of overrides:
+    #
+    # request (ensure valid types/values/etc.)
+ # └> _request_redirectable (handle redirects) + # └> _request_retryable (handle retries for errors) + # └> _request_rate_limited (handle rate limiting/throttling) + # └> request_raw (handle actual HTTP) + def request( + self, + method: str, + url: str, + *, + params: dict = None, + headers: dict = None, + follow_redirects: bool = True, + timeout: Union[int, Tuple[int, int]] = None + ) -> WaybackHttpResponse: + return self._request_redirectable(method, + url, + params=params, + headers=headers, + follow_redirects=follow_redirects, + timeout=timeout) + + def _request_redirectable(self, *args, follow_redirects: bool = True, **kwargs) -> WaybackHttpResponse: + # FIXME: this method should implement redirect following (if + # `follow_redirects` is true), rather than passing it to the underlying + # implementation, since redirects need to be rate limited. + return self._request_retryable(*args, follow_redirects=follow_redirects, **kwargs) + + def _request_retryable(self, method: str, url: str, **kwargs) -> WaybackHttpResponse: + start_time = time.time() + maximum = self.retries + retries = 0 + + while True: + retry_delay = 0 + try: + response = self._request_rate_limited(method, url, **kwargs) + retry_delay = self.get_retry_delay(retries, response) + + if retries >= maximum or not self.should_retry(response): + if response.status_code == 429: + response.close() + raise RateLimitError(response, retry_delay) + return response + else: + logger.debug('Received error response (status: %s), will retry', response.status_code) + response.close(cache=False) + except self.handleable_errors as error: + response = getattr(error, 'response', None) + if response is not None: + response.close() + + if retries >= maximum: + raise WaybackRetryError(retries, time.time() - start_time, error) from error + elif self.should_retry_error(error): + retry_delay = self.get_retry_delay(retries, response) + logger.info('Caught exception during request, will retry: %s', error) + else: + raise + + 
logger.debug('Will retry after sleeping for %s seconds...', retry_delay) + time.sleep(retry_delay) + retries += 1 + + def _request_rate_limited(self, method: str, url: str, **kwargs) -> WaybackHttpResponse: + parsed_url = urlparse(url) + for path, limit in self.rate_limits.items(): + if parsed_url.path.startswith(path): + rate_limit = limit + break + else: + rate_limit = DEFAULT_MEMENTO_RATE_LIMIT + + rate_limit.wait() + return self.request_raw(method, url, **kwargs) + + def request_raw(self, *args, **kwargs) -> WaybackHttpResponse: + return super().request(*args, **kwargs) + + def should_retry(self, response: WaybackHttpResponse): + # A memento may actually be a capture of an error, so don't retry it :P + if response.is_memento: + return False + + return response.status_code in self.retryable_statuses + + def should_retry_error(self, error): + if isinstance(error, self.retryable_errors): + return True + elif isinstance(error, ConnectionError): + # ConnectionErrors from requests actually wrap a whole family of + # more detailed errors from urllib3, so we need to do some string + # checking to determine whether the error is retryable. + text = str(error) + # NOTE: we have also seen this, which may warrant retrying: + # `requests.exceptions.ConnectionError: ('Connection aborted.', + # RemoteDisconnected('Remote end closed connection without + # response'))` + if 'NewConnectionError' in text or 'Max retries' in text: + return True + + return False + + def get_retry_delay(self, retries, response=None): + delay = 0 + + # As of 2023-11-27, the Wayback Machine does not set a `Retry-After` + # header, so this parsing is really just future-proofing. + if response is not None: + delay = parse_retry_after(response.headers.get('Retry-After')) or delay + if response.status_code == 429 and delay == 0: + delay = DEFAULT_RATE_LIMIT_DELAY + + # No default backoff on the first retry. 
+ if retries > 0: + delay = max(self.backoff * 2 ** (retries - 1), delay) + + return delay + + +class WaybackRequestsAdapter(RetryAndRateLimitAdapter, DisableAfterCloseAdapter, RequestsAdapter): + def __init__( + self, + retries: int = 6, + backoff: Real = 2, + timeout: Union[Real, Tuple[Real, Real]] = 60, + user_agent: str = None, + memento_rate_limit: Union[RateLimit, Real] = DEFAULT_MEMENTO_RATE_LIMIT, + search_rate_limit: Union[RateLimit, Real] = DEFAULT_CDX_RATE_LIMIT, + timemap_rate_limit: Union[RateLimit, Real] = DEFAULT_TIMEMAP_RATE_LIMIT, + ) -> None: + super().__init__( + retries=retries, + backoff=backoff, + rate_limits={ + '/web/timemap': RateLimit.make_limit(timemap_rate_limit), + '/cdx': RateLimit.make_limit(search_rate_limit), + # The memento limit is actually a generic Wayback limit. + '/': RateLimit.make_limit(memento_rate_limit), + } + ) + self.timeout = timeout + self.headers = { + 'User-Agent': (user_agent or + f'wayback/{__version__} (+https://github.com/edgi-govdata-archiving/wayback)'), + 'Accept-Encoding': 'gzip, deflate' + } + + def request( + self, + method: str, + url: str, + *, + params: dict = None, + headers: dict = None, + follow_redirects: bool = True, + timeout: Union[int, Tuple[int, int]] = -1 + ) -> WaybackHttpResponse: + timeout = self.timeout if timeout is -1 else timeout + headers = (headers or {}) | self.headers + + return super().request(method, + url, + params=params, + headers=headers, + follow_redirects=follow_redirects, + timeout=timeout) diff --git a/src/wayback/_utils.py b/src/wayback/_utils.py index 2e3d50f..5a2f9dd 100644 --- a/src/wayback/_utils.py +++ b/src/wayback/_utils.py @@ -8,7 +8,7 @@ import time from typing import Union import urllib.parse -from .exceptions import SessionClosedError +from .exceptions import AlreadyClosedError logger = logging.getLogger(__name__) @@ -320,9 +320,9 @@ def __exit_all__(self, type, value, traceback): pass -class DisableAfterCloseSession: +class DisableAfterCloseAdapter: """ - A 
custom session object raises a :class:`SessionClosedError` if you try to
+    A custom session object raises an :class:`AlreadyClosedError` if you try to
     use it after closing it, to help identify and avoid potentially dangerous
     code patterns. (Standard session objects continue to be usable after
     closing, even if they may not work exactly as expected.)
@@ -335,8 +335,10 @@ def close(self, disable=True):
 
     def request(self, *args, **kwargs):
         if self._closed:
-            raise SessionClosedError('This session has already been closed '
+            raise AlreadyClosedError('This session has already been closed '
                                      'and cannot send new HTTP requests.')
+        else:
+            return super().request(*args, **kwargs)
 
 
 class CaseInsensitiveDict(MutableMapping):
diff --git a/src/wayback/exceptions.py b/src/wayback/exceptions.py
index 2273c42..04bddf1 100644
--- a/src/wayback/exceptions.py
+++ b/src/wayback/exceptions.py
@@ -101,8 +101,8 @@ def __init__(self, response, retry_after):
         super().__init__(message)
 
 
-class SessionClosedError(Exception):
+class AlreadyClosedError(Exception):
     """
-    Raised when a Wayback session is used to make a request after it has been
+    Raised when a Wayback client is used to make a request after it has been
    closed and disabled.
""" diff --git a/src/wayback/tests/test_client.py b/src/wayback/tests/test_client.py index b95b685..82c9250 100644 --- a/src/wayback/tests/test_client.py +++ b/src/wayback/tests/test_client.py @@ -6,11 +6,11 @@ import requests from unittest import mock from .support import create_vcr -from .._utils import SessionClosedError +from .._utils import AlreadyClosedError from .._client import (CdxRecord, Mode, - WaybackSession, WaybackClient) +from .._http import WaybackRequestsAdapter from ..exceptions import (BlockedSiteError, MementoPlaybackError, NoMementoError, @@ -112,7 +112,7 @@ def test_search_multipage(): @ia_vcr.use_cassette() def test_search_cannot_iterate_after_session_closing(): - with pytest.raises(SessionClosedError): + with pytest.raises(AlreadyClosedError): with WaybackClient() as client: versions = client.search('nasa.gov', from_date=datetime(1996, 10, 1), @@ -704,12 +704,12 @@ def return_user_agent(request, **kwargs) -> requests.Response: return response -class TestWaybackSession: +class TestWaybackRequestsAdapter: def test_request_retries(self, requests_mock): requests_mock.get('http://test.com', [{'text': 'bad1', 'status_code': 503}, {'text': 'bad2', 'status_code': 503}, {'text': 'good', 'status_code': 200}]) - session = WaybackSession(retries=2, backoff=0.1) + session = WaybackRequestsAdapter(retries=2, backoff=0.1) response = session.request('GET', 'http://test.com') assert response.is_success @@ -719,7 +719,7 @@ def test_stops_after_given_retries(self, requests_mock): requests_mock.get('http://test.com', [{'text': 'bad1', 'status_code': 503}, {'text': 'bad2', 'status_code': 503}, {'text': 'good', 'status_code': 200}]) - session = WaybackSession(retries=1, backoff=0.1) + session = WaybackRequestsAdapter(retries=1, backoff=0.1) response = session.request('GET', 'http://test.com') assert response.status_code == 503 assert response.text == 'bad2' @@ -727,28 +727,28 @@ def test_stops_after_given_retries(self, requests_mock): def 
test_only_retries_some_errors(self, requests_mock): requests_mock.get('http://test.com', [{'text': 'bad1', 'status_code': 400}, {'text': 'good', 'status_code': 200}]) - session = WaybackSession(retries=1, backoff=0.1) + session = WaybackRequestsAdapter(retries=1, backoff=0.1) response = session.request('GET', 'http://test.com') assert response.status_code == 400 def test_raises_rate_limit_error(self, requests_mock): requests_mock.get('http://test.com', [WAYBACK_RATE_LIMIT_ERROR]) with pytest.raises(RateLimitError): - session = WaybackSession(retries=0) + session = WaybackRequestsAdapter(retries=0) session.request('GET', 'http://test.com') def test_rate_limit_error_includes_retry_after(self, requests_mock): requests_mock.get('http://test.com', [WAYBACK_RATE_LIMIT_ERROR]) with pytest.raises(RateLimitError) as excinfo: - session = WaybackSession(retries=0) + session = WaybackRequestsAdapter(retries=0) session.request('GET', 'http://test.com') assert excinfo.value.retry_after == 10 @mock.patch('requests.Session.send', side_effect=return_timeout) def test_timeout_applied_session(self, mock_class): - # Is the timeout applied through the WaybackSession - session = WaybackSession(timeout=1) + # Is the timeout applied through the adapter + session = WaybackRequestsAdapter(timeout=1) res = session.request('GET', 'http://test.com') assert res.text == '1' # Overwriting the default in the requests method @@ -760,7 +760,7 @@ def test_timeout_applied_session(self, mock_class): @mock.patch('requests.Session.send', side_effect=return_timeout) def test_timeout_applied_request(self, mock_class): # Using the default value - session = WaybackSession() + session = WaybackRequestsAdapter() res = session.request('GET', 'http://test.com') assert res.text == '60' # Overwriting the default @@ -772,7 +772,7 @@ def test_timeout_applied_request(self, mock_class): @mock.patch('requests.Session.send', side_effect=return_timeout) def test_timeout_empty(self, mock_class): # Disabling default 
timeout - session = WaybackSession(timeout=None) + session = WaybackRequestsAdapter(timeout=None) res = session.request('GET', 'http://test.com') assert res.text == 'None' # Overwriting the default @@ -781,7 +781,7 @@ def test_timeout_empty(self, mock_class): @mock.patch('requests.Session.send', side_effect=return_user_agent) def test_user_agent(self, mock_class): - adapter = WaybackSession() + adapter = WaybackRequestsAdapter() agent = adapter.request('GET', 'http://test.com').text assert agent.startswith('wayback/') @@ -801,7 +801,7 @@ def test_search_rate_limits(self): # Check that disabling the rate limits through the search API works. start_time = time.time() - with WaybackClient(WaybackSession(search_calls_per_second=0)) as client: + with WaybackClient(WaybackRequestsAdapter(search_rate_limit=0)) as client: for i in range(3): next(client.search('zew.de')) duration_without_limits = time.time() - start_time @@ -810,7 +810,7 @@ def test_search_rate_limits(self): # I need to sleep one half second in order to reset the rate limit. time.sleep(0.5) start_time = time.time() - with WaybackClient(WaybackSession(search_calls_per_second=2)) as client: + with WaybackClient(WaybackRequestsAdapter(search_rate_limit=2)) as client: for i in range(3): next(client.search('zew.de')) duration_with_limits_custom = time.time() - start_time @@ -821,11 +821,14 @@ def test_search_rate_limits(self): @ia_vcr.use_cassette() def test_memento_rate_limits(self): - # The timing relies on the cassettes being present, - # therefore the first run might fail. - # Since another test might run before this one, - # I have to wait one call before starting. + # NOTE: The timing relies on VCR cassettes being present (so HTTP + # responses come immediately). The first time you run this test, it + # might fail because it is recording real requests that take a while. + + # Since another test might run before this one, wait a bit before + # starting to get past rate limits from previous tests. 
time.sleep(1/30) + with WaybackClient() as client: cdx = next(client.search('zew.de')) # First test that the default rate limits are correctly applied. @@ -836,17 +839,15 @@ def test_memento_rate_limits(self): duration_with_limits = time.time() - start_time # Check that disabling the rate limits through the get_memento API works. - time.sleep(1) # Wait to exceed any previous rate limits. start_time = time.time() - with WaybackClient(WaybackSession(memento_calls_per_second=0)) as client: + with WaybackClient(WaybackRequestsAdapter(memento_rate_limit=0)) as client: for i in range(3): client.get_memento(cdx) duration_without_limits = time.time() - start_time # Check that a different rate limit set through the session is applied correctly. - time.sleep(1) # Wait to exceed any previous rate limits. start_time = time.time() - with WaybackClient(WaybackSession(memento_calls_per_second=10)) as client: + with WaybackClient(WaybackRequestsAdapter(memento_rate_limit=10)) as client: for i in range(6): client.get_memento(cdx) duration_with_limits_custom = time.time() - start_time From 0ec78111795a2b24dabc58d6223572120bb5c1e6 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 4 Jan 2024 16:00:28 -0800 Subject: [PATCH 15/17] Forgot the dict merge operator is Python 3.9+ --- src/wayback/_http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index c0a7a8e..482d5fc 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -581,7 +581,7 @@ def request( timeout: Union[int, Tuple[int, int]] = -1 ) -> WaybackHttpResponse: timeout = self.timeout if timeout is -1 else timeout - headers = (headers or {}) | self.headers + headers = dict(headers or {}, **self.headers) return super().request(method, url, From cf6d879ebb8bf9f61f3a270a83c8e7b670ed8da5 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 4 Jan 2024 17:18:51 -0800 Subject: [PATCH 16/17] Delegate default header handling to requests --- src/wayback/_http.py 
| 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index 482d5fc..5ce4e49 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -564,7 +564,7 @@ def __init__( } ) self.timeout = timeout - self.headers = { + self._session.headers = { 'User-Agent': (user_agent or f'wayback/{__version__} (+https://github.com/edgi-govdata-archiving/wayback)'), 'Accept-Encoding': 'gzip, deflate' @@ -580,8 +580,7 @@ def request( follow_redirects: bool = True, timeout: Union[int, Tuple[int, int]] = -1 ) -> WaybackHttpResponse: - timeout = self.timeout if timeout is -1 else timeout - headers = dict(headers or {}, **self.headers) + timeout = self.timeout if timeout == -1 else timeout return super().request(method, url, From 994c8b78a5227db1f3d043352a985d806f6ce182 Mon Sep 17 00:00:00 2001 From: Rob Brackett Date: Thu, 4 Jan 2024 17:30:23 -0800 Subject: [PATCH 17/17] Oops, stream should be true --- src/wayback/_http.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/wayback/_http.py b/src/wayback/_http.py index 5ce4e49..4af214f 100644 --- a/src/wayback/_http.py +++ b/src/wayback/_http.py @@ -325,7 +325,8 @@ def request( params=params, headers=headers, allow_redirects=follow_redirects, - timeout=timeout + timeout=timeout, + stream=True ) return WaybackRequestsResponse(response)