From bcd85d25a92d90019d07311074aaa88a6e80ff5c Mon Sep 17 00:00:00 2001 From: Adrien Carpentier Date: Wed, 30 Oct 2024 18:43:04 +0100 Subject: [PATCH 1/3] fix: fix sentry error 145755 --- udata_hydra/utils/file.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/udata_hydra/utils/file.py b/udata_hydra/utils/file.py index 93f26c89..bcfcb555 100644 --- a/udata_hydra/utils/file.py +++ b/udata_hydra/utils/file.py @@ -52,15 +52,18 @@ async def download_resource( async with aiohttp.ClientSession( headers={"user-agent": config.USER_AGENT}, raise_for_status=True ) as session: - async with session.get(url, allow_redirects=True) as response: - async for chunk in response.content.iter_chunked(chunk_size): - if max_size_allowed is None or i * chunk_size < max_size_allowed: - tmp_file.write(chunk) - else: - tmp_file.close() - log.warning(f"File {url} is too big, skipping") - raise IOError("File too large to download") - i += 1 + try: + async with session.get(url, allow_redirects=True) as response: + async for chunk in response.content.iter_chunked(chunk_size): + if max_size_allowed is None or i * chunk_size < max_size_allowed: + tmp_file.write(chunk) + else: + tmp_file.close() + log.warning(f"File {url} is too big, skipping") + raise IOError("File too large to download") + i += 1 + except aiohttp.ClientResponseError as e: + raise IOError(f"Error downloading CSV: {e}") tmp_file.close() if magic.from_file(tmp_file.name, mime=True) in [ "application/x-gzip", From 32d7f95a6e1278e9bb1c7281de4dd823196a9e09 Mon Sep 17 00:00:00 2001 From: Adrien Carpentier Date: Fri, 15 Nov 2024 17:34:06 +0100 Subject: [PATCH 2/3] feat: add custom IOException error --- udata_hydra/analysis/csv.py | 11 +++++-- udata_hydra/utils/__init__.py | 1 + udata_hydra/{analysis => utils}/errors.py | 40 ++++++++++++++++------- udata_hydra/utils/file.py | 9 ++--- udata_hydra/utils/http.py | 7 ++-- 5 files changed, 48 insertions(+), 20 deletions(-) rename udata_hydra/{analysis => utils}/errors.py (65%) diff --git a/udata_hydra/analysis/csv.py b/udata_hydra/analysis/csv.py index a0c28c4d..a1ed30ea 100644 --- a/udata_hydra/analysis/csv.py +++ b/udata_hydra/analysis/csv.py @@ -32,12 +32,19 @@ from udata_hydra import config, context from udata_hydra.analysis import helpers -from udata_hydra.analysis.errors import ParseException, handle_parse_exception from udata_hydra.db import compute_insert_query from udata_hydra.db.check import Check from udata_hydra.db.resource import Resource from udata_hydra.db.resource_exception import ResourceException -from udata_hydra.utils import Reader, Timer, download_resource, queue, send +from udata_hydra.utils import ( + ParseException, + Reader, + Timer, + download_resource, + handle_parse_exception, + queue, + send, +) from udata_hydra.utils.minio import MinIOClient from udata_hydra.utils.parquet import save_as_parquet diff --git a/udata_hydra/utils/__init__.py b/udata_hydra/utils/__init__.py index 77d3cf0e..b3f391c4 100644 --- a/udata_hydra/utils/__init__.py +++ b/udata_hydra/utils/__init__.py @@ -1,6 +1,7 @@ # ruff: noqa: F401 from .auth import token_auth_middleware from .csv import detect_tabular_from_headers +from .errors import IOException, ParseException, handle_parse_exception from .file import compute_checksum_from_file, download_resource, read_csv_gz from .http import get_request_params, is_valid_uri, send from .queue import enqueue diff --git a/udata_hydra/analysis/errors.py b/udata_hydra/utils/errors.py similarity index 65% rename from udata_hydra/analysis/errors.py rename to udata_hydra/utils/errors.py index a5a207ec..f1448870 100644 --- a/udata_hydra/analysis/errors.py +++ b/udata_hydra/utils/errors.py @@ -1,24 +1,23 @@ import logging from datetime import datetime, timezone -from tkinter import N import sentry_sdk from asyncpg import Record -from udata_hydra import config, context +from udata_hydra import context from udata_hydra.db.check import Check log = logging.getLogger("udata-hydra") -class ParseException(Exception): +class ExceptionWithSentryDetails(Exception): """ - Exception raised when an error occurs during parsing. - Enriches Sentry with tags if available. + Custom exception which enriches Sentry with tags if available. """ def __init__( self, + message: str | None = None, step: str | None = None, resource_id: str | None = None, url: str | None = None, @@ -26,20 +25,33 @@ def __init__( table_name: str | None = None, *args, ) -> None: - if step: - self.step = step - if config.SENTRY_DSN: + self.step = step + self.message = message + if sentry_sdk.Hub.current.client: with sentry_sdk.new_scope() as scope: # scope.set_level("warning") scope.set_tags( { "resource_id": resource_id or "unknown", - "csv_url": url or "unknown", + "url": url or "unknown", "check_id": check_id or "unknown", "table_name": table_name or "unknown", } ) - super().__init__(*args) + sentry_sdk.capture_exception(self) + super().__init__(message, *args) + + +class ParseException(ExceptionWithSentryDetails): + """Exception raised when an error occurs during parsing.""" + + pass + + +class IOException(ExceptionWithSentryDetails): + """Exception raised when an error occurs during IO operations.""" + + pass async def handle_parse_exception(e: ParseException, table_name: str, check: Record | None) -> None: @@ -47,12 +59,16 @@ async def handle_parse_exception(e: ParseException, table_name: str, check: Reco db = await context.pool("csv") await db.execute(f'DROP TABLE IF EXISTS "{table_name}"') if check: - if config.SENTRY_DSN: + if sentry_sdk.Hub.current.client: with sentry_sdk.new_scope(): event_id = sentry_sdk.capture_exception(e) # e.__cause__ let us access the "inherited" error of ParseException (raise e from cause) # it's called explicit exception chaining and it's very cool, look it up (PEP 3134)! - err = f"{e.step}:sentry:{event_id}" if config.SENTRY_DSN else f"{e.step}:{str(e.__cause__)}" + err = ( + f"{e.step}:sentry:{event_id}" + if sentry_sdk.Hub.current.client + else f"{e.step}:{str(e.__cause__)}" + ) await Check.update( check["id"], {"parsing_error": err, "parsing_finished_at": datetime.now(timezone.utc)}, diff --git a/udata_hydra/utils/file.py b/udata_hydra/utils/file.py index bcfcb555..e7d7f86b 100644 --- a/udata_hydra/utils/file.py +++ b/udata_hydra/utils/file.py @@ -8,6 +8,7 @@ import magic from udata_hydra import config +from udata_hydra.utils import IOException log = logging.getLogger("udata-hydra") @@ -38,14 +39,14 @@ async def download_resource( """ Attempts downloading a resource from a given url. Returns the downloaded file object. - Raises IOError if the resource is too large. + Raises custom IOException if the resource is too large. """ tmp_file = tempfile.NamedTemporaryFile( dir=config.TEMPORARY_DOWNLOAD_FOLDER or None, delete=False ) if max_size_allowed is not None and float(headers.get("content-length", -1)) > max_size_allowed: - raise IOError("File too large to download") + raise IOException("File too large to download") chunk_size = 1024 i = 0 @@ -60,10 +61,10 @@ async def download_resource( else: tmp_file.close() log.warning(f"File {url} is too big, skipping") - raise IOError("File too large to download") + raise IOException("File too large to download", url=url) i += 1 except aiohttp.ClientResponseError as e: - raise IOError(f"Error downloading CSV: {e}") + raise IOException("Error downloading CSV", url=url) from e tmp_file.close() if magic.from_file(tmp_file.name, mime=True) in [ "application/x-gzip", diff --git a/udata_hydra/utils/http.py b/udata_hydra/utils/http.py index 2cf31ccf..9c62da2c 100644 --- a/udata_hydra/utils/http.py +++ b/udata_hydra/utils/http.py @@ -6,6 +6,7 @@ from aiohttp import web from udata_hydra import config +from udata_hydra.utils import IOException log = logging.getLogger("udata-hydra") @@ -51,8 +52,10 @@ async def send(dataset_id: str, resource_id: str, document: dict) -> None: if resp.status == 404: pass elif resp.status == 410: - raise IOError("Resource has been deleted on udata") + raise IOException( + "Resource has been deleted on udata", resource_id=resource_id, url=uri + ) if resp.status == 502: - raise IOError("Udata is unreachable") + raise IOException("Udata is unreachable", resource_id=resource_id, url=uri) else: resp.raise_for_status() From 1a9e667f415338137673604ca7f9c1c2469bece8 Mon Sep 17 00:00:00 2001 From: Adrien Carpentier Date: Fri, 15 Nov 2024 18:19:16 +0100 Subject: [PATCH 3/3] fix: fix exception logic --- udata_hydra/utils/file.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/udata_hydra/utils/file.py b/udata_hydra/utils/file.py index e7d7f86b..c399a9a5 100644 --- a/udata_hydra/utils/file.py +++ b/udata_hydra/utils/file.py @@ -39,7 +39,7 @@ async def download_resource( """ Attempts downloading a resource from a given url. Returns the downloaded file object. - Raises custom IOException if the resource is too large. + Raises custom IOException if the resource is too large or if the URL is unreachable. """ tmp_file = tempfile.NamedTemporaryFile( dir=config.TEMPORARY_DOWNLOAD_FOLDER or None, delete=False @@ -50,10 +50,10 @@ async def download_resource( chunk_size = 1024 i = 0 - async with aiohttp.ClientSession( - headers={"user-agent": config.USER_AGENT}, raise_for_status=True - ) as session: - try: + try: + async with aiohttp.ClientSession( + headers={"user-agent": config.USER_AGENT}, raise_for_status=True + ) as session: async with session.get(url, allow_redirects=True) as response: async for chunk in response.content.iter_chunked(chunk_size): if max_size_allowed is None or i * chunk_size < max_size_allowed: @@ -63,12 +63,13 @@ async def download_resource( log.warning(f"File {url} is too big, skipping") raise IOException("File too large to download", url=url) i += 1 - except aiohttp.ClientResponseError as e: - raise IOException("Error downloading CSV", url=url) from e - tmp_file.close() - if magic.from_file(tmp_file.name, mime=True) in [ - "application/x-gzip", - "application/gzip", - ]: - tmp_file = read_csv_gz(tmp_file.name) - return tmp_file + except aiohttp.ClientResponseError as e: + raise IOException("Error downloading CSV", url=url) from e + finally: + tmp_file.close() + if magic.from_file(tmp_file.name, mime=True) in [ + "application/x-gzip", + "application/gzip", + ]: + tmp_file = read_csv_gz(tmp_file.name) + return tmp_file