From bcd85d25a92d90019d07311074aaa88a6e80ff5c Mon Sep 17 00:00:00 2001
From: Adrien Carpentier <me@adriencarpentier.com>
Date: Wed, 30 Oct 2024 18:43:04 +0100
Subject: [PATCH 1/3] fix: handle aiohttp ClientResponseError in download_resource (Sentry error 145755)

---
 udata_hydra/utils/file.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/udata_hydra/utils/file.py b/udata_hydra/utils/file.py
index 93f26c89..bcfcb555 100644
--- a/udata_hydra/utils/file.py
+++ b/udata_hydra/utils/file.py
@@ -52,15 +52,18 @@ async def download_resource(
     async with aiohttp.ClientSession(
         headers={"user-agent": config.USER_AGENT}, raise_for_status=True
     ) as session:
-        async with session.get(url, allow_redirects=True) as response:
-            async for chunk in response.content.iter_chunked(chunk_size):
-                if max_size_allowed is None or i * chunk_size < max_size_allowed:
-                    tmp_file.write(chunk)
-                else:
-                    tmp_file.close()
-                    log.warning(f"File {url} is too big, skipping")
-                    raise IOError("File too large to download")
-                i += 1
+        try:
+            async with session.get(url, allow_redirects=True) as response:
+                async for chunk in response.content.iter_chunked(chunk_size):
+                    if max_size_allowed is None or i * chunk_size < max_size_allowed:
+                        tmp_file.write(chunk)
+                    else:
+                        tmp_file.close()
+                        log.warning(f"File {url} is too big, skipping")
+                        raise IOError("File too large to download")
+                    i += 1
+        except aiohttp.ClientResponseError as e:
+            raise IOError(f"Error downloading CSV: {e}")
     tmp_file.close()
     if magic.from_file(tmp_file.name, mime=True) in [
         "application/x-gzip",

From 32d7f95a6e1278e9bb1c7281de4dd823196a9e09 Mon Sep 17 00:00:00 2001
From: Adrien Carpentier <me@adriencarpentier.com>
Date: Fri, 15 Nov 2024 17:34:06 +0100
Subject: [PATCH 2/3] feat: add custom IOException error

---
 udata_hydra/analysis/csv.py               | 11 +++++--
 udata_hydra/utils/__init__.py             |  1 +
 udata_hydra/{analysis => utils}/errors.py | 40 ++++++++++++++++-------
 udata_hydra/utils/file.py                 |  9 ++---
 udata_hydra/utils/http.py                 |  7 ++--
 5 files changed, 48 insertions(+), 20 deletions(-)
 rename udata_hydra/{analysis => utils}/errors.py (65%)

diff --git a/udata_hydra/analysis/csv.py b/udata_hydra/analysis/csv.py
index a0c28c4d..a1ed30ea 100644
--- a/udata_hydra/analysis/csv.py
+++ b/udata_hydra/analysis/csv.py
@@ -32,12 +32,19 @@
 
 from udata_hydra import config, context
 from udata_hydra.analysis import helpers
-from udata_hydra.analysis.errors import ParseException, handle_parse_exception
 from udata_hydra.db import compute_insert_query
 from udata_hydra.db.check import Check
 from udata_hydra.db.resource import Resource
 from udata_hydra.db.resource_exception import ResourceException
-from udata_hydra.utils import Reader, Timer, download_resource, queue, send
+from udata_hydra.utils import (
+    ParseException,
+    Reader,
+    Timer,
+    download_resource,
+    handle_parse_exception,
+    queue,
+    send,
+)
 from udata_hydra.utils.minio import MinIOClient
 from udata_hydra.utils.parquet import save_as_parquet
 
diff --git a/udata_hydra/utils/__init__.py b/udata_hydra/utils/__init__.py
index 77d3cf0e..b3f391c4 100644
--- a/udata_hydra/utils/__init__.py
+++ b/udata_hydra/utils/__init__.py
@@ -1,6 +1,7 @@
 # ruff: noqa: F401
 from .auth import token_auth_middleware
 from .csv import detect_tabular_from_headers
+from .errors import IOException, ParseException, handle_parse_exception
 from .file import compute_checksum_from_file, download_resource, read_csv_gz
 from .http import get_request_params, is_valid_uri, send
 from .queue import enqueue
diff --git a/udata_hydra/analysis/errors.py b/udata_hydra/utils/errors.py
similarity index 65%
rename from udata_hydra/analysis/errors.py
rename to udata_hydra/utils/errors.py
index a5a207ec..f1448870 100644
--- a/udata_hydra/analysis/errors.py
+++ b/udata_hydra/utils/errors.py
@@ -1,24 +1,23 @@
 import logging
 from datetime import datetime, timezone
-from tkinter import N
 
 import sentry_sdk
 from asyncpg import Record
 
-from udata_hydra import config, context
+from udata_hydra import context
 from udata_hydra.db.check import Check
 
 log = logging.getLogger("udata-hydra")
 
 
-class ParseException(Exception):
+class ExceptionWithSentryDetails(Exception):
     """
-    Exception raised when an error occurs during parsing.
-    Enriches Sentry with tags if available.
+    Custom exception which enriches Sentry with tags if available.
     """
 
     def __init__(
         self,
+        message: str | None = None,
         step: str | None = None,
         resource_id: str | None = None,
         url: str | None = None,
@@ -26,20 +25,33 @@ def __init__(
         table_name: str | None = None,
         *args,
     ) -> None:
-        if step:
-            self.step = step
-        if config.SENTRY_DSN:
+        self.step = step
+        self.message = message
+        if sentry_sdk.Hub.current.client:
             with sentry_sdk.new_scope() as scope:
                 # scope.set_level("warning")
                 scope.set_tags(
                     {
                         "resource_id": resource_id or "unknown",
-                        "csv_url": url or "unknown",
+                        "url": url or "unknown",
                         "check_id": check_id or "unknown",
                         "table_name": table_name or "unknown",
                     }
                 )
-        super().__init__(*args)
+                sentry_sdk.capture_exception(self)
+        super().__init__(message, *args)
+
+
+class ParseException(ExceptionWithSentryDetails):
+    """Exception raised when an error occurs during parsing."""
+
+    pass
+
+
+class IOException(ExceptionWithSentryDetails):
+    """Exception raised when an error occurs during IO operations."""
+
+    pass
 
 
 async def handle_parse_exception(e: ParseException, table_name: str, check: Record | None) -> None:
@@ -47,12 +59,16 @@ async def handle_parse_exception(e: ParseException, table_name: str, check: Reco
     db = await context.pool("csv")
     await db.execute(f'DROP TABLE IF EXISTS "{table_name}"')
     if check:
-        if config.SENTRY_DSN:
+        if sentry_sdk.Hub.current.client:
             with sentry_sdk.new_scope():
                 event_id = sentry_sdk.capture_exception(e)
         # e.__cause__ let us access the "inherited" error of ParseException (raise e from cause)
         # it's called explicit exception chaining and it's very cool, look it up (PEP 3134)!
-        err = f"{e.step}:sentry:{event_id}" if config.SENTRY_DSN else f"{e.step}:{str(e.__cause__)}"
+        err = (
+            f"{e.step}:sentry:{event_id}"
+            if sentry_sdk.Hub.current.client
+            else f"{e.step}:{str(e.__cause__)}"
+        )
         await Check.update(
             check["id"],
             {"parsing_error": err, "parsing_finished_at": datetime.now(timezone.utc)},
diff --git a/udata_hydra/utils/file.py b/udata_hydra/utils/file.py
index bcfcb555..e7d7f86b 100644
--- a/udata_hydra/utils/file.py
+++ b/udata_hydra/utils/file.py
@@ -8,6 +8,7 @@
 import magic
 
 from udata_hydra import config
+from udata_hydra.utils import IOException
 
 log = logging.getLogger("udata-hydra")
 
@@ -38,14 +39,14 @@ async def download_resource(
     """
     Attempts downloading a resource from a given url.
     Returns the downloaded file object.
-    Raises IOError if the resource is too large.
+    Raises custom IOException if the resource is too large.
     """
     tmp_file = tempfile.NamedTemporaryFile(
         dir=config.TEMPORARY_DOWNLOAD_FOLDER or None, delete=False
     )
 
     if max_size_allowed is not None and float(headers.get("content-length", -1)) > max_size_allowed:
-        raise IOError("File too large to download")
+        raise IOException("File too large to download")
 
     chunk_size = 1024
     i = 0
@@ -60,10 +61,10 @@ async def download_resource(
                     else:
                         tmp_file.close()
                         log.warning(f"File {url} is too big, skipping")
-                        raise IOError("File too large to download")
+                        raise IOException("File too large to download", url=url)
                     i += 1
         except aiohttp.ClientResponseError as e:
-            raise IOError(f"Error downloading CSV: {e}")
+            raise IOException("Error downloading CSV", url=url) from e
     tmp_file.close()
     if magic.from_file(tmp_file.name, mime=True) in [
         "application/x-gzip",
diff --git a/udata_hydra/utils/http.py b/udata_hydra/utils/http.py
index 2cf31ccf..9c62da2c 100644
--- a/udata_hydra/utils/http.py
+++ b/udata_hydra/utils/http.py
@@ -6,6 +6,7 @@
 from aiohttp import web
 
 from udata_hydra import config
+from udata_hydra.utils import IOException
 
 log = logging.getLogger("udata-hydra")
 
@@ -51,8 +52,10 @@ async def send(dataset_id: str, resource_id: str, document: dict) -> None:
             if resp.status == 404:
                 pass
             elif resp.status == 410:
-                raise IOError("Resource has been deleted on udata")
+                raise IOException(
+                    "Resource has been deleted on udata", resource_id=resource_id, url=uri
+                )
             if resp.status == 502:
-                raise IOError("Udata is unreachable")
+                raise IOException("Udata is unreachable", resource_id=resource_id, url=uri)
             else:
                 resp.raise_for_status()

From 1a9e667f415338137673604ca7f9c1c2469bece8 Mon Sep 17 00:00:00 2001
From: Adrien Carpentier <me@adriencarpentier.com>
Date: Fri, 15 Nov 2024 18:19:16 +0100
Subject: [PATCH 3/3] fix: wrap the whole client session in the try block of download_resource

---
 udata_hydra/utils/file.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/udata_hydra/utils/file.py b/udata_hydra/utils/file.py
index e7d7f86b..c399a9a5 100644
--- a/udata_hydra/utils/file.py
+++ b/udata_hydra/utils/file.py
@@ -39,7 +39,7 @@ async def download_resource(
     """
     Attempts downloading a resource from a given url.
     Returns the downloaded file object.
-    Raises custom IOException if the resource is too large.
+    Raises custom IOException if the resource is too large or if the URL is unreachable.
     """
     tmp_file = tempfile.NamedTemporaryFile(
         dir=config.TEMPORARY_DOWNLOAD_FOLDER or None, delete=False
@@ -50,10 +50,10 @@ async def download_resource(
 
     chunk_size = 1024
     i = 0
-    async with aiohttp.ClientSession(
-        headers={"user-agent": config.USER_AGENT}, raise_for_status=True
-    ) as session:
-        try:
+    try:
+        async with aiohttp.ClientSession(
+            headers={"user-agent": config.USER_AGENT}, raise_for_status=True
+        ) as session:
             async with session.get(url, allow_redirects=True) as response:
                 async for chunk in response.content.iter_chunked(chunk_size):
                     if max_size_allowed is None or i * chunk_size < max_size_allowed:
@@ -63,12 +63,13 @@ async def download_resource(
                         log.warning(f"File {url} is too big, skipping")
                         raise IOException("File too large to download", url=url)
                     i += 1
-        except aiohttp.ClientResponseError as e:
-            raise IOException("Error downloading CSV", url=url) from e
-    tmp_file.close()
-    if magic.from_file(tmp_file.name, mime=True) in [
-        "application/x-gzip",
-        "application/gzip",
-    ]:
-        tmp_file = read_csv_gz(tmp_file.name)
-    return tmp_file
+    except aiohttp.ClientResponseError as e:
+        raise IOException("Error downloading CSV", url=url) from e
+    finally:
+        tmp_file.close()
+    # Keep the gzip detection and the `return` outside `finally`: returning from
+    # a `finally` clause discards any in-flight exception, so the IOException
+    # raised above would never reach the caller.
+    if magic.from_file(tmp_file.name, mime=True) in ["application/x-gzip", "application/gzip"]:
+        tmp_file = read_csv_gz(tmp_file.name)
+    return tmp_file