Commit
Merge branch 'main' into fix-sentry-issue-148024
bolinocroustibat committed Nov 15, 2024
2 parents 9c64eec + 2461395 commit ea40d82
Showing 19 changed files with 271 additions and 368 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -2,10 +2,13 @@

## Current (in progress)

- Refactor ParseError to enrich Sentry with context and to help investigate Sentry errors like #4096 [#218](https://github.com/datagouv/hydra/pull/218)
- Remove legacy routes [#203](https://github.com/datagouv/hydra/pull/203)
- More explicit error reporting when sending to udata, without raising errors when udata responds with a 404 [#213](https://github.com/datagouv/hydra/pull/213)
- Minor cleaning: remove an unused argument in a function [#219](https://github.com/datagouv/hydra/pull/219)
- Fix type issue regarding `resource_id` [#220](https://github.com/datagouv/hydra/pull/220)

## 2.0.5 (2024-11-08)

- Fix minor type issues [#204](https://github.com/datagouv/hydra/pull/204)
- Return resource status counts in the crawler status endpoint response [#206](https://github.com/datagouv/hydra/pull/206)
- Fix deprecated CircleCI config [#207](https://github.com/datagouv/hydra/pull/207)
- Fix Sentry issue #4195 [#209](https://github.com/datagouv/hydra/pull/209)
- Clean docstrings for more consistent style [#215](https://github.com/datagouv/hydra/pull/215)
- Fix some type hints [#214](https://github.com/datagouv/hydra/pull/214)
- Add option to force analysis even if the resource has not changed [#205](https://github.com/datagouv/hydra/pull/205)
- Fix the "get all checks" CRUD method [#217](https://github.com/datagouv/hydra/pull/217)
- Deactivate parquet export for small CSVs [#216](https://github.com/datagouv/hydra/pull/216)

## 2.0.4 (2024-10-28)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "udata-hydra"
version = "2.0.5.dev"
version = "2.0.6.dev"
description = "Async crawler and parsing service for data.gouv.fr"
authors = ["Opendata Team <[email protected]>"]
license = "MIT"
71 changes: 71 additions & 0 deletions tests/test_analysis/test_analysis_csv.py
@@ -4,11 +4,16 @@
from tempfile import NamedTemporaryFile

import pytest
from aiohttp import ClientSession
from asyncpg.exceptions import UndefinedTableError
from yarl import URL

from tests.conftest import RESOURCE_ID, RESOURCE_URL
from udata_hydra.analysis.csv import analyse_csv, csv_to_db
from udata_hydra.crawl.check_resources import (
    RESOURCE_RESPONSE_STATUSES,
    check_resource,
)
from udata_hydra.db.resource import Resource

pytestmark = pytest.mark.asyncio
@@ -306,3 +311,69 @@ async def test_analyse_csv_send_udata_webhook(
    assert webhook.get("analysis:parsing:started_at")
    assert webhook.get("analysis:parsing:finished_at")
    assert webhook.get("analysis:parsing:error") is None


@pytest.mark.parametrize(
    "forced_analysis",
    (
        (True, True),  # force_analysis=True, expect the CSV table to be created
        (False, False),  # force_analysis=False, resource unchanged, expect no table
    ),
)
async def test_forced_analysis(
    setup_catalog,
    rmock,
    catalog_content,
    db,
    fake_check,
    forced_analysis,
    udata_url,
):
    force_analysis, table_exists = forced_analysis
    check = await fake_check(
        headers={
            "content-type": "application/csv",
            "content-length": "100",
        }
    )
    url = check["url"]
    rid = check["resource_id"]
    # mock the resource responses: same headers as the previous check, so the
    # resource is considered unchanged unless analysis is forced
    rmock.head(
        url,
        status=200,
        headers={
            "content-type": "application/csv",
            "content-length": "100",
        },
    )
    rmock.get(
        url,
        status=200,
        headers={
            "content-type": "application/csv",
            "content-length": "100",
        },
        body="a,b,c\n1,2,3".encode("utf-8"),
        repeat=True,
    )
    rmock.put(udata_url, status=200, repeat=True)
    async with ClientSession() as session:
        await check_resource(
            url=url, resource_id=rid, session=session, force_analysis=force_analysis
        )

    # check that the csv was indeed pushed to db
    table_name = hashlib.md5(url.encode("utf-8")).hexdigest()
    tables = await db.fetch(
        "SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE table_schema = 'public';"
    )
    assert (table_name in [r["table_name"] for r in tables]) == table_exists

    # check whether udata was pinged
    if force_analysis:
        webhook = rmock.requests[("PUT", URL(udata_url))][0].kwargs["json"]
        assert webhook.get("analysis:parsing:started_at")
        assert webhook.get("analysis:parsing:finished_at")
        assert webhook.get("analysis:parsing:error") is None
    else:
        assert ("PUT", URL(udata_url)) not in rmock.requests.keys()
184 changes: 0 additions & 184 deletions tests/test_api/test_api_legacy.py

This file was deleted.

46 changes: 41 additions & 5 deletions tests/test_parquet_export.py
@@ -1,12 +1,14 @@
import os
from io import BytesIO

import pyarrow.parquet as pq
import pytest

from udata_hydra.analysis.csv import (
    RESERVED_COLS,
+    csv_detective_routine,
+    csv_to_parquet,
    generate_records,
-    perform_csv_inspection,
)
from udata_hydra.utils.parquet import save_as_parquet

@@ -21,12 +23,12 @@
        ("catalog.xlsx", 2),
    ),
)
-async def test_parquet_conversion(
-    setup_catalog, rmock, db, fake_check, produce_mock, file_and_count
-):
+async def test_save_as_parquet(file_and_count):
    filename, expected_count = file_and_count
    file_path = f"tests/data/{filename}"
-    inspection: dict | None = await perform_csv_inspection(file_path)
+    inspection: dict | None = csv_detective_routine(
+        csv_file_path=file_path, output_profile=True, num_rows=-1, save_results=False
+    )
    assert inspection
    columns = inspection["columns"]
    columns = {
@@ -41,3 +43,37 @@ async def test_parquet_conversion(
    assert len(table) == expected_count
    fake_file = BytesIO()
    pq.write_table(table, fake_file)


@pytest.mark.parametrize(
    "parquet_config",
    (
        (False, 1, False),  # CSV_TO_PARQUET = False, MIN_LINES_FOR_PARQUET = 1
        (True, 1, True),  # CSV_TO_PARQUET = True, MIN_LINES_FOR_PARQUET = 1
        (True, 3, False),  # CSV_TO_PARQUET = True, MIN_LINES_FOR_PARQUET = 3
    ),
)
async def test_csv_to_parquet(mocker, parquet_config):
    async def execute_csv_to_parquet() -> tuple[str, int] | None:
        file_path = "tests/data/catalog.csv"
        inspection: dict | None = csv_detective_routine(
            csv_file_path=file_path, output_profile=True, num_rows=-1, save_results=False
        )
        assert inspection
        return await csv_to_parquet(
            file_path=file_path, inspection=inspection, table_name="test_table"
        )

    csv_to_parquet_config, min_lines_for_parquet_config, expected_conversion = parquet_config
    mocker.patch("udata_hydra.config.CSV_TO_PARQUET", csv_to_parquet_config)
    mocker.patch("udata_hydra.config.MIN_LINES_FOR_PARQUET", min_lines_for_parquet_config)

    if not expected_conversion:
        assert not await execute_csv_to_parquet()
    else:
        # TODO: don't use the exception as the assertion, better to mock the minio client sending the file
        with pytest.raises(ValueError, match="invalid bucket name"):
            await execute_csv_to_parquet()
        # Clean the remaining parquet file
        os.remove("test_table.parquet")
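
Note: these cases exercise the gating added by "Deactivate parquet export for small CSVs" [#216](https://github.com/datagouv/hydra/pull/216). A minimal sketch of the presumed decision, using local stand-ins for the udata_hydra.config values patched above (an assumption, not the actual implementation):

# Hypothetical sketch of the gating, not udata_hydra's actual code.
CSV_TO_PARQUET = True  # stand-in for udata_hydra.config.CSV_TO_PARQUET
MIN_LINES_FOR_PARQUET = 3  # stand-in for udata_hydra.config.MIN_LINES_FOR_PARQUET

def should_convert_to_parquet(total_lines: int) -> bool:
    """Convert only if the feature is enabled and the CSV is large enough."""
    return CSV_TO_PARQUET and total_lines >= MIN_LINES_FOR_PARQUET

# Mirrors the parametrized cases: with a small catalog.csv, raising
# MIN_LINES_FOR_PARQUET above its line count makes csv_to_parquet return None.
assert should_convert_to_parquet(100)
assert not should_convert_to_parquet(2)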
(diff for the remaining 14 changed files not shown)
