Commit
Merge branch 'main' into fix-sentry-issue-148024
bolinocroustibat committed Nov 15, 2024
2 parents 9c64eec + 2461395 commit ea40d82
Showing 19 changed files with 271 additions and 368 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -2,10 +2,13 @@

## Current (in progress)

- Refactor ParseError to enrich Sentry with context and to help investigate Sentry errors like #4096 [#218](https://github.com/datagouv/hydra/pull/218)
- Remove legacy routes [#203](https://github.com/datagouv/hydra/pull/203)
- More explicit error reporting when sending to udata, without raising errors when udata responds with a 404 [#213](https://github.com/datagouv/hydra/pull/213)
- Minor cleaning: remove an unused argument in a function [#219](https://github.com/datagouv/hydra/pull/219)
- Fix type issue regarding `resource_id` [#220](https://github.com/datagouv/hydra/pull/220)

## 2.0.5 (2024-11-08)

- Fix minor type issues [#204](https://github.com/datagouv/hydra/pull/204)
- Return resource status counts in the crawler status endpoint response [#206](https://github.com/datagouv/hydra/pull/206)
- Fix deprecated CircleCI config [#207](https://github.com/datagouv/hydra/pull/207)
- Fix Sentry issue #4195 [#209](https://github.com/datagouv/hydra/pull/209)
- Clean docstrings for more consistent style [#215](https://github.com/datagouv/hydra/pull/215)
- Fix some type hints [#214](https://github.com/datagouv/hydra/pull/214)
- Add option to force analysis even if the resource has not changed [#205](https://github.com/datagouv/hydra/pull/205)
- Fix the "get all checks" CRUD method [#217](https://github.com/datagouv/hydra/pull/217)
- Deactivate parquet export for small CSVs [#216](https://github.com/datagouv/hydra/pull/216)

## 2.0.4 (2024-10-28)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "udata-hydra"
version = "2.0.5.dev"
version = "2.0.6.dev"
description = "Async crawler and parsing service for data.gouv.fr"
authors = ["Opendata Team <[email protected]>"]
license = "MIT"
71 changes: 71 additions & 0 deletions tests/test_analysis/test_analysis_csv.py
@@ -4,11 +4,16 @@
from tempfile import NamedTemporaryFile

import pytest
from aiohttp import ClientSession
from asyncpg.exceptions import UndefinedTableError
from yarl import URL

from tests.conftest import RESOURCE_ID, RESOURCE_URL
from udata_hydra.analysis.csv import analyse_csv, csv_to_db
from udata_hydra.crawl.check_resources import (
    RESOURCE_RESPONSE_STATUSES,
    check_resource,
)
from udata_hydra.db.resource import Resource

pytestmark = pytest.mark.asyncio
@@ -306,3 +311,69 @@ async def test_analyse_csv_send_udata_webhook(
    assert webhook.get("analysis:parsing:started_at")
    assert webhook.get("analysis:parsing:finished_at")
    assert webhook.get("analysis:parsing:error") is None


@pytest.mark.parametrize(
    "forced_analysis",
    (
        (True, True),  # force_analysis=True, expect the CSV table to be created
        (False, False),  # force_analysis=False, resource unchanged, expect no table
    ),
)
async def test_forced_analysis(
    setup_catalog,
    rmock,
    catalog_content,
    db,
    fake_check,
    forced_analysis,
    udata_url,
):
    force_analysis, table_exists = forced_analysis
    check = await fake_check(
        headers={
            "content-type": "application/csv",
            "content-length": "100",
        }
    )
    url = check["url"]
    rid = check["resource_id"]
    # mock the resource responses: same headers as the previous check, so the
    # resource is considered unchanged unless analysis is forced
    rmock.head(
        url,
        status=200,
        headers={
            "content-type": "application/csv",
            "content-length": "100",
        },
    )
    rmock.get(
        url,
        status=200,
        headers={
            "content-type": "application/csv",
            "content-length": "100",
        },
        body="a,b,c\n1,2,3".encode("utf-8"),
        repeat=True,
    )
    rmock.put(udata_url, status=200, repeat=True)
    async with ClientSession() as session:
        await check_resource(
            url=url, resource_id=rid, session=session, force_analysis=force_analysis
        )

    # check that the csv was indeed pushed to db
    table_name = hashlib.md5(url.encode("utf-8")).hexdigest()
    tables = await db.fetch(
        "SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE table_schema = 'public';"
    )
    assert (table_name in [r["table_name"] for r in tables]) == table_exists

    # check whether udata was pinged
    if force_analysis:
        webhook = rmock.requests[("PUT", URL(udata_url))][0].kwargs["json"]
        assert webhook.get("analysis:parsing:started_at")
        assert webhook.get("analysis:parsing:finished_at")
        assert webhook.get("analysis:parsing:error") is None
    else:
        assert ("PUT", URL(udata_url)) not in rmock.requests.keys()
184 changes: 0 additions & 184 deletions tests/test_api/test_api_legacy.py

This file was deleted.

46 changes: 41 additions & 5 deletions tests/test_parquet_export.py
@@ -1,12 +1,14 @@
import os
from io import BytesIO

import pyarrow.parquet as pq
import pytest

from udata_hydra.analysis.csv import (
    RESERVED_COLS,
+    csv_detective_routine,
+    csv_to_parquet,
    generate_records,
-    perform_csv_inspection,
)
from udata_hydra.utils.parquet import save_as_parquet

@@ -21,12 +23,12 @@
        ("catalog.xlsx", 2),
    ),
)
-async def test_parquet_conversion(
-    setup_catalog, rmock, db, fake_check, produce_mock, file_and_count
-):
+async def test_save_as_parquet(file_and_count):
    filename, expected_count = file_and_count
    file_path = f"tests/data/{filename}"
-    inspection: dict | None = await perform_csv_inspection(file_path)
+    inspection: dict | None = csv_detective_routine(
+        csv_file_path=file_path, output_profile=True, num_rows=-1, save_results=False
+    )
    assert inspection
    columns = inspection["columns"]
    columns = {
@@ -41,3 +43,37 @@ async def test_parquet_conversion(
    assert len(table) == expected_count
    fake_file = BytesIO()
    pq.write_table(table, fake_file)


@pytest.mark.parametrize(
    "parquet_config",
    (
        (False, 1, False),  # CSV_TO_PARQUET = False, MIN_LINES_FOR_PARQUET = 1
        (True, 1, True),  # CSV_TO_PARQUET = True, MIN_LINES_FOR_PARQUET = 1
        (True, 3, False),  # CSV_TO_PARQUET = True, MIN_LINES_FOR_PARQUET = 3
    ),
)
async def test_csv_to_parquet(mocker, parquet_config):
    async def execute_csv_to_parquet() -> tuple[str, int] | None:
        file_path = "tests/data/catalog.csv"
        inspection: dict | None = csv_detective_routine(
            csv_file_path=file_path, output_profile=True, num_rows=-1, save_results=False
        )
        assert inspection
        return await csv_to_parquet(
            file_path=file_path, inspection=inspection, table_name="test_table"
        )

    csv_to_parquet_config, min_lines_for_parquet_config, expected_conversion = parquet_config
    mocker.patch("udata_hydra.config.CSV_TO_PARQUET", csv_to_parquet_config)
    mocker.patch("udata_hydra.config.MIN_LINES_FOR_PARQUET", min_lines_for_parquet_config)

    if not expected_conversion:
        assert not await execute_csv_to_parquet()
    else:
        # TODO: don't use the exception as the assertion, better to mock the minio client sending the file
        with pytest.raises(ValueError, match="invalid bucket name"):
            await execute_csv_to_parquet()
        # Clean the remaining parquet file
        os.remove("test_table.parquet")
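
Note: these cases exercise the gating added by "Deactivate parquet export for small CSVs" [#216](https://github.com/datagouv/hydra/pull/216). A minimal sketch of the presumed decision, using local stand-ins for the udata_hydra.config values patched above (an assumption, not the actual implementation):

# Hypothetical sketch of the gating, not udata_hydra's actual code.
CSV_TO_PARQUET = True  # stand-in for udata_hydra.config.CSV_TO_PARQUET
MIN_LINES_FOR_PARQUET = 3  # stand-in for udata_hydra.config.MIN_LINES_FOR_PARQUET

def should_convert_to_parquet(total_lines: int) -> bool:
    """Convert only if the feature is enabled and the CSV is large enough."""
    return CSV_TO_PARQUET and total_lines >= MIN_LINES_FOR_PARQUET

# Mirrors the parametrized cases: with a small catalog.csv, raising
# MIN_LINES_FOR_PARQUET above its line count makes csv_to_parquet return None.
assert should_convert_to_parquet(100)
assert not should_convert_to_parquet(2)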
(diff for the remaining 14 changed files not shown)
