From 4a8411a4ff766fd1ebce640a84b8afd178f54bd8 Mon Sep 17 00:00:00 2001
From: Michael Terry
Date: Mon, 1 Jul 2024 14:20:36 -0400
Subject: [PATCH] feat: switch to cumulus-fhir-support's ndjson reading code

We now load any ndjson files that cumulus-fhir-support says to.
---
 CONTRIBUTING.md              | 14 ++++++++++++++
 cumulus_library/databases.py | 36 ++++++++----------------------------
 pyproject.toml               |  2 +-
 tests/test_duckdb.py         | 10 ++++------
 4 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 743578b8..292bb6f1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,6 +11,20 @@ pre-commit install
 This will install dependencies & build tools,
 as well as set up an auto-formatter commit hook.
 
+## Running tests
+
+Tests can be run with `pytest` like you might expect,
+but you'll first want to set up a fake `test` AWS profile.
+
+1. Edit `~/.aws/credentials`
+2. Add a block like:
+```
+[test]
+aws_access_key_id = test
+aws_secret_access_key = test
+```
+3. Then run `pytest`
+
 ## Adding new resources
 
 Things to keep in mind:
diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py
index 4e74b8f6..dcbf60d0 100644
--- a/cumulus_library/databases.py
+++ b/cumulus_library/databases.py
@@ -646,37 +646,15 @@ def parse_found_schema(self, schema: dict[str, str]) -> dict:
         return parsed
 
 
-def _read_rows_from_files(filenames: list[str]) -> list[dict]:
-    """Reads all provided ndjson files directly into memory"""
-    rows = []
-    for filename in sorted(filenames):
-        with open(filename, encoding="utf8") as f:
-            for line in f:
-                rows.append(json.loads(line))
-    return rows
-
-
-def _read_rows_from_table_dir(path: Path) -> list[dict]:
-    """Grab ndjson files in the Cumulus ETL output format: path/tablename/*.ndjson"""
-    if not path.exists():
-        return []
-
-    filenames = [str(x) for x in path.iterdir() if x.name.endswith(".ndjson")]
-    return _read_rows_from_files(filenames)
-
-
 def _read_rows_for_resource(path: Path, resource: str) -> list[dict]:
     rows = []
 
-    # Grab any ndjson files in Cumulus ETL input format: path/*.Resource.*.ndjson
-    if path.exists():
-        # This pattern is copied from the ETL, allowing a suffix or a numbered prefix.
-        pattern = re.compile(rf"([0-9]+\.)?{resource}(\.[^/]+)?\.ndjson")
-        filenames = [str(x) for x in path.iterdir() if pattern.match(x.name)]
-        rows += _read_rows_from_files(filenames)
+    # Support any ndjson files from the target folder directly
+    rows += list(cumulus_fhir_support.read_ndjson_from_dir(path, resource))
 
-    # Also grab any ndjson files in Cumulus ETL output format
-    rows += _read_rows_from_table_dir(path / resource.lower())
+    # Also support being given an ETL output folder, and look in the table subdir
+    subdir = path / resource.lower()
+    rows += list(cumulus_fhir_support.read_ndjson_from_dir(subdir, resource))
 
     return rows
 
@@ -721,7 +699,9 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]:
         "etl__completion_encounters",
     ]
     for metadata_table in metadata_tables:
-        rows = _read_rows_from_table_dir(Path(f"{path}/{metadata_table}"))
+        rows = list(
+            cumulus_fhir_support.read_ndjson_from_dir(f"{path}/{metadata_table}")
+        )
         if rows:
             # Auto-detecting the schema works for these simple tables
             all_tables[metadata_table] = pyarrow.Table.from_pylist(rows)
diff --git a/pyproject.toml b/pyproject.toml
index 154c97dd..bcd54842 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "cumulus-library"
 requires-python = ">= 3.10"
 dependencies = [
     "ctakesclient >= 1.3",
-    "cumulus-fhir-support >= 1.1",
+    "cumulus-fhir-support >= 1.2",
     "duckdb >= 0.9",
     "fhirclient >= 4.1",
     "Jinja2 > 3",
diff --git a/tests/test_duckdb.py b/tests/test_duckdb.py
index c71f1404..abce04a3 100644
--- a/tests/test_duckdb.py
+++ b/tests/test_duckdb.py
@@ -74,11 +74,8 @@ def test_duckdb_from_iso8601_timestamp(timestamp, expected):
 
 def test_duckdb_load_ndjson_dir(tmp_path):
     filenames = {
-        "A.Patient.ndjson": False,
-        "1.Patient.ndjson": True,
-        "Patient.ndjson": True,
-        "Patient.hello.bye.ndjson": True,
-        "Patient.nope": False,
+        "blarg.ndjson": True,
+        "blarg.nope": False,
         "patient/blarg.ndjson": True,
         "patient/blarg.meta": False,
     }
@@ -86,7 +83,7 @@ def test_duckdb_load_ndjson_dir(tmp_path):
     for index, (filename, valid) in enumerate(filenames.items()):
         with open(f"{tmp_path}/{filename}", "w", encoding="utf8") as f:
             row_id = f"Good{index}" if valid else f"Bad{index}"
-            f.write(f'{{"id":"{row_id}"}}')
+            f.write(f'{{"id":"{row_id}", "resourceType": "Patient"}}')
 
     db = databases.create_db_backend(
         {
@@ -115,6 +112,7 @@ def test_duckdb_table_schema():
     with open(f"{tmpdir}/observation/test.ndjson", "w", encoding="utf8") as ndjson:
         json.dump(
             {
+                "resourceType": "Observation",
                 "id": "test",
                 "component": [
                     {
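For reviewers who want to try the new loading behavior in isolation, here is a minimal sketch. It assumes only what the diff and its tests already show: `cumulus_fhir_support.read_ndjson_from_dir(folder, resource)` (available as of cumulus-fhir-support 1.2) yields row dicts from any `*.ndjson` file in the folder, selecting rows by their `resourceType` field rather than by the old `([0-9]+\.)?Resource(\.[^/]+)?\.ndjson` filename pattern.

```python
import json
import tempfile
from pathlib import Path

import cumulus_fhir_support

with tempfile.TemporaryDirectory() as tmpdir:
    # Arbitrary filename: the old regex would have skipped this file, but
    # the new code reads it because the row says it's a Patient.
    Path(tmpdir, "blarg.ndjson").write_text(
        json.dumps({"resourceType": "Patient", "id": "good"}), encoding="utf8"
    )
    # Right extension, wrong resourceType: its rows get filtered out.
    Path(tmpdir, "other.ndjson").write_text(
        json.dumps({"resourceType": "Observation", "id": "skipped"}), encoding="utf8"
    )
    rows = list(cumulus_fhir_support.read_ndjson_from_dir(tmpdir, "Patient"))
    print([row["id"] for row in rows])  # expected: ['good']
```

This is the same shift the updated `test_duckdb_load_ndjson_dir` encodes: `blarg.ndjson` only counts as valid because each written row now carries `"resourceType": "Patient"`.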