From 4a8411a4ff766fd1ebce640a84b8afd178f54bd8 Mon Sep 17 00:00:00 2001
From: Michael Terry
Date: Mon, 1 Jul 2024 14:20:36 -0400
Subject: [PATCH] feat: switch to cumulus-fhir-support's ndjson reading code

We now load any ndjson files that cumulus-fhir-support says to.
---
 CONTRIBUTING.md              | 14 ++++++++++++++
 cumulus_library/databases.py | 36 ++++++++----------------------------
 pyproject.toml               |  2 +-
 tests/test_duckdb.py         | 10 ++++------
 4 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 743578b8..292bb6f1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,6 +11,20 @@ pre-commit install
 This will install dependencies & build tools,
 as well as set up an auto-formatter commit hook.
 
+## Running tests
+
+Tests can be run with `pytest` like you might expect,
+but you'll first want to set up a fake `test` AWS profile.
+
+1. Edit `~/.aws/credentials`
+2. Add a block like:
+```
+[test]
+aws_access_key_id = test
+aws_secret_access_key = test
+```
+3. Then run `pytest`
+
 ## Adding new resources
 
 Things to keep in mind:
diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py
index 4e74b8f6..dcbf60d0 100644
--- a/cumulus_library/databases.py
+++ b/cumulus_library/databases.py
@@ -646,37 +646,15 @@ def parse_found_schema(self, schema: dict[str, str]) -> dict:
         return parsed
 
 
-def _read_rows_from_files(filenames: list[str]) -> list[dict]:
-    """Reads all provided ndjson files directly into memory"""
-    rows = []
-    for filename in sorted(filenames):
-        with open(filename, encoding="utf8") as f:
-            for line in f:
-                rows.append(json.loads(line))
-    return rows
-
-
-def _read_rows_from_table_dir(path: Path) -> list[dict]:
-    """Grab ndjson files in the Cumulus ETL output format: path/tablename/*.ndjson"""
-    if not path.exists():
-        return []
-
-    filenames = [str(x) for x in path.iterdir() if x.name.endswith(".ndjson")]
-    return _read_rows_from_files(filenames)
-
-
 def _read_rows_for_resource(path: Path, resource: str) -> list[dict]:
     rows = []
 
-    # Grab any ndjson files in Cumulus ETL input format: path/*.Resource.*.ndjson
-    if path.exists():
-        # This pattern is copied from the ETL, allowing a suffix or a numbered prefix.
-        pattern = re.compile(rf"([0-9]+\.)?{resource}(\.[^/]+)?\.ndjson")
-        filenames = [str(x) for x in path.iterdir() if pattern.match(x.name)]
-        rows += _read_rows_from_files(filenames)
+    # Support any ndjson files from the target folder directly
+    rows += list(cumulus_fhir_support.read_ndjson_from_dir(path, resource))
 
-    # Also grab any ndjson files in Cumulus ETL output format
-    rows += _read_rows_from_table_dir(path / resource.lower())
+    # Also support being given an ETL output folder, and look in the table subdir
+    subdir = path / resource.lower()
+    rows += list(cumulus_fhir_support.read_ndjson_from_dir(subdir, resource))
 
     return rows
 
@@ -721,7 +699,9 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]:
         "etl__completion_encounters",
     ]
     for metadata_table in metadata_tables:
-        rows = _read_rows_from_table_dir(Path(f"{path}/{metadata_table}"))
+        rows = list(
+            cumulus_fhir_support.read_ndjson_from_dir(f"{path}/{metadata_table}")
+        )
         if rows:
             # Auto-detecting the schema works for these simple tables
             all_tables[metadata_table] = pyarrow.Table.from_pylist(rows)
diff --git a/pyproject.toml b/pyproject.toml
index 154c97dd..bcd54842 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "cumulus-library"
 requires-python = ">= 3.10"
 dependencies = [
     "ctakesclient >= 1.3",
-    "cumulus-fhir-support >= 1.1",
+    "cumulus-fhir-support >= 1.2",
     "duckdb >= 0.9",
     "fhirclient >= 4.1",
     "Jinja2 > 3",
diff --git a/tests/test_duckdb.py b/tests/test_duckdb.py
index c71f1404..abce04a3 100644
--- a/tests/test_duckdb.py
+++ b/tests/test_duckdb.py
@@ -74,11 +74,8 @@ def test_duckdb_from_iso8601_timestamp(timestamp, expected):
 
 def test_duckdb_load_ndjson_dir(tmp_path):
     filenames = {
-        "A.Patient.ndjson": False,
-        "1.Patient.ndjson": True,
-        "Patient.ndjson": True,
-        "Patient.hello.bye.ndjson": True,
-        "Patient.nope": False,
+        "blarg.ndjson": True,
+        "blarg.nope": False,
         "patient/blarg.ndjson": True,
         "patient/blarg.meta": False,
     }
@@ -86,7 +83,7 @@ def test_duckdb_load_ndjson_dir(tmp_path):
     for index, (filename, valid) in enumerate(filenames.items()):
         with open(f"{tmp_path}/{filename}", "w", encoding="utf8") as f:
             row_id = f"Good{index}" if valid else f"Bad{index}"
-            f.write(f'{{"id":"{row_id}"}}')
+            f.write(f'{{"id":"{row_id}", "resourceType": "Patient"}}')
 
     db = databases.create_db_backend(
         {
@@ -115,6 +112,7 @@ def test_duckdb_table_schema():
     with open(f"{tmpdir}/observation/test.ndjson", "w", encoding="utf8") as ndjson:
         json.dump(
             {
+                "resourceType": "Observation",
                 "id": "test",
                 "component": [
                     {
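For reviewers who want to try the new loading behavior in isolation, here is a minimal sketch. It assumes only what the diff and its tests already show: `cumulus_fhir_support.read_ndjson_from_dir(folder, resource)` (available as of cumulus-fhir-support 1.2) yields row dicts from any `*.ndjson` file in the folder, selecting rows by their `resourceType` field rather than by the old `([0-9]+\.)?Resource(\.[^/]+)?\.ndjson` filename pattern.

```python
import json
import tempfile
from pathlib import Path

import cumulus_fhir_support

with tempfile.TemporaryDirectory() as tmpdir:
    # Arbitrary filename: the old regex would have skipped this file, but
    # the new code reads it because the row says it's a Patient.
    Path(tmpdir, "blarg.ndjson").write_text(
        json.dumps({"resourceType": "Patient", "id": "good"}), encoding="utf8"
    )
    # Right extension, wrong resourceType: its rows get filtered out.
    Path(tmpdir, "other.ndjson").write_text(
        json.dumps({"resourceType": "Observation", "id": "skipped"}), encoding="utf8"
    )
    rows = list(cumulus_fhir_support.read_ndjson_from_dir(tmpdir, "Patient"))
    print([row["id"] for row in rows])  # expected: ['good']
```

This is the same shift the updated `test_duckdb_load_ndjson_dir` encodes: `blarg.ndjson` only counts as valid because each written row now carries `"resourceType": "Patient"`.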