From 3a349737ed5a7fefd2bc3feec80dc725513d170c Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Wed, 7 Feb 2024 16:30:02 -0500 Subject: [PATCH] Be looser about case when checking schemas This fixes a couple of bugs: - We were not lower-casing the result of asking DuckDB for its schema. So we were searching inside its schema results for the exact camel-case string in DuckDB, but Athena would need a lower-case string. That could not work in both. So double-word depth-two fields would never be matched. - For depth-two fields, we were also assuming that we would be given fields in the same order that the SQL table would return them. So if you didn't provide them in that precise order (like if you used start, end but the SQL table had end, start), you'd miss one. This was the exact case for Encounter fields in DuckDB. This commit allows the following new features/conveniences: - Case-insensitive schema lookups (i.e. table name and column name arguments to get_column_datatype_query() can be any case) - Case-insensitive schema validation (i.e. the expected column names passed to validate_table_schema() can be any case) - The resulting validated schema will use the original case used by the expected-fields dictionary. That way the user will not be surprised: we'll use camel case if they do, and lower case if they do. 
--- .github/workflows/ci.yaml | 2 +- cumulus_library/databases.py | 51 +++++++------- .../core/core_templates/core_utils.jinja | 6 +- .../template_sql/column_datatype.sql.jinja | 4 +- tests/test_data/core/core__encounter.txt | 4 +- tests/test_data/core/core__encounter_type.txt | 4 +- tests/test_duckdb.py | 67 +++++++++++++++++-- tests/test_templates.py | 4 +- 8 files changed, 99 insertions(+), 43 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0ca2b5b1..e3cc746d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -80,7 +80,7 @@ jobs: WG: cumulus DB: cumulus_library_regression_db run: | - cumulus-library build -t core --profile $PROFILE --workgroup $WG --database $DB + cumulus-library build -t core --profile $PROFILE --workgroup $WG --database $DB --verbose cumulus-library export -t core ./tests/regression/data_export/ --profile $PROFILE --workgroup $WG --database $DB - name: Compare vs known data run: python ./tests/regression/run_regression.py diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index 789d415d..368de1a1 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -44,7 +44,9 @@ def fetchall(self) -> list[list] | None: class DatabaseParser(abc.ABC): """Parses information_schema results from a database""" - def _parse_found_schema(self, expected: dict[dict[list]], schema: list[list]): + def _parse_found_schema( + self, expected: dict[dict[list]], schema: dict[list] + ) -> dict: """Checks for presence of field for each column in a table :param expected: A nested dict describing the expected data format of @@ -64,36 +66,33 @@ def _parse_found_schema(self, expected: dict[dict[list]], schema: list[list]): is a first pass, ignoring complexities of differing database variable types, just iterating through looking for column names. 
- - TODO: on a per database instance, consider a more nuanced approach - if needed + TODO: on a per database instance, consider a more nuanced approach if needed + (compared to just checking if the schema contains the field name) """ output = {} - for column, _ in expected.items(): - output[column] = {} - if col_schema := schema[column.lower()]: - # is this an object column? - if len(expected[column]) > 0: - for field in expected[column]: - col_schema = col_schema.split(field, 1) - if len(col_schema) != 2: - output[column][field] = False - col_schema = col_schema[0] - else: - output[column][field] = True - col_schema = col_schema[1] - # otherwise this is a primitive col - else: - output[column] = True + + for column, fields in expected.items(): + column_lower = column.lower() + + # is this an object column? (like: "subject": ["reference"]) + if fields: + col_schema = schema.get(column_lower, "").lower() + output[column] = { + # TODO: make this check more robust + field: field.lower() in col_schema + for field in fields + } + + # otherwise this is a primitive col (like: "recordedDate": None) else: - for field in expected[column]: - output[column][field] = False + output[column] = column_lower in schema + return output @abc.abstractmethod def validate_table_schema( - self, expected: dict[dict[list]], schema: list[tuple] - ) -> dict[bool]: + self, expected: dict[str, list[str]], schema: list[tuple] + ) -> dict: """Public interface for investigating if fields are in a table schema. 
This method should lightly format results and pass them to @@ -182,7 +181,7 @@ def close(self) -> None: class AthenaParser(DatabaseParser): def validate_table_schema( self, expected: dict[dict[list]], schema: list[list] - ) -> bool: + ) -> dict: schema = dict(schema) return self._parse_found_schema(expected, schema) @@ -291,6 +290,8 @@ def _compat_from_iso8601_timestamp( else: return datetime.datetime(int(pieces[0]), int(pieces[1]), 1) + # TODO: return timezone-aware datetimes, like Athena does + # (this currently generates naive datetimes, in UTC local time) return datetime.datetime.fromisoformat(value) def cursor(self) -> duckdb.DuckDBPyConnection: diff --git a/cumulus_library/studies/core/core_templates/core_utils.jinja b/cumulus_library/studies/core/core_templates/core_utils.jinja index 5ba06746..5b711286 100644 --- a/cumulus_library/studies/core/core_templates/core_utils.jinja +++ b/cumulus_library/studies/core/core_templates/core_utils.jinja @@ -90,7 +90,7 @@ targets is assumed to be a list of tuples of one of the following format: {%- for col in targets %} {%- if col is not string and col|length ==4%} {%- if schema[table][col[0]][col[1]] %} - date_trunc('{{ col[3] }}', date(from_iso8601_timestamp({{ alias }}.{{ col[0] }}.{{ col[1] }}))) + date_trunc('{{ col[3] }}', date(from_iso8601_timestamp({{ alias }}."{{ col[0] }}"."{{ col[1] }}"))) AS {{ col[2] }} {%- else %} cast(NULL AS date) AS {{col[2]}} @@ -98,14 +98,14 @@ targets is assumed to be a list of tuples of one of the following format: {#- depth two nested column -#} {%- elif col is not string and col|length ==5%} {%- if schema[table][col[0]][col[2]] %} - date_trunc('{{ col[4] }}', date(from_iso8601_timestamp({{ alias }}.{{ col[0] }}.{{ col[1] }}.{{col[2]}}))) + date_trunc('{{ col[4] }}', date(from_iso8601_timestamp({{ alias }}."{{ col[0] }}"."{{ col[1] }}"."{{col[2]}}"))) AS {{ col[3] }} {%- else %} cast(NULL AS date) AS {{col[3]}} {%- endif %} {#- SQL primitive column column-#} {%- elif 
schema[table][col[0]] %} - date_trunc('{{ col[1] }}', date(from_iso8601_timestamp({{ alias }}.{{ col[0] }}))) + date_trunc('{{ col[1] }}', date(from_iso8601_timestamp({{ alias }}."{{ col[0] }}"))) AS {{ col[0] }}_{{ col[1] }} {%- else %} cast(NULL AS date) AS {{ col[0] }}_{{ col[1] }} diff --git a/cumulus_library/template_sql/column_datatype.sql.jinja b/cumulus_library/template_sql/column_datatype.sql.jinja index 334def33..b3c568aa 100644 --- a/cumulus_library/template_sql/column_datatype.sql.jinja +++ b/cumulus_library/template_sql/column_datatype.sql.jinja @@ -4,5 +4,5 @@ SELECT FROM information_schema.columns WHERE table_schema = '{{ schema_name }}' - AND table_name = '{{ table_name }}' - AND LOWER(column_name) IN ('{{ column_names|join("', '") }}') --noqa: LT05 + AND table_name = '{{ table_name|lower }}' + AND LOWER(column_name) IN ('{{ column_names|join("', '")|lower }}') --noqa: LT05 diff --git a/tests/test_data/core/core__encounter.txt b/tests/test_data/core/core__encounter.txt index a2c34dec..a4871e67 100644 --- a/tests/test_data/core/core__encounter.txt +++ b/tests/test_data/core/core__encounter.txt @@ -21,7 +21,7 @@ ('75312bd2-d5ac-c62e-c9df-0004783725c7', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 11, datetime.date(2018, 7, 2), datetime.date(2018, 7, 2), datetime.date(2018, 7, 2), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/3ae095ec-8fe0-133b-36d4-8785a6ad8df3', 'Encounter/75312bd2-d5ac-c62e-c9df-0004783725c7', 'female', 'white', 'not hispanic or latino', '662') ('75b68644-535d-bdc1-4c31-aa9fe7e1822f', {'id': None, 'code': 'EMER', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'EMER', 'emergency', 'finished', None, None, None, None, None, None, '55680006', 'http://snomed.info/sct', 
25, datetime.date(2018, 6, 30), datetime.date(2018, 6, 30), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/82b8a670-4700-30e8-24a0-b83efa3c5e0a', 'Encounter/75b68644-535d-bdc1-4c31-aa9fe7e1822f', 'female', 'white', 'not hispanic or latino', '672') ('79d8f213-7847-646b-8a66-5da208cc1c27', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '431857002', 'http://snomed.info/sct', 77, datetime.date(2018, 7, 14), datetime.date(2018, 7, 14), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/26a3984f-b2a8-e67f-7abc-ff147a0e6e35', 'Encounter/79d8f213-7847-646b-8a66-5da208cc1c27', 'male', 'white', 'not hispanic or latino', '674') -('83d0d564-3bbf-48eb-7445-bd2b81130671', {'id': None, 'code': 'EMER', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'EMER', 'emergency', 'finished', None, None, None, None, None, None, '55680006', 'http://snomed.info/sct', 69, datetime.date(2018, 6, 16), datetime.date(2018, 6, 16), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/16be855b-ece2-8b96-1ef9-a4d93adf3289', 'Encounter/83d0d564-3bbf-48eb-7445-bd2b81130671', 'male', 'white', 'not hispanic or latino', '660') +('83d0d564-3bbf-48eb-7445-bd2b81130671', {'id': None, 'code': 'EMER', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'EMER', 'emergency', 'finished', None, None, None, None, None, None, '55680006', 'http://snomed.info/sct', 69, datetime.date(2018, 6, 16), datetime.date(2018, 6, 17), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/16be855b-ece2-8b96-1ef9-a4d93adf3289', 
'Encounter/83d0d564-3bbf-48eb-7445-bd2b81130671', 'male', 'white', 'not hispanic or latino', '660') ('8ff1dc01-5a28-b2d8-3b42-4b7a7d539970', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 55, datetime.date(2018, 7, 15), datetime.date(2018, 7, 15), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/c1bfec36-dc2c-afc8-c767-3d35ed2bf6f0', 'Encounter/8ff1dc01-5a28-b2d8-3b42-4b7a7d539970', 'female', 'white', 'not hispanic or latino', '662') ('91f94a9d-69a7-e30a-cd1a-68c52dc01e70', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 22, datetime.date(2018, 6, 29), datetime.date(2018, 6, 29), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/a28be3e1-a6bd-7df4-fc81-1140848f8453', 'Encounter/91f94a9d-69a7-e30a-cd1a-68c52dc01e70', 'female', 'white', 'not hispanic or latino', '662') ('98d4bd14-d78e-debb-e7dc-2df7786aedf3', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 89, datetime.date(2018, 7, 9), datetime.date(2018, 7, 9), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/e455ca3f-fc16-6ffc-297a-adc27e2db183', 'Encounter/98d4bd14-d78e-debb-e7dc-2df7786aedf3', 'male', 'white', 'hispanic or latino', '667') @@ -45,6 +45,6 @@ ('e613f29d-7505-6f2e-a1f5-bfbec300752d', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 
'finished', None, None, None, None, None, None, '88805009', 'http://snomed.info/sct', 84, datetime.date(2018, 6, 29), datetime.date(2018, 6, 29), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/1c498b42-61fd-6341-69c3-053f6e4fe404', 'Encounter/e613f29d-7505-6f2e-a1f5-bfbec300752d', 'female', 'white', 'not hispanic or latino', '667') ('e922a884-7039-a171-a65e-78051fe7afe6', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 1, datetime.date(2018, 6, 13), datetime.date(2018, 6, 13), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/8022fbbe-aaa4-056c-d0f5-ec074bf0656c', 'Encounter/e922a884-7039-a171-a65e-78051fe7afe6', 'male', 'white', 'not hispanic or latino', '665') ('ed151e04-3dd6-8cb7-a3e5-777c8a8667f1', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '195662009', 'http://snomed.info/sct', 57, datetime.date(2018, 6, 6), datetime.date(2018, 6, 6), datetime.date(2018, 6, 4), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/19158de4-66a2-f70f-e3bf-4396b312c8f1', 'Encounter/ed151e04-3dd6-8cb7-a3e5-777c8a8667f1', 'female', 'white', 'not hispanic or latino', '000') -('f2752dd7-1bf1-739d-dd8c-40122d0b63bc', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '161665007', 'http://snomed.info/sct', 77, datetime.date(2018, 6, 1), datetime.date(2018, 6, 1), datetime.date(2018, 5, 28), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 
'Patient/9c8d8539-0b1e-73e2-b64f-83f1ea601fa4', 'Encounter/f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'male', 'white', 'not hispanic or latino', '674') +('f2752dd7-1bf1-739d-dd8c-40122d0b63bc', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '161665007', 'http://snomed.info/sct', 77, datetime.date(2018, 6, 1), datetime.date(2018, 6, 2), datetime.date(2018, 5, 28), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/9c8d8539-0b1e-73e2-b64f-83f1ea601fa4', 'Encounter/f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'male', 'white', 'not hispanic or latino', '674') ('f964be66-3fcd-95c8-0021-71c7d24c91b7', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '431857002', 'http://snomed.info/sct', 63, datetime.date(2018, 6, 5), datetime.date(2018, 6, 5), datetime.date(2018, 6, 4), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/a5bc08ea-9462-c4f5-1bd2-ff342598ac99', 'Encounter/f964be66-3fcd-95c8-0021-71c7d24c91b7', 'female', 'white', 'not hispanic or latino', '661') ('fd0754a4-e96d-cba7-b3c0-77697a09c86e', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 100, datetime.date(2018, 7, 10), datetime.date(2018, 7, 10), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/6385ddd7-2639-6505-3789-0521b8f66c8b', 'Encounter/fd0754a4-e96d-cba7-b3c0-77697a09c86e', 'female', 'white', 'not hispanic or latino', '672') diff --git a/tests/test_data/core/core__encounter_type.txt b/tests/test_data/core/core__encounter_type.txt index 
308fda57..732a4af7 100644 --- a/tests/test_data/core/core__encounter_type.txt +++ b/tests/test_data/core/core__encounter_type.txt @@ -21,7 +21,7 @@ ('75312bd2-d5ac-c62e-c9df-0004783725c7', 'AMB', 'ambulatory', 'http://snomed.info/sct', '410620009', 'Well child visit (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'finished', 11, datetime.date(2018, 7, 2), datetime.date(2018, 7, 2), datetime.date(2018, 7, 2), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/3ae095ec-8fe0-133b-36d4-8785a6ad8df3', 'Encounter/75312bd2-d5ac-c62e-c9df-0004783725c7', 'female', 'white', 'not hispanic or latino', '662') ('75b68644-535d-bdc1-4c31-aa9fe7e1822f', 'EMER', 'emergency', 'http://snomed.info/sct', '50849002', 'Emergency room admission (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', '55680006', 'Drug overdose', 'finished', 25, datetime.date(2018, 6, 30), datetime.date(2018, 6, 30), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/82b8a670-4700-30e8-24a0-b83efa3c5e0a', 'Encounter/75b68644-535d-bdc1-4c31-aa9fe7e1822f', 'female', 'white', 'not hispanic or latino', '672') ('79d8f213-7847-646b-8a66-5da208cc1c27', 'AMB', 'ambulatory', 'http://snomed.info/sct', '185347001', 'Encounter for problem (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', '431857002', 'Chronic kidney disease stage 4 (disorder)', 'finished', 77, datetime.date(2018, 7, 14), datetime.date(2018, 7, 14), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/26a3984f-b2a8-e67f-7abc-ff147a0e6e35', 'Encounter/79d8f213-7847-646b-8a66-5da208cc1c27', 'male', 'white', 'not hispanic or latino', '674') -('83d0d564-3bbf-48eb-7445-bd2b81130671', 'EMER', 'emergency', 'http://snomed.info/sct', '50849002', 'Emergency room admission (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', '55680006', 'Drug overdose', 'finished', 69, datetime.date(2018, 6, 16), datetime.date(2018, 6, 
16), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/16be855b-ece2-8b96-1ef9-a4d93adf3289', 'Encounter/83d0d564-3bbf-48eb-7445-bd2b81130671', 'male', 'white', 'not hispanic or latino', '660') +('83d0d564-3bbf-48eb-7445-bd2b81130671', 'EMER', 'emergency', 'http://snomed.info/sct', '50849002', 'Emergency room admission (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', '55680006', 'Drug overdose', 'finished', 69, datetime.date(2018, 6, 16), datetime.date(2018, 6, 17), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/16be855b-ece2-8b96-1ef9-a4d93adf3289', 'Encounter/83d0d564-3bbf-48eb-7445-bd2b81130671', 'male', 'white', 'not hispanic or latino', '660') ('8ff1dc01-5a28-b2d8-3b42-4b7a7d539970', 'AMB', 'ambulatory', 'http://snomed.info/sct', '185349003', 'Encounter for check up (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'finished', 55, datetime.date(2018, 7, 15), datetime.date(2018, 7, 15), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/c1bfec36-dc2c-afc8-c767-3d35ed2bf6f0', 'Encounter/8ff1dc01-5a28-b2d8-3b42-4b7a7d539970', 'female', 'white', 'not hispanic or latino', '662') ('91f94a9d-69a7-e30a-cd1a-68c52dc01e70', 'AMB', 'ambulatory', 'http://snomed.info/sct', '308335008', 'Patient encounter procedure', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'finished', 22, datetime.date(2018, 6, 29), datetime.date(2018, 6, 29), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/a28be3e1-a6bd-7df4-fc81-1140848f8453', 'Encounter/91f94a9d-69a7-e30a-cd1a-68c52dc01e70', 'female', 'white', 'not hispanic or latino', '662') ('98d4bd14-d78e-debb-e7dc-2df7786aedf3', 'AMB', 'ambulatory', 'http://snomed.info/sct', '185349003', 'Encounter for check up (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'finished', 89, datetime.date(2018, 
7, 9), datetime.date(2018, 7, 9), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/e455ca3f-fc16-6ffc-297a-adc27e2db183', 'Encounter/98d4bd14-d78e-debb-e7dc-2df7786aedf3', 'male', 'white', 'hispanic or latino', '667') @@ -45,6 +45,6 @@ ('e613f29d-7505-6f2e-a1f5-bfbec300752d', 'AMB', 'ambulatory', 'http://snomed.info/sct', '448337001', 'Telemedicine consultation with patient', 'None', 'None', 'None', 'None', 'None', 'None', '88805009', 'Chronic congestive heart failure (disorder)', 'finished', 84, datetime.date(2018, 6, 29), datetime.date(2018, 6, 29), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/1c498b42-61fd-6341-69c3-053f6e4fe404', 'Encounter/e613f29d-7505-6f2e-a1f5-bfbec300752d', 'female', 'white', 'not hispanic or latino', '667') ('e922a884-7039-a171-a65e-78051fe7afe6', 'AMB', 'ambulatory', 'http://snomed.info/sct', '410620009', 'Well child visit (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'finished', 1, datetime.date(2018, 6, 13), datetime.date(2018, 6, 13), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/8022fbbe-aaa4-056c-d0f5-ec074bf0656c', 'Encounter/e922a884-7039-a171-a65e-78051fe7afe6', 'male', 'white', 'not hispanic or latino', '665') ('ed151e04-3dd6-8cb7-a3e5-777c8a8667f1', 'AMB', 'ambulatory', 'http://snomed.info/sct', '185345009', 'Encounter for symptom', 'None', 'None', 'None', 'None', 'None', 'None', '195662009', 'Acute viral pharyngitis (disorder)', 'finished', 57, datetime.date(2018, 6, 6), datetime.date(2018, 6, 6), datetime.date(2018, 6, 4), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/19158de4-66a2-f70f-e3bf-4396b312c8f1', 'Encounter/ed151e04-3dd6-8cb7-a3e5-777c8a8667f1', 'female', 'white', 'not hispanic or latino', '000') -('f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'AMB', 'ambulatory', 'http://snomed.info/sct', '439740005', 'Postoperative follow-up visit 
(procedure)', 'None', 'None', 'None', 'None', 'None', 'None', '161665007', 'History of renal transplant (situation)', 'finished', 77, datetime.date(2018, 6, 1), datetime.date(2018, 6, 1), datetime.date(2018, 5, 28), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/9c8d8539-0b1e-73e2-b64f-83f1ea601fa4', 'Encounter/f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'male', 'white', 'not hispanic or latino', '674') +('f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'AMB', 'ambulatory', 'http://snomed.info/sct', '439740005', 'Postoperative follow-up visit (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', '161665007', 'History of renal transplant (situation)', 'finished', 77, datetime.date(2018, 6, 1), datetime.date(2018, 6, 2), datetime.date(2018, 5, 28), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/9c8d8539-0b1e-73e2-b64f-83f1ea601fa4', 'Encounter/f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'male', 'white', 'not hispanic or latino', '674') ('f964be66-3fcd-95c8-0021-71c7d24c91b7', 'AMB', 'ambulatory', 'http://snomed.info/sct', '185347001', 'Encounter for problem (procedure)', 'None', 'None', 'None', 'None', 'None', 'None', '431857002', 'Chronic kidney disease stage 4 (disorder)', 'finished', 63, datetime.date(2018, 6, 5), datetime.date(2018, 6, 5), datetime.date(2018, 6, 4), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/a5bc08ea-9462-c4f5-1bd2-ff342598ac99', 'Encounter/f964be66-3fcd-95c8-0021-71c7d24c91b7', 'female', 'white', 'not hispanic or latino', '661') ('fd0754a4-e96d-cba7-b3c0-77697a09c86e', 'AMB', 'ambulatory', 'http://snomed.info/sct', '702927004', 'Urgent care clinic (environment)', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'finished', 100, datetime.date(2018, 7, 10), datetime.date(2018, 7, 10), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/6385ddd7-2639-6505-3789-0521b8f66c8b', 'Encounter/fd0754a4-e96d-cba7-b3c0-77697a09c86e', 'female', 'white', 'not hispanic 
or latino', '672') diff --git a/tests/test_duckdb.py b/tests/test_duckdb.py index 517463ac..2db351c7 100644 --- a/tests/test_duckdb.py +++ b/tests/test_duckdb.py @@ -1,15 +1,17 @@ """ tests for duckdb backend support """ import glob +import json import os import tempfile -from datetime import datetime, timedelta, timezone +from datetime import datetime from pathlib import Path from unittest import mock import pytest from cumulus_library import cli, databases +from cumulus_library.template_sql import base_templates @mock.patch.dict( @@ -55,12 +57,12 @@ def test_duckdb_core_build_and_export(): @pytest.mark.parametrize( "timestamp,expected", [ - ("2021", datetime(2021, 1, 1, tzinfo=timezone.utc)), - ("2019-10", datetime(2019, 10, 1, tzinfo=timezone.utc)), - ("1923-01-23", datetime(1923, 1, 23, tzinfo=timezone.utc)), + ("2021", datetime(2021, 1, 1)), + ("2019-10", datetime(2019, 10, 1)), + ("1923-01-23", datetime(1923, 1, 23)), ( "2023-01-16T07:55:25-05:00", - datetime(2023, 1, 16, 7, 55, 25, tzinfo=timezone(timedelta(hours=-5))), + datetime(2023, 1, 16, 12, 55, 25), ), ], ) @@ -71,4 +73,57 @@ def test_duckdb_from_iso8601_timestamp(timestamp, expected): .execute(f"select from_iso8601_timestamp('{timestamp}')") .fetchone()[0] ) - assert parsed, expected + assert parsed == expected + + +def test_duckdb_table_schema(): + """Verify we can detect schemas correctly, even for nested camel case fields""" + db = databases.DuckDatabaseBackend(":memory:") + + with tempfile.TemporaryDirectory() as tmpdir: + os.mkdir(f"{tmpdir}/observation") + with open(f"{tmpdir}/observation/test.ndjson", "w", encoding="utf8") as ndjson: + json.dump( + { + "id": "test", + "component": [{"dataAbsentReason": {"text": "Dunno"}}], + "valueBoolean": False, + }, + ndjson, + ) + + db.insert_tables(databases.read_ndjson_dir(tmpdir)) + + # Look for a mix of camel-cased and lower-cased fields. Both should work. 
+ target_schema = { + "bodySite": [], + "CoMpOnEnT": ["dataabsentreason", "valueQuantity"], + "not_a_real_field": [], + "valueboolean": [], + } + + # Query database for what exists right now as a schema + query = base_templates.get_column_datatype_query( + # Use a mixed-case table name + db.schema_name, + "Observation", + list(target_schema.keys()), + ) + actual_schema = db.cursor().execute(query).fetchall() + + # Validate that schema against what we were looking for + validated_schema = db.parser().validate_table_schema( + target_schema, actual_schema + ) + # Note the all mixed-case results. + # These are guaranteed to be the same case as the expected/target schema. + expected_schema = { + "bodySite": True, # real toplevel fields are guaranteed to be in schema + "CoMpOnEnT": { + "dataabsentreason": True, + "valueQuantity": False, + }, + "not_a_real_field": False, + "valueboolean": True, + } + assert validated_schema == expected_schema diff --git a/tests/test_templates.py b/tests/test_templates.py index ab89b3ec..8c5d85bb 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -168,8 +168,8 @@ def test_get_column_datatype_query(): query = base_templates.get_column_datatype_query( schema_name="schema_name", - table_name="table_name", - column_names=["foo", "bar"], + table_name="TABLE_NAME", + column_names=["foo", "BAR"], ) assert query == expected