Skip to content

Commit

Permalink
Be looser about case when checking schemas
Browse files Browse the repository at this point in the history
This fixes a couple bugs:
- We were not lower-casing the result of asking DuckDB for its schema.
  So we were searching inside its schema results for the exact camel
  case string in duckdb but Athena would need a lower case string.
  Impossible to work in both. So double-word depth-two-fields would
  never be matched.
- For depth-two fields, we were also assuming that we would be given
  fields in the same order that the SQL table would return them.
  So if you didn't provide them in that precise order (like if you used
  start, end but the SQL table had end, start -- you'd miss one).
  This was the exact case for Encounter fields in duckdb.

This commit allows the following new features/conveniences:
- Case-insensitive schema lookups (i.e. table name and column name
  arguments to get_column_datatype_query() can be any case)
- Case-insensitive schema validation (i.e. the expected column names
  passed to validate_table_schema() can be any case)
- The resulting validated schema will use the original case used by
  the expected-fields dictionary. So that the user will not be
  surprised and we'll use camel case if they do, lower case if they do.
  • Loading branch information
mikix committed Feb 8, 2024
1 parent becfbad commit e98aecd
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 40 deletions.
51 changes: 26 additions & 25 deletions cumulus_library/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ def fetchall(self) -> list[list] | None:
class DatabaseParser(abc.ABC):
"""Parses information_schema results from a database"""

def _parse_found_schema(self, expected: dict[dict[list]], schema: list[list]):
def _parse_found_schema(
self, expected: dict[dict[list]], schema: dict[list]
) -> dict:
"""Checks for presence of field for each column in a table
:param expected: A nested dict describing the expected data format of
Expand All @@ -64,36 +66,33 @@ def _parse_found_schema(self, expected: dict[dict[list]], schema: list[list]):
is a first pass, ignoring complexities of differing database variable
types, just iterating through looking for column names.
TODO: on a per database instance, consider a more nuanced approach
if needed
TODO: on a per database instance, consider a more nuanced approach if needed
(compared to just checking if the schema contains the field name)
"""
output = {}
for column, _ in expected.items():
output[column] = {}
if col_schema := schema[column.lower()]:
# is this an object column?
if len(expected[column]) > 0:
for field in expected[column]:
col_schema = col_schema.split(field, 1)
if len(col_schema) != 2:
output[column][field] = False
col_schema = col_schema[0]
else:
output[column][field] = True
col_schema = col_schema[1]
# otherwise this is a primitive col
else:
output[column] = True

for column, fields in expected.items():
column_lower = column.lower()

# is this an object column? (like: "subject": ["reference"])
if fields:
col_schema = schema.get(column_lower, "").lower()
output[column] = {
# TODO: make this check more robust
field: field.lower() in col_schema
for field in fields
}

# otherwise this is a primitive col (like: "recordedDate": None)
else:
for field in expected[column]:
output[column][field] = False
output[column] = column_lower in schema

return output

@abc.abstractmethod
def validate_table_schema(
self, expected: dict[dict[list]], schema: list[tuple]
) -> dict[bool]:
self, expected: dict[str, list[str]], schema: list[tuple]
) -> dict:
"""Public interface for investigating if fields are in a table schema.
This method should lightly format results and pass them to
Expand Down Expand Up @@ -182,7 +181,7 @@ def close(self) -> None:
class AthenaParser(DatabaseParser):
def validate_table_schema(
self, expected: dict[dict[list]], schema: list[list]
) -> bool:
) -> dict:
schema = dict(schema)
return self._parse_found_schema(expected, schema)

Expand Down Expand Up @@ -291,6 +290,8 @@ def _compat_from_iso8601_timestamp(
else:
return datetime.datetime(int(pieces[0]), int(pieces[1]), 1)

# TODO: return timezone-aware datetimes, like Athena does
# (this currently generates naive datetimes, in UTC local time)
return datetime.datetime.fromisoformat(value)

def cursor(self) -> duckdb.DuckDBPyConnection:
Expand Down
4 changes: 2 additions & 2 deletions cumulus_library/template_sql/column_datatype.sql.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ SELECT
FROM information_schema.columns
WHERE
table_schema = '{{ schema_name }}'
AND table_name = '{{ table_name }}'
AND LOWER(column_name) IN ('{{ column_names|join("', '") }}') --noqa: LT05
AND LOWER(table_name) = '{{ table_name|lower }}'
AND LOWER(column_name) IN ('{{ column_names|join("', '")|lower }}') --noqa: LT05
4 changes: 2 additions & 2 deletions tests/test_data/core/core__encounter.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
('75312bd2-d5ac-c62e-c9df-0004783725c7', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 11, datetime.date(2018, 7, 2), datetime.date(2018, 7, 2), datetime.date(2018, 7, 2), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/3ae095ec-8fe0-133b-36d4-8785a6ad8df3', 'Encounter/75312bd2-d5ac-c62e-c9df-0004783725c7', 'female', 'white', 'not hispanic or latino', '662')
('75b68644-535d-bdc1-4c31-aa9fe7e1822f', {'id': None, 'code': 'EMER', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'EMER', 'emergency', 'finished', None, None, None, None, None, None, '55680006', 'http://snomed.info/sct', 25, datetime.date(2018, 6, 30), datetime.date(2018, 6, 30), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/82b8a670-4700-30e8-24a0-b83efa3c5e0a', 'Encounter/75b68644-535d-bdc1-4c31-aa9fe7e1822f', 'female', 'white', 'not hispanic or latino', '672')
('79d8f213-7847-646b-8a66-5da208cc1c27', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '431857002', 'http://snomed.info/sct', 77, datetime.date(2018, 7, 14), datetime.date(2018, 7, 14), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/26a3984f-b2a8-e67f-7abc-ff147a0e6e35', 'Encounter/79d8f213-7847-646b-8a66-5da208cc1c27', 'male', 'white', 'not hispanic or latino', '674')
('83d0d564-3bbf-48eb-7445-bd2b81130671', {'id': None, 'code': 'EMER', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'EMER', 'emergency', 'finished', None, None, None, None, None, None, '55680006', 'http://snomed.info/sct', 69, datetime.date(2018, 6, 16), datetime.date(2018, 6, 16), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/16be855b-ece2-8b96-1ef9-a4d93adf3289', 'Encounter/83d0d564-3bbf-48eb-7445-bd2b81130671', 'male', 'white', 'not hispanic or latino', '660')
('83d0d564-3bbf-48eb-7445-bd2b81130671', {'id': None, 'code': 'EMER', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'EMER', 'emergency', 'finished', None, None, None, None, None, None, '55680006', 'http://snomed.info/sct', 69, datetime.date(2018, 6, 16), datetime.date(2018, 6, 17), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/16be855b-ece2-8b96-1ef9-a4d93adf3289', 'Encounter/83d0d564-3bbf-48eb-7445-bd2b81130671', 'male', 'white', 'not hispanic or latino', '660')
('8ff1dc01-5a28-b2d8-3b42-4b7a7d539970', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 55, datetime.date(2018, 7, 15), datetime.date(2018, 7, 15), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/c1bfec36-dc2c-afc8-c767-3d35ed2bf6f0', 'Encounter/8ff1dc01-5a28-b2d8-3b42-4b7a7d539970', 'female', 'white', 'not hispanic or latino', '662')
('91f94a9d-69a7-e30a-cd1a-68c52dc01e70', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 22, datetime.date(2018, 6, 29), datetime.date(2018, 6, 29), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/a28be3e1-a6bd-7df4-fc81-1140848f8453', 'Encounter/91f94a9d-69a7-e30a-cd1a-68c52dc01e70', 'female', 'white', 'not hispanic or latino', '662')
('98d4bd14-d78e-debb-e7dc-2df7786aedf3', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 89, datetime.date(2018, 7, 9), datetime.date(2018, 7, 9), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/e455ca3f-fc16-6ffc-297a-adc27e2db183', 'Encounter/98d4bd14-d78e-debb-e7dc-2df7786aedf3', 'male', 'white', 'hispanic or latino', '667')
Expand All @@ -45,6 +45,6 @@
('e613f29d-7505-6f2e-a1f5-bfbec300752d', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '88805009', 'http://snomed.info/sct', 84, datetime.date(2018, 6, 29), datetime.date(2018, 6, 29), datetime.date(2018, 6, 25), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/1c498b42-61fd-6341-69c3-053f6e4fe404', 'Encounter/e613f29d-7505-6f2e-a1f5-bfbec300752d', 'female', 'white', 'not hispanic or latino', '667')
('e922a884-7039-a171-a65e-78051fe7afe6', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 1, datetime.date(2018, 6, 13), datetime.date(2018, 6, 13), datetime.date(2018, 6, 11), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/8022fbbe-aaa4-056c-d0f5-ec074bf0656c', 'Encounter/e922a884-7039-a171-a65e-78051fe7afe6', 'male', 'white', 'not hispanic or latino', '665')
('ed151e04-3dd6-8cb7-a3e5-777c8a8667f1', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '195662009', 'http://snomed.info/sct', 57, datetime.date(2018, 6, 6), datetime.date(2018, 6, 6), datetime.date(2018, 6, 4), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/19158de4-66a2-f70f-e3bf-4396b312c8f1', 'Encounter/ed151e04-3dd6-8cb7-a3e5-777c8a8667f1', 'female', 'white', 'not hispanic or latino', '000')
('f2752dd7-1bf1-739d-dd8c-40122d0b63bc', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '161665007', 'http://snomed.info/sct', 77, datetime.date(2018, 6, 1), datetime.date(2018, 6, 1), datetime.date(2018, 5, 28), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/9c8d8539-0b1e-73e2-b64f-83f1ea601fa4', 'Encounter/f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'male', 'white', 'not hispanic or latino', '674')
('f2752dd7-1bf1-739d-dd8c-40122d0b63bc', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '161665007', 'http://snomed.info/sct', 77, datetime.date(2018, 6, 1), datetime.date(2018, 6, 2), datetime.date(2018, 5, 28), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/9c8d8539-0b1e-73e2-b64f-83f1ea601fa4', 'Encounter/f2752dd7-1bf1-739d-dd8c-40122d0b63bc', 'male', 'white', 'not hispanic or latino', '674')
('f964be66-3fcd-95c8-0021-71c7d24c91b7', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, '431857002', 'http://snomed.info/sct', 63, datetime.date(2018, 6, 5), datetime.date(2018, 6, 5), datetime.date(2018, 6, 4), datetime.date(2018, 6, 1), datetime.date(2018, 1, 1), 'Patient/a5bc08ea-9462-c4f5-1bd2-ff342598ac99', 'Encounter/f964be66-3fcd-95c8-0021-71c7d24c91b7', 'female', 'white', 'not hispanic or latino', '661')
('fd0754a4-e96d-cba7-b3c0-77697a09c86e', {'id': None, 'code': 'AMB', 'display': None, 'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'userSelected': None, 'version': None}, 'AMB', 'ambulatory', 'finished', None, None, None, None, None, None, None, None, 100, datetime.date(2018, 7, 10), datetime.date(2018, 7, 10), datetime.date(2018, 7, 9), datetime.date(2018, 7, 1), datetime.date(2018, 1, 1), 'Patient/6385ddd7-2639-6505-3789-0521b8f66c8b', 'Encounter/fd0754a4-e96d-cba7-b3c0-77697a09c86e', 'female', 'white', 'not hispanic or latino', '672')
Loading

0 comments on commit e98aecd

Please sign in to comment.