From 90946648db5edc77ad0f4a6f32adfb15e03bfefd Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Tue, 3 Dec 2024 14:56:47 -0500 Subject: [PATCH] core: add core__diagnosticreport table --- CONTRIBUTING.md | 10 + cumulus_library/.sqlfluff | 15 ++ cumulus_library/builders/counts.py | 26 +++ .../statistics_templates/counts_templates.py | 1 + .../studies/core/builder_diagnosticreport.py | 37 ++++ .../core_templates/completion_utils.jinja | 1 + .../core_templates/diagnosticreport.sql.jinja | 102 +++++++++++ cumulus_library/studies/core/count_core.py | 16 ++ cumulus_library/studies/core/manifest.toml | 2 + tests/core/test_core_diagreports.py | 171 ++++++++++++++++++ tests/test_cli.py | 4 +- .../etl__completion/completion.ndjson | 1 + ...ore__count_diagnosticreport_month.cube.csv | 20 ++ tests/testbed_utils.py | 14 +- 14 files changed, 417 insertions(+), 3 deletions(-) create mode 100644 cumulus_library/studies/core/builder_diagnosticreport.py create mode 100644 cumulus_library/studies/core/core_templates/diagnosticreport.sql.jinja create mode 100644 tests/core/test_core_diagreports.py create mode 100644 tests/test_data/duckdb_data/expected_export/core/core__count_diagnosticreport_month.cube.csv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 600a8bf..f29c11a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,6 +40,16 @@ Things to keep in mind: the `id` field, add a `condition_ref` field defined like: `concat('Condition/', id) AS condition_ref` +## DateTime fields + +In general: +- Expand a Date/DateTime/Instant field into four versions: + day/week/month/year, with appropriate suffixes (look for examples). +- If the field is a Date field, leave the `_day` suffix off the day version, + since the end result is not actually a modified value. +- Add start & end versions of Period fields, + since EHRs are truly wild out there and may fill only start or only end. + ## Rebuilding the reference SQL We keep some reference SQL in git, diff --git a/cumulus_library/.sqlfluff b/cumulus_library/.sqlfluff index ecd6dd4..a3bccbd 100644 --- a/cumulus_library/.sqlfluff +++ b/cumulus_library/.sqlfluff @@ -139,6 +139,21 @@ schema = 'reference': True, 'display': False, 'type': True } }, + 'diagnosticreport': { + 'effectivePeriod': { + 'start': True, 'end': True, + }, + 'encounter': { + 'reference': True, + }, + 'id': True, + 'result': { + 'reference': True, + }, + 'subject': { + 'reference': True, + }, + }, 'documentreference': { 'id': True, 'type': True, diff --git a/cumulus_library/builders/counts.py b/cumulus_library/builders/counts.py index 10d495e..74ffb17 100644 --- a/cumulus_library/builders/counts.py +++ b/cumulus_library/builders/counts.py @@ -142,6 +142,32 @@ def count_condition( filter_resource=True, ) + def count_diagnosticreport( + self, + table_name: str, + source_table: str, + table_cols: list, + where_clauses: list | None = None, + min_subject: int | None = None, + ) -> str: + """wrapper method for constructing diagnosticreport counts tables + + :param table_name: The name of the table to create. Must start with study prefix + :param source_table: The table to create counts data from + :param table_cols: The columns from the source table to add to the count table + :param where_clauses: An array of where clauses to use for filtering the data + :param min_subject: An integer setting the minimum bin size for inclusion + (default: 10) + """ + return self.get_count_query( + table_name, + source_table, + table_cols, + where_clauses=where_clauses, + min_subject=min_subject, + fhir_resource="diagnosticreport", + ) + def count_documentreference( self, table_name: str, diff --git a/cumulus_library/builders/statistics_templates/counts_templates.py b/cumulus_library/builders/statistics_templates/counts_templates.py index 55d10d8..f184dea 100644 --- a/cumulus_library/builders/statistics_templates/counts_templates.py +++ b/cumulus_library/builders/statistics_templates/counts_templates.py @@ -17,6 +17,7 @@ class CountableFhirResource(Enum): ALLERGYINTOLERANCE = "allergyintolerance" CONDITION = "condition" + DIAGNOSTICREPORT = "diagnosticreport" DOCUMENTREFERENCE = "documentreference" ENCOUNTER = "encounter" NONE = None diff --git a/cumulus_library/studies/core/builder_diagnosticreport.py b/cumulus_library/studies/core/builder_diagnosticreport.py new file mode 100644 index 0000000..2dcb3d0 --- /dev/null +++ b/cumulus_library/studies/core/builder_diagnosticreport.py @@ -0,0 +1,37 @@ +import cumulus_library +from cumulus_library.studies.core.core_templates import core_templates +from cumulus_library.template_sql import sql_utils + +expected_table_cols = { + "diagnosticreport": { + "id": [], + "status": [], + "subject": sql_utils.REFERENCE, + "encounter": sql_utils.REFERENCE, + "effectiveDateTime": [], + "effectivePeriod": ["start", "end"], + "issued": [], + "result": sql_utils.REFERENCE, + } +} + + +class CoreDiagnosticReportBuilder(cumulus_library.BaseTableBuilder): + display_text = "Creating DiagnosticReport tables..." + + def prepare_queries(self, *args, config: cumulus_library.StudyConfig, **kwargs): + code_sources = [ + sql_utils.CodeableConceptConfig( + source_table="diagnosticreport", + column_hierarchy=[("category", list)], + target_table="core__diagnosticreport_dn_category", + ), + sql_utils.CodeableConceptConfig( + source_table="diagnosticreport", + column_hierarchy=[("code", dict)], + target_table="core__diagnosticreport_dn_code", + ), + ] + self.queries += sql_utils.denormalize_complex_objects(config.db, code_sources) + validated_schema = sql_utils.validate_schema(config.db, expected_table_cols) + self.queries.append(core_templates.get_core_template("diagnosticreport", validated_schema)) diff --git a/cumulus_library/studies/core/core_templates/completion_utils.jinja b/cumulus_library/studies/core/core_templates/completion_utils.jinja index d7ce807..4046412 100644 --- a/cumulus_library/studies/core/core_templates/completion_utils.jinja +++ b/cumulus_library/studies/core/core_templates/completion_utils.jinja @@ -41,6 +41,7 @@ in this table. ( BOOL_OR(ec.table_name = 'allergyintolerance') AND BOOL_OR(ec.table_name = 'condition') + AND BOOL_OR(ec.table_name = 'diagnosticreport') AND BOOL_OR(ec.table_name = 'documentreference') AND BOOL_OR(ec.table_name = 'medicationrequest') AND BOOL_OR(ec.table_name = 'observation') diff --git a/cumulus_library/studies/core/core_templates/diagnosticreport.sql.jinja b/cumulus_library/studies/core/core_templates/diagnosticreport.sql.jinja new file mode 100644 index 0000000..cb95d47 --- /dev/null +++ b/cumulus_library/studies/core/core_templates/diagnosticreport.sql.jinja @@ -0,0 +1,102 @@ +{% import 'core_utils.jinja' as utils %} +{% import 'unnest_utils.jinja' as unnest_utils %} + +-- This table includes all fields of interest to the US Core DiagnosticReport profiles. +-- EXCEPT FOR: +-- * the 'presentedForm' field, which is an attachment array that is stripped out by the ETL. +-- * the `reporter` field, simply due to it not likely being interesting to consumers +-- and being an array field, which would require a lot of row duplication. +-- +-- US Core profiles for reference: +-- * https://hl7.org/fhir/us/core/STU4/StructureDefinition-us-core-diagnosticreport-lab.html +-- * https://hl7.org/fhir/us/core/STU4/StructureDefinition-us-core-diagnosticreport-note.html + +CREATE TABLE core__diagnosticreport AS +WITH temp_diagnosticreport AS ( + SELECT + {{- utils.basic_cols('diagnosticreport', 'd', ['id']) }}, + {{- + utils.nullable_cols( + 'diagnosticreport', + 'd', + [ + 'status', + ('subject', 'reference', 'subject_ref'), + ('encounter', 'reference', 'encounter_ref'), + ], + schema + ) + }}, + {{- + utils.truncate_date_cols( + 'diagnosticreport', + 'd', + [ + ('effectiveDateTime', 'day'), + ('effectiveDateTime', 'week'), + ('effectiveDateTime', 'month'), + ('effectiveDateTime', 'year'), + ('effectivePeriod', 'start', 'effectivePeriod_start_day', 'day'), + ('effectivePeriod', 'start', 'effectivePeriod_start_week', 'week'), + ('effectivePeriod', 'start', 'effectivePeriod_start_month', 'month'), + ('effectivePeriod', 'start', 'effectivePeriod_start_year', 'year'), + ('effectivePeriod', 'end', 'effectivePeriod_end_day', 'day'), + ('effectivePeriod', 'end', 'effectivePeriod_end_week', 'week'), + ('effectivePeriod', 'end', 'effectivePeriod_end_month', 'month'), + ('effectivePeriod', 'end', 'effectivePeriod_end_year', 'year'), + ('issued', 'day'), + ('issued', 'week'), + ('issued', 'month'), + ('issued', 'year'), + ], + schema + ) + }} + FROM diagnosticreport AS d +), + +temp_result AS ( + {{ unnest_utils.flatten('diagnosticreport', 'reference', parent_field='result') }} +) + +SELECT + td.id, + td.status, + + dn_category.code AS category_code, + dn_category.system AS category_system, + dn_category.display AS category_display, + + dn_code.code AS code_code, + dn_code.system AS code_system, + dn_code.display AS code_display, + + td.effectiveDateTime_day, + td.effectiveDateTime_week, + td.effectiveDateTime_month, + td.effectiveDateTime_year, + + td.effectivePeriod_start_day, + td.effectivePeriod_start_week, + td.effectivePeriod_start_month, + td.effectivePeriod_start_year, + + td.effectivePeriod_end_day, + td.effectivePeriod_end_week, + td.effectivePeriod_end_month, + td.effectivePeriod_end_year, + + td.issued_day, + td.issued_week, + td.issued_month, + td.issued_year, + + concat('DiagnosticReport/', td.id) AS diagnosticreport_ref, + td.subject_ref, + td.encounter_ref, + tr.reference AS result_ref + +FROM temp_diagnosticreport AS td +LEFT JOIN core__diagnosticreport_dn_code AS dn_code ON td.id = dn_code.id +LEFT JOIN core__diagnosticreport_dn_category AS dn_category ON td.id = dn_category.id +LEFT JOIN temp_result AS tr ON td.id = tr.id; diff --git a/cumulus_library/studies/core/count_core.py b/cumulus_library/studies/core/count_core.py index c39d6fb..e51ec39 100644 --- a/cumulus_library/studies/core/count_core.py +++ b/cumulus_library/studies/core/count_core.py @@ -27,6 +27,21 @@ def count_core_condition(self, duration: str = "month"): ] return self.count_condition(table_name, from_table, cols) + def count_core_diagnosticreport(self, duration: str = "month"): + table_name = self.get_table_name("count_diagnosticreport", duration=duration) + from_table = self.get_table_name("diagnosticreport") + cols = [ + ["category_display", "varchar", None], + ["code_display", "varchar", None], + # Issued is not the _preferred_ time to pull, since it is an administrative time, + # not a clinical one. But the clinical dates are annoyingly spread across three + # fields: effectiveDateTime, effectivePeriod.start, and effectivePeriod.end. + # So rather than do some fancy collation, just use issued. These core counts are + # just a rough idea of the data, not a polished final product. + [f"issued_{duration}", "date", None], + ] + return self.count_diagnosticreport(table_name, from_table, cols) + def count_core_documentreference(self, duration: str = "month"): table_name = self.get_table_name("count_documentreference", duration=duration) from_table = self.get_table_name("documentreference") @@ -119,6 +134,7 @@ def prepare_queries(self, *args, **kwargs): self.queries = [ self.count_core_allergyintolerance(duration="month"), self.count_core_condition(duration="month"), + self.count_core_diagnosticreport(duration="month"), self.count_core_documentreference(duration="month"), self.count_core_encounter(duration="month"), self.count_core_encounter_all_types(), diff --git a/cumulus_library/studies/core/manifest.toml b/cumulus_library/studies/core/manifest.toml index 3e478a7..8444889 100644 --- a/cumulus_library/studies/core/manifest.toml +++ b/cumulus_library/studies/core/manifest.toml @@ -5,6 +5,7 @@ file_names = [ "builder_prereq_tables.py", "builder_allergyintolerance.py", "builder_condition.py", + "builder_diagnosticreport.py", "builder_patient.py", "builder_encounter.py", "builder_documentreference.py", @@ -19,6 +20,7 @@ file_names = [ count_list = [ "core__count_allergyintolerance_month", "core__count_condition_month", + "core__count_diagnosticreport_month", "core__count_documentreference_month", "core__count_encounter_month", "core__count_encounter_all_types", diff --git a/tests/core/test_core_diagreports.py b/tests/core/test_core_diagreports.py new file mode 100644 index 0000000..c8b422e --- /dev/null +++ b/tests/core/test_core_diagreports.py @@ -0,0 +1,171 @@ +"""Tests for core__diagnosticreport""" + +import functools +import itertools +import json + +from tests import conftest, testbed_utils + + +def combine_dictionaries(*combos: list[dict]) -> list[dict]: + return [ + functools.reduce(lambda x, y: x | y, tuple_of_dicts) + for tuple_of_dicts in itertools.product(*combos) + ] + + +def dict_set_from_list(rows: list[dict]) -> set[tuple]: + return {tuple(sorted(row.items())) for row in rows} + + +def test_core_diag_report_many_cases(tmp_path): + """Verify that we handle multiply rows as needed when multiple options appear""" + testbed = testbed_utils.LocalTestbed(tmp_path) + testbed.add_diagnostic_report( + "Multiple-Rows", + status="final", + category=[ + {"coding": [{"code": "cat1", "system": "sys:cat", "display": "Cat One"}]}, + {"coding": [{"code": "cat2", "system": "sys:cat", "display": "Cat Two"}]}, + ], + code={ + "coding": [ + {"code": "code1", "system": "sys:code", "display": "Code One"}, + {"code": "code2", "system": "sys:code", "display": "Code Two"}, + ], + }, + subject={"reference": "Patient/P1"}, + encounter={"reference": "Encounter/E1"}, + effectiveDateTime="2019-12-11T10:10:10+05:00", + issued="2019-12-12T10:10:10+05:00", + result=[ + {"reference": "Observation/result1"}, + {"reference": "Observation/result2"}, + ], + ) + + con = testbed.build() + df = con.sql("SELECT * FROM core__diagnosticreport").df() + rows = json.loads(df.to_json(orient="records")) + + assert 8 == len(rows) + + combos = combine_dictionaries( + # Start with a list of size one - all the consistent elements across all rows + [ + { + "id": "Multiple-Rows", + "diagnosticreport_ref": "DiagnosticReport/Multiple-Rows", + "status": "final", + "subject_ref": "Patient/P1", + "encounter_ref": "Encounter/E1", + "effectiveDateTime_day": conftest.date_to_epoch(2019, 12, 11), + "effectiveDateTime_week": conftest.date_to_epoch(2019, 12, 9), + "effectiveDateTime_month": conftest.date_to_epoch(2019, 12, 1), + "effectiveDateTime_year": conftest.date_to_epoch(2019, 1, 1), + "effectivePeriod_start_day": None, + "effectivePeriod_start_week": None, + "effectivePeriod_start_month": None, + "effectivePeriod_start_year": None, + "effectivePeriod_end_day": None, + "effectivePeriod_end_week": None, + "effectivePeriod_end_month": None, + "effectivePeriod_end_year": None, + "issued_day": conftest.date_to_epoch(2019, 12, 12), + "issued_week": conftest.date_to_epoch(2019, 12, 9), + "issued_month": conftest.date_to_epoch(2019, 12, 1), + "issued_year": conftest.date_to_epoch(2019, 1, 1), + }, + ], + [ + {"category_code": "cat1", "category_system": "sys:cat", "category_display": "Cat One"}, + {"category_code": "cat2", "category_system": "sys:cat", "category_display": "Cat Two"}, + ], + [ + {"code_code": "code1", "code_system": "sys:code", "code_display": "Code One"}, + {"code_code": "code2", "code_system": "sys:code", "code_display": "Code Two"}, + ], + [ + {"result_ref": "Observation/result1"}, + {"result_ref": "Observation/result2"}, + ], + ) + assert 8 == len(combos) # sanity check our product math + + assert dict_set_from_list(rows) == dict_set_from_list(combos) + + +def test_core_diag_report_minimal(tmp_path): + """Verify that no actual content works fine""" + testbed = testbed_utils.LocalTestbed(tmp_path) + testbed.add_diagnostic_report("Nothing") + + con = testbed.build() + df = con.sql("SELECT * FROM core__diagnosticreport").df() + rows = json.loads(df.to_json(orient="records")) + + assert rows == [ + { + "id": "Nothing", + "diagnosticreport_ref": "DiagnosticReport/Nothing", + "status": None, + "category_code": None, + "category_system": None, + "category_display": None, + "code_code": None, + "code_system": None, + "code_display": None, + "subject_ref": None, + "encounter_ref": None, + "effectiveDateTime_day": None, + "effectiveDateTime_week": None, + "effectiveDateTime_month": None, + "effectiveDateTime_year": None, + "effectivePeriod_start_day": None, + "effectivePeriod_start_week": None, + "effectivePeriod_start_month": None, + "effectivePeriod_start_year": None, + "effectivePeriod_end_day": None, + "effectivePeriod_end_week": None, + "effectivePeriod_end_month": None, + "effectivePeriod_end_year": None, + "issued_day": None, + "issued_week": None, + "issued_month": None, + "issued_year": None, + "result_ref": None, + } + ] + + +def test_core_diag_report_period(tmp_path): + """Verify that we parse the period correctly""" + testbed = testbed_utils.LocalTestbed(tmp_path) + testbed.add_diagnostic_report( + "Period", + effectivePeriod={ + "start": "2023-10-06T09:30:00Z", + "end": "2023-10-06T10:30:00Z", + }, + ) + + con = testbed.build() + df = con.sql("SELECT * FROM core__diagnosticreport").df() + rows = json.loads(df.to_json(orient="records")) + + assert len(rows) == 1 + effective_fields = {k: v for k, v in rows[0].items() if "effective" in k} + assert effective_fields == { + "effectiveDateTime_day": None, + "effectiveDateTime_week": None, + "effectiveDateTime_month": None, + "effectiveDateTime_year": None, + "effectivePeriod_start_day": conftest.date_to_epoch(2023, 10, 6), + "effectivePeriod_start_week": conftest.date_to_epoch(2023, 10, 2), + "effectivePeriod_start_month": conftest.date_to_epoch(2023, 10, 1), + "effectivePeriod_start_year": conftest.date_to_epoch(2023, 1, 1), + "effectivePeriod_end_day": conftest.date_to_epoch(2023, 10, 6), + "effectivePeriod_end_week": conftest.date_to_epoch(2023, 10, 2), + "effectivePeriod_end_month": conftest.date_to_epoch(2023, 10, 1), + "effectivePeriod_end_year": conftest.date_to_epoch(2023, 1, 1), + } diff --git a/tests/test_cli.py b/tests/test_cli.py index 43b9247..dd24491 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -285,7 +285,7 @@ def test_clean(tmp_path, args, expected, raises): ( ["build", "-t", "core"], ["export", "-t", "core"], - 64, + 68, does_not_raise(), ), ( @@ -477,7 +477,7 @@ def test_cli_executes_queries(tmp_path, build_args, export_args, expected_tables config = tomllib.load(file) csv_files = glob.glob(f"{tmp_path}/export/{build_args[2]}/*.csv") export_config = config["export_config"] - for export_list in export_config: + for export_list in export_config.values(): for export_table in export_list: assert any(export_table in x for x in csv_files) diff --git a/tests/test_data/duckdb_data/etl__completion/completion.ndjson b/tests/test_data/duckdb_data/etl__completion/completion.ndjson index f3ca735..fe93761 100644 --- a/tests/test_data/duckdb_data/etl__completion/completion.ndjson +++ b/tests/test_data/duckdb_data/etl__completion/completion.ndjson @@ -1,5 +1,6 @@ {"table_name": "allergyintolerance", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00"} {"table_name": "condition", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00"} +{"table_name": "diagnosticreport", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00"} {"table_name": "documentreference", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00"} {"table_name": "encounter", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00"} {"table_name": "medicationrequest", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00"} diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_diagnosticreport_month.cube.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_diagnosticreport_month.cube.csv new file mode 100644 index 0000000..cd3a794 --- /dev/null +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_diagnosticreport_month.cube.csv @@ -0,0 +1,20 @@ +"cnt","category_display","code_display","issued_month" +13,,, +13,,"History and physical note", +13,,"Evaluation + Plan note", +13,,"CBC panel - Blood by Automated count", +13,"Laboratory",, +13,"Laboratory","CBC panel - Blood by Automated count", +13,"History and physical note",, +13,"History and physical note","History and physical note", +13,"History and physical note","Evaluation + Plan note", +13,"Evaluation + Plan note",, +13,"Evaluation + Plan note","History and physical note", +13,"Evaluation + Plan note","Evaluation + Plan note", +12,"cumulus__none",, +11,,"Generalized anxiety disorder 7 item (GAD-7)", +11,"cumulus__none","Generalized anxiety disorder 7 item (GAD-7)", +10,,"Patient Health Questionnaire 2 item (PHQ-2) [Reported]", +10,,"Alcohol Use Disorder Identification Test - Consumption [AUDIT-C]", +10,"cumulus__none","Patient Health Questionnaire 2 item (PHQ-2) [Reported]", +10,"cumulus__none","Alcohol Use Disorder Identification Test - Consumption [AUDIT-C]", diff --git a/tests/testbed_utils.py b/tests/testbed_utils.py index eb231d8..fd6c986 100644 --- a/tests/testbed_utils.py +++ b/tests/testbed_utils.py @@ -46,7 +46,7 @@ def add(self, table: str, obj: dict) -> None: # All other args can be specified as a kwarg, like add() itself does. def add_allergy_intolerance(self, row_id: str, recorded: str = "2020", **kwargs) -> None: - """Adds a Condition with all the SQL-required fields filled out""" + """Adds a AllergyIntolerance with all the SQL-required fields filled out""" self.add( "allergyintolerance", { @@ -69,6 +69,17 @@ def add_condition(self, row_id: str, recorded: str = "2020", **kwargs) -> None: }, ) + def add_diagnostic_report(self, row_id: str, **kwargs) -> None: + """Adds a DiagnosticReport with all the SQL-required fields filled out""" + self.add( + "diagnosticreport", + { + "resourceType": "DiagnosticReport", + "id": row_id, + **kwargs, + }, + ) + def add_document_reference(self, row_id: str, start: str = "2020", **kwargs) -> None: """Adds a DocumentReference with all the SQL-required fields filled out""" context = kwargs.pop("context", {}) @@ -113,6 +124,7 @@ def add_etl_completion( # All required tables: "allergyintolerance", "condition", + "diagnosticreport", "documentreference", "medicationrequest", "observation",