Skip to content

Commit

Permalink
Generalized code discovery (#199)
Browse files Browse the repository at this point in the history
* Generalized code discovery

* light cleanup, moved unit test

* Docstrings, sql linting, cleanup
  • Loading branch information
dogversioning authored Mar 29, 2024
1 parent 111da13 commit df6be1d
Show file tree
Hide file tree
Showing 13 changed files with 459 additions and 324 deletions.
17 changes: 5 additions & 12 deletions cumulus_library/.sqlfluff
Original file line number Diff line number Diff line change
Expand Up @@ -33,36 +33,29 @@ code_system_tables =
[
{
"table_name":"hasarray",
"column_name":"acol",
"is_bare_coding":False,
"is_array":True,
"column_hierarchy":[("acol",list),("bcol",dict)],
"has_data": True
},
{
"table_name":"noarray",
"column_name":"col",
"is_bare_coding":False,
"is_array":False,
"column_hierarchy":[("acol.bcol", list)],
"has_data": True
},
{
"table_name":"bare",
"column_name":"bcol",
"is_bare_coding":True,
"is_array":False,
"column_hierarchy":[("bcol", dict)],
"has_data": True
},
{
"table_name":"empty",
"column_name":"empty",
"is_bare_coding":False,
"is_array":False,
"column_hierarchy":[("empty",dict)],
"has_data": False
}
]
column_name = 'bar'
column_names = ['foo', 'bar']
conditions = ["1 > 0", "1 < 2"]
column_hierarchy = [('a', list),('b',dict)]
config =
{
"medication_datasources" : {
Expand Down
63 changes: 38 additions & 25 deletions cumulus_library/studies/discovery/code_definitions.py
Original file line number Diff line number Diff line change
@@ -1,74 +1,87 @@
# A collection of codes & codeableConcepts to extract available codes from.
# Two optional booleans are available for use:
# - is_array: the field in question is an array of CodeableConcepts
# - is_bare_coding: the field in question is a Coding not wrapped in concepts
# - otherwise, it is assumed to be a 0..1 or 1..1 CodeableConcept
# TODO: if another state is needed, move to an Enum


code_list = [
# Condition
{"table_name": "condition", "column_name": "category", "is_array": True},
{
"table_name": "condition",
"column_name": "code",
"column_hierarchy": [("category", list), ("coding", list)],
},
{
"table_name": "condition",
"column_hierarchy": [("code", dict), ("coding", list)],
},
# DocumentReference
{
"table_name": "documentreference",
"column_name": "type",
"column_hierarchy": [("type", dict), ("coding", list)],
},
{
"table_name": "documentreference",
"column_hierarchy": [("category", list), ("coding", list)],
},
{"table_name": "documentreference", "column_name": "category", "is_array": True},
# Encounter
{
"table_name": "encounter",
"column_name": "class",
"is_bare_coding": True,
"column_hierarchy": [("class", dict)],
},
{
"table_name": "encounter",
"column_hierarchy": [("type", list), ("coding", list)],
},
{
"table_name": "encounter",
"column_name": "type",
"is_array": True,
"column_hierarchy": [("servicetype", dict), ("coding", list)],
},
{
"table_name": "encounter",
"column_name": "servicetype",
"column_hierarchy": [("priority", dict), ("coding", list)],
},
{
"table_name": "encounter",
"column_name": "priority",
"column_hierarchy": [("reasoncode", list), ("coding", list)],
},
{"table_name": "encounter", "column_name": "reasoncode", "is_array": True},
{
"table_name": "encounter",
"column_name": "hospitalization.dischargedisposition",
"column_hierarchy": [
("hospitalization", dict),
("dischargedisposition", dict),
("coding", list),
],
},
# Medication
{
"table_name": "medication",
"column_name": "codecodeableconcept ",
"column_hierarchy": [("codecodeableconcept", dict), ("coding", list)],
},
{
"table_name": "medication",
"column_name": "medicationcode",
"column_hierarchy": [("medicationcode", dict), ("coding", list)],
},
# Observation
{"table_name": "observation", "column_name": "category", "is_array": True},
{
"table_name": "observation",
"column_name": "code",
"column_hierarchy": [("category", list), ("coding", list)],
},
{
"table_name": "observation",
"column_hierarchy": [("code", dict), ("coding", list)],
},
{
"table_name": "observation",
"column_hierarchy": [("interpretation", list), ("coding", list)],
},
{"table_name": "observation", "column_name": "interpretation", "is_array": True},
{
"table_name": "observation",
"column_name": "valuecodeableconcept",
"column_hierarchy": [("valuecodeableconcept", dict), ("coding", list)],
},
{
"table_name": "observation",
"column_name": "dataabsentreason",
"column_hierarchy": [("dataabsentreason", dict), ("coding", list)],
},
# Patient
{
"table_name": "patient",
"column_name": "maritalstatus",
"column_hierarchy": [("maritalstatus", dict), ("coding", list)],
},
]
34 changes: 8 additions & 26 deletions cumulus_library/studies/discovery/code_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from cumulus_library import base_table_builder, base_utils
from cumulus_library.studies.discovery import code_definitions
from cumulus_library.template_sql import base_templates, sql_utils
from cumulus_library.studies.discovery.discovery_templates import discovery_templates
from cumulus_library.template_sql import sql_utils


class CodeDetectionBuilder(base_table_builder.BaseTableBuilder):
Expand All @@ -11,27 +12,11 @@ class CodeDetectionBuilder(base_table_builder.BaseTableBuilder):
def _check_coding_against_db(self, code_source, schema, cursor):
"""selects the appropriate DB query to run"""

if code_source["is_array"]:
return sql_utils.is_field_populated(
schema=schema,
source_table=code_source["table_name"],
hierarchy=[(code_source["column_name"], list)],
expected=sql_utils.CODEABLE_CONCEPT,
cursor=cursor,
)
elif code_source["is_bare_coding"]:
return sql_utils.is_field_populated(
schema=schema,
source_table=code_source["table_name"],
hierarchy=[(code_source["column_name"], dict)],
expected=sql_utils.CODING,
cursor=cursor,
)
return sql_utils.is_field_populated(
schema=schema,
source_table=code_source["table_name"],
hierarchy=[(code_source["column_name"], dict)],
expected=sql_utils.CODEABLE_CONCEPT,
hierarchy=code_source["column_hierarchy"],
expected=sql_utils.CODING,
cursor=cursor,
)

Expand Down Expand Up @@ -59,24 +44,21 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs):
"""

code_sources = []
required_keys = {"table_name", "column_hierarchy"}
for code_definition in code_definitions.code_list:
if any(
x not in code_definition.keys() for x in ["table_name", "column_name"]
):
if not required_keys.issubset(code_definition):
raise KeyError(
"Expected table_name and column_name keys in "
"Expected table_name and column_hierarchy keys in "
f"{code_definition!s}"
)
code_source = {
"is_bare_coding": False,
"is_array": False,
"has_data": False,
}
for key in code_definition.keys():
code_source[key] = code_definition[key]
code_sources.append(code_source)
code_sources = self._check_codes_in_fields(code_sources, schema, cursor)
query = base_templates.get_code_system_pairs(
query = discovery_templates.get_code_system_pairs(
"discovery__code_sources", code_sources
)
self.queries.append(query)
126 changes: 0 additions & 126 deletions cumulus_library/studies/discovery/code_detection.sql

This file was deleted.

Loading

0 comments on commit df6be1d

Please sign in to comment.