Skip to content

Commit

Permalink
Updates for i2b2 export support (#126)
Browse files Browse the repository at this point in the history
* Updates for i2b2 export support

* reference data update

* sort on export

* testing cleanup

* prefix cleanup
  • Loading branch information
dogversioning authored Oct 12, 2023
1 parent 8db3902 commit eae81c1
Show file tree
Hide file tree
Showing 34 changed files with 29,555 additions and 28,393 deletions.
2 changes: 1 addition & 1 deletion cumulus_library/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Package metadata"""
__version__ = "1.3.1"
__version__ = "1.4.0"
20 changes: 14 additions & 6 deletions cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def reset_data_path(self, study: PosixPath) -> None:

### Creating studies

def clean_study(self, targets: List[str]) -> None:
def clean_study(self, targets: List[str], prefix=False) -> None:
"""Removes study table/views from Athena.
While this is usually not required, since it it done as part of a build,
Expand All @@ -102,9 +102,14 @@ def clean_study(self, targets: List[str]) -> None:
"Explicit targets for cleaning not provided. "
"Provide one or more explicit study prefixes to remove."
)
for study in targets:
StudyManifestParser.clean_study(
self.cursor, self.schema_name, self.verbose, prefix=f"{study}__"
for target in targets:
if prefix:
prefix = target
else:
prefix = f"{target}__"
parser = StudyManifestParser()
parser.clean_study(
self.cursor, self.schema_name, self.verbose, prefix=prefix
)

def clean_and_build_study(self, target: PosixPath) -> None:
Expand Down Expand Up @@ -252,7 +257,7 @@ def run_cli(args: Dict):
builder.cursor.execute("SHOW DATABASES")

if args["action"] == "clean":
builder.clean_study(args["target"])
builder.clean_study(args["target"], args["prefix"])

else:
study_dict = get_study_dict(args["study_dir"])
Expand Down Expand Up @@ -320,7 +325,10 @@ def main(cli_args=None):
read_env_vars = []
for pair in arg_env_pairs:
if env_val := os.environ.get(pair[1]):
args[pair[0]] = env_val
if pair[0] == "study_dir":
args[pair[0]] = [env_val]
else:
args[pair[0]] = env_val
read_env_vars.append([pair[1], env_val])

if len(read_env_vars) > 0:
Expand Down
12 changes: 8 additions & 4 deletions cumulus_library/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@ def add_target_argument(parser: argparse.ArgumentParser) -> None:
"-t",
"--target",
action="append",
help=(
"Specify one or more studies to perform actions against. "
"Default is to use all studies."
),
help=("Specify one or more studies to perform actions against."),
)


Expand Down Expand Up @@ -124,8 +121,15 @@ def get_parser() -> argparse.ArgumentParser:
clean = actions.add_parser(
"clean", help="Removes tables & views beginning with '[target]__' from Athena"
)

add_target_argument(clean)
add_verbose_argument(clean)
add_aws_config(clean)
clean.add_argument(
"--prefix",
action="store_true",
help=(argparse.SUPPRESS),
)

build = actions.add_parser(
"build",
Expand Down
10 changes: 10 additions & 0 deletions cumulus_library/studies/core/builder_encounter_coding.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,16 @@ def prepare_queries(self, cursor: object, schema: str):
],
"has_data": False,
},
{
"column_name": "reasoncode",
"is_array": True,
"filter_priority": True,
"code_systems": [
"http://terminology.hl7.org/CodeSystem/v3-ActPriority",
"http://snomed.info/sct",
],
"has_data": False,
},
]
code_sources = self._check_data_in_fields(code_sources, schema, cursor)
for code_source in code_sources:
Expand Down
8 changes: 4 additions & 4 deletions cumulus_library/studies/core/documentreference.sql
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,17 @@ WITH powerset AS (
count(DISTINCT d.doc_id) AS cnt_document,
d.doc_type_display,
d.author_month,
e.enc_class.display AS enc_class_code
e.enc_class_display
FROM core__documentreference AS d, core__encounter AS e
WHERE d.encounter_ref = e.encounter_ref
GROUP BY cube(d.doc_type_display, d.author_month, e.enc_class)
GROUP BY cube(d.doc_type_display, d.author_month, e.enc_class_display)
)

SELECT DISTINCT
cnt_document AS cnt,
author_month,
enc_class_code,
enc_class_display,
doc_type_display
FROM powerset
WHERE cnt_subject >= 10
ORDER BY cnt_document DESC, enc_class_code ASC;
ORDER BY cnt_document DESC, enc_class_display ASC;
39 changes: 27 additions & 12 deletions cumulus_library/studies/core/encounter.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,45 @@
-- https://build.fhir.org/ig/HL7/US-Core/StructureDefinition-us-core-encounter.html

CREATE TABLE core__encounter AS


WITH temp_encounter AS (
SELECT DISTINCT
e.period,
e.status,
e.class,
e.type,
e.servicetype,
e.priority,
e.reasoncode,
edt.code AS type_code,
edt.code_system AS type_code_system,
eds.code AS sevicetype_code,
eds.code_system AS sevicetype_code_system,
edp.code AS priority_code,
edp.code_system AS priority_code_system,
edr.code AS reasoncode_code,
edr.code_system AS reasoncode_code_system,
e.subject.reference AS subject_ref,
e.id AS encounter_id,
date(from_iso8601_timestamp(e.period."start")) AS start_date,
date(from_iso8601_timestamp(e.period."end")) AS end_date,
concat('Encounter/', e.id) AS encounter_ref
FROM encounter AS e
LEFT JOIN core__encounter_dn_priority AS edt ON e.id = edt.id
LEFT JOIN core__encounter_dn_servicetype AS eds ON e.id = eds.id
LEFT JOIN core__encounter_dn_priority AS edp ON e.id = edp.id
LEFT JOIN core__encounter_dn_reasoncode AS edr ON e.id = edr.id
)

SELECT DISTINCT
e.class AS enc_class,
e.class.code AS enc_class_code,
e.class.display AS enc_class_display,
e.type AS enc_type,
e.servicetype AS service_type,
e.priority,
e.reasoncode AS reason_code,
ac.display AS enc_class_display,
e.type_code,
e.type_code_system,
e.sevicetype_code,
e.sevicetype_code_system,
e.priority_code,
e.priority_code_system,
e.reasoncode_code,
e.reasoncode_code_system,
date_diff('year', date(p.birthdate), e.start_date) AS age_at_visit,
date_trunc('day', e.start_date) AS start_date,
date_trunc('day', e.end_date) AS end_date,
Expand All @@ -41,7 +55,8 @@ SELECT DISTINCT
p.race_display,
p.ethnicity_display,
p.postalcode3
FROM temp_encounter AS e, core__patient AS p
FROM temp_encounter AS e
LEFT JOIN core__act_encounter_code_v3 AS ac ON ac.code = e.class.code
INNER JOIN core__patient AS p ON e.subject_ref = p.subject_ref
WHERE
e.subject_ref = p.subject_ref
AND start_date BETWEEN date('2016-06-01') AND current_date;
start_date BETWEEN date('2016-06-01') AND current_date;
6 changes: 4 additions & 2 deletions cumulus_library/studies/core/encounter_type.sql
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ SELECT DISTINCT
coalesce(cep.code_system, 'None') AS enc_priority_system,
coalesce(cep.code, 'None') AS enc_priority_code,
coalesce(cep.display, 'None') AS enc_priority_display,
e.reason_code,
coalesce(cer.code, 'None') AS enc_reasoncode_code,
coalesce(cer.display, 'None') AS enc_reasoncode_display,
e.age_at_visit,
e.start_date,
e.end_date,
Expand All @@ -34,4 +35,5 @@ SELECT DISTINCT
FROM core__encounter AS e
LEFT JOIN core__encounter_dn_type AS cet ON e.encounter_id = cet.id
LEFT JOIN core__encounter_dn_servicetype AS ces ON e.encounter_id = ces.id
LEFT JOIN core__encounter_dn_priority AS cep ON e.encounter_id = cep.id;
LEFT JOIN core__encounter_dn_priority AS cep ON e.encounter_id = cep.id
LEFT JOIN core__encounter_dn_reasoncode AS cer ON e.encounter_id = cer.id;
54 changes: 54 additions & 0 deletions cumulus_library/studies/core/setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,57 @@ FROM
)
)
AS t (from_system, from_code, analyte, code_system, code, display);

CREATE TABLE core__act_encounter_code_v3 AS
SELECT
t.code,
t.display
FROM
(
VALUES
(
'AMB',
'ambulatory'
),
(
'EMER',
'emergency'
),
(
'FLD',
'field'
),
(
'HH',
'home health'
),
(
'IMP',
'inpatient encounter'
),
(
'ACUTE',
'inpatient acute'
),
(
'NONAC',
'inpatient non-acute'
),
(
'OBSENC',
'observation encounter'
),
(
'PRENC',
'pre-admission'
),
(
'SS',
'short stay'
),
(
'VR',
'virtual'
)
)
AS t (code, display)
26 changes: 22 additions & 4 deletions cumulus_library/studies/core/study_period.sql
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ WITH documented_encounter AS (
ce.encounter_ref,
cd.doc_ref,
date_diff('day', ce.start_date, cd.author_date) AS diff_enc_note_days,
coalesce(ce.enc_class.display, 'None') AS enc_class_code,
coalesce(ce.enc_class_display, 'None') AS enc_class_display,
coalesce(cd.doc_type.code, 'None') AS doc_type_code,
coalesce(cd.doc_type.display, cd.doc_type.code) AS doc_type_display
FROM
Expand All @@ -33,11 +33,29 @@ WITH documented_encounter AS (
)

SELECT
documented_encounter.*,
de.start_date,
de.start_week,
de.start_month,
de.end_date,
de.age_at_visit,
de.author_date,
de.author_week,
de.author_month,
de.author_year,
de.gender,
de.race_display,
de.ethnicity_display,
de.subject_ref,
de.encounter_ref,
de.doc_ref,
de.diff_enc_note_days,
de.enc_class_display,
de.doc_type_code,
de.doc_type_display,
coalesce(ed.code IS NOT NULL, false) AS ed_note
FROM documented_encounter
FROM documented_encounter AS de
LEFT JOIN core__ed_note AS ed
ON documented_encounter.doc_type_code = ed.from_code;
ON de.doc_type_code = ed.from_code;

CREATE TABLE core__meta_date AS
SELECT
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/studies/core/version.sql
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
CREATE TABLE core__meta_version AS
SELECT 1 AS data_package_version;
SELECT 2 AS data_package_version;
27 changes: 20 additions & 7 deletions cumulus_library/study_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,21 @@ def clean_study(
:param schema_name: The name of the schema containing the study tables
:verbose: toggle from progress bar to query output, optional
:returns: list of dropped tables (for unit testing only)
:prefix: override prefix discovery with the provided prefix
TODO: If we need to support additional databases, we may need to investigate
additional ways to get a list of table prefixes
"""
if not schema_name:
raise ValueError("No database provided")
if not prefix:
prefix = self.get_study_prefix()
view_sql = get_show_views(schema_name, prefix)
table_sql = get_show_tables(schema_name, prefix)
drop_prefix = f"{self.get_study_prefix()}__"
display_prefix = self.get_study_prefix()
else:
drop_prefix = prefix
display_prefix = drop_prefix
view_sql = get_show_views(schema_name, drop_prefix)
table_sql = get_show_tables(schema_name, drop_prefix)
view_table_list = []
for query_and_type in [[view_sql, "VIEW"], [table_sql, "TABLE"]]:
cursor.execute(query_and_type[0])
Expand All @@ -180,19 +185,24 @@ def clean_study(
# study builder, and remove them from the list.
for view_table in view_table_list.copy():
if any(
view_table[0].startswith(f"{self.get_study_prefix()}__{word}_")
((f"_{word}_") in view_table[0] or view_table[0].endswith(word))
for word in RESERVED_TABLE_KEYWORDS
):
view_table_list.remove(view_table)

# We want to only show a progress bar if we are :not: printing SQL lines
with get_progress_bar(disable=verbose) as progress:
task = progress.add_task(
f"Removing {self.get_study_prefix()} study artifacts...",
f"Removing {display_prefix} study artifacts...",
total=len(view_table_list),
visible=not verbose,
)
self._execute_drop_queries(cursor, verbose, view_table_list, progress, task)
self._execute_drop_queries(
cursor,
verbose,
view_table_list,
progress,
task,
)
return view_table_list

def _execute_drop_queries(
Expand Down Expand Up @@ -427,6 +437,9 @@ def export_study(self, cursor: object, data_path: PosixPath) -> List:
dataframe = cursor.execute(query).as_pandas()
path = Path(f"{str(data_path)}/{self.get_study_prefix()}/")
path.mkdir(parents=True, exist_ok=True)
dataframe = dataframe.sort_values(
by=list(dataframe.columns), ascending=False, na_position="first"
)
dataframe.to_csv(f"{path}/{table}.csv", index=False)
dataframe.to_parquet(f"{path}/{table}.parquet", index=False)
queries.append(query)
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/template_sql/show_tables.sql.jinja
Original file line number Diff line number Diff line change
@@ -1 +1 @@
SHOW TABLES FROM `{{schema_name}}` '{{prefix}}__*';
SHOW TABLES FROM `{{schema_name}}` '{{prefix}}*';
2 changes: 1 addition & 1 deletion cumulus_library/template_sql/show_views.sql.jinja
Original file line number Diff line number Diff line change
@@ -1 +1 @@
SHOW VIEWS IN "{{schema_name}}" LIKE '{{prefix}}__*';
SHOW VIEWS IN "{{schema_name}}" LIKE '{{prefix}}*';
Loading

0 comments on commit eae81c1

Please sign in to comment.