Skip to content

Commit

Permalink
Counts generation performance improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
dogversioning committed Feb 1, 2024
1 parent 820e1a9 commit 936dc2d
Show file tree
Hide file tree
Showing 33 changed files with 938 additions and 544 deletions.
10 changes: 6 additions & 4 deletions cumulus_library/base_table_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import final

from cumulus_library.databases import DatabaseCursor
from cumulus_library.helper import get_progress_bar, query_console_output
from cumulus_library import helper


class BaseTableBuilder(ABC):
Expand Down Expand Up @@ -77,18 +77,20 @@ def execute_queries(
table_names.append(table_name)
for table_name in table_names:
cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
with get_progress_bar(disable=verbose) as progress:
with helper.get_progress_bar(disable=verbose) as progress:
task = progress.add_task(
self.display_text,
total=len(self.queries),
visible=not verbose,
)
for query in self.queries:
query_console_output(verbose, query, progress, task)
try:
helper.query_console_verbose(verbose, query)
cursor.execute(query)
helper.query_console_progress(verbose, progress, task)
except Exception as e: # pylint: disable=broad-exception-caught
sys.exit(e)

self.post_execution(cursor, schema, verbose, drop_table, *args, **kwargs)

def post_execution(
Expand Down Expand Up @@ -120,8 +122,8 @@ def comment_queries(self, doc_str=None):
self.queries = commented_queries

def write_queries(self, path: pathlib.Path = pathlib.Path.cwd() / "output.sql"):
path.parents[0].mkdir(parents=True, exist_ok=True)
"""writes all queries constructed by prepare_queries to disk"""
path.parents[0].mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as file:
for query in self.queries:
file.write(query)
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ def run_cli(args: Dict):

elif args["action"] == "generate-sql":
if "all" in args["target"]:
for target in study_dict.keys():
for target in study_dict.keys(): # pylint: disable= C0206, C0201
runner.generate_all_sql(study_dict[target])
else:
for target in args["target"]:
Expand Down
19 changes: 19 additions & 0 deletions cumulus_library/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,21 @@ def __init__(self, db_file: str):
None,
duckdb.typing.TIMESTAMP,
)
self.connection.create_function(
# This is in support of the md5 function - duckdb's implementation of
# md5 just takes a string (very reasonable!), but trino's implementation
# takes a varbinary type (which duckdb does not support), so we have to
# run a string through a UTF8 conversion in production environments.
# For duckdb's purposes, we'll just return the argument passed with no
# modifications.
# NOTE: currently we do not have a use case beyond experimentation where
# this provides a benefit. Until we do, it is not required to support this
# in other DatabaseBackend implementations.
"to_utf8",
self._compat_to_utf8,
None,
duckdb.typing.VARCHAR,
)

def insert_tables(self, tables: dict[str, pyarrow.Table]) -> None:
"""Ingests all ndjson data from a folder tree (often the output folder of Cumulus ETL)"""
Expand Down Expand Up @@ -245,6 +260,10 @@ def _compat_date(
else:
raise ValueError("Unexpected date() argument:", type(value), value)

@staticmethod
def _compat_to_utf8(value: Optional[str]) -> Optional[str]:
    """Return ``value`` unchanged, as a stand-in for a ``to_utf8`` SQL function.

    This is registered with DuckDB under the name ``to_utf8`` with a VARCHAR
    return type. DuckDB's md5 accepts a plain string, so no conversion is
    needed here; the function only exists so that queries which wrap their
    md5 argument in ``to_utf8(...)`` for other engines still run on DuckDB.

    :param value: the string passed to ``to_utf8`` in the query
    :returns: the same object that was passed in, with no modifications
    """
    # Identity on purpose: the annotation is a string type (not a date), which
    # matches the VARCHAR return type used when the function is registered.
    return value

@staticmethod
def _compat_from_iso8601_timestamp(
value: Optional[str],
Expand Down
14 changes: 9 additions & 5 deletions cumulus_library/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,19 @@ def list_coding(code_display: dict, system=None) -> List[dict]:
return as_list


def query_console_output(
verbose: bool, query: str, progress_bar: progress.Progress, task: progress.Task
def query_console_progress(
verbose: bool, progress_bar: progress.Progress, task: progress.Task
):
"""Convenience function for determining output type"""
"""Convenience function for updating progress bar"""
if not verbose:
progress_bar.advance(task)


def query_console_verbose(verbose: bool, query: str):
"""Convenience function for printing verbose queries"""
if verbose:
print()
print(query)
else:
progress_bar.advance(task)


def get_progress_bar(**kwargs) -> progress.Progress:
Expand Down
12 changes: 5 additions & 7 deletions cumulus_library/studies/core/builder_medication.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
""" Module for generating core medication table"""

from cumulus_library.base_table_builder import BaseTableBuilder
from cumulus_library.helper import get_progress_bar, query_console_output
from cumulus_library.template_sql import templates
from cumulus_library.template_sql.utils import is_codeable_concept_populated
from cumulus_library import base_table_builder, helper
from cumulus_library.template_sql import templates, utils
from cumulus_library.studies.core.core_templates import core_templates


class MedicationBuilder(BaseTableBuilder):
class MedicationBuilder(base_table_builder.BaseTableBuilder):
display_text = "Creating Medication table..."

def _check_data_in_fields(self, cursor, schema: str):
Expand All @@ -27,14 +25,14 @@ def _check_data_in_fields(self, cursor, schema: str):

table = "medicationrequest"
base_col = "medicationcodeableconcept"
with get_progress_bar(transient=True) as progress:
with helper.get_progress_bar(transient=True) as progress:
task = progress.add_task(
"Detecting available medication sources...",
total=7,
)

# inline medications from FHIR medication
data_types["inline"] = is_codeable_concept_populated(
data_types["inline"] = utils.is_codeable_concept_populated(
schema, table, base_col, cursor
)
if data_types["inline"]:
Expand Down
1 change: 0 additions & 1 deletion cumulus_library/studies/core/builder_prereq_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import sqlparse

from cumulus_library import base_table_builder
from cumulus_library.helper import get_progress_bar, query_console_output


class CorePrereqTableBuilder(base_table_builder.BaseTableBuilder):
Expand Down
1 change: 1 addition & 0 deletions cumulus_library/studies/core/count_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def count_core_condition(self, duration: str = "month"):
cols = [
["category_code", "varchar", "cond_category_code"],
[f"recorded_{duration}", "date", "cond_month"],
["code_display", "varchar", None],
]
return self.count_condition(table_name, from_table, cols)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__condition_codable_concepts_display AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__documentreference_dn_type AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__encounter_dn_type AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__medication AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__medicationrequest_dn_category AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__observation_dn_category AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__patient_ext_race AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__meta_version AS
SELECT 3 AS data_package_version;

Expand Down
Loading

0 comments on commit 936dc2d

Please sign in to comment.