Counts generation performance improvements

smart-on-fhir · Feb 1, 2024 · 936dc2d · 936dc2d
1 parent 820e1a9
commit 936dc2d
Show file tree

Hide file tree

Showing 33 changed files with 938 additions and 544 deletions.
diff --git a/cumulus_library/base_table_builder.py b/cumulus_library/base_table_builder.py
@@ -8,7 +8,7 @@
 from typing import final
 
 from cumulus_library.databases import DatabaseCursor
-from cumulus_library.helper import get_progress_bar, query_console_output
+from cumulus_library import helper
 
 
 class BaseTableBuilder(ABC):
@@ -77,18 +77,20 @@ def execute_queries(
                     table_names.append(table_name)
             for table_name in table_names:
                 cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
-        with get_progress_bar(disable=verbose) as progress:
+        with helper.get_progress_bar(disable=verbose) as progress:
             task = progress.add_task(
                 self.display_text,
                 total=len(self.queries),
                 visible=not verbose,
             )
             for query in self.queries:
-                query_console_output(verbose, query, progress, task)
                 try:
+                    helper.query_console_verbose(verbose, query)
                     cursor.execute(query)
+                    helper.query_console_progress(verbose, progress, task)
                 except Exception as e:  # pylint: disable=broad-exception-caught
                     sys.exit(e)
+
         self.post_execution(cursor, schema, verbose, drop_table, *args, **kwargs)
 
     def post_execution(
@@ -120,8 +122,8 @@ def comment_queries(self, doc_str=None):
         self.queries = commented_queries
 
     def write_queries(self, path: pathlib.Path = pathlib.Path.cwd() / "output.sql"):
-        path.parents[0].mkdir(parents=True, exist_ok=True)
         """writes all queries constructed by prepare_queries to disk"""
+        path.parents[0].mkdir(parents=True, exist_ok=True)
         with open(path, "w", encoding="utf-8") as file:
             for query in self.queries:
                 file.write(query)

diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
@@ -356,7 +356,7 @@ def run_cli(args: Dict):
 
             elif args["action"] == "generate-sql":
                 if "all" in args["target"]:
-                    for target in study_dict.keys():
+                    for target in study_dict.keys():  # pylint: disable= C0206, C0201
                         runner.generate_all_sql(study_dict[target])
                 else:
                     for target in args["target"]:

diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py
@@ -216,6 +216,21 @@ def __init__(self, db_file: str):
             None,
             duckdb.typing.TIMESTAMP,
         )
+        self.connection.create_function(
+            # This is in support of the md5 function - duckdb's implementation of
+            # md5 just takes a string (very reasonable!), but trino's implementation
+            # takes a varbinary type (which duckdb does not support), so we have to
+            # run a string through a UTF8 conversion in production environments.
+            # For duckdb's purposes, we'll just return the argument passed with no
+            # modifications.
+            # NOTE: currently we do not have a use case beyond experimentation where
+            # this provides a benefit. Until we do, it is not required to support this
+            # in other DatabaseBackend implementations.
+            "to_utf8",
+            self._compat_to_utf8,
+            None,
+            duckdb.typing.VARCHAR,
+        )
 
     def insert_tables(self, tables: dict[str, pyarrow.Table]) -> None:
         """Ingests all ndjson data from a folder tree (often the output folder of Cumulus ETL)"""
@@ -245,6 +260,10 @@ def _compat_date(
         else:
             raise ValueError("Unexpected date() argument:", type(value), value)
 
+    @staticmethod
+    def _compat_to_utf8(value: str) -> Optional[datetime.date]:
+        return value
+
     @staticmethod
     def _compat_from_iso8601_timestamp(
         value: Optional[str],

diff --git a/cumulus_library/helper.py b/cumulus_library/helper.py
@@ -48,15 +48,19 @@ def list_coding(code_display: dict, system=None) -> List[dict]:
     return as_list
 
 
-def query_console_output(
-    verbose: bool, query: str, progress_bar: progress.Progress, task: progress.Task
+def query_console_progress(
+    verbose: bool, progress_bar: progress.Progress, task: progress.Task
 ):
-    """Convenience function for determining output type"""
+    """Convenience function for updating progress bar"""
+    if not verbose:
+        progress_bar.advance(task)
+
+
+def query_console_verbose(verbose: bool, query: str):
+    """Convenience function for printing verbose queries"""
     if verbose:
         print()
         print(query)
-    else:
-        progress_bar.advance(task)
 
 
 def get_progress_bar(**kwargs) -> progress.Progress:

diff --git a/cumulus_library/studies/core/builder_medication.py b/cumulus_library/studies/core/builder_medication.py
@@ -1,13 +1,11 @@
 """ Module for generating core medication table"""
 
-from cumulus_library.base_table_builder import BaseTableBuilder
-from cumulus_library.helper import get_progress_bar, query_console_output
-from cumulus_library.template_sql import templates
-from cumulus_library.template_sql.utils import is_codeable_concept_populated
+from cumulus_library import base_table_builder, helper
+from cumulus_library.template_sql import templates, utils
 from cumulus_library.studies.core.core_templates import core_templates
 
 
-class MedicationBuilder(BaseTableBuilder):
+class MedicationBuilder(base_table_builder.BaseTableBuilder):
     display_text = "Creating Medication table..."
 
     def _check_data_in_fields(self, cursor, schema: str):
@@ -27,14 +25,14 @@ def _check_data_in_fields(self, cursor, schema: str):
 
         table = "medicationrequest"
         base_col = "medicationcodeableconcept"
-        with get_progress_bar(transient=True) as progress:
+        with helper.get_progress_bar(transient=True) as progress:
             task = progress.add_task(
                 "Detecting available medication sources...",
                 total=7,
             )
 
             # inline medications from FHIR medication
-            data_types["inline"] = is_codeable_concept_populated(
+            data_types["inline"] = utils.is_codeable_concept_populated(
                 schema, table, base_col, cursor
             )
             if data_types["inline"]:

diff --git a/cumulus_library/studies/core/builder_prereq_tables.py b/cumulus_library/studies/core/builder_prereq_tables.py
@@ -7,7 +7,6 @@
 import sqlparse
 
 from cumulus_library import base_table_builder
-from cumulus_library.helper import get_progress_bar, query_console_output
 
 
 class CorePrereqTableBuilder(base_table_builder.BaseTableBuilder):

diff --git a/cumulus_library/studies/core/count_core.py b/cumulus_library/studies/core/count_core.py
@@ -12,6 +12,7 @@ def count_core_condition(self, duration: str = "month"):
         cols = [
             ["category_code", "varchar", "cond_category_code"],
             [f"recorded_{duration}", "date", "cond_month"],
+            ["code_display", "varchar", None],
         ]
         return self.count_condition(table_name, from_table, cols)
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_condition.sql b/cumulus_library/studies/core/reference_sql/builder_condition.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__condition_codable_concepts_display AS (
     WITH
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_documentreference.sql b/cumulus_library/studies/core/reference_sql/builder_documentreference.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__documentreference_dn_type AS (
     WITH
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_encounter.sql b/cumulus_library/studies/core/reference_sql/builder_encounter.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__encounter_dn_type AS (
     WITH
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_medication.sql b/cumulus_library/studies/core/reference_sql/builder_medication.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__medication AS (
     WITH
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_medicationrequest.sql b/cumulus_library/studies/core/reference_sql/builder_medicationrequest.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__medicationrequest_dn_category AS (
     WITH
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_observation.sql b/cumulus_library/studies/core/reference_sql/builder_observation.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__observation_dn_category AS (
     WITH
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_patient.sql b/cumulus_library/studies/core/reference_sql/builder_patient.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__patient_ext_race AS (
     WITH
 

diff --git a/cumulus_library/studies/core/reference_sql/builder_prereq_tables.sql b/cumulus_library/studies/core/reference_sql/builder_prereq_tables.sql
@@ -3,6 +3,9 @@
 -- Its format is tied to the specific database it was run against, and it may not
 -- be correct for all databases. Use the CLI's build option to derive the best SQL
 -- for your dataset.
+
+-- ###########################################################
+
 CREATE TABLE core__meta_version AS
 SELECT 3 AS data_package_version;