diff --git a/cumulus_library/base_table_builder.py b/cumulus_library/base_table_builder.py index 95534673..bf6a8783 100644 --- a/cumulus_library/base_table_builder.py +++ b/cumulus_library/base_table_builder.py @@ -85,9 +85,8 @@ def execute_queries( ) for query in self.queries: try: - helper.query_console_verbose(verbose, query) - cursor.execute(query) - helper.query_console_progress(verbose, progress, task) + with helper.query_console_output(verbose, query, progress, task): + cursor.execute(query) except Exception as e: # pylint: disable=broad-exception-caught sys.exit(e) diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index 29dca9ed..6dd58afa 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -355,12 +355,8 @@ def run_cli(args: Dict): runner.export_study(study_dict[target], args["data_path"]) elif args["action"] == "generate-sql": - if "all" in args["target"]: - for target in study_dict.keys(): # pylint: disable= C0206, C0201 - runner.generate_all_sql(study_dict[target]) - else: - for target in args["target"]: - runner.generate_study_sql(study_dict[target]) + for target in args["target"]: + runner.generate_study_sql(study_dict[target]) finally: db_backend.close() diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index 749d329d..858a7156 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -217,15 +217,20 @@ def __init__(self, db_file: str): duckdb.typing.TIMESTAMP, ) self.connection.create_function( - # This is in support of the md5 function - duckdb's implementation of - # md5 just takes a string (very reasonable!), but trino's implementation - # takes a varbinary type (which duckdb does not support), so we have to - # run a string through a UTF8 conversion in production environments. - # For duckdb's purposes, we'll just return the argument passed with no - # modifications. 
+ # When trying to calculate an MD5 hash in Trino/Athena, the implementation + # expects to receive a varbinary type, so if you're hashing a string, + # you would invoke it like `SELECT md5(to_utf8(string_col)) from table`. + # + # DuckDB's md5() function accepts a varchar instead, and does not have a + # to_utf8() function or varbinary type, so we patch this with a UDF that + # just provides back the original string. As a result, these functions + # have different signatures, but for cases like this where you're + # conforming an argument to another function, it provides appropriate + # function mocking. + # + # NOTE: currently we do not have a use case beyond experimentation where - # this provides a benefit. Until we do, it is not required to support this - # in other DatabaseBackend implementations. + # using MD5 hashes provides a benefit. Until we do, it is not required to + # support this in other DatabaseBackend implementations. "to_utf8", self._compat_to_utf8, None, @@ -261,7 +266,8 @@ def _compat_date( raise ValueError("Unexpected date() argument:", type(value), value) @staticmethod - def _compat_to_utf8(value: str) -> Optional[datetime.date]: + def _compat_to_utf8(value: Optional[str]) -> Optional[str]: + """See the create_function() call for to_utf8 for more background""" return value @staticmethod diff --git a/cumulus_library/helper.py b/cumulus_library/helper.py index 851ac05e..84fc044e 100644 --- a/cumulus_library/helper.py +++ b/cumulus_library/helper.py @@ -3,7 +3,10 @@ import datetime import os import json + +from contextlib import contextmanager from typing import List + from rich import progress @@ -48,19 +51,17 @@ def list_coding(code_display: dict, system=None) -> List[dict]: return as_list -def query_console_progress( - verbose: bool, progress_bar: progress.Progress, task: progress.Task +@contextmanager +def query_console_output( + verbose: bool, query: str, progress_bar: progress.Progress, task: progress.Task ): - """Convenience function 
for updating progress bar""" - if not verbose: - progress_bar.advance(task) - - -def query_console_verbose(verbose: bool, query: str): - """Convenience function for printing verbose queries""" + """Convenience context manager for handling console output""" if verbose: print() print(query) + yield + if not verbose: + progress_bar.advance(task) def get_progress_bar(**kwargs) -> progress.Progress: diff --git a/cumulus_library/study_parser.py b/cumulus_library/study_parser.py index 259d1f85..9c1d69d7 100644 --- a/cumulus_library/study_parser.py +++ b/cumulus_library/study_parser.py @@ -311,9 +311,10 @@ def _execute_drop_queries( drop_view_table = templates.get_drop_view_table( name=view_table[0], view_or_table=view_table[1] ) - helper.query_console_verbose(verbose, drop_view_table) - cursor.execute(drop_view_table) - helper.query_console_progress(verbose, progress, task) + with helper.query_console_output( + verbose, drop_view_table, progress, task + ): + cursor.execute(drop_view_table) def _load_and_execute_builder( self, @@ -653,9 +654,10 @@ def _execute_build_queries( "start with a string like `study_prefix__`.", ) try: - helper.query_console_verbose(verbose, query[0]) - cursor.execute(query[0]) - helper.query_console_progress(verbose, progress, task) + with helper.query_console_output( + verbose, query[0], progress, task + ): + cursor.execute(query[0]) except Exception as e: # pylint: disable=broad-exception-caught self._query_error( query, diff --git a/cumulus_library/template_sql/statistics/count.sql.jinja b/cumulus_library/template_sql/statistics/count.sql.jinja index 3107f5f4..617279f2 100644 --- a/cumulus_library/template_sql/statistics/count.sql.jinja +++ b/cumulus_library/template_sql/statistics/count.sql.jinja @@ -22,7 +22,7 @@ infrastructure for future/moving resource specific logic to template generator - {#- implicit null -#} {%- endmacro -%} -{%- macro cols_delenated_list(table_cols, fhir_resource)-%} +{%- macro 
cols_delineated_list(table_cols, fhir_resource)-%} {% for col in table_cols %} "{{ column_or_alias(col) }}" {{- comma_delineate(loop) }} @@ -145,7 +145,7 @@ CREATE TABLE {{ table_name }} AS ( tertiary_powerset AS( SELECT count(DISTINCT {{tertiary}}) AS cnt_{{tertiary}}, - {{-cols_delenated_list(table_cols, fhir_resource)}}, + {{-cols_delineated_list(table_cols, fhir_resource)}}, concat_ws( '-', {{-cols_coalesced_list(table_cols, fhir_resource)}} @@ -153,7 +153,7 @@ CREATE TABLE {{ table_name }} AS ( FROM null_replacement GROUP BY cube( - {{-cols_delenated_list(table_cols, fhir_resource)}} + {{-cols_delineated_list(table_cols, fhir_resource)}} ) ), @@ -162,7 +162,7 @@ CREATE TABLE {{ table_name }} AS ( secondary_powerset AS ( SELECT count(DISTINCT {{secondary}}) AS cnt_{{secondary}}, - {{-cols_delenated_list(table_cols, fhir_resource)}}, + {{-cols_delineated_list(table_cols, fhir_resource)}}, concat_ws( '-', {{-cols_coalesced_list(table_cols, fhir_resource)}} @@ -170,7 +170,7 @@ CREATE TABLE {{ table_name }} AS ( FROM null_replacement GROUP BY cube( - {{-cols_delenated_list(table_cols, fhir_resource)}} + {{-cols_delineated_list(table_cols, fhir_resource)}} ) ), {%- endif %} @@ -178,7 +178,7 @@ CREATE TABLE {{ table_name }} AS ( powerset AS ( SELECT count(DISTINCT subject_ref) AS cnt_subject_ref, - {{-cols_delenated_list(table_cols, fhir_resource)}}, + {{-cols_delineated_list(table_cols, fhir_resource)}}, concat_ws( '-', {{-cols_coalesced_list(table_cols, fhir_resource)}} @@ -186,7 +186,7 @@ CREATE TABLE {{ table_name }} AS ( FROM null_replacement GROUP BY cube( - {{-cols_delenated_list(table_cols, fhir_resource)}} + {{-cols_delineated_list(table_cols, fhir_resource)}} ) ) diff --git a/docs/creating-studies.md b/docs/creating-studies.md index b3052b15..edc2456c 100644 --- a/docs/creating-studies.md +++ b/docs/creating-studies.md @@ -117,7 +117,7 @@ styling. - All your tables **must** start with a string like `my_study__`. 
- Relatedly, **`__` is a reserved character string**. Your table names should have exactly one of these. We :might: add other use cases for these in the future, - but as of this writing we don't plan to. C + but as of this writing we don't plan to. - We have **three reserved post-study prefrix substrings: `etl_`, `nlp_`, and `lib_`** so that we don't drop tables created by other processes. These are fine to use elsewhere; `my_study__nlp_counts` would cause an error, but diff --git a/tests/test_core.py b/tests/test_core.py index d2c8c0e6..3f95f6f8 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -13,6 +13,15 @@ from tests.conftest import ResourceTableIdPos as idpos # pylint: disable=unused-import +def get_sorted_table_data(cursor, table): + num_cols = cursor.execute( + "SELECT count(*) FROM information_schema.columns " f"WHERE table_name='{table}'" + ).fetchone()[0] + return cursor.execute( + f"SELECT * FROM '{table}' ORDER BY " f"{','.join(map(str, range(1,num_cols)))}" + ) + + @pytest.mark.parametrize( "table", [ diff --git a/tests/test_counts_templates.py b/tests/test_counts_templates.py index 2d311df2..55a211a0 100644 --- a/tests/test_counts_templates.py +++ b/tests/test_counts_templates.py @@ -202,6 +202,6 @@ def test_count_query(expected, kwargs): query = get_count_query("test_table", "test_source", ["age", "sex"], **kwargs) # Snippet for getting updated template output - with open("output.sql", "w") as f: - f.write(query) + # with open("output.sql", "w") as f: + # f.write(query) assert query == expected diff --git a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv index a54bee58..7b01156e 100644 --- a/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv +++ b/tests/test_data/duckdb_data/expected_export/core/core__count_condition_month.csv @@ -1,3 +1,3 @@ -cnt,cond_category_code,cond_month,code_display 
+cnt,cond_category_code,cond_month,cond_code_display 15,,, 15,encounter-diagnosis,,