From fd14ce3530fcd72c20bba7a1685e93afd9dbaace Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Tue, 25 Jun 2024 15:16:49 -0400 Subject: [PATCH] Added generic logging interface --- cumulus_library/actions/builder.py | 32 +--- cumulus_library/actions/file_generator.py | 2 +- cumulus_library/cli.py | 61 +++---- cumulus_library/enums.py | 16 +- cumulus_library/log_utils.py | 95 +++++++++++ cumulus_library/protected_table_builder.py | 8 +- .../template_sql/base_templates.py | 2 + .../template_sql/insert_into.sql.jinja | 4 +- cumulus_library/template_sql/sql_utils.py | 78 ++++++++- docs/creating-sql-with-python.md | 4 + tests/test_base_templates.py | 6 +- tests/test_cli.py | 2 - .../test_data/study_python_valid_generated.md | 1 + tests/test_logging.py | 151 ++++++++++++++++++ 14 files changed, 383 insertions(+), 79 deletions(-) create mode 100644 cumulus_library/log_utils.py create mode 100644 tests/test_logging.py diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index 0469e804..2758eae7 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -10,17 +10,16 @@ from rich.progress import Progress, TaskID from cumulus_library import ( - __version__, base_table_builder, base_utils, databases, enums, errors, + log_utils, protected_table_builder, study_parser, ) from cumulus_library.statistics import psm -from cumulus_library.template_sql import base_templates @contextlib.contextmanager @@ -263,29 +262,14 @@ def run_statistics_builders( config=config, manifest=manifest, ) - - insert_query = base_templates.get_insert_into_query( - f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", - [ - "study_name", - "library_version", - "table_type", - "table_name", - "view_name", - "created_on", - ], - [ - [ - manifest.get_study_prefix(), - __version__, - config_type, - f"{target_table}_{safe_timestamp}", - target_table, - base_utils.get_utc_datetime(), - ] - ], + log_utils.log_statistics( + cursor=cursor, + schema=schema, + manifest=manifest, + table_type=config_type, + table_name=f"{target_table}_{safe_timestamp}", + view_name=target_table, ) - cursor.execute(insert_query) def run_matching_table_builder( diff --git a/cumulus_library/actions/file_generator.py b/cumulus_library/actions/file_generator.py index 8eb4c6f0..ea3aea81 100644 --- a/cumulus_library/actions/file_generator.py +++ b/cumulus_library/actions/file_generator.py @@ -85,7 +85,7 @@ def run_generate_markdown( if len(count_tables) > 0: f.write(f"## {manifest_parser.get_study_prefix()} count tables\n\n") for table in count_tables: - manifest_parser._write_md_table(table, study_df, f) + _write_md_table(table, study_df, f) if len(base_tables) > 0: f.write(f"## {manifest_parser.get_study_prefix()} base tables\n\n") for table in base_tables: diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index 09571fcd..58521079 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -17,7 +17,7 @@ databases, enums, errors, - protected_table_builder, + log_utils, study_parser, ) from cumulus_library.actions import ( @@ -28,7 +28,6 @@ importer, uploader, ) -from cumulus_library.template_sql import base_templates class StudyRunner: @@ -49,36 +48,6 @@ def get_schema(self, manifest: study_parser.StudyManifestParser): return dedicated return self.schema_name - def update_transactions( - self, manifest: study_parser.StudyManifestParser, status: str - ): - """Adds a record to a study's transactions table""" - if manifest.get_dedicated_schema(): - transactions = ( - f"{manifest.get_dedicated_schema()}." - f"{enums.ProtectedTables.TRANSACTIONS.value}" - ) - else: - transactions = ( - f"{manifest.get_study_prefix()}__" - f"{enums.ProtectedTables.TRANSACTIONS.value}" - ) - self.cursor.execute( - base_templates.get_insert_into_query( - transactions, - protected_table_builder.TRANSACTIONS_COLS, - [ - [ - manifest.get_study_prefix(), - __version__, - status, - base_utils.get_utc_datetime(), - ] - ], - {"event_time": "TIMESTAMP"}, - ) - ) - ### Creating studies def clean_study( @@ -153,7 +122,12 @@ def clean_and_build_study( config=config, ) if not continue_from: - self.update_transactions(manifest, "started") + log_utils.log_transaction( + cursor=self.cursor, + schema=schema, + manifest=manifest, + status=enums.LogStatuses.STARTED, + ) cleaned_tables = cleaner.clean_study( manifest_parser=manifest, cursor=self.cursor, @@ -173,7 +147,12 @@ def clean_and_build_study( config=config, ) else: - self.update_transactions(manifest, "resumed") + log_utils.log_transaction( + cursor=self.cursor, + schema=schema, + manifest=manifest, + status=enums.LogStatuses.RESUMED, + ) builder.build_study( manifest, self.cursor, @@ -195,14 +174,24 @@ def clean_and_build_study( verbose=self.verbose, config=config, ) - self.update_transactions(manifest, "finished") + log_utils.log_transaction( + cursor=self.cursor, + schema=schema, + manifest=manifest, + status=enums.LogStatuses.FINISHED, + ) except errors.StudyManifestFilesystemError as e: # This should be thrown prior to any database connections, so # skipping logging raise e except Exception as e: - self.update_transactions(manifest, "error") + log_utils.log_transaction( + cursor=self.cursor, + schema=schema, + manifest=manifest, + status=enums.LogStatuses.ERROR, + ) raise e def run_matching_table_builder( diff --git a/cumulus_library/enums.py b/cumulus_library/enums.py index 9a7f91a4..e3c7e899 100644 --- a/cumulus_library/enums.py +++ b/cumulus_library/enums.py @@ -1,9 +1,9 @@ """Holds enums used across more than one module""" -from enum import Enum +import enum -class ProtectedTableKeywords(Enum): +class ProtectedTableKeywords(enum.Enum): """Tables with a pattern like '_{keyword}_' are not manually dropped.""" ETL = "etl" @@ -11,8 +11,18 @@ class ProtectedTableKeywords(Enum): NLP = "nlp" -class ProtectedTables(Enum): +class ProtectedTables(enum.Enum): """Tables created by cumulus for persistence outside of study rebuilds""" STATISTICS = "lib_statistics" TRANSACTIONS = "lib_transactions" + + +class LogStatuses(enum.Enum): + DEBUG = "debug" + ERROR = "error" + FINISHED = "finished" + INFO = "info" + RESUMED = "resumed" + STARTED = "started" + WARN = "warn" diff --git a/cumulus_library/log_utils.py b/cumulus_library/log_utils.py new file mode 100644 index 00000000..2a04be8c --- /dev/null +++ b/cumulus_library/log_utils.py @@ -0,0 +1,95 @@ +"""A set of convenience functions for database logging""" + +from cumulus_library import ( + __version__, + base_utils, + databases, + enums, + errors, + study_parser, +) +from cumulus_library.template_sql import base_templates, sql_utils + + +def log_transaction( + *, + cursor: databases.DatabaseCursor, + schema: str, + manifest: study_parser.StudyManifestParser, + status: enums.LogStatuses | str | None = enums.LogStatuses.INFO, + message: str | None = None, +): + if isinstance(status, str): + try: + status = enums.LogStatuses(status) + except ValueError as e: + raise errors.CumulusLibraryError( + f"Invalid event type {status} requested for transaction log.\n" + f"Valid types: {','.join([x.value for x in enums.LogStatuses])}" + ) from e + _log_table( + table=sql_utils.TransactionsTable(), + cursor=cursor, + schema=schema, + manifest=manifest, + dataset=[ + [ + manifest.get_study_prefix(), + __version__, + status.value, + base_utils.get_utc_datetime(), + message if message else None, + ] + ], + ) + + +def log_statistics( + *, + cursor: databases.DatabaseCursor, + schema: str, + manifest: study_parser.StudyManifestParser, + table_type: str, + table_name: str, + view_name: str, +): + _log_table( + table=sql_utils.StatisticsTable(), + cursor=cursor, + schema=schema, + manifest=manifest, + dataset=[ + [ + manifest.get_study_prefix(), + __version__, + table_type, + table_name, + view_name, + base_utils.get_utc_datetime(), + ] + ], + ) + + +def _log_table( + *, + table: sql_utils.BaseTable, + cursor: databases.DatabaseCursor, + schema: str, + manifest: study_parser.StudyManifestParser, + dataset: list[list], +): + if manifest and manifest.get_dedicated_schema(): + db_schema = manifest.get_dedicated_schema() + table_name = table.name + else: + db_schema = schema + table_name = f"{manifest.get_study_prefix()}__{table.name}" + query = base_templates.get_insert_into_query( + schema=db_schema, + table_name=table_name, + table_cols=table.columns, + dataset=dataset, + type_casts=table.type_casts, + ) + cursor.execute(query) diff --git a/cumulus_library/protected_table_builder.py b/cumulus_library/protected_table_builder.py index 0ab3227c..70d7e3d6 100644 --- a/cumulus_library/protected_table_builder.py +++ b/cumulus_library/protected_table_builder.py @@ -1,10 +1,10 @@ """Builder for creating tables for tracking state/logging changes""" -from cumulus_library import base_table_builder, enums, study_parser +from cumulus_library import base_table_builder, databases, enums, study_parser from cumulus_library.template_sql import base_templates -TRANSACTIONS_COLS = ["study_name", "library_version", "status", "event_time"] -TRANSACTION_COLS_TYPES = ["varchar", "varchar", "varchar", "timestamp"] +TRANSACTIONS_COLS = ["study_name", "library_version", "status", "event_time", "message"] +TRANSACTION_COLS_TYPES = ["varchar", "varchar", "varchar", "timestamp", "varchar"] # while it may seem redundant, study_name and view_name are included as a column for # ease of constructing a view of multiple transaction tables STATISTICS_COLS = [ @@ -32,7 +32,7 @@ class ProtectedTableBuilder(base_table_builder.BaseTableBuilder): def prepare_queries( self, - cursor: object, + cursor: databases.DatabaseCursor, schema: str, study_name: str, study_stats: dict, diff --git a/cumulus_library/template_sql/base_templates.py b/cumulus_library/template_sql/base_templates.py index 72bec480..4bfb0f36 100644 --- a/cumulus_library/template_sql/base_templates.py +++ b/cumulus_library/template_sql/base_templates.py @@ -298,6 +298,7 @@ def get_extension_denormalize_query(config: sql_utils.ExtensionConfig) -> str: def get_insert_into_query( + schema: str, table_name: str, table_cols: list[str], dataset: list[list[str]], @@ -313,6 +314,7 @@ def get_insert_into_query( type_casts = type_casts or {} return get_base_template( "insert_into", + schema_name=schema, table_name=table_name, table_cols=table_cols, dataset=dataset, diff --git a/cumulus_library/template_sql/insert_into.sql.jinja b/cumulus_library/template_sql/insert_into.sql.jinja index 9807243c..aca08fef 100644 --- a/cumulus_library/template_sql/insert_into.sql.jinja +++ b/cumulus_library/template_sql/insert_into.sql.jinja @@ -1,5 +1,5 @@ {%- import 'syntax.sql.jinja' as syntax -%} -INSERT INTO {{ table_name }} +INSERT INTO {{ schema_name }}.{{ table_name }} ( {%- for col in table_cols -%} "{{ col }}" @@ -12,6 +12,8 @@ VALUES {%- for field in row -%} {%- if table_cols[loop.index0] in type_casts.keys() -%} {{ type_casts[table_cols[loop.index0]] }} '{{ field }}' + {%- elif not field -%} + NULL {%- else -%} '{{ field }}' {%- endif -%} diff --git a/cumulus_library/template_sql/sql_utils.py b/cumulus_library/template_sql/sql_utils.py index d1cbdf3b..f5c7a5e4 100644 --- a/cumulus_library/template_sql/sql_utils.py +++ b/cumulus_library/template_sql/sql_utils.py @@ -11,7 +11,7 @@ from dataclasses import dataclass, field -from cumulus_library import base_utils, databases +from cumulus_library import base_utils, databases, enums from cumulus_library.template_sql import base_templates # *** Some convenience constants for providing to validate_schema() *** @@ -26,7 +26,73 @@ @dataclass(kw_only=True) -class BaseConfig: +class BaseTable: + """Base class for system table metadata + + :keyword name: the name of the table + :keyword columns: a list of columns names + :keyword column_types: a list of sql types for the columns + :keyword type_casts: a dict of column/type values that require explicit casting + """ + + name: str + columns: list + column_types: list + type_casts: dict + + +@dataclass(kw_only=True) +class TransactionsTable(BaseTable): + name: str = enums.ProtectedTables.TRANSACTIONS.value + columns: list = field( + default_factory=lambda: [ + "study_name", + "library_version", + "status", + "event_time", + "message", + ] + ) + column_types: list = field( + default_factory=lambda: [ + "varchar", + "varchar", + "varchar", + "timestamp", + "varchar", + ] + ) + type_casts: dict = field(default_factory=lambda: {"event_time": "timestamp"}) + + +@dataclass(kw_only=True) +class StatisticsTable(BaseTable): + name: str = enums.ProtectedTables.STATISTICS.value + columns: list = field( + default_factory=lambda: [ + "study_name", + "library_version", + "table_type", + "table_name", + "view_name", + "created_on", + ] + ) + column_types: list = field( + default_factory=lambda: [ + "varchar", + "varchar", + "varchar", + "varchar", + "varchar", + "timestamp", + ] + ) + type_casts: dict = field(default_factory=lambda: {"created_on": "timestamp"}) + + +@dataclass(kw_only=True) +class BaseFHIRResourceConfig: """Base class for handling table detection/denormalization""" source_table: str = None @@ -36,7 +102,7 @@ class BaseConfig: @dataclass(kw_only=True) -class CodeableConceptConfig(BaseConfig): +class CodeableConceptConfig(BaseFHIRResourceConfig): """Holds parameters for generating codableconcept tables. :keyword column_name: the column containing the codeableConcept you want to extract. @@ -63,7 +129,7 @@ class CodeableConceptConfig(BaseConfig): @dataclass(kw_only=True) -class CodingConfig(BaseConfig): +class CodingConfig(BaseFHIRResourceConfig): column_hierarchy: list[tuple] filter_priority: bool = False code_systems: list = None @@ -71,7 +137,7 @@ class CodingConfig(BaseConfig): @dataclass(kw_only=True) -class ExtensionConfig(BaseConfig): +class ExtensionConfig(BaseFHIRResourceConfig): """convenience class for holding parameters for generating extension tables. :keyword source_table: the table to extract extensions from @@ -132,7 +198,7 @@ def _check_data_in_fields( def denormalize_complex_objects( database: databases.DatabaseBackend, - code_sources: list[BaseConfig], + code_sources: list[BaseFHIRResourceConfig], ): queries = [] code_sources = _check_data_in_fields(database, code_sources) diff --git a/docs/creating-sql-with-python.md b/docs/creating-sql-with-python.md index de2c80d3..14bbc0e3 100644 --- a/docs/creating-sql-with-python.md +++ b/docs/creating-sql-with-python.md @@ -87,6 +87,10 @@ your SQL has been validated. If you're just working on counts, you don't need to worry about this detail, but otherwise, the following section talks about our templating mechanism. +If you need to incrementally check steps in the database, or need other state logging +tools, you can use functions in `cumulus_library/log_utils.py` to write to an +autogenerated `{study}__lib_transactions` table to track state. + ### Working with template SQL If you are only worried about building counts tables, skip this section - diff --git a/tests/test_base_templates.py b/tests/test_base_templates.py index 7e77995d..884d5e73 100644 --- a/tests/test_base_templates.py +++ b/tests/test_base_templates.py @@ -496,23 +496,25 @@ def test_extension_denormalize_creation(): def test_insert_into_query_creation(): - expected = """INSERT INTO test_table + expected = """INSERT INTO test.test_table ("a","b") VALUES ('foo','foo'), ('bar','bar');""" query = base_templates.get_insert_into_query( + schema="test", table_name="test_table", table_cols=["a", "b"], dataset=[["foo", "foo"], ["bar", "bar"]], ) assert query == expected - expected = """INSERT INTO test_table + expected = """INSERT INTO test.test_table ("a","b") VALUES ('foo',VARCHAR 'foo'), ('bar',VARCHAR 'bar');""" query = base_templates.get_insert_into_query( + schema="test", table_name="test_table", table_cols=["a", "b"], dataset=[["foo", "foo"], ["bar", "bar"]], diff --git a/tests/test_cli.py b/tests/test_cli.py index f8b336ac..14866f2e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -221,8 +221,6 @@ def test_generate_md(mock_path, tmp_path): ) cli.main(cli_args=args) test_file = f"{tmp_path}/study_python_valid/study_python_valid_generated.md" - with open(test_file) as f: - print(f.read()) ref_file = ( pathlib.Path(__file__).resolve().parent / "test_data/study_python_valid_generated.md" diff --git a/tests/test_data/study_python_valid_generated.md b/tests/test_data/study_python_valid_generated.md index ee8857bc..2d197279 100644 --- a/tests/test_data/study_python_valid_generated.md +++ b/tests/test_data/study_python_valid_generated.md @@ -8,6 +8,7 @@ |library_version|VARCHAR | | |status |VARCHAR | | |event_time |TIMESTAMP| | +|message |VARCHAR | | ### study_python_valid__table diff --git a/tests/test_logging.py b/tests/test_logging.py new file mode 100644 index 00000000..168032dc --- /dev/null +++ b/tests/test_logging.py @@ -0,0 +1,151 @@ +from contextlib import nullcontext as does_not_raise +from datetime import datetime + +import pytest +from freezegun import freeze_time + +from cumulus_library import __version__, enums, errors, log_utils, study_parser +from cumulus_library.template_sql import base_templates, sql_utils + + +@freeze_time("2024-01-01") +@pytest.mark.parametrize( + "schema,study,status,message,expects,raises", + [ + ( + "main", + "study_valid", + enums.LogStatuses.STARTED, + None, + ("study_valid", __version__, "started", datetime(2024, 1, 1, 0, 0), None), + does_not_raise(), + ), + ( + "main", + "study_valid", + "info", + "status", + ("study_valid", __version__, "info", datetime(2024, 1, 1, 0, 0), "status"), + does_not_raise(), + ), + ( + "main", + "study_dedicated_schema", + "error", + None, + ( + "study_dedicated_schema", + __version__, + "error", + datetime(2024, 1, 1, 0, 0), + None, + ), + does_not_raise(), + ), + ( + "main", + "study_valid", + "invalid_type", + None, + None, + pytest.raises(errors.CumulusLibraryError), + ), + ], +) +def test_transactions(mock_db, schema, study, status, message, expects, raises): + with raises: + cursor = mock_db.cursor() + table = sql_utils.TransactionsTable() + manifest = study_parser.StudyManifestParser(f"./tests/test_data/{study}/") + if manifest.get_dedicated_schema(): + schema = manifest.get_dedicated_schema() + table_name = table.name + cursor.execute(f"create schema {schema}") + else: + table_name = f"{manifest.get_study_prefix()}__{table.name}" + query = base_templates.get_ctas_empty_query( + schema_name=schema, + table_name=table_name, + table_cols=table.columns, + table_cols_types=table.column_types, + ) + cursor.execute(query) + log_utils.log_transaction( + cursor=cursor, + schema=schema, + manifest=manifest, + status=status, + message=message, + ) + log = cursor.execute(f"select * from {schema}.{table_name}").fetchone() + assert log == expects + + +@freeze_time("2024-01-01") +@pytest.mark.parametrize( + "schema,study,table_type,table_name,view_type,expects,raises", + [ + ( + "main", + "study_valid", + "psm", + "psm123", + "psmview", + ( + "study_valid", + __version__, + "psm", + "study_valid__lib_statistics", + "psmview", + datetime(2024, 1, 1, 0, 0), + ), + does_not_raise(), + ), + ( + "main", + "study_dedicated_schema", + "psm", + "psm123", + "psmview", + ( + "study_dedicated_schema", + __version__, + "psm", + "lib_statistics", + "psmview", + datetime(2024, 1, 1, 0, 0), + ), + does_not_raise(), + ), + ], +) +def test_statistics( + mock_db, schema, study, table_type, table_name, view_type, raises, expects +): + with raises: + cursor = mock_db.cursor() + table = sql_utils.StatisticsTable() + manifest = study_parser.StudyManifestParser(f"./tests/test_data/{study}/") + if manifest.get_dedicated_schema(): + schema = manifest.get_dedicated_schema() + table_name = table.name + cursor.execute(f"create schema {schema}") + else: + table_name = f"{manifest.get_study_prefix()}__{table.name}" + query = base_templates.get_ctas_empty_query( + schema_name=schema, + table_name=table_name, + table_cols=table.columns, + table_cols_types=table.column_types, + ) + cursor.execute(query) + log_utils.log_statistics( + cursor=cursor, + schema=schema, + manifest=manifest, + table_type=table_type, + table_name=table_name, + view_name=view_type, + ) + log = cursor.execute(f"select * from {schema}.{table_name}").fetchone() + assert log == expects