From 501b9ded3f8b578d34b00392687cd8c447c80045 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Mon, 1 Jul 2024 11:53:35 -0400 Subject: [PATCH 1/5] actions now driven via config+manifest --- cumulus_library/__init__.py | 2 +- cumulus_library/actions/builder.py | 217 +++++++---------- cumulus_library/actions/cleaner.py | 45 ++-- cumulus_library/actions/exporter.py | 63 ++--- cumulus_library/actions/file_generator.py | 54 ++--- cumulus_library/actions/importer.py | 37 ++- cumulus_library/base_table_builder.py | 56 +++-- cumulus_library/base_utils.py | 28 ++- cumulus_library/cli.py | 194 ++++++---------- cumulus_library/databases.py | 6 +- cumulus_library/log_utils.py | 29 +-- cumulus_library/protected_table_builder.py | 31 ++- cumulus_library/statistics/counts.py | 20 +- cumulus_library/statistics/psm.py | 13 +- .../studies/core/builder_prereq_tables.py | 2 +- cumulus_library/studies/core/count_core.py | 2 +- .../studies/vocab/vocab_icd_builder.py | 7 +- .../{study_parser.py => study_manifest.py} | 4 +- tests/conftest.py | 37 ++- tests/test_actions.py | 218 +++++++++--------- tests/test_cli.py | 11 +- .../study_dedicated_schema/module1.py | 2 +- .../study_dedicated_schema/module2.py | 2 +- .../study_python_counts_valid/module1.py | 2 +- .../study_python_counts_valid/module2.py | 2 +- .../study_python_local_template/module1.py | 2 +- .../study_python_no_subclass/module1.py | 2 +- .../study_python_no_subclass/module2.py | 2 +- tests/test_data/study_python_valid/module1.py | 2 +- tests/test_data/study_python_valid/module2.py | 2 +- tests/test_logging.py | 21 +- tests/test_psm.py | 20 +- tests/test_study_parser.py | 8 +- tests/testbed_utils.py | 9 +- 34 files changed, 539 insertions(+), 613 deletions(-) rename cumulus_library/{study_parser.py => study_manifest.py} (98%) diff --git a/cumulus_library/__init__.py b/cumulus_library/__init__.py index 5af118b4..90816ccd 100644 --- a/cumulus_library/__init__.py +++ b/cumulus_library/__init__.py @@ -1,3 +1,3 @@ """Package metadata""" -__version__ = "2.3.0" +__version__ = "3.0.0" diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index 2758eae7..7e9835b9 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -17,7 +17,7 @@ errors, log_utils, protected_table_builder, - study_parser, + study_manifest, ) from cumulus_library.statistics import psm @@ -33,27 +33,19 @@ def _temporary_sys_path(add: pathlib.Path) -> None: def _load_and_execute_builder( - manifest: study_parser.StudyManifestParser, - filename: str, - cursor: databases.DatabaseCursor, - schema: str, - *, config: base_utils.StudyConfig, - verbose: bool = False, - drop_table: bool = False, + manifest: study_manifest.StudyManifest, + *, + filename: str, db_parser: databases.DatabaseParser = None, write_reference_sql: bool = False, doc_str: str | None = None, ) -> None: """Loads a table builder from a file. 
- :param manifest: a StudyManifestParser object - :param filename: filename of a module implementing a TableBuilder - :param cursor: a database cursor for query execution - :param schema: the database schema to write to - :keyword config: a StudyConfig object - :keyword verbose: if true, will replace progress bars with sql query output - :keyword drop_table: if true, will drop a table if it already exists + :param config: a StudyConfig object + :param manifest: a StudyManifest object + :keyword filename: filename of a module implementing a TableBuilder :keyword db_parser: an object implementing DatabaseParser for the target database :keyword write_reference_sql: if true, writes sql to disk inside a study's directory :keyword doc_string: A string to insert between queries written to disk @@ -102,7 +94,9 @@ def _load_and_execute_builder( table_builder_class = table_builder_subclasses[0] table_builder = table_builder_class() if write_reference_sql: - table_builder.prepare_queries(cursor, schema, parser=db_parser, config=config) + table_builder.prepare_queries( + config=config, manifest=manifest, parser=db_parser + ) table_builder.comment_queries(doc_str=doc_str) new_filename = pathlib.Path(f"{filename}").stem + ".sql" table_builder.write_queries( @@ -110,13 +104,9 @@ def _load_and_execute_builder( ) else: table_builder.execute_queries( - cursor, - schema, - verbose=verbose, - drop_table=drop_table, - parser=db_parser, config=config, manifest=manifest, + parser=db_parser, ) # After running the executor code, we'll remove @@ -127,70 +117,45 @@ def _load_and_execute_builder( def run_protected_table_builder( - manifest: study_parser.StudyManifestParser, - cursor: databases.DatabaseCursor, - schema: str, - *, config: base_utils.StudyConfig, - verbose: bool = False, + manifest: study_manifest.StudyManifest, ) -> None: """Creates protected tables for persisting selected data across runs - :param manifest: a StudyManifestParser object - :param cursor: A DatabaseCursor object - :param schema: The name of the schema to write tables to - :keyword config: a StudyConfig object - :keyword verbose: if true, will replace progress bars with sql query output + :param config: a StudyConfig object + :param manifest: a StudyManifest object """ ptb = protected_table_builder.ProtectedTableBuilder() ptb.execute_queries( - cursor, - schema, - verbose, - study_name=manifest._study_config.get("study_prefix"), - study_stats=manifest._study_config.get("statistics_config"), config=config, manifest=manifest, ) def run_table_builder( - manifest: study_parser.StudyManifestParser, - cursor: databases.DatabaseCursor, - schema: str, - *, config: base_utils.StudyConfig, - verbose: bool = False, + manifest: study_manifest.StudyManifest, + *, db_parser: databases.DatabaseParser = None, ) -> None: """Loads modules from a manifest and executes code via BaseTableBuilder - :param manifest: a StudyManifestParser object - :param cursor: A DatabaseCursor object - :param schema: The name of the schema to write tables to - :keyword config: a StudyConfig object - :keyword verbose: if true, will replace progress bars with sql query output + :param config: a StudyConfig object + :param manifest: a StudyManifest object :keyword db_parser: an object implementing DatabaseParser for the target database """ for file in manifest.get_table_builder_file_list(): _load_and_execute_builder( - manifest, - file, - cursor, - schema, - verbose=verbose, - db_parser=db_parser, config=config, + manifest=manifest, + filename=file, + db_parser=db_parser, ) def 
run_counts_builders(
-    manifest: study_parser.StudyManifestParser,
-    cursor: databases.DatabaseCursor,
-    schema: str,
-    *,
     config: base_utils.StudyConfig,
-    verbose: bool = False,
+    manifest: study_manifest.StudyManifest,
 ) -> None:
     """Loads counts modules from a manifest and executes code via BaseTableBuilder
 
@@ -199,41 +164,38 @@ def run_counts_builders(
     given dataset, where other statistical methods may use sampling techniques
     or adjustable input parameters that may need to be preserved for later review.
 
-    :param manifest: a StudyManifestParser object
-    :param cursor: A DatabaseCursor object
-    :param schema: The name of the schema to write tables to
-    :keyword config: a StudyConfig object
-    :keyword verbose: if true, will replace progress bars with sql query output
+    :param config: a StudyConfig object
+    :param manifest: a StudyManifest object
    """
    for file in manifest.get_counts_builder_file_list():
        _load_and_execute_builder(
-            manifest,
-            file,
-            cursor,
-            schema,
-            verbose=verbose,
            config=config,
+            manifest=manifest,
+            filename=file,
        )


def run_statistics_builders(
-    manifest: study_parser.StudyManifestParser,
-    cursor: databases.DatabaseCursor,
-    schema: str,
-    *,
    config: base_utils.StudyConfig,
-    verbose: bool = False,
+    manifest: study_manifest.StudyManifest,
 ) -> None:
     """Loads statistics modules from toml definitions and executes
 
-    :param manifest: a StudyManifestParser object
-    :param cursor: A DatabaseCursor object
-    :param schema: The name of the schema to write tables to
-    :keyword config: a StudyConfig object
-    :keyword verbose: if true, will replace progress bars with sql query output
+    :param config: a StudyConfig object
+    :param manifest: a StudyManifest object
    """
-    if not config.stats_build:
+    if len(manifest.get_statistics_file_list()) == 0:
        return
+    existing_stats = []
+    if not config.stats_build:
+        existing_stats = (
+            config.db.cursor()
+            .execute(
+                "SELECT view_name FROM "
+                f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}"
+            )
+            .fetchall()
+        )
    for file in manifest.get_statistics_file_list():
        # This open is a bit redundant with the open inside of the PSM builder,
        # but we're letting it slide so that builders function similarly
@@ -241,30 +203,31 @@ def run_statistics_builders(
         safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
         toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
         with open(toml_path, encoding="UTF-8") as file:
-            config = toml.load(file)
-            config_type = config["config_type"]
-            target_table = config["target_table"]
+            stats_config = toml.load(file)
+            config_type = stats_config["config_type"]
+            target_table = stats_config["target_table"]
+
+        if (target_table,) in existing_stats and not config.stats_build:
+            continue
 
         if config_type == "psm":
             builder = psm.PsmBuilder(
-                toml_path,
-                manifest.data_path / f"{manifest.get_study_prefix()}/psm",
-                config=config,
+                toml_config_path=toml_path,
+                config=stats_config,
+                data_path=manifest.data_path / f"{manifest.get_study_prefix()}/psm",
             )
         else:
             raise errors.StudyManifestParsingError(
                 f"{toml_path} references an invalid statistics type {config_type}."
) builder.execute_queries( - cursor, - schema, - verbose, - table_suffix=safe_timestamp, config=config, manifest=manifest, + table_suffix=safe_timestamp, ) log_utils.log_statistics( - cursor=cursor, - schema=schema, + config=config, manifest=manifest, table_type=config_type, table_name=f"{target_table}_{safe_timestamp}", @@ -273,54 +236,40 @@ def run_statistics_builders( def run_matching_table_builder( - manifest: study_parser.StudyManifestParser, - cursor: databases.DatabaseCursor, - schema: str, - builder: str, - *, config: base_utils.StudyConfig, - verbose: bool = False, + manifest: study_manifest.StudyManifest, + *, + builder: str, db_parser: databases.DatabaseParser = None, ): """targets all table builders matching a target string for running - :param manifest: a StudyManifestParser object - :param cursor: A DatabaseCursor object - :param schema: The name of the schema to write tables to - :param builder: filename of a module implementing a TableBuilder - :keyword config: a StudyConfig object - :keyword verbose: if true, will replace progress bars with sql query output + :param config: a StudyConfig object + :param manifest: a StudyManifest object + :keyword builder: filename of a module implementing a TableBuilder :keyword db_parser: an object implementing DatabaseParser for the target database""" all_generators = manifest.get_all_generators() for file in all_generators: if builder and file.find(builder) == -1: continue _load_and_execute_builder( - manifest, - file, - cursor, - schema, - verbose=verbose, - drop_table=True, - db_parser=db_parser, config=config, + manifest=manifest, + filename=file, + db_parser=db_parser, ) def build_study( - manifest: study_parser.StudyManifestParser, - cursor: databases.DatabaseCursor, - *, config: base_utils.StudyConfig, - verbose: bool = False, + manifest: study_manifest.StudyManifest, + *, continue_from: str | None = None, ) -> list: """Creates tables in the schema by iterating through the sql_config.file_names - :param manifest: a StudyManifestParser object - :param cursor: A DatabaseCursor object - :keyword config: a StudyConfig object - :keyword verbose: if true, will replace progress bars with sql query output + :param config: a StudyConfig object + :param manifest: a StudyManifest object :keyword continue_from: Name of a sql file to resume table creation from :returns: loaded queries (for unit testing only) """ @@ -338,21 +287,23 @@ def build_study( f"`{manifest.get_study_prefix()}__", "`", ) + # We'll explicitly create a cursor since recreating cursors for each + # table in a study is slightly slower for some databases + cursor = config.db.cursor() # We want to only show a progress bar if we are :not: printing SQL lines - with base_utils.get_progress_bar(disable=verbose) as progress: + with base_utils.get_progress_bar(disable=config.verbose) as progress: task = progress.add_task( f"Creating {manifest.get_study_prefix()} study in db...", total=len(queries), - visible=not verbose, + visible=not config.verbose, ) _execute_build_queries( - manifest, - cursor, - verbose, - queries, - progress, - task, - config, + config=config, + manifest=manifest, + cursor=cursor, + queries=queries, + progress=progress, + task=task, ) return queries @@ -370,23 +321,21 @@ def _query_error(query_and_filename: list, exit_message: str) -> None: def _execute_build_queries( - manifest: study_parser.StudyManifestParser, + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + *, cursor: databases.DatabaseCursor, - verbose: bool, queries: list, 
progress: Progress, task: TaskID, - config: base_utils.StudyConfig, ) -> None: """Handler for executing create table queries and displaying console output. - :param manifest: a StudyManifestParser object + :param manifest: a StudyManifest object :param cursor: A DatabaseCursor object - :param verbose: toggle from progress bar to query output :param queries: a list of queries read from files in sql_config.file_names :param progress: a rich progress bar renderer :param task: a TaskID for a given progress bar - :param config: a StudyConfig object """ for query in queries: create_line = query[0].split("\n")[0] @@ -430,7 +379,9 @@ def _execute_build_queries( "start with a string like `study_prefix__`.", ) try: - with base_utils.query_console_output(verbose, query[0], progress, task): + with base_utils.query_console_output( + config.verbose, query[0], progress, task + ): cursor.execute(query[0]) except Exception as e: # pylint: disable=broad-exception-caught _query_error( diff --git a/cumulus_library/actions/cleaner.py b/cumulus_library/actions/cleaner.py index f54a93a7..7a823db1 100644 --- a/cumulus_library/actions/cleaner.py +++ b/cumulus_library/actions/cleaner.py @@ -2,7 +2,7 @@ from rich.progress import Progress, TaskID -from cumulus_library import base_utils, databases, enums, errors, study_parser +from cumulus_library import base_utils, databases, enums, errors, study_manifest from cumulus_library.template_sql import base_templates @@ -76,41 +76,35 @@ def get_unprotected_stats_view_table( def clean_study( - manifest_parser: study_parser.StudyManifestParser | None, - cursor: databases.DatabaseCursor, - schema_name: str, - stats_clean: bool = False, - verbose: bool = False, + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest | None, prefix: str | None = None, ) -> list: """Removes tables beginning with the study prefix from the database schema - :param manifest_parser: a StudyManifestParser for the study - :param cursor: A DatabaseCursor object - :param schema_name: The name of the schema containing the study tables - :keyword verbose: toggle from progress bar to query output, optional - :keyword stats_clean: will delete stats tables created from random sampling + :param config: a StudyConfig object + :param manifest: a StudyManifest object :keyword prefix: override manifest-based prefix discovery with the provided prefix :returns: list of dropped tables (for unit testing only) """ - if not schema_name: + if not config.schema: raise errors.CumulusLibraryError("No database provided") - if not prefix and not manifest_parser: + if not prefix and not manifest: raise errors.CumulusLibraryError( "Either a manifest parser or a filter prefix must be provided" ) - if manifest_parser and manifest_parser.get_dedicated_schema(): + if manifest and manifest.get_dedicated_schema(): drop_prefix = "" display_prefix = "" elif not prefix: - drop_prefix = f"{manifest_parser.get_study_prefix()}__" - display_prefix = manifest_parser.get_study_prefix() + drop_prefix = f"{manifest.get_study_prefix()}__" + display_prefix = manifest.get_study_prefix() else: drop_prefix = prefix display_prefix = drop_prefix - if stats_clean: + if config.stats_clean: confirm = input( "This will remove all historical stats tables in the " f"{display_prefix} study - are you sure? 
(y/N)" @@ -118,8 +112,9 @@ def clean_study( if confirm is None or confirm.lower() not in ("y", "yes"): sys.exit("Table cleaning aborted") - view_sql = base_templates.get_show_views(schema_name, drop_prefix) - table_sql = base_templates.get_show_tables(schema_name, drop_prefix) + cursor = config.db.cursor() + view_sql = base_templates.get_show_views(config.schema, drop_prefix) + table_sql = base_templates.get_show_tables(config.schema, drop_prefix) for query_and_type in [[view_sql, "VIEW"], [table_sql, "TABLE"]]: view_table_list = get_unprotected_stats_view_table( cursor, @@ -127,7 +122,7 @@ def clean_study( query_and_type[1], drop_prefix, display_prefix, - stats_clean, + config.stats_clean, ) if not view_table_list: return view_table_list @@ -152,7 +147,7 @@ def clean_study( confirm = input("Remove these tables? (y/N)") if confirm.lower() not in ("y", "yes"): sys.exit("Table cleaning aborted") - if dedicated := manifest_parser.get_dedicated_schema(): + if dedicated := manifest.get_dedicated_schema(): view_table_list = [ ( f"`{dedicated}`.`{x[0]}`", @@ -161,22 +156,22 @@ def clean_study( for x in view_table_list ] # We want to only show a progress bar if we are :not: printing SQL lines - with base_utils.get_progress_bar(disable=verbose) as progress: + with base_utils.get_progress_bar(disable=config.verbose) as progress: task = progress.add_task( f"Removing {display_prefix} study artifacts...", total=len(view_table_list), - visible=not verbose, + visible=not config.verbose, ) _execute_drop_queries( cursor, - verbose, + config.verbose, view_table_list, progress, task, ) # if we're doing a stats clean, we'll also remove the table containing the # list of protected tables - if stats_clean: + if config.stats_clean: drop_query = base_templates.get_drop_view_table( f"{drop_prefix}{enums.ProtectedTables.STATISTICS.value}", "TABLE" ) diff --git a/cumulus_library/actions/exporter.py b/cumulus_library/actions/exporter.py index a7a68abd..27d43b4b 100644 --- a/cumulus_library/actions/exporter.py +++ b/cumulus_library/actions/exporter.py @@ -4,21 +4,19 @@ from pyarrow import csv, parquet from rich.progress import track -from cumulus_library import base_utils, databases, study_parser +from cumulus_library import base_utils, study_manifest from cumulus_library.template_sql import base_templates # Database exporting functions def reset_counts_exports( - manifest_parser: study_parser.StudyManifestParser, + manifest: study_manifest.StudyManifest, ) -> None: """ Removes exports associated with this study from the ../data_export directory. 
""" - path = pathlib.Path( - f"{manifest_parser.data_path}/{manifest_parser.get_study_prefix()}" - ) + path = pathlib.Path(f"{manifest.data_path}/{manifest.get_study_prefix()}") if path.exists(): # we're just going to remove the count exports - stats exports in # subdirectories are left alone by this call @@ -26,60 +24,63 @@ def reset_counts_exports( file.unlink() -def _write_chunk(writer, chunk, schema): +def _write_chunk(writer, chunk, arrow_schema): writer.write( pyarrow.Table.from_pandas( chunk.sort_values( by=list(chunk.columns), ascending=False, na_position="first" ), preserve_index=False, - schema=schema, + schema=arrow_schema, ) ) def export_study( - manifest_parser: study_parser.StudyManifestParser, - db: databases.DatabaseBackend, - schema_name: str, + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + *, data_path: pathlib.Path, archive: bool, chunksize: int = 1000000, ) -> list: """Exports csvs/parquet extracts of tables listed in export_list - :param db: A database backend - :param schema_name: the schema/database to target - :param data_path: the path to the place on disk to save data - :param archive: If true, get all study data and zip with timestamp + :param config: a StudyConfig object + :param manifest: a StudyManifest object + :keyword data_path: the path to the place on disk to save data + :keyword archive: If true, get all study data and zip with timestamp + :keyword chunksize: number of rows to export in a single transaction :returns: a list of queries, (only for unit tests) """ - reset_counts_exports(manifest_parser) - if manifest_parser.get_dedicated_schema(): - prefix = f"{manifest_parser.get_dedicated_schema()}." + reset_counts_exports(manifest) + if manifest.get_dedicated_schema(): + prefix = f"{manifest.get_dedicated_schema()}." 
else: - prefix = f"{manifest_parser.get_study_prefix()}__" + prefix = f"{manifest.get_study_prefix()}__" if archive: - table_query = base_templates.get_show_tables(schema_name, prefix) - result = db.cursor().execute(table_query).fetchall() + table_query = base_templates.get_show_tables(config.schema, prefix) + result = config.db.cursor().execute(table_query).fetchall() table_list = [row[0] for row in result] else: - table_list = manifest_parser.get_export_table_list() + table_list = manifest.get_export_table_list() queries = [] - path = pathlib.Path(f"{data_path}/{manifest_parser.get_study_prefix()}/") + path = pathlib.Path(f"{data_path}/{manifest.get_study_prefix()}/") for table in track( table_list, - description=f"Exporting {manifest_parser.get_study_prefix()} data...", + description=f"Exporting {manifest.get_study_prefix()} data...", ): query = f"SELECT * FROM {table}" - query = base_utils.update_query_if_schema_specified(query, manifest_parser) - dataframe_chunks, db_schema = db.execute_as_pandas(query, chunksize=chunksize) + query = base_utils.update_query_if_schema_specified(query, manifest) + dataframe_chunks, db_schema = config.db.execute_as_pandas( + query, chunksize=chunksize + ) path.mkdir(parents=True, exist_ok=True) - schema = pyarrow.schema(db.col_pyarrow_types_from_sql(db_schema)) - with parquet.ParquetWriter(f"{path}/{table}.parquet", schema) as p_writer: + arrow_schema = pyarrow.schema(config.db.col_pyarrow_types_from_sql(db_schema)) + with parquet.ParquetWriter(f"{path}/{table}.parquet", arrow_schema) as p_writer: with csv.CSVWriter( f"{path}/{table}.csv", - schema, + arrow_schema, write_options=csv.WriteOptions( # Note that this quoting style is not exactly csv.QUOTE_MINIMAL # https://github.com/apache/arrow/issues/42032 @@ -87,9 +88,9 @@ def export_study( ), ) as c_writer: for chunk in dataframe_chunks: - _write_chunk(p_writer, chunk, schema) # pragma: no cover - _write_chunk(c_writer, chunk, schema) # pragma: no cover + _write_chunk(p_writer, chunk, arrow_schema) # pragma: no cover + _write_chunk(c_writer, chunk, arrow_schema) # pragma: no cover queries.append(query) if archive: - base_utils.zip_dir(path, data_path, manifest_parser.get_study_prefix()) + base_utils.zip_dir(path, data_path, manifest.get_study_prefix()) return queries diff --git a/cumulus_library/actions/file_generator.py b/cumulus_library/actions/file_generator.py index ea3aea81..be4a721a 100644 --- a/cumulus_library/actions/file_generator.py +++ b/cumulus_library/actions/file_generator.py @@ -3,30 +3,26 @@ import pandas import pytablewriter -from cumulus_library import base_utils, databases, study_parser +from cumulus_library import base_utils, databases, study_manifest from cumulus_library.actions import builder from cumulus_library.template_sql import base_templates def run_generate_sql( - manifest_parser: study_parser.StudyManifestParser, - cursor: databases.DatabaseCursor, - schema: str, - *, config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + *, table_builder: str | None = None, db_parser: databases.DatabaseParser = None, - verbose: bool = False, ) -> None: """Generates reference SQL from table_builders listed in the manifest - :param cursor: A DatabaseCursor object - :param schema: The name of the schema to write tables to - :param table_builder: a specific table_builder to target - :param verbose: toggle from progress bar to query output - :param parser: a DB parser + :param config: a StudyConfig object + :param manifest: a StudyManifest object + :keyword table_builder: a 
specific table_builder to target + :keyword db_parser: a DB parser """ - all_generators = manifest_parser.get_all_generators() + all_generators = manifest.get_all_generators() doc_str = ( "-- This sql was autogenerated as a reference example using the library\n" "-- CLI. Its format is tied to the specific database it was run against,\n" @@ -37,57 +33,51 @@ def run_generate_sql( if table_builder and file.find(table_builder) == -1: continue builder._load_and_execute_builder( - manifest_parser, - file, - cursor, - schema, + config=config, + manifest=manifest, + filename=file, db_parser=db_parser, write_reference_sql=True, doc_str=doc_str, - verbose=verbose, - config=config, ) def run_generate_markdown( - manifest_parser: study_parser.StudyManifestParser, - cursor: databases.DatabaseCursor, - schema: str, + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, + *, db_parser: databases.DatabaseParser = None, - verbose: bool = False, ) -> None: """Generates reference SQL from builders listed in the manifest - :param cursor: A DatabaseCursor object - :param schema: The name of the schema to write tables to - :param verbose: toggle from progress bar to query output + :param config: a StudyConfig object + :param manifest: a StudyManifest object """ + cursor = config.db.cursor() query = base_templates.get_show_tables( - schema_name=schema, prefix=f"{manifest_parser.get_study_prefix()}__" + schema_name=config.schema, prefix=f"{manifest.get_study_prefix()}__" ) - tables = [x[0] for x in cursor.execute(query).fetchall()] query = base_templates.get_column_datatype_query( - schema_name=schema, table_names=tables, include_table_names=True + schema_name=config.schema, table_names=tables, include_table_names=True ) study_df = pandas.DataFrame( cursor.execute(query).fetchall(), columns=["Column", "Type", "Table"] ) with open( - manifest_parser._study_path - / f"{manifest_parser.get_study_prefix()}_generated.md", + manifest._study_path / f"{manifest.get_study_prefix()}_generated.md", "w", ) as f: table_list = sorted(study_df["Table"].unique()) count_tables = [t for t in table_list if "__count_" in t] base_tables = [t for t in table_list if "__count_" not in t] if len(count_tables) > 0: - f.write(f"## {manifest_parser.get_study_prefix()} count tables\n\n") + f.write(f"## {manifest.get_study_prefix()} count tables\n\n") for table in count_tables: _write_md_table(table, study_df, f) if len(base_tables) > 0: - f.write(f"## {manifest_parser.get_study_prefix()} base tables\n\n") + f.write(f"## {manifest.get_study_prefix()} base tables\n\n") for table in base_tables: _write_md_table(table, study_df, f) diff --git a/cumulus_library/actions/importer.py b/cumulus_library/actions/importer.py index 5dc8003c..d011bd9a 100644 --- a/cumulus_library/actions/importer.py +++ b/cumulus_library/actions/importer.py @@ -9,7 +9,7 @@ from cumulus_library.template_sql import base_templates -def create_table_from_parquet(archive, file, study_name, db, schema): +def _create_table_from_parquet(archive, file, study_name, config): try: parquet_path = pathlib.Path( archive.extract(file), path=tempfile.TemporaryFile() @@ -18,8 +18,8 @@ def create_table_from_parquet(archive, file, study_name, db, schema): # which is messy - this could be optionally be replaced by pyarrow if it # becomes problematic. 
         table_types = pandas.read_parquet(parquet_path).dtypes
-        remote_types = db.col_parquet_types_from_pandas(table_types.values)
-        s3_path = db.upload_file(
+        remote_types = config.db.col_parquet_types_from_pandas(table_types.values)
+        s3_path = config.db.upload_file(
             file=parquet_path,
             study=study_name,
             topic=parquet_path.stem,
@@ -27,22 +27,25 @@ def create_table_from_parquet(archive, file, study_name, db, schema):
             remote_filename=parquet_path.name,
         )
         query = base_templates.get_ctas_from_parquet_query(
-            schema_name=schema,
+            schema_name=config.schema,
             table_name=parquet_path.stem.replace(".", "_"),
             local_location=parquet_path.parent,
             remote_location=s3_path,
             table_cols=list(table_types.index),
             remote_table_cols_types=remote_types,
         )
-        db.cursor().execute(query)
+        config.db.cursor().execute(query)
     finally:
         parquet_path.unlink()
 
 
-def import_archive(
-    config: base_utils.StudyConfig, archive_path: pathlib.Path, args: dict
-):
-    """Creates a study in the database from a previous export"""
+def import_archive(config: base_utils.StudyConfig, *, archive_path: pathlib.Path):
+    """Creates a study in the database from a previous export
+
+    :param config: a StudyConfig object
+    :keyword archive_path: the location of the archive to import data from
+    """
 
     # Ensure we've got something that looks like a valid database export
     if not archive_path.exists():
@@ -71,21 +74,13 @@ def import_archive(
     )
 
     # Clean and rebuild from the provided archive
-    cleaner.clean_study(
-        manifest_parser=None,
-        cursor=config.db.cursor(),
-        schema_name=args["schema_name"],
-        verbose=args["verbose"],
-        prefix=study_name,
-    )
-    with base_utils.get_progress_bar(disable=args["verbose"]) as progress:
+    cleaner.clean_study(config=config, manifest=None, prefix=study_name)
+    with base_utils.get_progress_bar(disable=config.verbose) as progress:
         task = progress.add_task(
             f"Recreating {study_name} from archive...",
             total=len(files),
-            visible=not args["verbose"],
+            visible=not config.verbose,
         )
         for file in files:
-            create_table_from_parquet(
-                archive, file, study_name, config.db, args["schema_name"]
-            )
+            _create_table_from_parquet(archive, file, study_name, config)
             progress.advance(task)
diff --git a/cumulus_library/base_table_builder.py b/cumulus_library/base_table_builder.py
index 0a6bbae9..92f9817c 100644
--- a/cumulus_library/base_table_builder.py
+++ b/cumulus_library/base_table_builder.py
@@ -1,16 +1,15 @@
 """abstract base for python-based study executors"""
 
+import abc
 import pathlib
 import re
 import sys
-from abc import ABC, abstractmethod
-from typing import final
+import typing
 
-from cumulus_library import base_utils, study_parser
-from cumulus_library.databases import DatabaseCursor
+from cumulus_library import base_utils, study_manifest
 
 
-class BaseTableBuilder(ABC):
+class BaseTableBuilder(abc.ABC):
     """Generic base class for python table builders.
 
     To use a table builder, extend this class exactly once in a new module.
@@ -22,38 +21,38 @@ class BaseTableBuilder(ABC):
     def __init__(self):
         self.queries = []
 
-    @abstractmethod
-    def prepare_queries(self, cursor: object, schema: str, *args, **kwargs):
+    @abc.abstractmethod
+    def prepare_queries(
+        self,
+        config: base_utils.StudyConfig,
+        manifest: study_manifest.StudyManifest,
+        *args,
+        **kwargs,
+    ):
         """Main entrypoint for python table builders.
 
         When completed, prepare_queries should populate self.queries with sql
-        statements to execute. This array will the be read by run queries.
+        statements to execute. This array will then be read by execute_queries.
- :param cursor: A PEP-249 compatible cursor - :param schema: A schema name + :param config: A study configuration object """ raise NotImplementedError - @final + @typing.final def execute_queries( self, - cursor: DatabaseCursor, - schema: str, - verbose: bool, + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, *args, - drop_table: bool = False, - manifest: study_parser.StudyManifestParser = None, **kwargs, ): """Executes queries set up by a prepare_queries call - :param cursor: A PEP-249 compatible cursor - :param schema: A schema name - :param verbose: toggle for verbose output mode - :param drop_table: drops any tables found in prepared_queries results + :param config: A study configuration object """ - self.prepare_queries(cursor, schema, *args, manifest=manifest, **kwargs) - if drop_table: + self.prepare_queries(*args, config=config, manifest=manifest, **kwargs) + cursor = config.db.cursor() + if config.drop_table: table_names = [] for query in self.queries: # Get the first non-whitespace word after create table @@ -78,31 +77,28 @@ def execute_queries( table_names.append(table_name) for table_name in table_names: cursor.execute(f"DROP TABLE IF EXISTS {table_name}") - with base_utils.get_progress_bar(disable=verbose) as progress: + with base_utils.get_progress_bar(disable=config.verbose) as progress: task = progress.add_task( self.display_text, total=len(self.queries), - visible=not verbose, + visible=not config.verbose, ) for query in self.queries: try: query = base_utils.update_query_if_schema_specified(query, manifest) with base_utils.query_console_output( - verbose, query, progress, task + config.verbose, query, progress, task ): cursor.execute(query) except Exception as e: # pylint: disable=broad-exception-caught sys.exit(e) - self.post_execution(cursor, schema, verbose, drop_table, *args, **kwargs) + self.post_execution(config, *args, **kwargs) def post_execution( # noqa: B027 - this looks like, but is not, an abstract method self, - cursor: DatabaseCursor, - schema: str, - verbose: bool, + config: base_utils.StudyConfig, *args, - drop_table: bool = False, **kwargs, ): """Hook for any additional actions to run after execute_queries""" diff --git a/cumulus_library/base_utils.py b/cumulus_library/base_utils.py index 8451a36b..b4c9c808 100644 --- a/cumulus_library/base_utils.py +++ b/cumulus_library/base_utils.py @@ -10,7 +10,7 @@ from rich import progress -from cumulus_library import databases, study_parser +from cumulus_library import databases, study_manifest @dataclasses.dataclass @@ -22,24 +22,38 @@ class StudyConfig: doing something above that level, consider explicit arguments instead. This should be an interface aimed at a study author. 
-    :param db_backend: a databaseBackend object for a specific target database
-    :keyword db_type: the argument passed in from the CLI indicating the requested DB
-        (this is easier to use in things like jinja templates than db_backend, if they
-        need to run DB technology aware queries)
-    :keyword replace_existing: If the study downloads data from an external resource,
+    :param db: a databaseBackend object for a specific target database
+    :param schema: the database schema specified at the command line
+    :keyword drop_table: when creating tables, if one already exists in the db,
+        drop it
+    :keyword force_upload: If the study downloads data from an external resource,
         force it to skip any cached data when running
+    :keyword verbose: if True, print raw queries to the cli instead of progress bars
     :keyword stats_build: If the study runs a stats builder, force regeneration of
         any sampling or other stochastic techniques
-    :keyword umls: A UMLS API key
+    :keyword stats_clean: if True, remove tables from previous stats runs while cleaning
+    :keyword umls_key: An API key for the UMLS service, used for downloading vocabularies
+    :keyword options: a dictionary for any study-specific CLI arguments
     """
 
     db: databases.DatabaseBackend
+    schema: str
+    drop_table: bool = False
     force_upload: bool = False
+    verbose: bool = False
     stats_build: bool = False
+    stats_clean: bool = False
     umls_key: str | None = None
     options: dict | None = None
 
 
+def get_schema(config: StudyConfig, manifest: study_manifest.StudyManifest):
+    if dedicated := manifest.get_dedicated_schema():
+        config.db.create_schema(dedicated)
+        return dedicated
+    return config.schema
+
+
 def load_text(path: str) -> str:
     with open(path, encoding="UTF-8") as fp:
         return fp.read()
@@ -117,7 +131,7 @@ def zip_dir(read_path, write_path, archive_name):
 
 
 def update_query_if_schema_specified(
-    query: str, manifest: study_parser.StudyManifestParser
+    query: str, manifest: study_manifest.StudyManifest
 ):
     if manifest and manifest.get_dedicated_schema():
         query = query.replace(
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
index 58521079..1115c32f 100755
--- a/cumulus_library/cli.py
+++ b/cumulus_library/cli.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 """Utility for building/retrieving data views in AWS Athena"""
 
+import copy
 import json
 import os
 import pathlib
@@ -18,7 +19,7 @@
     enums,
     errors,
     log_utils,
-    study_parser,
+    study_manifest,
 )
 from cumulus_library.actions import (
     builder,
@@ -36,17 +37,18 @@ class StudyRunner:
     verbose = False
     schema_name = None
 
-    def __init__(self, db: databases.DatabaseBackend, data_path: str):
-        self.db = db
+    def __init__(self, config: base_utils.StudyConfig, data_path: str):
+        self.config = config
         self.data_path = data_path
-        self.cursor = db.cursor()
-        self.schema_name = db.schema_name
 
-    def get_schema(self, manifest: study_parser.StudyManifestParser):
-        if dedicated := manifest.get_dedicated_schema():
-            self.db.create_schema(dedicated)
-            return dedicated
-        return self.schema_name
+    def get_config(self, manifest: study_manifest.StudyManifest):
+        schema = base_utils.get_schema(self.config, manifest)
+        if schema == self.config.schema:
+            return self.config
+        else:
+            config = copy.copy(self.config)
+            config.schema = schema
+            return config
 
     ### Creating studies
 
@@ -55,7 +57,6 @@ def clean_study(
         self,
         targets: list[str],
         study_dict: dict,
         *,
-        stats_clean: bool,
         prefix: bool = False,
     ) -> None:
         """Removes study table/views from Athena.
@@ -66,7 +67,6 @@ def clean_study( :param target: The study prefix to use for IDing tables to remove :param study_dict: The dictionary of available study targets - :param stats_clean: If true, removes previous stats runs :keyword prefix: If True, does a search by string prefix in place of study name """ if targets is None: @@ -77,106 +77,63 @@ def clean_study( for target in targets: if prefix: - manifest = study_parser.StudyManifestParser() - schema = self.get_schema(manifest) - cleaner.clean_study( - manifest_parser=manifest, - cursor=self.cursor, - schema_name=schema, - verbose=self.verbose, - stats_clean=stats_clean, - prefix=target, - ) + manifest = study_manifest.StudyManifest() + cleaner.clean_study(config=self.get_config(manifest), manifest=manifest) else: - manifest = study_parser.StudyManifestParser(study_dict[target]) - schema = self.get_schema(manifest) - cleaner.clean_study( - manifest_parser=manifest, - cursor=self.cursor, - schema_name=schema, - verbose=self.verbose, - stats_clean=stats_clean, - ) + manifest = study_manifest.StudyManifest(study_dict[target]) + cleaner.clean_study(config=self.get_config(manifest), manifest=manifest) def clean_and_build_study( self, target: pathlib.Path, *, - config: base_utils.StudyConfig, continue_from: str | None = None, ) -> None: """Recreates study views/tables :param target: A path to the study directory - :param config: A StudyConfig object containing optional params :keyword continue_from: Restart a run from a specific sql file (for dev only) """ - manifest = study_parser.StudyManifestParser(target, self.data_path) - schema = self.get_schema(manifest) + manifest = study_manifest.StudyManifest(target, self.data_path) try: builder.run_protected_table_builder( - manifest, - self.cursor, - schema, - verbose=self.verbose, - config=config, + config=self.get_config(manifest), manifest=manifest ) if not continue_from: log_utils.log_transaction( - cursor=self.cursor, - schema=schema, + config=self.get_config(manifest), manifest=manifest, status=enums.LogStatuses.STARTED, ) - cleaned_tables = cleaner.clean_study( - manifest_parser=manifest, - cursor=self.cursor, - schema_name=schema, - verbose=self.verbose, - stats_clean=False, + cleaner.clean_study( + config=self.get_config(manifest), + manifest=manifest, ) - # If the study hasn't been created before, force stats table generation - if len(cleaned_tables) == 0: - config.stats_build = True builder.run_table_builder( - manifest, - self.cursor, - schema, - verbose=self.verbose, - db_parser=self.db.parser(), - config=config, + config=self.get_config(manifest), manifest=manifest ) + else: log_utils.log_transaction( - cursor=self.cursor, - schema=schema, + config=self.get_config(manifest), manifest=manifest, status=enums.LogStatuses.RESUMED, ) + builder.build_study( - manifest, - self.cursor, - verbose=self.verbose, + config=self.get_config(manifest), + manifest=manifest, continue_from=continue_from, - config=config, ) builder.run_counts_builders( - manifest, - self.cursor, - schema, - verbose=self.verbose, - config=config, + config=self.get_config(manifest), manifest=manifest ) builder.run_statistics_builders( - manifest, - self.cursor, - schema, - verbose=self.verbose, - config=config, + config=self.get_config(manifest), + manifest=manifest, ) log_utils.log_transaction( - cursor=self.cursor, - schema=schema, + config=self.get_config(manifest), manifest=manifest, status=enums.LogStatuses.FINISHED, ) @@ -187,8 +144,7 @@ def clean_and_build_study( raise e except Exception as e: 
             log_utils.log_transaction(
-                cursor=self.cursor,
-                schema=schema,
+                config=self.get_config(manifest),
                 manifest=manifest,
                 status=enums.LogStatuses.ERROR,
             )
@@ -198,24 +154,17 @@ def run_matching_table_builder(
         self,
         target: pathlib.Path,
         table_builder_name: str,
-        config: base_utils.StudyConfig,
     ) -> None:
         """Runs a single table builder
 
         :param target: A path to the study directory
         :param table_builder_name: a builder file referenced in the study's manifest
-        :param config: A StudyConfig object containing optional params
         """
-        manifest = study_parser.StudyManifestParser(target)
-        schema = self.get_schema(manifest)
+        manifest = study_manifest.StudyManifest(target)
         builder.run_matching_table_builder(
-            manifest,
-            self.cursor,
-            schema,
-            table_builder_name,
-            verbose=self.verbose,
-            db_parser=self.db.parser(),
-            config=config,
+            config=self.get_config(manifest),
+            manifest=manifest,
+            builder=table_builder_name,
         )
 
     ### Data exporters
 
@@ -225,35 +174,33 @@ def export_study(
         """Exports aggregates defined in a manifest
 
         :param target: A path to the study directory
+        :param data_path: The location to export data to
+        :param archive: If true, will export all tables, otherwise uses manifest list
         """
         if data_path is None:
             sys.exit("Missing destination - please provide a path argument.")
-        manifest = study_parser.StudyManifestParser(target, data_path)
-        exporter.export_study(manifest, self.db, self.schema_name, data_path, archive)
+        manifest = study_manifest.StudyManifest(target, data_path)
+        exporter.export_study(
+            config=self.get_config(manifest),
+            manifest=manifest,
+            data_path=data_path,
+            archive=archive,
+        )
 
     def generate_study_sql(
         self,
         target: pathlib.Path,
         *,
-        config: base_utils.StudyConfig,
         builder: str | None = None,
     ) -> None:
         """Materializes study sql from templates
 
         :param target: A path to the study directory
-        :param config: A StudyConfig object containing optional params
-        :param builder: Specify a single builder to generate sql from
+        :keyword builder: Specify a single builder to generate sql from
         """
-        manifest = study_parser.StudyManifestParser(target)
-        schema = self.get_schema(manifest)
+        manifest = study_manifest.StudyManifest(target)
         file_generator.run_generate_sql(
-            manifest_parser=manifest,
-            cursor=self.cursor,
-            schema=schema,
-            table_builder=builder,
-            verbose=self.verbose,
-            db_parser=self.db.parser(),
-            config=config,
+            config=self.get_config(manifest),
+            manifest=manifest,
+            table_builder=builder,
         )
 
     def generate_study_markdown(
@@ -264,14 +211,9 @@ def generate_study_markdown(
 
         :param target: A path to the study directory
         """
-        manifest = study_parser.StudyManifestParser(target)
-        schema = self.get_schema(manifest)
+        manifest = study_manifest.StudyManifest(target)
         file_generator.run_generate_markdown(
-            manifest_parser=manifest,
-            cursor=self.cursor,
-            schema=schema,
-            verbose=self.verbose,
-            db_parser=self.db.parser(),
+            config=self.get_config(manifest), manifest=manifest
         )
 
 
@@ -321,7 +263,7 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict[str, pathlib.Path]:
             manifest_paths.update(get_studies_by_manifest_path(child_path))
         elif child_path.name == "manifest.toml":
             try:
-                manifest = study_parser.StudyManifestParser(path)
+                manifest = study_manifest.StudyManifest(path)
                 manifest_paths[manifest.get_study_prefix()] = path
             except errors.StudyManifestParsingError as exc:
                 rich.print(f"[bold red] Ignoring study in '{path}': {exc}")
@@ -340,19 +282,31 @@ def run_cli(args: dict):
 
     # all other actions require connecting to the database
     else:
+        # if we're targeting a builder explicitly, we always 
want to + # run in drop mode + if builder := args.get("builder"): + drop_table = builder + else: + drop_table = args.get("drop_table") + if args.get("db_type") == "duckdb": + schema = "main" + else: + schema = args.get("schema_name") config = base_utils.StudyConfig( db=databases.create_db_backend(args), + schema=schema, + drop_table=drop_table, force_upload=args.get("replace_existing", False), + verbose=args.get("verbose"), stats_build=args.get("stats_build", False), + stats_clean=args.get("stats_clean", False), umls_key=args.get("umls_key"), options=args.get("options"), ) try: - runner = StudyRunner(config.db, data_path=args.get("data_path")) - if args.get("verbose"): - runner.verbose = True + runner = StudyRunner(config, data_path=args.get("data_path")) console.print("[italic] Connecting to database...") - runner.cursor.execute("SHOW DATABASES") + runner.config.db.cursor().execute("SHOW DATABASES") study_dict = get_study_dict(args.get("study_dir")) if "prefix" not in args.keys(): if args.get("target"): @@ -366,21 +320,19 @@ def run_cli(args: dict): ) if args["action"] == "clean": runner.clean_study( - args["target"], - study_dict, - stats_clean=args["stats_clean"], + targets=args["target"], + study_dict=study_dict, prefix=args["prefix"], ) elif args["action"] == "build": for target in args["target"]: if args["builder"]: runner.run_matching_table_builder( - study_dict[target], args["builder"], config=config + study_dict[target], args["builder"] ) else: runner.clean_and_build_study( study_dict[target], - config=config, continue_from=args["continue_from"], ) @@ -408,12 +360,12 @@ def run_cli(args: dict): elif args["action"] == "import": for archive in args["archive_path"]: archive = get_abs_path(archive) - importer.import_archive(config, archive, args) + importer.import_archive(config, archive_path=archive) elif args["action"] == "generate-sql": for target in args["target"]: runner.generate_study_sql( - study_dict[target], builder=args["builder"], config=config + study_dict[target], builder=args["builder"] ) elif args["action"] == "generate-md": diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index 9ca3a4c0..1cbd75c1 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -614,7 +614,11 @@ def operational_errors(self) -> tuple[Exception]: def create_schema(self, schema_name): """Creates a new schema object inside the database""" - self.connection.sql(f"CREATE SCHEMA {schema_name}") + schemas = self.connection.sql( + "SELECT schema_name FROM information_schema.schemata" + ).fetchall() + if (schema_name,) not in schemas: + self.connection.sql(f"CREATE SCHEMA {schema_name}") def close(self) -> None: self.connection.close() diff --git a/cumulus_library/log_utils.py b/cumulus_library/log_utils.py index 32e32032..b605a549 100644 --- a/cumulus_library/log_utils.py +++ b/cumulus_library/log_utils.py @@ -3,19 +3,17 @@ from cumulus_library import ( __version__, base_utils, - databases, enums, errors, - study_parser, + study_manifest, ) from cumulus_library.template_sql import base_templates, sql_utils def log_transaction( *, - cursor: databases.DatabaseCursor, - schema: str, - manifest: study_parser.StudyManifestParser, + config: base_utils.StudyConfig, + manifest: study_manifest.StudyManifest, status: enums.LogStatuses | str | None = enums.LogStatuses.INFO, message: str | None = None, ): @@ -29,8 +27,7 @@ def log_transaction( ) from e _log_table( table=sql_utils.TransactionsTable(), - cursor=cursor, - schema=schema, + config=config, manifest=manifest, 
         dataset=[
             [
@@ -46,17 +43,15 @@ def log_transaction(
 
 def log_statistics(
     *,
-    cursor: databases.DatabaseCursor,
-    schema: str,
-    manifest: study_parser.StudyManifestParser,
+    config: base_utils.StudyConfig,
+    manifest: study_manifest.StudyManifest,
     table_type: str,
     table_name: str,
     view_name: str,
 ):
     _log_table(
         table=sql_utils.StatisticsTable(),
-        cursor=cursor,
-        schema=schema,
+        config=config,
         manifest=manifest,
         dataset=[
             [
@@ -74,16 +69,16 @@ def log_statistics(
 def _log_table(
     *,
     table: sql_utils.BaseTable,
-    cursor: databases.DatabaseCursor,
-    schema: str,
-    manifest: study_parser.StudyManifestParser,
+    config: base_utils.StudyConfig,
+    manifest: study_manifest.StudyManifest,
     dataset: list[list],
 ):
     if manifest and manifest.get_dedicated_schema():
         db_schema = manifest.get_dedicated_schema()
         table_name = table.name
     else:
-        db_schema = schema
+        db_schema = config.schema
         table_name = f"{manifest.get_study_prefix()}__{table.name}"
     query = base_templates.get_insert_into_query(
         schema=db_schema,
@@ -92,4 +87,4 @@ def _log_table(
         dataset=dataset,
         type_casts=table.type_casts,
     )
-    cursor.execute(query)
+    config.db.cursor().execute(query)
diff --git a/cumulus_library/protected_table_builder.py b/cumulus_library/protected_table_builder.py
index 70d7e3d6..04f8b271 100644
--- a/cumulus_library/protected_table_builder.py
+++ b/cumulus_library/protected_table_builder.py
@@ -1,6 +1,11 @@
 """Builder for creating tables for tracking state/logging changes"""
 
-from cumulus_library import base_table_builder, databases, enums, study_parser
+from cumulus_library import (
+    base_table_builder,
+    base_utils,
+    enums,
+    study_manifest,
+)
 from cumulus_library.template_sql import base_templates
 
 TRANSACTIONS_COLS = ["study_name", "library_version", "status", "event_time", "message"]
@@ -32,22 +37,28 @@ class ProtectedTableBuilder(base_table_builder.BaseTableBuilder):
 
     def prepare_queries(
         self,
-        cursor: databases.DatabaseCursor,
-        schema: str,
-        study_name: str,
-        study_stats: dict,
+        config: base_utils.StudyConfig,
+        manifest: study_manifest.StudyManifest | None = None,
         *args,
-        manifest: study_parser.StudyManifestParser | None = None,
+        study_stats: dict | None = None,
         **kwargs,
     ):
+        if study_stats is None:
+            study_stats = {}
         if manifest and manifest.get_dedicated_schema():
             db_schema = manifest.get_dedicated_schema()
             transactions = enums.ProtectedTables.TRANSACTIONS.value
             statistics = enums.ProtectedTables.STATISTICS.value
         else:
-            db_schema = schema
-            transactions = f"{study_name}__{enums.ProtectedTables.TRANSACTIONS.value}"
-            statistics = f"{study_name}__{enums.ProtectedTables.STATISTICS.value}"
+            db_schema = config.schema
+            transactions = (
+                f"{manifest.get_study_prefix()}"
+                f"__{enums.ProtectedTables.TRANSACTIONS.value}"
+            )
+            statistics = (
+                f"{manifest.get_study_prefix()}"
+                f"__{enums.ProtectedTables.STATISTICS.value}"
+            )
         self.queries.append(
             base_templates.get_ctas_empty_query(
                 db_schema,
@@ -56,7 +67,7 @@ def prepare_queries(
                 TRANSACTION_COLS_TYPES,
             )
         )
-        if study_stats:
+        if manifest._study_config.get("statistics_config"):
             self.queries.append(
                 base_templates.get_ctas_empty_query(
                     db_schema,
diff --git a/cumulus_library/statistics/counts.py b/cumulus_library/statistics/counts.py
index 44050783..9d467d18 100644
--- a/cumulus_library/statistics/counts.py
+++ b/cumulus_library/statistics/counts.py
@@ -3,13 +3,11 @@
 import sys
 from pathlib import Path
 
-from cumulus_library.base_table_builder import BaseTableBuilder
-from cumulus_library.errors import CountsBuilderError
+from 
cumulus_library import base_table_builder, errors, study_manifest from cumulus_library.statistics.statistics_templates import counts_templates -from cumulus_library.study_parser import StudyManifestParser -class CountsBuilder(BaseTableBuilder): +class CountsBuilder(base_table_builder.BaseTableBuilder): """Extends BaseTableBuilder for counts-related use cases""" def __init__(self, study_prefix: str | None = None): @@ -19,10 +17,10 @@ def __init__(self, study_prefix: str | None = None): study_path = Path(sys.modules[self.__module__].__file__).parent try: - parser = StudyManifestParser(study_path) + parser = study_manifest.StudyManifest(study_path) self.study_prefix = parser.get_study_prefix() except Exception as e: - raise CountsBuilderError( + raise errors.CountsBuilderError( "CountsBuilder must be either initiated with a study prefix, " "or be in a directory with a valid manifest.toml" ) from e @@ -57,7 +55,9 @@ def get_where_clauses( elif isinstance(clause, list): return clause else: - raise CountsBuilderError(f"get_where_clauses invalid clause {clause}") + raise errors.CountsBuilderError( + f"get_where_clauses invalid clause {clause}" + ) def get_count_query( self, table_name: str, source_table: str, table_cols: list, **kwargs @@ -74,7 +74,7 @@ def get_count_query( statistics/statistics_templates/count_templates.CountableFhirResource) """ if not table_name or not source_table or not table_cols: - raise CountsBuilderError( + raise errors.CountsBuilderError( "count_query missing required arguments. " f"output table: {table_name}" ) for key in kwargs: @@ -84,7 +84,9 @@ def get_count_query( "fhir_resource", "filter_resource", ]: - raise CountsBuilderError(f"count_query received unexpected key: {key}") + raise errors.CountsBuilderError( + f"count_query received unexpected key: {key}" + ) return counts_templates.get_count_query( table_name, source_table, table_cols, **kwargs ) diff --git a/cumulus_library/statistics/psm.py b/cumulus_library/statistics/psm.py index 3b528e61..93939412 100644 --- a/cumulus_library/statistics/psm.py +++ b/cumulus_library/statistics/psm.py @@ -16,7 +16,7 @@ # these imports are mimicing PsmPy imports for re-implemented functions from psmpy.functions import cohenD -from cumulus_library import base_table_builder, databases +from cumulus_library import base_table_builder, base_utils, databases from cumulus_library.statistics.statistics_templates import psm_templates from cumulus_library.template_sql import base_templates @@ -347,19 +347,16 @@ def generate_psm_analysis( ) def prepare_queries( - self, cursor: object, schema: str, table_suffix: str, *args, **kwargs + self, config: base_utils.StudyConfig, *args, table_suffix: str, **kwargs ): - self._create_covariate_table(cursor, schema, table_suffix) + self._create_covariate_table(config.db.cursor(), config.schema, table_suffix) def post_execution( self, - cursor: object, - schema: str, + config: base_utils.StudyConfig, *args, - verbose: bool = False, - drop_table: bool = False, table_suffix: str | None = None, **kwargs, ): # super().execute_queries(cursor, schema, verbose, drop_table) - self.generate_psm_analysis(cursor, schema, table_suffix) + self.generate_psm_analysis(config.db.cursor(), config.schema, table_suffix) diff --git a/cumulus_library/studies/core/builder_prereq_tables.py b/cumulus_library/studies/core/builder_prereq_tables.py index 50798d76..e6df535c 100644 --- a/cumulus_library/studies/core/builder_prereq_tables.py +++ b/cumulus_library/studies/core/builder_prereq_tables.py @@ -12,7 +12,7 @@ class 
CorePrereqTableBuilder(base_table_builder.BaseTableBuilder): display_text = "Creating core prerequisite tables..." - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): dir_path = pathlib.Path(__file__).resolve().parents[0] prereq_sql = [ "version.sql", diff --git a/cumulus_library/studies/core/count_core.py b/cumulus_library/studies/core/count_core.py index f8b82812..7c7eaf93 100644 --- a/cumulus_library/studies/core/count_core.py +++ b/cumulus_library/studies/core/count_core.py @@ -111,7 +111,7 @@ def count_core_patient(self): cols = ["gender", "race_display", "ethnicity_display"] return self.count_patient(table_name, from_table, cols) - def prepare_queries(self, cursor=None, schema=None, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries = [ self.count_core_condition(duration="month"), self.count_core_documentreference(duration="month"), diff --git a/cumulus_library/studies/vocab/vocab_icd_builder.py b/cumulus_library/studies/vocab/vocab_icd_builder.py index b7102ca5..1367639d 100644 --- a/cumulus_library/studies/vocab/vocab_icd_builder.py +++ b/cumulus_library/studies/vocab/vocab_icd_builder.py @@ -10,7 +10,6 @@ class VocabIcdRunner(base_table_builder.BaseTableBuilder): display_text = "Creating ICD vocab..." - partition_size = 1200 @staticmethod def clean_row(row, filename): @@ -22,10 +21,8 @@ def clean_row(row, filename): def prepare_queries( self, - cursor: object, - schema: str, - *args, config: base_utils.StudyConfig, + *args, **kwargs, ): """Creates queries for populating ICD vocab @@ -58,7 +55,7 @@ def prepare_queries( # use the last value of remote location self.queries.append( base_templates.get_ctas_from_parquet_query( - schema_name=schema, + schema_name=config.schema, table_name=table_name, local_location=path / "icd", remote_location=remote_path, diff --git a/cumulus_library/study_parser.py b/cumulus_library/study_manifest.py similarity index 98% rename from cumulus_library/study_parser.py rename to cumulus_library/study_manifest.py index 403e48eb..b9879bb5 100644 --- a/cumulus_library/study_parser.py +++ b/cumulus_library/study_manifest.py @@ -7,7 +7,7 @@ from cumulus_library import errors -class StudyManifestParser: +class StudyManifest: """Handles interactions with study directories and manifest files""" def __init__( @@ -15,7 +15,7 @@ def __init__( study_path: pathlib.Path | None = None, data_path: pathlib.Path | None = None, ): - """Instantiates a StudyManifestParser. + """Instantiates a StudyManifest. 
:param study_path: A pathlib Path object, optional """ diff --git a/tests/conftest.py b/tests/conftest.py index 98d260a1..ef9f2a8e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -205,17 +205,34 @@ def mock_db(tmp_path): yield db +@pytest.fixture +def mock_db_config(mock_db): + """Provides a DuckDatabaseBackend for local testing inside a StudyConfig""" + config = base_utils.StudyConfig(db=mock_db, schema="main") + yield config + + @pytest.fixture def mock_db_core(tmp_path, mock_db): # pylint: disable=redefined-outer-name """Provides a DuckDatabaseBackend with the core study ran for local testing""" - builder = cli.StudyRunner(mock_db, data_path=f"{tmp_path}/data_path") + config = base_utils.StudyConfig( + db=mock_db, + schema="main", + ) + builder = cli.StudyRunner(config, data_path=f"{tmp_path}/data_path") builder.clean_and_build_study( Path(__file__).parent.parent / "cumulus_library/studies/core", - config=base_utils.StudyConfig(stats_build=True, db=mock_db), ) yield mock_db +@pytest.fixture +def mock_db_core_config(mock_db_core): + """Provides a DuckDatabaseBackend with core study inside a StudyConfig""" + config = base_utils.StudyConfig(db=mock_db_core, schema="main") + yield config + + @pytest.fixture def mock_db_stats(tmp_path): """Provides a DuckDatabaseBackend with a larger dataset for sampling stats""" @@ -223,13 +240,23 @@ def mock_db_stats(tmp_path): db = create_db_backend( { "db_type": "duckdb", - "schema_name": f"{tmp_path}cumulus.duckdb", + "schema_name": f"{tmp_path}/stats.db", "load_ndjson_dir": f"{tmp_path}/mock_data", } ) - builder = cli.StudyRunner(db, data_path=f"{tmp_path}/data_path") + config = base_utils.StudyConfig( + db=db, + schema="main", + ) + builder = cli.StudyRunner(config, data_path=f"{tmp_path}/data_path") builder.clean_and_build_study( Path(__file__).parent.parent / "cumulus_library/studies/core", - config=base_utils.StudyConfig(stats_build=True, db=db), ) yield db + + +@pytest.fixture +def mock_db_stats_config(mock_db_stats): + """Provides a DuckDatabaseBackend with core study inside a StudyConfig""" + config = base_utils.StudyConfig(db=mock_db_stats, schema="main") + yield config diff --git a/tests/test_actions.py b/tests/test_actions.py index 482dc0ef..5e309416 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -13,7 +13,7 @@ import responses from freezegun import freeze_time -from cumulus_library import base_utils, enums, errors, study_parser +from cumulus_library import base_utils, enums, errors, log_utils, study_manifest from cumulus_library.actions import ( builder, cleaner, @@ -25,21 +25,19 @@ @pytest.mark.parametrize( - "schema,verbose,prefix,confirm,stats,target,raises", + "verbose,prefix,confirm,stats,target,raises", [ - ("main", True, None, None, False, "study_valid__table", does_not_raise()), - ("main", False, None, None, False, "study_valid__table", does_not_raise()), - ("main", None, None, None, False, "study_valid__table", does_not_raise()), - (None, True, None, None, False, None, pytest.raises(SystemExit)), - ("main", None, None, None, False, "study_valid__etl_table", does_not_raise()), - ("main", None, None, None, False, "study_valid__nlp_table", does_not_raise()), - ("main", None, None, None, False, "study_valid__lib_table", does_not_raise()), - ("main", None, None, None, False, "study_valid__lib", does_not_raise()), - ("main", None, "foo", "y", False, "foo_table", does_not_raise()), - ("main", None, "foo", "n", False, "foo_table", pytest.raises(SystemExit)), - ("main", True, None, "y", True, 
"study_valid__table", does_not_raise()), + (True, None, None, False, "study_valid__table", does_not_raise()), + (False, None, None, False, "study_valid__table", does_not_raise()), + (None, None, None, False, "study_valid__table", does_not_raise()), + (None, None, None, False, "study_valid__etl_table", does_not_raise()), + (None, None, None, False, "study_valid__nlp_table", does_not_raise()), + (None, None, None, False, "study_valid__lib_table", does_not_raise()), + (None, None, None, False, "study_valid__lib", does_not_raise()), + (None, "foo", "y", False, "foo_table", does_not_raise()), + (None, "foo", "n", False, "foo_table", pytest.raises(SystemExit)), + (True, None, "y", True, "study_valid__table", does_not_raise()), ( - "main", True, None, "n", @@ -49,40 +47,34 @@ ), ], ) -def test_clean_study(mock_db, schema, verbose, prefix, confirm, stats, target, raises): +def test_clean_study(mock_db_config, verbose, prefix, confirm, stats, target, raises): with raises: + mock_db_config.stats_clean = stats protected_strs = [x.value for x in enums.ProtectedTableKeywords] with mock.patch.object(builtins, "input", lambda _: confirm): - parser = study_parser.StudyManifestParser("./tests/test_data/study_valid/") + manifest = study_manifest.StudyManifest("./tests/test_data/study_valid/") builder.run_protected_table_builder( - parser, - mock_db.cursor(), - schema, - config=base_utils.StudyConfig(db=mock_db), + config=mock_db_config, + manifest=manifest, ) # We're mocking stats tables since creating them programmatically # is very slow and we're trying a lot of conditions - mock_db.cursor().execute( - f"CREATE TABLE {parser.get_study_prefix()}__" + mock_db_config.db.cursor().execute( + f"CREATE TABLE {manifest.get_study_prefix()}__" f"{enums.ProtectedTables.STATISTICS.value} " "AS SELECT 'study_valid' as study_name, " "'study_valid__123' AS table_name" ) - mock_db.cursor().execute("CREATE TABLE study_valid__123 (test int)") + mock_db_config.db.cursor().execute( + "CREATE TABLE study_valid__123 (test int)" + ) if target is not None: - mock_db.cursor().execute(f"CREATE TABLE {target} (test int);") - cleaner.clean_study( - parser, - mock_db.cursor(), - schema, - verbose=verbose, - prefix=prefix, - stats_clean=stats, - ) + mock_db_config.db.cursor().execute(f"CREATE TABLE {target} (test int);") + cleaner.clean_study(config=mock_db_config, manifest=manifest, prefix=prefix) remaining_tables = ( - mock_db.cursor() + mock_db_config.db.cursor() .execute("select distinct(table_name) from information_schema.tables") .fetchall() ) @@ -91,16 +83,16 @@ def test_clean_study(mock_db, schema, verbose, prefix, confirm, stats, target, r else: assert (target,) not in remaining_tables assert ( - f"{parser.get_study_prefix()}__{enums.ProtectedTables.TRANSACTIONS.value}", + f"{manifest.get_study_prefix()}__{enums.ProtectedTables.TRANSACTIONS.value}", ) in remaining_tables if stats: assert ( - f"{parser.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", + f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", ) not in remaining_tables assert ("study_valid__123",) not in remaining_tables else: assert ( - f"{parser.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", + f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", ) in remaining_tables assert ("study_valid__123",) in remaining_tables @@ -112,26 +104,24 @@ def test_clean_study(mock_db, schema, verbose, prefix, confirm, stats, target, r ("./tests/test_data/psm/", True), ], ) -def 
test_run_protected_table_builder(mock_db, study_path, stats): - parser = study_parser.StudyManifestParser(study_path) - builder.run_protected_table_builder( - parser, mock_db.cursor(), "main", config=base_utils.StudyConfig(db=mock_db) - ) +def test_run_protected_table_builder(mock_db_config, study_path, stats): + manifest = study_manifest.StudyManifest(study_path) + builder.run_protected_table_builder(config=mock_db_config, manifest=manifest) tables = ( - mock_db.cursor() + mock_db_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables ") .fetchall() ) assert ( - f"{parser.get_study_prefix()}__{enums.ProtectedTables.TRANSACTIONS.value}", + f"{manifest.get_study_prefix()}__{enums.ProtectedTables.TRANSACTIONS.value}", ) in tables if stats: assert ( - f"{parser.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", + f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", ) in tables else: assert ( - f"{parser.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", + f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", ) not in tables @@ -170,18 +160,15 @@ def test_run_protected_table_builder(mock_db, study_path, stats): ), ], ) -def test_table_builder(mock_db, study_path, verbose, expects, raises): +def test_table_builder(mock_db_config, study_path, verbose, expects, raises): with raises: - parser = study_parser.StudyManifestParser(pathlib.Path(study_path)) + manifest = study_manifest.StudyManifest(pathlib.Path(study_path)) builder.run_table_builder( - parser, - mock_db.cursor(), - "main", - verbose=verbose, - config=base_utils.StudyConfig(db=mock_db), + config=mock_db_config, + manifest=manifest, ) tables = ( - mock_db.cursor() + mock_db_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables ") .fetchall() ) @@ -235,79 +222,101 @@ def test_table_builder(mock_db, study_path, verbose, expects, raises): ), ], ) -def test_build_study(mock_db, study_path, verbose, expects, raises): +def test_build_study(mock_db_config, study_path, verbose, expects, raises): with raises: - parser = study_parser.StudyManifestParser(study_path) + manifest = study_manifest.StudyManifest(study_path) builder.build_study( - parser, - mock_db.cursor(), - verbose=verbose, - config=base_utils.StudyConfig(db=mock_db), + config=mock_db_config, + manifest=manifest, ) tables = ( - mock_db.cursor() + mock_db_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables ") .fetchall() ) assert expects in tables +@freeze_time("2024-01-01") @pytest.mark.parametrize( - "study_path,stats,expects,raises", + "study_path,stats,previous,expects,raises", [ ( "./tests/test_data/psm/", False, - ("psm_test__psm_encounter_covariate",), + False, + ("psm_test__psm_encounter_covariate_2024_01_01T00_00_00_00_00",), does_not_raise(), ), ( "./tests/test_data/psm/", + False, True, - ("psm_test__psm_encounter_covariate",), + ("psm_test__psm_encounter_covariate_2024_01_01T00_00_00_00_00",), + does_not_raise(), + ), + ( + "./tests/test_data/psm/", + True, + False, + ("psm_test__psm_encounter_covariate_2024_01_01T00_00_00_00_00",), does_not_raise(), ), ], ) def test_run_statistics_builders( - tmp_path, mock_db_stats, study_path, stats, expects, raises + tmp_path, mock_db_stats_config, study_path, stats, previous, expects, raises ): with raises: - parser = study_parser.StudyManifestParser(study_path, data_path=tmp_path) - config = base_utils.StudyConfig(db=mock_db_stats, stats_build=stats) - 
builder.run_protected_table_builder( - parser, mock_db_stats.cursor(), "main", config=config + manifest = study_manifest.StudyManifest(study_path, data_path=tmp_path) + config = base_utils.StudyConfig( + db=mock_db_stats_config.db, + schema=mock_db_stats_config.schema, + stats_build=stats, ) - builder.build_study(parser, mock_db_stats.cursor(), config=config) - builder.run_statistics_builders( - parser, mock_db_stats.cursor(), "main", config=config + builder.run_protected_table_builder( + config=config, + manifest=manifest, ) + if previous: + log_utils.log_statistics( + config=config, + manifest=manifest, + table_type="psm", + table_name="psm_test__psm_encounter_2023_06_15", + view_name="psm_test__psm_encounter_covariate", + ) + builder.build_study(config=config, manifest=manifest) + builder.run_statistics_builders(config=config, manifest=manifest) tables = ( - mock_db_stats.cursor() + mock_db_stats_config.db.cursor() .execute("SELECT distinct(table_name) FROM information_schema.tables") .fetchall() ) - print(tables) - if stats: - assert expects in tables - else: + if previous: assert expects not in tables + else: + assert expects in tables -def test_export_study(tmp_path, mock_db_core): - parser = study_parser.StudyManifestParser( +def test_export_study(tmp_path, mock_db_core_config): + manifest = study_manifest.StudyManifest( f"{Path(__file__).parent.parent}/cumulus_library/studies/core", data_path=f"{tmp_path}/export", ) exporter.export_study( - parser, mock_db_core, None, f"{tmp_path}/export", False, chunksize=20 + config=mock_db_core_config, + manifest=manifest, + data_path=f"{tmp_path}/export", + archive=False, + chunksize=20, ) for file in Path(f"{tmp_path}/export").glob("*.*"): - assert file in parser.get_export_table_list() + assert file in manifest.get_export_table_list() @freeze_time("2024-01-01") -def test_import_study(tmp_path, mock_db): +def test_import_study(tmp_path, mock_db_config): test_data = { "string": ["a", "b", None], "int": [1, 2, pandas.NA], @@ -324,13 +333,14 @@ def test_import_study(tmp_path, mock_db): archive.write(tmp_path / "archive/test__table.csv") (tmp_path / "archive/test__table.parquet").unlink() (tmp_path / "archive/test__table.csv").unlink() - args = {"schema_name": "main", "verbose": False} - config = base_utils.StudyConfig(db=mock_db) + mock_db_config.schema = "schema_name" importer.import_archive( - archive_path=tmp_path / "archive/test.zip", args=args, config=config + config=mock_db_config, archive_path=tmp_path / "archive/test.zip" ) assert len(list((tmp_path / "archive").glob("*"))) == 1 - test_table = mock_db.cursor().execute("SELECT * FROM test__table").fetchall() + test_table = ( + mock_db_config.db.cursor().execute("SELECT * FROM test__table").fetchall() + ) assert test_table == [ ("a", 1, 1.1, True, datetime.datetime(2024, 1, 1, 0, 0)), ("b", 2, 2.2, False, datetime.datetime(2024, 1, 1, 0, 0)), @@ -338,30 +348,30 @@ def test_import_study(tmp_path, mock_db): ] with pytest.raises(errors.StudyImportError): importer.import_archive( - archive_path=tmp_path / "archive/missing.zip", args=args, config=config + config=mock_db_config, archive_path=tmp_path / "archive/missing.zip" ) with pytest.raises(errors.StudyImportError): importer.import_archive( - archive_path=tmp_path / "duck.db", args=args, config=config + config=mock_db_config, archive_path=tmp_path / "duck.db" ) with pytest.raises(errors.StudyImportError): df.to_parquet(tmp_path / "archive/other_test__table.parquet") with zipfile.ZipFile(tmp_path / "archive/test.zip", "w") as archive: 
archive.write(tmp_path / "archive/other_test__table.parquet") importer.import_archive( - archive_path=tmp_path / "duck.db", args=args, config=config + config=mock_db_config, archive_path=tmp_path / "duck.db" ) with pytest.raises(errors.StudyImportError): df.to_parquet(tmp_path / "archive/table.parquet") with zipfile.ZipFile(tmp_path / "archive/no_dunder.zip", "w") as archive: archive.write(tmp_path / "archive/table.parquet") importer.import_archive( - archive_path=tmp_path / "archive/no_dunder.zip", args=args, config=config + config=mock_db_config, archive_path=tmp_path / "archive/no_dunder.zip" ) with pytest.raises(errors.StudyImportError): (tmp_path / "archive/empty.zip") importer.import_archive( - archive_path=tmp_path / "archive/empty.zip", args=args, config=config + config=mock_db_config, archive_path=tmp_path / "archive/empty.zip" ) @@ -417,22 +427,17 @@ def test_upload_data(user, id_token, status, login_error, raises): @mock.patch("sysconfig.get_path") -def test_generate_sql(mock_path, mock_db, tmp_path): +def test_generate_sql(mock_path, mock_db_config, tmp_path): mock_path.return_value = f"{tmp_path}/study_python_valid/" with does_not_raise(): shutil.copytree( f"{Path(__file__).resolve().parents[0]}/test_data/study_python_valid", f"{tmp_path}/study_python_valid/", ) - parser = study_parser.StudyManifestParser( + manifest = study_manifest.StudyManifest( study_path=pathlib.Path(f"{tmp_path}/study_python_valid/") ) - file_generator.run_generate_sql( - manifest_parser=parser, - cursor=mock_db.cursor(), - schema="main", - config=base_utils.StudyConfig(db=mock_db), - ) + file_generator.run_generate_sql(manifest=manifest, config=mock_db_config) files = list( pathlib.Path(f"{tmp_path}/study_python_valid/reference_sql/").glob("*") ) @@ -448,27 +453,18 @@ def test_generate_sql(mock_path, mock_db, tmp_path): @mock.patch("sysconfig.get_path") -def test_generate_md(mock_path, mock_db, tmp_path): +def test_generate_md(mock_path, mock_db_config, tmp_path): mock_path.return_value = f"{tmp_path}/study_python_valid/" with does_not_raise(): shutil.copytree( f"{Path(__file__).resolve().parents[0]}/test_data/study_python_valid", f"{tmp_path}/study_python_valid/", ) - parser = study_parser.StudyManifestParser( + manifest = study_manifest.StudyManifest( study_path=pathlib.Path(f"{tmp_path}/study_python_valid/") ) - builder.run_table_builder( - parser, - mock_db.cursor(), - "main", - config=base_utils.StudyConfig(db=mock_db), - ) - file_generator.run_generate_markdown( - manifest_parser=parser, - cursor=mock_db.cursor(), - schema="main", - ) + builder.run_table_builder(config=mock_db_config, manifest=manifest) + file_generator.run_generate_markdown(config=mock_db_config, manifest=manifest) with open( f"{tmp_path}/study_python_valid/study_python_valid_generated.md" ) as f: diff --git a/tests/test_cli.py b/tests/test_cli.py index 984b80a4..b6616d0a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -225,8 +225,6 @@ def test_generate_md(mock_path, tmp_path): pathlib.Path(__file__).resolve().parent / "test_data/study_python_valid_generated.md" ) - with open(test_file) as f: - print(f.read()) assert filecmp.cmp(test_file, ref_file, shallow=True) @@ -499,19 +497,11 @@ def test_cli_transactions(tmp_path, study, finishes, raises): ["build", "-t", study, "-s", "tests/test_data/"], f"{tmp_path}", ) - print(args[-1:]) - args = args[:-1] + [ f"{tmp_path}/{study}_duck.db", ] - print(args[-1:]) cli.main(cli_args=args) db = DuckDatabaseBackend(f"{tmp_path}/{study}_duck.db") - print( - db.cursor() - 
.execute("select table_name from information_schema.tables") - .fetchall() - ) query = db.cursor().execute(f"SELECT * from {study}__lib_transactions").fetchall() assert query[0][2] == "started" if finishes: @@ -698,6 +688,7 @@ def test_cli_finds_study_from_manifest_prefix(tmp_path): ) @mock.patch("cumulus_library.base_utils.StudyConfig") def test_cli_custom_args(mock_config, tmp_path, option, raises): + mock_config.return_value.stats_clean = False with raises: cli.main( cli_args=duckdb_args( diff --git a/tests/test_data/study_dedicated_schema/module1.py b/tests/test_data/study_dedicated_schema/module1.py index 38c6b64b..08d9d307 100644 --- a/tests/test_data/study_dedicated_schema/module1.py +++ b/tests/test_data/study_dedicated_schema/module1.py @@ -4,7 +4,7 @@ class ModuleOneRunner(BaseTableBuilder): display_text = "module1" - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_dedicated_schema__table_1 (test int);" ) diff --git a/tests/test_data/study_dedicated_schema/module2.py b/tests/test_data/study_dedicated_schema/module2.py index 7e72c388..ab72183d 100644 --- a/tests/test_data/study_dedicated_schema/module2.py +++ b/tests/test_data/study_dedicated_schema/module2.py @@ -4,7 +4,7 @@ class ModuleTwoRunner(BaseTableBuilder): display_text = "module2" - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_dedicated_schema__table_2 (test int);" ) diff --git a/tests/test_data/study_python_counts_valid/module1.py b/tests/test_data/study_python_counts_valid/module1.py index e897b49a..0c0c2023 100644 --- a/tests/test_data/study_python_counts_valid/module1.py +++ b/tests/test_data/study_python_counts_valid/module1.py @@ -7,7 +7,7 @@ class ModuleOneRunner(CountsBuilder): def __init__(self): super().__init__() - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table1 (test int);" ) diff --git a/tests/test_data/study_python_counts_valid/module2.py b/tests/test_data/study_python_counts_valid/module2.py index ea6802ec..531d3f96 100644 --- a/tests/test_data/study_python_counts_valid/module2.py +++ b/tests/test_data/study_python_counts_valid/module2.py @@ -7,7 +7,7 @@ class ModuleTwoRunner(CountsBuilder): def __init__(self): super().__init__(study_prefix="study_python_counts_valid") - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table2 (test int);" ) diff --git a/tests/test_data/study_python_local_template/module1.py b/tests/test_data/study_python_local_template/module1.py index d63baaa7..52f38b6a 100644 --- a/tests/test_data/study_python_local_template/module1.py +++ b/tests/test_data/study_python_local_template/module1.py @@ -6,7 +6,7 @@ class ModuleOneRunner(BaseTableBuilder): display_text = "module1" - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries.append( local_template.get_local_template("table", config={"field": "foo"}) ) diff --git a/tests/test_data/study_python_no_subclass/module1.py b/tests/test_data/study_python_no_subclass/module1.py index df4fac9a..d9b680ec 100644 --- 
a/tests/test_data/study_python_no_subclass/module1.py +++ b/tests/test_data/study_python_no_subclass/module1.py @@ -5,5 +5,5 @@ class ModuleOneRunner(BaseTableBuilder): display_text = "Module 1" @classmethod - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): pass diff --git a/tests/test_data/study_python_no_subclass/module2.py b/tests/test_data/study_python_no_subclass/module2.py index b378f6cb..17a161d3 100644 --- a/tests/test_data/study_python_no_subclass/module2.py +++ b/tests/test_data/study_python_no_subclass/module2.py @@ -1,4 +1,4 @@ class ModuleTwoRunner: @classmethod - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): pass diff --git a/tests/test_data/study_python_valid/module1.py b/tests/test_data/study_python_valid/module1.py index 5f5b61e1..5db22e8e 100644 --- a/tests/test_data/study_python_valid/module1.py +++ b/tests/test_data/study_python_valid/module1.py @@ -4,7 +4,7 @@ class ModuleOneRunner(BaseTableBuilder): display_text = "module1" - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_python_valid__table (test int);" ) diff --git a/tests/test_data/study_python_valid/module2.py b/tests/test_data/study_python_valid/module2.py index a0b8854a..b00a5cf5 100644 --- a/tests/test_data/study_python_valid/module2.py +++ b/tests/test_data/study_python_valid/module2.py @@ -4,7 +4,7 @@ class ModuleTwoRunner(BaseTableBuilder): display_text = "module2" - def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): + def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_python_valid__count_table (test int);" ) diff --git a/tests/test_logging.py b/tests/test_logging.py index 168032dc..2d3c70e1 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -4,7 +4,14 @@ import pytest from freezegun import freeze_time -from cumulus_library import __version__, enums, errors, log_utils, study_parser +from cumulus_library import ( + __version__, + base_utils, + enums, + errors, + log_utils, + study_manifest, +) from cumulus_library.template_sql import base_templates, sql_utils @@ -56,7 +63,8 @@ def test_transactions(mock_db, schema, study, status, message, expects, raises): with raises: cursor = mock_db.cursor() table = sql_utils.TransactionsTable() - manifest = study_parser.StudyManifestParser(f"./tests/test_data/{study}/") + config = base_utils.StudyConfig(db=mock_db, schema="main") + manifest = study_manifest.StudyManifest(f"./tests/test_data/{study}/") if manifest.get_dedicated_schema(): schema = manifest.get_dedicated_schema() table_name = table.name @@ -71,8 +79,7 @@ def test_transactions(mock_db, schema, study, status, message, expects, raises): ) cursor.execute(query) log_utils.log_transaction( - cursor=cursor, - schema=schema, + config=config, manifest=manifest, status=status, message=message, @@ -125,7 +132,8 @@ def test_statistics( with raises: cursor = mock_db.cursor() table = sql_utils.StatisticsTable() - manifest = study_parser.StudyManifestParser(f"./tests/test_data/{study}/") + manifest = study_manifest.StudyManifest(f"./tests/test_data/{study}/") + config = base_utils.StudyConfig(db=mock_db, schema="main") if manifest.get_dedicated_schema(): schema = manifest.get_dedicated_schema() table_name = table.name @@ -140,8 +148,7 @@ def test_statistics( ) 
cursor.execute(query) log_utils.log_statistics( - cursor=cursor, - schema=schema, + config=config, manifest=manifest, table_type=table_type, table_name=table_name, diff --git a/tests/test_psm.py b/tests/test_psm.py index c628730d..f41356e3 100644 --- a/tests/test_psm.py +++ b/tests/test_psm.py @@ -6,7 +6,7 @@ import pytest from freezegun import freeze_time -from cumulus_library import cli +from cumulus_library import cli, study_manifest from cumulus_library.statistics import psm @@ -54,22 +54,25 @@ ) def test_psm_create( tmp_path, - mock_db_stats, + mock_db_stats_config, toml_def, pos_set, neg_set, expected_first_record, expected_last_record, ): - builder = cli.StudyRunner(mock_db_stats, data_path=Path(tmp_path)) + builder = cli.StudyRunner(mock_db_stats_config, data_path=Path(tmp_path)) + manifest = study_manifest.StudyManifest( + study_path=f"{Path(__file__).parent}/test_data/psm/" + ) psmbuilder = psm.PsmBuilder( f"{Path(__file__).parent}/test_data/psm/{toml_def}", Path(tmp_path) ) - mock_db_stats.cursor().execute( + builder.config.db.cursor().execute( "create table psm_test__psm_cohort as (select * from core__condition " f"ORDER BY {psmbuilder.config.primary_ref} limit 100)" ).df() - mock_db_stats.cursor().execute("select * from psm_test__psm_cohort").fetchall() + builder.config.db.cursor().execute("select * from psm_test__psm_cohort").fetchall() safe_timestamp = ( datetime.now() .replace(microsecond=0) @@ -78,14 +81,13 @@ def test_psm_create( .replace("-", "_") ) psmbuilder.execute_queries( - mock_db_stats.pandas_cursor(), - builder.schema_name, - False, + builder.config, + manifest, drop_table=True, table_suffix=safe_timestamp, ) df = ( - mock_db_stats.cursor() + builder.config.db.cursor() .execute("select * from psm_test__psm_encounter_covariate") .df() ) diff --git a/tests/test_study_parser.py b/tests/test_study_parser.py index 327f142c..a6a200fa 100644 --- a/tests/test_study_parser.py +++ b/tests/test_study_parser.py @@ -6,7 +6,7 @@ import pytest -from cumulus_library import errors, study_parser +from cumulus_library import errors, study_manifest from tests.test_data.parser_mock_data import get_mock_toml, mock_manifests @@ -43,7 +43,7 @@ def test_load_manifest(manifest_path, expected, raises): path = f"{pathlib.Path(__file__).resolve().parents[0]}/{manifest_path}" else: path = None - manifest = study_parser.StudyManifestParser(path) + manifest = study_manifest.StudyManifest(path) assert str(manifest) == expected @@ -64,9 +64,9 @@ def test_manifest_data(manifest_key, raises): ): with raises: if manifest_key == "invalid_none": - parser = study_parser.StudyManifestParser() + parser = study_manifest.StudyManifest() else: - parser = study_parser.StudyManifestParser("./path") + parser = study_manifest.StudyManifest("./path") expected = mock_manifests[manifest_key] assert parser.get_study_prefix() == expected["study_prefix"] if "sql_config" in expected.keys(): diff --git a/tests/testbed_utils.py b/tests/testbed_utils.py index d7c6a20e..e0cf3a3f 100644 --- a/tests/testbed_utils.py +++ b/tests/testbed_utils.py @@ -232,10 +232,13 @@ def build(self, study="core") -> duckdb.DuckDBPyConnection: "load_ndjson_dir": str(self.path), } ) - builder = cli.StudyRunner(db, data_path=str(self.path)) - # builder.verbose = True + config = base_utils.StudyConfig( + db=db, + schema="main", + # builder.verbose = True + ) + builder = cli.StudyRunner(config, data_path=str(self.path)) builder.clean_and_build_study( Path(__file__).parent.parent / "cumulus_library/studies" / study, - 
config=base_utils.StudyConfig(db=db, stats_build=False), ) return duckdb.connect(db_file) From 2c717b67cb7eec5efd4bf47dc6b98272fc1e9765 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Mon, 1 Jul 2024 16:12:48 -0400 Subject: [PATCH 2/5] coverage improvements --- cumulus_library/statistics/counts.py | 4 +- .../studies/vocab/vocab_icd_builder.py | 13 +---- tests/test_actions.py | 33 +++++++++-- tests/test_counts_builder.py | 41 ++++++++++---- tests/test_psm.py | 55 +++++++++++++++---- 5 files changed, 106 insertions(+), 40 deletions(-) diff --git a/cumulus_library/statistics/counts.py b/cumulus_library/statistics/counts.py index 9d467d18..22839c34 100644 --- a/cumulus_library/statistics/counts.py +++ b/cumulus_library/statistics/counts.py @@ -262,7 +262,7 @@ def write_counts(self, filepath: str): :param filepath: path to file to write queries out to. """ - self.prepare_queries(cursor=None, schema=None) + self.prepare_queries() self.comment_queries() self.write_queries(path=Path(filepath)) @@ -272,4 +272,4 @@ def prepare_queries(self, cursor: object | None = None, schema: str | None = Non This should be overridden in any count generator. See studies/core/count_core.py for an example """ - pass + pass # pramga: no cover diff --git a/cumulus_library/studies/vocab/vocab_icd_builder.py b/cumulus_library/studies/vocab/vocab_icd_builder.py index 1367639d..115f16ec 100644 --- a/cumulus_library/studies/vocab/vocab_icd_builder.py +++ b/cumulus_library/studies/vocab/vocab_icd_builder.py @@ -11,14 +11,6 @@ class VocabIcdRunner(base_table_builder.BaseTableBuilder): display_text = "Creating ICD vocab..." - @staticmethod - def clean_row(row, filename): - """Removes non-SQL safe charatcers from the input row.""" - for i in range(len(row)): - cell = str(row[i]).replace("'", "").replace(";", ",") - row[i] = cell - return row - def prepare_queries( self, config: base_utils.StudyConfig, @@ -41,9 +33,8 @@ def prepare_queries( header_types = ["STRING", "STRING", "STRING", "STRING", "STRING"] for file in icd_files: parquet_path = path / f"icd/{file.stem}.parquet" - if not parquet_path.is_file(): - df = pandas.read_csv(file, delimiter="|", names=headers) - df.to_parquet(parquet_path) + df = pandas.read_csv(file, delimiter="|", names=headers) + df.to_parquet(parquet_path) remote_path = config.db.upload_file( file=parquet_path, study="vocab", diff --git a/tests/test_actions.py b/tests/test_actions.py index 5e309416..1e975ef1 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -350,13 +350,21 @@ def test_import_study(tmp_path, mock_db_config): importer.import_archive( config=mock_db_config, archive_path=tmp_path / "archive/missing.zip" ) + with pytest.raises(errors.StudyImportError): + with open(tmp_path / "archive/empty.zip", "w"): + pass + importer.import_archive( + config=mock_db_config, archive_path=tmp_path / "archive/empty.zip" + ) with pytest.raises(errors.StudyImportError): importer.import_archive( config=mock_db_config, archive_path=tmp_path / "duck.db" ) with pytest.raises(errors.StudyImportError): + df.to_parquet(tmp_path / "archive/test__table.parquet") df.to_parquet(tmp_path / "archive/other_test__table.parquet") with zipfile.ZipFile(tmp_path / "archive/test.zip", "w") as archive: + archive.write(tmp_path / "archive/test__table.parquet") archive.write(tmp_path / "archive/other_test__table.parquet") importer.import_archive( config=mock_db_config, archive_path=tmp_path / "duck.db" @@ -376,15 +384,16 @@ def test_import_study(tmp_path, mock_db_config): @pytest.mark.parametrize( - 
"user,id_token,status,login_error,raises", + "user,id_token,status,login_error,preview,raises", [ - (None, None, 204, False, pytest.raises(SystemExit)), - ("user", "id", 204, False, does_not_raise()), + (None, None, 204, False, False, pytest.raises(SystemExit)), + ("user", "id", 204, False, False, does_not_raise()), ( "user", "id", 500, False, + False, pytest.raises(requests.exceptions.HTTPError), ), ( @@ -392,6 +401,7 @@ def test_import_study(tmp_path, mock_db_config): "badid", 204, True, + False, pytest.raises(requests.exceptions.HTTPError), ), ( @@ -399,12 +409,21 @@ def test_import_study(tmp_path, mock_db_config): "id", 204, False, + False, + does_not_raise(), + ), + ( + "user", + "id", + 204, + False, + True, does_not_raise(), ), ], ) @responses.activate -def test_upload_data(user, id_token, status, login_error, raises): +def test_upload_data(user, id_token, status, preview, login_error, raises): with raises: if login_error: responses.add(responses.POST, "https://upload.url.test/upload/", status=401) @@ -417,13 +436,17 @@ def test_upload_data(user, id_token, status, login_error, raises): args = { "data_path": pathlib.Path.cwd() / "tests/test_data", "id": id_token, - "preview": False, + "preview": preview, "target": "core", "url": "https://upload.url.test/upload/", "user": user, } responses.add(responses.POST, "https://presigned.url.test", status=status) uploader.upload_files(args) + if preview: + responses.assert_call_count("https://upload.url.test/upload/", 1) + elif raises == does_not_raise(): + responses.assert_call_count("https://upload.url.test/upload/", 2) @mock.patch("sysconfig.get_path") diff --git a/tests/test_counts_builder.py b/tests/test_counts_builder.py index 5c2b7e01..94b50e91 100644 --- a/tests/test_counts_builder.py +++ b/tests/test_counts_builder.py @@ -5,8 +5,8 @@ import pytest -from cumulus_library.errors import CountsBuilderError -from cumulus_library.statistics.counts import CountsBuilder +from cumulus_library import errors +from cumulus_library.statistics import counts TEST_PREFIX = "test" @@ -19,7 +19,7 @@ ], ) def test_get_table_name(name, duration, expected): - builder = CountsBuilder(study_prefix=TEST_PREFIX) + builder = counts.CountsBuilder(study_prefix=TEST_PREFIX) output = builder.get_table_name(name, duration) assert output == expected @@ -32,7 +32,7 @@ def test_get_table_name(name, duration, expected): ("age > 5", None, ["age > 5"], does_not_raise()), (["age > 5", "sex =='F'"], None, ["age > 5", "sex =='F'"], does_not_raise()), ("age > 5", 7, ["age > 5"], does_not_raise()), - ({"age": "5"}, None, None, pytest.raises(CountsBuilderError)), + ({"age": "5"}, None, None, pytest.raises(errors.CountsBuilderError)), ], ) def test_get_where_clauses(clause, min_subject, expected, raises): @@ -42,7 +42,7 @@ def test_get_where_clauses(clause, min_subject, expected, raises): kwargs["clause"] = clause if min_subject is not None: kwargs["min_subject"] = min_subject - builder = CountsBuilder(study_prefix=TEST_PREFIX) + builder = counts.CountsBuilder(study_prefix=TEST_PREFIX) output = builder.get_where_clauses(**kwargs) assert output == expected @@ -67,12 +67,24 @@ def test_get_where_clauses(clause, min_subject, expected, raises): "source", ["a", "b"], {"bad_key": True}, - pytest.raises(CountsBuilderError), + pytest.raises(errors.CountsBuilderError), ), - (None, "source", ["a", "b"], {}, pytest.raises(CountsBuilderError)), - ("table", None, ["a", "b"], {}, pytest.raises(CountsBuilderError)), - ("table", "source", [], {}, pytest.raises(CountsBuilderError)), - ("table", 
"source", None, {}, pytest.raises(CountsBuilderError)), + ( + None, + "source", + ["a", "b"], + {}, + pytest.raises(errors.CountsBuilderError), + ), + ( + "table", + None, + ["a", "b"], + {}, + pytest.raises(errors.CountsBuilderError), + ), + ("table", "source", [], {}, pytest.raises(errors.CountsBuilderError)), + ("table", "source", None, {}, pytest.raises(errors.CountsBuilderError)), ], ) @mock.patch( @@ -82,7 +94,7 @@ def test_get_count_query( mock_count, table_name, source_table, table_cols, kwargs, raises ): with raises: - builder = CountsBuilder(study_prefix=TEST_PREFIX) + builder = counts.CountsBuilder(study_prefix=TEST_PREFIX) builder.get_count_query(table_name, source_table, table_cols, **kwargs) assert mock_count.called call_args, call_kwargs = mock_count.call_args @@ -165,7 +177,7 @@ def test_count_wrappers( kwargs["where_clauses"] = where if min_subject is not None: kwargs["min_subject"] = min_subject - builder = CountsBuilder(study_prefix=TEST_PREFIX) + builder = counts.CountsBuilder(study_prefix=TEST_PREFIX) wrapper = getattr(builder, method) wrapper(table_name, source_table, table_cols, **kwargs) assert mock_count.called @@ -176,3 +188,8 @@ def test_count_wrappers( assert call_kwargs["where_clauses"] == where if min_subject is not None: assert call_kwargs["min_subject"] == min_subject + + +def test_null_initialization(): + with pytest.raises(errors.CountsBuilderError): + counts.CountsBuilder() diff --git a/tests/test_psm.py b/tests/test_psm.py index f41356e3..d0b46706 100644 --- a/tests/test_psm.py +++ b/tests/test_psm.py @@ -1,7 +1,8 @@ """tests for propensity score matching generation""" -from datetime import datetime -from pathlib import Path +import datetime +import pathlib +from unittest import mock import pytest from freezegun import freeze_time @@ -61,20 +62,20 @@ def test_psm_create( expected_first_record, expected_last_record, ): - builder = cli.StudyRunner(mock_db_stats_config, data_path=Path(tmp_path)) + builder = cli.StudyRunner(mock_db_stats_config, data_path=pathlib.Path(tmp_path)) manifest = study_manifest.StudyManifest( - study_path=f"{Path(__file__).parent}/test_data/psm/" + study_path=f"{pathlib.Path(__file__).parent}/test_data/psm/" ) psmbuilder = psm.PsmBuilder( - f"{Path(__file__).parent}/test_data/psm/{toml_def}", Path(tmp_path) + f"{pathlib.Path(__file__).parent}/test_data/psm/{toml_def}", + pathlib.Path(tmp_path), ) builder.config.db.cursor().execute( "create table psm_test__psm_cohort as (select * from core__condition " f"ORDER BY {psmbuilder.config.primary_ref} limit 100)" ).df() - builder.config.db.cursor().execute("select * from psm_test__psm_cohort").fetchall() safe_timestamp = ( - datetime.now() + datetime.datetime.now() .replace(microsecond=0) .isoformat() .replace(":", "_") @@ -91,13 +92,47 @@ def test_psm_create( .execute("select * from psm_test__psm_encounter_covariate") .df() ) - print(df.columns) + ed_series = df["example_diagnosis"].value_counts() assert ed_series.iloc[0] == neg_set assert ed_series.iloc[1] == pos_set first_record = df.iloc[0].to_dict() - print(first_record) assert first_record == expected_first_record last_record = df.iloc[neg_set + pos_set - 1].to_dict() - print(last_record) assert last_record == expected_last_record + + +@pytest.mark.parametrize("error", [("value"), ("zero")]) +@mock.patch("psmpy.PsmPy.logistic_ps") +def test_psm_error_handling(mock_psm, error, tmp_path, mock_db_stats_config): + match error: + case "value": + mock_psm.side_effect = ValueError + case "zero": + mock_psm.side_effect = ZeroDivisionError + 
builder = cli.StudyRunner(mock_db_stats_config, data_path=pathlib.Path(tmp_path)) + manifest = study_manifest.StudyManifest( + study_path=f"{pathlib.Path(__file__).parent}/test_data/psm/" + ) + psmbuilder = psm.PsmBuilder( + f"{pathlib.Path(__file__).parent}/test_data/psm/psm_config.toml", + pathlib.Path(tmp_path), + ) + builder.config.db.cursor().execute( + "create table psm_test__psm_cohort as (select * from core__condition " + f"ORDER BY {psmbuilder.config.primary_ref} limit 100)" + ).df() + safe_timestamp = ( + datetime.datetime.now() + .replace(microsecond=0) + .isoformat() + .replace(":", "_") + .replace("-", "_") + ) + with pytest.raises(SystemExit): + psmbuilder.execute_queries( + builder.config, + manifest, + drop_table=True, + table_suffix=safe_timestamp, + ) From fa62a0bafb4836eaf7539ac9ef0a89c47614bb3b Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Mon, 1 Jul 2024 16:35:41 -0400 Subject: [PATCH 3/5] PR feedback --- cumulus_library/actions/builder.py | 2 -- cumulus_library/protected_table_builder.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index 7e9835b9..1fdccf75 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -208,9 +208,7 @@ def run_statistics_builders( target_table = stats_config["target_table"] if (target_table,) in existing_stats and not config.stats_build: - print("early break") continue - print("run build") if config_type == "psm": builder = psm.PsmBuilder( toml_config_path=toml_path, diff --git a/cumulus_library/protected_table_builder.py b/cumulus_library/protected_table_builder.py index 04f8b271..78bd8650 100644 --- a/cumulus_library/protected_table_builder.py +++ b/cumulus_library/protected_table_builder.py @@ -43,8 +43,7 @@ def prepare_queries( study_stats: dict | None = None, **kwargs, ): - if study_stats is None: - study_stats = {} + study_stats = study_stats or {} if manifest and manifest.get_dedicated_schema(): db_schema = manifest.get_dedicated_schema() transactions = enums.ProtectedTables.TRANSACTIONS.value From 06633e13097c0b72ba3fbada355e0b2bf9d2218b Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Tue, 2 Jul 2024 10:51:33 -0400 Subject: [PATCH 4/5] coverage cleanup --- cumulus_library/base_table_builder.py | 15 +++++----- cumulus_library/statistics/counts.py | 2 +- tests/test_cli.py | 42 ++++++++++++++++++++------- tests/test_counts_builder.py | 16 ++++++++++ 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/cumulus_library/base_table_builder.py b/cumulus_library/base_table_builder.py index 92f9817c..e3a4b13c 100644 --- a/cumulus_library/base_table_builder.py +++ b/cumulus_library/base_table_builder.py @@ -36,7 +36,7 @@ def prepare_queries( :param config: A study configuration object """ - raise NotImplementedError + raise NotImplementedError # pragma: no cover @typing.final def execute_queries( @@ -69,11 +69,6 @@ def execute_queries( ) table_name = table_name[0] - # TODO: this may not be required? reinvestigate - # if it contains a schema, remove it (usually it won't, but some - # CTAS forms may) - if "." 
in table_name:
-            table_name = table_name.split(".")[1].replace('"', "")
             table_names.append(table_name)
         for table_name in table_names:
             cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
@@ -91,7 +86,11 @@ def execute_queries(
             ):
                 cursor.execute(query)
         except Exception as e:  # pylint: disable=broad-exception-caught
-            sys.exit(e)
+            sys.exit(
+                "An error occurred executing this query:\n----\n"
+                f"{query}\n----\n"
+                f"{e}"
+            )
 
         self.post_execution(config, *args, **kwargs)
 
@@ -123,7 +122,7 @@ def comment_queries(self, doc_str=None):
     def write_queries(self, path: pathlib.Path | None = None):
         """writes all queries constructed by prepare_queries to disk"""
         if path is None:
-            path = pathlib.Path.cwd() / "output.sql"
+            path = pathlib.Path.cwd() / "output.sql"  # pragma: no cover
 
         path.parents[0].mkdir(parents=True, exist_ok=True)
         with open(path, "w", encoding="utf-8") as file:
diff --git a/cumulus_library/statistics/counts.py b/cumulus_library/statistics/counts.py
index 22839c34..a973642c 100644
--- a/cumulus_library/statistics/counts.py
+++ b/cumulus_library/statistics/counts.py
@@ -272,4 +272,4 @@ def prepare_queries(self, cursor: object | None = None, schema: str | None = Non
         This should be overridden in any count generator. See studies/core/count_core.py
         for an example
         """
-        pass  # pramga: no cover
+        pass  # pragma: no cover
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b6616d0a..7a1c1a44 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -20,8 +20,7 @@
 import responses
 import toml
 
-from cumulus_library import cli, errors
-from cumulus_library.databases import DuckDatabaseBackend
+from cumulus_library import cli, databases, errors
 
 from tests.conftest import duckdb_args
 
@@ -120,7 +119,7 @@ def test_cli_path_mapping(mock_load_json, mock_path, tmp_path, args, raises, exp
     }
     args = duckdb_args(args, tmp_path)
     cli.main(cli_args=args)
-    db = DuckDatabaseBackend(f"{tmp_path}/duck.db")
+    db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db")
     assert (expected,) in db.cursor().execute("show tables").fetchall()
 
 
@@ -143,7 +142,7 @@ def test_count_builder_mapping(mock_path, tmp_path):
         tmp_path,
     )
     cli.main(cli_args=args)
-    db = DuckDatabaseBackend(f"{tmp_path}/duck.db")
+    db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db")
     assert [
         ("study_python_counts_valid__lib_transactions",),
         ("study_python_counts_valid__table1",),
@@ -281,7 +280,7 @@ def test_clean(mock_path, tmp_path, args, expected, raises):  # pylint: disable=
     with does_not_raise():
         with mock.patch.object(builtins, "input", lambda _: "y"):
             cli.main(cli_args=duckdb_args(args, tmp_path))
-            db = DuckDatabaseBackend(f"{tmp_path}/duck.db")
+            db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db")
             for table in db.cursor().execute("show tables").fetchall():
                 assert expected not in table
 
@@ -414,7 +413,7 @@ def test_cli_executes_queries(
     export_args = duckdb_args(export_args, tmp_path)
     cli.main(cli_args=export_args)
 
-    db = DuckDatabaseBackend(f"{tmp_path}/duck.db")
+    db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db")
     found_tables = (
         db.cursor()
         .execute("SELECT table_schema,table_name FROM information_schema.tables")
@@ -501,7 +500,7 @@ def test_cli_transactions(tmp_path, study, finishes, raises):
         f"{tmp_path}/{study}_duck.db",
     ]
     cli.main(cli_args=args)
-    db = DuckDatabaseBackend(f"{tmp_path}/{study}_duck.db")
+    db = databases.DuckDatabaseBackend(f"{tmp_path}/{study}_duck.db")
     query = db.cursor().execute(f"SELECT * from {study}__lib_transactions").fetchall()
     assert query[0][2] == "started"
     if finishes:
@@ -539,7 +538,7 @@ def 
test_cli_stats_rebuild(tmp_path): cli.main(cli_args=[*arg_list, f"{tmp_path}/export"]) cli.main(cli_args=[*arg_list, f"{tmp_path}/export"]) cli.main(cli_args=[*arg_list, f"{tmp_path}/export", "--statistics"]) - db = DuckDatabaseBackend(f"{tmp_path}/duck.db") + db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db") expected = ( db.cursor() .execute( @@ -654,7 +653,7 @@ def test_cli_single_builder(tmp_path): cli.main( cli_args=duckdb_args(["build", "--builder=patient", "--target=core"], tmp_path) ) - db = DuckDatabaseBackend(f"{tmp_path}/duck.db") + db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db") tables = {x[0] for x in db.cursor().execute("show tables").fetchall()} assert { "core__patient", @@ -673,7 +672,7 @@ def test_cli_finds_study_from_manifest_prefix(tmp_path): tmp_path, ) ) - db = DuckDatabaseBackend(f"{tmp_path}/duck.db") + db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db") tables = {x[0] for x in db.cursor().execute("show tables").fetchall()} assert "study_different_name__table" in tables @@ -755,7 +754,7 @@ def test_dedicated_schema(tmp_path): ) cli.main(cli_args=core_build_args) cli.main(cli_args=build_args) - db = DuckDatabaseBackend(f"{tmp_path}/duck.db") + db = databases.DuckDatabaseBackend(f"{tmp_path}/duck.db") tables = ( db.cursor() .execute("SELECT table_schema,table_name FROM information_schema.tables") @@ -768,3 +767,24 @@ def test_dedicated_schema(tmp_path): ("main", "core__condition"), ]: assert table in tables + + +@mock.patch.dict(os.environ, clear=True) +@mock.patch("cumulus_library.databases.DuckDatabaseBackend") +def test_sql_error_handling(mock_backend, tmp_path): + mock_backend.return_value.cursor.return_value.execute.side_effect = [ + None, + Exception("bad query"), + ] + build_args = duckdb_args( + [ + "build", + "-t", + "study_valid", + "-s", + "tests/test_data/study_valid/", + ], + tmp_path, + ) + with pytest.raises(SystemExit): + cli.main(cli_args=build_args) diff --git a/tests/test_counts_builder.py b/tests/test_counts_builder.py index 94b50e91..71e04c39 100644 --- a/tests/test_counts_builder.py +++ b/tests/test_counts_builder.py @@ -193,3 +193,19 @@ def test_count_wrappers( def test_null_initialization(): with pytest.raises(errors.CountsBuilderError): counts.CountsBuilder() + + +def test_write_queries(tmp_path): + builder = counts.CountsBuilder(study_prefix="foo") + builder.queries = ["SELECT * FROM FOO", "SELECT * FROM BAR"] + builder.write_counts(tmp_path / "output.sql") + with open(tmp_path / "output.sql") as f: + found = f.read() + expected = """-- noqa: disable=all +SELECT * FROM FOO + +-- ########################################################### + +SELECT * FROM BAR +""" + assert found == expected From a74e72d4a0a3adf3350a160e00541bc29c0b9d1a Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Tue, 2 Jul 2024 11:30:55 -0400 Subject: [PATCH 5/5] one more coverage tweak --- cumulus_library/actions/importer.py | 5 +---- tests/test_actions.py | 9 ++------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cumulus_library/actions/importer.py b/cumulus_library/actions/importer.py index d011bd9a..8278ab41 100644 --- a/cumulus_library/actions/importer.py +++ b/cumulus_library/actions/importer.py @@ -48,6 +48,7 @@ def import_archive(config: base_utils.StudyConfig, *, archive_path: pathlib.Path """ # Ensure we've got something that looks like a valid database export + print(archive_path) if not archive_path.exists(): raise errors.StudyImportError(f"File {archive_path} not found.") try: @@ -58,10 +59,6 @@ def 
import_archive(config: base_utils.StudyConfig, *, archive_path: pathlib.Path raise errors.StudyImportError( f"File {archive_path} is not a valid archive." ) from e - if len(files) == 0: - raise errors.StudyImportError( - f"File {archive_path} does not contain any tables." - ) if not any("__" in file for file in files): raise errors.StudyImportError( f"File {archive_path} contains non-study parquet files." diff --git a/tests/test_actions.py b/tests/test_actions.py index 1e975ef1..e3e0237c 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -363,11 +363,11 @@ def test_import_study(tmp_path, mock_db_config): with pytest.raises(errors.StudyImportError): df.to_parquet(tmp_path / "archive/test__table.parquet") df.to_parquet(tmp_path / "archive/other_test__table.parquet") - with zipfile.ZipFile(tmp_path / "archive/test.zip", "w") as archive: + with zipfile.ZipFile(tmp_path / "archive/two_studies.zip", "w") as archive: archive.write(tmp_path / "archive/test__table.parquet") archive.write(tmp_path / "archive/other_test__table.parquet") importer.import_archive( - config=mock_db_config, archive_path=tmp_path / "duck.db" + config=mock_db_config, archive_path=tmp_path / "archive/two_studies.zip" ) with pytest.raises(errors.StudyImportError): df.to_parquet(tmp_path / "archive/table.parquet") @@ -376,11 +376,6 @@ def test_import_study(tmp_path, mock_db_config): importer.import_archive( config=mock_db_config, archive_path=tmp_path / "archive/no_dunder.zip" ) - with pytest.raises(errors.StudyImportError): - (tmp_path / "archive/empty.zip") - importer.import_archive( - config=mock_db_config, archive_path=tmp_path / "archive/empty.zip" - ) @pytest.mark.parametrize(