Converted manifest to single list of files (#317)

* Converted manifest to single list of files
* coverage, PR feedback
smart-on-fhir · Nov 19, 2024 · e0c0728
1 parent 2528c51
commit e0c0728
Show file tree

Hide file tree

Showing 23 changed files with 231 additions and 230 deletions.
diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py
@@ -135,60 +135,14 @@ def run_protected_table_builder(
     )
 
 
-def run_table_builder(
-    config: base_utils.StudyConfig,
-    manifest: study_manifest.StudyManifest,
-    *,
-    db_parser: databases.DatabaseParser = None,
+def _run_workflow(
+    config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
 ) -> None:
-    """Loads modules from a manifest and executes code via BaseTableBuilder
+    """Loads workflow config from toml definitions and executes workflow
 
     :param config: a StudyConfig object
     :param manifest: a StudyManifest object
-    :keyword db_parser: an object implementing DatabaseParser for the target database
     """
-    for file in manifest.get_table_builder_file_list():
-        _load_and_execute_builder(
-            config=config,
-            manifest=manifest,
-            filename=file,
-            db_parser=db_parser,
-        )
-
-
-def run_counts_builders(
-    config: base_utils.StudyConfig,
-    manifest: study_manifest.StudyManifest,
-) -> None:
-    """Loads counts modules from a manifest and executes code via BaseTableBuilder
-
-    While a count is a form of statistics, it is treated separately from other
-    statistics because it is, by design, always going to be static against a
-    given dataset, where other statistical methods may use sampling techniques
-    or adjustable input parameters that may need to be preserved for later review.
-
-    :param config: a StudyConfig object
-    :param manifest: a StudyManifest object
-    """
-    for file in manifest.get_counts_builder_file_list():
-        _load_and_execute_builder(
-            config=config,
-            manifest=manifest,
-            filename=file,
-        )
-
-
-def run_statistics_builders(
-    config: base_utils.StudyConfig,
-    manifest: study_manifest.StudyManifest,
-) -> None:
-    """Loads statistics modules from toml definitions and executes
-
-    :param config: a StudyConfig object
-    :param manifest: a StudyManifest object
-    """
-    if len(manifest.get_statistics_file_list()) == 0:
-        return
     existing_stats = []
     if not config.stats_build:
         existing_stats = (
@@ -199,40 +153,41 @@ def run_statistics_builders(
             )
             .fetchall()
         )
-    for file in manifest.get_statistics_file_list():
-        # This open is a bit redundant with the open inside of the PSM builder,
-        # but we're letting it slide so that builders function similarly
-        # across the board
-        safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
-        toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
-        with open(toml_path, "rb") as file:
-            stats_config = tomllib.load(file)
-            config_type = stats_config["config_type"]
-            target_table = stats_config.get("target_table", stats_config.get("table_prefix", ""))
-
-        if (target_table,) in existing_stats and not config.stats_build:
-            continue
-        if config_type == "psm":
+    # This open is a bit redundant with the open inside of the PSM builder,
+    # but we're letting it slide so that builders function similarly
+    # across the board
+    safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
+    toml_path = pathlib.Path(f"{manifest._study_path}/{filename}")
+    with open(toml_path, "rb") as file:
+        workflow_config = tomllib.load(file)
+        config_type = workflow_config["config_type"]
+        target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", ""))
+
+    if (target_table,) in existing_stats and not config.stats_build:
+        return
+    match config_type:
+        case "psm":
             builder = psm_builder.PsmBuilder(
                 toml_config_path=toml_path,
-                config=stats_config,
+                config=workflow_config,
                 data_path=manifest.data_path / f"{manifest.get_study_prefix()}/psm",
             )
-        elif config_type == "valueset":
+        case "valueset":
             builder = valueset_builder.ValuesetBuilder(
                 toml_config_path=toml_path,
-                config=stats_config,
+                config=workflow_config,
                 data_path=manifest.data_path / f"{manifest.get_study_prefix()}/valueset",
             )
-        else:
-            raise errors.StudyManifestParsingError(  # pragma: no cover
-                f"{toml_path} references an invalid statistics type {config_type}."
+        case _:  # pragma: no cover
+            raise errors.StudyManifestParsingError(
+                f"{toml_path} references an invalid workflow type {config_type}."
             )
-        builder.execute_queries(
-            config=config,
-            manifest=manifest,
-            table_suffix=safe_timestamp,
-        )
+    builder.execute_queries(
+        config=config,
+        manifest=manifest,
+        table_suffix=safe_timestamp,
+    )
+    if config_type in set(item.value for item in enums.StatisticsTypes):
         log_utils.log_statistics(
             config=config,
             manifest=manifest,
@@ -242,11 +197,11 @@ def run_statistics_builders(
         )
 
 
-def run_matching_table_builder(
+def build_matching_files(
     config: base_utils.StudyConfig,
     manifest: study_manifest.StudyManifest,
     *,
-    builder: str,
+    builder: str | None,
     db_parser: databases.DatabaseParser = None,
 ):
     """targets all table builders matching a target string for running
@@ -256,36 +211,55 @@ def run_matching_table_builder(
     :keyword builder: filename of a module implementing a TableBuilder
     :keyword db_parser: an object implementing DatabaseParser for the target database"""
     all_generators = manifest.get_all_generators()
-    for file in all_generators:
-        if builder and file.find(builder) == -1:
-            continue
-        _load_and_execute_builder(
-            config=config,
-            manifest=manifest,
-            filename=file,
-            db_parser=db_parser,
-        )
+    matches = []
+    if not builder:  # pragma: no cover
+        matches = all_generators
+    else:
+        for file in all_generators:
+            if file.find(builder) != -1:
+                matches.append(file)
+    build_study(config, manifest, db_parser=db_parser, file_list=matches)
 
 
 def build_study(
     config: base_utils.StudyConfig,
     manifest: study_manifest.StudyManifest,
     *,
+    db_parser: databases.DatabaseParser = None,
     continue_from: str | None = None,
+    file_list: list | None = None,
 ) -> list:
     """Creates tables in the schema by iterating through the sql_config.file_names
 
     :param config: a StudyConfig object
     :param manifest: a StudyManifest object
-    :keyword continue_from: Name of a sql file to resume table creation from
+    :keyword continue_from: Name of a file to resume table creation from
     :returns: loaded queries (for unit testing only)
     """
+    if file_list is None:
+        file_list = manifest.get_file_list(continue_from)
+    for file in file_list:
+        if file.endswith(".py"):
+            _load_and_execute_builder(
+                config=config,
+                manifest=manifest,
+                filename=file,
+                db_parser=db_parser,
+            )
+        elif file.endswith(".toml"):
+            _run_workflow(config=config, manifest=manifest, filename=file)
+        elif file.endswith(".sql"):
+            _run_raw_queries(config=config, manifest=manifest, filename=file)
+        else:
+            raise errors.StudyManifestParsingError
+
+
+def _run_raw_queries(
+    config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
+):
     queries = []
-    for file in manifest.get_sql_file_list(continue_from):
-        for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{file}")):
-            queries.append([query, file])
-    if len(queries) == 0:
-        return []
+    for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")):
+        queries.append([query, filename])
     for query in queries:
         query[0] = base_utils.update_query_if_schema_specified(query[0], manifest)
         query[0] = query[0].replace(
@@ -298,7 +272,7 @@ def build_study(
     # We want to only show a progress bar if we are :not: printing SQL lines
     with base_utils.get_progress_bar(disable=config.verbose) as progress:
         task = progress.add_task(
-            f"Creating {manifest.get_study_prefix()} study in db...",
+            f"Building tables from {filename}...",
             total=len(queries),
             visible=not config.verbose,
         )

diff --git a/cumulus_library/builders/protected_table_builder.py b/cumulus_library/builders/protected_table_builder.py
@@ -1,5 +1,8 @@
 """Builder for creating tables for tracking state/logging changes"""
 
+import pathlib
+import tomllib
+
 from cumulus_library import (
     BaseTableBuilder,
     base_utils,
@@ -64,12 +67,25 @@ def prepare_queries(
                 TRANSACTION_COLS_TYPES,
             )
         )
-        if manifest._study_config.get("statistics_config"):
-            self.queries.append(
-                base_templates.get_ctas_empty_query(
-                    db_schema,
-                    statistics,
-                    STATISTICS_COLS,
-                    STATISTICS_COLS_TYPES,
-                )
-            )
+        files = manifest.get_all_workflows()
+        if len(files) == 0:
+            return
+        stats_types = set(item.value for item in enums.StatisticsTypes)
+        # In this loop, we are just checking to see if :any: workflow is a stats
+        # type workflow - if so, we'll create a table to hold data of stats runs
+        # (if it doesn't already exist) outside of the study lifecycle for
+        # persistence reasons
+        for file in files:
+            toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
+            with open(toml_path, "rb") as file:
+                workflow_config = tomllib.load(file)
+                if workflow_config["config_type"] in stats_types:
+                    self.queries.append(
+                        base_templates.get_ctas_empty_query(
+                            db_schema,
+                            statistics,
+                            STATISTICS_COLS,
+                            STATISTICS_COLS_TYPES,
+                        )
+                    )
+                    return
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
@@ -113,7 +113,6 @@ def clean_and_build_study(
                     config=self.get_config(manifest),
                     manifest=manifest,
                 )
-                builder.run_table_builder(config=self.get_config(manifest), manifest=manifest)
 
             else:
                 log_utils.log_transaction(
@@ -127,11 +126,6 @@ def clean_and_build_study(
                 manifest=manifest,
                 continue_from=continue_from,
             )
-            builder.run_counts_builders(config=self.get_config(manifest), manifest=manifest)
-            builder.run_statistics_builders(
-                config=self.get_config(manifest),
-                manifest=manifest,
-            )
             log_utils.log_transaction(
                 config=self.get_config(manifest),
                 manifest=manifest,
@@ -150,7 +144,7 @@ def clean_and_build_study(
             )
             raise e
 
-    def run_matching_table_builder(
+    def build_matching_files(
         self,
         target: pathlib.Path,
         table_builder_name: str,
@@ -164,7 +158,7 @@ def run_matching_table_builder(
         :param options: The dictionary of study-specific options
         """
         manifest = study_manifest.StudyManifest(target, options=options)
-        builder.run_matching_table_builder(
+        builder.build_matching_files(
             config=self.get_config(manifest),
             manifest=manifest,
             builder=table_builder_name,
@@ -330,7 +324,7 @@ def run_cli(args: dict):
             elif args["action"] == "build":
                 for target in args["target"]:
                     if args["builder"]:
-                        runner.run_matching_table_builder(
+                        runner.build_matching_files(
                             study_dict[target], args["builder"], options=args["options"]
                         )
                     else:

diff --git a/cumulus_library/enums.py b/cumulus_library/enums.py
@@ -18,6 +18,12 @@ class ProtectedTables(enum.Enum):
     TRANSACTIONS = "lib_transactions"
 
 
+class StatisticsTypes(enum.Enum):
+    """A subset of workflows that create statistics sampling artifacts"""
+
+    PSM = "psm"
+
+
 class LogStatuses(enum.Enum):
     DEBUG = "debug"
     ERROR = "error"

diff --git a/cumulus_library/studies/core/manifest.toml b/cumulus_library/studies/core/manifest.toml
@@ -1,6 +1,6 @@
 study_prefix = "core"
 
-[table_builder_config]
+[file_config]
 file_names = [
     "builder_prereq_tables.py",
     "builder_allergyintolerance.py",
@@ -9,17 +9,9 @@ file_names = [
     "builder_encounter.py",
     "builder_documentreference.py",
     "builder_medicationrequest.py",
-    "builder_observation.py"
-]
-
-[sql_config]
-file_names = [
+    "builder_observation.py",
     "observation_type.sql",
     "meta_date.sql",
-]
-
-[counts_builder_config]
-file_names = [
     "count_core.py"
 ]
 

diff --git a/cumulus_library/studies/discovery/manifest.toml b/cumulus_library/studies/discovery/manifest.toml
@@ -1,6 +1,6 @@
 study_prefix = "discovery"
 
-[table_builder_config]
+[file_config]
 file_names = [
     "code_detection.py",
 ]