Ruff update #272

Merged: 2 commits, Jul 29, 2024
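Most of this diff is mechanical: the Ruff pre-commit hook moves from v0.4.4 to v0.5.5, and `ruff format` under the new version reflows the Python sources, joining calls onto single lines (the wider results suggest the project's line-length limit was also raised, though that config isn't shown here) and adding `# noqa` markers for rules the upgrade newly flags. A bump like this is typically generated rather than hand-edited; a sketch of the usual command, with the `--repo` filter assumed rather than confirmed by the PR:

pre-commit autoupdate --repo https://github.com/astral-sh/ruff-pre-commit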
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
 default_install_hook_types: [pre-commit, pre-push]
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.4  # if you update this, also update pyproject.toml
+    rev: v0.5.5  # if you update this, also update pyproject.toml
     hooks:
       - name: Ruff formatting
         id: ruff-format
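The inline comment asks for the Ruff pin in pyproject.toml to be kept in sync with this hook revision. A minimal sketch of what the matching entry might look like, assuming Ruff is pinned as a dev extra; the table name and pin style are assumptions, since pyproject.toml isn't part of this diff:

[project.optional-dependencies]
dev = [
    "ruff == 0.5.5",  # if you update this, also update .pre-commit-config.yaml
]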
14 changes: 4 additions & 10 deletions cumulus_library/actions/builder.py
@@ -94,9 +94,7 @@ def _load_and_execute_builder(
     table_builder_class = table_builder_subclasses[0]
     table_builder = table_builder_class()
     if write_reference_sql:
-        table_builder.prepare_queries(
-            config=config, manifest=manifest, parser=db_parser
-        )
+        table_builder.prepare_queries(config=config, manifest=manifest, parser=db_parser)
         table_builder.comment_queries(doc_str=doc_str)
         new_filename = pathlib.Path(f"{filename}").stem + ".sql"
         table_builder.write_queries(
@@ -191,7 +189,7 @@ def run_statistics_builders(
     existing_stats = (
         config.db.cursor()
         .execute(
-            "SELECT view_name FROM "
+            "SELECT view_name FROM "  # noqa: S608
             f"{manifest.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}"
         )
         .fetchall()
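S608 is Ruff's flake8-bandit check for SQL assembled through string formatting. The `# noqa: S608` markers added throughout this PR suppress it rather than restructure the queries, presumably because the interpolated pieces are study prefixes from the manifest rather than user input, and table names can't be bound as parameters anyway. For contrast, a minimal sketch of the parameterized form the rule prefers, using stdlib sqlite3 as a stand-in (the table and values are illustrative, not from the PR):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE demo__lib_statistics (view_name TEXT, study_name TEXT)")
conn.execute("INSERT INTO demo__lib_statistics VALUES (?, ?)", ("demo__view_1", "demo"))
# Binding the value with a placeholder keeps it out of the SQL text, so S608
# has nothing to flag; identifiers like table names still have to be
# interpolated, which is what the noqa markers above acknowledge.
rows = conn.execute(
    "SELECT view_name FROM demo__lib_statistics WHERE study_name = ?", ("demo",)
).fetchall()
print(rows)  # [('demo__view_1',)]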
@@ -273,9 +271,7 @@ def build_study(
     """
     queries = []
     for file in manifest.get_sql_file_list(continue_from):
-        for query in base_utils.parse_sql(
-            base_utils.load_text(f"{manifest._study_path}/{file}")
-        ):
+        for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{file}")):
             queries.append([query, file])
     if len(queries) == 0:
         return []
@@ -377,9 +373,7 @@ def _execute_build_queries(
                 "start with a string like `study_prefix__`.",
             )
         try:
-            with base_utils.query_console_output(
-                config.verbose, query[0], progress, task
-            ):
+            with base_utils.query_console_output(config.verbose, query[0], progress, task):
                 cursor.execute(query[0])
         except Exception as e:  # pylint: disable=broad-exception-caught
             _query_error(
6 changes: 2 additions & 4 deletions cumulus_library/actions/cleaner.py
@@ -57,7 +57,7 @@ def get_unprotected_stats_view_table(
     protected_list = cursor.execute(
         f"""SELECT {artifact_type.lower()}_name
         FROM {drop_prefix}{enums.ProtectedTables.STATISTICS.value}
-        WHERE study_name = '{display_prefix}'"""
+        WHERE study_name = '{display_prefix}'"""  # noqa: S608
     ).fetchall()
     for protected_tuple in protected_list:
         if protected_tuple in db_contents:
@@ -66,9 +66,7 @@ def get_unprotected_stats_view_table(
         # this check handles athena reporting views as also being tables,
         # so we don't waste time dropping things that don't exist
         if artifact_type == "TABLE":
-            if not any(
-                db_row_tuple[0] in iter_q_and_t for iter_q_and_t in unprotected_list
-            ):
+            if not any(db_row_tuple[0] in iter_q_and_t for iter_q_and_t in unprotected_list):
                 unprotected_list.append([db_row_tuple[0], artifact_type])
         else:
             unprotected_list.append([db_row_tuple[0], artifact_type])
10 changes: 3 additions & 7 deletions cumulus_library/actions/exporter.py
@@ -27,9 +27,7 @@ def reset_counts_exports(
 def _write_chunk(writer, chunk, arrow_schema):
     writer.write(
         pyarrow.Table.from_pandas(
-            chunk.sort_values(
-                by=list(chunk.columns), ascending=False, na_position="first"
-            ),
+            chunk.sort_values(by=list(chunk.columns), ascending=False, na_position="first"),
             preserve_index=False,
             schema=arrow_schema,
         )
@@ -70,11 +68,9 @@ def export_study(
         table_list,
         description=f"Exporting {manifest.get_study_prefix()} data...",
     ):
-        query = f"SELECT * FROM {table}"
+        query = f"SELECT * FROM {table}"  # noqa: S608
         query = base_utils.update_query_if_schema_specified(query, manifest)
-        dataframe_chunks, db_schema = config.db.execute_as_pandas(
-            query, chunksize=chunksize
-        )
+        dataframe_chunks, db_schema = config.db.execute_as_pandas(query, chunksize=chunksize)
         path.mkdir(parents=True, exist_ok=True)
         arrow_schema = pyarrow.schema(config.db.col_pyarrow_types_from_sql(db_schema))
         with parquet.ParquetWriter(f"{path}/{table}.parquet", arrow_schema) as p_writer:
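For reference, a self-contained sketch of the chunk-by-chunk Parquet writing pattern this hunk reformats, with a toy DataFrame standing in for the database results; `write_table` is the long-standing pyarrow API, and the diff's `writer.write(...)` accepts a Table equivalently in recent pyarrow:

import pandas
import pyarrow
from pyarrow import parquet

df = pandas.DataFrame({"id": [3, 1, 2], "value": ["c", "a", "b"]})
arrow_schema = pyarrow.Schema.from_pandas(df, preserve_index=False)
with parquet.ParquetWriter("example.parquet", arrow_schema) as writer:
    for start in range(0, len(df), 2):  # stand-in for execute_as_pandas chunking
        chunk = df.iloc[start : start + 2]
        # Mirror _write_chunk above: sort, then append the chunk as a row group.
        chunk = chunk.sort_values(by=list(chunk.columns), ascending=False, na_position="first")
        writer.write_table(pyarrow.Table.from_pandas(chunk, preserve_index=False, schema=arrow_schema))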
12 changes: 3 additions & 9 deletions cumulus_library/actions/importer.py
@@ -11,9 +11,7 @@

 def _create_table_from_parquet(archive, file, study_name, config):
     try:
-        parquet_path = pathlib.Path(
-            archive.extract(file), path=tempfile.TemporaryFile()
-        )
+        parquet_path = pathlib.Path(archive.extract(file), path=tempfile.TemporaryFile())
         # While convenient to access, this exposes us to panda's type system,
         # which is messy - this could be optionally be replaced by pyarrow if it
         # becomes problematic.
@@ -56,13 +54,9 @@ def import_archive(config: base_utils.StudyConfig, *, archive_path: pathlib.Path
             files = archive.namelist()
             files = [file for file in files if file.endswith(".parquet")]
     except zipfile.BadZipFile as e:
-        raise errors.StudyImportError(
-            f"File {archive_path} is not a valid archive."
-        ) from e
+        raise errors.StudyImportError(f"File {archive_path} is not a valid archive.") from e
     if not any("__" in file for file in files):
-        raise errors.StudyImportError(
-            f"File {archive_path} contains non-study parquet files."
-        )
+        raise errors.StudyImportError(f"File {archive_path} contains non-study parquet files.")
     study_name = files[0].split("__")[0]
     for file in files[1:]:
         if file.split("__")[0] != study_name:
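The validation above depends on the `study__table.parquet` naming convention: every parquet file in the archive must share the prefix before the double underscore. A small, self-contained sketch of that check (file names are illustrative):

files = ["core__patient.parquet", "core__meta_version.parquet"]
study_name = files[0].split("__")[0]
# A file from a different study would make this fail, mirroring the
# StudyImportError raised above.
assert all(file.split("__")[0] == study_name for file in files)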
7 changes: 2 additions & 5 deletions cumulus_library/actions/uploader.py
@@ -65,8 +65,7 @@ def upload_files(args: dict):
     """Wrapper to prep files & console output"""
     if args["data_path"] is None:
         sys.exit(
-            "No data directory provided - please provide a path to your"
-            "study export folder."
+            "No data directory provided - please provide a path to your" "study export folder."
         )
     file_paths = list(args["data_path"].glob("**/*.parquet"))
     if args["target"]:
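Worth noting: the formatter merged two implicitly concatenated string literals onto one line but, correctly, did not alter their contents, so a pre-existing missing space survives; the message still renders as "...a path to yourstudy export folder." A quick demonstration:

message = "No data directory provided - please provide a path to your" "study export folder."
assert "yourstudy" in message  # adjacent literals join with no separator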
@@ -79,9 +78,7 @@ def upload_files(args: dict):
     if not args["user"] or not args["id"]:
         sys.exit("user/id not provided, please pass --user and --id")
     try:
-        meta_version = next(
-            filter(lambda x: str(x).endswith("__meta_version.parquet"), file_paths)
-        )
+        meta_version = next(filter(lambda x: str(x).endswith("__meta_version.parquet"), file_paths))
         version = str(read_parquet(meta_version)["data_package_version"][0])
         file_paths.remove(meta_version)
     except StopIteration:
13 changes: 4 additions & 9 deletions cumulus_library/apis/umls.py
@@ -99,9 +99,7 @@ def get_vsac_valuesets(
         included_records = all_responses[0].get("compose", {}).get("include", [])
         for record in included_records:
             if "valueSet" in record:
-                valueset = self.get_vsac_valuesets(
-                    action=action, url=record["valueSet"][0]
-                )
+                valueset = self.get_vsac_valuesets(action=action, url=record["valueSet"][0])
                 all_responses.append(valueset[0])
         return all_responses

@@ -143,24 +141,21 @@ def download_umls_files(
                 "url": file_meta["downloadUrl"],
                 "apiKey": self.api_key,
             }
-            download_res = requests.get(
+            download_res = requests.get(  # noqa: S113
                 "https://uts-ws.nlm.nih.gov/download", params=download_payload, stream=True
             )

             with open(path / file_meta["fileName"], "wb") as f:
                 chunks_read = 0
                 with base_utils.get_progress_bar() as progress:
-                    task = progress.add_task(
-                        f"Downloading {file_meta['fileName']}", total=None
-                    )
+                    task = progress.add_task(f"Downloading {file_meta['fileName']}", total=None)
                     for chunk in download_res.iter_content(chunk_size=1024):
                         f.write(chunk)
                         chunks_read += 1
                         progress.update(
                             task,
                             description=(
-                                f"Downloading {file_meta['fileName']}: "
-                                f"{chunks_read/1000} MB"
+                                f"Downloading {file_meta['fileName']}: " f"{chunks_read/1000} MB"
                             ),
                         )
             if unzip:
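S113 flags `requests` calls made without a timeout, which can hang indefinitely on a stalled connection. The PR suppresses it for this streaming download; a sketch of the alternative, passing an explicit timeout (the endpoint is real per the diff, but the payload and the 30-second value are illustrative assumptions):

import requests

download_res = requests.get(
    "https://uts-ws.nlm.nih.gov/download",
    params={"url": "https://example.com/file.zip", "apiKey": "your-api-key"},
    stream=True,
    timeout=30,  # seconds before raising requests.exceptions.Timeout
)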
18 changes: 5 additions & 13 deletions cumulus_library/base_table_builder.py
@@ -56,17 +56,13 @@ def execute_queries(
         table_names = []
         for query in self.queries:
             # Get the first non-whitespace word after create table
-            table_name = re.search(
-                '(?i)(?<=create table )(([a-zA-Z0-9_".-]+))', query
-            )
+            table_name = re.search('(?i)(?<=create table )(([a-zA-Z0-9_".-]+))', query)

             if table_name:
                 if table_name[0] == "IF":
                     # Edge case - if we're doing an empty conditional CTAS creation,
                     # we need to run a slightly different regex
-                    table_name = re.search(
-                        '(?i)(?<=not exists )(([a-zA-Z0-9_".-]+))', query
-                    )
+                    table_name = re.search('(?i)(?<=not exists )(([a-zA-Z0-9_".-]+))', query)

                 table_name = table_name[0]
                 table_names.append(table_name)
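The lookbehind regex being reflowed here pulls the first token after `create table` (or, in the conditional branch, after `not exists`), case-insensitively. A quick check of its behavior on a conditional CTAS statement:

import re

query = 'CREATE TABLE IF NOT EXISTS my_study__table_1 AS SELECT 1'
table_name = re.search('(?i)(?<=create table )(([a-zA-Z0-9_".-]+))', query)
if table_name[0] == "IF":
    # Conditional creation: fall back to the second pattern, as the code above does
    table_name = re.search('(?i)(?<=not exists )(([a-zA-Z0-9_".-]+))', query)
print(table_name[0])  # my_study__table_1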
@@ -81,20 +77,16 @@ def execute_queries(
         for query in self.queries:
             try:
                 query = base_utils.update_query_if_schema_specified(query, manifest)
-                with base_utils.query_console_output(
-                    config.verbose, query, progress, task
-                ):
+                with base_utils.query_console_output(config.verbose, query, progress, task):
                     cursor.execute(query)
             except Exception as e:  # pylint: disable=broad-exception-caught
                 sys.exit(
-                    "An error occured executing this query:\n----\n"
-                    f"{query}\n----\n"
-                    f"{e}"
+                    "An error occured executing this query:\n----\n" f"{query}\n----\n" f"{e}"
                 )

         self.post_execution(config, *args, **kwargs)

-    def post_execution(  # noqa: B027 - this looks like, but is not, an abstract method
+    def post_execution(
         self,
         config: base_utils.StudyConfig,
         *args,
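The dropped comment referenced B027, the flake8-bugbear rule that flags an empty, non-abstract method on an abstract base class; its removal suggests the suppression is no longer needed under Ruff 0.5.x, though the PR doesn't say why. A minimal sketch of the pattern the rule targets (illustrative, not the actual class):

import abc

class Builder(abc.ABC):
    @abc.abstractmethod
    def prepare_queries(self) -> None: ...

    def post_execution(self) -> None:  # B027 would flag this empty, overridable hook
        """Optional hook; subclasses may override but are not required to."""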
4 changes: 1 addition & 3 deletions cumulus_library/base_utils.py
@@ -124,9 +124,7 @@ def zip_dir(read_path, write_path, archive_name):
     shutil.rmtree(read_path)


-def update_query_if_schema_specified(
-    query: str, manifest: study_manifest.StudyManifest
-):
+def update_query_if_schema_specified(query: str, manifest: study_manifest.StudyManifest):
     if manifest and manifest.get_dedicated_schema():
         # External queries in athena require a schema to be specified already, so
         # rather than splitting and ending up with a table name like
36 changes: 9 additions & 27 deletions cumulus_library/cli.py
@@ -98,9 +98,7 @@ def clean_and_build_study(
         """
         manifest = study_manifest.StudyManifest(target, self.data_path)
         try:
-            builder.run_protected_table_builder(
-                config=self.get_config(manifest), manifest=manifest
-            )
+            builder.run_protected_table_builder(config=self.get_config(manifest), manifest=manifest)
             if not continue_from:
                 log_utils.log_transaction(
                     config=self.get_config(manifest),
@@ -111,9 +109,7 @@ def clean_and_build_study(
                     config=self.get_config(manifest),
                     manifest=manifest,
                 )
-                builder.run_table_builder(
-                    config=self.get_config(manifest), manifest=manifest
-                )
+                builder.run_table_builder(config=self.get_config(manifest), manifest=manifest)

             else:
                 log_utils.log_transaction(
@@ -127,9 +123,7 @@ def clean_and_build_study(
                     manifest=manifest,
                     continue_from=continue_from,
                 )
-                builder.run_counts_builders(
-                    config=self.get_config(manifest), manifest=manifest
-                )
+                builder.run_counts_builders(config=self.get_config(manifest), manifest=manifest)
             builder.run_statistics_builders(
                 config=self.get_config(manifest),
                 manifest=manifest,
@@ -170,9 +164,7 @@ def run_matching_table_builder(
         )

     ### Data exporters
-    def export_study(
-        self, target: pathlib.Path, data_path: pathlib.Path, archive: bool
-    ) -> None:
+    def export_study(self, target: pathlib.Path, data_path: pathlib.Path, archive: bool) -> None:
         """Exports aggregates defined in a manifest

         :param target: A path to the study directory
@@ -199,9 +191,7 @@ def generate_study_sql(
         :keyword builder: Specify a single builder to generate sql from
         """
         manifest = study_manifest.StudyManifest(target)
-        file_generator.run_generate_sql(
-            config=self.get_config(manifest), manifest=manifest
-        )
+        file_generator.run_generate_sql(config=self.get_config(manifest), manifest=manifest)

     def generate_study_markdown(
         self,
@@ -212,9 +202,7 @@ def generate_study_markdown(
         :param target: A path to the study directory
         """
         manifest = study_manifest.StudyManifest(target)
-        file_generator.run_generate_markdown(
-            config=self.get_config(manifest), manifest=manifest
-        )
+        file_generator.run_generate_markdown(config=self.get_config(manifest), manifest=manifest)


 def get_abs_path(path: str) -> pathlib.Path:
@@ -324,9 +312,7 @@ def run_cli(args: dict):
     elif args["action"] == "build":
         for target in args["target"]:
             if args["builder"]:
-                runner.run_matching_table_builder(
-                    study_dict[target], args["builder"]
-                )
+                runner.run_matching_table_builder(study_dict[target], args["builder"])
             else:
                 runner.clean_and_build_study(
                     study_dict[target],
@@ -350,9 +336,7 @@ def run_cli(args: dict):
             if response.lower() != "y":
                 sys.exit()
             for target in args["target"]:
-                runner.export_study(
-                    study_dict[target], args["data_path"], args["archive"]
-                )
+                runner.export_study(study_dict[target], args["data_path"], args["archive"])

     elif args["action"] == "import":
         for archive in args["archive_path"]:
@@ -361,9 +345,7 @@ def run_cli(args: dict):

     elif args["action"] == "generate-sql":
         for target in args["target"]:
-            runner.generate_study_sql(
-                study_dict[target], builder=args["builder"]
-            )
+            runner.generate_study_sql(study_dict[target], builder=args["builder"])

     elif args["action"] == "generate-md":
         for target in args["target"]:
Expand Down
20 changes: 5 additions & 15 deletions cumulus_library/cli_parser.py
@@ -228,9 +228,7 @@ def get_parser() -> argparse.ArgumentParser:

     # Database export

-    export = actions.add_parser(
-        "export", help="Generates files on disk from Athena tables/views"
-    )
+    export = actions.add_parser("export", help="Generates files on disk from Athena tables/views")
     add_custom_option(export)
     add_target_argument(export)
     add_study_dir_argument(export)
@@ -245,9 +243,7 @@ def get_parser() -> argparse.ArgumentParser:

     # Database import

-    importer = actions.add_parser(
-        "import", help="Recreates a study from an exported archive"
-    )
+    importer = actions.add_parser("import", help="Recreates a study from an exported archive")
     add_db_config(importer)
     add_verbose_argument(importer)
     importer.add_argument(
@@ -258,15 +254,11 @@ def get_parser() -> argparse.ArgumentParser:
     )
     # Aggregator upload

-    upload = actions.add_parser(
-        "upload", help="Bulk uploads data to Cumulus aggregator"
-    )
+    upload = actions.add_parser("upload", help="Bulk uploads data to Cumulus aggregator")
     add_data_path_argument(upload)
     add_target_argument(upload)

-    upload.add_argument(
-        "--id", help="Site ID. Default is value of CUMULUS_AGGREGATOR_ID"
-    )
+    upload.add_argument("--id", help="Site ID. Default is value of CUMULUS_AGGREGATOR_ID")
     upload.add_argument(
         "--preview",
         default=False,
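The `--id` and `--user` help strings describe environment-variable fallbacks. A sketch of one common way to wire that up with argparse; how this CLI actually resolves the default happens elsewhere and is an assumption here:

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--id", help="Site ID. Default is value of CUMULUS_AGGREGATOR_ID")
args = vars(parser.parse_args([]))
site_id = args["id"] or os.environ.get("CUMULUS_AGGREGATOR_ID")  # flag wins over env var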
@@ -281,9 +273,7 @@ def get_parser() -> argparse.ArgumentParser:
         ),
         default="https://aggregator.smartcumulus.org/upload/",
     )
-    upload.add_argument(
-        "--user", help="Cumulus user. Default is value of CUMULUS_AGGREGATOR_USER"
-    )
+    upload.add_argument("--user", help="Cumulus user. Default is value of CUMULUS_AGGREGATOR_USER")

     # Generate a study's template-driven sql
