From 29181f0bd484e420830626a7f8eafb91df77e52d Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Tue, 6 Feb 2024 13:58:21 -0500 Subject: [PATCH] Ruff --- .github/pull_request_template.md | 1 - .github/workflows/ci.yaml | 4 +- .pre-commit-config.yaml | 15 ++-- cumulus_library/base_table_builder.py | 13 ++-- cumulus_library/base_utils.py | 14 ++-- cumulus_library/cli.py | 25 +++--- cumulus_library/cli_parser.py | 18 +++-- cumulus_library/databases.py | 41 ++++++---- cumulus_library/parsers/fhir_valueset.py | 16 ++-- cumulus_library/schema/columns.py | 17 ++-- cumulus_library/schema/valueset.py | 3 +- cumulus_library/statistics/counts.py | 22 +++--- cumulus_library/statistics/psm.py | 41 +++++----- .../studies/core/builder_condition.py | 4 +- .../studies/core/builder_documentreference.py | 7 +- .../studies/core/builder_encounter.py | 4 +- .../studies/core/builder_medication.py | 2 +- .../studies/core/builder_medicationrequest.py | 5 +- .../studies/core/builder_observation.py | 5 +- .../studies/core/builder_patient.py | 5 +- .../core/core_templates/core_templates.py | 5 +- cumulus_library/studies/core/count_core.py | 8 +- .../studies/discovery/code_detection.py | 23 +++--- .../studies/vocab/vocab_icd_builder.py | 3 +- cumulus_library/study_parser.py | 77 +++++++++---------- .../template_sql/base_templates.py | 31 ++++---- cumulus_library/template_sql/sql_utils.py | 14 ++-- .../statistics/counts_templates.py | 9 +-- .../template_sql/statistics/psm_templates.py | 21 ++--- cumulus_library/upload.py | 2 - pyproject.toml | 25 +++++- tests/conftest.py | 18 ++--- tests/regression/run_regression.py | 7 +- tests/test_cli.py | 48 +++++++----- tests/test_conftest.py | 17 ++-- tests/test_core.py | 8 +- tests/test_duckdb.py | 3 +- tests/test_parsers.py | 4 +- tests/test_psm.py | 1 - tests/test_psm_templates.py | 2 +- tests/test_study_parser.py | 7 +- tests/test_template_sql_utils.py | 4 +- tests/test_templates.py | 1 - 43 files changed, 307 insertions(+), 293 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 73705402..c8a451b1 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -2,5 +2,4 @@ ### Checklist - [ ] Consider if documentation (like in `docs/`) needs to be updated - [ ] Consider if tests should be added -- [ ] Run pylint if you're making changes beyond adding studies - [ ] Update template repo if there are changes to study configuration \ No newline at end of file diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 24d970c4..2c017949 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -47,10 +47,10 @@ jobs: - name: Run sqlfluff on jinja templates run: | sqlfluff lint - - name: Run black + - name: Run ruff if: success() || failure() # still run black if above checks fails run: | - black --check --verbose . + ruff regression: runs-on: ubuntu-22.04 permissions: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 71f0d9cc..84a4db32 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,15 +1,10 @@ repos: - - repo: https://github.com/psf/black - #this version is synced with the black mentioned in .github/workflows/ci.yml - rev: 23.10.0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.1 hooks: - - id: black - entry: bash -c 'black "$@"; git add -u' -- - # It is recommended to specify the latest version of Python - # supported by your project here, or alternatively use - # pre-commit's default_language_version, see - # https://pre-commit.com/#top_level-default_language_version - language_version: python3.9 + - id: ruff + args: [--fix] + - id: ruff-format - repo: https://github.com/sqlfluff/sqlfluff rev: 2.3.4 diff --git a/cumulus_library/base_table_builder.py b/cumulus_library/base_table_builder.py index 66646500..0786bf4a 100644 --- a/cumulus_library/base_table_builder.py +++ b/cumulus_library/base_table_builder.py @@ -3,12 +3,11 @@ import pathlib import re import sys - from abc import ABC, abstractmethod from typing import final -from cumulus_library.databases import DatabaseCursor from cumulus_library import base_utils +from cumulus_library.databases import DatabaseCursor class BaseTableBuilder(ABC): @@ -70,8 +69,8 @@ def execute_queries( ) table_name = table_name[0] - # if it contains a schema, remove it (usually it won't, but some CTAS - # forms may) + # if it contains a schema, remove it (usually it won't, but some + # CTAS forms may) if "." in table_name: table_name = table_name.split(".")[1].replace('"', "") table_names.append(table_name) @@ -94,7 +93,7 @@ def execute_queries( self.post_execution(cursor, schema, verbose, drop_table, *args, **kwargs) - def post_execution( + def post_execution( # noqa: B027 - this looks like, but is not, an abstract method self, cursor: DatabaseCursor, schema: str, @@ -122,7 +121,9 @@ def comment_queries(self, doc_str=None): commented_queries.pop() self.queries = commented_queries - def write_queries(self, path: pathlib.Path = pathlib.Path.cwd() / "output.sql"): + def write_queries(self, path: pathlib.Path | None = None): + if path is None: + path = pathlib.Path.cwd() / "output.sql" """writes all queries constructed by prepare_queries to disk""" path.parents[0].mkdir(parents=True, exist_ok=True) with open(path, "w", encoding="utf-8") as file: diff --git a/cumulus_library/base_utils.py b/cumulus_library/base_utils.py index 84fc044e..213b9880 100644 --- a/cumulus_library/base_utils.py +++ b/cumulus_library/base_utils.py @@ -1,11 +1,9 @@ """ Collection of small commonly used utility functions """ import datetime -import os import json - +import os from contextlib import contextmanager -from typing import List from rich import progress @@ -15,16 +13,16 @@ def filepath(filename: str) -> str: def load_text(path: str) -> str: - with open(path, "r", encoding="UTF-8") as fp: + with open(path, encoding="UTF-8") as fp: return fp.read() def load_json(path: str) -> dict: - with open(path, "r", encoding="UTF-8") as fp: + with open(path, encoding="UTF-8") as fp: return json.load(fp) -def parse_sql(sql_text: str) -> List[str]: +def parse_sql(sql_text: str) -> list[str]: commands = [] for statement in sql_text.split(";"): @@ -36,11 +34,11 @@ def parse_sql(sql_text: str) -> List[str]: return filter_strip(commands) -def filter_strip(commands) -> List[str]: +def filter_strip(commands) -> list[str]: return list(filter(None, [c.strip() for c in commands])) -def list_coding(code_display: dict, system=None) -> List[dict]: +def list_coding(code_display: dict, system=None) -> list[dict]: as_list = [] for code, display in code_display.items(): if system: diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index c5465649..6c3ec07f 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -7,18 +7,15 @@ import sys import sysconfig - -from typing import Dict, List, Optional - import rich from cumulus_library import ( __version__, + base_utils, cli_parser, databases, enums, errors, - base_utils, protected_table_builder, study_parser, upload, @@ -60,8 +57,8 @@ def update_transactions(self, prefix: str, status: str): def clean_study( self, - targets: List[str], - study_dict: Dict, + targets: list[str], + study_dict: dict, *, stats_clean: bool, prefix: bool = False, @@ -106,7 +103,7 @@ def clean_and_build_study( target: pathlib.Path, *, stats_build: bool, - continue_from: str = None, + continue_from: str | None = None, ) -> None: """Recreates study views/tables @@ -176,7 +173,7 @@ def run_single_table_builder( parser=self.db.parser(), ) - def clean_and_build_all(self, study_dict: Dict, stats_build: bool) -> None: + def clean_and_build_all(self, study_dict: dict, stats_build: bool) -> None: """Builds views for all studies. NOTE: By design, this method will always exclude the `template` study dir, @@ -206,7 +203,7 @@ def export_study(self, target: pathlib.Path, data_path: pathlib.Path) -> None: studyparser = study_parser.StudyManifestParser(target, data_path) studyparser.export_study(self.db, data_path) - def export_all(self, study_dict: Dict, data_path: pathlib.Path): + def export_all(self, study_dict: dict, data_path: pathlib.Path): """Exports all defined count tables to disk""" for key in study_dict.keys(): self.export_study(study_dict[key], data_path) @@ -254,7 +251,7 @@ def create_template(path: str) -> None: dest_path.write_bytes(source_path.read_bytes()) -def get_study_dict(alt_dir_paths: List) -> Optional[Dict[str, pathlib.Path]]: +def get_study_dict(alt_dir_paths: list) -> dict[str, pathlib.Path] | None: """Gets valid study targets from ./studies/, and any pip installed studies :returns: A list of Path objects @@ -264,11 +261,12 @@ def get_study_dict(alt_dir_paths: List) -> Optional[Dict[str, pathlib.Path]]: # first, we'll get any installed public studies with open( - pathlib.Path(cli_path, "./module_allowlist.json"), "r", encoding="utf-8" + pathlib.Path(cli_path, "./module_allowlist.json"), encoding="utf-8" ) as study_allowlist_json: study_allowlist = json.load(study_allowlist_json)["allowlist"] - site_packages_dir = sysconfig.get_path("purelib") + site_packages_dir = "".join(sysconfig.get_path("purelib")) for study, subdir in study_allowlist.items(): + print(site_packages_dir) study_path = pathlib.Path(site_packages_dir, subdir) if study_path.exists(): manifest_studies[study] = study_path @@ -295,7 +293,7 @@ def get_studies_by_manifest_path(path: pathlib.Path) -> dict: return manifest_paths -def run_cli(args: Dict): +def run_cli(args: dict): """Controls which library tasks are run based on CLI arguments""" if args["action"] == "create": create_template(args["create_dir"]) @@ -312,7 +310,6 @@ def run_cli(args: Dict): runner.verbose = True print("Testing connection to database...") runner.cursor.execute("SHOW DATABASES") - study_dict = get_study_dict(args["study_dir"]) if "prefix" not in args.keys(): if args["target"]: diff --git a/cumulus_library/cli_parser.py b/cumulus_library/cli_parser.py index 5604936b..7ecdddef 100644 --- a/cumulus_library/cli_parser.py +++ b/cumulus_library/cli_parser.py @@ -29,10 +29,10 @@ def add_study_dir_argument(parser: argparse.ArgumentParser) -> None: action="append", help=( "Optionally add one or more directories to look for study definitions in. " - "Default is in project directory and CUMULUS_LIBRARY_STUDY_DIR, if present, " - "followed by any supplied paths. Target, and all its subdirectories, " - "are checked for manifests. Overriding studies with the same namespace " - "supersede earlier ones." + "Default is in project directory and CUMULUS_LIBRARY_STUDY_DIR, " + "if present, followed by any supplied paths. Target, and all its " + "subdirectories, are checked for manifests. Overriding studies with the" + " same namespace supersede earlier ones." ), ) @@ -87,10 +87,14 @@ def add_db_config(parser: argparse.ArgumentParser) -> None: ) group.add_argument( "--database", - # In Athena, we use this as the schema_name (which is also called a Database in their UX). + # In Athena, we use this as the schema_name (which is also called a Database + # in their UX). + # # In DuckDB, we use this as the path to the filename to store tables. - # Since we started as an Athena-centric codebase, we mostly keep referring to this as - # name "schema_name". But to the user, both uses are still conceptually a "database". + # + # Since we started as an Athena-centric codebase, we mostly keep referring to + # this as name "schema_name". But to the user, both uses are still conceptually + # a "database". dest="schema_name", help="Database name (for Athena) or file (for DuckDB)", ) diff --git a/cumulus_library/databases.py b/cumulus_library/databases.py index 858a7156..17bc6ca5 100644 --- a/cumulus_library/databases.py +++ b/cumulus_library/databases.py @@ -14,7 +14,7 @@ import os import sys from pathlib import Path -from typing import Optional, Protocol, Union +from typing import Protocol import cumulus_fhir_support import duckdb @@ -31,13 +31,13 @@ class DatabaseCursor(Protocol): def execute(self, sql: str) -> None: pass - def fetchone(self) -> Optional[list]: + def fetchone(self) -> list | None: pass - def fetchmany(self, size: Optional[int]) -> Optional[list[list]]: + def fetchmany(self, size: int | None) -> list[list] | None: pass - def fetchall(self) -> Optional[list[list]]: + def fetchall(self) -> list[list] | None: pass @@ -131,6 +131,7 @@ def execute_as_pandas(self, sql: str) -> pandas.DataFrame: def parser(self) -> DatabaseParser: """Returns parser object for interrogating DB schemas""" + @abc.abstractmethod def close(self) -> None: """Clean up any resources necessary""" @@ -174,6 +175,9 @@ def execute_as_pandas(self, sql: str) -> pandas.DataFrame: def parser(self) -> DatabaseParser: return AthenaParser() + def close(self) -> None: + return self.connection.close() + class AthenaParser(DatabaseParser): def validate_table_schema( @@ -190,7 +194,7 @@ def __init__(self, db_file: str): super().__init__("main") self.connection = duckdb.connect(db_file) # Aliasing Athena's as_pandas to duckDB's df cast - setattr(duckdb.DuckDBPyConnection, "as_pandas", duckdb.DuckDBPyConnection.df) + duckdb.DuckDBPyConnection.as_pandas = duckdb.DuckDBPyConnection.df # Paper over some syntax differences between Athena and DuckDB self.connection.create_function( @@ -238,22 +242,24 @@ def __init__(self, db_file: str): ) def insert_tables(self, tables: dict[str, pyarrow.Table]) -> None: - """Ingests all ndjson data from a folder tree (often the output folder of Cumulus ETL)""" + """Ingests all ndjson data from a folder tree. + + This is often the output folder of Cumulus ETL""" for name, table in tables.items(): self.connection.register(name, table) @staticmethod def _compat_array_join( - value: Optional[list[Optional[str]]], delimiter: str - ) -> Optional[str]: + value: list[str | None] | None, delimiter: str + ) -> str | None: if value is None: return None return delimiter.join(v for v in value if v is not None) @staticmethod def _compat_date( - value: Union[str, datetime.datetime, datetime.date, None] - ) -> Optional[datetime.date]: + value: str | datetime.datetime | datetime.date | None, + ) -> datetime.date | None: if value is None: return None elif isinstance(value, str): @@ -266,14 +272,14 @@ def _compat_date( raise ValueError("Unexpected date() argument:", type(value), value) @staticmethod - def _compat_to_utf8(value: Optional[str]) -> Optional[datetime.date]: + def _compat_to_utf8(value: str | None) -> datetime.date | None: """See the create_function() call for to_utf8 for more background""" return value @staticmethod def _compat_from_iso8601_timestamp( - value: Optional[str], - ) -> Optional[datetime.datetime]: + value: str | None, + ) -> datetime.datetime | None: if value is None: return None @@ -329,12 +335,13 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]: """Loads a directory tree of raw ndjson into schema-ful tables. :param path: a directory path - :returns: dictionary of table names (like 'documentreference') to table data (with schema) + :returns: dictionary of table names (like 'documentreference') to table + data (with schema) """ all_tables = {} - # Manually specify the list of resources because we want to create each table even if the - # folder does not exist. + # Manually specify the list of resources because we want to create each table + # even if the folder does not exist. resources = [ "AllergyIntolerance", "Condition", @@ -364,7 +371,7 @@ def read_ndjson_dir(path: str) -> dict[str, pyarrow.Table]: # Read all ndjson directly into memory rows = [] for filename in filenames: - with open(filename, "r", encoding="utf8") as f: + with open(filename, encoding="utf8") as f: for line in f: rows.append(json.loads(line)) diff --git a/cumulus_library/parsers/fhir_valueset.py b/cumulus_library/parsers/fhir_valueset.py index a11eaccf..a16632cb 100644 --- a/cumulus_library/parsers/fhir_valueset.py +++ b/cumulus_library/parsers/fhir_valueset.py @@ -1,17 +1,15 @@ -import os -from typing import List - from fhirclient.models.coding import Coding from cumulus_library import base_utils from cumulus_library.template_sql import base_templates -def get_include_coding(valueset_json) -> List[Coding]: - """ - Obtain a list of Coding "concepts" from a ValueSet. +def get_include_coding(valueset_json) -> list[Coding]: + """Obtain a list of Coding "concepts" from a ValueSet. + This method currently supports only "include" of "concept" defined fields. - Not supported: recursive fetching of contained ValueSets, which requires UMLS API Key and Wget, etc. + Not supported: recursive fetching of contained ValueSets, which requires UMLS + API Key and Wget, etc. examples https://vsac.nlm.nih.gov/valueset/2.16.840.1.113762.1.4.1146.1629/expansion/Latest @@ -31,7 +29,7 @@ def get_include_coding(valueset_json) -> List[Coding]: return parsed -def create_view_sql(view_name: str, concept_list: List[Coding]) -> str: +def create_view_sql(view_name: str, concept_list: list[Coding]) -> str: """ :param view_name: like study__define_type :param concept_list: list of concepts to include in definition @@ -57,7 +55,7 @@ def create_view_sql(view_name: str, concept_list: List[Coding]) -> str: """ -def write_view_sql(view_name: str, concept_list: List[Coding]) -> None: +def write_view_sql(view_name: str, concept_list: list[Coding]) -> None: """Convenience wrapper for writing create_view_sql to disk""" with open(f"{view_name}.sql", "w") as fp: fp.write(create_view_sql(view_name, concept_list)) diff --git a/cumulus_library/schema/columns.py b/cumulus_library/schema/columns.py index 3810b6ab..38e6fece 100644 --- a/cumulus_library/schema/columns.py +++ b/cumulus_library/schema/columns.py @@ -1,10 +1,15 @@ # pylint: disable=W,C,R -from enum import Enum, EnumMeta -from cumulus_library.schema.typesystem import Datatypes, Coding, Vocab -from cumulus_library.schema.valueset import Gender, Race, Ethnicity -from cumulus_library.schema.valueset import DurationUnits -from cumulus_library.schema.valueset import EncounterCode -from cumulus_library.schema.valueset import ObservationInterpretationDetection +from enum import Enum + +from cumulus_library.schema.typesystem import Datatypes, Vocab +from cumulus_library.schema.valueset import ( + DurationUnits, + EncounterCode, + Ethnicity, + Gender, + ObservationInterpretationDetection, + Race, +) class ColumnEnum(Enum): diff --git a/cumulus_library/schema/valueset.py b/cumulus_library/schema/valueset.py index 0c157426..4c5ed990 100644 --- a/cumulus_library/schema/valueset.py +++ b/cumulus_library/schema/valueset.py @@ -1,6 +1,7 @@ # pylint: disable=W,C,R from enum import Enum -from cumulus_library.schema.typesystem import Coding, Vocab + +from cumulus_library.schema.typesystem import Coding ################################################################################ # FHIR ValueSets diff --git a/cumulus_library/statistics/counts.py b/cumulus_library/statistics/counts.py index 7b3adb04..0d0707f9 100644 --- a/cumulus_library/statistics/counts.py +++ b/cumulus_library/statistics/counts.py @@ -1,20 +1,18 @@ """Class for generating counts tables from templates""" import sys - from pathlib import Path -from typing import Union from cumulus_library.base_table_builder import BaseTableBuilder +from cumulus_library.errors import CountsBuilderError from cumulus_library.study_parser import StudyManifestParser from cumulus_library.template_sql.statistics import counts_templates -from cumulus_library.errors import CountsBuilderError class CountsBuilder(BaseTableBuilder): """Extends BaseTableBuilder for counts-related use cases""" - def __init__(self, study_prefix: str = None): + def __init__(self, study_prefix: str | None = None): if study_prefix is None: # This slightly wonky approach will give us the path of the # directory of a class extending the CountsBuilder @@ -44,7 +42,7 @@ def get_table_name(self, table_name: str, duration=None) -> str: return f"{self.study_prefix}__{table_name}" def get_where_clauses( - self, clause: Union[list, str, None] = None, min_subject: int = 10 + self, clause: list | str | None = None, min_subject: int = 10 ) -> str: """Convenience method for constructing arbitrary where clauses. @@ -101,7 +99,7 @@ def count_condition( table_name: str, source_table: str, table_cols: list, - where_clauses: Union[list, None] = None, + where_clauses: list | None = None, min_subject: int = 10, ) -> str: """wrapper method for constructing condition counts tables @@ -128,7 +126,7 @@ def count_documentreference( table_name: str, source_table: str, table_cols: list, - where_clauses: Union[list, None] = None, + where_clauses: list | None = None, min_subject: int = 10, ) -> str: """wrapper method for constructing documentreference counts tables @@ -155,7 +153,7 @@ def count_encounter( table_name: str, source_table: str, table_cols: list, - where_clauses: Union[list, None] = None, + where_clauses: list | None = None, min_subject: int = 10, ) -> str: """wrapper method for constructing encounter counts tables @@ -181,7 +179,7 @@ def count_medicationrequest( table_name: str, source_table: str, table_cols: list, - where_clauses: Union[list, None] = None, + where_clauses: list | None = None, min_subject: int = 10, ) -> str: """wrapper method for constructing medicationrequests counts tables @@ -207,7 +205,7 @@ def count_observation( table_name: str, source_table: str, table_cols: list, - where_clauses: Union[list, None] = None, + where_clauses: list | None = None, min_subject: int = 10, ) -> str: """wrapper method for constructing observation counts tables @@ -233,7 +231,7 @@ def count_patient( table_name: str, source_table: str, table_cols: list, - where_clauses: Union[list, None] = None, + where_clauses: list | None = None, min_subject: int = 10, ) -> str: """wrapper method for constructing patient counts tables @@ -266,7 +264,7 @@ def write_counts(self, filepath: str): self.comment_queries() self.write_queries(filename=filepath) - def prepare_queries(self, cursor: object = None, schema: str = None): + def prepare_queries(self, cursor: object | None = None, schema: str | None = None): """Stub implementing abstract base class This should be overridden in any count generator. See studies/core/count_core.py diff --git a/cumulus_library/statistics/psm.py b/cumulus_library/statistics/psm.py index 00a2f059..0e05b223 100644 --- a/cumulus_library/statistics/psm.py +++ b/cumulus_library/statistics/psm.py @@ -5,21 +5,18 @@ import pathlib import sys import warnings - from dataclasses import dataclass +import matplotlib.pyplot as plt import pandas +import seaborn as sns import toml - from psmpy import PsmPy # these imports are mimicing PsmPy imports for re-implemented functions from psmpy.functions import cohenD -import matplotlib.pyplot as plt -import seaborn as sns -from cumulus_library import databases -from cumulus_library import base_table_builder +from cumulus_library import base_table_builder, databases from cumulus_library.template_sql import base_templates from cumulus_library.template_sql.statistics import psm_templates @@ -86,7 +83,7 @@ def __init__(self, toml_config_path: str, data_path: pathlib.Path): ) except KeyError: sys.exit( - f"PSM configuration at {toml_config_path} contains missing/invalid keys." + f"PSM configuration {toml_config_path} contains missing/invalid keys." "Check the PSM documentation for an example config with more details:\n" "https://docs.smarthealthit.org/cumulus/library/statistics/propensity-score-matching.html" ) @@ -190,8 +187,8 @@ def psm_plot_match( Title="Side by side matched controls", Ylabel="Number of patients", Xlabel="propensity logit", - names=["positive_cohort", "negative_cohort"], - colors=["#E69F00", "#56B4E9"], + names=["positive_cohort", "negative_cohort"], # noqa: B006 + colors=["#E69F00", "#56B4E9"], # noqa: B006 save=True, filename="propensity_match.png", ): @@ -214,13 +211,13 @@ def psm_plot_match( plt.xlabel(Xlabel) plt.ylabel(Ylabel) plt.title(Title) - if save == True: + if save: plt.savefig(filename, dpi=250) def psm_effect_size_plot( self, psm, - title="Standardized Mean differences accross covariates before and after matching", + title="Standardized Mean differences accross covariates before and after matching", # noqa: E501 before_color="#FCB754", after_color="#3EC8FB", save=False, @@ -233,8 +230,8 @@ def psm_effect_size_plot( and passing in the psm object instead of assuming a call from inside the PsmPy class. """ - df_preds_after = psm.df_matched[[psm.treatment] + psm.xvars] - df_preds_b4 = psm.data[[psm.treatment] + psm.xvars] + df_preds_after = psm.df_matched[[psm.treatment] + psm.xvars] # noqa: RUF005 + df_preds_b4 = psm.data[[psm.treatment] + psm.xvars] # noqa: RUF005 df_preds_after_float = df_preds_after.astype(float) df_preds_b4_float = df_preds_b4.astype(float) @@ -255,7 +252,7 @@ def psm_effect_size_plot( orient="h", ) sns_plot.set(title=title) - if save == True: + if save: sns_plot.figure.savefig(filename, dpi=250, bbox_inches="tight") def generate_psm_analysis( @@ -271,7 +268,9 @@ def generate_psm_analysis( ).as_pandas() symptoms_dict = self._get_symptoms_dict(self.config.classification_json) for dependent_variable, codes in symptoms_dict.items(): - df[dependent_variable] = df["code"].apply(lambda x: 1 if x in codes else 0) + df[dependent_variable] = df["code"].apply( + lambda x, codes=codes: 1 if x in codes else 0 + ) df = df.drop(columns="code") # instance_count present but unused for PSM if table contains a count_ref input # (it's intended for manual review) @@ -315,8 +314,8 @@ def generate_psm_analysis( # mentioning workarounds for this behavior. with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) - # This function populates the psm.predicted_data element, which is required - # for things like the knn_matched() function call + # This function populates the psm.predicted_data element, which is + # required for things like the knn_matched() function call psm.logistic_ps(balance=True) # This function populates the psm.df_matched element psm.knn_matched( @@ -338,11 +337,13 @@ def generate_psm_analysis( ) except ZeroDivisionError: sys.exit( - "Encountered a divide by zero error during statistical graph generation. Try increasing your sample size." + "Encountered a divide by zero error during statistical graph " + "generation. Try increasing your sample size." ) except ValueError: sys.exit( - "Encountered a value error during KNN matching. Try increasing your sample size." + "Encountered a value error during KNN matching. Try increasing " + "your sample size." ) def prepare_queries(self, cursor: object, schema: str, table_suffix: str): @@ -354,7 +355,7 @@ def post_execution( schema: str, verbose: bool, drop_table: bool = False, - table_suffix: str = None, + table_suffix: str | None = None, ): # super().execute_queries(cursor, schema, verbose, drop_table) self.generate_psm_analysis(cursor, schema, table_suffix) diff --git a/cumulus_library/studies/core/builder_condition.py b/cumulus_library/studies/core/builder_condition.py index afdf28c5..1492f509 100644 --- a/cumulus_library/studies/core/builder_condition.py +++ b/cumulus_library/studies/core/builder_condition.py @@ -1,9 +1,7 @@ -from cumulus_library import base_table_builder -from cumulus_library import databases +from cumulus_library import base_table_builder, databases from cumulus_library.studies.core.core_templates import core_templates from cumulus_library.template_sql import base_templates, sql_utils - expected_table_cols = { "condition": { "category": [ diff --git a/cumulus_library/studies/core/builder_documentreference.py b/cumulus_library/studies/core/builder_documentreference.py index 22d047d6..82b33937 100644 --- a/cumulus_library/studies/core/builder_documentreference.py +++ b/cumulus_library/studies/core/builder_documentreference.py @@ -1,16 +1,13 @@ -from cumulus_library import base_table_builder -from cumulus_library import databases +from cumulus_library import base_table_builder, databases from cumulus_library.studies.core.core_templates import core_templates from cumulus_library.template_sql import sql_utils - expected_table_cols = { "documentreference": { "id": [], "type": [], "status": [], "docstatus": [], - "context": [], "subject": ["reference"], "context": ["period", "start"], } @@ -37,7 +34,7 @@ def prepare_queries( source_id="id", column_name="type", is_array=False, - target_table=f"core__documentreference_dn_type", + target_table="core__documentreference_dn_type", ) ], ) diff --git a/cumulus_library/studies/core/builder_encounter.py b/cumulus_library/studies/core/builder_encounter.py index 6f5595c7..cd3552cd 100644 --- a/cumulus_library/studies/core/builder_encounter.py +++ b/cumulus_library/studies/core/builder_encounter.py @@ -1,9 +1,7 @@ -from cumulus_library import base_table_builder -from cumulus_library import databases +from cumulus_library import base_table_builder, databases from cumulus_library.studies.core.core_templates import core_templates from cumulus_library.template_sql import sql_utils - expected_table_cols = { "encounter": { "status": [], diff --git a/cumulus_library/studies/core/builder_medication.py b/cumulus_library/studies/core/builder_medication.py index f61a5643..c85690b4 100644 --- a/cumulus_library/studies/core/builder_medication.py +++ b/cumulus_library/studies/core/builder_medication.py @@ -1,8 +1,8 @@ """ Module for generating core medication table""" from cumulus_library import base_table_builder, base_utils -from cumulus_library.template_sql import base_templates, sql_utils from cumulus_library.studies.core.core_templates import core_templates +from cumulus_library.template_sql import base_templates, sql_utils class MedicationBuilder(base_table_builder.BaseTableBuilder): diff --git a/cumulus_library/studies/core/builder_medicationrequest.py b/cumulus_library/studies/core/builder_medicationrequest.py index 3030ea7e..7203a924 100644 --- a/cumulus_library/studies/core/builder_medicationrequest.py +++ b/cumulus_library/studies/core/builder_medicationrequest.py @@ -3,10 +3,9 @@ Note: This module assumes that you have already run builder_medication, as it leverages the core__medication table for data population""" -from cumulus_library import base_table_builder -from cumulus_library.template_sql import base_templates, sql_utils -from cumulus_library import databases +from cumulus_library import base_table_builder, databases from cumulus_library.studies.core.core_templates import core_templates +from cumulus_library.template_sql import base_templates, sql_utils expected_table_cols = { "medicationrequest": { diff --git a/cumulus_library/studies/core/builder_observation.py b/cumulus_library/studies/core/builder_observation.py index 294ba3c4..48db3032 100644 --- a/cumulus_library/studies/core/builder_observation.py +++ b/cumulus_library/studies/core/builder_observation.py @@ -2,10 +2,9 @@ from dataclasses import dataclass -from cumulus_library import base_table_builder -from cumulus_library.template_sql import sql_utils -from cumulus_library import databases +from cumulus_library import base_table_builder, databases from cumulus_library.studies.core.core_templates import core_templates +from cumulus_library.template_sql import sql_utils expected_table_cols = { "observation": { diff --git a/cumulus_library/studies/core/builder_patient.py b/cumulus_library/studies/core/builder_patient.py index 8302163e..f6c62e20 100644 --- a/cumulus_library/studies/core/builder_patient.py +++ b/cumulus_library/studies/core/builder_patient.py @@ -1,16 +1,15 @@ """ Module for extracting US core extensions from patient records""" -from cumulus_library.base_table_builder import BaseTableBuilder -from cumulus_library.template_sql import base_templates, sql_utils from cumulus_library import databases +from cumulus_library.base_table_builder import BaseTableBuilder from cumulus_library.studies.core.core_templates import core_templates +from cumulus_library.template_sql import base_templates, sql_utils expected_table_cols = { "patient": { "id": [], "gender": [], "address": [], - "id": [], "birthdate": [], } } diff --git a/cumulus_library/studies/core/core_templates/core_templates.py b/cumulus_library/studies/core/core_templates/core_templates.py index ce4808c3..0c0e7562 100644 --- a/cumulus_library/studies/core/core_templates/core_templates.py +++ b/cumulus_library/studies/core/core_templates/core_templates.py @@ -1,5 +1,4 @@ import pathlib -import typing import jinja2 @@ -9,7 +8,9 @@ def get_core_template( - target_table: str, schema: dict[dict[bool]] = None, config: dict = None + target_table: str, + schema: dict[dict[bool]] | None = None, + config: dict | None = None, ) -> str: """Extracts code system details as a standalone table""" with open(f"{PATH}/{target_table}.sql.jinja") as file: diff --git a/cumulus_library/studies/core/count_core.py b/cumulus_library/studies/core/count_core.py index fd2e214d..fa8a6775 100644 --- a/cumulus_library/studies/core/count_core.py +++ b/cumulus_library/studies/core/count_core.py @@ -1,5 +1,5 @@ -from typing import List from pathlib import Path + from cumulus_library.statistics import counts @@ -25,7 +25,7 @@ def count_core_documentreference(self, duration: str = "month"): ] return self.count_documentreference(table_name, from_table, cols) - def count_core_encounter(self, duration: str = None): + def count_core_encounter(self, duration: str | None = None): table_name = self.get_table_name("count_encounter", duration=duration) from_table = self.get_table_name("encounter") @@ -41,7 +41,7 @@ def count_core_encounter(self, duration: str = None): return self.count_encounter(table_name, from_table, cols) def _count_core_encounter_type( - self, table_name: str, cols: list, duration: str = None + self, table_name: str, cols: list, duration: str | None = None ): """ Encounter Type information is for every visit, and therefore this @@ -61,7 +61,7 @@ def _count_core_encounter_type( return self.count_encounter(table_name, from_table, cols) # TODO: consider renaming function and table to `count_core_encounter_all_types` - def count_core_encounter_type(self, duration: str = None): + def count_core_encounter_type(self, duration: str | None = None): cols = [ "enc_class_display", "enc_type_display", diff --git a/cumulus_library/studies/discovery/code_detection.py b/cumulus_library/studies/discovery/code_detection.py index e8fda411..5e8c449b 100644 --- a/cumulus_library/studies/discovery/code_detection.py +++ b/cumulus_library/studies/discovery/code_detection.py @@ -1,39 +1,40 @@ """ Module for generating encounter codeableConcept table""" -from cumulus_library import base_table_builder, helper -from cumulus_library.template_sql import templates, utils - +from cumulus_library import base_table_builder, base_utils from cumulus_library.studies.discovery import code_definitions +from cumulus_library.template_sql import base_templates, sql_utils -class CodeDetectionBuilder(BaseTableBuilder): +class CodeDetectionBuilder(base_table_builder.BaseTableBuilder): display_text = "Selecting unique code systems..." def _check_codes_in_fields(self, code_sources: list[dict], schema, cursor) -> dict: """checks if Coding/CodeableConcept fields are present and populated""" - with helper.get_progress_bar() as progress: + with base_utils.get_progress_bar() as progress: task = progress.add_task( "Discovering available coding systems...", total=len(code_sources), ) for code_source in code_sources: if code_source["is_array"]: - code_source["has_data"] = utils.is_codeable_concept_array_populated( + code_source[ + "has_data" + ] = sql_utils.is_codeable_concept_array_populated( schema, code_source["table_name"], code_source["column_name"], cursor, ) elif code_source["is_bare_coding"]: - code_source["has_data"] = utils.is_code_populated( + code_source["has_data"] = sql_utils.is_code_populated( schema, code_source["table_name"], code_source["column_name"], cursor, ) else: - code_source["has_data"] = utils.is_codeable_concept_populated( + code_source["has_data"] = sql_utils.is_codeable_concept_populated( schema, code_source["table_name"], code_source["column_name"], @@ -57,7 +58,7 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): ): raise KeyError( "Expected table_name and column_name keys in " - f"{str(code_definition)}" + f"{code_definition!s}" ) code_source = { "is_bare_coding": False, @@ -69,5 +70,7 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): code_sources.append(code_source) code_sources = self._check_codes_in_fields(code_sources, schema, cursor) - query = templates.get_code_system_pairs("discovery__code_sources", code_sources) + query = base_templates.get_code_system_pairs( + "discovery__code_sources", code_sources + ) self.queries.append(query) diff --git a/cumulus_library/studies/vocab/vocab_icd_builder.py b/cumulus_library/studies/vocab/vocab_icd_builder.py index cb9f849f..4a323893 100644 --- a/cumulus_library/studies/vocab/vocab_icd_builder.py +++ b/cumulus_library/studies/vocab/vocab_icd_builder.py @@ -34,12 +34,11 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): path = pathlib.Path(__file__).parent headers = ["CUI", "TTY", "CODE", "SAB", "STR"] - header_types = [f"{x} string" for x in headers] rows_processed = 0 dataset = [] created = False for filename in icd_files: - with open(f"{path}/{filename}.bsv", "r") as file: + with open(f"{path}/{filename}.bsv") as file: # For the first row in the dataset, we want to coerce types from # varchar(len(item)) athena default to to an unrestricted varchar, so # we'll create a table with one row - this make the recast faster, and diff --git a/cumulus_library/study_parser.py b/cumulus_library/study_parser.py index 71a6de00..7b1a4aa7 100644 --- a/cumulus_library/study_parser.py +++ b/cumulus_library/study_parser.py @@ -1,21 +1,18 @@ """ Contains classes for loading study data based on manifest.toml files """ import csv -import inspect import importlib.util +import inspect import pathlib import sys -from typing import List, Optional - import toml - from rich.progress import Progress, TaskID, track from cumulus_library import ( __version__, - base_utils, base_table_builder, + base_utils, databases, enums, errors, @@ -24,8 +21,6 @@ from cumulus_library.statistics import psm from cumulus_library.template_sql import base_templates -StrList = List[str] - class StudyManifestParser: """Handles loading of study data from manifest files. @@ -38,18 +33,17 @@ class StudyManifestParser: future. """ - _study_path = None - _study_config = {} - def __init__( self, - study_path: Optional[pathlib.Path] = None, - data_path: Optional[pathlib.Path] = None, + study_path: pathlib.Path | None = None, + data_path: pathlib.Path | None = None, ): """Instantiates a StudyManifestParser. :param study_path: A pathlib Path object, optional """ + self._study_path = None + self._study_config = {} if study_path is not None: self.load_study_manifest(study_path) self.data_path = data_path @@ -75,19 +69,19 @@ def load_study_manifest(self, study_path: pathlib.Path) -> None: ) self._study_config = config self._study_path = study_path - except FileNotFoundError: + except FileNotFoundError as e: raise errors.StudyManifestFilesystemError( # pylint: disable=raise-missing-from f"Missing or invalid manifest found at {study_path}" - ) + ) from e - def get_study_prefix(self) -> Optional[str]: + def get_study_prefix(self) -> str | None: """Reads the name of a study prefix from the in-memory study config :returns: A string of the prefix in the manifest, or None if not found """ return self._study_config.get("study_prefix") - def get_sql_file_list(self, continue_from: str = None) -> Optional[StrList]: + def get_sql_file_list(self, continue_from: str | None = None) -> list[str] | None: """Reads the contents of the sql_config array from the manifest :returns: An array of sql files from the manifest, or None if not found. @@ -105,7 +99,7 @@ def get_sql_file_list(self, continue_from: str = None) -> Optional[StrList]: ) return sql_files - def get_table_builder_file_list(self) -> Optional[StrList]: + def get_table_builder_file_list(self) -> list[str] | None: """Reads the contents of the table_builder_config array from the manifest :returns: An array of sql files from the manifest, or None if not found. @@ -113,7 +107,7 @@ def get_table_builder_file_list(self) -> Optional[StrList]: sql_config = self._study_config.get("table_builder_config", {}) return sql_config.get("file_names", []) - def get_counts_builder_file_list(self) -> Optional[StrList]: + def get_counts_builder_file_list(self) -> list[str] | None: """Reads the contents of the counts_builder_config array from the manifest :returns: An array of sql files from the manifest, or None if not found. @@ -121,15 +115,16 @@ def get_counts_builder_file_list(self) -> Optional[StrList]: sql_config = self._study_config.get("counts_builder_config", {}) return sql_config.get("file_names", []) - def get_statistics_file_list(self) -> Optional[StrList]: + def get_statistics_file_list(self) -> list[str] | None: """Reads the contents of the statistics_config array from the manifest - :returns: An array of statistics toml files from the manifest, or None if not found. + :returns: An array of statistics toml files from the manifest, + or None if not found. """ stats_config = self._study_config.get("statistics_config", {}) return stats_config.get("file_names", []) - def get_export_table_list(self) -> Optional[StrList]: + def get_export_table_list(self) -> list[str] | None: """Reads the contents of the export_list array from the manifest :returns: An array of tables to export from the manifest, or None if not found. @@ -208,8 +203,8 @@ def clean_study( schema_name: str, stats_clean: bool = False, verbose: bool = False, - prefix: str = None, - ) -> List: + prefix: str | None = None, + ) -> list: """Removes tables beginning with the study prefix from the database schema :param cursor: A DatabaseCursor object @@ -296,7 +291,7 @@ def _execute_drop_queries( self, cursor: databases.DatabaseCursor, verbose: bool, - view_table_list: List, + view_table_list: list, progress: Progress, task: TaskID, ) -> None: @@ -304,7 +299,8 @@ def _execute_drop_queries( :param cursor: A DatabaseCursor object :param verbose: toggle from progress bar to query output - :param view_table_list: a list of views and tables beginning with the study prefix + :param view_table_list: a list of views and tables beginning with + the study prefix :param progress: a rich progress bar renderer :param task: a TaskID for a given progress bar """ @@ -326,7 +322,7 @@ def _load_and_execute_builder( drop_table: bool = False, parser: databases.DatabaseParser = None, write_reference_sql: bool = False, - doc_str: str = None, + doc_str: str | None = None, ) -> None: """Loads a table builder from a file. @@ -336,7 +332,7 @@ def _load_and_execute_builder( As with eating an ortolan, you may wish to cover your head with a cloth. Per an article on the subject: "Tradition dictates that this is to shield - – from God’s eyes – the shame of such a decadent and disgraceful act." + - from God's eyes - the shame of such a decadent and disgraceful act." """ spec = importlib.util.spec_from_file_location( @@ -528,7 +524,7 @@ def run_generate_sql( parser: databases.DatabaseParser = None, verbose: bool = False, ) -> None: - """Generates reference SQL from all BaseTableBuilder-derived classes in the manifest + """Generates reference SQL from builders listed in the manifest :param cursor: A DatabaseCursor object :param schema: The name of the schema to write tables to @@ -540,10 +536,10 @@ def run_generate_sql( + self.get_statistics_file_list() ) doc_str = ( - "-- This sql was autogenerated as a reference example using the library CLI.\n" - "-- Its format is tied to the specific database it was run against, and it may not\n" - "-- be correct for all databases. Use the CLI's build option to derive the best SQL\n" - "-- for your dataset." + "-- This sql was autogenerated as a reference example using the library\n" + "-- CLI.Its format is tied to the specific database it was run against,\n" + "-- and it may not be correct for all databases. Use the CLI's build \n" + "-- option to derive the best SQL for your dataset." ) for file in all_generators: self._load_and_execute_builder( @@ -560,8 +556,8 @@ def build_study( self, cursor: databases.DatabaseCursor, verbose: bool = False, - continue_from: str = None, - ) -> List: + continue_from: str | None = None, + ) -> list: """Creates tables in the schema by iterating through the sql_config.file_names :param cursor: A DatabaseCursor object @@ -593,9 +589,10 @@ def build_study( ) return queries - def _query_error(self, query_and_filename: List, exit_message: str) -> None: + def _query_error(self, query_and_filename: list, exit_message: str) -> None: print( - f"An error occured executing the following query in {query_and_filename[1]}:", + "An error occured executing the following query in ", + f"{query_and_filename[1]}:", file=sys.stderr, ) print("--------", file=sys.stderr) @@ -637,8 +634,8 @@ def _execute_build_queries( "This query contains a table name which contains a reserved word " "immediately after the study prefix. Please rename this table so " "that is does not begin with one of these special words " - "immediately after the double undescore.\n" - f"Reserved words: {str(word.value for word in enums.ProtectedTableKeywords)}", + "immediately after the double undescore.\n Reserved words: " + f"{(word.value for word in enums.ProtectedTableKeywords)}", ) if create_line.count("__") > 1: self._query_error( @@ -669,7 +666,7 @@ def _execute_build_queries( def export_study( self, db: databases.DatabaseBackend, data_path: pathlib.Path - ) -> List: + ) -> list: """Exports csvs/parquet extracts of tables listed in export_list :param db: A database backend @@ -683,7 +680,7 @@ def export_study( ): query = f"select * from {table}" dataframe = db.execute_as_pandas(query) - path = pathlib.Path(f"{str(data_path)}/{self.get_study_prefix()}/") + path = pathlib.Path(f"{data_path}/{self.get_study_prefix()}/") path.mkdir(parents=True, exist_ok=True) dataframe = dataframe.sort_values( by=list(dataframe.columns), ascending=False, na_position="first" diff --git a/cumulus_library/template_sql/base_templates.py b/cumulus_library/template_sql/base_templates.py index cff0f4ac..8b464d1f 100644 --- a/cumulus_library/template_sql/base_templates.py +++ b/cumulus_library/template_sql/base_templates.py @@ -2,12 +2,10 @@ from enum import Enum from pathlib import Path -from typing import Dict, List, Optional from jinja2 import Template from pandas import DataFrame -from cumulus_library import databases from cumulus_library.template_sql import sql_utils PATH = Path(__file__).parent @@ -71,7 +69,7 @@ def get_codeable_concept_denormalize_query( ) -def get_column_datatype_query(schema_name: str, table_name: str, column_names: List): +def get_column_datatype_query(schema_name: str, table_name: str, column_names: list): with open(f"{PATH}/column_datatype.sql.jinja") as column_datatype: return Template(column_datatype.read()).render( schema_name=schema_name, @@ -81,7 +79,7 @@ def get_column_datatype_query(schema_name: str, table_name: str, column_names: L def get_create_view_query( - view_name: str, dataset: List[List[str]], view_cols: List[str] + view_name: str, dataset: list[list[str]], view_cols: list[str] ) -> str: """Generates a create view as query for inserting static data into athena @@ -98,7 +96,7 @@ def get_create_view_query( def get_ctas_query( - schema_name: str, table_name: str, dataset: List[List[str]], table_cols: List[str] + schema_name: str, table_name: str, dataset: list[list[str]], table_cols: list[str] ) -> str: """Generates a create table as query for inserting static data into athena @@ -139,8 +137,8 @@ def get_ctas_query_from_df(schema_name: str, table_name: str, df: DataFrame) -> def get_ctas_empty_query( schema_name: str, table_name: str, - table_cols: List[str], - table_cols_types: List[str] = None, + table_cols: list[str], + table_cols_types: list[str] | None = None, ) -> str: """Generates a create table as query for initializing an empty table @@ -152,7 +150,8 @@ def get_ctas_empty_query( :param schema_name: The athena schema to create the table in :param table_name: The name of the athena table to create :param table_cols: Comma deleniated column names, i.e. ['first,second'] - :param table_cols_types: Allows specifying a data type per column (default: all varchar) + :param table_cols_types: Allows specifying a data type per column + (default: all varchar) """ if not table_cols_types: table_cols_types = ["varchar"] * len(table_cols) @@ -201,9 +200,9 @@ def get_extension_denormalize_query(config: sql_utils.ExtensionConfig) -> str: def get_insert_into_query( table_name: str, - table_cols: List[str], - dataset: List[List[str]], - type_casts: Dict = {}, + table_cols: list[str], + dataset: list[list[str]], + type_casts: dict | None = None, ) -> str: """Generates an insert query for adding data to an existing athena table @@ -212,6 +211,8 @@ def get_insert_into_query( :param table_cols: Comma deleniated column names, i.e. ['first','second'] :param dataset: Array of data arrays to insert, i.e. [['1','3'],['2','4']] """ + if type_casts is None: + type_casts = {} with open(f"{PATH}/insert_into.sql.jinja") as insert_into: return Template(insert_into.read()).render( table_name=table_name, @@ -224,9 +225,13 @@ def get_insert_into_query( def get_is_table_not_empty_query( source_table: str, field: str, - unnests: Optional[list[dict]] = [], - conditions: Optional[list[str]] = [], + unnests: list[dict] | None = None, + conditions: list[str] | None = None, ): + if unnests is None: + unnests = [] + if conditions is None: + conditions = [] with open(f"{PATH}/is_table_not_empty.sql.jinja") as is_table_not_empty: return Template(is_table_not_empty.read()).render( source_table=source_table, diff --git a/cumulus_library/template_sql/sql_utils.py b/cumulus_library/template_sql/sql_utils.py index 2664be56..797bc330 100644 --- a/cumulus_library/template_sql/sql_utils.py +++ b/cumulus_library/template_sql/sql_utils.py @@ -12,11 +12,9 @@ from dataclasses import dataclass import duckdb -from typing import List -from cumulus_library import base_utils +from cumulus_library import base_utils, databases from cumulus_library.template_sql import base_templates -from cumulus_library import databases @dataclass(kw_only=True) @@ -44,7 +42,7 @@ class CodeableConceptConfig: @dataclass -class ExtensionConfig(object): +class ExtensionConfig: """convenience class for holding parameters for generating extension tables. :param source_table: the table to extract extensions from @@ -61,14 +59,14 @@ class ExtensionConfig(object): target_table: str target_col_prefix: str fhir_extension: str - ext_systems: List[str] + ext_systems: list[str] is_array: bool = False def _check_data_in_fields( schema, cursor, - code_sources: List[CodeableConceptConfig], + code_sources: list[CodeableConceptConfig], ) -> dict: """checks if CodeableConcept fields actually have data available @@ -112,7 +110,7 @@ def _check_data_in_fields( def denormalize_codes( schema: str, cursor: databases.DatabaseCursor, - code_sources: List[CodeableConceptConfig], + code_sources: list[CodeableConceptConfig], ): queries = [] code_sources = _check_data_in_fields(schema, cursor, code_sources) @@ -284,7 +282,7 @@ def _check_schema_if_exists( if coding_element in schema_str: return False else: - required_fields = [coding_element] + ["code", "system", "display"] + required_fields = [coding_element, "code", "system", "display"] if any(x not in schema_str for x in required_fields): return False return True diff --git a/cumulus_library/template_sql/statistics/counts_templates.py b/cumulus_library/template_sql/statistics/counts_templates.py index 32159062..c32200d5 100644 --- a/cumulus_library/template_sql/statistics/counts_templates.py +++ b/cumulus_library/template_sql/statistics/counts_templates.py @@ -1,7 +1,6 @@ -from enum import Enum from dataclasses import dataclass +from enum import Enum from pathlib import Path -from typing import Optional from jinja2 import Template @@ -39,9 +38,9 @@ def get_count_query( source_table: str, table_cols: list, min_subject: int = 10, - where_clauses: Optional[list] = None, - fhir_resource: Optional[str] = None, - filter_resource: Optional[bool] = True, + where_clauses: list | None = None, + fhir_resource: str | None = None, + filter_resource: bool | None = True, ) -> str: """Generates count tables for generating study outputs""" path = Path(__file__).parent diff --git a/cumulus_library/template_sql/statistics/psm_templates.py b/cumulus_library/template_sql/statistics/psm_templates.py index 41a05932..a91f5bb9 100644 --- a/cumulus_library/template_sql/statistics/psm_templates.py +++ b/cumulus_library/template_sql/statistics/psm_templates.py @@ -1,17 +1,17 @@ """ Collection of jinja template getters for common SQL queries """ -from enum import Enum from pathlib import Path -from typing import Dict, List, Optional from jinja2 import Template -from pandas import DataFrame from cumulus_library.errors import CumulusLibraryError def get_distinct_ids( - columns: list[str], source_table: str, join_id: str = None, filter_table: str = None + columns: list[str], + source_table: str, + join_id: str | None = None, + filter_table: str | None = None, ) -> str: """Gets distinct ids from a table, optionally filtering by ids in another table @@ -23,12 +23,14 @@ def get_distinct_ids( :param columns: a list of ids to request :param source_table: the table to retrieve ids from - :param join_id: the id column to use for joining. Expected to exist in both source and filter tables + :param join_id: the id column to use for joining. Expected to exist in both + source and filter tables :param filter_table: a table containing ids you want to exclude """ if (join_id and not filter_table) or (not join_id and filter_table): raise CumulusLibraryError( - "psm_templates.get_distinct_ids expects all optional parameters to be defined if supplied" + "psm_templates.get_distinct_ids expects all optional parameters ", + "to be defined if supplied", ) path = Path(__file__).parent @@ -49,8 +51,8 @@ def get_create_covariate_table( primary_ref: str, dependent_variable: str, join_cols_by_table: dict, - count_ref: str = None, - count_table: str = None, + count_ref: str | None = None, + count_table: str | None = None, ) -> str: """Gets a query to create a covariate table for PSM analysis @@ -68,7 +70,8 @@ def get_create_covariate_table( """ if (count_ref and not count_table) or (not count_ref and count_table): raise CumulusLibraryError( - "psm_templates.get_create_covariate_table expects all count parameters to be defined if supplied" + "psm_templates.get_create_covariate_table expects all count parameters ", + "to be defined if supplied", ) path = Path(__file__).parent diff --git a/cumulus_library/upload.py b/cumulus_library/upload.py index 21edbdd0..b0489e02 100644 --- a/cumulus_library/upload.py +++ b/cumulus_library/upload.py @@ -1,11 +1,9 @@ """ Handles pushing data to the aggregator""" import sys - from pathlib import Path import requests - from pandas import read_parquet from rich.progress import Progress, TaskID diff --git a/pyproject.toml b/pyproject.toml index 8f6254ff..8e6019cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,11 +30,11 @@ classifiers = [ dynamic=["version"] [project.optional-dependencies] dev = [ - "black <25, >=24", + "ruff-0.2.1", "pre-commit", - "pylint", ] test = [ + "ruff-0.2.1", "freezegun", "pytest", "requests-mock", @@ -55,3 +55,24 @@ build-backend = "flit_core.buildapi" [tool.flit.sdist] include = [".sqlfluff"] + +[tool.ruff] +target-version = "py310" + +lint.select = [ + "A", # prevent using keywords that clobber python builtins + "B", # bugbear: security warnings + "E", # pycodestyle + "F", # pyflakes + "I", # isort + "ISC", # implicit string concatenation + "PLE", # pylint errors + "RUF", # the ruff developer's own rules + "UP", # alert you when better syntax is available in your python version +] +lint.ignore = [ + "ISC001" # Recommended ingore from `ruff format` +] + +[tool.ruff.lint.per-file-ignores] +"cumulus_library/schema/valueset.py" = ["E501"] diff --git a/tests/conftest.py b/tests/conftest.py index 3a5dc091..5b953fe7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,18 +2,14 @@ import copy import json -import os -import tempfile - -from enum import IntEnum from pathlib import Path +import numpy import pandas import pytest -import numpy from cumulus_library.cli import StudyRunner -from cumulus_library.databases import create_db_backend, DatabaseCursor +from cumulus_library.databases import DatabaseCursor, create_db_backend MOCK_DATA_DIR = f"{Path(__file__).parent}/test_data/duckdb_data" ID_PATHS = { @@ -60,7 +56,8 @@ def duckdb_args(args: list, tmp_path, stats=False): target = f"{MOCK_DATA_DIR}" if args[0] == "build": - return args + [ + return [ + *args, "--db-type", "duckdb", "--load-ndjson-dir", @@ -69,14 +66,15 @@ def duckdb_args(args: list, tmp_path, stats=False): f"{tmp_path}/duck.db", ] elif args[0] == "export": - return args + [ + return [ + *args, "--db-type", "duckdb", "--database", f"{tmp_path}/duck.db", f"{tmp_path}/counts", ] - return args + ["--db-type", "duckdb", "--database", f"{tmp_path}/duck.db"] + return [*args, "--db-type", "duckdb", "--database", f"{tmp_path}/duck.db"] def ndjson_data_generator(source_dir: Path, target_dir: Path, iterations: int): @@ -121,7 +119,7 @@ def update_nested_obj(id_path, obj, i): # panda's deep copy is not recursive, so we have to do it # again for nested objects df[id_path[0]] = df[id_path[0]].map( - lambda x: update_nested_obj( + lambda x, i=i, id_path=id_path: update_nested_obj( id_path[1:], copy.deepcopy(x), i ) ) diff --git a/tests/regression/run_regression.py b/tests/regression/run_regression.py index 5cd5049e..60691e3b 100644 --- a/tests/regression/run_regression.py +++ b/tests/regression/run_regression.py @@ -16,7 +16,6 @@ import os import sys - from pathlib import Path from pandas import read_parquet @@ -32,8 +31,8 @@ export_missing = exports - references sys.exit( "❌ Found differences in files present: ❌\n" - f"Files present in reference not in export: {str(ref_missing)}\n" - f"Files present in export not in reference: {str(export_missing)}" + f"Files present in reference not in export: {ref_missing!s}\n" + f"Files present in export not in reference: {export_missing!s}" ) diffs = [] for filename in references: @@ -59,7 +58,7 @@ diffs.append( [ filename, - ("Rows differ between reference and export:\n" f"{compared}"), + ("Rows differ between reference and export:\n", f"{compared}"), ] ) if len(diffs) > 0: diff --git a/tests/test_cli.py b/tests/test_cli.py index 825298b2..6560d287 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,10 +4,8 @@ import glob import os import shutil -import sysconfig - from contextlib import nullcontext as does_not_raise -from pathlib import Path, PosixPath +from pathlib import Path from unittest import mock import pytest @@ -29,7 +27,8 @@ def open(self, *args, **kwargs): if str(args[0]).endswith(".bsv"): args = ( Path( - f"./tests/test_data/mock_bsvs/{str(args[0]).rsplit('/', maxsplit=1)[-1]}" + "./tests/test_data/mock_bsvs/", + f"{str(args[0]).rsplit('/', maxsplit=1)[-1]}", ), "r", ) @@ -101,11 +100,12 @@ def test_cli_early_exit(args): ), ], ) -def test_cli_path_mapping( - mock_load_json, mock_path, tmp_path, args, raises, expected -): # pylint: disable=unused-argument +def test_cli_path_mapping(mock_load_json, mock_path, tmp_path, args, raises, expected): # pylint: disable=unused-argument with raises: - mock_path.return_value = f"{Path(__file__).resolve().parents[0]}" "/test_data/" + mock_path.return_value = ( + f"{Path(__file__).resolve().parents[0]}", + "/test_data/", + ) mock_load_json.return_value = { "__desc__": "", "allowlist": { @@ -124,7 +124,7 @@ def test_cli_path_mapping( ) @mock.patch("sysconfig.get_path") def test_count_builder_mapping(mock_path, tmp_path): - mock_path.return_value = f"{Path(__file__).resolve().parents[0]}" "/test_data/" + mock_path.return_value = (f"{Path(__file__).resolve().parents[0]}", "/test_data/") with does_not_raise(): args = duckdb_args( [ @@ -177,7 +177,7 @@ def test_generate_sql(mock_path, tmp_path): assert "module1.sql" in ",".join(files) for file in files: if file.endswith("module1.sql"): - with open(file, "r") as f: + with open(file) as f: query = "\n".join(line.rstrip() for line in f) assert "This sql was autogenerated" in query assert "CREATE TABLE IF NOT EXISTS study_python_valid__table" in query @@ -220,15 +220,20 @@ def test_generate_sql(mock_path, tmp_path): ], ) def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-argument - mock_path.return_value = f"{Path(__file__).resolve().parents[0]}" "/test_data/" + mock_path.return_value = (f"{Path(__file__).resolve().parents[0]}", "/test_data/") cli.main( cli_args=duckdb_args(["build", "-t", "core", "--database", "test"], tmp_path) ) with does_not_raise(): with mock.patch.object(builtins, "input", lambda _: "y"): cli.main( - cli_args=args - + ["--db-type", "duckdb", "--database", f"{tmp_path}/duck.db"] + cli_args=[ + *args, + "--db-type", + "duckdb", + "--database", + f"{tmp_path}/duck.db", + ] ) db = DuckDatabaseBackend(f"{tmp_path}/duck.db") for table in db.cursor().execute("show tables").fetchall(): @@ -245,7 +250,9 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a [ (["build", "-t", "core"], ["export", "-t", "core"], 44), ( - [ # checking that a study is loaded from a child directory of a user-defined path + # checking that a study is loaded from a child directory + # of a user-defined path + [ "build", "-t", "study_valid", @@ -257,8 +264,9 @@ def test_clean(mock_path, tmp_path, args, expected): # pylint: disable=unused-a ), (["build", "-t", "vocab"], None, 3), ( - [ # checking that a study is loaded from the directory of a user-defined path. - # we're also validating that the CLI accepts the statistics keyword, though + # checking that a study is loaded from the directory of a user-defined + # path. we're also validating that the CLI accepts the statistics keyword + [ "build", "-t", "study_valid", @@ -338,7 +346,7 @@ def test_cli_transactions(tmp_path, study, finishes, raises): .fetchall() ) query = ( - db.cursor().execute(f"SELECT * from study_valid__lib_transactions").fetchall() + db.cursor().execute("SELECT * from study_valid__lib_transactions").fetchall() ) assert query[1][2] == "started" if finishes: @@ -377,9 +385,9 @@ def test_cli_stats_rebuild(tmp_path): "--database", f"{tmp_path}/duck.db", ] - cli.main(cli_args=arg_list + [f"{tmp_path}/export"]) - cli.main(cli_args=arg_list + [f"{tmp_path}/export"]) - cli.main(cli_args=arg_list + [f"{tmp_path}/export", "--statistics"]) + cli.main(cli_args=[*arg_list, f"{tmp_path}/export"]) + cli.main(cli_args=[*arg_list, f"{tmp_path}/export"]) + cli.main(cli_args=[*arg_list, f"{tmp_path}/export", "--statistics"]) db = DuckDatabaseBackend(f"{tmp_path}/duck.db") expected = ( db.cursor() diff --git a/tests/test_conftest.py b/tests/test_conftest.py index 7eeb68c7..7daf0daa 100644 --- a/tests/test_conftest.py +++ b/tests/test_conftest.py @@ -1,8 +1,7 @@ import json - from pathlib import Path -from tests.conftest import ndjson_data_generator, MOCK_DATA_DIR, ID_PATHS +from tests.conftest import ID_PATHS, MOCK_DATA_DIR, ndjson_data_generator def test_ndjson_data_generator(tmp_path): @@ -15,17 +14,17 @@ def test_ndjson_data_generator(tmp_path): for filepath in [f for f in Path(target / key).iterdir()]: with open(filepath) as f: first_new = json.loads(next(f)) - for line in f: - pass - last_new = json.loads(line) + *_, last_new = f + last_new = json.loads(last_new) with open(f"{Path(MOCK_DATA_DIR)}/{key}/{filepath.name}") as f: first_line = next(f) first_ref = json.loads(first_line) # handling patient file of length 1: - line = first_line - for line in f: - pass - last_ref = json.loads(line) + try: + *_, last_ref = f + last_ref = json.loads(last_ref) + except Exception: + last_ref = first_ref for source in [[first_new, first_ref, 0], [last_new, last_ref, iters - 1]]: for id_path in ID_PATHS[key]: new_test = source[0] diff --git a/tests/test_core.py b/tests/test_core.py index b7705400..ced53f69 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,7 +1,6 @@ """unit tests for counts generation""" -import datetime # pylint: disable=unused-import - +import datetime # noqa: F401 from pathlib import Path import pytest @@ -14,7 +13,7 @@ def get_sorted_table_data(cursor, table): num_cols = cursor.execute( - "SELECT count(*) FROM information_schema.columns " f"WHERE table_name='{table}'" + f"SELECT count(*) FROM information_schema.columns WHERE table_name='{table}'" ).fetchone()[0] return cursor.execute( f"SELECT * FROM '{table}' ORDER BY " f"{','.join(map(str, range(1,num_cols)))}" @@ -53,7 +52,7 @@ def test_core_tables(mock_db_core, table): # for row in table_rows: # f.write(str(f"{row}\n")) - with open(f"./tests/test_data/core/{table}.txt", "r", encoding="UTF-8") as f: + with open(f"./tests/test_data/core/{table}.txt", encoding="UTF-8") as f: ref_table = [] for row in f.readlines(): ref_table.append(eval(row)) # pylint: disable=eval-used @@ -92,7 +91,6 @@ def test_core_count_missing_data(tmp_path, mock_db): # f.write(str(f"{row}\n")) with open( "./tests/test_data/core/core__count_encounter_month_missing_data.txt", - "r", encoding="UTF-8", ) as f: ref_table = [] diff --git a/tests/test_duckdb.py b/tests/test_duckdb.py index 09fe41e4..517463ac 100644 --- a/tests/test_duckdb.py +++ b/tests/test_duckdb.py @@ -3,10 +3,9 @@ import glob import os import tempfile - from datetime import datetime, timedelta, timezone -from unittest import mock from pathlib import Path +from unittest import mock import pytest diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 4ffe6540..21a0e91e 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -1,12 +1,10 @@ """Test for FHIR resource parsers""" -from contextlib import nullcontext as does_not_raise import json - +from contextlib import nullcontext as does_not_raise from unittest.mock import mock_open, patch import pytest - from fhirclient.models.coding import Coding from cumulus_library.parsers import fhir_valueset diff --git a/tests/test_psm.py b/tests/test_psm.py index dbfc9ac0..53b7990e 100644 --- a/tests/test_psm.py +++ b/tests/test_psm.py @@ -4,7 +4,6 @@ from pathlib import Path import pytest - from freezegun import freeze_time from cumulus_library.cli import StudyRunner diff --git a/tests/test_psm_templates.py b/tests/test_psm_templates.py index cb6d0c9e..7a1aeab8 100644 --- a/tests/test_psm_templates.py +++ b/tests/test_psm_templates.py @@ -6,8 +6,8 @@ from cumulus_library.errors import CumulusLibraryError from cumulus_library.template_sql.statistics.psm_templates import ( - get_distinct_ids, get_create_covariate_table, + get_distinct_ids, ) diff --git a/tests/test_study_parser.py b/tests/test_study_parser.py index 66ee0699..f6af4c29 100644 --- a/tests/test_study_parser.py +++ b/tests/test_study_parser.py @@ -2,7 +2,6 @@ import builtins import pathlib - from contextlib import nullcontext as does_not_raise from pathlib import Path from unittest import mock @@ -135,7 +134,7 @@ def test_clean_study(mock_db, schema, verbose, prefix, confirm, stats, target, r ) remaining_tables = ( mock_db.cursor() - .execute(f"select distinct(table_name) from information_schema.tables") + .execute("select distinct(table_name) from information_schema.tables") .fetchall() ) if any(x in target for x in protected_strs): @@ -295,13 +294,13 @@ def test_build_study(mock_db, study_path, verbose, expects, raises): ( "./tests/test_data/psm/", False, - (f"psm_test__psm_encounter_covariate",), + ("psm_test__psm_encounter_covariate",), does_not_raise(), ), ( "./tests/test_data/psm/", True, - (f"psm_test__psm_encounter_covariate",), + ("psm_test__psm_encounter_covariate",), does_not_raise(), ), ], diff --git a/tests/test_template_sql_utils.py b/tests/test_template_sql_utils.py index 0c44614f..2136cc9c 100644 --- a/tests/test_template_sql_utils.py +++ b/tests/test_template_sql_utils.py @@ -1,9 +1,9 @@ """ tests for the cli interface to studies """ -import duckdb +from contextlib import nullcontext as does_not_raise + import pytest -from contextlib import nullcontext as does_not_raise from cumulus_library.template_sql import sql_utils diff --git a/tests/test_templates.py b/tests/test_templates.py index 6da24bcf..ab89b3ec 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -1,7 +1,6 @@ """ tests for jinja sql templates """ import pytest - from pandas import DataFrame from cumulus_library.template_sql import base_templates, sql_utils