Skip to content

Commit

Permalink
Make it easier to run studies from git checkouts (#235)
Browse files Browse the repository at this point in the history
- When loading a builder, we add the study dir's parent folder
  into sys.path, so that builders can import helper code.
  This is basically equivalent to doing PYTHONPATH=. from a checkout.
- When searching for the study dir, and we find a manifest.toml,
  actually read the manifest to find the study_prefix instead of using
  the directory name for the study name. This allows more natural
  python package layouts (rather than `module_name/study_name`, you can
  just do `module_name`)

This also bumps ruff's version to 0.4.4 and has the commit hook just
auto-commit any formatting changes.
  • Loading branch information
mikix authored May 13, 2024
1 parent 6866553 commit 1932655
Show file tree
Hide file tree
Showing 38 changed files with 97 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
run: |
sqlfluff lint
- name: Run ruff
if: success() || failure() # still run black if above checks fails
if: success() || failure() # still run ruff if above checks fails
run: |
ruff check
ruff format --check
Expand Down
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
default_install_hook_types: [pre-commit, pre-push]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.2.1
rev: v0.4.4 # if you update this, also update pyproject.toml
hooks:
- name: Ruff formatting
id: ruff-format
entry: bash -c 'ruff format --force-exclude "$@"; git add -u' --
- name: Ruff linting
id: ruff
stages: [pre-push]
Expand Down
1 change: 1 addition & 0 deletions cumulus_library/apis/umls.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Class for communicating with the umls API"""

import os
import pathlib

Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/base_table_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" abstract base for python-based study executors """
"""abstract base for python-based study executors"""

import pathlib
import re
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/base_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Collection of small commonly used utility functions """
"""Collection of small commonly used utility functions"""

import dataclasses
import datetime
Expand Down
8 changes: 6 additions & 2 deletions cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,14 +318,18 @@ def get_study_dict(alt_dir_paths: list) -> dict[str, pathlib.Path] | None:
return manifest_studies


def get_studies_by_manifest_path(path: pathlib.Path) -> dict:
def get_studies_by_manifest_path(path: pathlib.Path) -> dict[str, pathlib.Path]:
"""Recursively search for manifest.toml files from a given path"""
manifest_paths = {}
for child_path in path.iterdir():
if child_path.is_dir():
manifest_paths.update(get_studies_by_manifest_path(child_path))
elif child_path.name == "manifest.toml":
manifest_paths[path.name] = path
try:
manifest = study_parser.StudyManifestParser(path)
manifest_paths[manifest.get_study_prefix()] = path
except errors.StudyManifestParsingError as exc:
rich.print(f"[bold red] Ignoring study in '{path}': {exc}")
return manifest_paths


Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/enums.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Holds enums used across more than one module """
"""Holds enums used across more than one module"""

from enum import Enum

Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/protected_table_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Builder for creating tables for tracking state/logging changes"""
"""Builder for creating tables for tracking state/logging changes"""

from cumulus_library import base_table_builder, enums
from cumulus_library.template_sql import base_templates
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Collection of jinja template getters for common SQL queries """
"""Collection of jinja template getters for common SQL queries"""

from pathlib import Path

Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/studies/core/builder_medication.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Module for generating core medication table"""
"""Module for generating core medication table"""

from cumulus_library import base_table_builder, databases
from cumulus_library.studies.core.core_templates import core_templates
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/studies/core/builder_medicationrequest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Module for extracting US core extensions from medicationrequests
"""Module for extracting US core extensions from medicationrequests
Note: This module assumes that you have already run builder_medication,
as it leverages the core__medication table for data population"""
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/studies/core/builder_observation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Module for extracting US core extensions from patient records"""
"""Module for extracting US core extensions from patient records"""

from dataclasses import dataclass

Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/studies/core/builder_patient.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Module for extracting US core extensions from patient records"""
"""Module for extracting US core extensions from patient records"""

from cumulus_library import databases
from cumulus_library.base_table_builder import BaseTableBuilder
Expand Down
4 changes: 2 additions & 2 deletions cumulus_library/studies/core/builder_prereq_tables.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" This builder primarily exists to make sure that the FHIR lookup
"""This builder primarily exists to make sure that the FHIR lookup
tables are created before other builders in the core study run, so that
they are available for joins. """
they are available for joins."""

import pathlib

Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/studies/discovery/code_detection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Module for generating encounter codeableConcept table"""
"""Module for generating encounter codeableConcept table"""

from cumulus_library import base_table_builder, base_utils, databases
from cumulus_library.studies.discovery import code_definitions
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/studies/vocab/vocab_icd_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Module for directly loading ICD bsvs into athena tables """
"""Module for directly loading ICD bsvs into athena tables"""

import pathlib

Expand Down
24 changes: 21 additions & 3 deletions cumulus_library/study_parser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" Contains classes for loading study data based on manifest.toml files """
"""Contains classes for loading study data based on manifest.toml files"""

import contextlib
import csv
import importlib.util
import inspect
Expand All @@ -25,6 +26,16 @@
from cumulus_library.template_sql import base_templates


@contextlib.contextmanager
def _temporary_sys_path(add: pathlib.Path) -> None:
orig_path = list(sys.path)
try:
sys.path.insert(0, str(add))
yield
finally:
sys.path = orig_path


class StudyManifestParser:
"""Handles loading of study data from manifest files.
Expand Down Expand Up @@ -73,9 +84,12 @@ def load_study_manifest(self, study_path: pathlib.Path) -> None:
self._study_config = config
self._study_path = study_path
except FileNotFoundError as e:
raise errors.StudyManifestFilesystemError( # pylint: disable=raise-missing-from
raise errors.StudyManifestFilesystemError(
f"Missing or invalid manifest found at {study_path}"
) from e
except toml.TomlDecodeError as e:
# just unify the error classes for convenience of catching them
raise errors.StudyManifestParsingError(str(e)) from e

def get_study_prefix(self) -> str | None:
"""Reads the name of a study prefix from the in-memory study config
Expand Down Expand Up @@ -353,7 +367,11 @@ def _load_and_execute_builder(
)
table_builder_module = importlib.util.module_from_spec(spec)
sys.modules["table_builder"] = table_builder_module
spec.loader.exec_module(table_builder_module)
# Inject the study dir into sys.path so that builders can import
# from surrounding utility code, even if the study isn't installed.
# (i.e. you're working from a git checkout and do something like `-s .`)
with _temporary_sys_path(self._study_path.parent):
spec.loader.exec_module(table_builder_module)

# We're going to find all subclasses of BaseTableBuild in this file.
# Since BaseTableBuilder itself is a valid subclass of BaseTableBuilder,
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/template_sql/base_templates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Collection of jinja template getters for common SQL queries """
"""Collection of jinja template getters for common SQL queries"""

import enum
import pathlib
Expand Down
6 changes: 3 additions & 3 deletions cumulus_library/template_sql/sql_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
- Data with deep missing elements
- Data which may or may not be in an array depending on context
"""
import abc

from dataclasses import dataclass, field

from cumulus_library import base_utils, databases
Expand All @@ -26,8 +26,8 @@


@dataclass(kw_only=True)
class BaseConfig(abc.ABC):
"""Abstract ase class for handling table detection/denormalization"""
class BaseConfig:
"""Base class for handling table detection/denormalization"""

source_table: str = None
source_id: str = "id"
Expand Down
2 changes: 1 addition & 1 deletion cumulus_library/upload.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" Handles pushing data to the aggregator"""
"""Handles pushing data to the aggregator"""

import sys
from pathlib import Path
Expand Down
2 changes: 1 addition & 1 deletion docs/creating-studies.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ file_names = [
# The following tables will be output to disk when an export is run. In most cases,
# only count tables should be output in this way.
export_list = [
"template__count_influenza_test_month",
"my_study__count_influenza_test_month",
]

# For generating counts table in a more standardized manner, we have a class in the
Expand Down
10 changes: 4 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ classifiers = [
dynamic=["version"]
[project.optional-dependencies]
dev = [
"ruff == 0.2.1",
# if you update the ruff version, also update .pre-commit-config.yaml
"ruff == 0.4.4",
"pre-commit",
]
test = [
Expand All @@ -41,8 +42,8 @@ test = [
]

[project.urls]
Home = "https://smarthealthit.org/cumulus-a-universal-sidecar-for-a-smart-learning-healthcare-system/"
Documentation = "https://docs.smarthealthit.org/cumulus/"
Home = "https://smarthealthit.org/cumulus/"
Documentation = "https://docs.smarthealthit.org/cumulus/library/"
Source = "https://github.com/smart-on-fhir/cumulus-library"


Expand All @@ -62,9 +63,6 @@ testpaths = [
"tests",
]

[tool.ruff]
target-version = "py310"

[tool.ruff.lint]
select = [
"A", # prevent using keywords that clobber python builtins
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def mock_db_core(tmp_path, mock_db): # pylint: disable=redefined-outer-name
"""Provides a DuckDatabaseBackend with the core study ran for local testing"""
builder = cli.StudyRunner(mock_db, data_path=f"{tmp_path}/data_path")
builder.clean_and_build_study(
f"{Path(__file__).parent.parent}/cumulus_library/studies/core",
Path(__file__).parent.parent / "cumulus_library/studies/core",
config=base_utils.StudyConfig(stats_build=True, db=mock_db),
)
yield mock_db
Expand All @@ -229,7 +229,7 @@ def mock_db_stats(tmp_path):
)
builder = cli.StudyRunner(db, data_path=f"{tmp_path}/data_path")
builder.clean_and_build_study(
f"{Path(__file__).parent.parent}/cumulus_library/studies/core",
Path(__file__).parent.parent / "cumulus_library/studies/core",
config=base_utils.StudyConfig(stats_build=True, db=db),
)
yield db
2 changes: 1 addition & 1 deletion tests/regression/run_regression.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Checks export against known dataset.
This file is excluded from the pytest suite because it's finicky to
This file is excluded from the pytest suite because it's finicky to
run locally at BCH:
- You need to be on the BCH VPN
Expand Down
1 change: 1 addition & 0 deletions tests/test_athena.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Tests for Athena database support"""

import json
import os
import pathlib
Expand Down
2 changes: 1 addition & 1 deletion tests/test_base_templates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" tests for jinja sql templates """
"""tests for jinja sql templates"""

import pytest
from pandas import DataFrame
Expand Down
23 changes: 18 additions & 5 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" tests for the cli interface to studies """
"""tests for the cli interface to studies"""

import builtins
import filecmp
Expand Down Expand Up @@ -401,9 +401,7 @@ def test_cli_transactions(tmp_path, study, finishes, raises):
.execute("select table_name from information_schema.tables")
.fetchall()
)
query = (
db.cursor().execute("SELECT * from study_valid__lib_transactions").fetchall()
)
query = db.cursor().execute(f"SELECT * from {study}__lib_transactions").fetchall()
assert query[0][2] == "started"
if finishes:
assert query[1][2] == "finished"
Expand Down Expand Up @@ -431,7 +429,7 @@ def test_cli_stats_rebuild(tmp_path):
"-s",
"./tests/test_data",
"-t",
"psm",
"psm_test",
"--db-type",
"duckdb",
"--database",
Expand Down Expand Up @@ -562,3 +560,18 @@ def test_cli_single_builder(tmp_path):
"core__patient_ext_ethnicity",
"core__patient_ext_race",
} == tables


@mock.patch.dict(os.environ, clear=True)
def test_cli_finds_study_from_manifest_prefix(tmp_path):
# This study is located inside a folder called `study_different_dir`,
# but we're going to find it using its real study prefix from the manifest.
cli.main(
cli_args=duckdb_args(
["build", "-s", "tests/test_data", "--target=study_different_name"],
tmp_path,
)
)
db = DuckDatabaseBackend(f"{tmp_path}/duck.db")
tables = {x[0] for x in db.cursor().execute("show tables").fetchall()}
assert "study_different_name__table" in tables
2 changes: 1 addition & 1 deletion tests/test_counts_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" tests for outputs of counts_builder module """
"""tests for outputs of counts_builder module"""

from contextlib import nullcontext as does_not_raise
from unittest import mock
Expand Down
2 changes: 1 addition & 1 deletion tests/test_counts_templates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" validates sql output of counts table sql generation """
"""validates sql output of counts table sql generation"""

import pytest

Expand Down
7 changes: 7 additions & 0 deletions tests/test_data/study_different_dir/manifest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
study_prefix = "study_different_name"

[sql_config]
file_names = ["test.sql"]

[export_config]
export_list = ["study_different_name__table"]
1 change: 1 addition & 0 deletions tests/test_data/study_different_dir/test.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE TABLE study_different_name__table (test int);
4 changes: 2 additions & 2 deletions tests/test_data/study_invalid_bad_query/manifest.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
study_prefix = "study_valid"
study_prefix = "study_invalid_bad_query"

[sql_config]
file_names = ["test.sql"]

[export_config]
export_list = ["study_valid__table"]
export_list = ["study_invalid_bad_query__table"]
3 changes: 2 additions & 1 deletion tests/test_data/study_python_local_template/module1.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from study_python_local_template import local_template

from cumulus_library.base_table_builder import BaseTableBuilder
from tests.test_data.study_python_local_template import local_template


class ModuleOneRunner(BaseTableBuilder):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_duckdb.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" tests for duckdb backend support """
"""tests for duckdb backend support"""

import glob
import json
Expand Down
2 changes: 1 addition & 1 deletion tests/test_psm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" tests for propensity score matching generation """
"""tests for propensity score matching generation"""

from datetime import datetime
from pathlib import Path
Expand Down
2 changes: 1 addition & 1 deletion tests/test_psm_templates.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
""" validates sql output of psm table sql generation """
"""validates sql output of psm table sql generation"""

from contextlib import nullcontext as does_not_raise

Expand Down
Loading

0 comments on commit 1932655

Please sign in to comment.