From e06ea3424ef380348b7160c27ad235a45e80326c Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Wed, 4 Dec 2024 08:30:54 +0100 Subject: [PATCH] Remove boilerplate (#70) * Move stuff around Signed-off-by: zethson * Move back Signed-off-by: zethson * Fix checkout version Signed-off-by: zethson * Remove unused imports Signed-off-by: zethson * Fix docs Signed-off-by: zethson * Fix docs Signed-off-by: zethson * promote info to warning where more meaningful * exclude slow tests by default, keep them in ci * they're also omitted in ci --------- Signed-off-by: zethson Co-authored-by: eroell --- .github/workflows/build.yaml | 8 ++-- docs/api.md | 27 ----------- docs/conf.py | 1 + pyproject.toml | 1 + pytest.ini | 4 ++ src/ehrdata/__init__.py | 8 +--- src/ehrdata/{utils => }/_omop_utils.py | 0 src/ehrdata/io/omop/omop.py | 6 +-- src/ehrdata/logging_config.py | 10 ---- src/ehrdata/pl/__init__.py | 4 +- src/ehrdata/pl/basic.py | 63 ------------------------- src/ehrdata/pp/__init__.py | 1 - src/ehrdata/pp/basic.py | 17 ------- src/ehrdata/tl/__init__.py | 2 - src/ehrdata/tl/basic.py | 17 ------- src/ehrdata/tl/omop.py | 65 -------------------------- tests/test_dt/test_dt.py | 4 ++ 17 files changed, 20 insertions(+), 218 deletions(-) create mode 100644 pytest.ini rename src/ehrdata/{utils => }/_omop_utils.py (100%) delete mode 100644 src/ehrdata/logging_config.py delete mode 100644 src/ehrdata/pl/basic.py delete mode 100644 src/ehrdata/pp/__init__.py delete mode 100644 src/ehrdata/pp/basic.py delete mode 100644 src/ehrdata/tl/__init__.py delete mode 100644 src/ehrdata/tl/basic.py delete mode 100644 src/ehrdata/tl/omop.py diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 265a95e..bfa2cf0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -14,11 +14,11 @@ jobs: package: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.10 - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - name: Set up Python 3.12 + uses: actions/setup-python@v5 with: - python-version: "3.10" + python-version: "3.12" cache: "pip" cache-dependency-path: "**/pyproject.toml" - name: Install build dependencies diff --git a/docs/api.md b/docs/api.md index b209ba5..c2ab7a7 100644 --- a/docs/api.md +++ b/docs/api.md @@ -40,31 +40,6 @@ dt.mimic_ii ``` -## Preprocessing - -```{eval-rst} -.. module:: ehrdata.pp -.. currentmodule:: ehrdata - -.. autosummary:: - :toctree: generated - - pp.basic_preproc -``` - -## Tools - -```{eval-rst} -.. module:: ehrdata.tl -.. currentmodule:: ehrdata - -.. autosummary:: - :toctree: generated - - tl.basic_tool - tl.get_concept_name -``` - ## Plotting ```{eval-rst} @@ -74,7 +49,5 @@ .. autosummary:: :toctree: generated - pl.basic_plot - pl.BasicClass pl.vitessce.gen_config ``` diff --git a/docs/conf.py b/docs/conf.py index 3f22573..e42fce5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -141,6 +141,7 @@ # If building the documentation fails because of a missing link that is outside your control, # you can add an exception to this list: nitpick_ignore = [ + ("py:class", "types.EllipsisType"), # https://github.com/duckdb/duckdb-web/issues/3806 ("py:class", "duckdb.duckdb.DuckDBPyConnection"), # Is documented as a py:attribute instead diff --git a/pyproject.toml b/pyproject.toml index 757633b..b9f4a39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "awkward", "duckdb", # for debug logging (referenced from the issue template) + "lamin-utils", "session-info", "xarray", ] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..a997e0e --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + slow: marks tests as slow (deselect with '-m "not slow"') +addopts = -m "not slow" diff --git a/src/ehrdata/__init__.py b/src/ehrdata/__init__.py index d657790..eb69f06 100644 --- a/src/ehrdata/__init__.py +++ b/src/ehrdata/__init__.py @@ -1,12 +1,8 @@ from importlib.metadata import version -from . import dt, io, pl, pp, tl +from . import dt, io, pl from .core import EHRData -__all__ = ["EHRData", "dt", "io", "pl", "pp", "tl"] +__all__ = ["EHRData", "dt", "io", "pl"] __version__ = version("ehrdata") - -from .logging_config import configure_logging - -configure_logging() diff --git a/src/ehrdata/utils/_omop_utils.py b/src/ehrdata/_omop_utils.py similarity index 100% rename from src/ehrdata/utils/_omop_utils.py rename to src/ehrdata/_omop_utils.py diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 8f8a5c4..4561fca 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -12,6 +12,7 @@ import pandas as pd from duckdb.duckdb import DuckDBPyConnection +from ehrdata._omop_utils import get_table_catalog_dict from ehrdata.io.omop._check_arguments import ( VALID_OBSERVATION_TABLES_JOIN, VALID_OBSERVATION_TABLES_SINGLE, @@ -32,7 +33,6 @@ _check_valid_variable_data_tables, ) from ehrdata.io.omop._queries import _time_interval_table -from ehrdata.utils._omop_utils import get_table_catalog_dict DOWNLOAD_VERIFICATION_TAG = "download_verification_tag" @@ -328,7 +328,7 @@ def setup_variables( # dbms complains about our queries, which sometimes need a column to be of type e.g. datetime, when it can't infer types from data count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item() if count == 0: - logging.info(f"No data found in {data_tables[0]}. Returning edata without additional variables.") + logging.warning(f"No data found in {data_tables[0]}. Returning edata without additional variables.") return edata ds = ( @@ -463,7 +463,7 @@ def setup_interval_variables( # dbms complains about our queries, which sometimes need a column to be of type e.g. datetime, when it can't infer types from data count = backend_handle.execute(f"SELECT COUNT(*) as count FROM {data_tables[0]}").df()["count"].item() if count == 0: - logging.info(f"No data in {data_tables}.") + logging.warning(f"No data in {data_tables}.") return edata ds = ( diff --git a/src/ehrdata/logging_config.py b/src/ehrdata/logging_config.py deleted file mode 100644 index 66ade38..0000000 --- a/src/ehrdata/logging_config.py +++ /dev/null @@ -1,10 +0,0 @@ -import logging - - -def configure_logging(level=logging.INFO): - """Configures logging for the package.""" - logging.basicConfig( - level=level, - format="%(levelname)s - %(message)s", - force=True, - ) diff --git a/src/ehrdata/pl/__init__.py b/src/ehrdata/pl/__init__.py index 8825776..8eb2ed9 100644 --- a/src/ehrdata/pl/__init__.py +++ b/src/ehrdata/pl/__init__.py @@ -1,8 +1,6 @@ from importlib.util import find_spec -__all__ = ["BasicClass", "basic_plot", "vitessce"] - -from .basic import BasicClass, basic_plot +__all__ = ["vitessce"] if find_spec("vitessce"): from . import vitessce diff --git a/src/ehrdata/pl/basic.py b/src/ehrdata/pl/basic.py deleted file mode 100644 index ed390ef..0000000 --- a/src/ehrdata/pl/basic.py +++ /dev/null @@ -1,63 +0,0 @@ -from anndata import AnnData - - -def basic_plot(adata: AnnData) -> int: - """Generate a basic plot for an AnnData object. - - Parameters - ---------- - adata - The AnnData object to preprocess. - - Returns - ------- - Some integer value. - """ - print("Import matplotlib and implement a plotting function here.") - return 0 - - -class BasicClass: - """A basic class. - - Parameters - ---------- - adata - The AnnData object to preprocess. - """ - - my_attribute: str = "Some attribute." - my_other_attribute: int = 0 - - def __init__(self, adata: AnnData): - print("Implement a class here.") - - def my_method(self, param: int) -> int: - """A basic method. - - Parameters - ---------- - param - A parameter. - - Returns - ------- - Some integer value. - """ - print("Implement a method here.") - return 0 - - def my_other_method(self, param: str) -> str: - """Another basic method. - - Parameters - ---------- - param - A parameter. - - Returns - ------- - Some integer value. - """ - print("Implement a method here.") - return "" diff --git a/src/ehrdata/pp/__init__.py b/src/ehrdata/pp/__init__.py deleted file mode 100644 index 5e7e293..0000000 --- a/src/ehrdata/pp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .basic import basic_preproc diff --git a/src/ehrdata/pp/basic.py b/src/ehrdata/pp/basic.py deleted file mode 100644 index 5db1ec0..0000000 --- a/src/ehrdata/pp/basic.py +++ /dev/null @@ -1,17 +0,0 @@ -from anndata import AnnData - - -def basic_preproc(adata: AnnData) -> int: - """Run a basic preprocessing on the AnnData object. - - Parameters - ---------- - adata - The AnnData object to preprocess. - - Returns - ------- - Some integer value. - """ - print("Implement a preprocessing function here.") - return 0 diff --git a/src/ehrdata/tl/__init__.py b/src/ehrdata/tl/__init__.py deleted file mode 100644 index 21e17a8..0000000 --- a/src/ehrdata/tl/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .basic import basic_tool -from .omop import get_concept_name diff --git a/src/ehrdata/tl/basic.py b/src/ehrdata/tl/basic.py deleted file mode 100644 index d215ade..0000000 --- a/src/ehrdata/tl/basic.py +++ /dev/null @@ -1,17 +0,0 @@ -from anndata import AnnData - - -def basic_tool(adata: AnnData) -> int: - """Run a tool on the AnnData object. - - Parameters - ---------- - adata - The AnnData object to preprocess. - - Returns - ------- - Some integer value. - """ - print("Implement a tool to run on the AnnData object.") - return 0 diff --git a/src/ehrdata/tl/omop.py b/src/ehrdata/tl/omop.py deleted file mode 100644 index 348dccc..0000000 --- a/src/ehrdata/tl/omop.py +++ /dev/null @@ -1,65 +0,0 @@ -from __future__ import annotations - -import numbers - -from anndata import AnnData -from rich import print as rprint - -from ehrdata.utils._omop_utils import df_to_dict, get_column_types, read_table - - -# TODO: overhaul -def get_concept_name( - adata: AnnData | dict, - concept_id: str | list, - raise_error: bool = False, -) -> str | list[str]: - """Get concept name from concept_id using concept table. - - Parameters - ---------- - adata - Anndata object or adata.uns - concept_id - concept_id or list of concept_id - raise_error - If True, raise error if concept_id not found. Defaults to False. - - Returns - ------- - concept_name - concept name or list of concept names - """ - if isinstance(concept_id, numbers.Integral): - concept_id = [concept_id] - - if isinstance(adata, AnnData): - adata_dict = adata.uns - else: - adata_dict = adata - - column_types = get_column_types(adata_dict, table_name="concept") - df_concept = read_table(adata_dict, table_name="concept", dtype=column_types) - # TODO dask Support - # df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] - df_concept.dropna( - subset=["concept_id", "concept_name"], inplace=True, ignore_index=True - ) # usecols=vocabularies_tables_columns["concept"] - concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") - concept_name = [] - concept_name_not_found = [] - for id in concept_id: - try: - concept_name.append(concept_dict[id]) - except KeyError: - concept_name.append(id) - concept_name_not_found.append(id) - if len(concept_name_not_found) > 0: - # warnings.warn(f"Couldn't find concept {id} in concept table!") - rprint(f"Couldn't find concept {concept_name_not_found} in concept table!") - if raise_error: - raise KeyError - if len(concept_name) == 1: - return concept_name[0] - else: - return concept_name diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 65accff..36014ff 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -13,6 +13,7 @@ def duckdb_connection(): con.close() +@pytest.mark.slow def test_mimic_iv_omop(tmp_path): duckdb_connection = duckdb.connect() ed.dt.mimic_iv_omop(data_path=tmp_path, backend_handle=duckdb_connection) @@ -22,6 +23,7 @@ def test_mimic_iv_omop(tmp_path): duckdb_connection.close() +@pytest.mark.slow def test_gibleed_omop(tmp_path): duckdb_connection = duckdb.connect() ed.dt.gibleed_omop(data_path=tmp_path, backend_handle=duckdb_connection) @@ -31,6 +33,7 @@ def test_gibleed_omop(tmp_path): duckdb_connection.close() +@pytest.mark.slow def test_synthea27nj_omop(tmp_path): duckdb_connection = duckdb.connect() ed.dt.synthea27nj_omop(data_path=tmp_path, backend_handle=duckdb_connection) @@ -40,6 +43,7 @@ def test_synthea27nj_omop(tmp_path): duckdb_connection.close() +@pytest.mark.slow def test_physionet2012(): edata = ed.dt.physionet2012() assert edata.shape == (11988, 38)