Commit: Add and apply ruff linting

KarelZe committed Oct 1, 2023
1 parent 6085597 commit 89300ff
Showing 44 changed files with 603 additions and 677 deletions.
45 changes: 10 additions & 35 deletions .pre-commit-config.yaml
@@ -24,52 +24,27 @@ repos:
       - id: debug-statements
       - id: end-of-file-fixer
       - id: mixed-line-ending
-  - repo: https://github.com/PyCQA/autoflake
-    rev: v1.7.7
-    hooks:
-      - id: autoflake
-        args:
-          # - "--check"
-          - "--ignore-init-module-imports"
-          - "--remove-all-unused-imports"
-          - "--remove-unused-variables"
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.10.1
-    hooks:
-      - id: isort
-        name: isort (python)
-        # args:
-        # - "--check-only"
+      - id: trailing-whitespace
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v0.991
     hooks:
       - id: mypy
         # yaml requires additional stubs.
         # Similar to: https://stackoverflow.com/a/73603491/5755604
         additional_dependencies: ['types-PyYAML']
   - repo: https://github.com/pre-commit/pygrep-hooks
     rev: v1.9.0
     hooks:
       - id: python-use-type-annotations
-  - repo: https://github.com/psf/black.git
-    rev: 22.10.0
-    hooks:
-      - id: black
-        # args:
-        # - "--check"
-        language_version: python3
-        exclude: ^(tests\/hooks-abort-render\/hooks|docs)
-  - repo: https://github.com/pycqa/flake8
-    rev: 5.0.4
-    exclude: ^(docs)
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.0.284
     hooks:
-      - id: flake8
-        additional_dependencies:
-          - flake8-absolute-import
-          - flake8-black
-          - flake8-docstrings
-          - flake8-bugbear
-  - repo: https://github.com/asottile/pyupgrade
-    rev: v3.2.2
+      - id: ruff
+        args:
+          - --fix
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.6.0
     hooks:
-      - id: pyupgrade
+      - id: nbstripout
+        exclude: "^(references|reports)"
42 changes: 42 additions & 0 deletions pyproject.toml
@@ -55,6 +55,7 @@ dev = [
"pre-commit",
"pytest",
"pytest-cov",
"ruff",
"sphinx",
"tox",
]
@@ -100,3 +101,44 @@ omit = [
"debug_*.py",
"tests/*",
]

[tool.ruff]
# See rules: https://beta.ruff.rs/docs/rules/
select = [
"A", # flake8-builtins
"B", # flake8-bugbear
"C", # flake8-comprehensions
"D", # pydocstyle
"E", # pycodestyle errors
"F", # pyflakes
"I", # isort
"N", # pep8-naming
"NPY", # numpy
"PD", # pandas-vet
"PT", # pytest
"PTH", # flake8-use-pathlib
"PGH", # pygrep
"RET", # return
"RUF", # ruff-specific rules
"UP", # pyupgrade
"S", # flake8-bandit
"SIM", # flake8-simplify
"W", # pycodestyle warnings
]

include = ["*.py", "*.pyi", "**/pyproject.toml", "*.ipynb"]

ignore = [
"E501", # line too long, handled by black
"N803", # argument name should be lowercase
"N806", # variable name should be lowercase
"C901", # too complex
]

[tool.ruff.isort]
known-first-party = ["otc"]
section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"]

[tool.ruff.per-file-ignores]
"__init__.py" = ["D104", "F401"] # disable missing docstrings in __init__, unused imports
"tests/*" = ["S101"] # Use of `assert` detected
3 changes: 1 addition & 2 deletions src/otc/__init__.py
@@ -1,5 +1,4 @@
"""
Support for custom code.
"""Support for custom code.
See `readme.md` for instructions on how to run.
"""
9 changes: 3 additions & 6 deletions src/otc/config/config.py
@@ -1,5 +1,4 @@
"""
Holds configuration for folders, dbs, and wandb configuration.
"""Holds configuration for folders, dbs, and wandb configuration.
See also `prod.env`.
"""
@@ -10,8 +9,7 @@


 class Settings(BaseSettings):
-    """
-    Specifies settings.
+    """Specifies settings.

     Mainly W&B, GCS and Heroku.
     """
@@ -26,8 +24,7 @@ class Settings(BaseSettings):
MODEL_DIR_REMOTE: Path

     class Config:
-        """
-        Specifies configuration.
+        """Specifies configuration.

         Filename is given by "prod.env". Keys are case-sensitive.
         """
3 changes: 1 addition & 2 deletions src/otc/data/__init__.py
@@ -1,5 +1,4 @@
"""
Support for data.
"""Support for data.
See `readme.md` for instructions on how to run.
"""
33 changes: 17 additions & 16 deletions src/otc/data/dataloader.py
@@ -1,5 +1,4 @@
"""
A fast dataloader-like object to load batches of tabular data sets.
"""A fast dataloader-like object to load batches of tabular data sets.
Adapted from here:
https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6
@@ -12,8 +11,7 @@


 class TabDataLoader:
-    """
-    PyTorch Implementation of a dataloader for tabular data.
+    """PyTorch Implementation of a dataloader for tabular data.

     Due to a chunk-wise reading of several rows at once it is preferred
     over the standard dataloader that reads row-wise.
@@ -27,12 +25,12 @@ def __init__(
         device: str = "cpu",
         **kwargs: Any,
     ):
-        """
-        TabDataLoader.
+        """TabDataLoader.

         Tensors can be None e. g., if there is no categorical data.

         Args:
+        ----
             batch_size (int, optional): size of batch. Defaults to 4096.
             shuffle (bool, optional): shuffle data. Defaults to False.
             device (str, optional): device where. Defaults to "cpu".
@@ -57,10 +55,10 @@ def __init__(
self.n_batches = n_batches

     def __iter__(self) -> TabDataLoader:
-        """
-        Return itself.
+        """Return itself.

-        Returns:
+        Returns
+        -------
             TabDataLoader: TabDataLoader
         """
if self.shuffle:
@@ -71,13 +69,16 @@ def __iter__(self) -> TabDataLoader:
return self

     def __next__(self) -> tuple[torch.Tensor | None, ...]:
-        """
-        Generate next batch with size of 'batch_size'.
+        """Generate next batch with size of 'batch_size'.

         Batches can be underfull.

-        Raises:
+        Raises
+        ------
             StopIteration: stopping criterion.

-        Returns:
+        Returns
+        -------
             Tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]: (X_cat), X_cont,
                 weight, y
         """
@@ -96,10 +97,10 @@ def __next__(self) -> tuple[torch.Tensor | None, ...]:
return tuple(mixed_batch)

     def __len__(self) -> int:
-        """
-        Get number of full and partial batches in data set.
+        """Get number of full and partial batches in data set.

-        Returns:
+        Returns
+        -------
             int: number of batches.
         """
return self.n_batches
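
Nearly every remaining hunk in this commit follows the same pattern, driven by the pydocstyle (D) rules selected above: the summary moves up onto the line with the opening quotes (D212), and section headers gain dashed underlines (D407). A minimal sketch of the target layout (a hypothetical helper, not from the repository):

def batch_count(total: int, batch_size: int) -> int:
    """Get number of full and partial batches.

    Args:
    ----
        total (int): number of samples.
        batch_size (int): size of batch.

    Returns
    -------
        int: number of batches.
    """
    return -(-total // batch_size)  # ceiling division without importing math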
29 changes: 15 additions & 14 deletions src/otc/data/dataset.py
@@ -1,5 +1,4 @@
"""
Implementation of a dataset for tabular data.
"""Implementation of a dataset for tabular data.
Supports both categorical and continous data.
"""
@@ -16,6 +15,7 @@ class TabDataset(Dataset):
"""PyTorch Dataset for tabular data.
Args:
----
Dataset (Dataset): dataset
"""

@@ -28,13 +28,13 @@ def __init__(
         cat_features: list[str] | None = None,
         cat_unique_counts: tuple[int, ...] | None = None,
     ):
-        """
-        Tabular data set holding data for the model.
+        """Tabular data set holding data for the model.

         Data set is inspired by CatBoost's Pool class:
         https://catboost.ai/en/docs/concepts/python-reference_pool

         Args:
+        ----
             x (pd.DataFrame | npt.ndarray): feature matrix
             y (pd.Series | npt.ndarray): target
             weight (pd.Series | npt.ndarray | None, optional): weights of samples. If
@@ -48,7 +48,7 @@ def __init__(
             cat_unique_counts (tuple[int, ...] | None, optional): Number of categories
                 per categorical feature. Defaults to None.
         """
-        self._cat_unique_counts = () if not cat_unique_counts else cat_unique_counts
+        self._cat_unique_counts = cat_unique_counts if cat_unique_counts else ()
feature_names = [] if feature_names is None else feature_names
# infer feature names from dataframe.
if isinstance(x, pd.DataFrame):
@@ -58,7 +58,7 @@ def __init__(
), "`len('feature_names)` must match `X.shape[1]`"

# calculate cat indices
-        cat_features = [] if not cat_features else cat_features
+        cat_features = cat_features if cat_features else []
assert set(cat_features).issubset(
feature_names
), "Categorical features must be a subset of feature names."
@@ -74,9 +74,9 @@ def __init__(
]

# pd 2 np
-        x = x.values if isinstance(x, pd.DataFrame) else x
-        y = y.values if isinstance(y, pd.Series) else y
-        weight = weight.values if isinstance(weight, pd.Series) else weight
+        x = x.to_numpy() if isinstance(x, pd.DataFrame) else x
+        y = y.to_numpy() if isinstance(y, pd.Series) else y
+        weight = weight.to_numpy() if isinstance(weight, pd.Series) else weight

assert (
x.shape[0] == y.shape[0]
@@ -112,24 +112,25 @@ def __init__(
self.weight = weight

     def __len__(self) -> int:
-        """
-        Length of dataset.
+        """Length of dataset.

-        Returns:
+        Returns
+        -------
             int: length
         """
return len(self.x_cont)

     def __getitem__(
         self, idx: int
     ) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Get sample for model.
+        """Get sample for model.

         Args:
+        ----
             idx (int): index of item.

-        Returns:
+        Returns
+        -------
             Tuple[torch.Tensor | None, torch.Tensor, torch.Tensor, torch.Tensor]:
                 x_cat (if present, otherwise None), x_cont, weight and y.
         """
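
The `.values` to `.to_numpy()` rewrite above comes from the pandas-vet (PD) family: `.values` may return an ndarray or an extension array depending on dtype, while `.to_numpy()` is explicit about the result. A short illustration with hypothetical data:

import pandas as pd

df = pd.DataFrame({"price": [1.0, 2.5], "volume": [100, 200]})
features = df.to_numpy()         # PD011-clean replacement for `df.values`
target = df["price"].to_numpy()  # the same accessor works on a Series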
9 changes: 4 additions & 5 deletions src/otc/data/fs.py
@@ -1,5 +1,4 @@
"""
Gives simple access to the google cloud storage bucket.
"""Gives simple access to the google cloud storage bucket.
Instance is only created once.
"""
@@ -13,10 +12,10 @@


 def _create_environment() -> gcsfs.GCSFileSystem:
-    """
-    Implement the global object pattern to connect only once to GCS.
+    """Implement the global object pattern to connect only once to GCS.

-    Returns:
+    Returns
+    -------
         gcsfs.GCSFileSystem: Instance of GCSFileSystem.
     """
gcloud_config = str(Path(settings.GCS_CRED_FILE).expanduser().resolve())
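
The docstring names the global object pattern: a module body runs exactly once per interpreter, on first import, so a module-level instance is shared by every importer. A minimal sketch of the idea (hypothetical anonymous credentials; the repository's actual wiring may differ):

import gcsfs

def _create_environment() -> gcsfs.GCSFileSystem:
    """Build the file system handle; the module body calls this once."""
    return gcsfs.GCSFileSystem(token="anon")  # hypothetical: anonymous access

fs = _create_environment()  # later imports reuse this instance instead of reconnecting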
7 changes: 3 additions & 4 deletions src/otc/data/make_dataset.py
@@ -1,5 +1,4 @@
"""
Script to pre-process the raw data set.
"""Script to pre-process the raw data set.
See `notebooks/` for further details.
"""
@@ -16,11 +15,11 @@
@click.argument("input_filepath", type=click.Path(exists=True))
@click.argument("output_filepath", type=click.Path())
 def main(input_filepath: click.Path, output_filepath: click.Path) -> None:
-    """
-    Run data processing scripts to turn raw data from (../raw) into\
+    """Run data processing scripts to turn raw data from (../raw) into\
     cleaned data ready to be analyzed (saved in ../processed).

     Args:
+    ----
         input_filepath (click.Path): input file
         output_filepath (click.Path): output file
     """
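
Because `main` is a click command with two path arguments, it can be exercised without a shell through click's built-in test runner (the file paths here are hypothetical):

from click.testing import CliRunner

from otc.data.make_dataset import main

runner = CliRunner()
result = runner.invoke(main, ["data/raw/trades.csv", "data/processed/trades.parquet"])
print(result.exit_code, result.output)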
3 changes: 1 addition & 2 deletions src/otc/features/__init__.py
@@ -1,5 +1,4 @@
"""
Support for features.
"""Support for features.
See `readme.md` for instructions on how to run.
"""
3 changes: 1 addition & 2 deletions src/otc/features/build_features.py
@@ -1,5 +1,4 @@
"""
Defines feature sets.
"""Defines feature sets.
See notebook/3.0b-feature-engineering.ipynb for details.
"""
3 changes: 1 addition & 2 deletions src/otc/metrics/__init__.py
@@ -1,5 +1,4 @@
"""
Support for metrics.
"""Support for metrics.
See `readme.md` for instructions on how to run.
"""