feat: start with support for __dataframe__ api🆕
KarelZe committed Apr 1, 2024
1 parent 923efee commit 8efa3a4
Showing 5 changed files with 55 additions and 54 deletions.
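
The change set below teaches ClassicalClassifier to accept any dataframe implementing the dataframe interchange protocol (__dataframe__), e.g. polars, instead of requiring pandas. A minimal usage sketch, mirroring the polars test added in this commit (column names and the layer configuration are taken from that test; the data values are only illustrative):

import polars as pl
from tclf.classical_classifier import ClassicalClassifier

X = pl.DataFrame({"trade_price": [1.0, 2.0, 0.0], "price_ex_lag": [2.0, 1.0, 3.0]})
clf = ClassicalClassifier(layers=[("tick", "ex")], random_state=7)
clf.fit(X)               # y is ignored; labels come from the rules, not from training data
y_pred = clf.predict(X)  # numpy array of -1/1 labels, one per row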
1 change: 1 addition & 0 deletions pixi.toml
@@ -30,5 +30,6 @@ setuptools = "*"
# test
pytest = "*"
pytest-cov = "*"
polars = "*"
# linting
pre-commit = "*"
1 change: 0 additions & 1 deletion pyproject.toml
@@ -28,7 +28,6 @@ classifiers = [

dependencies = [
"numpy",
"pandas",
"scikit-learn"
]
dynamic = ["version"]
82 changes: 38 additions & 44 deletions src/tclf/classical_classifier.py
@@ -11,15 +11,16 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy import sparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_random_state
from sklearn.utils.validation import (
_check_sample_weight,
_get_feature_names,
check_array,
check_is_fitted,
)

from tclf.types import ArrayLike, MatrixLike

ALLOWED_FUNC_LITERALS = Literal[
"tick",
"rev_tick",
@@ -396,7 +397,8 @@ def _nan(self, subset: str) -> npt.NDArray:
Returns:
npt.NDArray: result of the trade size rule. Can be np.NaN.
"""
return np.full(shape=(self.X_.shape[0],), fill_value=np.nan)
n_samples = next(iter(self.X_.values())).shape[0]
return np.full(shape=(n_samples,), fill_value=np.nan)

def _validate_columns(self, missing_columns: list) -> None:
"""Validate if all required columns are present.
@@ -407,8 +409,9 @@ def _validate_columns(self, missing_columns: list) -> None:
Raises:
ValueError: columns missing in dataframe.
"""
columns = self.columns_ + missing_columns if self.columns_ else missing_columns
self.X_ = pd.DataFrame(np.zeros(shape=(1, len(columns))), columns=columns)
columns = self.feature_names_in_.tolist()
columns.extend(missing_columns)
self.X_ = {c: np.zeros(shape=(1, 1)) for c in columns}
try:
self._predict()
except KeyError as e:
@@ -427,15 +430,15 @@ def _validate_columns(self, missing_columns: list) -> None:

def fit(
self,
X: MatrixLike,
y: ArrayLike | None = None,
X,
y=None,
sample_weight: npt.NDArray | None = None,
) -> ClassicalClassifier:
"""Fit the classifier.
Args:
X (MatrixLike): features
y (ArrayLike | None, optional): ignored, present here for API consistency by convention.
X: features
y: ignored, present here for API consistency by convention.
sample_weight (npt.NDArray | None, optional): Sample weights. Defaults to None.
Raises:
@@ -465,30 +468,12 @@ def fit(

self.func_mapping_ = dict(zip(ALLOWED_FUNC_STR, funcs))

# create working copy to be altered and try to get columns from df
self.columns_ = self.features
if isinstance(X, pd.DataFrame):
self.columns_ = X.columns.tolist()

X = self._validate_data(
X,
y="no_validation",
dtype=[np.float64, np.float32],
accept_sparse=False,
force_all_finite=False,
)

# set feature names, if given
self._check_feature_names(X, reset=True)
X = _check_X(X)
self._check_n_features(X, reset=True)
self.classes_ = np.array([-1, 1])

# if no features are provided or inferred, use default
if self.columns_ is None:
self.columns_ = [str(i) for i in range(X.shape[1])]

if len(self.columns_) > 0 and X.shape[1] != len(self.columns_):
raise ValueError(
f"Expected {len(self.columns_)} columns, got {X.shape[1]}."
)

self._layers = self.layers if self.layers is not None else []
for func_str, _ in self._layers:
if func_str not in ALLOWED_FUNC_STR:
@@ -500,26 +485,26 @@
self._validate_columns([])
return self

def predict(self, X: MatrixLike) -> npt.NDArray:
def predict(self, X) -> npt.NDArray:
"""Perform classification on test vectors `X`.
Args:
X (MatrixLike): feature matrix.
X: feature matrix.
Returns:
npt.NDArray: Predicted target values for X.
"""
check_is_fitted(self)
X = self._validate_data(
X,
dtype=[np.float64, np.float32],
accept_sparse=False,
force_all_finite=False,
)

rs = check_random_state(self.random_state)

self.X_ = pd.DataFrame(data=X, columns=self.columns_)
# adapted from:
# https://github.com/scikit-learn/scikit-learn/blob/f07e0138b/sklearn/compose/_column_transformer.py#L900
column_names = _get_feature_names(X)
self._check_n_features(X, reset=True)
X = _check_X(X)

self.X_ = {c: X[c] for c in column_names}

pred = self._predict()

# fill NaNs randomly with -1 and 1 or with constant zero
@@ -539,7 +524,9 @@ def _predict(self) -> npt.NDArray:
Returns:
npt.NDArray: prediction
"""
pred = np.full(shape=(self.X_.shape[0],), fill_value=np.nan)
n_samples = next(iter(self.X_.values())).shape[0]
pred = np.full(shape=(n_samples,), fill_value=np.nan)

for func_str, subset in self._layers:
func = self.func_mapping_[func_str]
pred = np.where(
@@ -549,15 +536,15 @@
)
return pred

def predict_proba(self, X: MatrixLike) -> npt.NDArray:
def predict_proba(self, X) -> npt.NDArray:
"""Predict class probabilities for X.
Probabilities are either 0 or 1 depending on the class.
For strategy 'constant' probabilities are (0.5,0.5) for unclassified classes.
Args:
X (MatrixLike): feature matrix
X: feature matrix
Returns:
npt.NDArray: probabilities
@@ -578,3 +565,10 @@ def predict_proba(self, X: MatrixLike) -> npt.NDArray:
# For strategy 'constant' probabilities are (0.5,0.5).
prob[mask] = np.identity(n_classes)[indices]
return prob


def _check_X(X):
"""Use check_array only when necessary, e.g. on lists and other non-array-likes."""
if hasattr(X, "__array__") or hasattr(X, "__dataframe__") or sparse.issparse(X):
return X
return check_array(X, force_all_finite="allow-nan", dtype=object)
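
The _check_X helper passes anything through untouched that already speaks the NumPy or dataframe interchange protocol, so column labels survive for _get_feature_names in predict; only other inputs (e.g. plain lists) are funnelled through check_array. A rough illustration of that dispatch, assuming NumPy, pandas >= 1.5, and polars (all of which expose __array__ and/or __dataframe__):

import numpy as np
import pandas as pd
import polars as pl

# these all short-circuit in _check_X and keep their type (and column names)
for X in (np.zeros((2, 2)), pd.DataFrame({"a": [1, 2]}), pl.DataFrame({"a": [1, 2]})):
    assert hasattr(X, "__array__") or hasattr(X, "__dataframe__")

# a plain nested list exposes neither protocol, so _check_X would convert it
# via check_array(..., force_all_finite="allow-nan", dtype=object) into an ndarray
assert not (hasattr([[1, 2]], "__array__") or hasattr([[1, 2]], "__dataframe__"))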
9 changes: 0 additions & 9 deletions src/tclf/types.py

This file was deleted.
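
For reference on the polars test added below: assuming the tick rule assigns 1 when trade_price exceeds the lagged price of the chosen subset ("ex" -> price_ex_lag), -1 when it is lower, and NaN on ties, the three rows of the test frame yield the labels asserted there. A standalone check of that arithmetic:

import numpy as np

trade_price = np.array([1.0, 2.0, 0.0])
price_ex_lag = np.array([2.0, 1.0, 3.0])

# classical tick rule against the lagged exchange price
expected = np.where(
    trade_price > price_ex_lag, 1, np.where(trade_price < price_ex_lag, -1, np.nan)
)
print(expected)  # [-1.  1. -1.]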

16 changes: 16 additions & 0 deletions tests/test_classical_classifier.py
@@ -4,6 +4,7 @@

import numpy as np
import pandas as pd
import polars as pl
import pytest
from numpy.testing import assert_allclose
from sklearn.base import BaseEstimator
@@ -226,6 +227,21 @@ def test_override(self, x_train: pd.DataFrame) -> None:
)
assert (y_pred == y_test).all()

def test_polars(self) -> None:
"""Test polars support."""
x = pl.DataFrame({"trade_price": [1, 2, 0], "price_ex_lag": [2, 1, 3]})
y = pl.Series([-1, 1, -1])

y_pred = (
ClassicalClassifier(
layers=[("tick", "ex")],
random_state=7,
)
.fit(x)
.predict(x)
)
assert (y_pred == y).all()

def test_np_array(self, x_train: pd.DataFrame) -> None:
"""Test, if classifier works, if only np.ndarrays are provided.
