feat: start with support for __dataframe__ api🆕
KarelZe committed Apr 1, 2024
1 parent 923efee commit 8efa3a4
Showing 5 changed files with 55 additions and 54 deletions.
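
The change set below teaches ClassicalClassifier to accept any dataframe implementing the dataframe interchange protocol (__dataframe__), e.g. polars, instead of requiring pandas. A minimal usage sketch, mirroring the polars test added in this commit (column names and the layer configuration are taken from that test; the data values are only illustrative):

import polars as pl
from tclf.classical_classifier import ClassicalClassifier

X = pl.DataFrame({"trade_price": [1.0, 2.0, 0.0], "price_ex_lag": [2.0, 1.0, 3.0]})
clf = ClassicalClassifier(layers=[("tick", "ex")], random_state=7)
clf.fit(X)               # y is ignored; labels come from the rules, not from training data
y_pred = clf.predict(X)  # numpy array of -1/1 labels, one per row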
1 change: 1 addition & 0 deletions pixi.toml
@@ -30,5 +30,6 @@ setuptools = "*"
# test
pytest = "*"
pytest-cov = "*"
polars = "*"
# linting
pre-commit = "*"
1 change: 0 additions & 1 deletion pyproject.toml
@@ -28,7 +28,6 @@ classifiers = [

dependencies = [
"numpy",
"pandas",
"scikit-learn"
]
dynamic = ["version"]
82 changes: 38 additions & 44 deletions src/tclf/classical_classifier.py
@@ -11,15 +11,16 @@
import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy import sparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_random_state
from sklearn.utils.validation import (
_check_sample_weight,
_get_feature_names,
check_array,
check_is_fitted,
)

from tclf.types import ArrayLike, MatrixLike

ALLOWED_FUNC_LITERALS = Literal[
"tick",
"rev_tick",
@@ -396,7 +397,8 @@ def _nan(self, subset: str) -> npt.NDArray:
Returns:
npt.NDArray: result of the trade size rule. Can be np.NaN.
"""
return np.full(shape=(self.X_.shape[0],), fill_value=np.nan)
n_samples = next(iter(self.X_.values())).shape[0]
return np.full(shape=(n_samples,), fill_value=np.nan)

def _validate_columns(self, missing_columns: list) -> None:
"""Validate if all required columns are present.
@@ -407,8 +409,9 @@ def _validate_columns(self, missing_columns: list) -> None:
Raises:
ValueError: columns missing in dataframe.
"""
columns = self.columns_ + missing_columns if self.columns_ else missing_columns
self.X_ = pd.DataFrame(np.zeros(shape=(1, len(columns))), columns=columns)
columns = self.feature_names_in_.tolist()
columns.extend(missing_columns)
self.X_ = {c: np.zeros(shape=(1, 1)) for c in columns}
try:
self._predict()
except KeyError as e:
@@ -427,15 +430,15 @@ def _validate_columns(self, missing_columns: list) -> None:

def fit(
self,
X: MatrixLike,
y: ArrayLike | None = None,
X,
y=None,
sample_weight: npt.NDArray | None = None,
) -> ClassicalClassifier:
"""Fit the classifier.
Args:
X (MatrixLike): features
y (ArrayLike | None, optional): ignored, present here for API consistency by convention.
X: features
y: ignored, present here for API consistency by convention.
sample_weight (npt.NDArray | None, optional): Sample weights. Defaults to None.
Raises:
@@ -465,30 +468,12 @@ def fit(

self.func_mapping_ = dict(zip(ALLOWED_FUNC_STR, funcs))

# create working copy to be altered and try to get columns from df
self.columns_ = self.features
if isinstance(X, pd.DataFrame):
self.columns_ = X.columns.tolist()

X = self._validate_data(
X,
y="no_validation",
dtype=[np.float64, np.float32],
accept_sparse=False,
force_all_finite=False,
)

# set feature names, if given
self._check_feature_names(X, reset=True)
X = _check_X(X)
self._check_n_features(X, reset=True)
self.classes_ = np.array([-1, 1])

# if no features are provided or inferred, use default
if self.columns_ is None:
self.columns_ = [str(i) for i in range(X.shape[1])]

if len(self.columns_) > 0 and X.shape[1] != len(self.columns_):
raise ValueError(
f"Expected {len(self.columns_)} columns, got {X.shape[1]}."
)

self._layers = self.layers if self.layers is not None else []
for func_str, _ in self._layers:
if func_str not in ALLOWED_FUNC_STR:
@@ -500,26 +485,26 @@
self._validate_columns([])
return self

def predict(self, X: MatrixLike) -> npt.NDArray:
def predict(self, X) -> npt.NDArray:
"""Perform classification on test vectors `X`.
Args:
X (MatrixLike): feature matrix.
X: feature matrix.
Returns:
npt.NDArray: Predicted target values for X.
"""
check_is_fitted(self)
X = self._validate_data(
X,
dtype=[np.float64, np.float32],
accept_sparse=False,
force_all_finite=False,
)

rs = check_random_state(self.random_state)

self.X_ = pd.DataFrame(data=X, columns=self.columns_)
# adapted from:
# https://github.com/scikit-learn/scikit-learn/blob/f07e0138b/sklearn/compose/_column_transformer.py#L900
column_names = _get_feature_names(X)
self._check_n_features(X, reset=True)
X = _check_X(X)

self.X_ = {c: X[c] for c in column_names}

pred = self._predict()

# fill NaNs randomly with -1 and 1 or with constant zero
@@ -539,7 +524,9 @@ def _predict(self) -> npt.NDArray:
Returns:
npt.NDArray: prediction
"""
pred = np.full(shape=(self.X_.shape[0],), fill_value=np.nan)
n_samples = next(iter(self.X_.values())).shape[0]
pred = np.full(shape=(n_samples,), fill_value=np.nan)

for func_str, subset in self._layers:
func = self.func_mapping_[func_str]
pred = np.where(
@@ -549,15 +536,15 @@
)
return pred

def predict_proba(self, X: MatrixLike) -> npt.NDArray:
def predict_proba(self, X) -> npt.NDArray:
"""Predict class probabilities for X.
Probabilities are either 0 or 1 depending on the class.
For strategy 'constant' probabilities are (0.5,0.5) for unclassified classes.
Args:
X (MatrixLike): feature matrix
X: feature matrix
Returns:
npt.NDArray: probabilities
@@ -578,3 +565,10 @@ def predict_proba(self, X: MatrixLike) -> npt.NDArray:
# For strategy 'constant' probabilities are (0.5,0.5).
prob[mask] = np.identity(n_classes)[indices]
return prob


def _check_X(X):
"""Use check_array only when necessary, e.g. on lists and other non-array-likes."""
if hasattr(X, "__array__") or hasattr(X, "__dataframe__") or sparse.issparse(X):
return X
return check_array(X, force_all_finite="allow-nan", dtype=object)
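
The _check_X helper passes anything through untouched that already speaks the NumPy or dataframe interchange protocol, so column labels survive for _get_feature_names in predict; only other inputs (e.g. plain lists) are funnelled through check_array. A rough illustration of that dispatch, assuming NumPy, pandas >= 1.5, and polars (all of which expose __array__ and/or __dataframe__):

import numpy as np
import pandas as pd
import polars as pl

# these all short-circuit in _check_X and keep their type (and column names)
for X in (np.zeros((2, 2)), pd.DataFrame({"a": [1, 2]}), pl.DataFrame({"a": [1, 2]})):
    assert hasattr(X, "__array__") or hasattr(X, "__dataframe__")

# a plain nested list exposes neither protocol, so _check_X would convert it
# via check_array(..., force_all_finite="allow-nan", dtype=object) into an ndarray
assert not (hasattr([[1, 2]], "__array__") or hasattr([[1, 2]], "__dataframe__"))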
9 changes: 0 additions & 9 deletions src/tclf/types.py

This file was deleted.
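
For reference on the polars test added below: assuming the tick rule assigns 1 when trade_price exceeds the lagged price of the chosen subset ("ex" -> price_ex_lag), -1 when it is lower, and NaN on ties, the three rows of the test frame yield the labels asserted there. A standalone check of that arithmetic:

import numpy as np

trade_price = np.array([1.0, 2.0, 0.0])
price_ex_lag = np.array([2.0, 1.0, 3.0])

# classical tick rule against the lagged exchange price
expected = np.where(
    trade_price > price_ex_lag, 1, np.where(trade_price < price_ex_lag, -1, np.nan)
)
print(expected)  # [-1.  1. -1.]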

16 changes: 16 additions & 0 deletions tests/test_classical_classifier.py
@@ -4,6 +4,7 @@

import numpy as np
import pandas as pd
import polars as pl
import pytest
from numpy.testing import assert_allclose
from sklearn.base import BaseEstimator
@@ -226,6 +227,21 @@ def test_override(self, x_train: pd.DataFrame) -> None:
)
assert (y_pred == y_test).all()

def test_polars(self) -> None:
"""Test polars support."""
x = pl.DataFrame({"trade_price": [1, 2, 0], "price_ex_lag": [2, 1, 3]})
y = pl.Series([-1, 1, -1])

y_pred = (
ClassicalClassifier(
layers=[("tick", "ex")],
random_state=7,
)
.fit(x)
.predict(x)
)
assert (y_pred == y).all()

def test_np_array(self, x_train: pd.DataFrame) -> None:
"""Test, if classifier works, if only np.ndarrays are provided.
