Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Add polars version of dummy proba regressor #447

Open
wants to merge 38 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
078a6f8
inital commit
julian-fong Aug 3, 2024
8f3dc3b
removed dummyprobaregressor from __init__ and added dummypolarsproba …
julian-fong Aug 3, 2024
c0340e1
updates
julian-fong Aug 3, 2024
99f08e5
renamed _dummy_polars to dummy_polars
julian-fong Aug 5, 2024
b4cb58f
Delete skpro/regression/_dummy_polars.py
julian-fong Aug 5, 2024
2cbb933
initial commit
julian-fong Aug 13, 2024
c94b612
intial commit
julian-fong Aug 14, 2024
17c3126
updated _convert
julian-fong Aug 14, 2024
fc46414
updated to from_pandas
julian-fong Aug 14, 2024
ae9ee7b
removed duplicative code
julian-fong Aug 14, 2024
98ea699
fixed naming convention for indices to use __index__{col_name}
julian-fong Aug 14, 2024
446e180
fixed name to only include original index name in returned dataframe
julian-fong Aug 14, 2024
c220f37
refactored current polars tests and fixed code
julian-fong Aug 15, 2024
e8cacd6
refactored lazy frames to use .collect_schema().names() to fix warning
julian-fong Aug 15, 2024
29d3f40
added conversion util for polars examples and removed commented code
julian-fong Aug 15, 2024
16e8f51
refactored check_polars_frame to ignore __index__ columns and edited …
julian-fong Aug 16, 2024
8f97233
bug fix
julian-fong Aug 16, 2024
3bf810c
updated n_features calculation
julian-fong Aug 16, 2024
2d4d2d1
added code to not include __index__ if df.index is trivial
julian-fong Aug 16, 2024
6498824
removed line
julian-fong Aug 16, 2024
98f668a
Merge branch 'polars_adapter_enhancements' into dummypolarsproba
julian-fong Aug 16, 2024
b7e0d81
introduced utils file for polars and rewrote dummypolarsprobaregressor
julian-fong Aug 17, 2024
3b6d698
added utils file for polars
julian-fong Aug 17, 2024
99a6bad
fixed import
julian-fong Aug 17, 2024
26c5d2f
added entire class as a polars dependency
julian-fong Aug 17, 2024
d227036
inital commit
julian-fong Aug 17, 2024
19ee986
Merge branch 'update_gitdiff' into dummypolarsproba
julian-fong Aug 17, 2024
05d1e19
add simple test for polars e2e
julian-fong Aug 17, 2024
7f6e052
fixed typo
julian-fong Aug 17, 2024
ca79f83
fixed typo
julian-fong Aug 17, 2024
59cce04
test updates
julian-fong Aug 18, 2024
a3c7fdc
test updates
julian-fong Aug 18, 2024
857f706
Merge branch 'main' into dummypolarsproba
julian-fong Aug 18, 2024
261cf85
Update dummy_polars.py
julian-fong Sep 5, 2024
93b9913
Merge branch 'sktime:main' into dummypolarsproba
julian-fong Sep 7, 2024
bb3cfa9
Merge branch 'main' into dummypolarsproba
julian-fong Sep 8, 2024
9bc13d5
updates
julian-fong Sep 10, 2024
6f6a5c3
removed unused import
julian-fong Sep 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/api_reference/regression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ This section lists simple regressors which can be used as baselines.

DeltaPointRegressor
DummyProbaRegressor
DummyPolarsProbaRegressor


Linear regression
Expand Down
80 changes: 80 additions & 0 deletions skpro/datatypes/tests/test_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
import polars as pl

from skpro.datatypes import check_is_mtype, convert
from skpro.datatypes._table._convert import (
convert_pandas_to_polars_eager,
convert_pandas_to_polars_lazy,
convert_polars_to_pandas,
)


TEST_ALPHAS = [0.05, 0.1, 0.25]

Expand Down Expand Up @@ -46,6 +52,14 @@ def _pd_to_pl(df):
return convert(df, from_type="pd_DataFrame_Table", to_type="polars_eager_table")


@pytest.fixture
def polars_estimator():
    """Provide a polars dummy regressor configured with the normal strategy."""
    # imported lazily so that collecting this module does not require polars
    from skpro.regression.dummy_polars import DummyPolarsProbaRegressor

    return DummyPolarsProbaRegressor(strategy="normal")


@pytest.fixture
def polars_load_diabetes_polars(polars_load_diabetes_pandas):
X_train, X_test, y_train = polars_load_diabetes_pandas
Expand Down Expand Up @@ -167,3 +181,69 @@ def test_polars_eager_regressor_in_predict_quantiles(
assert y_pred_quantile.columns[0] == ("target", 0.05)
assert y_pred_quantile.columns[1] == ("target", 0.1)
assert y_pred_quantile.columns[2] == ("target", 0.25)


@pytest.mark.skipif(
    not run_test_module_changed("skpro.datatypes")
    or not _check_soft_dependencies(["polars", "pyarrow"], severity="none"),
    reason="skip test if polars/pyarrow is not installed in environment",
)
def test_pandas_to_polars_with_index_conversion(polars_load_diabetes_pandas):
    """Check that pandas -> polars conversion writes the index as a column.

    Covers three cases: an unnamed non-trivial index (``__index__``), a named
    index (``__index__<name>``), and a frame after ``reset_index`` for which
    no ``__index__`` column is expected.
    """
    X_train, X_test, y_train = polars_load_diabetes_pandas

    X_train_pl = convert_pandas_to_polars_eager(X_train)
    assert "__index__" in X_train_pl.columns

    X_test_pl = convert_pandas_to_polars_lazy(X_test)
    # resolve the lazy schema explicitly; reading ``.columns`` on a LazyFrame
    # triggers an implicit schema resolution and a performance warning
    assert "__index__" in X_test_pl.collect_schema().names()

    # rename on a new object so the fixture data is not mutated in place
    y_train_named = y_train.rename_axis("foo")
    y_train_pl = convert_pandas_to_polars_eager(y_train_named)
    assert "__index__foo" in y_train_pl.columns

    X_train_no_index = X_train.reset_index()
    X_train_no_index_pl = convert_pandas_to_polars_eager(X_train_no_index)

    assert "__index__" not in X_train_no_index_pl.columns


@pytest.mark.skipif(
    not run_test_module_changed("skpro.datatypes")
    or not _check_soft_dependencies(["polars", "pyarrow"], severity="none"),
    reason="skip test if polars/pyarrow is not installed in environment",
)
def test_polars_to_pandas_with_index_conversion(polars_load_diabetes_pandas):
    """Check that polars -> pandas conversion restores the index column.

    Round-trips eager and lazy frames and verifies that the ``__index__*``
    column becomes the pandas index again, including the index name.
    """
    X_train, X_test, y_train = polars_load_diabetes_pandas

    X_train_pl = convert_pandas_to_polars_eager(X_train)
    X_train_ = convert_polars_to_pandas(X_train_pl)
    assert list(X_train_.index) == list(X_train_pl["__index__"].to_numpy())
    assert not X_train_.index.name

    X_test_pl = convert_pandas_to_polars_lazy(X_test)
    X_test_ = convert_polars_to_pandas(X_test_pl)
    # pull the index out as a 1-D series; converting the one-column frame with
    # ``to_numpy`` yields an (n, 1) array, and the list comparison would then
    # only pass through truthiness of one-element arrays
    expected_index = X_test_pl.select("__index__").collect()["__index__"]
    assert list(X_test_.index) == list(expected_index.to_numpy())
    assert not X_test_.index.name

    # rename on a new object so the fixture data is not mutated in place
    y_train_named = y_train.rename_axis("foo")
    y_train_pl = convert_pandas_to_polars_eager(y_train_named)
    y_train_ = convert_polars_to_pandas(y_train_pl)
    assert list(y_train_.index) == list(y_train_pl["__index__foo"].to_numpy())
    assert y_train_.index.name == "foo"


@pytest.mark.skipif(
    not run_test_module_changed("skpro.datatypes")
    or not _check_soft_dependencies(["polars", "pyarrow"], severity="none"),
    reason="skip test if polars/pyarrow is not installed in environment",
)
def test_polars_estimator_e2e(polars_estimator, polars_load_diabetes_polars):
    """Fit and predict with polars inputs end to end and check the output frame."""
    X_fit, X_new, y_fit = polars_load_diabetes_polars

    predictions = polars_estimator.fit(X_fit, y_fit).predict(X_new)

    assert isinstance(predictions, pl.DataFrame)
    assert predictions.columns == ["target"]
199 changes: 199 additions & 0 deletions skpro/regression/dummy_polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
"""Dummy time series regressor."""

__author__ = ["julian-fong"]
__all__ = ["DummyPolarsProbaRegressor"]

import numpy as np
import pandas as pd

from skpro.distributions.empirical import Empirical
from skpro.distributions.normal import Normal
from skpro.regression.base import BaseProbaRegressor
from skpro.utils.validation._dependencies import _check_soft_dependencies

if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
import polars as pl

from skpro.utils.polars import (
polars_combine_index_value_frame,
polars_split_index_values_frame,
)

class DummyPolarsProbaRegressor(BaseProbaRegressor):
    """DummyProbaRegressor makes predictions that ignore the input features.

    This regressor serves as a simple baseline to compare against other more
    complex regressors.
    The specific behavior of the baseline is selected with the ``strategy``
    parameter. This regressor works on polars data frames internally.

    All strategies make predictions that ignore the input feature values passed
    as the ``X`` argument to ``fit`` and ``predict``. The predictions, however,
    typically depend on values observed in the ``y`` parameter passed to ``fit``.

    Columns whose name contains ``__index__`` are treated as index columns:
    they are excluded from all statistics and are only used to label the rows
    of the predictions.

    Parameters
    ----------
    strategy : one of ["empirical", "normal"] default="empirical"
        Strategy to use to generate predictions.

        * "empirical": always predicts the empirical unweighted distribution
          of the training labels
        * "normal": always predicts a normal distribution, with mean and variance
          equal to the mean and variance of the training labels

    Attributes
    ----------
    distribution_ : skpro.distribution
        Normal distribution or Empirical distribution, depending on chosen strategy.
        Scalar version of the distribution that is returned by ``predict_proba``.
    """

    _tags = {
        "authors": ["julian-fong"],
        "maintainers": ["julian-fong"],
        "capability:multioutput": False,
        "capability:missing": True,
        "X_inner_mtype": "polars_eager_table",
        "y_inner_mtype": "polars_eager_table",
    }

    def __init__(self, strategy="empirical"):
        self.strategy = strategy
        super().__init__()

    @staticmethod
    def _value_column_names(columns):
        """Return the names in ``columns`` that are not ``__index__`` columns."""
        return [col for col in columns if "__index__" not in col]

    @staticmethod
    def _row_labels(X):
        """Return row labels for ``X`` as a list.

        Uses the first ``__index__*`` column if one is present, whatever its
        suffix; otherwise falls back to the positions ``0 .. n_rows - 1``.
        """
        index_cols = [col for col in X.columns if "__index__" in col]
        if not index_cols:
            return list(range(X.shape[0]))
        return list(X[index_cols[0]].to_numpy())

    def _constant_prediction_frame(self, X, value):
        """Build a polars frame with the index columns of ``X`` and ``value``.

        The value column is named after the non-index column of ``y`` seen in
        ``fit`` and holds ``value`` in every row.
        """
        pl_index, _ = polars_split_index_values_frame(X)
        y_pred = pl.DataFrame(np.ones(X.shape[0]) * value)
        y_pred.columns = self._value_column_names(self._y_columns)
        return polars_combine_index_value_frame(pl_index, y_pred)

    def _fit(self, X, y):
        """Fit the dummy regressor.

        Writes to self:
            Sets fitted model attributes ending in "_".

        Parameters
        ----------
        X : polars DataFrame
            feature instances to fit regressor to, ignored by this estimator
        y : polars DataFrame, must be same length as X
            labels to fit regressor to

        Returns
        -------
        self : reference to self
        """
        self._y = y
        self._y_columns = y.columns

        # statistics are computed on the value columns only, the index
        # columns carry row labels and must not enter mean or std
        _, pl_values = polars_split_index_values_frame(y)
        y_values = pl_values.to_numpy()
        self._mu = np.mean(y_values)
        self._sigma = np.std(y_values)

        # distribution objects are written in pandas dataframes
        if self.strategy == "empirical":
            self.distribution_ = Empirical(pl_values.to_pandas())
        if self.strategy == "normal":
            self.distribution_ = Normal(self._mu, self._sigma)

        return self

    def _predict(self, X):
        """Predict labels for data from features.

        Parameters
        ----------
        X : polars DataFrame
            data to predict labels for, only its index columns and length are used

        Returns
        -------
        y : polars DataFrame
            predictions of target values for X, constant and equal to the
            mean of the training labels
        """
        return self._constant_prediction_frame(X, self._mu)

    def _predict_var(self, X):
        """Compute/return variance predictions.

        private _predict_var containing the core logic, called from predict_var

        Parameters
        ----------
        X : polars DataFrame
            data to predict labels for, only its index columns and length are used

        Returns
        -------
        pred_var : pandas DataFrame
            Converted from the polars result until the base class handles
            polars output for this method.
            Entries are variance predictions, constant and equal to the
            variance of the training labels.
        """
        # _sigma is a standard deviation, the variance is its square
        y_pred = self._constant_prediction_frame(X, self._sigma**2)

        # TODO - remove after boilerplate is fixed
        from skpro.datatypes._adapter.polars import (
            convert_polars_to_pandas_with_index,
        )

        y_pred = convert_polars_to_pandas_with_index(y_pred)
        return y_pred

    def _predict_proba(self, X):
        """Broadcast skpro distribution from fit onto labels from X.

        Parameters
        ----------
        X : polars DataFrame
            data to predict labels for, only its index columns and length are used

        Returns
        -------
        y : skpro.distribution, same length as `X`
            labels predicted for `X`

        Raises
        ------
        ValueError
            if ``strategy`` is neither "normal" nor "empirical"
        """
        X_ind = self._row_labels(X)
        X_n_rows = X.shape[0]

        if self.strategy == "normal":
            y_cols = self._value_column_names(self._y_columns)
            # broadcast the mu and sigma from fit to the length of X
            mu = np.reshape((np.ones(X_n_rows) * self._mu), (-1, 1))
            sigma = np.reshape((np.ones(X_n_rows) * self._sigma), (-1, 1))
            return Normal(mu=mu, sigma=sigma, index=X_ind, columns=y_cols)

        if self.strategy == "empirical":
            _, pl_values = polars_split_index_values_frame(self._y)
            pd_values = pl_values.to_pandas()
            # one copy of the training sample per row of X, keyed by row label
            empr_df = pd.concat([pd_values] * X_n_rows, keys=X_ind).swaplevel()
            return Empirical(empr_df, index=X_ind, columns=pd_values.columns)

        raise ValueError(
            'strategy must be one of "empirical" or "normal", '
            f"but found {self.strategy!r}"
        )

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"`
            set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance,
            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a
            valid test instance. `create_test_instance` uses the first
            (or only) dictionary in `params`
        """
        params1 = {}
        params2 = {"strategy": "normal"}

        return [params1, params2]
2 changes: 0 additions & 2 deletions skpro/regression/linear/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Linear regression models."""
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)

from skpro.regression.dummy import DummyProbaRegressor
from skpro.regression.linear._glm import GLMRegressor
from skpro.regression.linear._sklearn import ARDRegression, BayesianRidge
from skpro.regression.linear._sklearn_poisson import PoissonRegressor
Expand All @@ -11,5 +10,4 @@
"BayesianRidge",
"GLMRegressor",
"PoissonRegressor",
"DummyProbaRegressor",
]
78 changes: 78 additions & 0 deletions skpro/utils/polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Utility file for polars dataframes."""
from skpro.utils.validation._dependencies import _check_soft_dependencies

if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
import polars as pl

def polars_split_index_values_frame(obj):
    """Separate a polars frame into its index columns and its value columns.

    A column counts as an index column when its name contains ``__index__``;
    every other column counts as a value column. For example, a frame with
    the columns ``__index__`` and ``target`` is split into one frame holding
    only ``__index__`` and one frame holding only ``target``. Column order
    within each result follows the order in ``obj``.

    Parameters
    ----------
    obj : polars DataFrame
        frame whose columns follow the ``__index__`` naming convention

    Returns
    -------
    polars_index_frame : polars DataFrame
        polars frame containing only the index columns

    polars_values_frame : polars DataFrame
        polars frame containing only the value columns
    """
    index_names = []
    value_names = []

    # single pass over the column names, routing each one to its group
    for name in obj.columns:
        target = index_names if "__index__" in name else value_names
        target.append(name)

    index_part = obj.select(index_names)
    value_part = obj.select(value_names)
    return index_part, value_part

def polars_combine_index_value_frame(polars_index_frame, polars_values_frame):
    """Join an index frame and a value frame side by side into one frame.

    Parameters
    ----------
    polars_index_frame : polars DataFrame
        polars frame containing only the index columns

    polars_values_frame : polars DataFrame
        polars frame containing only the value columns

    Returns
    -------
    obj : polars DataFrame
        polars DataFrame with the index columns first, then the value columns
    """
    frames = [polars_index_frame, polars_values_frame]
    return pl.concat(frames, how="horizontal")
Loading