Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Optimize Unsuprf #114

Merged
merged 17 commits into from
Aug 22, 2023
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Main](https://github.com/neurodata/scikit-tree/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/neurodata/scikit-tree/actions/workflows/main.yml)
[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
[![codecov](https://codecov.io/gh/neurodata/scikit-tree/branch/main/graph/badge.svg?token=H1reh7Qwf4)](https://codecov.io/gh/neurodata/scikit-tree)
[![PyPI Download count](https://pepy.tech/badge/scikit-tree)](https://pepy.tech/project/scikit-tree)
[![PyPI Download count](https://img.shields.io/pypi/dm/scikit-tree.svg)](https://pypistats.org/packages/scikit-tree)
[![Latest PyPI release](https://img.shields.io/pypi/v/scikit-tree.svg)](https://pypi.org/project/scikit-tree/)

scikit-tree
Expand Down
96 changes: 96 additions & 0 deletions benchmarks/bench_plot_urf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from collections import defaultdict
from time import time

import numpy as np
from numpy import random as nr

from sktree import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest


def compute_bench(samples_range, features_range):
it = 0
results = defaultdict(lambda: [])

est_params = {
"criterion": "fastbic",
}

max_it = len(samples_range) * len(features_range)
for n_samples in samples_range:
for n_features in features_range:
it += 1

print("==============================")
print("Iteration %03d of %03d" % (it, max_it))
print("==============================")
print()
print(f"n_samples: {n_samples} and n_features: {n_features}")
data = nr.randint(-50, 51, (n_samples, n_features))

print("Unsupervised RF")
tstart = time()
est = UnsupervisedRandomForest(
min_samples_split=2 * np.sqrt(n_samples).astype(int), **est_params
).fit(data)

delta = time() - tstart
max_depth = max(tree.get_depth() for tree in est.estimators_)
print("Speed: %0.3fs" % delta)
print("Max depth: %d" % max_depth)
print()

results["unsup_rf_speed"].append(delta)
results["unsup_rf_depth"].append(max_depth)

print("Unsupervised Oblique RF")
# let's prepare the data in small chunks
est = UnsupervisedObliqueRandomForest(
min_samples_split=2 * np.sqrt(n_samples).astype(int), **est_params
)
tstart = time()
est.fit(data)
delta = time() - tstart
max_depth = max(tree.get_depth() for tree in est.estimators_)
print("Speed: %0.3fs" % delta)
print("Max depth: %d" % max_depth)
print()
print()

results["unsup_obliquerf_speed"].append(delta)
results["unsup_obliquerf_depth"].append(max_depth)

return results


if __name__ == "__main__":
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d # noqa register the 3d projection

samples_range = np.linspace(50, 150, 5).astype(int)
features_range = np.linspace(150, 50000, 5).astype(int)
chunks = np.linspace(500, 10000, 15).astype(int)

results = compute_bench(samples_range, features_range)

max_time = max([max(i) for i in [t for (label, t) in results.items() if "speed" in label]])
max_inertia = max(
[max(i) for i in [t for (label, t) in results.items() if "speed" not in label]]
)

fig = plt.figure("scikit-learn Unsupervised (Oblique and Axis) RF benchmark results")
for c, (label, timings) in zip("brcy", sorted(results.items())):
if "speed" in label:
ax = fig.add_subplot(2, 1, 1, projection="3d")
ax.set_zlim3d(0.0, max_time * 1.1)
else:
ax = fig.add_subplot(2, 1, 2, projection="3d")
ax.set_zlim3d(0.0, max_inertia * 1.1)

X, Y = np.meshgrid(samples_range, features_range)
Z = np.asarray(timings).reshape(samples_range.shape[0], features_range.shape[0])
ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5)
ax.set_title(f"{label}")
ax.set_xlabel("n_samples")
ax.set_ylabel("n_features")

plt.show()
3 changes: 2 additions & 1 deletion doc/whats_new/v0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ Changelog
---------
- |Efficiency| Upgraded build process to rely on Cython 3.0+, by `Adam Li`_ (:pr:`109`)
- |Feature| Allow decision trees to take advantage of ``partial_fit`` and ``monotonic_cst`` when available, by `Adam Li`_ (:pr:`109`)

- |Efficiency| Around 1.5-2x speed improvement for unsupervised forests, by `Adam Li`_ (:pr:`114`)
- |API| Allow ``sqrt`` and ``log2`` keywords to be used for ``min_samples_split`` parameter in unsupervised forests, by `Adam Li`_ (:pr:`114`)

Code and Documentation Contributors
-----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion sktree/_lib/sklearn_fork
Submodule sklearn_fork updated 53 files
+1 −1 build_tools/azure/debian_atlas_32bit_lock.txt
+5 −6 build_tools/azure/py38_conda_defaults_openblas_linux-64_conda.lock
+21 −21 build_tools/azure/py38_conda_forge_mkl_win-64_conda.lock
+21 −21 build_tools/azure/py38_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock
+30 −29 build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock
+25 −24 build_tools/azure/pylatest_conda_forge_mkl_no_coverage_linux-64_conda.lock
+23 −23 build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock
+9 −11 build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
+12 −12 build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock
+8 −8 build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock
+8 −8 build_tools/azure/pypy3_linux-64_conda.lock
+2 −2 build_tools/azure/python_nogil_lock.txt
+1 −1 build_tools/azure/ubuntu_atlas_lock.txt
+47 −53 build_tools/circle/doc_linux-64_conda.lock
+10 −10 build_tools/circle/doc_min_dependencies_linux-64_conda.lock
+4 −0 build_tools/cirrus/arm_tests.yml
+1 −1 build_tools/cirrus/arm_wheel.yml
+1 −1 build_tools/cirrus/build_test_arm.sh
+10 −10 build_tools/cirrus/py39_conda_forge_linux-aarch64_conda.lock
+28 −1 doc/modules/array_api.rst
+1 −1 doc/whats_new/v1.3.rst
+29 −4 doc/whats_new/v1.4.rst
+13 −9 sklearn/cluster/tests/test_affinity_propagation.py
+20 −13 sklearn/cluster/tests/test_bicluster.py
+16 −15 sklearn/cluster/tests/test_bisect_k_means.py
+9 −6 sklearn/cluster/tests/test_hdbscan.py
+211 −27 sklearn/compose/_column_transformer.py
+3 −3 sklearn/compose/_target.py
+118 −0 sklearn/compose/tests/test_column_transformer.py
+2 −2 sklearn/datasets/_base.py
+7 −3 sklearn/decomposition/_base.py
+2 −2 sklearn/decomposition/_nmf.py
+19 −16 sklearn/decomposition/tests/test_nmf.py
+309 −15 sklearn/ensemble/_forest.py
+6 −2 sklearn/feature_selection/tests/test_from_model.py
+2 −8 sklearn/linear_model/_least_angle.py
+3 −4 sklearn/linear_model/_ridge.py
+36 −3 sklearn/pipeline.py
+24 −12 sklearn/preprocessing/_data.py
+27 −0 sklearn/preprocessing/tests/test_data.py
+1 −0 sklearn/svm/_classes.py
+22 −2 sklearn/tests/metadata_routing_common.py
+6 −7 sklearn/tests/test_metadata_routing.py
+68 −59 sklearn/tests/test_pipeline.py
+41 −23 sklearn/tree/_classes.py
+2 −2 sklearn/tree/tests/test_tree.py
+61 −4 sklearn/utils/_array_api.py
+1 −1 sklearn/utils/_metadata_requests.py
+11 −1 sklearn/utils/estimator_checks.py
+34 −0 sklearn/utils/fixes.py
+56 −0 sklearn/utils/tests/test_array_api.py
+3 −3 sklearn/utils/tests/test_utils.py
+2 −2 sklearn/utils/validation.py
7 changes: 5 additions & 2 deletions sktree/ensemble/_honest_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ def __init__(
self.honest_prior = honest_prior
self.tree_estimator = tree_estimator

def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, classes=None):
"""
Build a forest of trees from the training set (X, y).

Expand All @@ -397,13 +397,16 @@ def fit(self, X, y, sample_weight=None):
classification, splits are also ignored if they would result in any
single class carrying a negative weight in either child node.

classes : array-like of shape (n_classes,), default=None
List of all the classes that can possibly appear in the y vector.

Returns
-------
self : HonestForestClassifier
Fitted tree estimator.
"""
X, y = check_X_y(X, y, multi_output=True)
super().fit(X, y, sample_weight)
super().fit(X, y, sample_weight=sample_weight, classes=classes)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
Expand Down
4 changes: 2 additions & 2 deletions sktree/ensemble/_unsupervised_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def __init__(
*,
criterion="twomeans",
max_depth=None,
min_samples_split=2,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features="sqrt",
Expand Down Expand Up @@ -786,7 +786,7 @@ def __init__(
*,
criterion="twomeans",
max_depth=None,
min_samples_split=2,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features="sqrt",
Expand Down
21 changes: 16 additions & 5 deletions sktree/tests/test_supervised_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import parametrize_with_checks
from sklearn.utils.validation import check_random_state

from sktree.ensemble import (
Expand Down Expand Up @@ -177,10 +177,21 @@ def _trunk(n, p=10, random_state=None):
return X, y


@pytest.mark.parametrize("name", FOREST_ESTIMATORS)
def test_sklearn_compatible_estimator(name):
estimator = FOREST_ESTIMATORS[name](random_state=12345, n_estimators=10)
check_estimator(estimator)
@parametrize_with_checks(
[
ObliqueRandomForestClassifier(random_state=12345, n_estimators=10),
PatchObliqueRandomForestClassifier(random_state=12345, n_estimators=10),
ObliqueRandomForestRegressor(random_state=12345, n_estimators=10),
PatchObliqueRandomForestRegressor(random_state=12345, n_estimators=10),
]
)
def test_sklearn_compatible_estimator(estimator, check):
# TODO: remove when we can replicate the CI error...
if isinstance(
estimator, (ObliqueRandomForestClassifier, PatchObliqueRandomForestClassifier)
) and check.func.__name__ in ["check_fit_score_takes_y"]:
pytest.skip()
check(estimator)


def test_oblique_forest_sparse_parity():
Expand Down
25 changes: 21 additions & 4 deletions sktree/tree/_classes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import numbers
from numbers import Real

import numpy as np
Expand Down Expand Up @@ -171,7 +172,7 @@ def __init__(
criterion="twomeans",
splitter="best",
max_depth=None,
min_samples_split=5,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=None,
Expand Down Expand Up @@ -234,6 +235,22 @@ def _build_tree(
max_depth,
random_state,
):
if isinstance(self.min_samples_split, str):
if self.min_samples_split == "sqrt":
min_samples_split = max(1, int(np.sqrt(self.n_features_in_)))
elif self.min_samples_split == "log2":
min_samples_split = max(1, int(np.log2(self.n_features_in_)))
elif self.min_samples_split is None:
min_samples_split = self.n_features_in_
elif isinstance(self.min_samples_split, numbers.Integral):
min_samples_split = self.min_samples_split
else: # float
if self.min_samples_split > 0.0:
min_samples_split = max(1, int(self.min_samples_split * self.n_features_in_))
else:
min_samples_split = 0
self.min_samples_split_ = min_samples_split

criterion = self.criterion
if not isinstance(criterion, UnsupervisedCriterion):
criterion = UNSUPERVISED_CRITERIA[self.criterion]()
Expand All @@ -254,7 +271,7 @@ def _build_tree(
if max_leaf_nodes < 0:
builder = UnsupervisedDepthFirstTreeBuilder(
splitter,
min_samples_split,
self.min_samples_split_,
min_samples_leaf,
min_weight_leaf,
max_depth,
Expand All @@ -263,7 +280,7 @@ def _build_tree(
else:
builder = UnsupervisedBestFirstTreeBuilder(
splitter,
min_samples_split,
self.min_samples_split_,
min_samples_leaf,
min_weight_leaf,
max_depth,
Expand Down Expand Up @@ -459,7 +476,7 @@ def __init__(
criterion="twomeans",
splitter="best",
max_depth=None,
min_samples_split=5,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0,
max_features=None,
Expand Down
Loading
Loading