Skip to content

Commit

Permalink
[ENH] Optimize Unsuprf (#114)
Browse files Browse the repository at this point in the history
* Improve runtime of unsupervised trees/forests by 1.5-2x by removing extra for loops
* Add support for partial fit to all classification trees/forests

---------

Signed-off-by: Adam Li <[email protected]>
Co-authored-by: Haoyin Xu <[email protected]>
  • Loading branch information
adam2392 and PSSF23 authored Aug 22, 2023
1 parent 5a88341 commit fc48cef
Show file tree
Hide file tree
Showing 14 changed files with 323 additions and 155 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Main](https://github.com/neurodata/scikit-tree/actions/workflows/main.yml/badge.svg?branch=main)](https://github.com/neurodata/scikit-tree/actions/workflows/main.yml)
[![Checked with mypy](http://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
[![codecov](https://codecov.io/gh/neurodata/scikit-tree/branch/main/graph/badge.svg?token=H1reh7Qwf4)](https://codecov.io/gh/neurodata/scikit-tree)
[![PyPI Download count](https://pepy.tech/badge/scikit-tree)](https://pepy.tech/project/scikit-tree)
[![PyPI Download count](https://img.shields.io/pypi/dm/scikit-tree.svg)](https://pypistats.org/packages/scikit-tree)
[![Latest PyPI release](https://img.shields.io/pypi/v/scikit-tree.svg)](https://pypi.org/project/scikit-tree/)

scikit-tree
Expand Down
96 changes: 96 additions & 0 deletions benchmarks/bench_plot_urf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from collections import defaultdict
from time import time

import numpy as np
from numpy import random as nr

from sktree import UnsupervisedObliqueRandomForest, UnsupervisedRandomForest


def compute_bench(samples_range, features_range):
    """Benchmark unsupervised forests over a grid of dataset shapes.

    For every ``(n_samples, n_features)`` pair, a random integer matrix with
    values in ``[-50, 50]`` is generated and both ``UnsupervisedRandomForest``
    and ``UnsupervisedObliqueRandomForest`` are fit on it using the
    ``"fastbic"`` criterion and ``min_samples_split = 2 * int(sqrt(n_samples))``.
    Progress and per-fit statistics are printed to stdout.

    Parameters
    ----------
    samples_range : sequence of int
        Numbers of samples (rows) to benchmark.
    features_range : sequence of int
        Numbers of features (columns) to benchmark.

    Returns
    -------
    results : defaultdict of list
        Maps ``"unsup_rf_speed"``, ``"unsup_rf_depth"``,
        ``"unsup_obliquerf_speed"`` and ``"unsup_obliquerf_depth"`` to lists
        with one entry per grid point, ordered with ``samples_range`` as the
        outer loop and ``features_range`` as the inner loop. Speeds are fit
        durations in seconds; depths are the maximum depth over the fitted
        trees. The mapping is empty when either range is empty.
    """
    results = defaultdict(list)
    est_params = {"criterion": "fastbic"}

    max_it = len(samples_range) * len(features_range)
    it = 0
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1

            print("==============================")
            print(f"Iteration {it:03d} of {max_it:03d}")
            print("==============================")
            print()
            print(f"n_samples: {n_samples} and n_features: {n_features}")
            data = nr.randint(-50, 51, (n_samples, n_features))
            min_samples_split = 2 * np.sqrt(n_samples).astype(int)

            for title, key, forest_cls in (
                ("Unsupervised RF", "unsup_rf", UnsupervisedRandomForest),
                ("Unsupervised Oblique RF", "unsup_obliquerf", UnsupervisedObliqueRandomForest),
            ):
                print(title)
                # Build the estimator before starting the clock so that both
                # forests are measured on exactly the same work: the fit.
                est = forest_cls(min_samples_split=min_samples_split, **est_params)
                tstart = time()
                est.fit(data)
                delta = time() - tstart

                max_depth = max(tree.get_depth() for tree in est.estimators_)
                print(f"Speed: {delta:0.3f}s")
                print(f"Max depth: {max_depth:d}")
                print()

                results[f"{key}_speed"].append(delta)
                results[f"{key}_depth"].append(max_depth)

            # Extra blank line separating grid points in the log.
            print()

    return results


if __name__ == "__main__":
    # Run the benchmark over a fixed grid and plot fit time and maximum tree
    # depth as 3D surfaces over (n_samples, n_features).
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import axes3d  # noqa register the 3d projection

    samples_range = np.linspace(50, 150, 5).astype(int)
    features_range = np.linspace(150, 50000, 5).astype(int)

    results = compute_bench(samples_range, features_range)

    # Shared z-axis limits: the largest fit time and the largest tree depth
    # observed across both forests.
    max_time = max(max(values) for label, values in results.items() if "speed" in label)
    max_depth = max(max(values) for label, values in results.items() if "speed" not in label)

    fig = plt.figure("scikit-learn Unsupervised (Oblique and Axis) RF benchmark results")

    # Create each subplot exactly once so that both forests' surfaces are drawn
    # on the same axes, rather than relying on repeated ``add_subplot`` calls
    # with identical arguments handing back an existing axes.
    ax_speed = fig.add_subplot(2, 1, 1, projection="3d")
    ax_speed.set_zlim3d(0.0, max_time * 1.1)
    ax_depth = fig.add_subplot(2, 1, 2, projection="3d")
    ax_depth.set_zlim3d(0.0, max_depth * 1.1)

    # The grid and the result shape do not depend on the plotted series.
    X, Y = np.meshgrid(samples_range, features_range)
    grid_shape = (samples_range.shape[0], features_range.shape[0])

    for c, (label, timings) in zip("brcy", sorted(results.items())):
        ax = ax_speed if "speed" in label else ax_depth

        # Results were appended with n_samples as the outer loop, so rows are
        # samples and columns are features; transpose to match the meshgrid.
        Z = np.asarray(timings).reshape(grid_shape)
        ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5)
        ax.set_title(f"{label}")
        ax.set_xlabel("n_samples")
        ax.set_ylabel("n_features")

    plt.show()
3 changes: 2 additions & 1 deletion doc/whats_new/v0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ Changelog
---------
- |Efficiency| Upgraded build process to rely on Cython 3.0+, by `Adam Li`_ (:pr:`109`)
- |Feature| Allow decision trees to take advantage of ``partial_fit`` and ``monotonic_cst`` when available, by `Adam Li`_ (:pr:`109`)

- |Efficiency| Around 1.5-2x speed improvement for unsupervised forests, by `Adam Li`_ (:pr:`114`)
- |API| Allow ``sqrt`` and ``log2`` keywords to be used for the ``min_samples_split`` parameter in unsupervised forests, by `Adam Li`_ (:pr:`114`)

Code and Documentation Contributors
-----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion sktree/_lib/sklearn_fork
Submodule sklearn_fork updated 53 files
+1 −1 build_tools/azure/debian_atlas_32bit_lock.txt
+5 −6 build_tools/azure/py38_conda_defaults_openblas_linux-64_conda.lock
+21 −21 build_tools/azure/py38_conda_forge_mkl_win-64_conda.lock
+21 −21 build_tools/azure/py38_conda_forge_openblas_ubuntu_2204_linux-64_conda.lock
+30 −29 build_tools/azure/pylatest_conda_forge_mkl_linux-64_conda.lock
+25 −24 build_tools/azure/pylatest_conda_forge_mkl_no_coverage_linux-64_conda.lock
+23 −23 build_tools/azure/pylatest_conda_forge_mkl_osx-64_conda.lock
+9 −11 build_tools/azure/pylatest_conda_mkl_no_openmp_osx-64_conda.lock
+12 −12 build_tools/azure/pylatest_pip_openblas_pandas_linux-64_conda.lock
+8 −8 build_tools/azure/pylatest_pip_scipy_dev_linux-64_conda.lock
+8 −8 build_tools/azure/pypy3_linux-64_conda.lock
+2 −2 build_tools/azure/python_nogil_lock.txt
+1 −1 build_tools/azure/ubuntu_atlas_lock.txt
+47 −53 build_tools/circle/doc_linux-64_conda.lock
+10 −10 build_tools/circle/doc_min_dependencies_linux-64_conda.lock
+4 −0 build_tools/cirrus/arm_tests.yml
+1 −1 build_tools/cirrus/arm_wheel.yml
+1 −1 build_tools/cirrus/build_test_arm.sh
+10 −10 build_tools/cirrus/py39_conda_forge_linux-aarch64_conda.lock
+28 −1 doc/modules/array_api.rst
+1 −1 doc/whats_new/v1.3.rst
+29 −4 doc/whats_new/v1.4.rst
+13 −9 sklearn/cluster/tests/test_affinity_propagation.py
+20 −13 sklearn/cluster/tests/test_bicluster.py
+16 −15 sklearn/cluster/tests/test_bisect_k_means.py
+9 −6 sklearn/cluster/tests/test_hdbscan.py
+211 −27 sklearn/compose/_column_transformer.py
+3 −3 sklearn/compose/_target.py
+118 −0 sklearn/compose/tests/test_column_transformer.py
+2 −2 sklearn/datasets/_base.py
+7 −3 sklearn/decomposition/_base.py
+2 −2 sklearn/decomposition/_nmf.py
+19 −16 sklearn/decomposition/tests/test_nmf.py
+309 −15 sklearn/ensemble/_forest.py
+6 −2 sklearn/feature_selection/tests/test_from_model.py
+2 −8 sklearn/linear_model/_least_angle.py
+3 −4 sklearn/linear_model/_ridge.py
+36 −3 sklearn/pipeline.py
+24 −12 sklearn/preprocessing/_data.py
+27 −0 sklearn/preprocessing/tests/test_data.py
+1 −0 sklearn/svm/_classes.py
+22 −2 sklearn/tests/metadata_routing_common.py
+6 −7 sklearn/tests/test_metadata_routing.py
+68 −59 sklearn/tests/test_pipeline.py
+41 −23 sklearn/tree/_classes.py
+2 −2 sklearn/tree/tests/test_tree.py
+61 −4 sklearn/utils/_array_api.py
+1 −1 sklearn/utils/_metadata_requests.py
+11 −1 sklearn/utils/estimator_checks.py
+34 −0 sklearn/utils/fixes.py
+56 −0 sklearn/utils/tests/test_array_api.py
+3 −3 sklearn/utils/tests/test_utils.py
+2 −2 sklearn/utils/validation.py
7 changes: 5 additions & 2 deletions sktree/ensemble/_honest_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ def __init__(
self.honest_prior = honest_prior
self.tree_estimator = tree_estimator

def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, classes=None):
"""
Build a forest of trees from the training set (X, y).
Expand All @@ -397,13 +397,16 @@ def fit(self, X, y, sample_weight=None):
classification, splits are also ignored if they would result in any
single class carrying a negative weight in either child node.
classes : array-like of shape (n_classes,), default=None
List of all the classes that can possibly appear in the y vector.
Returns
-------
self : HonestForestClassifier
Fitted tree estimator.
"""
X, y = check_X_y(X, y, multi_output=True)
super().fit(X, y, sample_weight)
super().fit(X, y, sample_weight=sample_weight, classes=classes)

# Compute honest decision function
self.honest_decision_function_ = self._predict_proba(
Expand Down
4 changes: 2 additions & 2 deletions sktree/ensemble/_unsupervised_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def __init__(
*,
criterion="twomeans",
max_depth=None,
min_samples_split=2,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features="sqrt",
Expand Down Expand Up @@ -786,7 +786,7 @@ def __init__(
*,
criterion="twomeans",
max_depth=None,
min_samples_split=2,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features="sqrt",
Expand Down
21 changes: 16 additions & 5 deletions sktree/tests/test_supervised_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.estimator_checks import parametrize_with_checks
from sklearn.utils.validation import check_random_state

from sktree.ensemble import (
Expand Down Expand Up @@ -177,10 +177,21 @@ def _trunk(n, p=10, random_state=None):
return X, y


@pytest.mark.parametrize("name", FOREST_ESTIMATORS)
def test_sklearn_compatible_estimator(name):
estimator = FOREST_ESTIMATORS[name](random_state=12345, n_estimators=10)
check_estimator(estimator)
@parametrize_with_checks(
    [
        forest_cls(random_state=12345, n_estimators=10)
        for forest_cls in (
            ObliqueRandomForestClassifier,
            PatchObliqueRandomForestClassifier,
            ObliqueRandomForestRegressor,
            PatchObliqueRandomForestRegressor,
        )
    ]
)
def test_sklearn_compatible_estimator(estimator, check):
    """Run one scikit-learn estimator check against one oblique forest."""
    # TODO: remove when we can replicate the CI error...
    classifier_types = (ObliqueRandomForestClassifier, PatchObliqueRandomForestClassifier)
    if isinstance(estimator, classifier_types):
        if check.func.__name__ == "check_fit_score_takes_y":
            pytest.skip()
    check(estimator)


def test_oblique_forest_sparse_parity():
Expand Down
25 changes: 21 additions & 4 deletions sktree/tree/_classes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import numbers
from numbers import Real

import numpy as np
Expand Down Expand Up @@ -171,7 +172,7 @@ def __init__(
criterion="twomeans",
splitter="best",
max_depth=None,
min_samples_split=5,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=None,
Expand Down Expand Up @@ -234,6 +235,22 @@ def _build_tree(
max_depth,
random_state,
):
if isinstance(self.min_samples_split, str):
if self.min_samples_split == "sqrt":
min_samples_split = max(1, int(np.sqrt(self.n_features_in_)))
elif self.min_samples_split == "log2":
min_samples_split = max(1, int(np.log2(self.n_features_in_)))
elif self.min_samples_split is None:
min_samples_split = self.n_features_in_
elif isinstance(self.min_samples_split, numbers.Integral):
min_samples_split = self.min_samples_split
else: # float
if self.min_samples_split > 0.0:
min_samples_split = max(1, int(self.min_samples_split * self.n_features_in_))
else:
min_samples_split = 0
self.min_samples_split_ = min_samples_split

criterion = self.criterion
if not isinstance(criterion, UnsupervisedCriterion):
criterion = UNSUPERVISED_CRITERIA[self.criterion]()
Expand All @@ -254,7 +271,7 @@ def _build_tree(
if max_leaf_nodes < 0:
builder = UnsupervisedDepthFirstTreeBuilder(
splitter,
min_samples_split,
self.min_samples_split_,
min_samples_leaf,
min_weight_leaf,
max_depth,
Expand All @@ -263,7 +280,7 @@ def _build_tree(
else:
builder = UnsupervisedBestFirstTreeBuilder(
splitter,
min_samples_split,
self.min_samples_split_,
min_samples_leaf,
min_weight_leaf,
max_depth,
Expand Down Expand Up @@ -459,7 +476,7 @@ def __init__(
criterion="twomeans",
splitter="best",
max_depth=None,
min_samples_split=5,
min_samples_split="sqrt",
min_samples_leaf=1,
min_weight_fraction_leaf=0,
max_features=None,
Expand Down
Loading

0 comments on commit fc48cef

Please sign in to comment.