diff --git a/.circleci/config.yml b/.circleci/config.yml
index 6d76eb045..8f5c98550 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -84,6 +84,7 @@ jobs:
             pip install --upgrade pip spin
             spin setup-submodule
             pip install .[build,doc]
+            pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force-reinstall

       - run:
           name: build treeple
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index e363b70a0..492f70850 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -84,6 +84,11 @@ jobs:
           pip install -r build_requirements.txt
           pip install -r test_requirements.txt

+      - name: Install nightly wheels for scikit-learn (only for ubuntu-latest, Python 3.12)
+        if: matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest'
+        run: |
+          pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force-reinstall
+
       - name: Prepare compiler cache
         id: prep-ccache
         shell: bash
@@ -184,6 +189,7 @@
           pip install compilers
           pip install -r build_requirements.txt
           pip install -r test_requirements.txt
+          pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force-reinstall

       - name: Prepare compiler cache
         id: prep-ccache
@@ -279,6 +285,7 @@
           pip install spin
           pip install -r build_requirements.txt
           pip install -r test_requirements.txt
+          pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force-reinstall

       - name: Build
         run: |
diff --git a/DEVELOPING.md b/DEVELOPING.md
index 7e5bdcc1a..9340552f9 100644
--- a/DEVELOPING.md
+++ b/DEVELOPING.md
@@ -18,7 +18,7 @@
 - Python 3.9+
 - numpy>=1.25.0
 - scipy>=1.5.0
-- scikit-learn>=1.4.1
+- scikit-learn>=1.5.0

 For the other requirements, inspect the ``pyproject.toml`` file.
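Note: the CI steps above force-install a nightly scikit-learn wheel on top of the release build, presumably so treeple compiles against upcoming scikit-learn tree changes (e.g. the new `_partitioner` tree module mirrored in `treeple/_lib/meson.build` below). A quick sanity check along these lines (a minimal sketch, not part of this diff; `packaging` is already a build requirement in `pyproject.toml`) can confirm which build the environment actually picked up:

```python
# Illustrative only, not part of this diff: verify that the nightly-wheel
# install step left a scikit-learn that satisfies the new >=1.5.0 floor,
# and report where it was imported from.
import sklearn
from packaging.version import Version

print(f"scikit-learn {sklearn.__version__} ({sklearn.__file__})")
assert Version(sklearn.__version__) >= Version("1.5.0.dev0"), (
    "treeple now requires scikit-learn>=1.5.0; re-run the nightly-wheel install step"
)
# Nightly wheels carry a .devN suffix (e.g. 1.6.dev0); release wheels do not.
print("nightly build" if Version(sklearn.__version__).is_devrelease else "release build")
```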
diff --git a/README.md b/README.md index 0a21ebfe4..54b061edd 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ We minimally require: * Python (>=3.9) * numpy * scipy - * scikit-learn >= 1.3 + * scikit-learn Installation with Pip () ------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index bc2149219..596d2408b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ requires = [ "setuptools<=65.5", "packaging", "Cython>=3.0.10", - "scikit-learn>=1.4.1", + "scikit-learn>=1.5.0", "scipy>=1.5.0", "numpy>=1.25; python_version>='3.9'" ] @@ -52,7 +52,7 @@ include = [ dependencies = [ 'numpy>=1.25.0', 'scipy>=1.5.0', - 'scikit-learn>=1.4.1' + 'scikit-learn>=1.5.0' ] [project.optional-dependencies] diff --git a/treeple/_lib/meson.build b/treeple/_lib/meson.build index 20f2d3bf2..5dd37c868 100644 --- a/treeple/_lib/meson.build +++ b/treeple/_lib/meson.build @@ -8,6 +8,9 @@ tree_extension_metadata = { '_tree': {'sources': ['./sklearn/tree/' + '_tree.pyx'], 'override_options': ['cython_language=cpp', 'optimization=3']}, + '_partitioner': + {'sources': ['./sklearn/tree/' + '_partitioner.pyx'], + 'override_options': ['cython_language=cpp', 'optimization=3']}, '_splitter': {'sources': ['./sklearn/tree/' + '_splitter.pyx'], 'override_options': ['cython_language=cpp', 'optimization=3']}, diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index d455aa16e..b201fcb94 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit d455aa16ee9cc42ce342dd07d9b94db117783fcc +Subproject commit b201fcb945fa54979a93bf8cc11f88c17d4b8fc9 diff --git a/treeple/stats/__init__.py b/treeple/stats/__init__.py index 8ca7a11ea..a17666b89 100644 --- a/treeple/stats/__init__.py +++ b/treeple/stats/__init__.py @@ -1,14 +1,8 @@ -from .forestht import ( - build_coleman_forest, - build_cv_forest, - build_oob_forest, - build_permutation_forest, -) -from .monte_carlo import PermutationTest +from .baseline import build_cv_forest, build_permutation_forest +from .forest import build_coleman_forest, build_oob_forest from .permuteforest import PermutationHonestForestClassifier __all__ = [ - "PermutationTest", "build_cv_forest", "build_oob_forest", "build_coleman_forest", diff --git a/treeple/stats/forestht.py b/treeple/stats/baseline.py similarity index 52% rename from treeple/stats/forestht.py rename to treeple/stats/baseline.py index b71081806..ca0c0f580 100644 --- a/treeple/stats/forestht.py +++ b/treeple/stats/baseline.py @@ -1,5 +1,3 @@ -import threading -from collections import namedtuple from typing import Callable import numpy as np @@ -8,12 +6,11 @@ from sklearn.base import clone from sklearn.ensemble._base import _partition_estimators from sklearn.model_selection import StratifiedKFold, train_test_split -from sklearn.utils.multiclass import type_of_target -from .._lib.sklearn.ensemble._forest import ForestClassifier from ..tree._classes import DTYPE +from .forest import ForestTestResult, build_oob_forest from .permuteforest import PermutationHonestForestClassifier -from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman +from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS def _parallel_predict_proba(predict_proba, X, indices_test): @@ -28,156 +25,6 @@ def _parallel_predict_proba(predict_proba, X, indices_test): return prediction -def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock): - """ - This is a utility function for joblib's Parallel. 
- It can't go locally in ForestClassifier or ForestRegressor, because joblib - complains that it cannot pickle it when placed there. - """ - # each tree predicts proba with a list of output (n_samples, n_classes[i]) - prediction = predict_proba(X, check_input=False) - - indices = np.zeros(X.shape[0], dtype=bool) - indices[test_idx] = True - with lock: - out[idx, test_idx, :] = prediction[test_idx, :] - return prediction - - -ForestTestResult = namedtuple( - "ForestTestResult", - ["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"], -) - - -def build_coleman_forest( - est, - perm_est, - X, - y, - covariate_index=None, - metric="s@98", - n_repeats=10_000, - verbose=False, - seed=None, - return_posteriors=True, - **metric_kwargs, -): - """Build a hypothesis testing forest using a two-forest approach. - - The two-forest approach stems from the Coleman et al. 2022 paper, where - two forests are trained: one on the original dataset, and one on the - permuted dataset. The dataset is either permuted once, or independently for - each tree in the permuted forest. The original test statistic is computed by - comparing the metric on both forests ``(metric_forest - metric_perm_forest)``. - For full details, see :footcite:`coleman2022scalable`. - - Parameters - ---------- - est : Forest - The type of forest to use. Must be enabled with ``bootstrap=True``. - perm_est : Forest - The forest to use for the permuted dataset. - X : ArrayLike of shape (n_samples, n_features) - Data. - y : ArrayLike of shape (n_samples, n_outputs) - Binary target, so ``n_outputs`` should be at most 1. - covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None, which - defaults to all covariates. - metric : str, optional - The metric to compute, by default "s@98", for sensitivity at - 98% specificity. - n_repeats : int, optional - Number of times to bootstrap sample the two forests to construct - the null distribution, by default 10000. The construction of the - null forests will be parallelized according to the ``n_jobs`` - argument of the ``est`` forest. - verbose : bool, optional - Verbosity, by default False. - seed : int, optional - Random seed, by default None. - return_posteriors : bool, optional - Whether or not to return the posteriors, by default True. - **metric_kwargs : dict, optional - Additional keyword arguments to pass to the metric function. - - Returns - ------- - observe_stat : float - The test statistic. To compute the test statistic, take - ``permute_stat_`` and subtract ``observe_stat_``. - pvalue : float - The p-value of the test statistic. - orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs) - The predicted posterior probabilities for each estimator on their - out of bag samples. - perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs) - The predicted posterior probabilities for each of the permuted estimators - on their out of bag samples. - null_dist : ArrayLike of shape (n_repeats,) - The null statistic differences from permuted forests. - - References - ---------- - .. 
footbibliography:: - """ - metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric] - - # build two sets of forests - est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose) - - if not isinstance(perm_est, PermutationHonestForestClassifier): - raise RuntimeError( - f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}" - ) - perm_est, perm_forest_proba = build_oob_forest( - perm_est, X, y, verbose=verbose, covariate_index=covariate_index - ) - - # get the number of jobs - n_jobs = est.n_jobs - - if y.ndim == 1: - y = y.reshape(-1, 1) - metric_star, metric_star_pi = _compute_null_distribution_coleman( - y, - orig_forest_proba, - perm_forest_proba, - metric, - n_repeats=n_repeats, - seed=seed, - n_jobs=n_jobs, - **metric_kwargs, - ) - - y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0) - y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0) - observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs) - permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs) - - # metric^\pi - metric = observed test statistic, which under the - # null is normally distributed around 0 - observe_test_stat = observe_stat - permute_stat - - # metric^\pi_j - metric_j, which is centered at 0 - null_dist = metric_star_pi - metric_star - - # compute pvalue - if metric in POSITIVE_METRICS: - pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats) - else: - pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats) - - forest_result = ForestTestResult( - observe_test_stat, permute_stat, observe_stat, pvalue, null_dist - ) - if return_posteriors: - return forest_result, orig_forest_proba, perm_forest_proba, est, perm_est - else: - return forest_result - - def build_permutation_forest( est, perm_est, @@ -300,79 +147,6 @@ def build_permutation_forest( return forest_result -def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs): - """Build a hypothesis testing forest using oob samples. - - Parameters - ---------- - est : Forest - The type of forest to use. Must be enabled with ``bootstrap=True``. - The forest should have either ``oob_samples_`` or ``estimators_samples_`` - property defined, which will be used to compute the out of bag samples - per tree. - X : ArrayLike of shape (n_samples, n_features) - Data. - y : ArrayLike of shape (n_samples, n_outputs) - Binary target, so ``n_outputs`` should be at most 1. - verbose : bool, optional - Verbosity, by default False. - **est_kwargs : dict, optional - Additional keyword arguments to pass to the forest estimator ``fit`` function. - - Returns - ------- - est : Forest - Fitted forest. - all_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs) - The predicted posterior probabilities for each estimator on their - out of bag samples. - """ - assert est.bootstrap - assert type_of_target(y) in ("binary") - est = clone(est) - - # build forest - est.fit(X, y.ravel(), **est_kwargs) - - # now evaluate - X = est._validate_X_predict(X) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. 
- if est.max_bins is not None: - X = est._bin_data(X, is_training_data=False).astype(DTYPE) - - # Assign chunk of trees to jobs - n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs) - - # avoid storing the output of every estimator by summing them here - lock = threading.Lock() - # accumulate the predictions across all trees - all_proba = np.full( - (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64 - ) - if hasattr(est, "oob_samples_"): - Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")( - delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock) - for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_)) - ) - else: - inbag_samples = est.estimators_samples_ - all_samples = np.arange(X.shape[0]) - oob_samples_list = [ - np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples)) - ] - Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")( - delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock) - for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list)) - ) - - return est, all_proba - - def build_cv_forest( est, X, diff --git a/treeple/stats/forest.py b/treeple/stats/forest.py new file mode 100644 index 000000000..5b5b0b056 --- /dev/null +++ b/treeple/stats/forest.py @@ -0,0 +1,238 @@ +import threading +from collections import namedtuple +from typing import Callable + +import numpy as np +from joblib import Parallel, delayed +from numpy.typing import ArrayLike +from sklearn.base import clone +from sklearn.ensemble._base import _partition_estimators +from sklearn.utils.multiclass import type_of_target + +from .._lib.sklearn.ensemble._forest import ForestClassifier +from ..tree._classes import DTYPE +from .permuteforest import PermutationHonestForestClassifier +from .utils import METRIC_FUNCTIONS, POSITIVE_METRICS, _compute_null_distribution_coleman + + +def _parallel_predict_proba_oob(predict_proba, X, out, idx, test_idx, lock): + """ + This is a utility function for joblib's Parallel. + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + # each tree predicts proba with a list of output (n_samples, n_classes[i]) + prediction = predict_proba(X, check_input=False) + + indices = np.zeros(X.shape[0], dtype=bool) + indices[test_idx] = True + with lock: + out[idx, test_idx, :] = prediction[test_idx, :] + return prediction + + +ForestTestResult = namedtuple( + "ForestTestResult", + ["observe_test_stat", "permuted_stat", "observe_stat", "pvalue", "null_dist"], +) + + +def build_coleman_forest( + est, + perm_est, + X, + y, + covariate_index=None, + metric="s@98", + n_repeats=10_000, + verbose=False, + seed=None, + return_posteriors=True, + **metric_kwargs, +): + """Build a hypothesis testing forest using a two-forest approach. + + The two-forest approach stems from the Coleman et al. 2022 paper, where + two forests are trained: one on the original dataset, and one on the + permuted dataset. The dataset is either permuted once, or independently for + each tree in the permuted forest. The original test statistic is computed by + comparing the metric on both forests ``(metric_forest - metric_perm_forest)``. + For full details, see :footcite:`coleman2022scalable`. + + Parameters + ---------- + est : Forest + The type of forest to use. Must be enabled with ``bootstrap=True``. 
perm_est : Forest
+        The forest to use for the permuted dataset.
+    X : ArrayLike of shape (n_samples, n_features)
+        Data.
+    y : ArrayLike of shape (n_samples, n_outputs)
+        Binary target, so ``n_outputs`` should be at most 1.
+    covariate_index : ArrayLike of shape (n_covariates,), optional
+        The index array of covariates to shuffle, by default None, which
+        defaults to all covariates.
+    metric : str, optional
+        The metric to compute, by default "s@98", for sensitivity at
+        98% specificity.
+    n_repeats : int, optional
+        Number of times to bootstrap sample the two forests to construct
+        the null distribution, by default 10000. The construction of the
+        null forests will be parallelized according to the ``n_jobs``
+        argument of the ``est`` forest.
+    verbose : bool, optional
+        Verbosity, by default False.
+    seed : int, optional
+        Random seed, by default None.
+    return_posteriors : bool, optional
+        Whether or not to return the posteriors, by default True.
+    **metric_kwargs : dict, optional
+        Additional keyword arguments to pass to the metric function.
+
+    Returns
+    -------
+    forest_result : ForestTestResult
+        Named tuple containing the observed test statistic
+        (``observe_stat - permute_stat``), the metric on the permuted forest
+        (``permuted_stat``), the metric on the original forest
+        (``observe_stat``), the p-value, and the null distribution of
+        statistic differences of shape (n_repeats,).
+    orig_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
+        The predicted posterior probabilities for each estimator on their
+        out of bag samples. Only returned if ``return_posteriors=True``.
+    perm_forest_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
+        The predicted posterior probabilities for each of the permuted estimators
+        on their out of bag samples. Only returned if ``return_posteriors=True``.
+    est : Forest
+        The forest fitted on the original dataset. Only returned if
+        ``return_posteriors=True``.
+    perm_est : Forest
+        The forest fitted on the permuted dataset. Only returned if
+        ``return_posteriors=True``.
+
+    References
+    ----------
+    .. footbibliography::
+    """
+    metric_func: Callable[[ArrayLike, ArrayLike], float] = METRIC_FUNCTIONS[metric]
+
+    # build two sets of forests
+    est, orig_forest_proba = build_oob_forest(est, X, y, verbose=verbose)
+
+    if not isinstance(perm_est, PermutationHonestForestClassifier):
+        raise RuntimeError(
+            f"Permutation forest must be a PermutationHonestForestClassifier, got {type(perm_est)}"
+        )
+    perm_est, perm_forest_proba = build_oob_forest(
+        perm_est, X, y, verbose=verbose, covariate_index=covariate_index
+    )
+
+    # get the number of jobs
+    n_jobs = est.n_jobs
+
+    if y.ndim == 1:
+        y = y.reshape(-1, 1)
+    metric_star, metric_star_pi = _compute_null_distribution_coleman(
+        y,
+        orig_forest_proba,
+        perm_forest_proba,
+        metric,
+        n_repeats=n_repeats,
+        seed=seed,
+        n_jobs=n_jobs,
+        **metric_kwargs,
+    )
+
+    y_pred_proba_orig = np.nanmean(orig_forest_proba, axis=0)
+    y_pred_proba_perm = np.nanmean(perm_forest_proba, axis=0)
+    observe_stat = metric_func(y, y_pred_proba_orig, **metric_kwargs)
+    permute_stat = metric_func(y, y_pred_proba_perm, **metric_kwargs)
+
+    # metric - metric^\pi = observed test statistic, which under the
+    # null is centered around 0
+    observe_test_stat = observe_stat - permute_stat
+
+    # metric^\pi_j - metric_j, which is centered at 0
+    null_dist = metric_star_pi - metric_star
+
+    # compute pvalue
+    if metric in POSITIVE_METRICS:
+        pvalue = (1 + (null_dist >= observe_test_stat).sum()) / (1 + n_repeats)
+    else:
+        pvalue = (1 + (null_dist <= observe_test_stat).sum()) / (1 + n_repeats)
+
+    forest_result = ForestTestResult(
+        observe_test_stat, permute_stat, observe_stat, pvalue, null_dist
+    )
+    if return_posteriors:
+        return forest_result, orig_forest_proba, perm_forest_proba, est, perm_est
+    else:
+        return forest_result
+
+
+def build_oob_forest(est: ForestClassifier, X, y, verbose=False, **est_kwargs):
+    """Build a hypothesis testing forest using oob samples.
+
+    Parameters
+    ----------
+    est : Forest
+        The type of forest to use. Must be enabled with ``bootstrap=True``.
+        The forest should have either ``oob_samples_`` or ``estimators_samples_``
+        property defined, which will be used to compute the out of bag samples
+        per tree.
+    X : ArrayLike of shape (n_samples, n_features)
+        Data.
+    y : ArrayLike of shape (n_samples, n_outputs)
+        Binary target, so ``n_outputs`` should be at most 1.
+    verbose : bool, optional
+        Verbosity, by default False.
+    **est_kwargs : dict, optional
+        Additional keyword arguments to pass to the forest estimator ``fit`` function.
+
+    Returns
+    -------
+    est : Forest
+        Fitted forest.
+    all_proba : ArrayLike of shape (n_estimators, n_samples, n_outputs)
+        The predicted posterior probabilities for each estimator on their
+        out of bag samples.
+    """
+    assert est.bootstrap
+    assert type_of_target(y) in ("binary",)
+    est = clone(est)
+
+    # build forest
+    est.fit(X, y.ravel(), **est_kwargs)
+
+    # now evaluate
+    X = est._validate_X_predict(X)
+
+    # if we trained a binning tree, then we should re-bin the data
+    # XXX: this is inefficient and should be improved to be in line with what
+    # the Histogram Gradient Boosting Tree does, where the binning thresholds
+    # are passed into the tree itself, thus allowing us to set the node feature
+    # value thresholds within the tree itself.
+    if est.max_bins is not None:
+        X = est._bin_data(X, is_training_data=False).astype(DTYPE)
+
+    # Assign chunk of trees to jobs
+    n_jobs, _, _ = _partition_estimators(est.n_estimators, est.n_jobs)
+
+    # a lock guards concurrent writes into the shared output array
+    lock = threading.Lock()
+    # pre-allocate storage for each tree's out-of-bag predictions
+    all_proba = np.full(
+        (len(est.estimators_), X.shape[0], est.n_classes_), np.nan, dtype=np.float64
+    )
+    if hasattr(est, "oob_samples_"):
+        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
+            delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
+            for idx, (e, test_idx) in enumerate(zip(est.estimators_, est.oob_samples_))
+        )
+    else:
+        inbag_samples = est.estimators_samples_
+        all_samples = np.arange(X.shape[0])
+        oob_samples_list = [
+            np.setdiff1d(all_samples, inbag_samples[i]) for i in range(len(inbag_samples))
+        ]
+        Parallel(n_jobs=n_jobs, verbose=verbose, require="sharedmem")(
+            delayed(_parallel_predict_proba_oob)(e.predict_proba, X, all_proba, idx, test_idx, lock)
+            for idx, (e, test_idx) in enumerate(zip(est.estimators_, oob_samples_list))
+        )
+
+    return est, all_proba
diff --git a/treeple/stats/meson.build b/treeple/stats/meson.build
index 2ef4db2db..587817547 100644
--- a/treeple/stats/meson.build
+++ b/treeple/stats/meson.build
@@ -1,9 +1,9 @@
 python_sources = [
     '__init__.py',
-    'forestht.py',
+    'forest.py',
     'utils.py',
-    'monte_carlo.py',
     'permuteforest.py',
+    'baseline.py',
 ]

 py.install_sources(
diff --git a/treeple/stats/monte_carlo.py b/treeple/stats/monte_carlo.py
deleted file mode 100644
index 0b6c6dbfa..000000000
--- a/treeple/stats/monte_carlo.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import numpy as np
-from numpy.typing import ArrayLike
-from sklearn.base import MetaEstimatorMixin, clone
-from sklearn.metrics import roc_auc_score
-from sklearn.model_selection import train_test_split
-from sklearn.utils.validation import check_X_y
-
-from .utils import _compute_null_distribution_perm
-
-
-# XXX: This is an experimental 
abstraction for hypothesis testing using -# purely monte-carlo methods for any scikit-learn estimator that supports -# the `predict_proba` method. -class PermutationTest(MetaEstimatorMixin): - def __init__(self, estimator, n_repeats=100, test_size=0.2, random_state=None) -> None: - """Permutation tester for hypothesis testing. - - This approaches the problem of conditional hypothesis testing using permutations - of the covariate of interest. The test statistic is computed on the original data, - and then on the permuted data. The p-value is the proportion of times the test - statistic on the permuted data is more extreme than the test statistic on the - original data. - - For example, one can use kNN as the estimator, or a logistic regression model. - - Parameters - ---------- - estimator : _type_ - _description_ - n_repeats : int, optional - _description_, by default 100 - test_size : float, optional - _description_, by default 0.2 - random_state : _type_, optional - _description_, by default None - """ - self.estimator = estimator - self.n_repeats = n_repeats - self.test_size = test_size - self.random_state = random_state - - @property - def train_test_samples_(self): - """ - The subset of drawn samples for each base estimator. - - Returns a dynamically generated list of indices identifying - the samples used for fitting each member of the ensemble, i.e., - the in-bag samples. - - Note: the list is re-created at each call to the property in order - to reduce the object memory footprint by not storing the sampling - data. Thus fetching the property may be slower than expected. - """ - indices = np.arange(self._n_samples_, dtype=int) - - # Get drawn indices along both sample and feature axes - indices_train, indices_test = train_test_split( - indices, test_size=self.test_size, shuffle=True, random_state=self.random_state - ) - return indices_train, indices_test - - def test( - self, - X: ArrayLike, - y: ArrayLike, - covariate_index: ArrayLike, - metric: str = "mse", - n_repeats: int = 1000, - return_posteriors: bool = False, - **metric_kwargs, - ): - """Perform hypothesis test using permutation testing. - - Parameters - ---------- - X : ArrayLike of shape (n_samples, n_features) - The data matrix. - y : ArrayLike of shape (n_samples, n_outputs) - The target matrix. - covariate_index : ArrayLike of shape (n_covariates,) - The covariate indices of ``X`` to shuffle. - metric : str, optional - Metric to compute, by default "mse". - n_repeats : int, optional - Number of times to sample the null distribution, by default 1000. - return_posteriors : bool, optional - Whether or not to return the posteriors, by default False. - **metric_kwargs : dict, optional - Keyword arguments to pass to the metric function. - - Returns - ------- - observe_stat : float - Observed test statistic. - pvalue : float - Pvalue of the test. 
- """ - X, y = check_X_y(X, y, ensure_2d=True, copy=True, multi_output=True) - if y.ndim != 2: - y = y.reshape(-1, 1) - self._n_samples_ = X.shape[0] - - self.estimator_ = clone(self.estimator) - - indices_train, indices_test = self.train_test_samples_ - - # train/test split - self.estimator_.fit(X[indices_train, :], y[indices_train, :]) - posterior = self.estimator_.predict_proba(X[indices_test, :]) - pauc = roc_auc_score(y[indices_test, :], posterior[:, 1], max_fpr=0.1) - self.observe_stat_ = pauc - - # compute null distribution of the test statistic - # WARNING: this could take a long time, since it fits a new forest - null_dist = _compute_null_distribution_perm( - X_train=X[indices_train, :], - y_train=y[indices_train, :], - X_test=X[indices_test, :], - y_test=y[indices_test, :], - covariate_index=covariate_index, - est=self.estimator_, - metric=metric, - n_repeats=n_repeats, - seed=self.random_state, - ) - - if not return_posteriors: - self.null_dist_ = np.array(null_dist) - else: - self.null_dist_ = np.array([x[0] for x in null_dist]) - self.posterior_null_ = np.array([x[1] for x in null_dist]) - - n_repeats = len(self.null_dist_) - pvalue = (1 + (self.null_dist_ < self.observe_stat_).sum()) / (1 + n_repeats) - return self.observe_stat_, pvalue diff --git a/treeple/stats/tests/meson.build b/treeple/stats/tests/meson.build index 1fe81ecfa..c4b1391c3 100644 --- a/treeple/stats/tests/meson.build +++ b/treeple/stats/tests/meson.build @@ -1,6 +1,7 @@ python_sources = [ '__init__.py', - 'test_forestht.py', + 'test_forest.py', + 'test_baseline.py', 'test_coleman.py', 'test_utils.py', 'test_permuteforest.py', diff --git a/treeple/stats/tests/test_baseline.py b/treeple/stats/tests/test_baseline.py new file mode 100644 index 000000000..84b8683ff --- /dev/null +++ b/treeple/stats/tests/test_baseline.py @@ -0,0 +1,104 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from treeple import HonestForestClassifier +from treeple.stats import ( + PermutationHonestForestClassifier, + build_cv_forest, + build_permutation_forest, +) + +seed = 12345 +rng = np.random.default_rng(seed) + + +@pytest.mark.parametrize("bootstrap, max_samples", [(True, 1.6), (False, None)]) +def test_build_cv_honest_forest(bootstrap, max_samples): + n_estimators = 100 + est = HonestForestClassifier( + n_estimators=n_estimators, + random_state=0, + bootstrap=bootstrap, + max_samples=max_samples, + honest_fraction=0.5, + stratify=True, + ) + X = rng.normal(0, 1, (100, 2)) + X[:50] *= -1 + y = np.array([0, 1] * 50) + samples = np.arange(len(y)) + + est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest( + est, + X, + y, + return_indices=True, + seed=seed, + cv=3, + ) + + assert isinstance(est_list, list) + assert isinstance(proba_list, list) + + for est, proba, train_idx, test_idx in zip(est_list, proba_list, train_idx_list, test_idx_list): + assert len(train_idx) + len(test_idx) == len(samples) + structure_samples = est.structure_indices_ + leaf_samples = est.honest_indices_ + + if not bootstrap: + oob_samples = [[] for _ in range(est.n_estimators)] + else: + oob_samples = est.oob_samples_ + + # compared to oob samples, now the train samples are comprised of the entire dataset + # seen over the entire forest. 
The test dataset is completely disjoint + for tree_idx in range(est.n_estimators): + n_samples_in_tree = len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) + assert n_samples_in_tree + len(oob_samples[tree_idx]) == len(train_idx), ( + f"For tree: " + f"{tree_idx} {len(structure_samples[tree_idx])} + " + f"{len(leaf_samples[tree_idx])} + {len(oob_samples[tree_idx])} " + f"!= {len(train_idx)} {len(test_idx)}" + ) + + +def test_build_permutation_forest(): + """Simple test for building a permutation forest.""" + n_estimators = 30 + n_samples = 100 + n_features = 3 + rng = np.random.default_rng(seed) + + _X = rng.uniform(size=(n_samples, n_features)) + _X = rng.uniform(size=(n_samples // 2, n_features)) + X2 = _X + 10 + X = np.vstack([_X, X2]) + y = np.vstack( + [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] + ) # Binary classification + + clf = HonestForestClassifier( + n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + ) + perm_clf = PermutationHonestForestClassifier( + n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True + ) + with pytest.raises( + RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" + ): + build_permutation_forest(clf, clf, X, y, seed=seed) + + forest_result, orig_forest_proba, perm_forest_proba = build_permutation_forest( + clf, perm_clf, X, y, metric="s@98", n_repeats=20, seed=seed + ) + assert forest_result.observe_test_stat > 0.1, f"{forest_result.observe_stat}" + assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" + assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) + + X = np.vstack([_X, _X]) + forest_result, _, _ = build_permutation_forest( + clf, perm_clf, X, y, metric="s@98", n_repeats=10, seed=seed + ) + assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" + assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}" diff --git a/treeple/stats/tests/test_forestht.py b/treeple/stats/tests/test_forest.py similarity index 76% rename from treeple/stats/tests/test_forestht.py rename to treeple/stats/tests/test_forest.py index 6504a440f..922b057e5 100644 --- a/treeple/stats/tests/test_forestht.py +++ b/treeple/stats/tests/test_forest.py @@ -10,13 +10,7 @@ import treeple.stats as stats import treeple.stats.utils as utils from treeple import HonestForestClassifier, RandomForestClassifier -from treeple.stats import ( - PermutationHonestForestClassifier, - build_coleman_forest, - build_cv_forest, - build_oob_forest, - build_permutation_forest, -) +from treeple.stats import PermutationHonestForestClassifier, build_coleman_forest, build_oob_forest from treeple.tree import MultiViewDecisionTreeClassifier # load the iris dataset (n_samples, 4) @@ -290,9 +284,13 @@ def test_build_coleman_forest(use_bottleneck: bool): ): stats.build_coleman_forest(clf, clf, X, y) - forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = ( - stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) - ) + ( + forest_result, + orig_forest_proba, + perm_forest_proba, + clf_fitted, + perm_clf_fitted, + ) = stats.build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1)) @@ -354,9 +352,13 @@ def 
test_build_coleman_forest_multiview(): ): build_coleman_forest(clf, clf, X, y) - forest_result, orig_forest_proba, perm_forest_proba, clf_fitted, perm_clf_fitted = ( - build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) - ) + ( + forest_result, + orig_forest_proba, + perm_forest_proba, + clf_fitted, + perm_clf_fitted, + ) = build_coleman_forest(clf, perm_clf, X, y, metric="s@98", n_repeats=1000, seed=seed) assert clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) assert perm_clf_fitted._n_samples_bootstrap == round(n_samples * 1.6) assert_array_equal(perm_clf_fitted.permutation_indices_.shape, (n_samples, 1)) @@ -372,47 +374,6 @@ def test_build_coleman_forest_multiview(): assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" -def test_build_permutation_forest(): - """Simple test for building a permutation forest.""" - n_estimators = 30 - n_samples = 100 - n_features = 3 - rng = np.random.default_rng(seed) - - _X = rng.uniform(size=(n_samples, n_features)) - _X = rng.uniform(size=(n_samples // 2, n_features)) - X2 = _X + 10 - X = np.vstack([_X, X2]) - y = np.vstack( - [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] - ) # Binary classification - - clf = HonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True - ) - perm_clf = PermutationHonestForestClassifier( - n_estimators=n_estimators, random_state=seed, n_jobs=-1, honest_fraction=0.5, bootstrap=True - ) - with pytest.raises( - RuntimeError, match="Permutation forest must be a PermutationHonestForestClassifier" - ): - build_permutation_forest(clf, clf, X, y, seed=seed) - - forest_result, orig_forest_proba, perm_forest_proba = build_permutation_forest( - clf, perm_clf, X, y, metric="s@98", n_repeats=20, seed=seed - ) - assert forest_result.observe_test_stat > 0.1, f"{forest_result.observe_stat}" - assert forest_result.pvalue <= 0.05, f"{forest_result.pvalue}" - assert_array_equal(orig_forest_proba.shape, perm_forest_proba.shape) - - X = np.vstack([_X, _X]) - forest_result, _, _ = build_permutation_forest( - clf, perm_clf, X, y, metric="s@98", n_repeats=10, seed=seed - ) - assert forest_result.pvalue > 0.05, f"{forest_result.pvalue}" - assert forest_result.observe_test_stat < 0.05, f"{forest_result.observe_test_stat}" - - def test_build_oob_honest_forest(): bootstrap = True max_samples = 1.6 @@ -469,53 +430,3 @@ def test_build_oob_random_forest(): assert len(np.unique(structure_samples[tree_idx])) + len(oob_samples_list[tree_idx]) == len( samples ), f"{tree_idx} {len(structure_samples[tree_idx])} + {len(oob_samples_list[tree_idx])} != {len(samples)}" - - -@pytest.mark.parametrize("bootstrap, max_samples", [(True, 1.6), (False, None)]) -def test_build_cv_honest_forest(bootstrap, max_samples): - n_estimators = 100 - est = HonestForestClassifier( - n_estimators=n_estimators, - random_state=0, - bootstrap=bootstrap, - max_samples=max_samples, - honest_fraction=0.5, - stratify=True, - ) - X = rng.normal(0, 1, (100, 2)) - X[:50] *= -1 - y = np.array([0, 1] * 50) - samples = np.arange(len(y)) - - est_list, proba_list, train_idx_list, test_idx_list = build_cv_forest( - est, - X, - y, - return_indices=True, - seed=seed, - cv=3, - ) - - assert isinstance(est_list, list) - assert isinstance(proba_list, list) - - for est, proba, train_idx, test_idx in zip(est_list, proba_list, train_idx_list, test_idx_list): - assert len(train_idx) + len(test_idx) == len(samples) - structure_samples = est.structure_indices_ - leaf_samples = 
est.honest_indices_ - - if not bootstrap: - oob_samples = [[] for _ in range(est.n_estimators)] - else: - oob_samples = est.oob_samples_ - - # compared to oob samples, now the train samples are comprised of the entire dataset - # seen over the entire forest. The test dataset is completely disjoint - for tree_idx in range(est.n_estimators): - n_samples_in_tree = len(structure_samples[tree_idx]) + len(leaf_samples[tree_idx]) - assert n_samples_in_tree + len(oob_samples[tree_idx]) == len(train_idx), ( - f"For tree: " - f"{tree_idx} {len(structure_samples[tree_idx])} + " - f"{len(leaf_samples[tree_idx])} + {len(oob_samples[tree_idx])} " - f"!= {len(train_idx)} {len(test_idx)}" - ) diff --git a/treeple/stats/tests/test_utils.py b/treeple/stats/tests/test_utils.py index 95698c968..d3b169084 100644 --- a/treeple/stats/tests/test_utils.py +++ b/treeple/stats/tests/test_utils.py @@ -40,7 +40,6 @@ def test_get_per_tree_oob_samples(bootstrap): @pytest.mark.parametrize("use_bottleneck", [True, False]) def test_non_nan_samples(use_bottleneck: bool): - if use_bottleneck and utils.DISABLE_BN_ENV_VAR in os.environ: del os.environ[utils.DISABLE_BN_ENV_VAR] importlib.reload(utils) diff --git a/treeple/stats/utils.py b/treeple/stats/utils.py index 57f67787c..f5302c007 100644 --- a/treeple/stats/utils.py +++ b/treeple/stats/utils.py @@ -15,9 +15,9 @@ roc_auc_score, roc_curve, ) -from sklearn.utils.validation import check_is_fitted, check_X_y +from sklearn.utils.validation import check_is_fitted -from treeple._lib.sklearn.ensemble._forest import BaseForest, ForestClassifier +from treeple._lib.sklearn.ensemble._forest import BaseForest BOTTLENECK_AVAILABLE = False if "bottleneck" in sys.modules: @@ -159,66 +159,6 @@ def _non_nan_samples(posterior_arr: ArrayLike) -> ArrayLike: return nonnan_indices -def _compute_null_distribution_perm( - X_train: ArrayLike, - y_train: ArrayLike, - X_test: ArrayLike, - y_test: ArrayLike, - covariate_index: ArrayLike, - est: ForestClassifier, - metric: str = "mse", - n_repeats: int = 1000, - seed: Optional[int] = None, - **metric_kwargs, -) -> ArrayLike: - """Compute null distribution using permutation method. - - Parameters - ---------- - X_test : ArrayLike of shape (n_samples, n_features) - The data matrix. - y_test : ArrayLike of shape (n_samples, n_outputs) - The output matrix. - covariate_index : ArrayLike of shape (n_covariates,) - The indices of the covariates to permute. - est : ForestClassifier - The forest that will be used to recompute the test statistic on the permuted data. - metric : str, optional - The metric, which to compute the null distribution of statistics, by default 'mse'. - n_repeats : int, optional - The number of times to sample the null, by default 1000. - seed : int, optional - Random seed, by default None. - """ - rng = np.random.default_rng(seed) - X_test, y_test = check_X_y(X_test, y_test, ensure_2d=True, multi_output=True) - # n_samples_test = len(y_test) - n_samples_train = len(y_train) - metric_func = METRIC_FUNCTIONS[metric] - - # pre-allocate memory for the index array - train_index_arr = np.arange(n_samples_train, dtype=int).reshape(-1, 1) - # test_index_arr = np.arange(n_samples_test, dtype=int).reshape(-1, 1) - - null_metrics = np.zeros((n_repeats,)) - - for idx in range(n_repeats): - rng.shuffle(train_index_arr) - perm_X_cov = X_train[train_index_arr, covariate_index] - X_train[:, covariate_index] = perm_X_cov - - # train a new forest on the permuted data - # XXX: should there be a train/test split here? even w/ honest forests? 
- est.fit(X_train, y_train.ravel()) - y_pred = est.predict(X_test) - - # compute two instances of the metric from the sampled trees - metric_val = metric_func(y_test, y_pred, **metric_kwargs) - - null_metrics[idx] = metric_val - return null_metrics - - def _compute_null_distribution_coleman( y_test: ArrayLike, y_pred_proba_normal: ArrayLike, diff --git a/treeple/tree/_marginal.pxd b/treeple/tree/_marginal.pxd index c0c119cea..3e61df65a 100644 --- a/treeple/tree/_marginal.pxd +++ b/treeple/tree/_marginal.pxd @@ -3,7 +3,7 @@ import numpy as np cimport numpy as cnp from .._lib.sklearn.tree._tree cimport BaseTree, Node -from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint32_t +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, uint32_t cpdef apply_marginal_tree( @@ -11,6 +11,6 @@ cpdef apply_marginal_tree( object X, const intp_t[:] marginal_indices, intp_t traversal_method, - unsigned char use_sample_weight, + uint8_t use_sample_weight, object random_state ) diff --git a/treeple/tree/_marginal.pyx b/treeple/tree/_marginal.pyx index 006c82869..6bc791f90 100644 --- a/treeple/tree/_marginal.pyx +++ b/treeple/tree/_marginal.pyx @@ -27,7 +27,7 @@ cpdef apply_marginal_tree( object X, const intp_t[:] marginal_indices, intp_t traversal_method, - unsigned char use_sample_weight, + uint8_t use_sample_weight, object random_state ): """Apply a dataset to a marginalized tree. @@ -44,7 +44,7 @@ cpdef apply_marginal_tree( traversal_method : intp_t The traversal method to use. 0 for 'random', 1 for 'weighted'. - use_sample_weight : unsigned char + use_sample_weight : uint8_t Whether or not to use the weighted number of samples in each node. random_state : object @@ -107,7 +107,7 @@ cdef inline cnp.ndarray _apply_dense_marginal( const float32_t[:, :] X, unordered_set[intp_t] marginal_indices_map, intp_t traversal_method, - unsigned char use_sample_weight, + uint8_t use_sample_weight, uint32_t* rand_r_state ): """Finds the terminal region (=leaf node) for each sample in X. @@ -128,7 +128,7 @@ cdef inline cnp.ndarray _apply_dense_marginal( traversal_method : intp_t The traversal method to use. 0 for 'random', 1 for 'weighted'. - use_sample_weight : unsigned char + use_sample_weight : uint8_t Whether or not to use the weighted number of samples in each node. rand_r_state : uint32_t diff --git a/treeple/tree/_oblique_splitter.pxd b/treeple/tree/_oblique_splitter.pxd index 9928f5f10..124a66dd6 100644 --- a/treeple/tree/_oblique_splitter.pxd +++ b/treeple/tree/_oblique_splitter.pxd @@ -15,7 +15,7 @@ from libcpp.vector cimport vector from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._splitter cimport SplitRecord, Splitter from .._lib.sklearn.tree._tree cimport ParentInfo -from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint32_t +from .._lib.sklearn.utils._typedefs cimport float32_t, float64_t, int8_t, intp_t, uint8_t, uint32_t from ._sklearn_splitter cimport sort @@ -29,7 +29,7 @@ cdef struct ObliqueSplitRecord: float64_t improvement # Impurity improvement given parent node. float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split. - unsigned char missing_go_to_left # Controls if missing values go to the left node. + uint8_t missing_go_to_left # Controls if missing values go to the left node. 
intp_t n_missing # Number of missing values for the feature being split on vector[float32_t]* proj_vec_weights # weights of the vector (max_features,) diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx index 2eea23666..ca77a30ac 100644 --- a/treeple/tree/_oblique_splitter.pyx +++ b/treeple/tree/_oblique_splitter.pyx @@ -206,7 +206,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) @@ -715,7 +715,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) @@ -879,7 +879,7 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) diff --git a/treeple/tree/manifold/_morf_splitter.pyx b/treeple/tree/manifold/_morf_splitter.pyx index 414062a5e..d6c8d0121 100644 --- a/treeple/tree/manifold/_morf_splitter.pyx +++ b/treeple/tree/manifold/_morf_splitter.pyx @@ -31,7 +31,7 @@ cdef class PatchSplitter(BestObliqueSplitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, ) except -1: BestObliqueSplitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) @@ -97,7 +97,7 @@ cdef class BaseDensePatchSplitter(PatchSplitter): object X, const float64_t[:, ::1] y, const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, + const uint8_t[::1] missing_values_in_feature_mask, # const int32_t[:] n_categories ) except -1: """Initialize the splitter