diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a329c4362..12182c95f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -116,7 +116,116 @@ jobs: ./spin --help ./spin coverage --help ./spin test --help - ./spin coverage + ./spin test + cp $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/coverage.xml ./coverage.xml + + - name: debug + run: | + ls $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/ + echo "Okay..." + ls $PWD/build + ls ./ + + - name: Save build + uses: actions/upload-artifact@v3 + with: + name: sktree-build + path: $PWD/build + + build_and_test_slow: + name: Meson build ${{ matrix.os }} - py${{ matrix.python-version }} + timeout-minutes: 20 + needs: [build_and_test] + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04] + python-version: ["3.11"] + poetry-version: [1.5.0] + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + env: + # to make sure coverage/test command builds cleanly + FORCE_SUBMODULE: True + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v4.6.1 + with: + python-version: ${{ matrix.python-version }} + architecture: "x64" + cache: "pip" + cache-dependency-path: "requirements.txt" + + - name: show-gcc + run: | + gcc --version + + - name: Install Ccache for MacOSX + if: ${{ matrix.os == 'macos-latest'}} + run: | + brew install ccache + + - name: Install packages for Ubuntu + if: ${{ matrix.os == 'ubuntu-22.04'}} + run: | + sudo apt-get update + sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev + + - name: Install Python packages + run: | + python -m pip install -r build_requirements.txt + python -m pip install spin + python -m pip install -r test_requirements.txt + + - name: Prepare compiler cache + id: prep-ccache + shell: bash + run: | + mkdir -p "${CCACHE_DIR}" + echo "dir=$CCACHE_DIR" >> $GITHUB_OUTPUT + NOW=$(date -u +"%F-%T") + echo "timestamp=${NOW}" >> $GITHUB_OUTPUT + + - name: Setup compiler cache + uses: actions/cache@v3 + id: cache-ccachev1 + # Reference: https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows#matching-a-cache-key + # NOTE: The caching strategy is modeled in a way that it will always have a unique cache key for each workflow run + # (even if the same workflow is run multiple times). The restore keys are not unique and for a partial match, they will + # return the most recently created cache entry, according to the GitHub Action Docs. + with: + path: ${{ steps.prep-ccache.outputs.dir }} + # Restores ccache from either a previous build on this branch or on main + key: ${{ github.workflow }}-${{ matrix.python-version }}-ccache-linux-${{ steps.prep-ccache.outputs.timestamp }} + # This evaluates to `Linux Tests-3.9-ccache-linux-` which is not unique. As the CI matrix is expanded, this will + # need to be updated to be unique so that the cache is not restored from a different job altogether. + restore-keys: | + ${{ github.workflow }}-${{ matrix.python-version }}-ccache-linux- + + - name: Setup build and install scikit-tree + run: | + ./spin build -j 2 --forcesubmodule + + - name: Ccache performance + shell: bash -l {0} + run: ccache -s + + - name: build-path + run: | + echo "$PWD/build-install/" + export INSTALLED_PATH=$PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages + + - name: Run unit tests and coverage + run: | + ./spin --help + ./spin coverage --help + ./spin test --help + ./spin coverage -k "slowtest" cp $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/coverage.xml ./coverage.xml - name: debug @@ -127,7 +236,6 @@ jobs: ls ./ - name: Upload coverage stats to codecov - if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10'}} uses: codecov/codecov-action@v3 with: # python spin goes into the INSTALLED path in order to run pytest @@ -146,7 +254,7 @@ jobs: release: name: Release runs-on: ubuntu-latest - needs: [build_and_test] + needs: [build_and_test_slow] if: startsWith(github.ref, 'refs/tags/') steps: - name: Checkout repository diff --git a/.spin/cmds.py b/.spin/cmds.py index 5b8585dde..b7f2cd863 100644 --- a/.spin/cmds.py +++ b/.spin/cmds.py @@ -1,5 +1,4 @@ import os -import shutil import subprocess import sys @@ -13,33 +12,27 @@ def get_git_revision_hash(submodule) -> str: @click.command() -@click.option("--build-dir", default="build", help="Build directory; default is `$PWD/build`") -@click.option("--clean", is_flag=True, help="Clean previously built docs before building") -@click.option("--noplot", is_flag=True, help="Build docs without plots") +@click.argument("slowtest", default=True) @click.pass_context -def docs(ctx, build_dir, clean=False, noplot=False): - """📖 Build documentation""" - if clean: - doc_dir = "./docs/_build" - if os.path.isdir(doc_dir): - print(f"Removing `{doc_dir}`") - shutil.rmtree(doc_dir) - - site_path = meson._get_site_packages() - if site_path is None: - print("No built scikit-tree found; run `./spin build` first.") - sys.exit(1) - - util.run(["pip", "install", "-q", "-r", "doc_requirements.txt"]) - - ctx.invoke(meson.docs) - - -@click.command() -@click.pass_context -def coverage(ctx): +def coverage(ctx, slowtest): """📊 Generate coverage report""" - pytest_args = ("-o", "python_functions=test_*", "sktree", "--cov=sktree", "--cov-report=xml") + if slowtest: + pytest_args = ( + "-o", + "python_functions=test_*", + "sktree", + "--cov=sktree", + "--cov-report=xml", + "-k slowtest", + ) + else: + pytest_args = ( + "-o", + "python_functions=test_*", + "sktree", + "--cov=sktree", + "--cov-report=xml", + ) ctx.invoke(meson.test, pytest_args=pytest_args) diff --git a/doc/conf.py b/doc/conf.py index fa949235e..33b43e88b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -240,6 +240,7 @@ "predict", "fit", "apply", + "TreeBuilder", } # validation diff --git a/pyproject.toml b/pyproject.toml index d032d2c5d..544710691 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -235,7 +235,7 @@ exclude = [ [tool.pytest.ini_options] minversion = '6.0' -addopts = '--durations 20 --junit-xml=junit-results.xml --verbose --ignore=sktree/_lib/' +addopts = '--durations 20 --junit-xml=junit-results.xml --verbose --ignore=sktree/_lib/ -k "not slowtest"' filterwarnings = [] [tool.coverage.run] @@ -267,7 +267,9 @@ Environments = [ 'spin.cmds.meson.ipython', 'spin.cmds.meson.python', ] -Documentation = ['.spin/cmds.py:docs'] +Documentation = [ + 'spin.cmds.meson.docs' + ] Metrics = [ '.spin/cmds.py:coverage', '.spin/cmds.py:asv', diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 74a4350d3..5ce124174 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -1,4 +1,4 @@ -from typing import Callable +from typing import Callable, Tuple import numpy as np from numpy.typing import ArrayLike @@ -28,6 +28,13 @@ class BaseForestHT(MetaEstimatorMixin): + observe_samples_: ArrayLike + observe_posteriors_: ArrayLike + observe_stat_: float + permute_samples_: ArrayLike + permute_posteriors_: ArrayLike + permute_stat_: float + def __init__( self, estimator=None, @@ -129,6 +136,13 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike = if y.ndim != 2: y = y.reshape(-1, 1) + if covariate_index is not None: + if not isinstance(covariate_index, (list, tuple, np.ndarray)): + raise RuntimeError("covariate_index must be an iterable of integer indices") + else: + if not all(isinstance(idx, int) for idx in covariate_index): + raise RuntimeError("Not all covariate_index are integer indices") + if self._n_samples_ is not None and X.shape[0] != self._n_samples_: raise RuntimeError( f"X must have {self._n_samples_} samples, got {X.shape[0]}. " @@ -156,7 +170,7 @@ def statistic( return_posteriors: bool = False, check_input: bool = True, **metric_kwargs, - ): + ) -> Tuple[float, ArrayLike, ArrayLike]: """Compute the test statistic. Parameters @@ -503,6 +517,20 @@ def _get_estimator(self): estimator_ = self.estimator return clone(estimator_) + def statistic( + self, + X: ArrayLike, + y: ArrayLike, + covariate_index: ArrayLike = None, + metric="mse", + return_posteriors: bool = False, + check_input: bool = True, + **metric_kwargs, + ): + return super().statistic( + X, y, covariate_index, metric, return_posteriors, check_input, **metric_kwargs + ) + def _statistic( self, estimator: ForestClassifier, diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 05aaae129..925c62f1f 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -1,6 +1,7 @@ import numpy as np import pytest from flaky import flaky +from joblib import Parallel, delayed from scipy.special import expit from sklearn import datasets @@ -58,6 +59,14 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): with pytest.raises(RuntimeError, match="Metric must be"): est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") + # covariate index must be an iterable + with pytest.raises(RuntimeError, match="covariate_index must be an iterable"): + est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi") + + # covariate index must be an iterable of ints + with pytest.raises(RuntimeError, match="Not all covariate_index"): + est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") + def test_featureimportance_forest_errors(): permute_per_tree = False @@ -257,6 +266,7 @@ def test_correlated_logit_model(hypotester, model_kwargs, n_samples, n_repeats, @flaky(max_runs=2) +@pytest.mark.slowtest @pytest.mark.parametrize("criterion", ["gini", "entropy"]) @pytest.mark.parametrize("honest_prior", ["empirical", "uniform"]) @pytest.mark.parametrize( @@ -377,3 +387,28 @@ def test_forestht_check_inputs(forest_hyppo): y_invalid = np.random.rand(X.shape[0]) with pytest.raises(RuntimeError, match="y must have type"): forest_hyppo.statistic(X, y_invalid) + + +def test_parallelization(): + """Test parallelization of training forests.""" + n_samples = 100 + n_features = 5 + X = rng.uniform(size=(n_samples, n_features)) + y = rng.integers(0, 2, size=n_samples) # Binary classification + + def run_forest(covariate_index=None): + clf = FeatureImportanceForestClassifier( + estimator=HonestForestClassifier( + n_estimators=10, + random_state=seed, + n_jobs=1, + ), + ) + obs_stat = clf.statistic(X, y, metric="mi") + perm_stat = clf.statistic(X, y, covariate_index=[covariate_index], metric="mi") + return obs_stat, perm_stat + + out = Parallel(n_jobs=1)( + delayed(run_forest)(covariate_index) for covariate_index in range(n_features) + ) + assert len(out) == n_features diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py index 24c25afcf..a9e00c6d0 100644 --- a/sktree/tree/_classes.py +++ b/sktree/tree/_classes.py @@ -839,7 +839,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, feature_combinations=None, - ccp_alpha=None, + ccp_alpha=0.0, store_leaf_values=False, monotonic_cst=None, ): @@ -1220,7 +1220,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, feature_combinations=None, - ccp_alpha=None, + ccp_alpha=0.0, store_leaf_values=False, monotonic_cst=None, ): @@ -1628,7 +1628,7 @@ def __init__( data_dims=None, boundary=None, feature_weight=None, - ccp_alpha=None, + ccp_alpha=0.0, store_leaf_values=False, monotonic_cst=None, ): @@ -2092,7 +2092,7 @@ def __init__( data_dims=None, boundary=None, feature_weight=None, - ccp_alpha=None, + ccp_alpha=0.0, store_leaf_values=False, monotonic_cst=None, ): @@ -2582,7 +2582,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, feature_combinations=None, - ccp_alpha=None, + ccp_alpha=0.0, store_leaf_values=False, monotonic_cst=None, ): @@ -2973,7 +2973,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, feature_combinations=None, - ccp_alpha=None, + ccp_alpha=0.0, store_leaf_values=False, monotonic_cst=None, ):