Add additional testing

Signed-off-by: Adam Li <[email protected]>
neurodata · Oct 3, 2023 · c716440 · c716440
1 parent aed9179
commit c716440
Show file tree

Hide file tree

Showing 7 changed files with 206 additions and 39 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -116,7 +116,116 @@ jobs:
           ./spin --help
           ./spin coverage --help
           ./spin test --help
-          ./spin coverage
+          ./spin test
+          cp $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/coverage.xml ./coverage.xml
+
+      - name: debug
+        run: |
+          ls $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/
+          echo "Okay..."
+          ls $PWD/build
+          ls ./
+
+      - name: Save build
+        uses: actions/upload-artifact@v3
+        with:
+          name: sktree-build
+          path: $PWD/build
+
+  build_and_test_slow:
+    name: Meson build ${{ matrix.os }} - py${{ matrix.python-version }}
+    timeout-minutes: 20
+    needs: [build_and_test]
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-22.04]
+        python-version: ["3.11"]
+        poetry-version: [1.5.0]
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      # to make sure coverage/test command builds cleanly
+      FORCE_SUBMODULE: True
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/[email protected]
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: "x64"
+          cache: "pip"
+          cache-dependency-path: "requirements.txt"
+
+      - name: show-gcc
+        run: |
+          gcc --version
+
+      - name: Install Ccache for MacOSX
+        if: ${{ matrix.os == 'macos-latest'}}
+        run: |
+          brew install ccache
+
+      - name: Install packages for Ubuntu
+        if: ${{ matrix.os == 'ubuntu-22.04'}}
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev
+
+      - name: Install Python packages
+        run: |
+          python -m pip install -r build_requirements.txt
+          python -m pip install spin
+          python -m pip install -r test_requirements.txt
+
+      - name: Prepare compiler cache
+        id: prep-ccache
+        shell: bash
+        run: |
+          mkdir -p "${CCACHE_DIR}"
+          echo "dir=$CCACHE_DIR" >> $GITHUB_OUTPUT
+          NOW=$(date -u +"%F-%T")
+          echo "timestamp=${NOW}" >> $GITHUB_OUTPUT
+
+      - name: Setup compiler cache
+        uses: actions/cache@v3
+        id: cache-ccachev1
+        # Reference: https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows#matching-a-cache-key
+        # NOTE: The caching strategy is modeled in a way that it will always have a unique cache key for each workflow run
+        # (even if the same workflow is run multiple times). The restore keys are not unique and for a partial match, they will
+        # return the most recently created cache entry, according to the GitHub Action Docs.
+        with:
+          path: ${{ steps.prep-ccache.outputs.dir }}
+          # Restores ccache from either a previous build on this branch or on main
+          key: ${{ github.workflow }}-${{ matrix.python-version }}-ccache-linux-${{ steps.prep-ccache.outputs.timestamp }}
+          # This evaluates to `Linux Tests-3.9-ccache-linux-` which is not unique. As the CI matrix is expanded, this will
+          # need to be updated to be unique so that the cache is not restored from a different job altogether.
+          restore-keys: |
+            ${{ github.workflow }}-${{ matrix.python-version }}-ccache-linux-
+
+      - name: Setup build and install scikit-tree
+        run: |
+          ./spin build -j 2 --forcesubmodule
+
+      - name: Ccache performance
+        shell: bash -l {0}
+        run: ccache -s
+
+      - name: build-path
+        run: |
+          echo "$PWD/build-install/"
+          export INSTALLED_PATH=$PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages
+
+      - name: Run unit tests and coverage
+        run: |
+          ./spin --help
+          ./spin coverage --help
+          ./spin test --help
+          ./spin coverage -k "slowtest"
           cp $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/coverage.xml ./coverage.xml
 
       - name: debug
@@ -127,7 +236,6 @@ jobs:
           ls ./
 
       - name: Upload coverage stats to codecov
-        if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10'}}
         uses: codecov/codecov-action@v3
         with:
           # python spin goes into the INSTALLED path in order to run pytest
@@ -146,7 +254,7 @@ jobs:
   release:
     name: Release
     runs-on: ubuntu-latest
-    needs: [build_and_test]
+    needs: [build_and_test_slow]
     if: startsWith(github.ref, 'refs/tags/')
     steps:
       - name: Checkout repository

diff --git a/.spin/cmds.py b/.spin/cmds.py
@@ -1,5 +1,4 @@
 import os
-import shutil
 import subprocess
 import sys
 
@@ -13,33 +12,27 @@ def get_git_revision_hash(submodule) -> str:
 
 
 @click.command()
-@click.option("--build-dir", default="build", help="Build directory; default is `$PWD/build`")
-@click.option("--clean", is_flag=True, help="Clean previously built docs before building")
-@click.option("--noplot", is_flag=True, help="Build docs without plots")
+@click.argument("slowtest", default=True)
 @click.pass_context
-def docs(ctx, build_dir, clean=False, noplot=False):
-    """📖 Build documentation"""
-    if clean:
-        doc_dir = "./docs/_build"
-        if os.path.isdir(doc_dir):
-            print(f"Removing `{doc_dir}`")
-            shutil.rmtree(doc_dir)
-
-    site_path = meson._get_site_packages()
-    if site_path is None:
-        print("No built scikit-tree found; run `./spin build` first.")
-        sys.exit(1)
-
-    util.run(["pip", "install", "-q", "-r", "doc_requirements.txt"])
-
-    ctx.invoke(meson.docs)
-
-
-@click.command()
-@click.pass_context
-def coverage(ctx):
+def coverage(ctx, slowtest):
     """📊 Generate coverage report"""
-    pytest_args = ("-o", "python_functions=test_*", "sktree", "--cov=sktree", "--cov-report=xml")
+    if slowtest:
+        pytest_args = (
+            "-o",
+            "python_functions=test_*",
+            "sktree",
+            "--cov=sktree",
+            "--cov-report=xml",
+            "-k slowtest",
+        )
+    else:
+        pytest_args = (
+            "-o",
+            "python_functions=test_*",
+            "sktree",
+            "--cov=sktree",
+            "--cov-report=xml",
+        )
     ctx.invoke(meson.test, pytest_args=pytest_args)
 
 

diff --git a/doc/conf.py b/doc/conf.py
@@ -240,6 +240,7 @@
     "predict",
     "fit",
     "apply",
+    "TreeBuilder",
 }
 
 # validation

diff --git a/pyproject.toml b/pyproject.toml
@@ -235,7 +235,7 @@ exclude = [
 
 [tool.pytest.ini_options]
 minversion = '6.0'
-addopts = '--durations 20 --junit-xml=junit-results.xml --verbose --ignore=sktree/_lib/'
+addopts = '--durations 20 --junit-xml=junit-results.xml --verbose --ignore=sktree/_lib/ -k "not slowtest"'
 filterwarnings = []
 
 [tool.coverage.run]
@@ -267,7 +267,9 @@ Environments = [
     'spin.cmds.meson.ipython',
     'spin.cmds.meson.python',
 ]
-Documentation = ['.spin/cmds.py:docs']
+Documentation = [
+    'spin.cmds.meson.docs'
+    ]
 Metrics = [
     '.spin/cmds.py:coverage',
     '.spin/cmds.py:asv',

diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
@@ -1,4 +1,4 @@
-from typing import Callable
+from typing import Callable, Tuple
 
 import numpy as np
 from numpy.typing import ArrayLike
@@ -28,6 +28,13 @@
 
 
 class BaseForestHT(MetaEstimatorMixin):
+    observe_samples_: ArrayLike
+    observe_posteriors_: ArrayLike
+    observe_stat_: float
+    permute_samples_: ArrayLike
+    permute_posteriors_: ArrayLike
+    permute_stat_: float
+
     def __init__(
         self,
         estimator=None,
@@ -129,6 +136,13 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike =
         if y.ndim != 2:
             y = y.reshape(-1, 1)
 
+        if covariate_index is not None:
+            if not isinstance(covariate_index, (list, tuple, np.ndarray)):
+                raise RuntimeError("covariate_index must be an iterable of integer indices")
+            else:
+                if not all(isinstance(idx, int) for idx in covariate_index):
+                    raise RuntimeError("Not all covariate_index are integer indices")
+
         if self._n_samples_ is not None and X.shape[0] != self._n_samples_:
             raise RuntimeError(
                 f"X must have {self._n_samples_} samples, got {X.shape[0]}. "
@@ -156,7 +170,7 @@ def statistic(
         return_posteriors: bool = False,
         check_input: bool = True,
         **metric_kwargs,
-    ):
+    ) -> Tuple[float, ArrayLike, ArrayLike]:
         """Compute the test statistic.
 
         Parameters
@@ -503,6 +517,20 @@ def _get_estimator(self):
             estimator_ = self.estimator
         return clone(estimator_)
 
+    def statistic(
+        self,
+        X: ArrayLike,
+        y: ArrayLike,
+        covariate_index: ArrayLike = None,
+        metric="mse",
+        return_posteriors: bool = False,
+        check_input: bool = True,
+        **metric_kwargs,
+    ):
+        return super().statistic(
+            X, y, covariate_index, metric, return_posteriors, check_input, **metric_kwargs
+        )
+
     def _statistic(
         self,
         estimator: ForestClassifier,

diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 from flaky import flaky
+from joblib import Parallel, delayed
 from scipy.special import expit
 from sklearn import datasets
 
@@ -58,6 +59,14 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
     with pytest.raises(RuntimeError, match="Metric must be"):
         est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")
 
+    # covariate index must be an iterable
+    with pytest.raises(RuntimeError, match="covariate_index must be an iterable"):
+        est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi")
+
+    # covariate index must be an iterable of ints
+    with pytest.raises(RuntimeError, match="Not all covariate_index"):
+        est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi")
+
 
 def test_featureimportance_forest_errors():
     permute_per_tree = False
@@ -257,6 +266,7 @@ def test_correlated_logit_model(hypotester, model_kwargs, n_samples, n_repeats,
 
 
 @flaky(max_runs=2)
+@pytest.mark.slowtest
 @pytest.mark.parametrize("criterion", ["gini", "entropy"])
 @pytest.mark.parametrize("honest_prior", ["empirical", "uniform"])
 @pytest.mark.parametrize(
@@ -377,3 +387,28 @@ def test_forestht_check_inputs(forest_hyppo):
     y_invalid = np.random.rand(X.shape[0])
     with pytest.raises(RuntimeError, match="y must have type"):
         forest_hyppo.statistic(X, y_invalid)
+
+
+def test_parallelization():
+    """Test parallelization of training forests."""
+    n_samples = 100
+    n_features = 5
+    X = rng.uniform(size=(n_samples, n_features))
+    y = rng.integers(0, 2, size=n_samples)  # Binary classification
+
+    def run_forest(covariate_index=None):
+        clf = FeatureImportanceForestClassifier(
+            estimator=HonestForestClassifier(
+                n_estimators=10,
+                random_state=seed,
+                n_jobs=1,
+            ),
+        )
+        obs_stat = clf.statistic(X, y, metric="mi")
+        perm_stat = clf.statistic(X, y, covariate_index=[covariate_index], metric="mi")
+        return obs_stat, perm_stat
+
+    out = Parallel(n_jobs=1)(
+        delayed(run_forest)(covariate_index) for covariate_index in range(n_features)
+    )
+    assert len(out) == n_features
diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py
@@ -839,7 +839,7 @@ def __init__(
         min_impurity_decrease=0.0,
         class_weight=None,
         feature_combinations=None,
-        ccp_alpha=None,
+        ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
     ):
@@ -1220,7 +1220,7 @@ def __init__(
         max_leaf_nodes=None,
         min_impurity_decrease=0.0,
         feature_combinations=None,
-        ccp_alpha=None,
+        ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
     ):
@@ -1628,7 +1628,7 @@ def __init__(
         data_dims=None,
         boundary=None,
         feature_weight=None,
-        ccp_alpha=None,
+        ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
     ):
@@ -2092,7 +2092,7 @@ def __init__(
         data_dims=None,
         boundary=None,
         feature_weight=None,
-        ccp_alpha=None,
+        ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
     ):
@@ -2582,7 +2582,7 @@ def __init__(
         min_impurity_decrease=0.0,
         class_weight=None,
         feature_combinations=None,
-        ccp_alpha=None,
+        ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
     ):
@@ -2973,7 +2973,7 @@ def __init__(
         max_leaf_nodes=None,
         min_impurity_decrease=0.0,
         feature_combinations=None,
-        ccp_alpha=None,
+        ccp_alpha=0.0,
         store_leaf_values=False,
         monotonic_cst=None,
     ):