Skip to content

Commit

Permalink
Add additional testing
Browse files Browse the repository at this point in the history
Signed-off-by: Adam Li <[email protected]>
  • Loading branch information
adam2392 committed Oct 3, 2023
1 parent aed9179 commit c716440
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 39 deletions.
114 changes: 111 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,116 @@ jobs:
./spin --help
./spin coverage --help
./spin test --help
./spin coverage
./spin test
cp $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/coverage.xml ./coverage.xml
- name: debug
run: |
ls $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/
echo "Okay..."
ls $PWD/build
ls ./
- name: Save build
  uses: actions/upload-artifact@v3
  with:
    name: sktree-build
    # Action inputs under `with:` are not run through a shell, so `$PWD` would be
    # taken literally and match nothing. A relative path is resolved against the
    # workspace, which is where the build directory is created by the steps above.
    path: build

build_and_test_slow:
name: Meson build ${{ matrix.os }} - py${{ matrix.python-version }}
timeout-minutes: 20
needs: [build_and_test]
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04]
python-version: ["3.11"]
poetry-version: [1.5.0]
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash
env:
# to make sure coverage/test command builds cleanly
FORCE_SUBMODULE: True
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup Python ${{ matrix.python-version }}
uses: actions/[email protected]
with:
python-version: ${{ matrix.python-version }}
architecture: "x64"
cache: "pip"
cache-dependency-path: "requirements.txt"

- name: show-gcc
run: |
gcc --version
- name: Install Ccache for MacOSX
if: ${{ matrix.os == 'macos-latest'}}
run: |
brew install ccache
- name: Install packages for Ubuntu
if: ${{ matrix.os == 'ubuntu-22.04'}}
run: |
sudo apt-get update
sudo apt-get install -y libopenblas-dev libatlas-base-dev liblapack-dev gfortran libgmp-dev libmpfr-dev libsuitesparse-dev ccache libmpc-dev
- name: Install Python packages
run: |
python -m pip install -r build_requirements.txt
python -m pip install spin
python -m pip install -r test_requirements.txt
- name: Prepare compiler cache
id: prep-ccache
shell: bash
run: |
mkdir -p "${CCACHE_DIR}"
echo "dir=$CCACHE_DIR" >> $GITHUB_OUTPUT
NOW=$(date -u +"%F-%T")
echo "timestamp=${NOW}" >> $GITHUB_OUTPUT
- name: Setup compiler cache
uses: actions/cache@v3
id: cache-ccachev1
# Reference: https://docs.github.com/en/actions/guides/caching-dependencies-to-speed-up-workflows#matching-a-cache-key
# NOTE: The caching strategy is modeled in a way that it will always have a unique cache key for each workflow run
# (even if the same workflow is run multiple times). The restore keys are not unique and for a partial match, they will
# return the most recently created cache entry, according to the GitHub Action Docs.
with:
path: ${{ steps.prep-ccache.outputs.dir }}
# Restores ccache from either a previous build on this branch or on main
key: ${{ github.workflow }}-${{ matrix.python-version }}-ccache-linux-${{ steps.prep-ccache.outputs.timestamp }}
# This evaluates to `Linux Tests-3.11-ccache-linux-` which is not unique. As the CI matrix is expanded, this will
# need to be updated to be unique so that the cache is not restored from a different job altogether.
restore-keys: |
${{ github.workflow }}-${{ matrix.python-version }}-ccache-linux-
- name: Setup build and install scikit-tree
run: |
./spin build -j 2 --forcesubmodule
- name: Ccache performance
shell: bash -l {0}
run: ccache -s

- name: build-path
  run: |
    echo "$PWD/build-install/"
    # `export` only affects this step's shell, which exits when the step ends.
    # Appending to GITHUB_ENV makes INSTALLED_PATH available to the following steps.
    echo "INSTALLED_PATH=$PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages" >> "$GITHUB_ENV"
- name: Run unit tests and coverage
run: |
./spin --help
./spin coverage --help
./spin test --help
./spin coverage -k "slowtest"
cp $PWD/build-install/usr/lib/python${{matrix.python-version}}/site-packages/coverage.xml ./coverage.xml
- name: debug
Expand All @@ -127,7 +236,6 @@ jobs:
ls ./
- name: Upload coverage stats to codecov
if: ${{ matrix.os == 'ubuntu-22.04' && matrix.python-version == '3.10'}}
uses: codecov/codecov-action@v3
with:
# python spin goes into the INSTALLED path in order to run pytest
Expand All @@ -146,7 +254,7 @@ jobs:
release:
name: Release
runs-on: ubuntu-latest
needs: [build_and_test]
needs: [build_and_test_slow]
if: startsWith(github.ref, 'refs/tags/')
steps:
- name: Checkout repository
Expand Down
45 changes: 19 additions & 26 deletions .spin/cmds.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import shutil
import subprocess
import sys

Expand All @@ -13,33 +12,27 @@ def get_git_revision_hash(submodule) -> str:


@click.command()
@click.option("--build-dir", default="build", help="Build directory; default is `$PWD/build`")
@click.option("--clean", is_flag=True, help="Clean previously built docs before building")
@click.option("--noplot", is_flag=True, help="Build docs without plots")
@click.argument("slowtest", default=True)
@click.pass_context
def docs(ctx, build_dir, clean=False, noplot=False):
"""📖 Build documentation"""
if clean:
doc_dir = "./docs/_build"
if os.path.isdir(doc_dir):
print(f"Removing `{doc_dir}`")
shutil.rmtree(doc_dir)

site_path = meson._get_site_packages()
if site_path is None:
print("No built scikit-tree found; run `./spin build` first.")
sys.exit(1)

util.run(["pip", "install", "-q", "-r", "doc_requirements.txt"])

ctx.invoke(meson.docs)


@click.command()
@click.pass_context
def coverage(ctx):
def coverage(ctx, slowtest):
    """📊 Generate coverage report"""
    # The docstring above is kept unchanged on purpose: it is runtime text
    # (click shows a command's docstring as its help), not just documentation.
    #
    # Arguments common to both modes: treat every ``test_*`` function as a test,
    # run the ``sktree`` package and write an XML coverage report for it.
    pytest_args = (
        "-o",
        "python_functions=test_*",
        "sktree",
        "--cov=sktree",
        "--cov-report=xml",
    )
    if slowtest:
        # Pass the flag and its expression as two separate argv entries.  The
        # previous single token "-k slowtest" only worked by accident of how the
        # option parser splits attached short-option values (leading space kept).
        pytest_args += ("-k", "slowtest")
    ctx.invoke(meson.test, pytest_args=pytest_args)


Expand Down
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@
"predict",
"fit",
"apply",
"TreeBuilder",
}

# validation
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ exclude = [

[tool.pytest.ini_options]
minversion = '6.0'
addopts = '--durations 20 --junit-xml=junit-results.xml --verbose --ignore=sktree/_lib/'
addopts = '--durations 20 --junit-xml=junit-results.xml --verbose --ignore=sktree/_lib/ -k "not slowtest"'
filterwarnings = []

[tool.coverage.run]
Expand Down Expand Up @@ -267,7 +267,9 @@ Environments = [
'spin.cmds.meson.ipython',
'spin.cmds.meson.python',
]
Documentation = ['.spin/cmds.py:docs']
Documentation = [
'spin.cmds.meson.docs'
]
Metrics = [
'.spin/cmds.py:coverage',
'.spin/cmds.py:asv',
Expand Down
32 changes: 30 additions & 2 deletions sktree/stats/forestht.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Callable
from typing import Callable, Tuple

import numpy as np
from numpy.typing import ArrayLike
Expand Down Expand Up @@ -28,6 +28,13 @@


class BaseForestHT(MetaEstimatorMixin):
observe_samples_: ArrayLike
observe_posteriors_: ArrayLike
observe_stat_: float
permute_samples_: ArrayLike
permute_posteriors_: ArrayLike
permute_stat_: float

def __init__(
self,
estimator=None,
Expand Down Expand Up @@ -129,6 +136,13 @@ def _check_input(self, X: ArrayLike, y: ArrayLike, covariate_index: ArrayLike =
if y.ndim != 2:
y = y.reshape(-1, 1)

if covariate_index is not None:
if not isinstance(covariate_index, (list, tuple, np.ndarray)):
raise RuntimeError("covariate_index must be an iterable of integer indices")
else:
if not all(isinstance(idx, int) for idx in covariate_index):
raise RuntimeError("Not all covariate_index are integer indices")

if self._n_samples_ is not None and X.shape[0] != self._n_samples_:
raise RuntimeError(
f"X must have {self._n_samples_} samples, got {X.shape[0]}. "
Expand Down Expand Up @@ -156,7 +170,7 @@ def statistic(
return_posteriors: bool = False,
check_input: bool = True,
**metric_kwargs,
):
) -> Tuple[float, ArrayLike, ArrayLike]:
"""Compute the test statistic.
Parameters
Expand Down Expand Up @@ -503,6 +517,20 @@ def _get_estimator(self):
estimator_ = self.estimator
return clone(estimator_)

def statistic(
    self,
    X: ArrayLike,
    y: ArrayLike,
    covariate_index: ArrayLike = None,
    metric="mse",
    return_posteriors: bool = False,
    check_input: bool = True,
    **metric_kwargs,
):
    """Compute the test statistic, with ``metric`` defaulting to ``"mse"``.

    Thin override that forwards all arguments, in the same order, to the
    parent class's ``statistic``. The only thing it changes is the default
    value of ``metric``; see the parent method for the meaning of each
    parameter and for the return value.
    """
    # Arguments are forwarded positionally, so their order here must stay in
    # sync with the parent signature.
    return super().statistic(
        X, y, covariate_index, metric, return_posteriors, check_input, **metric_kwargs
    )

def _statistic(
self,
estimator: ForestClassifier,
Expand Down
35 changes: 35 additions & 0 deletions sktree/stats/tests/test_forestht.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pytest
from flaky import flaky
from joblib import Parallel, delayed
from scipy.special import expit
from sklearn import datasets

Expand Down Expand Up @@ -58,6 +59,14 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
with pytest.raises(RuntimeError, match="Metric must be"):
est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")

# covariate index must be an iterable
with pytest.raises(RuntimeError, match="covariate_index must be an iterable"):
est.statistic(iris_X[:n_samples], iris_y[:n_samples], 0, metric="mi")

# covariate index must be an iterable of ints
with pytest.raises(RuntimeError, match="Not all covariate_index"):
est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi")


def test_featureimportance_forest_errors():
permute_per_tree = False
Expand Down Expand Up @@ -257,6 +266,7 @@ def test_correlated_logit_model(hypotester, model_kwargs, n_samples, n_repeats,


@flaky(max_runs=2)
@pytest.mark.slowtest
@pytest.mark.parametrize("criterion", ["gini", "entropy"])
@pytest.mark.parametrize("honest_prior", ["empirical", "uniform"])
@pytest.mark.parametrize(
Expand Down Expand Up @@ -377,3 +387,28 @@ def test_forestht_check_inputs(forest_hyppo):
y_invalid = np.random.rand(X.shape[0])
with pytest.raises(RuntimeError, match="y must have type"):
forest_hyppo.statistic(X, y_invalid)


def test_parallelization():
    """Test that forests can be trained from several joblib workers at once.

    One hypothesis-testing forest is built per feature, each in its own
    joblib task, and the test checks that every task returns a result.
    """
    n_samples = 100
    n_features = 5
    X = rng.uniform(size=(n_samples, n_features))
    y = rng.integers(0, 2, size=n_samples)  # Binary classification

    def run_forest(covariate_index):
        # Every task builds its own estimator so no fitted state is shared
        # between workers; the inner forest stays single-threaded (n_jobs=1)
        # so that the only parallelism under test is the outer joblib one.
        clf = FeatureImportanceForestClassifier(
            estimator=HonestForestClassifier(
                n_estimators=10,
                random_state=seed,
                n_jobs=1,
            ),
        )
        obs_stat = clf.statistic(X, y, metric="mi")
        perm_stat = clf.statistic(X, y, covariate_index=[covariate_index], metric="mi")
        return obs_stat, perm_stat

    # n_jobs=1 would run the tasks sequentially in-process and never exercise
    # parallel execution, so use more than one worker.
    out = Parallel(n_jobs=2)(
        delayed(run_forest)(covariate_index) for covariate_index in range(n_features)
    )
    assert len(out) == n_features
assert len(out) == n_features
12 changes: 6 additions & 6 deletions sktree/tree/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,7 +839,7 @@ def __init__(
min_impurity_decrease=0.0,
class_weight=None,
feature_combinations=None,
ccp_alpha=None,
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
):
Expand Down Expand Up @@ -1220,7 +1220,7 @@ def __init__(
max_leaf_nodes=None,
min_impurity_decrease=0.0,
feature_combinations=None,
ccp_alpha=None,
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
):
Expand Down Expand Up @@ -1628,7 +1628,7 @@ def __init__(
data_dims=None,
boundary=None,
feature_weight=None,
ccp_alpha=None,
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
):
Expand Down Expand Up @@ -2092,7 +2092,7 @@ def __init__(
data_dims=None,
boundary=None,
feature_weight=None,
ccp_alpha=None,
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
):
Expand Down Expand Up @@ -2582,7 +2582,7 @@ def __init__(
min_impurity_decrease=0.0,
class_weight=None,
feature_combinations=None,
ccp_alpha=None,
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
):
Expand Down Expand Up @@ -2973,7 +2973,7 @@ def __init__(
max_leaf_nodes=None,
min_impurity_decrease=0.0,
feature_combinations=None,
ccp_alpha=None,
ccp_alpha=0.0,
store_leaf_values=False,
monotonic_cst=None,
):
Expand Down

0 comments on commit c716440

Please sign in to comment.