Search Methods Enhancements to Avoid Duplicate Evaluated Pipelines πŸ₯ˆ #211

Open
wants to merge 9 commits into base: master
130 changes: 112 additions & 18 deletions gama/GamaClassifier.py
@@ -1,41 +1,135 @@
import inspect
from typing import Union, Optional
import logging

import numpy as np
import pandas as pd
from ConfigSpace import ForbiddenEqualsClause
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import ConfigSpace as cs

from .gama import Gama
from gama.configuration.classification import config_space as clf_config
from gama.data_loading import X_y_from_file
from gama.configuration.classification import clf_config
from gama.utilities.metrics import scoring_to_metric
from gama.utilities.metrics import scoring_to_metric, Metric
from .gama import Gama
from .utilities.config_space import get_estimator_by_name

# Avoid stopit from logging warnings every time a pipeline evaluation times out
logging.getLogger("stopit").setLevel(logging.ERROR)
log = logging.getLogger(__name__)


class GamaClassifier(Gama):
"""Gama with adaptations for (multi-class) classification."""

def __init__(self, search_space=None, scoring="neg_log_loss", *args, **kwargs):
def __init__(
self,
search_space: Optional[cs.ConfigurationSpace] = None,
scoring: Metric = "neg_log_loss", # type: ignore
*args,
**kwargs,
):
if not search_space:
# Do this to avoid the whole dictionary being included in the documentation.
search_space = clf_config

self._metrics = scoring_to_metric(scoring)

search_space = self._search_space_check(search_space)

self._label_encoder = None
super().__init__(
*args, search_space=search_space, scoring=scoring, **kwargs
) # type: ignore

def _search_space_check(
self,
search_space: cs.ConfigurationSpace,
) -> cs.ConfigurationSpace:
"""Check if the search space is valid for classification."""

# Check if the search space contains a classifier hyperparameter.
if (
"estimators" not in search_space.meta
or (
search_space.meta["estimators"]
not in search_space.get_hyperparameters_dict()
)
or not isinstance(
search_space.get_hyperparameter(search_space.meta["estimators"]),
cs.CategoricalHyperparameter,
)
):
raise ValueError(
"The search space must include a hyperparameter for the classifiers "
"that is a CategoricalHyperparameter with choices for all desired "
"classifiers. Please double-check the spelling of the name, and review "
"the `meta` object in the search space configuration located at "
"`configurations/classification.py`. The `meta` object should contain "
"a key `estimators` with a value that is the name of the hyperparameter"
" that contains the classifier choices."
)

# Check if the search space contains a preprocessor hyperparameter
# if it is specified in the meta.
if (
"preprocessors" in search_space.meta
and (
search_space.meta["preprocessors"]
not in search_space.get_hyperparameters_dict()
)
or "preprocessors" in search_space.meta
and not isinstance(
search_space.get_hyperparameter(search_space.meta["preprocessors"]),
cs.CategoricalHyperparameter,
)
):
raise ValueError(
"The search space must include a hyperparameter for the preprocessors "
"that is a CategoricalHyperparameter with choices for all desired "
"preprocessors. Please double-check the spelling of the name, and "
"review the `meta` object in the search space configuration located at "
"`configurations/classification.py`. The `meta` object should contain "
"a key `preprocessors` with a value that is the name of the "
"hyperparameter that contains the preprocessor choices. "
)

# Check if the search space contains only classifiers that have predict_proba
# if the scoring requires probabilities.
if any(metric.requires_probabilities for metric in self._metrics):
# we don't want classifiers that do not have `predict_proba`,
# because then we have to start doing one hot encodings of predictions etc.
search_space = {
alg: hp
for (alg, hp) in search_space.items()
if not (
inspect.isclass(alg)
and issubclass(alg, ClassifierMixin)
and not hasattr(alg(), "predict_proba")
)
}

self._label_encoder = None
super().__init__(*args, search_space=search_space, scoring=scoring, **kwargs)
no_proba_clfs = []
for classifier in search_space.get_hyperparameter(
search_space.meta["estimators"]
).choices:
estimator = get_estimator_by_name(classifier)
if (
estimator is not None
and issubclass(estimator, ClassifierMixin)
and not hasattr(estimator(), "predict_proba")
):
no_proba_clfs.append(classifier)

log.info(
f"The following classifiers do not have a predict_proba method "
f"and will be excluded from the search space: {no_proba_clfs}"
)
search_space.add_forbidden_clauses(
[
ForbiddenEqualsClause(
search_space.get_hyperparameter(
search_space.meta["estimators"]
),
classifier,
)
for classifier in no_proba_clfs
if classifier
]
)

return search_space

def _predict(self, x: pd.DataFrame):
"""Predict the target for input X.
@@ -52,8 +146,8 @@ def _predict(self, x: pd.DataFrame):
"""
y = self.model.predict(x) # type: ignore
# Decode the predicted labels - necessary only if ensemble is not used.
if y[0] not in list(self._label_encoder.classes_):
y = self._label_encoder.inverse_transform(y)
if y[0] not in list(self._label_encoder.classes_): # type: ignore
y = self._label_encoder.inverse_transform(y) # type: ignore
return y

def _predict_proba(self, x: pd.DataFrame):
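For reference, a minimal sketch (not part of this diff) of a classification search space that satisfies the new `_search_space_check`: `meta["estimators"]` must name a `CategoricalHyperparameter` that exists in the space. It assumes the ConfigSpace 0.x API used elsewhere in this PR and the package-level `GamaClassifier` export; the hyperparameter name and classifier choices are illustrative, and a real space (see `configurations/classification.py`) would also define per-estimator hyperparameters and conditions.

import ConfigSpace as cs
from gama import GamaClassifier  # assuming the package-level export

# meta["estimators"] must name a CategoricalHyperparameter that exists
# in the space and lists the classifier choices.
space = cs.ConfigurationSpace(meta={"estimators": "classifiers"})
space.add_hyperparameter(
    cs.CategoricalHyperparameter(
        "classifiers",
        choices=["GaussianNB", "RandomForestClassifier"],  # illustrative choices
    )
)

# _search_space_check runs inside the constructor and raises ValueError
# if the meta key or the named hyperparameter is missing or not categorical.
automl = GamaClassifier(search_space=space, scoring="neg_log_loss")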
59 changes: 58 additions & 1 deletion gama/GamaRegressor.py
@@ -1,7 +1,8 @@
import pandas as pd

from .gama import Gama
from gama.configuration.regression import reg_config
from gama.configuration.regression import config_space as reg_config
import ConfigSpace as cs


class GamaRegressor(Gama):
@@ -16,8 +17,64 @@ def __init__(

if not search_space:
search_space = reg_config

search_space = self._search_space_check(search_space)

super().__init__(*args, search_space=search_space, scoring=scoring, **kwargs)

def _search_space_check(
self, search_space: cs.ConfigurationSpace
) -> cs.ConfigurationSpace:
"""Check if the search space is valid for regression."""

# Check if the search space contains a regressor hyperparameter.
if (
"estimators" not in search_space.meta
or (
search_space.meta["estimators"]
not in search_space.get_hyperparameters_dict()
)
or not isinstance(
search_space.get_hyperparameter(search_space.meta["estimators"]),
cs.CategoricalHyperparameter,
)
):
raise ValueError(
"The search space must include a hyperparameter for the regressors "
"that is a CategoricalHyperparameter with choices for all desired "
"regressors. Please double-check the spelling of the name, and review "
"the `meta` object in the search space configuration located at "
"`configurations/regression.py`. The `meta` object should contain "
"a key `estimators` with a value that is the name of the hyperparameter"
" that contains the regressor choices."
)

# Check if the search space contains a preprocessor hyperparameter
# if it is specified in the meta.
if (
"preprocessors" in search_space.meta
and (
search_space.meta["preprocessors"]
not in search_space.get_hyperparameters_dict()
)
or "preprocessors" in search_space.meta
and not isinstance(
search_space.get_hyperparameter(search_space.meta["preprocessors"]),
cs.CategoricalHyperparameter,
)
):
raise ValueError(
"The search space must include a hyperparameter for the preprocessors "
"that is a CategoricalHyperparameter with choices for all desired "
"preprocessors. Please double-check the spelling of the name, and "
"review the `meta` object in the search space configuration located at "
"`configurations/regression.py`. The `meta` object should contain "
"a key `preprocessors` with a value that is the name of the "
"hyperparameter that contains the preprocessor choices. "
)

return search_space

def _predict(self, x: pd.DataFrame):
"""Predict the target for input X.

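The regressor check mirrors the classifier one and additionally validates the optional `preprocessors` entry. A minimal sketch under the same assumptions (not part of this diff; ConfigSpace 0.x API, illustrative hyperparameter and estimator names):

import ConfigSpace as cs

# Both meta keys must point at CategoricalHyperparameters that exist in
# the space; "preprocessors" is optional, but if it appears in meta it
# must also name a categorical hyperparameter.
space = cs.ConfigurationSpace(
    meta={"estimators": "regressors", "preprocessors": "preprocessors"}
)
space.add_hyperparameters(
    [
        cs.CategoricalHyperparameter(
            "regressors",
            choices=["LinearRegression", "RandomForestRegressor"],  # illustrative
        ),
        cs.CategoricalHyperparameter(
            "preprocessors",
            choices=["StandardScaler", "PCA"],  # illustrative
        ),
    ]
)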