From 3333b7a78716f7b2c8819f341e66854dd7929510 Mon Sep 17 00:00:00 2001
From: Provost Simon
Date: Mon, 4 Dec 2023 15:24:38 +0000
Subject: [PATCH 1/9] refactor(configuration): add ConfigSpace

+ Custom config space is now utilising ConfigSpace
+ Classification.py is now divided into distinct Classifiers and Preprocessors
  for better understanding and management
+ Thorough documentation for users to add/modify anything
---
 gama/configuration/classification.py         | 185 +++------
 .../classification_task/__init__.py          |   2 +
 .../classification_task/classifiers.py       | 351 ++++++++++++++++++
 .../classification_task/preprocessors.py     | 289 ++++++++++++++
 gama/utilities/metrics.py                    |   4 +-
 pyproject.toml                               |   1 +
 6 files changed, 687 insertions(+), 145 deletions(-)
 create mode 100644 gama/configuration/classification_task/__init__.py
 create mode 100644 gama/configuration/classification_task/classifiers.py
 create mode 100644 gama/configuration/classification_task/preprocessors.py

diff --git a/gama/configuration/classification.py b/gama/configuration/classification.py
index 7388cc59..25b92ae3 100644
--- a/gama/configuration/classification.py
+++ b/gama/configuration/classification.py
@@ -1,147 +1,46 @@
-# sourcery skip: de-morgan
-import numpy as np
+import ConfigSpace as cs
 
-from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import (
-    ExtraTreesClassifier,
-    RandomForestClassifier,
-    GradientBoostingClassifier,
-)
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import LinearSVC
-from sklearn.linear_model import LogisticRegression
-from sklearn.cluster import FeatureAgglomeration
-from sklearn.preprocessing import (
-    MaxAbsScaler,
-    MinMaxScaler,
-    Normalizer,
-    PolynomialFeatures,
-    RobustScaler,
-    StandardScaler,
-    Binarizer,
-)
-from sklearn.kernel_approximation import Nystroem, RBFSampler
-from sklearn.decomposition import PCA, FastICA
-from sklearn.feature_selection import (
-    SelectFwe,
-    SelectPercentile,
-    f_classif,
-    VarianceThreshold,
+from .classification_task import ClassifierConfig, PreprocessorConfig
+
+# Classifiers & Preprocessors 🚀
+
+# This script is your ticket to configuring a ConfigSpace object, teeming with
+# classifiers and preprocessors. We dive in with the ClassifierConfig and
+# PreprocessorConfig classes to fill the configuration space with a slew of
+# hyperparameters and options.
+
+# Customise Your Space 🔧
+
+# Want just classifiers? No biggie! Just comment out or remove the
+# PreprocessorConfig setup and its meta key-value pair, and voila! You are left
+# with a sleek, classifier-only configuration space (see the sketch at the end
+# of this header).
+
+# Want to add more classifiers or preprocessors? Easy! Just add them to the
+# ClassifierConfig or PreprocessorConfig classes, respectively. You can even
+# add your own custom classifiers or preprocessors. Just make sure they are
+# compatible with scikit-learn's API.
+
+# Meta-Parameters 📝
+
+# The meta-parameters are the "estimators" and "preprocessors" keys in the
+# configuration space. The internal system uses them to identify the
+# classifiers and preprocessors. They are not hyperparameters and should not
+# be changed, except by advanced users. If you do change them, make sure to
+# change the corresponding values in the current configuration space, i.e. in
+# ClassifierConfig and PreprocessorConfig.
+
+# 👩‍💻👨‍💻 Happy configuring, and may your machine learning models shine!
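+
+# For illustration, the classifier-only variant mentioned above would reduce
+# the body of this file to roughly the following sketch (the meta dict then
+# carries only the "estimators" key):
+#
+#     config_space = cs.ConfigurationSpace(meta={"estimators": "classifiers"})
+#     classifier_config = ClassifierConfig(config_space)
+#     classifier_config.setup_classifiers()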
+ +config_space = cs.ConfigurationSpace( + meta={ + # "gama_system_name": "current_configuration_name", + "estimators": "classifiers", + "preprocessors": "preprocessors", + } ) -# For comparison, this selection of operators and hyperparameters is -# currently most of what TPOT supports. +classifier_config = ClassifierConfig(config_space) +classifier_config.setup_classifiers() -clf_config = { - "alpha": [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0], - "fit_prior": [True, False], - "min_samples_split": range(2, 21), - "min_samples_leaf": range(1, 21), - # Classifiers - GaussianNB: {}, - BernoulliNB: {"alpha": [], "fit_prior": []}, - MultinomialNB: {"alpha": [], "fit_prior": []}, - DecisionTreeClassifier: { - "criterion": ["gini", "entropy"], - "max_depth": range(1, 11), - "min_samples_split": [], - "min_samples_leaf": [], - }, - ExtraTreesClassifier: { - "n_estimators": [100], - "criterion": ["gini", "entropy"], - "max_features": np.arange(0.05, 1.01, 0.05), - "min_samples_split": [], - "min_samples_leaf": [], - "bootstrap": [True, False], - }, - RandomForestClassifier: { - "n_estimators": [100], - "criterion": ["gini", "entropy"], - "max_features": np.arange(0.05, 1.01, 0.05), - "min_samples_split": range(2, 21), - "min_samples_leaf": range(1, 21), - "bootstrap": [True, False], - }, - GradientBoostingClassifier: { - "n_estimators": [100], - "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0], - "max_depth": range(1, 11), - "min_samples_split": range(2, 21), - "min_samples_leaf": range(1, 21), - "subsample": np.arange(0.05, 1.01, 0.05), - "max_features": np.arange(0.05, 1.01, 0.05), - }, - KNeighborsClassifier: { - "n_neighbors": range(1, 51), - "weights": ["uniform", "distance"], - "p": [1, 2], - }, - LinearSVC: { - "penalty": ["l1", "l2"], - "loss": ["hinge", "squared_hinge"], - "dual": [False, True], - "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], - "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0], - "param_check": [ - lambda params: (not params["dual"] or params["penalty"] == "l2") - and not (params["penalty"] == "l1" and params["loss"] == "hinge") - and not ( - params["penalty"] == "l2" - and params["loss"] == "hinge" - and not params["dual"] - ) - ], - }, - LogisticRegression: { - "penalty": ["l2"], - "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0], - "dual": [False, True], - "solver": ["lbfgs"], - }, - Binarizer: {"threshold": np.arange(0.0, 1.01, 0.05)}, - FastICA: { - "tol": np.arange(0.0, 1.01, 0.05), - "whiten": ["unit-variance"], - }, - FeatureAgglomeration: { - "linkage": ["ward", "complete", "average"], - "affinity": ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"], - "param_check": [ - lambda params: params["linkage"] != "ward" - or params["affinity"] == "euclidean" - ], - }, - MaxAbsScaler: {}, - MinMaxScaler: {}, - Normalizer: {"norm": ["l1", "l2", "max"]}, - Nystroem: { - "kernel": [ - "rbf", - "cosine", - "chi2", - "laplacian", - "polynomial", - "poly", - "linear", - "additive_chi2", - "sigmoid", - ], - "gamma": np.arange(0.0, 1.01, 0.05), - "n_components": range(1, 11), - }, - PCA: {"svd_solver": ["randomized"], "iterated_power": range(1, 11)}, - PolynomialFeatures: { - "degree": [2], - "include_bias": [False], - "interaction_only": [False], - }, - RBFSampler: {"gamma": np.arange(0.0, 1.01, 0.05)}, - RobustScaler: {}, - StandardScaler: {}, - # Selectors - SelectFwe: {"alpha": np.arange(0, 0.05, 0.001), "score_func": {f_classif: None}}, - SelectPercentile: {"percentile": range(1, 100), "score_func": {f_classif: None}}, - VarianceThreshold: 
{"threshold": np.arange(0.05, 1.01, 0.05)}, -} +preprocessor_config = PreprocessorConfig(config_space) +preprocessor_config.setup_preprocessors() diff --git a/gama/configuration/classification_task/__init__.py b/gama/configuration/classification_task/__init__.py new file mode 100644 index 00000000..fdf52686 --- /dev/null +++ b/gama/configuration/classification_task/__init__.py @@ -0,0 +1,2 @@ +from .classifiers import ClassifierConfig +from .preprocessors import PreprocessorConfig diff --git a/gama/configuration/classification_task/classifiers.py b/gama/configuration/classification_task/classifiers.py new file mode 100644 index 00000000..3f187d16 --- /dev/null +++ b/gama/configuration/classification_task/classifiers.py @@ -0,0 +1,351 @@ +import ConfigSpace as cs +import ConfigSpace.hyperparameters as csh + + +class ClassifierConfig: + """Manages the configuration space for classifiers in supervised learning contexts + + ClassifierConfig oversees the configuration space of classifiers utilised for a + supervised machine learning tasks. This class facilitates the addition of + new classifiers and the modification of existing ones in the configuration space + via standardised methods. The ConfigSpace library is utilised to designate the + configuration space, which enables the creation of intricate and adaptable + configuration setups. For additional information on utilising constraints and + various types of hyperparameters, with ConfigSpace, we refer the reader to + ConfigSpace documentation, available at: + https://automl.github.io/ConfigSpace/main/quickstart.html + + Add a classifier 💡 + ---------------- + + 1️⃣ To add a new classifier, define its setup method following the naming convention + `setup_classifierName`. This method should: + * Define hyperparameters specific to the classifier. + * Use `_add_hyperparameters_and_equals_conditions` to add these + hyperparameters to the config space with appropriate conditions. + + 2️⃣ Next, your setup method need to be added to the `classifiers_setup_map` in + the `__init__` method, where the key should be the Sci-kit learn name of your + classifier, and the value should be pointing to your newly setup method. + + voila! 🎉 You are done! Your classifier is now added to the config space. + + How to use the shared hyperparameters 🪢 + ------------------------------------- + + The shared hyperparameters are hyperparameters that are shared across multiple + classifiers. These hyperparameters are defined in the `shared_hyperparameters` + property. To use these hyperparameters, simply add them to the setup method of + the classifier you are adding. For example, to add the `C` hyperparameter to the + `LogisticRegression` classifier, add the following line to the + `setup_logistic_regression` method: + + >>> C = csh.CategoricalHyperparameter( + >>> "C__LogisticRegression", self.shared_hyperparameters["C"] + >>> ) + + voila! 🎉 The `C` hyperparameter is now added to the LogisticRegression classifier + with the shared value available in the `shared_hyperparameters` property. + + How to name my hyperparameters ✍️ + ------------------------------ + + The hyperparameters you add to the config space should be named in the following + format if similar hyperparameters names can be found in other classifiers: + + >>> __ + + For example, the `C` hyperparameter for the `LogisticRegression` classifier should + be named `C__LogisticRegression` given that the `C` hyperparameter is also + available in the `LinearSVC` classifier. 
This naming convention is used to ensure + that the hyperparameters are added to the correct classifier in the config space. + + If your hyperparameter name is unique to your classifier, you can name it as you + please without the need to have `__` at the end of the name. + Nonetheless, following the naming convention would in any way not cause any issues. + + Modify an existing classifier 💅 + ------------------- + + To modify an existing classifier, adjust its respective setup method and the + shared hyperparameters property as needed by modifying the values of the + hyperparameters. For example, to change the value of the `C` hyperparameter for + the `LogisticRegression` classifier, change the value of the `C` hyperparameter + in the `shared_hyperparameters` property by: + + >>> "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0], + + The `C` hyperparameter will then be added to the config space with the appropriate + value. However, be cautious, if you change values in the shared hyperparameters + property, it will be changed for all classifiers that use that hyperparameter. + If you want this change to only apply to a specific classifier, you should add + the hyperparameter to the setup method of that classifier. E.g. if you want to + change the value of the `C` hyperparameter for the `LogisticRegression` classifier, + and only want this change to apply to the `LogisticRegression` classifier, add + the following line to the `setup_logistic_regression` method: + + >>> C = csh.CategoricalHyperparameter( + >>> "C__LogisticRegression", + >>> [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0] + >>> ) + + The `C` hyperparameter will be added as-is for the `LogisticRegression` classifier + and the value of the `C` hyperparameter for other classifiers will be as available + in the `shared_hyperparameters` property – iff they use the `C` + hyperparameter of the `shared_hyperparameters` property. + + Parameters + ---------- + config_space : cs.ConfigurationSpace + The ConfigSpace object that defines the hyperparameters and their ranges for + the classifiers. 
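+
+    Example 🧪
+    ----------
+
+    A minimal usage sketch (the estimator key name is free to choose, as long as
+    the `meta` entry and the hyperparameter name agree):
+
+    >>> space = cs.ConfigurationSpace(meta={"estimators": "classifiers"})
+    >>> ClassifierConfig(space).setup_classifiers()
+    >>> sample = space.sample_configuration()
+
+    Each sample then holds one classifier choice plus the hyperparameters that
+    are conditionally active for it.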
+
+    """
+
+    def __init__(
+        self,
+        config_space: cs.ConfigurationSpace,
+    ):
+        if "estimators" not in config_space.meta:
+            raise ValueError("Expected 'estimators' key in meta of config_space")
+        self.config_space = config_space
+        self.classifiers_setup_map = {
+            "BernoulliNB": self.setup_bernoulliNB,
+            "MultinomialNB": self.setup_multinomialNB,
+            "GaussianNB": self.setup_gaussianNB,
+            "DecisionTreeClassifier": self.setup_decision_tree,
+            "ExtraTreesClassifier": self.setup_extra_trees,
+            "RandomForestClassifier": self.setup_random_forest,
+            "GradientBoostingClassifier": self.setup_gradient_boosting,
+            "KNeighborsClassifier": self.setup_k_neighbors,
+            "LinearSVC": self.setup_linear_svc,
+            "LogisticRegression": self.setup_logistic_regression,
+        }
+        self.cs_estimators_name = self.config_space.meta["estimators"]
+
+    @property
+    def shared_hyperparameters(self):
+        return {
+            "alpha": [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0],
+            "fit_prior": [True, False],
+            "criterion": ["gini", "entropy"],
+            "max_depth": {"lower": 1, "upper": 11},
+            "min_samples_split": {"lower": 2, "upper": 21},
+            "min_samples_leaf": {"lower": 1, "upper": 21},
+            # scikit-learn requires a fractional max_features to be at most 1.0.
+            "max_features": {"lower": 0.05, "upper": 1.0, "default_value": 1.0},
+            "n_estimators": {"lower": 10, "upper": 100},
+            "bootstrap": [True, False],
+            "dual": [True, False],
+            "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
+        }
+
+    def setup_classifiers(self):
+        classifiers_choices = list(self.classifiers_setup_map.keys())
+
+        if not classifiers_choices:
+            raise ValueError("No classifiers to add to config space")
+
+        classifiers = csh.CategoricalHyperparameter(
+            name=self.cs_estimators_name,
+            choices=classifiers_choices,
+        )
+        self.config_space.add_hyperparameter(classifiers)
+
+        for classifier_name in classifiers_choices:
+            if setup_func := self.classifiers_setup_map.get(classifier_name):
+                setup_func(classifiers)
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, estimator_name: str
+    ):
+        if "classifiers" not in local_vars or not isinstance(
+            local_vars["classifiers"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'classifiers' key with a CategoricalHyperparameter in "
+                "local vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["classifiers"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(
+                hyperparameter, local_vars["classifiers"], estimator_name
+            )
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_bernoulliNB(self, classifiers: csh.CategoricalHyperparameter):
+        alpha_NB = csh.CategoricalHyperparameter(
+            "alpha__BernoulliNB", self.shared_hyperparameters["alpha"]
+        )
+        fit_prior = csh.CategoricalHyperparameter(
+            "fit_prior__BernoulliNB", self.shared_hyperparameters["fit_prior"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "BernoulliNB")
+
+    def setup_multinomialNB(self, classifiers: csh.CategoricalHyperparameter):
+        alpha_NB = csh.CategoricalHyperparameter(
+            "alpha__MultinomialNB", self.shared_hyperparameters["alpha"]
+        )
+        fit_prior = csh.CategoricalHyperparameter(
+            "fit_prior__MultinomialNB", self.shared_hyperparameters["fit_prior"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "MultinomialNB")
+
+    def setup_gaussianNB(self, classifiers: csh.CategoricalHyperparameter):
+        # GaussianNB has no hyperparameters
+        pass
+
+    def setup_decision_tree(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__DecisionTreeClassifier",
+            self.shared_hyperparameters["criterion"],
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__DecisionTreeClassifier",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__DecisionTreeClassifier",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__DecisionTreeClassifier",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "DecisionTreeClassifier"
+        )
+
+    def setup_extra_trees(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__ExtraTreesClassifier", self.shared_hyperparameters["criterion"]
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__ExtraTreesClassifier",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__ExtraTreesClassifier",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__ExtraTreesClassifier",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__ExtraTreesClassifier",
+            **self.shared_hyperparameters["max_features"],
+        )
+        n_estimators = csh.UniformIntegerHyperparameter(
+            "n_estimators__ExtraTreesClassifier",
+            **self.shared_hyperparameters["n_estimators"],
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap__ExtraTreesClassifier", self.shared_hyperparameters["bootstrap"]
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "ExtraTreesClassifier"
+        )
+
+    def setup_random_forest(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__RandomForestClassifier",
+            self.shared_hyperparameters["criterion"],
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__RandomForestClassifier",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        # Follow the documented `<name>__<classifier>` convention here too, since
+        # these hyperparameter names also occur for the other tree-based classifiers.
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__RandomForestClassifier",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__RandomForestClassifier",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__RandomForestClassifier",
+            **self.shared_hyperparameters["max_features"],
+        )
+        n_estimators = csh.UniformIntegerHyperparameter(
+            "n_estimators__RandomForestClassifier",
+            **self.shared_hyperparameters["n_estimators"],
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap__RandomForestClassifier",
+            self.shared_hyperparameters["bootstrap"],
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "RandomForestClassifier"
+        )
+
+    def setup_gradient_boosting(self, classifiers: csh.CategoricalHyperparameter):
+        # GradientBoostingClassifier requires subsample to lie in (0.0, 1.0].
+        subsample = csh.UniformFloatHyperparameter(
+            "subsample", 0.05, 1.0, default_value=1.0
+        )
+        learning_rate = csh.CategoricalHyperparameter(
+            "learning_rate", [1e-3, 1e-2, 1e-1, 0.5, 1.0]
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__GradientBoostingClassifier",
+            **self.shared_hyperparameters["max_features"],
+        )
+        n_estimators = csh.UniformIntegerHyperparameter(
+            "n_estimators__GradientBoostingClassifier",
**self.shared_hyperparameters["n_estimators"], + ) + max_depth = csh.UniformIntegerHyperparameter( + "max_depth__GradientBoostingClassifier", + **self.shared_hyperparameters["max_depth"], + ) + self._add_hyperparameters_and_equals_conditions( + locals(), "GradientBoostingClassifier" + ) + + def setup_k_neighbors(self, classifiers: csh.CategoricalHyperparameter): + n_neighbors = csh.UniformIntegerHyperparameter("n_neighbors", 1, 51) + weights = csh.CategoricalHyperparameter("weights", ["uniform", "distance"]) + p = csh.UniformIntegerHyperparameter("p", 1, 2) + self._add_hyperparameters_and_equals_conditions( + locals(), "KNeighborsClassifier" + ) + + def setup_linear_svc(self, classifiers: csh.CategoricalHyperparameter): + loss = csh.CategoricalHyperparameter( + "loss__LinearSVC", ["hinge", "squared_hinge"] + ) + penalty = csh.CategoricalHyperparameter("penalty__LinearSVC", ["l1", "l2"]) + dual = csh.CategoricalHyperparameter( + "dual__LinearSVC", self.shared_hyperparameters["dual"] + ) + tol = csh.CategoricalHyperparameter( + "tol__LinearSVC", [1e-5, 1e-4, 1e-3, 1e-2, 1e-1] + ) + C = csh.CategoricalHyperparameter( + "C__LinearSVC", self.shared_hyperparameters["C"] + ) + self._add_hyperparameters_and_equals_conditions(locals(), "LinearSVC") + + # Forbidden clause: Penalty 'l1' cannot be used with loss 'hinge' + forbidden_penalty_loss = cs.ForbiddenAndConjunction( + cs.ForbiddenEqualsClause(self.config_space["penalty__LinearSVC"], "l1"), + cs.ForbiddenEqualsClause(self.config_space["loss__LinearSVC"], "hinge"), + ) + self.config_space.add_forbidden_clause(forbidden_penalty_loss) + + def setup_logistic_regression(self, classifiers: csh.CategoricalHyperparameter): + solver = csh.CategoricalHyperparameter("solver", ["lbfgs"]) + penalty = csh.CategoricalHyperparameter("penalty__LogisticRegression", ["l2"]) + C = csh.CategoricalHyperparameter( + "C__LogisticRegression", self.shared_hyperparameters["C"] + ) + dual = csh.CategoricalHyperparameter( + "dual__LogisticRegression", self.shared_hyperparameters["dual"] + ) + self._add_hyperparameters_and_equals_conditions(locals(), "LogisticRegression") diff --git a/gama/configuration/classification_task/preprocessors.py b/gama/configuration/classification_task/preprocessors.py new file mode 100644 index 00000000..728f564a --- /dev/null +++ b/gama/configuration/classification_task/preprocessors.py @@ -0,0 +1,289 @@ +import ConfigSpace as cs +import ConfigSpace.hyperparameters as csh + + +class PreprocessorConfig: + """Manages the configuration space for preprocessors in supervised learning contexts + + PreprocessorConfig oversees the configuration space of preprocessors used in + supervised machine learning tasks. This class facilitates the addition of + new preprocessors and the modification of existing ones in the configuration space + via standardised methods. The ConfigSpace library is used to designate the + configuration space, enabling the creation of complex and adaptable + configuration setups. For additional information on using constraints and + various types of hyperparameters with ConfigSpace, refer to + the ConfigSpace documentation, available at: + https://automl.github.io/ConfigSpace/main/quickstart.html + + Add a preprocessor 💡 + ---------------- + + 1️⃣ To add a new preprocessor, define its setup method following the naming + convention `setup_preprocessorName`. This method should: + * Define hyperparameters specific to the preprocessor. 
+        * Use `_add_hyperparameters_and_equals_conditions` to add these
+            hyperparameters to the config space with appropriate conditions.
+
+    2️⃣ Next, your setup method needs to be added to the `preprocessors_setup_map` in
+    the `__init__` method, where the key should be the scikit-learn name of your
+    preprocessor, and the value should point to your new setup method.
+
+    voila! 🎉 You are done! Your preprocessor is now added to the config space.
+
+    How to use the shared hyperparameters 🪢
+    ----------------------------------------
+
+    The shared hyperparameters are hyperparameters that are shared across multiple
+    preprocessors. These hyperparameters are defined in the `shared_hyperparameters`
+    property. To use these hyperparameters, simply add them to the setup method of
+    the preprocessor you are adding. For example, to add the `gamma` hyperparameter
+    to the `Nystroem` preprocessor, add the following line to the `setup_nystroem`
+    method:
+
+    >>> gamma = csh.UniformFloatHyperparameter(
+    ...     "gamma__Nystroem", **self.shared_hyperparameters["gamma"]
+    ... )
+
+    voila! 🎉 The `gamma` hyperparameter is now added to the Nystroem preprocessor
+    with the shared value available in the `shared_hyperparameters` property.
+
+    How to name my hyperparameters ✍️
+    ---------------------------------
+
+    The hyperparameters you add to the config space should be named in the following
+    format if similar hyperparameter names can be found in other preprocessors:
+
+    >>> <hyperparameter name>__<preprocessor name>
+
+    For example, the `gamma` hyperparameter for the `Nystroem` preprocessor should
+    be named `gamma__Nystroem` given that the `gamma` hyperparameter is also
+    available in the `RBFSampler` preprocessor. This naming convention ensures that
+    the hyperparameters are attached to the correct preprocessor in the config space.
+
+    If your hyperparameter name is unique to your preprocessor, you can name it as
+    you please without the `__<preprocessor name>` suffix at the end of the name.
+    Nonetheless, following the naming convention causes no issues either way.
+
+    Modify an existing preprocessor 💅
+    ----------------------------------
+
+    To modify an existing preprocessor, adjust its setup method and the
+    shared hyperparameters property as needed by modifying the values of the
+    hyperparameters. For example, to change the value of the `gamma` hyperparameter
+    for the `Nystroem` preprocessor, change the value of the `gamma` hyperparameter
+    in the `shared_hyperparameters` property:
+
+    >>> "gamma": {"lower": 0.001, "upper": 0.8, "default_value": 0.5},
+
+    The `gamma` hyperparameter will then be added to the config space with the
+    appropriate value. However, be cautious: if you change values in the shared
+    hyperparameters property, the change applies to all preprocessors that use that
+    hyperparameter. If you want a change to apply to one specific preprocessor only,
+    add the hyperparameter to the setup method of that preprocessor.
+    E.g. to change the value of the `gamma` hyperparameter for the `Nystroem`
+    preprocessor alone, add the following line to the `setup_nystroem` method:
+
+    >>> gamma = csh.UniformFloatHyperparameter(
+    ...     "gamma__Nystroem", **{"lower": 0.001, "upper": 0.8, "default_value": 0.5}
+    ... )
+
+    The `gamma` hyperparameter will be added as-is for the `Nystroem` preprocessor,
+    while other preprocessors keep the value available in the
+    `shared_hyperparameters` property – iff they use the `gamma` hyperparameter of
+    the `shared_hyperparameters` property.
+
+
+    Parameters
+    ----------
+    config_space : cs.ConfigurationSpace
+        The ConfigSpace object that will be used to add the preprocessors and their
+        respective hyperparameters.
+
+    """
+
+    def __init__(
+        self,
+        config_space: cs.ConfigurationSpace,
+    ):
+        if "preprocessors" not in config_space.meta:
+            raise ValueError("Expected 'preprocessors' key in meta of config_space")
+        self.config_space = config_space
+        self.preprocessors_setup_map = {
+            "SelectFwe": self.setup_select_fwe,
+            "Binarizer": self.setup_binarizer,
+            "FastICA": self.setup_fast_ica,
+            "FeatureAgglomeration": self.setup_feature_agglomeration,
+            "MaxAbsScaler": self.setup_max_abs_scaler,
+            "MinMaxScaler": self.setup_min_max_scaler,
+            "Normalizer": self.setup_normalizer,
+            "Nystroem": self.setup_nystroem,
+            "PCA": self.setup_pca,
+            "PolynomialFeatures": self.setup_polynomial_features,
+            "RBFSampler": self.setup_rbf_sampler,
+            "RobustScaler": self.setup_robust_scaler,
+            "StandardScaler": self.setup_standard_scaler,
+            "SelectPercentile": self.setup_select_percentile,
+            "VarianceThreshold": self.setup_variance_threshold,
+        }
+        self.cs_preprocessors_name = config_space.meta["preprocessors"]
+
+    @property
+    def shared_hyperparameters(self):
+        return {
+            "gamma": {"lower": 0.01, "upper": 1.01, "default_value": 1.0},
+        }
+
+    def setup_preprocessors(self):
+        preprocessors_choices = list(self.preprocessors_setup_map.keys())
+
+        if not preprocessors_choices:
+            raise ValueError("No preprocessors to add to config space")
+
+        preprocessors = csh.CategoricalHyperparameter(
+            name=self.cs_preprocessors_name,
+            choices=preprocessors_choices,
+        )
+        self.config_space.add_hyperparameter(preprocessors)
+
+        for preprocessor_name in preprocessors_choices:
+            if setup_func := self.preprocessors_setup_map.get(preprocessor_name):
+                setup_func(preprocessors)
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, preprocessor_name: str
+    ):
+        if "preprocessors" not in local_vars or not isinstance(
+            local_vars["preprocessors"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'preprocessors' key with a CategoricalHyperparameter in "
+                "local vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["preprocessors"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(
+                hyperparameter, local_vars["preprocessors"], preprocessor_name
+            )
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_select_fwe(self, preprocessors: csh.CategoricalHyperparameter):
+        alpha = csh.UniformFloatHyperparameter(
+            "alpha__SelectFwe", 0.01, 0.05, default_value=0.05
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(),
"SelectFwe") + + def setup_binarizer(self, preprocessors: csh.CategoricalHyperparameter): + threshold = csh.UniformFloatHyperparameter( + "threshold__Binarizer", 0.0, 1.01, default_value=0.05 + ) + self._add_hyperparameters_and_equals_conditions(locals(), "Binarizer") + + def setup_fast_ica(self, preprocessors: csh.CategoricalHyperparameter): + whiten = csh.CategoricalHyperparameter("whiten", ["unit-variance"]) + tol = csh.UniformFloatHyperparameter( + "tol__FastICA", 0.0, 1.01, default_value=0.05 + ) + self._add_hyperparameters_and_equals_conditions(locals(), "FastICA") + + def setup_feature_agglomeration(self, preprocessors: csh.CategoricalHyperparameter): + linkage = csh.CategoricalHyperparameter( + "linkage__FeatureAgglomeration", ["ward", "complete", "average"] + ) + affinity = csh.CategoricalHyperparameter( + "affinity__FeatureAgglomeration", + ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"], + ) + self._add_hyperparameters_and_equals_conditions( + locals(), "FeatureAgglomeration" + ) + + # Forbidden clause: Linkage is different from 'ward' and affinity is 'euclidean' + forbidden_penalty_loss = cs.ForbiddenAndConjunction( + cs.ForbiddenInClause( + self.config_space["linkage__FeatureAgglomeration"], + ["complete", "average"], + ), + cs.ForbiddenEqualsClause( + self.config_space["affinity__FeatureAgglomeration"], "euclidean" + ), + ) + self.config_space.add_forbidden_clause(forbidden_penalty_loss) + + def setup_max_abs_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_min_max_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_normalizer(self, preprocessors: csh.CategoricalHyperparameter): + norm = csh.CategoricalHyperparameter("norm", ["l1", "l2", "max"]) + self._add_hyperparameters_and_equals_conditions(locals(), "Normalizer") + + def setup_nystroem(self, preprocessors: csh.CategoricalHyperparameter): + kernel = csh.CategoricalHyperparameter( + "kernel", + [ + "rbf", + "cosine", + "chi2", + "laplacian", + "polynomial", + "poly", + "linear", + "additive_chi2", + "sigmoid", + ], + ) + gamma = csh.UniformFloatHyperparameter( + "gamma__Nystroem", **self.shared_hyperparameters["gamma"] + ) + n_components = csh.UniformIntegerHyperparameter("n_components", 1, 11) + self._add_hyperparameters_and_equals_conditions(locals(), "Nystroem") + + def setup_pca(self, preprocessors: csh.CategoricalHyperparameter): + svd_solver = csh.CategoricalHyperparameter("svd_solver", ["randomized"]) + iterated_power = csh.UniformIntegerHyperparameter("iterated_power", 1, 11) + self._add_hyperparameters_and_equals_conditions(locals(), "PCA") + + def setup_polynomial_features(self, preprocessors: csh.CategoricalHyperparameter): + degree = csh.CategoricalHyperparameter("degree", [2]) + include_bias = csh.CategoricalHyperparameter("include_bias", [False]) + interaction_only = csh.CategoricalHyperparameter("interaction_only", [False]) + self._add_hyperparameters_and_equals_conditions(locals(), "PolynomialFeatures") + + def setup_rbf_sampler(self, preprocessors: csh.CategoricalHyperparameter): + gamma = csh.UniformFloatHyperparameter( + "gamma__RBFSampler", **self.shared_hyperparameters["gamma"] + ) + self._add_hyperparameters_and_equals_conditions(locals(), "RBFSampler") + + def setup_robust_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_standard_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def 
setup_select_percentile(self, preprocessors: csh.CategoricalHyperparameter): + percentile = csh.UniformIntegerHyperparameter("percentile", 1, 100) + self._add_hyperparameters_and_equals_conditions(locals(), "SelectPercentile") + + def setup_variance_threshold(self, preprocessors: csh.CategoricalHyperparameter): + threshold = csh.UniformFloatHyperparameter( + "threshold__VarianceThreshold", 0.05, 1.01, default_value=0.05 + ) + self._add_hyperparameters_and_equals_conditions(locals(), "VarianceThreshold") diff --git a/gama/utilities/metrics.py b/gama/utilities/metrics.py index 2c4e21d8..87c3101b 100644 --- a/gama/utilities/metrics.py +++ b/gama/utilities/metrics.py @@ -2,7 +2,7 @@ from typing import Iterable, Tuple, Union from sklearn.metrics import get_scorer -from sklearn.metrics._scorer import _ProbaScorer, _BaseScorer, SCORERS +from sklearn.metrics._scorer import _ProbaScorer, _BaseScorer, _SCORERS classification_metrics = {"accuracy", "roc_auc", "average_precision", "neg_log_loss"} for metric in ["precision", "recall", "f1"]: @@ -19,7 +19,7 @@ } all_metrics = {*classification_metrics, *regression_metrics} -reversed_scorers = {repr(v): k for k, v in SCORERS.items()} +reversed_scorers = {repr(v): k for k, v in _SCORERS.items()} class MetricType(Enum): diff --git a/pyproject.toml b/pyproject.toml index 9ffdd641..d742f6bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "category-encoders>=1.2.8", "black==23.3.0", "psutil", + "ConfigSpace>=0.7.1", ] [project.optional-dependencies] From abf174b7fd54d6f3e84a8c44f031ca723462b5ac Mon Sep 17 00:00:00 2001 From: Provost Simon Date: Mon, 4 Dec 2023 15:29:34 +0000 Subject: [PATCH 2/9] refactor(gama): update internal system to be ConfigSpace compliant --- gama/GamaClassifier.py | 130 ++++++- gama/configuration/parser.py | 139 -------- gama/gama.py | 90 +++-- .../components/individual.py | 11 +- .../components/primitive_node.py | 100 +++++- .../components/terminal.py | 10 + gama/genetic_programming/mutation.py | 144 ++++++-- gama/genetic_programming/operations.py | 323 ++++++++++++++++-- gama/logging/GamaReport.py | 13 +- gama/utilities/config_space.py | 74 ++++ 10 files changed, 778 insertions(+), 256 deletions(-) delete mode 100644 gama/configuration/parser.py create mode 100644 gama/utilities/config_space.py diff --git a/gama/GamaClassifier.py b/gama/GamaClassifier.py index 5ae27493..3ca9d63b 100644 --- a/gama/GamaClassifier.py +++ b/gama/GamaClassifier.py @@ -1,41 +1,135 @@ import inspect from typing import Union, Optional +import logging import numpy as np import pandas as pd +from ConfigSpace import ForbiddenEqualsClause from sklearn.base import ClassifierMixin from sklearn.preprocessing import LabelEncoder +import ConfigSpace as cs -from .gama import Gama +from gama.configuration.classification import config_space as clf_config from gama.data_loading import X_y_from_file -from gama.configuration.classification import clf_config -from gama.utilities.metrics import scoring_to_metric +from gama.utilities.metrics import scoring_to_metric, Metric +from .gama import Gama +from .utilities.config_space import get_estimator_by_name + +# Avoid stopit from logging warnings every time a pipeline evaluation times out +logging.getLogger("stopit").setLevel(logging.ERROR) +log = logging.getLogger(__name__) class GamaClassifier(Gama): """Gama with adaptations for (multi-class) classification.""" - def __init__(self, search_space=None, scoring="neg_log_loss", *args, **kwargs): + def __init__( + self, + search_space: 
Optional[cs.ConfigurationSpace] = None, + scoring: Metric = "neg_log_loss", # type: ignore + *args, + **kwargs, + ): if not search_space: # Do this to avoid the whole dictionary being included in the documentation. search_space = clf_config self._metrics = scoring_to_metric(scoring) + + search_space = self._search_space_check(search_space) + + self._label_encoder = None + super().__init__( + *args, search_space=search_space, scoring=scoring, **kwargs + ) # type: ignore + + def _search_space_check( + self, + search_space: cs.ConfigurationSpace, + ) -> cs.ConfigurationSpace: + """Check if the search space is valid for classification.""" + + # Check if the search space contains a classifier hyperparameter. + if ( + "estimators" not in search_space.meta + or ( + search_space.meta["estimators"] + not in search_space.get_hyperparameters_dict() + ) + or not isinstance( + search_space.get_hyperparameter(search_space.meta["estimators"]), + cs.CategoricalHyperparameter, + ) + ): + raise ValueError( + "The search space must include a hyperparameter for the classifiers " + "that is a CategoricalHyperparameter with choices for all desired " + "classifiers. Please double-check the spelling of the name, and review " + "the `meta` object in the search space configuration located at " + "`configurations/classification.py`. The `meta` object should contain " + "a key `estimators` with a value that is the name of the hyperparameter" + " that contains the classifier choices." + ) + + # Check if the search space contains a preprocessor hyperparameter + # if it is specified in the meta. + if ( + "preprocessors" in search_space.meta + and ( + search_space.meta["preprocessors"] + not in search_space.get_hyperparameters_dict() + ) + or "preprocessors" in search_space.meta + and not isinstance( + search_space.get_hyperparameter(search_space.meta["preprocessors"]), + cs.CategoricalHyperparameter, + ) + ): + raise ValueError( + "The search space must include a hyperparameter for the preprocessors " + "that is a CategoricalHyperparameter with choices for all desired " + "preprocessors. Please double-check the spelling of the name, and " + "review the `meta` object in the search space configuration located at " + "`configurations/classification.py`. The `meta` object should contain " + "a key `preprocessors` with a value that is the name of the " + "hyperparameter that contains the preprocessor choices. " + ) + + # Check if the search space contains only classifiers that have predict_proba + # if the scoring requires probabilities. if any(metric.requires_probabilities for metric in self._metrics): # we don't want classifiers that do not have `predict_proba`, # because then we have to start doing one hot encodings of predictions etc. 
- search_space = { - alg: hp - for (alg, hp) in search_space.items() - if not ( - inspect.isclass(alg) - and issubclass(alg, ClassifierMixin) - and not hasattr(alg(), "predict_proba") - ) - } - - self._label_encoder = None - super().__init__(*args, search_space=search_space, scoring=scoring, **kwargs) + no_proba_clfs = [] + for classifier in search_space.get_hyperparameter( + search_space.meta["estimators"] + ).choices: + estimator = get_estimator_by_name(classifier) + if ( + estimator is not None + and issubclass(estimator, ClassifierMixin) + and not hasattr(estimator(), "predict_proba") + ): + no_proba_clfs.append(classifier) + + log.info( + f"The following classifiers do not have a predict_proba method " + f"and will be excluded from the search space: {no_proba_clfs}" + ) + search_space.add_forbidden_clauses( + [ + ForbiddenEqualsClause( + search_space.get_hyperparameter( + search_space.meta["estimators"] + ), + classifier, + ) + for classifier in no_proba_clfs + if classifier + ] + ) + + return search_space def _predict(self, x: pd.DataFrame): """Predict the target for input X. @@ -52,8 +146,8 @@ def _predict(self, x: pd.DataFrame): """ y = self.model.predict(x) # type: ignore # Decode the predicted labels - necessary only if ensemble is not used. - if y[0] not in list(self._label_encoder.classes_): - y = self._label_encoder.inverse_transform(y) + if y[0] not in list(self._label_encoder.classes_): # type: ignore + y = self._label_encoder.inverse_transform(y) # type: ignore return y def _predict_proba(self, x: pd.DataFrame): diff --git a/gama/configuration/parser.py b/gama/configuration/parser.py deleted file mode 100644 index 705ec268..00000000 --- a/gama/configuration/parser.py +++ /dev/null @@ -1,139 +0,0 @@ -from collections import defaultdict -from typing import Dict, Any, Union, List, Callable, Tuple - -import sklearn - -from gama.genetic_programming.components import Primitive, Terminal, DATA_TERMINAL - - -def pset_from_config( - configuration: Dict[Union[str, object], Any] -) -> Tuple[Dict[str, List], Dict[str, Callable]]: - """Create a pset for the given configuration dictionary. - - Given a configuration dictionary specifying operators (e.g. sklearn - estimators), their hyperparameters and values for each hyperparameter, - create a gp.PrimitiveSetTyped that contains: - - - For each operator a primitive - - For each possible hyperparameter-value combination a unique terminal - - Side effect: Imports the classes of each primitive. - - returns: - pset - Dict[str, List]: - maps return-types to a list of Primitives and/or Terminals - parameter_check - Dict[str, Callable]: - maps Primitive name to a check for the validity of the hp configuration - """ - - pset: Dict[str, List[Union[Primitive, Terminal]]] = defaultdict(list) - parameter_checks = {} - - # Make sure the str-keys are evaluated first, they describe shared hyperparameters. - # Order-preserving dictionaries are not in the Python 3.6 specification. 
- sorted_keys = reversed(sorted(configuration.keys(), key=lambda x: str(type(x)))) - for key in sorted_keys: - values = configuration[key] - if isinstance(key, str): - # Specification of shared hyperparameters - for value in values: - pset[key].append(Terminal(value=value, output=key, identifier=key)) - elif isinstance(key, type): - # Specification of operator (learner, preprocessor) - hyperparameter_types: List[str] = [] - for name, param_values in sorted(values.items()): - # We construct a new type for each hyperparameter, so we can specify - # it as terminal type, making sure it matches with expected - # input of the operators. Moreover it automatically makes sure that - # crossover only happens between same hyperparameters. - if isinstance(param_values, list) and not param_values: - # An empty list indicates a shared hyperparameter - hyperparameter_types.append(name) - elif name == "param_check": - # This allows users to define illegal hyperparameter combinations, - # but is not a terminal. - parameter_checks[key.__name__] = param_values[0] - else: - hp_name = f"{key.__name__}.{name}" - hyperparameter_types.append(hp_name) - for value in param_values: - pset[hp_name].append( - Terminal( - value=value, - output=name, - identifier=hp_name, - ) - ) - - # After registering the hyperparameter types, - # we can register the operator itself. - if issubclass(key, sklearn.base.TransformerMixin): - pset[DATA_TERMINAL].append( - Primitive( - input=tuple(hyperparameter_types), - output=DATA_TERMINAL, - identifier=key, - ) - ) - elif issubclass(key, sklearn.base.ClassifierMixin): - pset["prediction"].append( - Primitive( - input=tuple(hyperparameter_types), - output="prediction", - identifier=key, - ) - ) - elif issubclass(key, sklearn.base.RegressorMixin): - pset["prediction"].append( - Primitive( - input=tuple(hyperparameter_types), - output="prediction", - identifier=key, - ) - ) - else: - raise TypeError( - f"Expected {key} to be either subclass of " - "TransformerMixin, RegressorMixin or ClassifierMixin." - ) - else: - raise TypeError( - "Encountered unknown type as key in dictionary." - "Keys in the configuration should be str or class." - ) - - return pset, parameter_checks - - -def merge_configurations(c1: Dict, c2: Dict) -> Dict: - """Takes two configurations and merges them together.""" - # Should refactor out 6 indentation levels - merged: Dict[Any, Any] = defaultdict(lambda: None, c1) - for algorithm, hparams2 in c2.items(): - if algorithm not in merged: - merged[algorithm] = hparams2 - continue - - hparams = merged[algorithm] - if isinstance(hparams, list) and isinstance(hparams2, list): - merged[algorithm] = list(set(hparams + hparams2)) - continue # Here the algorithm is actually a shared hyperparameter. - - for hyperparameter, values in hparams2.items(): - if hyperparameter not in hparams: - hparams[hyperparameter] = values - continue # Hyperparameter only specified in one configuration. - - values1 = hparams[hyperparameter] - if isinstance(values1, dict) and isinstance(values, dict): - hparams[hyperparameter] = {**values1, **values} - elif isinstance(values1, type(values)): - # Both are ranges, arrays or lists. - hparams[hyperparameter] = list(set(list(values1) + list(values))) - else: - raise TypeError( - f"Could not merge values of {algorithm}.{hyperparameter}:" - f"{hparams} vs. 
{hparams2}" - ) - return merged diff --git a/gama/gama.py b/gama/gama.py index c06cbcb5..9b7e3e61 100644 --- a/gama/gama.py +++ b/gama/gama.py @@ -26,11 +26,12 @@ import pandas as pd import numpy as np import stopit +from ConfigSpace import ForbiddenEqualsClause from sklearn.base import TransformerMixin from sklearn.pipeline import Pipeline import gama.genetic_programming.compilers.scikitlearn -from gama.genetic_programming.components import Individual, Fitness, DATA_TERMINAL +from gama.genetic_programming.components import Individual, Fitness from gama.search_methods.base_search import BaseSearch from gama.utilities.evaluation_library import EvaluationLibrary, Evaluation from gama.utilities.metrics import scoring_to_metric @@ -52,7 +53,6 @@ eliminate_from_pareto, ) from gama.genetic_programming.operations import create_random_expression -from gama.configuration.parser import pset_from_config from gama.genetic_programming.operator_set import OperatorSet from gama.genetic_programming.compilers.scikitlearn import compile_individual from gama.postprocessing import ( @@ -63,11 +63,12 @@ from gama.utilities.generic.async_evaluator import AsyncEvaluator from gama.utilities.metrics import Metric +import ConfigSpace as cs + # Avoid stopit from logging warnings every time a pipeline evaluation times out logging.getLogger("stopit").setLevel(logging.ERROR) log = logging.getLogger(__name__) - STR_NO_OPTIMAL_PIPELINE = """Gama did not yet establish an optimal pipeline. This can be because `fit` was not yet called, or did not terminate successfully.""" @@ -81,7 +82,7 @@ class Gama(ABC): def __init__( self, - search_space: Dict[Union[str, object], Any], + search_space: cs.ConfigurationSpace, scoring: Union[ str, Metric, Iterable[str], Iterable[Metric] ] = "filled_in_by_child_class", @@ -104,9 +105,9 @@ def __init__( Parameters ---------- - search_space: Dict - Specifies available components and their valid hyperparameter settings. - For more information, see :ref:`search_space_configuration`. + search_space: cs.ConfigurationSpace + The ConfigSpace object which defines the search space. Refer to the + configuration/(classification||regression).py file for further details. scoring: str, Metric or Tuple Specifies the/all metric(s) to optimize towards. 
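
For reference, a caller-side sketch of the new parameter (assuming the default
classification space; `GamaClassifier` substitutes it automatically when
`search_space` is omitted, and `max_total_time` is GAMA's existing time-budget
parameter):

    import ConfigSpace as cs
    from gama import GamaClassifier
    from gama.configuration.classification_task import ClassifierConfig

    space = cs.ConfigurationSpace(meta={"estimators": "classifiers"})
    ClassifierConfig(space).setup_classifiers()
    automl = GamaClassifier(search_space=space, max_total_time=180)
    # automl.fit(X, y) then searches over this custom space.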
@@ -277,6 +278,7 @@ def __init__( if random_state is not None: random.seed(random_state) np.random.seed(random_state) + search_space.seed(random_state) self._x: Optional[pd.DataFrame] = None self._y: Optional[pd.DataFrame] = None @@ -301,9 +303,13 @@ def __init__( e = search.logger(os.path.join(self.output_directory, "evaluations.log")) self.evaluation_completed(e.log_evaluation) - self._pset, parameter_checks = pset_from_config(search_space) + self.search_space = search_space - if DATA_TERMINAL not in self._pset: + if ( + "preprocessors" in self.search_space.meta + and self.search_space.meta["preprocessors"] + not in self.search_space.get_hyperparameter_names() + ) or ("preprocessors" not in self.search_space.meta): if max_pipeline_length is None: log.info( "Setting `max_pipeline_length` to 1 " @@ -319,14 +325,14 @@ def __init__( self._operator_set = OperatorSet( mutate=partial( # type: ignore #https://github.com/python/mypy/issues/1484 random_valid_mutation_in_place, - primitive_set=self._pset, + config_space=self.search_space, max_length=max_pipeline_length, ), mate=partial(random_crossover, max_length=max_pipeline_length), create_from_population=partial(create_from_population, cxpb=0.2, mutpb=0.8), create_new=partial( create_random_expression, - primitive_set=self._pset, + config_space=self.search_space, max_length=max_start_length, ), compile_=compile_individual, @@ -557,23 +563,55 @@ def fit( # KNN will create models of about 76Mb in size, which is too big, so # we exclude it from search: log.info("Excluding KNN from search because the dataset is too big.") - from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor - - self._pset["prediction"] = [ - p - for p in self._pset["prediction"] - if p.identifier not in [KNeighborsClassifier, KNeighborsRegressor] - ] + if ( + "KNeighborsClassifier" + in self.search_space.get_hyperparameter( + self.search_space.meta["estimators"] + ).choices + ): + self.search_space.add_forbidden_clause( + ForbiddenEqualsClause( + self.search_space.get_hyperparameter( + self.search_space.meta["estimators"] + ), + "KNeighborsClassifier", + ) + ) + if ( + "KNeighborsRegressor" + in self.search_space.get_hyperparameter( + self.search_space.meta["estimators"] + ).choices + ): + self.search_space.add_forbidden_clause( + ForbiddenEqualsClause( + self.search_space.get_hyperparameter( + self.search_space.meta["regressors"] + ), + "KNeighborsRegressor", + ) + ) - if store_pipelines and self._x.shape[1] > 50: + if ( + store_pipelines + and self._x.shape[1] > 50 + and "preprocessors" in self.search_space.meta + ): log.info("Data has too many features to include PolynomialFeatures") - from sklearn.preprocessing import PolynomialFeatures - - self._pset["data"] = [ - p - for p in self._pset["data"] - if p.identifier not in [PolynomialFeatures] - ] + if ( + "PolynomialFeatures" + in self.search_space.get_hyperparameter( + self.search_space.meta["preprocessors"] + ).choices + ): + self.search_space.add_forbidden_clause( + ForbiddenEqualsClause( + self.search_space.get_hyperparameter( + self.search_space.meta["preprocessors"] + ), + "PolynomialFeatures", + ) + ) if self._time_manager.total_time_remaining < 0: pre_time = self._time_manager.activities[-1].stopwatch.elapsed_time diff --git a/gama/genetic_programming/components/individual.py b/gama/genetic_programming/components/individual.py index 83f161f0..84067c9c 100644 --- a/gama/genetic_programming/components/individual.py +++ b/gama/genetic_programming/components/individual.py @@ -7,6 +7,8 @@ from 
.primitive_node import PrimitiveNode from .terminal import Terminal +import ConfigSpace as cs + class Individual: """Collection of PrimitiveNodes which together specify a machine learning pipeline. @@ -140,7 +142,7 @@ def copy_as_new(self) -> "Individual": def from_string( cls, string: str, - primitive_set: dict, + config_space: cs.ConfigurationSpace, to_pipeline: Optional[Callable] = None, strict: bool = True, ) -> "Individual": @@ -150,8 +152,9 @@ def from_string( ---------- string: str String formatted as `Individual.pipeline_str`. - primitive_set: dict - The dictionary defining all Terminals and Primitives. + config_space: ConfigurationSpace + The ConfigSpace object which defines the search space. Refer to the + configuration/(classification||regression).py file for further details. to_pipeline: Callable, optional (default=None) The function to convert the Individual into a pipeline representation. If `None`, the individuals `pipeline` property will not be available. @@ -166,5 +169,5 @@ def from_string( Individual An individual as defined by `str`. """ - expression = PrimitiveNode.from_string(string, primitive_set, strict) + expression = PrimitiveNode.from_string(string, config_space, strict) return cls(expression, to_pipeline=to_pipeline) diff --git a/gama/genetic_programming/components/primitive_node.py b/gama/genetic_programming/components/primitive_node.py index 0a6671c9..9dd0aa0b 100644 --- a/gama/genetic_programming/components/primitive_node.py +++ b/gama/genetic_programming/components/primitive_node.py @@ -1,6 +1,15 @@ +import ast from typing import List, Union, cast + + from .terminal import DATA_TERMINAL, Terminal from .primitive import Primitive +import ConfigSpace as cs + +from ...utilities.config_space import ( + get_hyperparameter_sklearn_name, + get_estimator_by_name, +) class PrimitiveNode: @@ -62,7 +71,7 @@ def copy(self) -> "PrimitiveNode": @classmethod def from_string( - cls, string: str, primitive_set: dict, strict: bool = True + cls, string: str, config_space: cs.ConfigurationSpace, strict: bool = True ) -> "PrimitiveNode": """Create a PrimitiveNode from string formatted like PrimitiveNode.__str__ @@ -70,8 +79,9 @@ def from_string( ---------- string: str A string formatted similar to PrimitiveNode.__str__ - primitive_set: dict - The dictionary defining all Terminals and Primitives. + config_space: ConfigurationSpace + The ConfigSpace object which defines the search space. Refer to the + configuration/(classification||regression).py file for further details. strict: bool (default=True) Require each primitives has all required terminals present in `string`. 
Non-strict matching may be useful when constructing individuals from @@ -92,13 +102,13 @@ def from_string( last_node: Union[PrimitiveNode, str] = DATA_TERMINAL for primitive_string, terminal_set in zip(reversed(primitives), terminal_sets): - primitive = find_primitive(primitive_set, primitive_string) + primitive = find_primitive(config_space, primitive_string) if terminal_set == "": terminals = [] else: terminal_set = terminal_set[2:] # 2 is because string starts with ', ' terminals = [ - find_terminal(primitive_set, terminal_string) + find_terminal(config_space, terminal_string, primitive_string) for terminal_string in terminal_set.split(", ") ] missing = set(primitive.input) - set(map(lambda t: t.identifier, terminals)) @@ -109,19 +119,75 @@ def from_string( return cast(PrimitiveNode, last_node) -def find_primitive(primitive_set: dict, primitive_string: str) -> Primitive: - """Find the Primitive that matches `primitive_string` in `primitive_set`.""" - all_primitives = primitive_set[DATA_TERMINAL] + primitive_set["prediction"] - for primitive in all_primitives: - if repr(primitive) == primitive_string: - return primitive +def find_primitive( + config_space: cs.ConfigurationSpace, primitive_string: str +) -> Primitive: + """Find the Primitive that matches `primitive_string` in `config_space`.""" + if config_space is None: + raise ValueError("config_space must not be None") + if "estimators" not in config_space.meta: + raise ValueError( + "config_space must have meta information about the estimators" + "hyperparameters" + ) + + estimators = config_space.get_hyperparameter( + config_space.meta["estimators"] + ).choices + preprocessors = [] + if "preprocessors" in config_space.meta: + preprocessors = config_space.get_hyperparameter( + config_space.meta["preprocessors"] + ).choices + + all_hyperparameters = estimators + preprocessors + + if primitive_string in all_hyperparameters: + return Primitive( + input=(), + output=( + "estimators" if primitive_string in estimators else "preprocessors" + ), + identifier=get_estimator_by_name(primitive_string), + ) + raise IndexError(f"Could not find Primitive of type '{primitive_string}'.") -def find_terminal(primitive_set: dict, terminal_string: str) -> Terminal: - """Find the Terminal that matches `terminal_string` in `primitive_set`.""" - term_type, _ = terminal_string.split("=") - for terminal in primitive_set[term_type]: - if repr(terminal) == terminal_string: - return terminal +def find_terminal( + config_space: cs.ConfigurationSpace, terminal_string: str, primitive_string: str +) -> Terminal: + """Find the Terminal that matches `terminal_string` in `config_space`.""" + if config_space is None: + raise ValueError("config_space must not be None") + + term_type, value = terminal_string.split("=") + if "." 
-def find_terminal(primitive_set: dict, terminal_string: str) -> Terminal:
-    """Find the Terminal that matches `terminal_string` in `primitive_set`."""
-    term_type, _ = terminal_string.split("=")
-    for terminal in primitive_set[term_type]:
-        if repr(terminal) == terminal_string:
-            return terminal
+def find_terminal(
+    config_space: cs.ConfigurationSpace, terminal_string: str, primitive_string: str
+) -> Terminal:
+    """Find the Terminal that matches `terminal_string` in `config_space`."""
+    if config_space is None:
+        raise ValueError("config_space must not be None")
+
+    term_type, value = terminal_string.split("=")
+    if "." in term_type:
+        term_parent_type, term_type = term_type.split(".")
+        term_config_space_name = f"{term_type}__{term_parent_type}"
+    else:
+        term_config_space_name = f"{term_type}__{primitive_string}"
+
+    if isinstance(value, str):
+        value = value.replace("'", "").replace('"', "").replace(" ", "")
+        try:
+            value = ast.literal_eval(value)
+        except (ValueError, SyntaxError):
+            value = str(value)
+
+    if term_config_space_name in config_space.get_hyperparameter_names():
+        return Terminal(
+            identifier=get_hyperparameter_sklearn_name(term_config_space_name),
+            value=value,
+            output=get_hyperparameter_sklearn_name(term_config_space_name),
+            config_space_name=term_config_space_name,
+        )
+    if term_type in config_space.get_hyperparameter_names():
+        return Terminal(
+            identifier=get_hyperparameter_sklearn_name(term_type),
+            value=value,
+            output=get_hyperparameter_sklearn_name(term_type),
+            config_space_name=term_type,
+        )
+    raise RuntimeError(f"Could not find Terminal of type '{terminal_string}'.")
diff --git a/gama/genetic_programming/components/terminal.py b/gama/genetic_programming/components/terminal.py
index d8c96c39..c6795b72 100644
--- a/gama/genetic_programming/components/terminal.py
+++ b/gama/genetic_programming/components/terminal.py
@@ -7,11 +7,21 @@ class Terminal(NamedTuple):
     """Specifies a specific value for a specific type or input.
 
     E.g. a value for a hyperparameter for an algorithm.
+
+    It is important to note that you should use the hyperparameter's sklearn name as
+    your output and identifier. If your name contains `__estimatorName`, you should
+    remove it (e.g. by using string split). More information may be found in the
+    documentation for the `get_hyperparameter_sklearn_name` function.
+
+    Furthermore, `config_space_name` is the name of the ConfigSpace hyperparameter,
+    i.e. the hyperparameter name followed by the `__estimatorName` suffix.
     """
 
     value: object
     output: str
     identifier: str
+    config_space_name: str = "Not Specified"
 
     def __str__(self) -> str:
         """str: e.g. "tol=0.5" """
diff --git a/gama/genetic_programming/mutation.py b/gama/genetic_programming/mutation.py
index ad0457c8..834e72f8 100644
--- a/gama/genetic_programming/mutation.py
+++ b/gama/genetic_programming/mutation.py
@@ -2,53 +2,89 @@
 Contains mutation functions for genetic programming.
 Each mutation function takes an individual and modifies it in-place.
 """
+import logging
 import random
 from functools import partial
 from typing import Callable, Optional, cast, List, Dict
 
-from gama.genetic_programming.components import PrimitiveNode
+import ConfigSpace as cs
+import numpy as np
+
+from gama.genetic_programming.components import PrimitiveNode, Terminal
 from .components import Individual, DATA_TERMINAL
 from .operations import random_primitive_node
+from ..utilities.config_space import get_internal_output_types
+
+# Avoid stopit from logging warnings every time a pipeline evaluation times out
+logging.getLogger("stopit").setLevel(logging.ERROR)
+log = logging.getLogger(__name__)
 
 
-def mut_replace_terminal(individual: Individual, primitive_set: dict) -> None:
+def mut_replace_terminal(
+    individual: Individual, config_space: cs.ConfigurationSpace
+) -> None:
     """Mutates an Individual in-place by replacing one of its Terminals.
 
     Parameters
     ----------
     individual: Individual
         Individual to mutate in-place.
-    primitive_set: dict
+    config_space: ConfigurationSpace
+        The ConfigSpace object which defines the search space. Refer to the
        configuration/(classification||regression).py file for further details.
     """
 
     def terminal_replaceable(index_terminal):
         _, terminal = index_terminal
-        return len(primitive_set[terminal.identifier]) > 1
+        return has_multiple_options(
+            config_space.get_hyperparameter(terminal.config_space_name)
+        )
 
     terminals = list(filter(terminal_replaceable, enumerate(individual.terminals)))
     if not terminals:
         raise ValueError("Individual has no terminals suitable for mutation.")
     terminal_index, old = random.choice(terminals)
 
-    candidates = filter(lambda t: t.value != old.value, primitive_set[old.identifier])
-    new_terminal = random.choice(list(candidates))
+    # Use one unseeded RandomState for the whole loop: re-creating a fixed-seed
+    # RandomState on every iteration would draw the same value forever and can
+    # loop infinitely when that value equals the old one.
+    rng = np.random.RandomState()
+    while True:
+        new_terminal_value = config_space.get_hyperparameter(
+            old.config_space_name
+        ).sample(rng)
+        if new_terminal_value != old.value:
+            break
+
+    new_terminal = Terminal(
+        identifier=old.identifier,
+        value=new_terminal_value,
+        output=old.output,
+        config_space_name=old.config_space_name,
+    )
     individual.replace_terminal(terminal_index, new_terminal)
 
 
-def mut_replace_primitive(individual: Individual, primitive_set: dict) -> None:
+def mut_replace_primitive(
+    individual: Individual, config_space: cs.ConfigurationSpace
+) -> None:
     """Mutates an Individual in-place by replacing one of its Primitives.
 
     Parameters
     ----------
     individual: Individual
         Individual to mutate in-place.
-    primitive_set: dict
+    config_space: cs.ConfigurationSpace
+        The ConfigSpace object which defines the search space. Refer to the
+        configuration/(classification||regression).py file for further details.
     """
 
     def primitive_replaceable(index_primitive):
         _, primitive = index_primitive
-        return len(primitive_set[primitive._primitive.output]) > 1
+        return has_multiple_options(
+            config_space.get_hyperparameter(
+                config_space.meta[primitive._primitive.output]
+                if primitive._primitive.output in get_internal_output_types()
+                else primitive._primitive.output
+            )
+        )
 
     primitives = list(filter(primitive_replaceable, enumerate(individual.primitives)))
     if not primitives:
@@ -57,7 +93,7 @@ def primitive_replaceable(index_primitive):
     primitive_index, old_primitive_node = random.choice(primitives)
     primitive_node = random_primitive_node(
         output_type=old_primitive_node._primitive.output,
-        primitive_set=primitive_set,
+        config_space=config_space,
         exclude=old_primitive_node._primitive,
     )
     individual.replace_primitive(primitive_index, primitive_node)
@@ -65,7 +101,7 @@ def primitive_replaceable(index_primitive):
 
 def mut_shrink(
     individual: Individual,
-    _primitive_set: Optional[dict] = None,
+    _config_space: Optional[cs.ConfigurationSpace] = None,
     shrink_by: Optional[int] = None,
 ) -> None:
     """Mutates an Individual in-place by removing any number of primitive nodes.
@@ -76,7 +112,7 @@
     ----------
     individual: Individual
         Individual to mutate in-place.
-    _primitive_set: dict, optional
+    _config_space: cs.ConfigurationSpace, optional
         Not used. Present to create a matching function signature with other mutations.
     shrink_by: int, optional (default=None)
         Number of primitives to remove.
@@ -97,7 +133,7 @@
         current_primitive_node._data_node = DATA_TERMINAL
 
 
-def mut_insert(individual: Individual, primitive_set: dict) -> None:
+def mut_insert(individual: Individual, config_space: cs.ConfigurationSpace) -> None:
     """Mutate an Individual in-place by inserting a PrimitiveNode at a random location.
 
     The new PrimitiveNode will not be inserted as root node.
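For intuition, a hedged sketch of the insertion mutation defined above (the pipeline strings are assumptions, not taken from the diff):

    # mut_insert chains a fresh non-root node in front of an existing one,
    # e.g. "GaussianNB(data)" may become "GaussianNB(MinMaxScaler(data))".
    mut_insert(individual, config_space)  # `individual` is mutated in-place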
@@ -106,18 +142,60 @@ def mut_insert(individual: Individual, primitive_set: dict) -> None:
     ----------
     individual: Individual
         Individual to mutate in-place.
-    primitive_set: dict
+    config_space: cs.ConfigurationSpace
+        The ConfigSpace object which defines the search space. Refer to the
+        configuration/(classification||regression).py file for further details.
     """
     parent_node = random.choice(list(individual.primitives))
     new_primitive_node = random_primitive_node(
-        output_type=DATA_TERMINAL, primitive_set=primitive_set
+        output_type=DATA_TERMINAL, config_space=config_space
     )
     new_primitive_node._data_node = parent_node._data_node
     parent_node._data_node = new_primitive_node
 
 
+def has_multiple_options(hyperparameter: cs.hyperparameters.Hyperparameter) -> bool:
+    """Check if a ConfigSpace hyperparameter has more than one option.
+
+    Only Constant, Float, Integer, and Categorical hyperparameters are currently
+    supported. A TypeError is raised if the hyperparameter is not of one of these
+    types. Additionally, readers are directed to our GitHub Issues page to request
+    support for other types.
+
+    Parameters
+    ----------
+    hyperparameter: cs.hyperparameters.Hyperparameter
+        The hyperparameter to check.
+
+    Returns
+    -------
+    bool
+        True if the hyperparameter has more than one option, False otherwise.
+    """
+    if isinstance(
+        hyperparameter,
+        (
+            cs.hyperparameters.FloatHyperparameter,
+            cs.hyperparameters.IntegerHyperparameter,
+        ),
+    ):
+        # For Float and Integer, check if the upper and lower bounds differ
+        return hyperparameter.upper > hyperparameter.lower
+    elif isinstance(hyperparameter, cs.CategoricalHyperparameter):
+        # For Categorical, check if there is more than one unique choice
+        return len(set(hyperparameter.choices)) > 1
+    elif isinstance(hyperparameter, cs.hyperparameters.Constant):
+        # Constant has only one option
+        return False
+    else:
+        # Not a recognised hyperparameter type
+        raise TypeError(f"Hyperparameter type {type(hyperparameter)} not supported")
+
+
 def random_valid_mutation_in_place(
-    individual: Individual, primitive_set: dict, max_length: Optional[int] = None
+    individual: Individual,
+    config_space: cs.ConfigurationSpace,
+    max_length: Optional[int] = None,
 ) -> Callable:
     """Apply a random valid mutation in place.
@@ -131,11 +209,12 @@
     Parameters
     ----------
     individual: Individual
-        An individual to be mutated *in-place*.
-    primitive_set: dict
-        A dictionary defining the set of primitives and terminals.
+        An individual to be mutated *in-place*.
+    config_space: cs.ConfigurationSpace
+        The ConfigSpace object which defines the search space. Refer to the
+        configuration/(classification||regression).py file for further details.
     max_length: int, optional (default=None)
-        If specified, impose a maximum length on the new individual.
+        If specified, impose a maximum length on the new individual.
Returns @@ -151,8 +230,16 @@ def random_valid_mutation_in_place( ) else: replaceable_primitives = filter( - lambda p: len(primitive_set[p._primitive.output]) > 1, individual.primitives + lambda p: has_multiple_options( + config_space.get_hyperparameter( + config_space.meta[p._primitive.output] + if p._primitive.output in get_internal_output_types() + else p._primitive.output + ) + ), + individual.primitives, ) + if len(list(replaceable_primitives)) > 1: available_mutations.append(mut_replace_primitive) @@ -162,12 +249,21 @@ def random_valid_mutation_in_place( available_mutations.append(mut_shrink) replaceable_terminals = filter( - lambda t: len(primitive_set[t.identifier]) > 1, individual.terminals + lambda t: has_multiple_options( + config_space.get_hyperparameter(t.config_space_name) + ), + individual.terminals, ) if len(list(replaceable_terminals)) > 1: available_mutations.append(mut_replace_terminal) - mut_fn = random.choice(available_mutations) - mut_fn(individual, primitive_set) + if not available_mutations: + log.warning( + f"Individual {individual} has no valid mutations. " + f"Returning original individual." + ) + return lambda ind, config: ind + mut_fn = random.choice(available_mutations) + mut_fn(individual, config_space) return mut_fn diff --git a/gama/genetic_programming/operations.py b/gama/genetic_programming/operations.py index 342da41f..2310cca1 100644 --- a/gama/genetic_programming/operations.py +++ b/gama/genetic_programming/operations.py @@ -1,5 +1,8 @@ +import copy +from typing import List, Optional, Any, Tuple, Union import random -from typing import List, Optional + +import ConfigSpace as cs from gama.genetic_programming.components import ( Primitive, @@ -7,38 +10,314 @@ PrimitiveNode, DATA_TERMINAL, ) - - -def random_terminals_for_primitive( - primitive_set: dict, primitive: Primitive -) -> List[Terminal]: - """Return a list with a random Terminal for each required input to Primitive.""" - return [random.choice(primitive_set[term_type]) for term_type in primitive.input] +from gama.utilities.config_space import ( + get_estimator_by_name, + get_hyperparameter_sklearn_name, + get_internal_output_types, +) def random_primitive_node( - output_type: str, primitive_set: dict, exclude: Optional[Primitive] = None + output_type: str, + config_space: cs.ConfigurationSpace, + exclude: Optional[Primitive] = None, ) -> PrimitiveNode: """Create a PrimitiveNode with specified output_type and random terminals.""" - primitive = random.choice([p for p in primitive_set[output_type] if p != exclude]) - terminals = random_terminals_for_primitive(primitive_set, primitive) - return PrimitiveNode(primitive, data_node=DATA_TERMINAL, terminals=terminals) + if output_type not in get_internal_output_types(): + raise ValueError(f"Output type {output_type} not supported") + + if exclude is not None: + temp_config_space = copy.deepcopy(config_space) + if output_type not in temp_config_space.meta: + raise ValueError(f"Output type {output_type} not in config_space meta.") + temp_config_space.add_forbidden_clause( + cs.ForbiddenEqualsClause( + temp_config_space.get_hyperparameter( + temp_config_space.meta[output_type] + ), + exclude.__str__(), + ) + ) + config = temp_config_space.sample_configuration() + else: + config = config_space.sample_configuration() + + if output_type in [DATA_TERMINAL, "preprocessors"]: + ( + preprocessor_primitive, + preprocessor_terminals, + ) = _config_preprocessor_to_primitive_node( + config, config_space.meta, config_space.get_conditions() + ) + return PrimitiveNode( + 
preprocessor_primitive, + data_node=DATA_TERMINAL, + terminals=preprocessor_terminals, + ) + estimator_primitive, estimator_terminals = _config_estimator_to_primitive_node( + config, config_space.meta, config_space.get_conditions() + ) + return PrimitiveNode( + primitive=estimator_primitive, + data_node=DATA_TERMINAL, + terminals=estimator_terminals, + ) def create_random_expression( - primitive_set: dict, min_length: int = 1, max_length: int = 3 + config_space: cs.ConfigurationSpace, + min_length: int = 1, + max_length: int = 3, ) -> PrimitiveNode: """Create at least min_length and at most max_length chained PrimitiveNodes.""" individual_length = random.randint(min_length, max_length) - learner_node = random_primitive_node( - output_type="prediction", primitive_set=primitive_set + return _config_to_primitive_node( + config=config_space.sample_configuration(), + config_meta=config_space.meta, + conditions=config_space.get_conditions(), + config_length=individual_length, + ) + + +def extract_valid_hyperparameters( + cond: cs.conditions, config: cs.Configuration, config_meta: dict, meta_key: str +) -> Union[str, None]: + """Extract valid hyperparameters from a condition. + + For each supported ConfigSpace condition type, extract the valid hyperparameters + from the condition. The valid hyperparameters are the hyperparameters that are + valid for the given condition and configuration based on the meta_key. + + Supported ConfigSpace condition types: + - EqualsCondition + - AndConjunction + + Readers are encouraged to add support for more ConfigSpace condition types if + needed. Open an issue or pull request on the GAMA GitHub repository. + + Parameters + ---------- + cond : cs.conditions + A condition of type ConfigSpace. + config : cs.Configuration + A configuration of type ConfigSpace. + config_meta : dict + The meta information of the ConfigSpace. + meta_key : str + The meta key of the ConfigSpace. + """ + if meta_key not in config_meta: + raise ValueError(f"Meta key {meta_key} not in config_meta") + if type(cond) not in [cs.conditions.EqualsCondition, cs.conditions.AndConjunction]: + raise ValueError( + f"Condition type {type(cond)} not supported. Refer to " + f"docstring for supported condition types." + ) + if isinstance(cond, cs.conditions.EqualsCondition): + if ( + cond.parent.name == config_meta[meta_key] + and cond.value == config[config_meta[meta_key]] + ): + return cond.child.name + elif isinstance(cond, cs.conditions.AndConjunction): + for component in cond.components: + if ( + component.parent.name == config_meta[meta_key] + and component.value == config[config_meta[meta_key]] + ): + return component.child.name + return None + + +def _config_estimator_to_primitive_node( + config: cs.Configuration, + config_meta: dict, + conditions: List[Any], + output_type: Optional[str] = None, +) -> Tuple[Primitive, List[Terminal]]: + """Create a PrimitiveNode from a configuration of type ConfigSpace (estimator). + + Creates a PrimitiveNode from a configuration of type ConfigSpace. Focuses on + the estimator part of the configuration. It starts by creating a Primitive for + the selected estimator. Then it determines the valid hyperparameters for the + estimator based on the conditions. Finally, it creates a Terminal for each valid + hyperparameter for the estimator. + + Parameters + ---------- + config : cs.Configuration + A configuration of type ConfigSpace. + config_meta : dict + The meta information of the ConfigSpace. + conditions : List[Any] + The conditions of the ConfigSpace. 
+    output_type : str, optional
+        The output type of the PrimitiveNode, by default None.
+    """
+    if (
+        "estimators" not in config_meta
+        or config_meta["estimators"] not in config.get_dictionary()
+    ):
+        raise ValueError(
+            f"Configuration {config} does not contain an `estimator` ConfigSpace "
+            f"Hyperparameter. Cannot construct estimator PrimitiveNode."
+        )
+    if output_type is None:
+        output_type = "estimators"
+
+    # Create a Primitive for the selected estimator
+    estimator_primitive = Primitive(
+        identifier=get_estimator_by_name(config[config_meta["estimators"]]),
+        output=output_type,
+        input=tuple(
+            get_hyperparameter_sklearn_name(hp)
+            for hp in config
+            if hp
+            not in [
+                config_meta["estimators"],
+                config_meta.get("preprocessors"),
+            ]
+        ),
     )
-    last_primitive_node = learner_node
-    for _ in range(individual_length - 1):
-        primitive_node = random_primitive_node(
-            output_type=DATA_TERMINAL, primitive_set=primitive_set
+
+    # Determine valid hyperparameters for estimators based on conditions
+    estimator_valid_hyperparameters = [
+        name
+        for condition in conditions
+        if (
+            name := extract_valid_hyperparameters(
+                condition, config, config_meta, "estimators"
+            )
+        )
+        is not None
+    ]
+
+    # Create a Terminal for each valid hyperparameter for estimators
+    estimator_terminals = [
+        Terminal(
+            identifier=get_hyperparameter_sklearn_name(param),
+            value=value,
+            output=get_hyperparameter_sklearn_name(param),
+            config_space_name=param,
+        )
+        for param, value in config.items()
+        if param in estimator_valid_hyperparameters
+        and param
+        not in [
+            config_meta["estimators"],
+            config_meta.get("preprocessors"),
+        ]
+    ]
+
+    return estimator_primitive, estimator_terminals
+
+
+def _config_preprocessor_to_primitive_node(
+    config: cs.Configuration,
+    config_meta: dict,
+    conditions: List[Any],
+    output_type: Optional[str] = None,
+) -> Tuple[Primitive, List[Terminal]]:
+    """Create a PrimitiveNode from a configuration of type ConfigSpace (preprocessor).
+
+    Creates a PrimitiveNode from a configuration of type ConfigSpace. Focuses on
+    the preprocessor part of the configuration. It starts by creating a Primitive for
+    the selected preprocessor. Then it determines the valid hyperparameters for the
+    preprocessor based on the conditions. Finally, it creates a Terminal for each
+    valid hyperparameter for the preprocessor.
+
+    Parameters
+    ----------
+    config : cs.Configuration
+        A configuration of type ConfigSpace.
+    config_meta : dict
+        The meta information of the ConfigSpace.
+    conditions : List[Any]
+        The conditions of the ConfigSpace.
+    output_type : str, optional
+        The output type of the PrimitiveNode, by default None.
+    """
+    if (
+        "preprocessors" not in config_meta
+        or config_meta["preprocessors"] not in config.get_dictionary()
+    ):
+        raise ValueError(
+            f"Configuration {config} does not contain a `preprocessor` ConfigSpace "
+            f"Hyperparameter. Cannot construct preprocessor PrimitiveNode."
+ ) + if output_type is None: + output_type = "preprocessors" + + # Create a Primitive for the selected preprocessor + preprocessor_primitive = Primitive( + identifier=get_estimator_by_name(config[config_meta["preprocessors"]]), + output=output_type, + input=tuple( + get_hyperparameter_sklearn_name(hp) + for hp in config + if hp not in [config_meta.get("estimators"), config_meta["preprocessors"]] + ), + ) + + # Determine valid hyperparameters for preprocessor based on conditions + preprocessor_valid_hyperparameters = [ + name + for condition in conditions + if ( + name := extract_valid_hyperparameters( + condition, config, config_meta, "preprocessors" + ) ) - last_primitive_node._data_node = primitive_node - last_primitive_node = primitive_node + is not None + ] - return learner_node + # Create a Terminal for each valid hyperparameter for preprocessor + preprocessor_terminals = [ + Terminal( + identifier=get_hyperparameter_sklearn_name(param), + value=value, + output=get_hyperparameter_sklearn_name(param), + config_space_name=param, + ) + for param, value in config.items() + if param in preprocessor_valid_hyperparameters + and param not in [config_meta.get("estimators"), config_meta["preprocessors"]] + ] + + return preprocessor_primitive, preprocessor_terminals + + +def _config_to_primitive_node( + config: cs.Configuration, + config_meta: dict, + conditions: List[Any], + config_length: Optional[int] = None, +) -> PrimitiveNode: + """Create a PrimitiveNode from a configuration. If config_length is specified, the + PrimitiveNode will have at most config_length PrimitiveNodes.""" + if isinstance(config_length, int) and config_length <= 1: + estimator_primitive, estimator_terminals = _config_estimator_to_primitive_node( + config, config_meta, conditions + ) + return PrimitiveNode( + estimator_primitive, data_node=DATA_TERMINAL, terminals=estimator_terminals + ) + estimator_primitive, estimator_terminals = _config_estimator_to_primitive_node( + config, config_meta, conditions + ) + ( + preprocessor_primitive, + preprocessor_terminals, + ) = _config_preprocessor_to_primitive_node(config, config_meta, conditions) + + # Create a PrimitiveNode for the preprocessor + preprocessor_node = PrimitiveNode( + preprocessor_primitive, + data_node=DATA_TERMINAL, + terminals=preprocessor_terminals, + ) + + # Create a PrimitiveNode for the classifier and chain it to the preprocessor + return PrimitiveNode( + estimator_primitive, data_node=preprocessor_node, terminals=estimator_terminals + ) diff --git a/gama/logging/GamaReport.py b/gama/logging/GamaReport.py index 56e6af4f..a6c3f11d 100644 --- a/gama/logging/GamaReport.py +++ b/gama/logging/GamaReport.py @@ -4,13 +4,14 @@ import pandas as pd -from gama.configuration.classification import clf_config -from gama.configuration.parser import pset_from_config, merge_configurations -from gama.configuration.regression import reg_config +from gama.configuration.classification import config_space as cls_config +from gama.configuration.regression import config_space as reg_config +from gama.utilities.config_space import merge_configurations from gama.genetic_programming.components import Individual - -pset, _ = pset_from_config(merge_configurations(clf_config, reg_config)) +config_space = merge_configurations( + c1=cls_config, c2=reg_config, prefix="merged_regression" +) class GamaReport: @@ -95,7 +96,7 @@ def tuple_to_metrics(tuple_str): df.duration = pd.to_timedelta(df.duration, unit="s") new_individuals = { - id_: Individual.from_string(pipeline, pset, 
strict=self.strict)
+        id_: Individual.from_string(pipeline, config_space, strict=self.strict)
         for id_, pipeline in zip(df.id, df.pipeline)
     }
diff --git a/gama/utilities/config_space.py b/gama/utilities/config_space.py
new file mode 100644
index 00000000..87b20f21
--- /dev/null
+++ b/gama/utilities/config_space.py
@@ -0,0 +1,74 @@
+import ConfigSpace as cs
+import sklearn
+from gama.genetic_programming.components import DATA_TERMINAL
+from sklearn.utils import all_estimators
+
+
+def get_internal_output_types() -> list[str]:
+    """Returns the internal ConfigSpace/GAMA output types.
+
+    Returns
+    -------
+    list[str]
+        List of internal ConfigSpace/GAMA output types.
+    """
+    return [DATA_TERMINAL, "preprocessors", "estimators"]
+
+
+def get_hyperparameter_sklearn_name(hyperparameter_name: str) -> str:
+    """Converts a ConfigSpace hyperparameter name to the name used in sklearn.
+
+    Parameters
+    ----------
+    hyperparameter_name: str
+        Name of the hyperparameter used in ConfigSpace.
+
+    Returns
+    -------
+    str
+        Name of the hyperparameter used in sklearn.
+    """
+    return hyperparameter_name.split("__")[0]
+
+
+def get_estimator_by_name(name: str) -> sklearn.base.BaseEstimator:
+    """Returns a (sklearn) estimator by name.
+
+    Identify an estimator, which could be a classifier, regressor, or transformer.
+    The name should be the same as the estimator's name in sklearn
+    (for example, "GaussianNB"). Should libraries other than sklearn be supported
+    in the long term, this function could be extended to search through those
+    libraries as well.
+
+    Parameters
+    ----------
+    name: str
+        Name of the (sklearn) estimator.
+
+    Returns
+    -------
+    estimator: sklearn.base.BaseEstimator
+        The (sklearn) estimator corresponding to the name.
+    """
+    classifiers = dict(all_estimators(type_filter="classifier"))
+    regressors = dict(all_estimators(type_filter="regressor"))
+    transformers = dict(all_estimators(type_filter="transformer"))
+
+    all_estimators_dict = classifiers | regressors | transformers
+
+    estimator = all_estimators_dict.get(name)
+
+    if estimator is None:
+        raise ValueError(f"Could not find estimator with name {name}.")
+
+    return estimator
+
+
+def merge_configurations(
+    c1: cs.ConfigurationSpace,
+    c2: cs.ConfigurationSpace,
+    prefix: str = "merged",
+    delimiter: str = "_",
+) -> cs.ConfigurationSpace:
+    """Merge two configuration spaces by adding `c2` to `c1` under `prefix`.
+
+    Note that `c1` is modified in place and returned.
+    """
+    c1.add_configuration_space(prefix, c2, delimiter)
+    return c1
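A short usage sketch for the helpers above (illustrative; the estimator name is an assumption):

    from gama.utilities.config_space import (
        get_estimator_by_name,
        get_hyperparameter_sklearn_name,
    )

    get_estimator_by_name("GaussianNB")  # -> sklearn.naive_bayes.GaussianNB
    get_hyperparameter_sklearn_name("C__LinearSVC")  # -> "C"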
From f1bb4131bbbae42ea21953b65c199fa0d1fa8c8a Mon Sep 17 00:00:00 2001
From: Provost Simon
Date: Mon, 4 Dec 2023 15:30:40 +0000
Subject: [PATCH 3/9] refactor(configuration): update regressors to be
 ConfigSpace compliant

---
 gama/GamaRegressor.py                         |  59 +++-
 gama/configuration/regression.py              | 169 +++-------
 .../configuration/regression_task/__init__.py |   2 +
 .../regression_task/preprocessors.py          | 265 ++++++++++++++++
 .../regression_task/regressors.py             | 284 ++++++++++++++++++
 5 files changed, 648 insertions(+), 131 deletions(-)
 create mode 100644 gama/configuration/regression_task/__init__.py
 create mode 100644 gama/configuration/regression_task/preprocessors.py
 create mode 100644 gama/configuration/regression_task/regressors.py

diff --git a/gama/GamaRegressor.py b/gama/GamaRegressor.py
index f6e979e7..6f5386db 100644
--- a/gama/GamaRegressor.py
+++ b/gama/GamaRegressor.py
@@ -1,7 +1,8 @@
 import pandas as pd
 
 from .gama import Gama
-from gama.configuration.regression import reg_config
+from gama.configuration.regression import config_space as reg_config
+import ConfigSpace as cs
 
 
 class GamaRegressor(Gama):
@@ -16,8 +17,64 @@ def __init__(
         if not search_space:
             search_space = reg_config
+
+        search_space = self._search_space_check(search_space)
+
         super().__init__(*args, search_space=search_space, scoring=scoring, **kwargs)
 
+    def _search_space_check(
+        self, search_space: cs.ConfigurationSpace
+    ) -> cs.ConfigurationSpace:
+        """Check if the search space is valid for regression."""
+
+        # Check if the search space contains a regressor hyperparameter.
+        if (
+            "estimators" not in search_space.meta
+            or (
+                search_space.meta["estimators"]
+                not in search_space.get_hyperparameters_dict()
+            )
+            or not isinstance(
+                search_space.get_hyperparameter(search_space.meta["estimators"]),
+                cs.CategoricalHyperparameter,
+            )
+        ):
+            raise ValueError(
+                "The search space must include a hyperparameter for the regressors "
+                "that is a CategoricalHyperparameter with choices for all desired "
+                "regressors. Please double-check the spelling of the name, and review "
+                "the `meta` object in the search space configuration located at "
+                "`configurations/regression.py`. The `meta` object should contain "
+                "a key `estimators` with a value that is the name of the hyperparameter"
+                " that contains the regressor choices."
+            )
+
+        # Check if the search space contains a preprocessor hyperparameter
+        # if it is specified in the meta.
+        if (
+            "preprocessors" in search_space.meta
+            and (
+                search_space.meta["preprocessors"]
+                not in search_space.get_hyperparameters_dict()
+            )
+            or "preprocessors" in search_space.meta
+            and not isinstance(
+                search_space.get_hyperparameter(search_space.meta["preprocessors"]),
+                cs.CategoricalHyperparameter,
+            )
+        ):
+            raise ValueError(
+                "The search space must include a hyperparameter for the preprocessors "
+                "that is a CategoricalHyperparameter with choices for all desired "
+                "preprocessors. Please double-check the spelling of the name, and "
+                "review the `meta` object in the search space configuration located at "
+                "`configurations/regression.py`. The `meta` object should contain "
+                "a key `preprocessors` with a value that is the name of the "
+                "hyperparameter that contains the preprocessor choices. "
+            )
+
+        return search_space
+
     def _predict(self, x: pd.DataFrame):
         """Predict the target for input X.

diff --git a/gama/configuration/regression.py b/gama/configuration/regression.py
index f9de2fd0..872cfc69 100644
--- a/gama/configuration/regression.py
+++ b/gama/configuration/regression.py
@@ -1,136 +1,45 @@
-import numpy as np
+import ConfigSpace as cs
 
-from sklearn.cluster import FeatureAgglomeration
-from sklearn.preprocessing import (
-    MaxAbsScaler,
-    MinMaxScaler,
-    Normalizer,
-    PolynomialFeatures,
-    RobustScaler,
-    StandardScaler,
-    Binarizer,
-)
-from sklearn.kernel_approximation import Nystroem, RBFSampler
-from sklearn.decomposition import PCA, FastICA
-from sklearn.feature_selection import (
-    SelectFwe,
-    SelectPercentile,
-    VarianceThreshold,
-    f_regression,
-)
+from .regression_task import RegressorConfig, PreprocessorConfig
+
+# Regressors & Preprocessors 🚀
+
+# This script is your ticket to configuring a ConfigSpace object, teeming with
+# regressors and preprocessors. We are diving in with the RegressorConfig and
+# PreprocessorConfig classes to fill the configuration space with a slew of
+# hyperparameters and options.
+
+# Customise Your Space 🔧
+
+# Want just regressors? No biggie! Just comment out or remove the PreprocessorConfig
+# setup. Voila! You're left with a sleek, regressor-only configuration space.
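For instance (a sketch, not lines from this file), a regressor-only space could be assembled and sampled as follows, assuming the preprocessor meta key is dropped as described above:

    import ConfigSpace as cs
    from gama.configuration.regression_task import RegressorConfig

    space = cs.ConfigurationSpace(meta={"estimators": "regressors"})
    RegressorConfig(space).setup_regressors()
    print(space.sample_configuration()["regressors"])  # e.g. "LinearSVR"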
+ +# Want to add more regressors or preprocessors? Easy! Just add them to the +# RegressorConfig or PreprocessorConfig classes, respectively. You can even +# add your own custom regressors or preprocessors. Just make sure they are +# compatible with scikit-learn's API. + +# Meta-Parameters 📝 + +# The meta-parameters are the "estimators" and "preprocessors" keys in the +# configuration space. These are used to identify the regressors and preprocessors +# by the internal system. They are not hyperparameters, and should not be +# changed, except by advanced users. If you do change them, make sure to change +# the corresponding values in the current configuration space, i.e. in RegressorConfig +# and PreprocessorConfig. +# 👩‍💻👨‍💻 Happy configuring, and may your machine learning models shine! -from sklearn.linear_model import ElasticNetCV, LassoLarsCV -from sklearn.ensemble import ( - ExtraTreesRegressor, - GradientBoostingRegressor, - AdaBoostRegressor, - RandomForestRegressor, +config_space = cs.ConfigurationSpace( + meta={ + # "gama_system_name": "current_configuration_name", + "estimators": "regressors", + "preprocessors": "preprocessors", + } ) -from sklearn.tree import DecisionTreeRegressor -from sklearn.neighbors import KNeighborsRegressor -from sklearn.svm import LinearSVR -# For comparison, this selection of operators and hyperparameters is -# currently most of what TPOT supports. +regressor_config = RegressorConfig(config_space) +regressor_config.setup_regressors() -reg_config = { - ElasticNetCV: { - "l1_ratio": np.arange(0.0, 1.01, 0.05), - "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], - }, - ExtraTreesRegressor: { - "n_estimators": [100], - "max_features": np.arange(0.05, 1.01, 0.05), - "min_samples_split": range(2, 21), - "min_samples_leaf": range(1, 21), - "bootstrap": [True, False], - }, - GradientBoostingRegressor: { - "n_estimators": [100], - "loss": ["squared_error", "absolute_error", "huber", "quantile"], - "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0], - "max_depth": range(1, 11), - "min_samples_split": range(2, 21), - "min_samples_leaf": range(1, 21), - "subsample": np.arange(0.05, 1.01, 0.05), - "max_features": np.arange(0.05, 1.01, 0.05), - "alpha": [0.75, 0.8, 0.85, 0.9, 0.95, 0.99], - }, - AdaBoostRegressor: { - "n_estimators": [100], - "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0], - "loss": ["linear", "square", "exponential"], - # 'max_depth': range(1, 11) not available in sklearn==0.19.1 - }, - DecisionTreeRegressor: { - "max_depth": range(1, 11), - "min_samples_split": range(2, 21), - "min_samples_leaf": range(1, 21), - }, - KNeighborsRegressor: { - "n_neighbors": range(1, 101), - "weights": ["uniform", "distance"], - "p": [1, 2], - }, - LassoLarsCV: {"normalize": [True, False]}, - LinearSVR: { - "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"], - "dual": [True, False], - "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1], - "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0], - "epsilon": [1e-4, 1e-3, 1e-2, 1e-1, 1.0], - }, - RandomForestRegressor: { - "n_estimators": [100], - "max_features": np.arange(0.05, 1.01, 0.05), - "min_samples_split": range(2, 21), - "min_samples_leaf": range(1, 21), - "bootstrap": [True, False], - }, - # Preprocesssors - Binarizer: {"threshold": np.arange(0.0, 1.01, 0.05)}, - FastICA: { - "tol": np.arange(0.0, 1.01, 0.05), - "whiten": ["unit-variance"], - }, - FeatureAgglomeration: { - "linkage": ["ward", "complete", "average"], - "affinity": ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"], - "param_check": [ - 
lambda params: params["linkage"] != "ward"
-            or params["affinity"] == "euclidean"
-        ],
-    },
-    MaxAbsScaler: {},
-    MinMaxScaler: {},
-    Normalizer: {"norm": ["l1", "l2", "max"]},
-    Nystroem: {
-        "kernel": [
-            "rbf",
-            "cosine",
-            "chi2",
-            "laplacian",
-            "polynomial",
-            "poly",
-            "linear",
-            "additive_chi2",
-            "sigmoid",
-        ],
-        "gamma": np.arange(0.0, 1.01, 0.05),
-        "n_components": range(1, 11),
-    },
-    PCA: {"svd_solver": ["randomized"], "iterated_power": range(1, 11)},
-    PolynomialFeatures: {
-        "degree": [2],
-        "include_bias": [False],
-        "interaction_only": [False],
-    },
-    RBFSampler: {"gamma": np.arange(0.0, 1.01, 0.05)},
-    RobustScaler: {},
-    StandardScaler: {},
-    # Selectors
-    SelectFwe: {"alpha": np.arange(0, 0.05, 0.001), "score_func": {f_regression: None}},
-    SelectPercentile: {"percentile": range(1, 100), "score_func": {f_regression: None}},
-    VarianceThreshold: {"threshold": np.arange(0.05, 1.01, 0.05)},
-}
+preprocessor_config = PreprocessorConfig(config_space)
+preprocessor_config.setup_preprocessors()
diff --git a/gama/configuration/regression_task/__init__.py b/gama/configuration/regression_task/__init__.py
new file mode 100644
index 00000000..2a2bdd75
--- /dev/null
+++ b/gama/configuration/regression_task/__init__.py
@@ -0,0 +1,2 @@
+from .regressors import RegressorConfig
+from .preprocessors import PreprocessorConfig
diff --git a/gama/configuration/regression_task/preprocessors.py b/gama/configuration/regression_task/preprocessors.py
new file mode 100644
index 00000000..7b0c4b05
--- /dev/null
+++ b/gama/configuration/regression_task/preprocessors.py
@@ -0,0 +1,265 @@
+import ConfigSpace as cs
+import ConfigSpace.hyperparameters as csh
+
+
+class PreprocessorConfig:
+    """Manages the configuration space for preprocessors in supervised learning contexts.
+
+    PreprocessorConfig oversees the configuration space of preprocessors used in
+    supervised machine learning tasks. This class facilitates the addition of
+    new preprocessors and the modification of existing ones in the configuration space
+    via standardised methods. The ConfigSpace library is used to designate the
+    configuration space, enabling the creation of complex and adaptable
+    configuration setups. For additional information on using constraints and
+    various types of hyperparameters with ConfigSpace, refer to
+    the ConfigSpace documentation, available at:
+    https://automl.github.io/ConfigSpace/main/quickstart.html
+
+    For further details on how to add, modify, and remove preprocessors, refer to the
+    documentation of the classification task:
+    /configuration/classification_task/preprocessors.py
+
+    Parameters
+    ----------
+    config_space : cs.ConfigurationSpace
+        The ConfigSpace object that will be used to add the preprocessors and their
+        respective hyperparameters.
+
+    """
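For example (a hedged sketch, not part of this file), the class is driven the same way as its classification counterpart:

    import ConfigSpace as cs

    space = cs.ConfigurationSpace(meta={"preprocessors": "preprocessors"})
    PreprocessorConfig(space).setup_preprocessors()
    # `space` now holds a "preprocessors" choice plus conditional hyperparameters.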
+
+    def __init__(
+        self,
+        config_space: cs.ConfigurationSpace,
+    ):
+        if "preprocessors" not in config_space.meta:
+            raise ValueError("Expected 'preprocessors' key in meta of config_space")
+        self.config_space = config_space
+        self.preprocessors_setup_map = {
+            "Binarizer": self.setup_binarizer,
+            "FastICA": self.setup_fast_ica,
+            "FeatureAgglomeration": self.setup_feature_agglomeration,
+            "MaxAbsScaler": self.setup_max_abs_scaler,
+            "MinMaxScaler": self.setup_min_max_scaler,
+            "Normalizer": self.setup_normalizer,
+            "Nystroem": self.setup_nystroem,
+            "PCA": self.setup_pca,
+            "PolynomialFeatures": self.setup_polynomial_features,
+            "RBFSampler": self.setup_rbf_sampler,
+            "RobustScaler": self.setup_robust_scaler,
+            "StandardScaler": self.setup_standard_scaler,
+            "SelectFwe": self.setup_select_fwe,
+            "SelectPercentile": self.setup_select_percentile,
+            "VarianceThreshold": self.setup_variance_threshold,
+        }
+
+        self.cs_preprocessors_name = config_space.meta["preprocessors"]
+
+    @property
+    def shared_hyperparameters(self):
+        return {
+            "gamma": {"lower": 0.0, "upper": 1.01, "default_value": 0.05},
+            "threshold": {"lower": 0.0, "upper": 1.01, "default_value": 0.05},
+        }
+
+    def setup_preprocessors(self):
+        preprocessors_choices = list(self.preprocessors_setup_map.keys())
+
+        if not preprocessors_choices:
+            raise ValueError("No preprocessors to add to config space")
+
+        preprocessors = csh.CategoricalHyperparameter(
+            name=self.cs_preprocessors_name,
+            choices=preprocessors_choices,
+        )
+        self.config_space.add_hyperparameter(preprocessors)
+
+        for preprocessor_name in preprocessors_choices:
+            if setup_func := self.preprocessors_setup_map.get(preprocessor_name):
+                setup_func(preprocessors)
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, preprocessor_name: str
+    ):
+        if "preprocessors" not in local_vars or not isinstance(
+            local_vars["preprocessors"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'preprocessors' key with a CategoricalHyperparameter in "
+                "local vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["preprocessors"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(
+                hyperparameter, local_vars["preprocessors"], preprocessor_name
+            )
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_binarizer(self, preprocessors: csh.CategoricalHyperparameter):
+        threshold = csh.UniformFloatHyperparameter(
+            "threshold__Binarizer",
+            **self.shared_hyperparameters["threshold"],
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "Binarizer")
+
+    def setup_fast_ica(self, preprocessors: csh.CategoricalHyperparameter):
+        tol = csh.UniformFloatHyperparameter(
+            "tol__FastICA",
+            lower=0.0,
+            upper=1.01,
+            default_value=0.05,
+        )
+        whiten = csh.CategoricalHyperparameter(
+            "whiten__FastICA",
+            choices=["unit-variance"],
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "FastICA")
+
+    def setup_feature_agglomeration(self, preprocessors: csh.CategoricalHyperparameter):
+        linkage = csh.CategoricalHyperparameter(
+            "linkage__FeatureAgglomeration",
+            choices=["ward", "complete", "average"],
+        )
+        affinity = csh.CategoricalHyperparameter(
+            "affinity__FeatureAgglomeration",
+            choices=["euclidean", "l1", "l2", "manhattan", "cosine"],
+        )
+
+        self._add_hyperparameters_and_equals_conditions(
self._add_hyperparameters_and_equals_conditions( + locals(), "FeatureAgglomeration" + ) + + def setup_max_abs_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_min_max_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_normalizer(self, preprocessors: csh.CategoricalHyperparameter): + norm = csh.CategoricalHyperparameter( + "norm__Normalizer", + choices=["l1", "l2", "max"], + ) + + self._add_hyperparameters_and_equals_conditions(locals(), "Normalizer") + + def setup_nystroem(self, preprocessors: csh.CategoricalHyperparameter): + kernel = csh.CategoricalHyperparameter( + "kernel__Nystroem", + choices=[ + "rbf", + "cosine", + "chi2", + "laplacian", + "polynomial", + "poly", + "linear", + "additive_chi2", + "sigmoid", + ], + ) + gamma = csh.UniformFloatHyperparameter( + "gamma__Nystroem", + **self.shared_hyperparameters["gamma"], + ) + n_components = csh.UniformIntegerHyperparameter( + "n_components__Nystroem", + lower=1, + upper=11, + default_value=1, + ) + + self._add_hyperparameters_and_equals_conditions(locals(), "Nystroem") + + def setup_pca(self, preprocessors: csh.CategoricalHyperparameter): + svd_solver = csh.CategoricalHyperparameter( + "svd_solver__PCA", + choices=["randomized"], + ) + iterated_power = csh.UniformIntegerHyperparameter( + "iterated_power__PCA", + lower=1, + upper=11, + default_value=1, + ) + + self._add_hyperparameters_and_equals_conditions(locals(), "PCA") + + def setup_polynomial_features(self, preprocessors: csh.CategoricalHyperparameter): + degree = csh.CategoricalHyperparameter( + "degree__PolynomialFeatures", + choices=[2], + ) + include_bias = csh.CategoricalHyperparameter( + "include_bias__PolynomialFeatures", + choices=[False], + ) + interaction_only = csh.CategoricalHyperparameter( + "interaction_only__PolynomialFeatures", + choices=[False], + ) + + self._add_hyperparameters_and_equals_conditions(locals(), "PolynomialFeatures") + + def setup_rbf_sampler(self, preprocessors: csh.CategoricalHyperparameter): + gamma = csh.UniformFloatHyperparameter( + "gamma__RBFSampler", + **self.shared_hyperparameters["gamma"], + ) + + self._add_hyperparameters_and_equals_conditions(locals(), "RBFSampler") + + def setup_robust_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_standard_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_select_fwe(self, preprocessors: csh.CategoricalHyperparameter): + alpha = csh.UniformFloatHyperparameter( + "alpha__SelectFwe", + lower=0.0, + upper=0.05, + default_value=0.001, + ) + # TODO Score func, how to add this? + + self._add_hyperparameters_and_equals_conditions(locals(), "SelectFwe") + + def setup_select_percentile(self, preprocessors: csh.CategoricalHyperparameter): + percentile = csh.UniformIntegerHyperparameter( + "percentile__SelectPercentile", + lower=1, + upper=100, + default_value=1, + ) + # TODO @Pieter – Score func, how to add this, you reckon? 
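One conceivable answer to the TODO above (a sketch only, not part of the patch): ConfigSpace hyperparameter values must be primitive types, so the score function could be encoded by name and mapped back to the sklearn callable wherever hyperparameter values are translated into estimator arguments:

    # Hypothetical encoding; the name-to-callable mapping step is assumed
    # to live wherever terminals are turned into sklearn objects.
    score_func = csh.CategoricalHyperparameter(
        "score_func__SelectPercentile", choices=["f_regression"]
    )
    # elsewhere: {"f_regression": sklearn.feature_selection.f_regression}[value]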
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "SelectPercentile")
+
+    def setup_variance_threshold(self, preprocessors: csh.CategoricalHyperparameter):
+        threshold = csh.UniformFloatHyperparameter(
+            "threshold__VarianceThreshold",
+            lower=0.05,
+            upper=1.01,
+            default_value=0.05,
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "VarianceThreshold")
diff --git a/gama/configuration/regression_task/regressors.py b/gama/configuration/regression_task/regressors.py
new file mode 100644
index 00000000..056df527
--- /dev/null
+++ b/gama/configuration/regression_task/regressors.py
@@ -0,0 +1,284 @@
+import ConfigSpace as cs
+import ConfigSpace.hyperparameters as csh
+
+
+class RegressorConfig:
+    """Manages the configuration space for regressors in supervised learning contexts.
+
+    RegressorConfig oversees the configuration space of regressors used for a
+    supervised machine learning task. This class facilitates the addition of
+    new regressors and the modification of existing ones in the configuration space
+    via standardised methods. The ConfigSpace library is used to designate the
+    configuration space, enabling the creation of complex and adaptable
+    configuration setups. For additional information on using constraints and
+    various types of hyperparameters with ConfigSpace, refer to
+    the ConfigSpace documentation, available at:
+    https://automl.github.io/ConfigSpace/main/quickstart.html
+
+    For further details on how to add, modify, and remove regressors, refer to the
+    documentation of the classification task:
+    /configuration/classification_task/classifiers.py
+
+    Parameters
+    ----------
+    config_space : cs.ConfigurationSpace
+        The ConfigSpace object that defines the hyperparameters and their ranges for
+        the regressors.
+
+    """
+
+    def __init__(
+        self,
+        config_space: cs.ConfigurationSpace,
+    ):
+        if "estimators" not in config_space.meta:
+            raise ValueError("Expected 'estimators' key in meta of config_space")
+        self.config_space = config_space
+        self.regressors_setup_map = {
+            "ElasticNetCV": self.setup_elastic_net_cv,
+            "ExtraTreesRegressor": self.setup_extra_trees_regressor,
+            "GradientBoostingRegressor": self.setup_gradient_boosting_regressor,
+            "AdaBoostRegressor": self.setup_ada_boost_regressor,
+            "DecisionTreeRegressor": self.setup_decision_tree_regressor,
+            "KNeighborsRegressor": self.setup_k_neighbors_regressor,
+            "LassoLarsCV": self.setup_lasso_lars_cv,
+            "LinearSVR": self.setup_linear_svr,
+            "RandomForestRegressor": self.setup_random_forest_regressor,
+        }
+        self.cs_estimators_name = self.config_space.meta["estimators"]
+
+    @property
+    def shared_hyperparameters(self):
+        return {
+            "n_estimators": [100],
+            "max_features": {"lower": 0.05, "upper": 1.01, "default_value": 1.0},
+            "min_samples_split": {"lower": 2, "upper": 21},
+            "min_samples_leaf": {"lower": 1, "upper": 21},
+            "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0],
+            "loss": [
+                "squared_error",
+                "absolute_error",
+                "huber",
+                "quantile",
+                "linear",
+                "square",
+                "exponential",
+            ],
+            "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
+            "bootstrap": [True, False],
+            "max_depth": {"lower": 1, "upper": 11},
+        }
+
+    def setup_regressors(self):
+        regressors_choices = list(self.regressors_setup_map.keys())
+
+        if not regressors_choices:
+            raise ValueError("No regressors to add to config space")
+
+        regressors = csh.CategoricalHyperparameter(
+            name=self.cs_estimators_name,
+            choices=regressors_choices,
+        )
+        self.config_space.add_hyperparameter(regressors)
+
+        for regressor_name in regressors_choices:
+            if
setup_func := self.regressors_setup_map.get(regressor_name):
+                setup_func(regressors)
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, estimator_name: str
+    ):
+        if "regressors" not in local_vars or not isinstance(
+            local_vars["regressors"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'regressors' key with a CategoricalHyperparameter in "
+                "local vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["regressors"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(hyperparameter, local_vars["regressors"], estimator_name)
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_elastic_net_cv(self, regressors: csh.CategoricalHyperparameter):
+        l1_ratio = csh.UniformFloatHyperparameter(
+            "l1_ratio__ElasticNetCV", lower=0.0, upper=1.01, default_value=0.05
+        )
+        tol = csh.CategoricalHyperparameter(
+            "tol__ElasticNetCV", self.shared_hyperparameters["tol"]
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "ElasticNetCV")
+
+    def setup_extra_trees_regressor(self, regressors: csh.CategoricalHyperparameter):
+        n_estimators = csh.Constant(
+            "n_estimators__ExtraTreesRegressor",
+            value=self.shared_hyperparameters["n_estimators"][0],
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__ExtraTreesRegressor",
+            **self.shared_hyperparameters["max_features"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__ExtraTreesRegressor",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__ExtraTreesRegressor",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap__ExtraTreesRegressor", self.shared_hyperparameters["bootstrap"]
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "ExtraTreesRegressor")
+
+    def setup_gradient_boosting_regressor(
+        self, regressors: csh.CategoricalHyperparameter
+    ):
+        n_estimators = csh.Constant(
+            "n_estimators__GradientBoostingRegressor",
+            value=self.shared_hyperparameters["n_estimators"][0],
+        )
+        loss = csh.CategoricalHyperparameter(
+            "loss__GradientBoostingRegressor", self.shared_hyperparameters["loss"]
+        )
+        learning_rate = csh.CategoricalHyperparameter(
+            "learning_rate__GradientBoostingRegressor",
+            self.shared_hyperparameters["learning_rate"],
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__GradientBoostingRegressor",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__GradientBoostingRegressor",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__GradientBoostingRegressor",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        subsample = csh.UniformFloatHyperparameter(
+            "subsample__GradientBoostingRegressor",
+            lower=0.05,
+            upper=1.01,
+            default_value=1.0,
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__GradientBoostingRegressor",
+            **self.shared_hyperparameters["max_features"],
+        )
+        alpha = csh.CategoricalHyperparameter(
+            "alpha__GradientBoostingRegressor", [0.75, 0.8, 0.85, 0.9, 0.95, 0.99]
+        )
+
self._add_hyperparameters_and_equals_conditions(
+            locals(), "GradientBoostingRegressor"
+        )
+
+    def setup_ada_boost_regressor(self, regressors: csh.CategoricalHyperparameter):
+        n_estimators = csh.Constant(
+            "n_estimators__AdaBoostRegressor",
+            value=self.shared_hyperparameters["n_estimators"][0],
+        )
+        learning_rate = csh.CategoricalHyperparameter(
+            "learning_rate__AdaBoostRegressor",
+            self.shared_hyperparameters["learning_rate"],
+        )
+        loss = csh.CategoricalHyperparameter(
+            "loss__AdaBoostRegressor", ["linear", "square", "exponential"]
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "AdaBoostRegressor")
+
+    def setup_decision_tree_regressor(self, regressors: csh.CategoricalHyperparameter):
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__DecisionTreeRegressor",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__DecisionTreeRegressor",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__DecisionTreeRegressor",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "DecisionTreeRegressor"
+        )
+
+    def setup_k_neighbors_regressor(self, regressors: csh.CategoricalHyperparameter):
+        n_neighbors = csh.UniformIntegerHyperparameter(
+            "n_neighbors__KNeighborsRegressor", lower=1, upper=101, default_value=5
+        )
+        weights = csh.CategoricalHyperparameter(
+            "weights__KNeighborsRegressor", ["uniform", "distance"]
+        )
+        p = csh.CategoricalHyperparameter("p__KNeighborsRegressor", [1, 2])
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "KNeighborsRegressor")
+
+    def setup_lasso_lars_cv(self, regressors: csh.CategoricalHyperparameter):
+        normalize = csh.CategoricalHyperparameter(
+            "normalize__LassoLarsCV", [True, False]
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "LassoLarsCV")
+
+    def setup_linear_svr(self, regressors: csh.CategoricalHyperparameter):
+        loss = csh.CategoricalHyperparameter(
+            "loss__LinearSVR", ["epsilon_insensitive", "squared_epsilon_insensitive"]
+        )
+        dual = csh.CategoricalHyperparameter("dual__LinearSVR", [True, False])
+        tol = csh.CategoricalHyperparameter(
+            "tol__LinearSVR", self.shared_hyperparameters["tol"]
+        )
+        C = csh.CategoricalHyperparameter(
+            "C__LinearSVR",
+            [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
+        )
+        epsilon = csh.CategoricalHyperparameter(
+            "epsilon__LinearSVR", [1e-4, 1e-3, 1e-2, 1e-1, 1.0]
+        )
+
+        self._add_hyperparameters_and_equals_conditions(locals(), "LinearSVR")
+
+    def setup_random_forest_regressor(self, regressors: csh.CategoricalHyperparameter):
+        n_estimators = csh.Constant(
+            "n_estimators__RandomForestRegressor",
+            value=self.shared_hyperparameters["n_estimators"][0],
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__RandomForestRegressor",
+            **self.shared_hyperparameters["max_features"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__RandomForestRegressor",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__RandomForestRegressor",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap__RandomForestRegressor",
+            self.shared_hyperparameters["bootstrap"],
+        )
+
+        self._add_hyperparameters_and_equals_conditions(
+            locals(),
"RandomForestRegressor" + ) From ee3f6e717738bd7013a47adfa4304896d1a50ff8 Mon Sep 17 00:00:00 2001 From: Provost Simon Date: Mon, 4 Dec 2023 15:31:25 +0000 Subject: [PATCH 4/9] refactor(tests): update tests to be ConfigSpace compliant --- .../configuration_task_test/__init__.py | 2 + .../configuration_task_test/classifiers.py | 254 ++++++++++++++++++ .../configuration_task_test/preprocessors.py | 191 +++++++++++++ gama/configuration/testconfiguration.py | 158 ++--------- tests/conftest.py | 42 +-- tests/data/ASHA/evaluations.log | 28 +- tests/data/AsyncEA/evaluations.log | 22 +- tests/data/RandomSearch/evaluations.log | 20 +- tests/system/test_gamaclassifier.py | 34 +++ tests/system/test_gamaregressor.py | 36 +++ tests/unit/test_configuration_parser.py | 35 ++- tests/unit/test_ea_mutation.py | 48 ++-- 12 files changed, 643 insertions(+), 227 deletions(-) create mode 100644 gama/configuration/configuration_task_test/__init__.py create mode 100644 gama/configuration/configuration_task_test/classifiers.py create mode 100644 gama/configuration/configuration_task_test/preprocessors.py diff --git a/gama/configuration/configuration_task_test/__init__.py b/gama/configuration/configuration_task_test/__init__.py new file mode 100644 index 00000000..b1b73016 --- /dev/null +++ b/gama/configuration/configuration_task_test/__init__.py @@ -0,0 +1,2 @@ +from .classifiers import ClassifierConfigTest +from .preprocessors import PreprocessorConfigTest diff --git a/gama/configuration/configuration_task_test/classifiers.py b/gama/configuration/configuration_task_test/classifiers.py new file mode 100644 index 00000000..4eadc7e0 --- /dev/null +++ b/gama/configuration/configuration_task_test/classifiers.py @@ -0,0 +1,254 @@ +import ConfigSpace as cs +import ConfigSpace.hyperparameters as csh + + +class ClassifierConfigTest: + def __init__( + self, + config_space: cs.ConfigurationSpace, + ): + if "estimators" not in config_space.meta: + raise ValueError("Expected 'estimators' key in meta of config_space") + self.config_space = config_space + self.classifiers_setup_map = { + "BernoulliNB": self.setup_bernoulliNB, + "MultinomialNB": self.setup_multinomialNB, + "GaussianNB": self.setup_gaussianNB, + "DecisionTreeClassifier": self.setup_decision_tree, + "ExtraTreesClassifier": self.setup_extra_trees, + "RandomForestClassifier": self.setup_random_forest, + "GradientBoostingClassifier": self.setup_gradient_boosting, + "KNeighborsClassifier": self.setup_k_neighbors, + "LinearSVC": self.setup_linear_svc, + "LogisticRegression": self.setup_logistic_regression, + } + self.cs_estimators_name = self.config_space.meta["estimators"] + + @property + def shared_hyperparameters(self): + return { + "alpha": [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0], + "fit_prior": [True, False], + "criterion": ["gini", "entropy"], + "max_depth": {"lower": 1, "upper": 11}, + "min_samples_split": {"lower": 2, "upper": 21}, + "min_samples_leaf": {"lower": 1, "upper": 21}, + "max_features": {"lower": 0.05, "upper": 1.01, "default_value": 1.0}, + "n_estimators": [100], + "bootstrap": [True, False], + "dual": [True, False], + "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0], + } + + def setup_classifiers(self): + classifiers_choices = list(self.classifiers_setup_map.keys()) + + if not classifiers_choices: + raise ValueError("No classifiers to add to config space") + + classifiers = csh.CategoricalHyperparameter( + name=self.cs_estimators_name, + choices=classifiers_choices, + ) + self.config_space.add_hyperparameter(classifiers) + + for 
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, estimator_name: str
+    ):
+        if "classifiers" not in local_vars or not isinstance(
+            local_vars["classifiers"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'classifiers' key with a CategoricalHyperparameter in "
+                "local vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["classifiers"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(
+                hyperparameter, local_vars["classifiers"], estimator_name
+            )
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_bernoulliNB(self, classifiers: csh.CategoricalHyperparameter):
+        alpha_NB = csh.CategoricalHyperparameter(
+            "alpha__BernoulliNB", self.shared_hyperparameters["alpha"]
+        )
+        fit_prior = csh.CategoricalHyperparameter(
+            "fit_prior__BernoulliNB", self.shared_hyperparameters["fit_prior"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "BernoulliNB")
+
+    def setup_multinomialNB(self, classifiers: csh.CategoricalHyperparameter):
+        alpha_NB = csh.CategoricalHyperparameter(
+            "alpha__MultinomialNB", self.shared_hyperparameters["alpha"]
+        )
+        fit_prior = csh.CategoricalHyperparameter(
+            "fit_prior__MultinomialNB", self.shared_hyperparameters["fit_prior"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "MultinomialNB")
+
+    def setup_gaussianNB(self, classifiers: csh.CategoricalHyperparameter):
+        # GaussianNB has no hyperparameters
+        pass
+
+    def setup_decision_tree(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__DecisionTreeClassifier",
+            self.shared_hyperparameters["criterion"],
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__DecisionTreeClassifier",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__DecisionTreeClassifier",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__DecisionTreeClassifier",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "DecisionTreeClassifier"
+        )
+
+    def setup_extra_trees(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__ExtraTreesClassifier", self.shared_hyperparameters["criterion"]
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__ExtraTreesClassifier",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__ExtraTreesClassifier",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__ExtraTreesClassifier",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__ExtraTreesClassifier",
+            **self.shared_hyperparameters["max_features"],
+        )
+        n_estimators = csh.CategoricalHyperparameter(
+            "n_estimators__ExtraTreesClassifier",
+            self.shared_hyperparameters["n_estimators"],
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap__ExtraTreesClassifier", self.shared_hyperparameters["bootstrap"]
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "ExtraTreesClassifier"
+        )
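+
+    # Example (sketch): for "BernoulliNB" the helper
+    # `_add_hyperparameters_and_equals_conditions` above is shorthand for:
+    #
+    #   self.config_space.add_hyperparameters([alpha_NB, fit_prior])
+    #   self.config_space.add_conditions([
+    #       cs.EqualsCondition(alpha_NB, classifiers, "BernoulliNB"),
+    #       cs.EqualsCondition(fit_prior, classifiers, "BernoulliNB"),
+    #   ])
+    #
+    # so each hyperparameter is only active when its estimator is drawn.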
+
+    def setup_random_forest(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__RandomForestClassifier",
+            self.shared_hyperparameters["criterion"],
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__RandomForestClassifier",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split", **self.shared_hyperparameters["min_samples_split"]
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf", **self.shared_hyperparameters["min_samples_leaf"]
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features", **self.shared_hyperparameters["max_features"]
+        )
+        n_estimators = csh.CategoricalHyperparameter(
+            "n_estimators__RandomForestClassifier",
+            self.shared_hyperparameters["n_estimators"],
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap", self.shared_hyperparameters["bootstrap"]
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "RandomForestClassifier"
+        )
+
+    def setup_gradient_boosting(self, classifiers: csh.CategoricalHyperparameter):
+        sub_sample = csh.CategoricalHyperparameter(
+            "subsample", [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+        )
+        learning_rate = csh.CategoricalHyperparameter(
+            "learning_rate", [1e-3, 1e-2, 1e-1, 0.5, 1.0]
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__GradientBoostingClassifier",
+            **self.shared_hyperparameters["max_features"],
+        )
+        n_estimators = csh.CategoricalHyperparameter(
+            "n_estimators__GradientBoostingClassifier",
+            self.shared_hyperparameters["n_estimators"],
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__GradientBoostingClassifier",
+            **self.shared_hyperparameters["max_depth"],
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "GradientBoostingClassifier"
+        )
+
+    def setup_k_neighbors(self, classifiers: csh.CategoricalHyperparameter):
+        n_neighbors = csh.UniformIntegerHyperparameter("n_neighbors", 1, 51)
+        weights = csh.CategoricalHyperparameter("weights", ["uniform", "distance"])
+        p = csh.UniformIntegerHyperparameter("p", 1, 2)
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "KNeighborsClassifier"
+        )
+
+    def setup_linear_svc(self, classifiers: csh.CategoricalHyperparameter):
+        loss = csh.CategoricalHyperparameter(
+            "loss__LinearSVC", ["hinge", "squared_hinge"]
+        )
+        penalty = csh.CategoricalHyperparameter("penalty__LinearSVC", ["l1", "l2"])
+        dual = csh.CategoricalHyperparameter(
+            "dual__LinearSVC", self.shared_hyperparameters["dual"]
+        )
+        tol = csh.CategoricalHyperparameter(
+            "tol__LinearSVC", [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
+        )
+        C = csh.CategoricalHyperparameter(
+            "C__LinearSVC", self.shared_hyperparameters["C"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "LinearSVC")
+
+        # Forbidden clause: Penalty 'l1' cannot be used with loss 'hinge'
+        forbidden_penalty_loss = cs.ForbiddenAndConjunction(
+            cs.ForbiddenEqualsClause(self.config_space["penalty__LinearSVC"], "l1"),
+            cs.ForbiddenEqualsClause(self.config_space["loss__LinearSVC"], "hinge"),
+        )
+        self.config_space.add_forbidden_clause(forbidden_penalty_loss)
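+
+        # Example (sketch): with the clause above, a draw such as
+        #   {"classifiers": "LinearSVC", "penalty__LinearSVC": "l1",
+        #    "loss__LinearSVC": "hinge", ...}
+        # is rejected at sampling time, mirroring scikit-learn's refusal of
+        # the penalty='l1' / loss='hinge' combination.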
"penalty__LogisticRegression", ["l1", "l2"] + ) + C = csh.CategoricalHyperparameter( + "C__LogisticRegression", self.shared_hyperparameters["C"] + ) + dual = csh.CategoricalHyperparameter( + "dual__LogisticRegression", self.shared_hyperparameters["dual"] + ) + self._add_hyperparameters_and_equals_conditions(locals(), "LogisticRegression") diff --git a/gama/configuration/configuration_task_test/preprocessors.py b/gama/configuration/configuration_task_test/preprocessors.py new file mode 100644 index 00000000..1aa69d3e --- /dev/null +++ b/gama/configuration/configuration_task_test/preprocessors.py @@ -0,0 +1,191 @@ +import ConfigSpace as cs +import ConfigSpace.hyperparameters as csh + + +class PreprocessorConfigTest: + def __init__( + self, + config_space: cs.ConfigurationSpace, + ): + if "preprocessors" not in config_space.meta: + raise ValueError("Expected 'preprocessors' key in meta of config_space") + self.config_space = config_space + self.preprocessors_setup_map = { + "SelectFwe": self.setup_select_fwe, + "Binarizer": self.setup_binarizer, + "FastICA": self.setup_fast_ica, + "FeatureAgglomeration": self.setup_feature_agglomeration, + "MaxAbsScaler": self.setup_max_abs_scaler, + "MinMaxScaler": self.setup_min_max_scaler, + "Normalizer": self.setup_normalizer, + "Nystroem": self.setup_nystroem, + "PCA": self.setup_pca, + "PolynomialFeatures": self.setup_polynomial_features, + "RBFSampler": self.setup_rbf_sampler, + "RobustScaler": self.setup_robust_scaler, + "StandardScaler": self.setup_standard_scaler, + "SelectPercentile": self.setup_select_percentile, + "VarianceThreshold": self.setup_variance_threshold, + } + self.cs_preprocessors_name = config_space.meta["preprocessors"] + + @property + def shared_hyperparameters(self): + return { + "gamma": {"lower": 0.01, "upper": 1.01, "default_value": 1.0}, + } + + def setup_preprocessors(self): + preprocessors_choices = list(self.preprocessors_setup_map.keys()) + + if not preprocessors_choices: + raise ValueError("No preprocessors to add to config space") + + preprocessors = csh.CategoricalHyperparameter( + name=self.cs_preprocessors_name, + choices=preprocessors_choices, + ) + self.config_space.add_hyperparameter(preprocessors) + + for preprocessor_name in preprocessors_choices: + if setup_func := self.preprocessors_setup_map.get(preprocessor_name): + setup_func(preprocessors) + + def _add_hyperparameters_and_equals_conditions( + self, local_vars: dict, preprocessor_name: str + ): + if "preprocessors" not in local_vars or not isinstance( + local_vars["preprocessors"], csh.CategoricalHyperparameter + ): + raise ValueError( + "Expected 'preprocessors' key with a CategoricalHyperparameter in local" + "vars" + ) + + hyperparameters_to_add = [ + hyperparameter + for hyperparameter in local_vars.values() + if isinstance(hyperparameter, csh.Hyperparameter) + and hyperparameter != local_vars["preprocessors"] + ] + + conditions_to_add = [ + cs.EqualsCondition( + hyperparameter, local_vars["preprocessors"], preprocessor_name + ) + for hyperparameter in hyperparameters_to_add + ] + + self.config_space.add_hyperparameters(hyperparameters_to_add) + self.config_space.add_conditions(conditions_to_add) + + def setup_select_fwe(self, preprocessors: csh.CategoricalHyperparameter): + alpha = csh.UniformFloatHyperparameter( + "alpha__SelectFwe", 0, 0.05, default_value=0.05 + ) + self._add_hyperparameters_and_equals_conditions(locals(), "SelectFwe") + + def setup_binarizer(self, preprocessors: csh.CategoricalHyperparameter): + threshold = 
diff --git a/gama/configuration/configuration_task_test/preprocessors.py b/gama/configuration/configuration_task_test/preprocessors.py
new file mode 100644
index 00000000..1aa69d3e
--- /dev/null
+++ b/gama/configuration/configuration_task_test/preprocessors.py
@@ -0,0 +1,191 @@
+import ConfigSpace as cs
+import ConfigSpace.hyperparameters as csh
+
+
+class PreprocessorConfigTest:
+    def __init__(
+        self,
+        config_space: cs.ConfigurationSpace,
+    ):
+        if "preprocessors" not in config_space.meta:
+            raise ValueError("Expected 'preprocessors' key in meta of config_space")
+        self.config_space = config_space
+        self.preprocessors_setup_map = {
+            "SelectFwe": self.setup_select_fwe,
+            "Binarizer": self.setup_binarizer,
+            "FastICA": self.setup_fast_ica,
+            "FeatureAgglomeration": self.setup_feature_agglomeration,
+            "MaxAbsScaler": self.setup_max_abs_scaler,
+            "MinMaxScaler": self.setup_min_max_scaler,
+            "Normalizer": self.setup_normalizer,
+            "Nystroem": self.setup_nystroem,
+            "PCA": self.setup_pca,
+            "PolynomialFeatures": self.setup_polynomial_features,
+            "RBFSampler": self.setup_rbf_sampler,
+            "RobustScaler": self.setup_robust_scaler,
+            "StandardScaler": self.setup_standard_scaler,
+            "SelectPercentile": self.setup_select_percentile,
+            "VarianceThreshold": self.setup_variance_threshold,
+        }
+        self.cs_preprocessors_name = config_space.meta["preprocessors"]
+
+    @property
+    def shared_hyperparameters(self):
+        return {
+            "gamma": {"lower": 0.01, "upper": 1.01, "default_value": 1.0},
+        }
+
+    def setup_preprocessors(self):
+        preprocessors_choices = list(self.preprocessors_setup_map.keys())
+
+        if not preprocessors_choices:
+            raise ValueError("No preprocessors to add to config space")
+
+        preprocessors = csh.CategoricalHyperparameter(
+            name=self.cs_preprocessors_name,
+            choices=preprocessors_choices,
+        )
+        self.config_space.add_hyperparameter(preprocessors)
+
+        for preprocessor_name in preprocessors_choices:
+            if setup_func := self.preprocessors_setup_map.get(preprocessor_name):
+                setup_func(preprocessors)
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, preprocessor_name: str
+    ):
+        if "preprocessors" not in local_vars or not isinstance(
+            local_vars["preprocessors"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'preprocessors' key with a CategoricalHyperparameter in "
+                "local vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["preprocessors"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(
+                hyperparameter, local_vars["preprocessors"], preprocessor_name
+            )
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_select_fwe(self, preprocessors: csh.CategoricalHyperparameter):
+        alpha = csh.UniformFloatHyperparameter(
+            "alpha__SelectFwe", 0, 0.05, default_value=0.05
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "SelectFwe")
+
+    def setup_binarizer(self, preprocessors: csh.CategoricalHyperparameter):
+        threshold = csh.UniformFloatHyperparameter(
+            "threshold__Binarizer", 0.0, 1.01, default_value=0.05
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "Binarizer")
+
+    def setup_fast_ica(self, preprocessors: csh.CategoricalHyperparameter):
+        whiten = csh.CategoricalHyperparameter("whiten", ["unit-variance"])
+        tol = csh.UniformFloatHyperparameter(
+            "tol__FastICA", 0.0, 1.01, default_value=0.05
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "FastICA")
+
+    def setup_feature_agglomeration(
+        self, preprocessors: csh.CategoricalHyperparameter
+    ):
+        linkage = csh.CategoricalHyperparameter(
+            "linkage__FeatureAgglomeration", ["ward", "complete", "average"]
+        )
+        affinity = csh.CategoricalHyperparameter(
+            "affinity__FeatureAgglomeration",
+            ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"],
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "FeatureAgglomeration"
+        )
+
+        # Forbidden clause: 'ward' linkage only works with 'euclidean' affinity
+        forbidden_ward_affinity = cs.ForbiddenAndConjunction(
+            cs.ForbiddenEqualsClause(
+                self.config_space["linkage__FeatureAgglomeration"], "ward"
+            ),
+            cs.ForbiddenInClause(
+                self.config_space["affinity__FeatureAgglomeration"],
+                ["l1", "l2", "manhattan", "cosine", "precomputed"],
+            ),
+        )
+        self.config_space.add_forbidden_clause(forbidden_ward_affinity)
+
+    def setup_max_abs_scaler(self, preprocessors: csh.CategoricalHyperparameter):
+        # No hyperparameters
+        pass
+
+    def setup_min_max_scaler(self, preprocessors: csh.CategoricalHyperparameter):
+        # No hyperparameters
+        pass
+
+    def setup_normalizer(self, preprocessors: csh.CategoricalHyperparameter):
+        norm = csh.CategoricalHyperparameter("norm", ["l1", "l2", "max"])
+        self._add_hyperparameters_and_equals_conditions(locals(), "Normalizer")
+
+    def setup_nystroem(self, preprocessors: csh.CategoricalHyperparameter):
+        kernel = csh.CategoricalHyperparameter(
+            "kernel",
+            [
+                "rbf",
+                "cosine",
+                "chi2",
+                "laplacian",
+                "polynomial",
+                "poly",
+                "linear",
+                "additive_chi2",
+                "sigmoid",
+            ],
+        )
+        gamma = csh.UniformFloatHyperparameter(
+            "gamma__Nystroem", **self.shared_hyperparameters["gamma"]
+        )
+        n_components = csh.UniformIntegerHyperparameter("n_components", 1, 11)
+        self._add_hyperparameters_and_equals_conditions(locals(), "Nystroem")
+
+    def setup_pca(self, preprocessors: csh.CategoricalHyperparameter):
+        svd_solver = csh.CategoricalHyperparameter("svd_solver", ["randomized"])
+        iterated_power = csh.UniformIntegerHyperparameter("iterated_power", 1, 11)
+        self._add_hyperparameters_and_equals_conditions(locals(), "PCA")
+
+    def setup_polynomial_features(self, preprocessors: csh.CategoricalHyperparameter):
+        degree = csh.CategoricalHyperparameter("degree", [2])
+        include_bias = csh.CategoricalHyperparameter("include_bias", [False])
+        interaction_only = csh.CategoricalHyperparameter("interaction_only", [False])
+        self._add_hyperparameters_and_equals_conditions(locals(), "PolynomialFeatures")
+
+    def setup_rbf_sampler(self, preprocessors: csh.CategoricalHyperparameter):
+        gamma = csh.UniformFloatHyperparameter(
+            "gamma__RBFSampler", **self.shared_hyperparameters["gamma"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "RBFSampler")
+
+    def setup_robust_scaler(self, preprocessors: csh.CategoricalHyperparameter):
+        # No hyperparameters
+        pass
+
+    def setup_standard_scaler(self, preprocessors: csh.CategoricalHyperparameter):
+        # No hyperparameters
+        pass
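+
+    # Example (sketch): a draw with {"preprocessors": "Nystroem"} activates
+    # only `kernel`, `gamma__Nystroem` and `n_components`; parameter-free
+    # choices such as "StandardScaler" contribute nothing beyond the choice
+    # itself.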
csh.UniformIntegerHyperparameter("percentile", 1, 100) + self._add_hyperparameters_and_equals_conditions(locals(), "SelectPercentile") + + def setup_variance_threshold(self, preprocessors: csh.CategoricalHyperparameter): + threshold = csh.UniformFloatHyperparameter( + "threshold__VarianceThreshold", 0.05, 1.01, default_value=0.05 + ) + self._add_hyperparameters_and_equals_conditions(locals(), "VarianceThreshold") diff --git a/gama/configuration/testconfiguration.py b/gama/configuration/testconfiguration.py index 4c134db9..e3d9bc2f 100644 --- a/gama/configuration/testconfiguration.py +++ b/gama/configuration/testconfiguration.py @@ -1,146 +1,22 @@ -import numpy as np +import ConfigSpace as cs -from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import ( - ExtraTreesClassifier, - RandomForestClassifier, - GradientBoostingClassifier, +from gama.configuration.configuration_task_test import ( + ClassifierConfigTest, + PreprocessorConfigTest, ) -from sklearn.neighbors import KNeighborsClassifier -from sklearn.svm import LinearSVC -from sklearn.linear_model import LogisticRegression -from sklearn.cluster import FeatureAgglomeration -from sklearn.preprocessing import ( - MaxAbsScaler, - MinMaxScaler, - Normalizer, - PolynomialFeatures, - RobustScaler, - StandardScaler, - Binarizer, -) -from sklearn.kernel_approximation import Nystroem, RBFSampler -from sklearn.decomposition import PCA, FastICA -from sklearn.feature_selection import ( - SelectFwe, - SelectPercentile, - f_classif, - VarianceThreshold, + +# A configuration with limited operators for unit tests 🧪 + +config_space = cs.ConfigurationSpace( + meta={ + # "gama_system_name": "current_configuration_name", + "estimators": "classifiers", + "preprocessors": "preprocessors", + } ) -# A configuration with limited operators for unit tests. 
+classifier_config = ClassifierConfigTest(config_space)
+classifier_config.setup_classifiers()
-
-clf_config = {
-    "alpha": [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0],
-    "fit_prior": [True, False],
-    "min_samples_split": range(2, 21),
-    "min_samples_leaf": range(1, 21),
-    # Classifiers
-    GaussianNB: {},
-    BernoulliNB: {"alpha": [], "fit_prior": []},
-    MultinomialNB: {"alpha": [], "fit_prior": []},
-    DecisionTreeClassifier: {
-        "criterion": ["gini", "entropy"],
-        "max_depth": range(1, 11),
-        "min_samples_split": [],
-        "min_samples_leaf": [],
-    },
-    ExtraTreesClassifier: {
-        "n_estimators": [100],
-        "criterion": ["gini", "entropy"],
-        "max_features": [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
-        "min_samples_split": [],
-        "min_samples_leaf": [],
-        "bootstrap": [True, False],
-    },
-    RandomForestClassifier: {
-        "n_estimators": [100],
-        "criterion": ["gini", "entropy"],
-        "max_features": [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
-        "min_samples_split": range(2, 21),
-        "min_samples_leaf": range(1, 21),
-        "bootstrap": [True, False],
-    },
-    GradientBoostingClassifier: {
-        "n_estimators": [100],
-        "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0],
-        "max_depth": range(1, 11),
-        "min_samples_split": range(2, 21),
-        "min_samples_leaf": range(1, 21),
-        "subsample": [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
-        "max_features": np.arange(0.05, 1.01, 0.05),
-    },
-    KNeighborsClassifier: {
-        "n_neighbors": range(1, 51),
-        "weights": ["uniform", "distance"],
-        "p": [1, 2],
-    },
-    LinearSVC: {
-        "penalty": ["l1", "l2"],
-        "loss": ["hinge", "squared_hinge"],
-        "dual": [False, True],
-        "tol": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
-        "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
-        "param_check": [
-            lambda params: (not params["dual"] or params["penalty"] == "l2")
-            and not (params["penalty"] == "l1" and params["loss"] == "hinge")
-            and not (
-                params["penalty"] == "l2"
-                and params["loss"] == "hinge"
-                and not params["dual"]
-            )
-        ],
-    },
-    LogisticRegression: {
-        "penalty": ["l1", "l2"],
-        "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
-        "dual": [False, True],
-        "param_check": [lambda params: not params["dual"] or params["penalty"] == "l2"],
-    },
-    # Preprocesssors
-    Binarizer: {"threshold": np.arange(0.0, 1.01, 0.05)},
-    FastICA: {
-        "tol": np.arange(0.0, 1.01, 0.05),
-        "whiten": ["unit-variance"],
-    },
-    FeatureAgglomeration: {
-        "linkage": ["ward", "complete", "average"],
-        "affinity": ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"],
-        "param_check": [
-            lambda params: params["linkage"] != "ward"
-            or params["affinity"] == "euclidean"
-        ],
-    },
-    MaxAbsScaler: {},
-    MinMaxScaler: {},
-    Normalizer: {"norm": ["l1", "l2", "max"]},
-    Nystroem: {
-        "kernel": [
-            "rbf",
-            "cosine",
-            "chi2",
-            "laplacian",
-            "polynomial",
-            "poly",
-            "linear",
-            "additive_chi2",
-            "sigmoid",
-        ],
-        "gamma": np.arange(0.0, 1.01, 0.05),
-        "n_components": range(1, 11),
-    },
-    PCA: {"svd_solver": ["randomized"], "iterated_power": range(1, 11)},
-    PolynomialFeatures: {
-        "degree": [2],
-        "include_bias": [False],
-        "interaction_only": [False],
-    },
-    RBFSampler: {"gamma": np.arange(0.0, 1.01, 0.05)},
-    RobustScaler: {},
-    StandardScaler: {},
-    # Selectors
-    SelectFwe: {"alpha": np.arange(0, 0.05, 0.001), "score_func": {f_classif: None}},
-    SelectPercentile: {"percentile": range(1, 100), "score_func": {f_classif: None}},
-    VarianceThreshold: {"threshold": np.arange(0.05, 1.01, 0.05)},
-}
+preprocessor_config = PreprocessorConfigTest(config_space)
+preprocessor_config.setup_preprocessors() diff --git a/tests/conftest.py b/tests/conftest.py index eb3dc76e..b1ead3f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,58 +1,62 @@ import pytest from gama import GamaClassifier from gama.genetic_programming.components import Individual -from gama.configuration.testconfiguration import clf_config +from gama.configuration.testconfiguration import config_space as test_config_space from gama.genetic_programming.compilers.scikitlearn import compile_individual @pytest.fixture -def pset(): - gc = GamaClassifier(search_space=clf_config, scoring="accuracy", store="nothing") - yield gc._pset +def config_space(): + gc = GamaClassifier( + search_space=test_config_space, scoring="accuracy", store="nothing" + ) + yield gc.search_space gc.cleanup("all") @pytest.fixture def opset(): - gc = GamaClassifier(search_space=clf_config, scoring="accuracy", store="nothing") + gc = GamaClassifier( + search_space=test_config_space, scoring="accuracy", store="nothing" + ) yield gc._operator_set gc.cleanup("all") @pytest.fixture -def GNB(pset): - return Individual.from_string("GaussianNB(data)", pset, compile_individual) +def GNB(config_space): + return Individual.from_string("GaussianNB(data)", config_space, compile_individual) @pytest.fixture -def RS_MNB(pset): +def RS_MNB(config_space): return Individual.from_string( "MultinomialNB(RobustScaler(data), alpha=1.0, fit_prior=True)", - pset, + config_space, compile_individual, ) @pytest.fixture -def SS_BNB(pset): +def SS_BNB(config_space): return Individual.from_string( "BernoulliNB(StandardScaler(data), alpha=0.1, fit_prior=True)", - pset, + config_space, compile_individual, ) @pytest.fixture -def SS_RBS_SS_BNB(pset): +def SS_RBS_SS_BNB(config_space): return Individual.from_string( "BernoulliNB(StandardScaler(RobustScaler(StandardScaler(data))), alpha=0.1, fit_prior=True)", # noqa: E501 - pset, + config_space, compile_individual, ) @pytest.fixture -def LinearSVC(pset): +def LinearSVC(config_space): individual_str = """LinearSVC(data, LinearSVC.C=0.001, LinearSVC.dual=True, @@ -60,11 +64,11 @@ def LinearSVC(pset): LinearSVC.penalty='l2', LinearSVC.tol=1e-05)""" individual_str = "".join(individual_str.split()).replace(",", ", ") - return Individual.from_string(individual_str, pset, None) + return Individual.from_string(individual_str, config_space, None) @pytest.fixture -def ForestPipeline(pset): +def ForestPipeline(config_space): individual_str = """RandomForestClassifier( FeatureAgglomeration( data, @@ -79,11 +83,11 @@ def ForestPipeline(pset): RandomForestClassifier.n_estimators=100)""" individual_str = "".join(individual_str.split()).replace(",", ", ") - return Individual.from_string(individual_str, pset, None) + return Individual.from_string(individual_str, config_space, None) @pytest.fixture -def InvalidLinearSVC(pset): +def InvalidLinearSVC(config_space): individual_str = """LinearSVC(data, LinearSVC.C=0.001, LinearSVC.dual=True, @@ -91,4 +95,4 @@ def InvalidLinearSVC(pset): LinearSVC.penalty='l1', LinearSVC.tol=1e-05)""" individual_str = "".join(individual_str.split()).replace(",", ", ") - return Individual.from_string(individual_str, pset, compile_individual) + return Individual.from_string(individual_str, config_space, compile_individual) diff --git a/tests/data/ASHA/evaluations.log b/tests/data/ASHA/evaluations.log index 703f6a34..ec9ed0f7 100644 --- a/tests/data/ASHA/evaluations.log +++ b/tests/data/ASHA/evaluations.log @@ -6,14 +6,14 @@ f372d4bc-9ec1-4c2c-90db-478ab6cb3a9c;20308;2020-06-23 
11:32:26,824681;0.08644843 91760f3b-6d1e-4f14-99c2-60d77598aa50;8980;2020-06-23 11:32:26,855933;0.08632874488830566;0.09375;(-1.512501110261379, -1);BernoulliNB(data, alpha=0.001, fit_prior=False);None;1 f6adafd9-5c44-47f3-8c4b-8ce752639d8e;20308;2020-06-23 11:32:26,911130;0.11718416213989258;0.109375;(-0.2977521450115315, -2);KNeighborsClassifier(Normalizer(data, Normalizer.norm='l2'), KNeighborsClassifier.n_neighbors=7, KNeighborsClassifier.p=2, KNeighborsClassifier.weights='distance');None;1 5e8833dd-7ffb-492d-916c-cbdf51631f6d;20308;2020-06-23 11:32:27,043950;0.8933179378509521;0.890625;(-0.8640465359075755, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.9500000000000001, min_samples_leaf=10, min_samples_split=13, ExtraTreesClassifier.n_estimators=100);None;1 -5dc7fa69-471d-44cb-b2ca-c8957f3bbc30;20308;2020-06-23 11:32:28,059189;0.10135197639465332;0.109375;(-6.397603224636075, -2);DecisionTreeClassifier(SelectPercentile(data, SelectPercentile.percentile=80, SelectPercentile.score_func=f_classif), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=6, min_samples_leaf=3, min_samples_split=12);None;1 +5dc7fa69-471d-44cb-b2ca-c8957f3bbc30;20308;2020-06-23 11:32:28,059189;0.10135197639465332;0.109375;(-6.397603224636075, -2);DecisionTreeClassifier(SelectPercentile(data, SelectPercentile.percentile=80), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=6, min_samples_leaf=3, min_samples_split=12);None;1 078ae7f9-a743-4753-9545-477d44c8a99d;20308;2020-06-23 11:32:28,170596;0.020221710205078125;0.03125;(-inf, -3);MultinomialNB(PolynomialFeatures(RBFSampler(data, RBFSampler.gamma=0.2), PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), alpha=0.1, fit_prior=True);Negative values in data passed to MultinomialNB (input X);1 e06ab766-ca64-47ad-b585-2c5bfd0a41f3;20308;2020-06-23 11:32:28,190817;0.09146928787231445;0.109375;(-1.9652004540424983, -1);MultinomialNB(data, alpha=0.001, fit_prior=True);None;2 a03a8626-a9cd-4617-bf26-5679159495ad;20308;2020-06-23 11:32:28,282287;0.997955322265625;1.0;(-0.5484535103280652, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.9500000000000001, min_samples_leaf=2, min_samples_split=17, ExtraTreesClassifier.n_estimators=100);None;1 d4dc302a-b876-42d1-8f8e-1de63e72352a;20308;2020-06-23 11:32:29,350631;0.08852910995483398;0.09375;(-3.471276238265286, -3);DecisionTreeClassifier(StandardScaler(StandardScaler(data)), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=3, min_samples_leaf=9, min_samples_split=2);None;1 f6adafd9-5c44-47f3-8c4b-8ce752639d8e;20308;2020-06-23 11:32:29,441178;0.3259754180908203;0.328125;(-0.18687441947484362, -2);KNeighborsClassifier(Normalizer(data, Normalizer.norm='l2'), KNeighborsClassifier.n_neighbors=7, KNeighborsClassifier.p=2, KNeighborsClassifier.weights='distance');None;2 68cc22b5-93a8-418f-bb58-bfb926a930d1;20308;2020-06-23 11:32:29,807391;0.08854889869689941;0.09375;(-7.23093642073088, -1);GaussianNB(data);None;1 -853c817f-d8c0-43c1-873f-2bcf147f3c56;8980;2020-06-23 11:32:26,942262;2.744265556335449;2.734375;(-1.027437518275104, -3);GradientBoostingClassifier(SelectFwe(Nystroem(data, Nystroem.gamma=0.6000000000000001, Nystroem.kernel='poly', Nystroem.n_components=5), SelectFwe.alpha=0.033, 
SelectFwe.score_func=f_classif), GradientBoostingClassifier.learning_rate=0.1, GradientBoostingClassifier.max_depth=2, GradientBoostingClassifier.max_features=0.8, GradientBoostingClassifier.min_samples_leaf=8, GradientBoostingClassifier.min_samples_split=3, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.3);None;1 +853c817f-d8c0-43c1-873f-2bcf147f3c56;8980;2020-06-23 11:32:26,942262;2.744265556335449;2.734375;(-1.027437518275104, -3);GradientBoostingClassifier(SelectFwe(Nystroem(data, Nystroem.gamma=0.6000000000000001, Nystroem.kernel='poly', Nystroem.n_components=5), SelectFwe.alpha=0.033), GradientBoostingClassifier.learning_rate=0.1, GradientBoostingClassifier.max_depth=2, GradientBoostingClassifier.max_features=0.8, GradientBoostingClassifier.min_samples_leaf=8, GradientBoostingClassifier.min_samples_split=3, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.3);None;1 f372d4bc-9ec1-4c2c-90db-478ab6cb3a9c;8980;2020-06-23 11:32:29,963945;0.21486353874206543;0.21875;(-0.3003330747679789, -2);KNeighborsClassifier(VarianceThreshold(data, VarianceThreshold.threshold=0.4), KNeighborsClassifier.n_neighbors=5, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None;2 de62f096-3f36-4414-9da6-0d6bc5c1001c;8980;2020-06-23 11:32:30,188877;0.12078976631164551;0.125;(-0.7311093795579261, -1);KNeighborsClassifier(data, KNeighborsClassifier.n_neighbors=39, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='distance');None;1 059a0e79-58fe-489b-a495-8de5b2e9be2c;8980;2020-06-23 11:32:30,319743;0.16148996353149414;0.296875;(-3.10848987554197, -2);MultinomialNB(PolynomialFeatures(data, PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), alpha=0.1, fit_prior=True);None;1 @@ -24,15 +24,15 @@ a03a8626-a9cd-4617-bf26-5679159495ad;8980;2020-06-23 11:32:30,834177;2.069298028 f6adafd9-5c44-47f3-8c4b-8ce752639d8e;8980;2020-06-23 11:32:34,043611;0.6912477016448975;0.6875;(-0.1749001012829964, -2);KNeighborsClassifier(Normalizer(data, Normalizer.norm='l2'), KNeighborsClassifier.n_neighbors=7, KNeighborsClassifier.p=2, KNeighborsClassifier.weights='distance');None;3 dc9ce0ad-fe6c-4ea2-be0d-2f3ca447aa0f;8980;2020-06-23 11:32:34,775144;0.018099308013916016;0.015625;(-inf, -2);MultinomialNB(RBFSampler(data, RBFSampler.gamma=0.7000000000000001), alpha=100.0, fit_prior=True);Negative values in data passed to MultinomialNB (input X);1 de62f096-3f36-4414-9da6-0d6bc5c1001c;8980;2020-06-23 11:32:34,813505;0.317230224609375;0.328125;(-0.31852738660027063, -1);KNeighborsClassifier(data, KNeighborsClassifier.n_neighbors=39, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='distance');None;2 -bf362289-f2a0-47a4-a64f-21212dfc4147;8980;2020-06-23 11:32:35,160956;0.928499698638916;0.921875;(-0.6259716573399572, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002, SelectFwe.score_func=f_classif), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None;1 -f4f43161-6adb-4c46-b723-e053d9914f58;8980;2020-06-23 11:32:36,132011;0.09155702590942383;0.09375;(-5.120968673363717, -3);DecisionTreeClassifier(SelectFwe(StandardScaler(data), SelectFwe.alpha=0.041, SelectFwe.score_func=f_classif), 
DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=5, min_samples_leaf=13, min_samples_split=15);None;1 +bf362289-f2a0-47a4-a64f-21212dfc4147;8980;2020-06-23 11:32:35,160956;0.928499698638916;0.921875;(-0.6259716573399572, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None;1 +f4f43161-6adb-4c46-b723-e053d9914f58;8980;2020-06-23 11:32:36,132011;0.09155702590942383;0.09375;(-5.120968673363717, -3);DecisionTreeClassifier(SelectFwe(StandardScaler(data), SelectFwe.alpha=0.041), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=5, min_samples_leaf=13, min_samples_split=15);None;1 ee8fb321-babc-4df5-9520-fb7bf6bb6b46;8980;2020-06-23 11:32:36,223568;0.02036762237548828;0.015625;(-inf, -3);ExtraTreesClassifier(FeatureAgglomeration(PolynomialFeatures(data, PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), FeatureAgglomeration.affinity='l2', FeatureAgglomeration.linkage='ward'), ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.7500000000000001, min_samples_leaf=3, min_samples_split=12, ExtraTreesClassifier.n_estimators=100);l2 was provided as affinity. Ward can only work with euclidean distances.;1 62a4ffc0-8c55-4be7-b505-491bfcbc368e;8980;2020-06-23 11:32:36,243936;0.4920215606689453;0.546875;(-0.15472008267485318, -2);LogisticRegression(StandardScaler(data), LogisticRegression.C=5.0, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;2 49fbab65-4fa7-442d-b795-38885cc938e3;8980;2020-06-23 11:32:36,735957;0.6860792636871338;3.796875;(-3.3646008205029583, -3);GaussianNB(PCA(MinMaxScaler(data), PCA.iterated_power=5, PCA.svd_solver='randomized'));None;1 7072ed2e-2630-460b-baf9-d17d6250f4b1;8980;2020-06-23 11:32:37,422036;0.08424186706542969;0.09375;(-1.5282662337169928, -1);BernoulliNB(data, alpha=100.0, fit_prior=False);None;1 -7d1f2eeb-bb1b-4197-a79a-326784318f22;20308;2020-06-23 11:32:29,963945;10.43557596206665;10.421875;(-1.8795147943396817, -2);GradientBoostingClassifier(SelectPercentile(data, SelectPercentile.percentile=15, SelectPercentile.score_func=f_classif), GradientBoostingClassifier.learning_rate=0.001, GradientBoostingClassifier.max_depth=8, GradientBoostingClassifier.max_features=0.6500000000000001, GradientBoostingClassifier.min_samples_leaf=3, GradientBoostingClassifier.min_samples_split=2, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.7500000000000001);None;1 +7d1f2eeb-bb1b-4197-a79a-326784318f22;20308;2020-06-23 11:32:29,963945;10.43557596206665;10.421875;(-1.8795147943396817, -2);GradientBoostingClassifier(SelectPercentile(data, SelectPercentile.percentile=15), GradientBoostingClassifier.learning_rate=0.001, GradientBoostingClassifier.max_depth=8, GradientBoostingClassifier.max_features=0.6500000000000001, GradientBoostingClassifier.min_samples_leaf=3, GradientBoostingClassifier.min_samples_split=2, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.7500000000000001);None;1 5765c2a4-a3a2-4880-a80c-04cb0624a0bb;20308;2020-06-23 
11:32:40,946054;0.10809946060180664;0.09375;(-0.4664318786903273, -3);MultinomialNB(Normalizer(MaxAbsScaler(data), Normalizer.norm='max'), alpha=0.01, fit_prior=False);None;1 -bf362289-f2a0-47a4-a64f-21212dfc4147;20308;2020-06-23 11:32:41,118210;2.0044796466827393;2.0;(-0.46040942182561695, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002, SelectFwe.score_func=f_classif), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None;2 +bf362289-f2a0-47a4-a64f-21212dfc4147;20308;2020-06-23 11:32:41,118210;2.0044796466827393;2.0;(-0.46040942182561695, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None;2 b22d5819-b305-456a-9192-705acac32dfa;20308;2020-06-23 11:32:43,187757;0.012001991271972656;0.015625;(-inf, -1);LogisticRegression(data, LogisticRegression.C=15.0, LogisticRegression.dual=True, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');Solver lbfgs supports only dual=False, got dual=True;1 62a4ffc0-8c55-4be7-b505-491bfcbc368e;20308;2020-06-23 11:32:43,224782;0.6437418460845947;1.046875;(-0.12549831803287947, -2);LogisticRegression(StandardScaler(data), LogisticRegression.C=5.0, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;3 d7e8be90-bc28-42ec-a184-ae3a008f7cdf;20308;2020-06-23 11:32:43,870525;0.7992019653320312;0.796875;(-1.1502703392228544, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.15000000000000002, min_samples_leaf=10, min_samples_split=13, ExtraTreesClassifier.n_estimators=100);None;1 @@ -41,20 +41,20 @@ d7e8be90-bc28-42ec-a184-ae3a008f7cdf;20308;2020-06-23 11:32:43,870525;0.79920196 255868a0-149d-4c2c-92c6-392922bc880b;8980;2020-06-23 11:32:50,025929;9.158247709274292;12.09375;(-1.2934355344964124, -2);KNeighborsClassifier(FastICA(data, FastICA.tol=0.0), KNeighborsClassifier.n_neighbors=24, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None;1 0c976202-cb52-489c-9502-4bec0e51e675;8980;2020-06-23 11:32:59,202935;0.10809683799743652;0.109375;(-0.8741745397982907, -1);KNeighborsClassifier(data, KNeighborsClassifier.n_neighbors=41, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None;1 5765c2a4-a3a2-4880-a80c-04cb0624a0bb;8980;2020-06-23 11:32:59,323042;0.10610604286193848;0.109375;(-0.4325644636905871, -3);MultinomialNB(Normalizer(MaxAbsScaler(data), Normalizer.norm='max'), alpha=0.01, fit_prior=False);None;2 -1ab0a0b3-589e-433f-ae97-b35719893a80;8980;2020-06-23 11:32:59,431142;0.2368025779724121;0.234375;(-0.5465894927607848, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004, SelectFwe.score_func=f_classif), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;1 -8d96e5cc-bb60-4b7b-8b81-75675e51e376;8980;2020-06-23 
11:32:59,668946;0.9110112190246582;0.921875;(-0.6092217386789287, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22, SelectPercentile.score_func=f_classif), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None;1 +1ab0a0b3-589e-433f-ae97-b35719893a80;8980;2020-06-23 11:32:59,431142;0.2368025779724121;0.234375;(-0.5465894927607848, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;1 +8d96e5cc-bb60-4b7b-8b81-75675e51e376;8980;2020-06-23 11:32:59,668946;0.9110112190246582;0.921875;(-0.6092217386789287, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None;1 165eff9e-a1dc-457f-b43b-6939773eb9ee;20308;2020-06-23 11:32:46,230403;14.299544095993042;14.28125;(-1.8679885977833623, -1);GradientBoostingClassifier(data, GradientBoostingClassifier.learning_rate=0.001, GradientBoostingClassifier.max_depth=8, GradientBoostingClassifier.max_features=0.35000000000000003, GradientBoostingClassifier.min_samples_leaf=2, GradientBoostingClassifier.min_samples_split=2, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=1.0);None;1 62d8eecf-48a2-47b8-8a43-c3c6476ea547;20308;2020-06-23 11:33:01,059957;0.14899969100952148;0.71875;(-inf, -3);MultinomialNB(FeatureAgglomeration(PCA(data, PCA.iterated_power=10, PCA.svd_solver='randomized'), FeatureAgglomeration.affinity='l1', FeatureAgglomeration.linkage='ward'), alpha=100.0, fit_prior=False);l1 was provided as affinity. 
Ward can only work with euclidean distances.;1 d6698020-249d-4eed-90d0-ebbb48e93b75;20308;2020-06-23 11:33:01,351087;0.10309410095214844;0.09375;(-0.8116612956766508, -1);BernoulliNB(data, alpha=0.1, fit_prior=False);None;2 6a017326-2341-4902-ad9b-d640d7f478d3;20308;2020-06-23 11:33:01,456182;0.0830843448638916;0.078125;(-7.23093642073088, -1);GaussianNB(data);None;1 f372d4bc-9ec1-4c2c-90db-478ab6cb3a9c;20308;2020-06-23 11:33:01,541260;0.3413205146789551;0.34375;(-0.22762732629524313, -2);KNeighborsClassifier(VarianceThreshold(data, VarianceThreshold.threshold=0.4), KNeighborsClassifier.n_neighbors=5, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None;3 089f9ae9-43e8-4f60-a258-41119b5cefcb;8980;2020-06-23 11:33:00,651932;1.3457434177398682;1.390625;(-1.080894599691573, -3);ExtraTreesClassifier(MaxAbsScaler(FastICA(data, FastICA.tol=1.0)), ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.6000000000000001, min_samples_leaf=3, min_samples_split=12, ExtraTreesClassifier.n_estimators=100);None;1 -1ab0a0b3-589e-433f-ae97-b35719893a80;8980;2020-06-23 11:33:02,068741;0.4677395820617676;0.546875;(-0.34462743890816727, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004, SelectFwe.score_func=f_classif), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;2 +1ab0a0b3-589e-433f-ae97-b35719893a80;8980;2020-06-23 11:33:02,068741;0.4677395820617676;0.546875;(-0.34462743890816727, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;2 45634922-1bbd-47cc-9624-5994fc579d9b;20308;2020-06-23 11:33:01,903600;0.6655070781707764;0.671875;(-0.7718147479085071, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.15000000000000002, min_samples_leaf=5, min_samples_split=6, ExtraTreesClassifier.n_estimators=100);None;1 5f03f46a-6dd5-432a-9dff-688c4ee16a43;20308;2020-06-23 11:33:02,652681;0.021008729934692383;0.03125;(-inf, -2);GaussianNB(Nystroem(data, Nystroem.gamma=0.15000000000000002, Nystroem.kernel='sigmoid', Nystroem.n_components=6));Input contains NaN, infinity or a value too large for dtype('float64').;1 b5c0c8d3-df7f-4715-a08d-a2628d792762;8980;2020-06-23 11:33:02,539475;0.6677131652832031;0.671875;(-2.2951229066094188, -3);RandomForestClassifier(FastICA(Nystroem(data, Nystroem.gamma=0.25, Nystroem.kernel='laplacian', Nystroem.n_components=3), FastICA.tol=0.35000000000000003), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='entropy', RandomForestClassifier.max_features=0.15000000000000002, RandomForestClassifier.min_samples_leaf=20, RandomForestClassifier.min_samples_split=14, RandomForestClassifier.n_estimators=100);None;1 5df5ca9b-5ba8-4e55-a50e-fdb28387db49;20308;2020-06-23 11:33:02,675691;0.9999330043792725;7.078125;(-3.4835290549443294, -2);GaussianNB(PCA(data, PCA.iterated_power=8, PCA.svd_solver='randomized'));None;1 -8d96e5cc-bb60-4b7b-8b81-75675e51e376;8980;2020-06-23 11:33:03,257234;1.3758268356323242;1.375;(-0.45151319618179075, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22, SelectPercentile.score_func=f_classif), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', 
RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None;2 +8d96e5cc-bb60-4b7b-8b81-75675e51e376;8980;2020-06-23 11:33:03,257234;1.3758268356323242;1.375;(-0.45151319618179075, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None;2 769017c3-600f-4254-b191-6b85b89a51ea;20308;2020-06-23 11:33:03,678627;1.1097559928894043;1.109375;(-0.916017764189669, -1);RandomForestClassifier(data, RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.9000000000000001, RandomForestClassifier.min_samples_leaf=15, RandomForestClassifier.min_samples_split=4, RandomForestClassifier.n_estimators=100);None;1 bb3099d4-7f1e-48a4-881c-cb6766c0ab12;8980;2020-06-23 11:33:04,732306;0.08111691474914551;0.078125;(-0.7825659886272689, -3);BernoulliNB(VarianceThreshold(VarianceThreshold(data, VarianceThreshold.threshold=0.15000000000000002), VarianceThreshold.threshold=0.55), alpha=0.1, fit_prior=True);None;1 0944f52a-56e9-403f-b7ea-c6d1b920e09e;8980;2020-06-23 11:33:04,842453;0.017014503479003906;0.015625;(-inf, -2);GaussianNB(Nystroem(data, Nystroem.gamma=0.05, Nystroem.kernel='sigmoid', Nystroem.n_components=1));Input contains NaN, infinity or a value too large for dtype('float64').;1 @@ -70,17 +70,17 @@ bb3099d4-7f1e-48a4-881c-cb6766c0ab12;8980;2020-06-23 11:33:06,518966;0.106031179 b89e51c0-9509-407f-800e-0528408bfc80;20308;2020-06-23 11:33:06,563942;0.09208345413208008;0.09375;(-2.267718783538574, -2);MultinomialNB(FeatureAgglomeration(data, FeatureAgglomeration.affinity='manhattan', FeatureAgglomeration.linkage='complete'), alpha=0.001, fit_prior=False);None;1 026ae87c-8fce-42ed-a94d-b88fba04073c;20308;2020-06-23 11:33:06,657026;0.07293033599853516;0.078125;(-2.378880797148006, -1);DecisionTreeClassifier(data, DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=2, min_samples_leaf=1, min_samples_split=18);None;1 ba577e1f-35e2-4927-9a4c-2989ce6b4ecd;8980;2020-06-23 11:33:06,628000;1.163454532623291;1.15625;(-0.6327468037141656, -2);RandomForestClassifier(MinMaxScaler(data), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.6000000000000001, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=10, RandomForestClassifier.n_estimators=100);None;1 -4df3b680-5789-4c42-b9eb-ae5330839d53;20308;2020-06-23 11:33:06,730948;1.0955390930175781;1.09375;(-0.6934472032882681, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=28, SelectPercentile.score_func=f_classif), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='entropy', RandomForestClassifier.max_features=0.6500000000000001, RandomForestClassifier.min_samples_leaf=9, RandomForestClassifier.min_samples_split=8, RandomForestClassifier.n_estimators=100);None;1 +4df3b680-5789-4c42-b9eb-ae5330839d53;20308;2020-06-23 11:33:06,730948;1.0955390930175781;1.09375;(-0.6934472032882681, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=28), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='entropy', 
RandomForestClassifier.max_features=0.6500000000000001, RandomForestClassifier.min_samples_leaf=9, RandomForestClassifier.min_samples_split=8, RandomForestClassifier.n_estimators=100);None;1 fff20a67-70f6-4634-968e-f43aed45e98a;20308;2020-06-23 11:33:07,876532;0.34650230407714844;0.34375;(-0.3290416617077613, -1);KNeighborsClassifier(data, KNeighborsClassifier.n_neighbors=40, KNeighborsClassifier.p=2, KNeighborsClassifier.weights='uniform');None;2 60302618-e317-4dd3-8993-8ed63cac52c9;8980;2020-06-23 11:33:07,859518;1.2853541374206543;1.296875;(-0.8660683947004788, -1);RandomForestClassifier(data, RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='entropy', RandomForestClassifier.max_features=0.6000000000000001, RandomForestClassifier.min_samples_leaf=18, RandomForestClassifier.min_samples_split=6, RandomForestClassifier.n_estimators=100);None;1 706dfc52-d5a4-47d8-ad4c-7ade66a3a2d2;20308;2020-06-23 11:33:08,252070;1.0189173221588135;7.359375;(-0.9867157201028451, -3);BernoulliNB(Binarizer(PCA(data, PCA.iterated_power=9, PCA.svd_solver='randomized'), Binarizer.threshold=0.65), alpha=0.01, fit_prior=False);None;1 -1ab0a0b3-589e-433f-ae97-b35719893a80;8980;2020-06-23 11:33:09,187912;0.6098580360412598;0.734375;(-0.2842039177186586, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004, SelectFwe.score_func=f_classif), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;3 +1ab0a0b3-589e-433f-ae97-b35719893a80;8980;2020-06-23 11:33:09,187912;0.6098580360412598;0.734375;(-0.2842039177186586, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;3 361c870d-95ab-4b30-8fda-8aadefa89235;8980;2020-06-23 11:33:09,800773;0.3285844326019287;0.328125;(-0.4126165378440759, -2);KNeighborsClassifier(MaxAbsScaler(data), KNeighborsClassifier.n_neighbors=47, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='distance');None;2 -92400600-fab5-4ef3-844b-a8abed37d066;8980;2020-06-23 11:33:10,156383;0.08507728576660156;0.09375;(-2.317418995184293, -2);DecisionTreeClassifier(SelectFwe(data, SelectFwe.alpha=0.016, SelectFwe.score_func=f_classif), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=1, min_samples_leaf=17, min_samples_split=6);None;1 +92400600-fab5-4ef3-844b-a8abed37d066;8980;2020-06-23 11:33:10,156383;0.08507728576660156;0.09375;(-2.317418995184293, -2);DecisionTreeClassifier(SelectFwe(data, SelectFwe.alpha=0.016), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=1, min_samples_leaf=17, min_samples_split=6);None;1 5c2f1599-71ac-4632-a083-b3ce4deab228;8980;2020-06-23 11:33:10,244464;2.659486770629883;2.65625;(-0.8258589925779635, -2);ExtraTreesClassifier(PolynomialFeatures(data, PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.15000000000000002, min_samples_leaf=17, min_samples_split=3, ExtraTreesClassifier.n_estimators=100);None;1 4e207a66-d2cf-4a8e-828c-31749fa5abe3;8980;2020-06-23 11:33:12,937958;0.07606863975524902;0.078125;(-0.9261864805822425, -1);BernoulliNB(data, alpha=0.1, fit_prior=True);None;1 ba577e1f-35e2-4927-9a4c-2989ce6b4ecd;8980;2020-06-23 
11:33:13,017029;2.630417823791504;2.640625;(-0.4533327218998748, -2);RandomForestClassifier(MinMaxScaler(data), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.6000000000000001, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=10, RandomForestClassifier.n_estimators=100);None;2 2511c0a6-3a33-479f-944e-c9fa8e5a8ce9;8980;2020-06-23 11:33:15,740537;0.0740654468536377;0.078125;(-0.9271376361605299, -1);BernoulliNB(data, alpha=0.1, fit_prior=False);None;1 acaaef6e-d85c-4290-9786-ccca15db8fc7;20308;2020-06-23 11:33:09,274991;9.128485918045044;11.734375;(-11.539644837655507, -3);DecisionTreeClassifier(FastICA(StandardScaler(data), FastICA.tol=0.35000000000000003), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=8, min_samples_leaf=7, min_samples_split=19);None;1 -521581af-6ffc-4b1f-b15f-d821d5c4eff2;20308;2020-06-23 11:33:18,416490;0.09208321571350098;0.09375;(-3.3691409717555003, -3);GaussianNB(SelectPercentile(VarianceThreshold(data, VarianceThreshold.threshold=0.7000000000000001), SelectPercentile.percentile=77, SelectPercentile.score_func=f_classif));None;1 +521581af-6ffc-4b1f-b15f-d821d5c4eff2;20308;2020-06-23 11:33:18,416490;0.09208321571350098;0.09375;(-3.3691409717555003, -3);GaussianNB(SelectPercentile(VarianceThreshold(data, VarianceThreshold.threshold=0.7000000000000001), SelectPercentile.percentile=77));None;1 aed55476-7129-44e5-a332-807aedebffdd;20308;2020-06-23 11:33:18,511576;0.7255513668060303;0.734375;(-1.0251053589590406, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.6500000000000001, min_samples_leaf=20, min_samples_split=20, ExtraTreesClassifier.n_estimators=100);None;1 diff --git a/tests/data/AsyncEA/evaluations.log b/tests/data/AsyncEA/evaluations.log index f09c61a8..e96b18d4 100644 --- a/tests/data/AsyncEA/evaluations.log +++ b/tests/data/AsyncEA/evaluations.log @@ -14,8 +14,8 @@ f297beec-e504-40d0-bfa1-5f2de9fda475;15332;2020-06-23 11:28:50,018889;0.03124094 bbd7c3cf-3e9a-46d9-a693-1a6746f18223;11296;2020-06-23 11:28:50,763779;0.12203121185302734;0.125;(-9.75947539358216, -1);GaussianNB(data);None;;;new 624ebec6-d5c2-4764-b0bc-24bb4b3c4a4a;15332;2020-06-23 11:28:50,050130;1.1550064086914062;1.15625;(-0.468928003986364, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.15000000000000002, min_samples_leaf=5, min_samples_split=6, ExtraTreesClassifier.n_estimators=100);None;;;new 2be863ef-f0e8-4fe5-859d-a8dccb32669c;11296;2020-06-23 11:28:50,885811;0.43521785736083984;3.296875;(-inf, -3);MultinomialNB(FeatureAgglomeration(PCA(data, PCA.iterated_power=10, PCA.svd_solver='randomized'), FeatureAgglomeration.affinity='l1', FeatureAgglomeration.linkage='ward'), alpha=100.0, fit_prior=False);l1 was provided as affinity. 
Ward can only work with euclidean distances.;;;new -a74661ca-5cc7-48ff-b344-e67c95e65cf2;11296;2020-06-23 11:28:51,352403;1.5936839580535889;1.59375;(-0.39007147233709427, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22, SelectPercentile.score_func=f_classif), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None;;;new -2f02d715-31e5-43f4-9b07-40c406279d6f;11296;2020-06-23 11:28:53,017206;0.6100711822509766;0.84375;(-0.2842039177186586, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004, SelectFwe.score_func=f_classif), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;;;new +a74661ca-5cc7-48ff-b344-e67c95e65cf2;11296;2020-06-23 11:28:51,352403;1.5936839580535889;1.59375;(-0.39007147233709427, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None;;;new +2f02d715-31e5-43f4-9b07-40c406279d6f;11296;2020-06-23 11:28:53,017206;0.6100711822509766;0.84375;(-0.2842039177186586, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;;;new a8e80747-4b69-46f0-b1c9-202ce1ee47d0;11296;2020-06-23 11:28:53,642893;0.5683679580688477;0.5625;(-0.2610497596267842, -1);KNeighborsClassifier(data, KNeighborsClassifier.n_neighbors=41, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None;;;new 3a988c08-b951-4bd5-ad9d-b387e98f16c8;15332;2020-06-23 11:28:51,352403;3.5592164993286133;3.78125;(-0.9269214656386081, -3);ExtraTreesClassifier(MaxAbsScaler(FastICA(data, FastICA.tol=1.0)), ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.6000000000000001, min_samples_leaf=3, min_samples_split=12, ExtraTreesClassifier.n_estimators=100);None;;;new ba4dc764-9c90-4fd2-8dd3-921e98109174;11296;2020-06-23 11:28:54,280149;6.002580404281616;8.734375;(-inf, -2);KNeighborsClassifier(FastICA(data, FastICA.tol=0.0), KNeighborsClassifier.n_neighbors=24, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');;;;new @@ -26,8 +26,8 @@ fff9f956-933a-4fcc-a822-f7c22656d431;11296;2020-06-23 11:29:00,284752;1.03579115 b31b2580-6d10-4d17-807b-fe694e63ad30;11296;2020-06-23 11:29:01,381309;0.11871457099914551;0.109375;(-0.6182026836322747, -1);BernoulliNB(data, alpha=100.0, fit_prior=False);None;;;new e562c3d2-7aeb-4661-88a1-5b9c94ba43c8;11296;2020-06-23 11:29:01,500024;0.632824182510376;2.03125;(-2.579317437977062, -3);GaussianNB(PCA(MinMaxScaler(data), PCA.iterated_power=5, PCA.svd_solver='randomized'));None;;;new 600113fb-f38f-4cdd-83b4-40abc07c8e06;11296;2020-06-23 11:29:02,132848;0.04687356948852539;0.046875;(-inf, -3);ExtraTreesClassifier(FeatureAgglomeration(PolynomialFeatures(data, PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), FeatureAgglomeration.affinity='l2', FeatureAgglomeration.linkage='ward'), 
ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.7500000000000001, min_samples_leaf=3, min_samples_split=12, ExtraTreesClassifier.n_estimators=100);l2 was provided as affinity. Ward can only work with euclidean distances.;;;new -444c5a03-7054-4af7-8311-7c383af065cb;11296;2020-06-23 11:29:02,195347;0.15927982330322266;0.15625;(-2.201861494055291, -3);DecisionTreeClassifier(SelectFwe(StandardScaler(data), SelectFwe.alpha=0.041, SelectFwe.score_func=f_classif), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=5, min_samples_leaf=13, min_samples_split=15);None;;;new -6a7f1764-1923-423e-8e56-6c89c8e39ed9;11296;2020-06-23 11:29:02,354627;2.5398776531219482;2.546875;(-0.37875159073374326, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002, SelectFwe.score_func=f_classif), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None;;;new +444c5a03-7054-4af7-8311-7c383af065cb;11296;2020-06-23 11:29:02,195347;0.15927982330322266;0.15625;(-2.201861494055291, -3);DecisionTreeClassifier(SelectFwe(StandardScaler(data), SelectFwe.alpha=0.041), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=5, min_samples_leaf=13, min_samples_split=15);None;;;new +6a7f1764-1923-423e-8e56-6c89c8e39ed9;11296;2020-06-23 11:29:02,354627;2.5398776531219482;2.546875;(-0.37875159073374326, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None;;;new bc41711d-b6be-49a7-9545-43026e0d7ab6;11296;2020-06-23 11:29:04,996979;0.01803874969482422;0.015625;(-inf, -2);MultinomialNB(RBFSampler(data, RBFSampler.gamma=0.7000000000000001), alpha=100.0, fit_prior=True);Negative values in data passed to MultinomialNB (input X);;;new 181910ae-5017-4e92-8a93-4e6061c96365;11296;2020-06-23 11:29:05,015018;1.7232224941253662;1.734375;(-0.5815021169582558, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.8500000000000001, min_samples_leaf=16, min_samples_split=19, ExtraTreesClassifier.n_estimators=100);None;;;new a08c8da2-abc4-4de1-801d-57030b94841c;11296;2020-06-23 11:29:06,807609;0.11546134948730469;0.109375;(-0.725480653467304, -1);BernoulliNB(data, alpha=0.1, fit_prior=False);None;;;new @@ -39,9 +39,9 @@ cf024278-e20a-429f-997b-6020ff69661f;11296;2020-06-23 11:29:08,198723;0.12175083 129881e7-a949-49ee-9df7-089bf30e4b42;11296;2020-06-23 11:29:08,320474;0.14296269416809082;0.140625;(-1.8425345071690191, -3);DecisionTreeClassifier(StandardScaler(StandardScaler(data)), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=3, min_samples_leaf=9, min_samples_split=2);None;;;new 685bb78c-14f0-4b87-81be-602b159b2aad;11296;2020-06-23 11:29:08,463437;2.6076319217681885;2.609375;(-0.31470563580921596, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, 
ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.9500000000000001, min_samples_leaf=2, min_samples_split=17, ExtraTreesClassifier.n_estimators=100);None;;;new 72c18b2c-0a71-499a-b013-68a28d7cc167;11296;2020-06-23 11:29:11,152268;0.05043792724609375;0.046875;(-inf, -3);MultinomialNB(PolynomialFeatures(RBFSampler(data, RBFSampler.gamma=0.2), PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), alpha=0.1, fit_prior=True);Negative values in data passed to MultinomialNB (input X);;;new -a8a4e812-2bbb-404a-a5ae-2d4aa5917c2a;11296;2020-06-23 11:29:11,222802;0.18370366096496582;0.171875;(-2.8709226051225545, -2);DecisionTreeClassifier(SelectPercentile(data, SelectPercentile.percentile=80, SelectPercentile.score_func=f_classif), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=6, min_samples_leaf=3, min_samples_split=12);None;;;new +a8a4e812-2bbb-404a-a5ae-2d4aa5917c2a;11296;2020-06-23 11:29:11,222802;0.18370366096496582;0.171875;(-2.8709226051225545, -2);DecisionTreeClassifier(SelectPercentile(data, SelectPercentile.percentile=80), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=6, min_samples_leaf=3, min_samples_split=12);None;;;new 361ff5d5-6f55-46f2-96d4-77edb3dc4000;11296;2020-06-23 11:29:11,406505;1.7949175834655762;1.796875;(-0.4958370457456914, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.9500000000000001, min_samples_leaf=10, min_samples_split=13, ExtraTreesClassifier.n_estimators=100);None;;;new -08bec420-a279-417a-97d2-800f819ab60a;15332;2020-06-23 11:29:07,740216;6.011395215988159;6.0;(-inf, -2);GradientBoostingClassifier(SelectPercentile(data, SelectPercentile.percentile=15, SelectPercentile.score_func=f_classif), GradientBoostingClassifier.learning_rate=0.001, GradientBoostingClassifier.max_depth=8, GradientBoostingClassifier.max_features=0.6500000000000001, GradientBoostingClassifier.min_samples_leaf=3, GradientBoostingClassifier.min_samples_split=2, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.7500000000000001);;;;new +08bec420-a279-417a-97d2-800f819ab60a;15332;2020-06-23 11:29:07,740216;6.011395215988159;6.0;(-inf, -2);GradientBoostingClassifier(SelectPercentile(data, SelectPercentile.percentile=15), GradientBoostingClassifier.learning_rate=0.001, GradientBoostingClassifier.max_depth=8, GradientBoostingClassifier.max_features=0.6500000000000001, GradientBoostingClassifier.min_samples_leaf=3, GradientBoostingClassifier.min_samples_split=2, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.7500000000000001);;;;new 043acb38-5c31-4725-8185-9e49b3846c06;15332;2020-06-23 11:29:13,751611;0.6360459327697754;0.625;(-0.1749001012829964, -2);KNeighborsClassifier(Normalizer(data, Normalizer.norm='l2'), KNeighborsClassifier.n_neighbors=7, KNeighborsClassifier.p=2, KNeighborsClassifier.weights='distance');None;;;new d990743b-1d84-47b8-b573-c47b6bb75e96;15332;2020-06-23 11:29:14,438019;0.11130619049072266;0.125;(-0.7812520494792912, -1);BernoulliNB(data, alpha=0.001, fit_prior=False);None;;;new 0806032f-84f3-4694-bc71-ddd3798f8160;15332;2020-06-23 11:29:14,557362;0.012228727340698242;0.015625;(-inf, -2);LogisticRegression(Binarizer(data, Binarizer.threshold=0.9), LogisticRegression.C=0.5, LogisticRegression.dual=True, LogisticRegression.penalty='l2', 
LogisticRegression.solver='lbfgs');Solver lbfgs supports only dual=False, got dual=True;;;new @@ -53,9 +53,9 @@ aba34090-cfc6-4287-9b49-3ce91d91bf1b;15332;2020-06-23 11:29:15,030138;0.17381668 4688fb68-9ec8-4b74-a413-e5e14eb59445;15332;2020-06-23 11:29:16,036361;0.1812450885772705;0.171875;(-2.0426336549903987, -2);GaussianNB(FeatureAgglomeration(data, FeatureAgglomeration.affinity='l1', FeatureAgglomeration.linkage='complete'));None;acfab73c-619b-44e3-a2d9-6c11646cf248;;mut_replace_primitive 80b74994-2366-45a4-b635-8cb58af51d1f;15332;2020-06-23 11:29:16,219632;0.5777325630187988;0.578125;(-0.32731088166964445, -2);KNeighborsClassifier(MaxAbsScaler(data), KNeighborsClassifier.n_neighbors=47, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None;f0588335-f880-4ff4-abbc-6d0f112b8716;b8c3d504-e5a2-47af-9656-95de503581a5;cx 8016fa1c-ca06-439d-9a3d-76827adba806;15332;2020-06-23 11:29:16,847850;0.02045750617980957;0.015625;(-inf, -2);GaussianNB(Nystroem(data, Nystroem.gamma=0.05, Nystroem.kernel='sigmoid', Nystroem.n_components=5));Input contains NaN, infinity or a value too large for dtype('float64').;c0933bda-9302-45d7-b955-8bb5da6b1a5e;;mut_replace_terminal -3e6842be-8ccd-48c5-8b1b-fb17f287acea;11296;2020-06-23 11:29:13,262259;4.9716572761535645;4.96875;(-0.8456140220227049, -3);GradientBoostingClassifier(SelectFwe(Nystroem(data, Nystroem.gamma=0.6000000000000001, Nystroem.kernel='poly', Nystroem.n_components=5), SelectFwe.alpha=0.033, SelectFwe.score_func=f_classif), GradientBoostingClassifier.learning_rate=0.1, GradientBoostingClassifier.max_depth=2, GradientBoostingClassifier.max_features=0.8, GradientBoostingClassifier.min_samples_leaf=8, GradientBoostingClassifier.min_samples_split=3, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.3);None;;;new -ab3eac42-069e-4bf5-bf77-56c6319b47fb;11296;2020-06-23 11:29:18,477569;0.12957167625427246;0.125;(-1.5786095406709593, -2);MultinomialNB(SelectFwe(data, SelectFwe.alpha=0.01, SelectFwe.score_func=f_classif), alpha=100.0, fit_prior=True);None;4af27863-a8bf-4e4e-8dd4-502861172f53;;mut_insert -ba6ad5c6-e0cb-4246-93c9-968c0298412a;11296;2020-06-23 11:29:18,690142;0.12882161140441895;0.125;(-0.7227278612135112, -2);BernoulliNB(SelectFwe(data, SelectFwe.alpha=0.013000000000000001, SelectFwe.score_func=f_classif), alpha=0.1, fit_prior=True);None;acfab73c-619b-44e3-a2d9-6c11646cf248;;mut_replace_primitive +3e6842be-8ccd-48c5-8b1b-fb17f287acea;11296;2020-06-23 11:29:13,262259;4.9716572761535645;4.96875;(-0.8456140220227049, -3);GradientBoostingClassifier(SelectFwe(Nystroem(data, Nystroem.gamma=0.6000000000000001, Nystroem.kernel='poly', Nystroem.n_components=5), SelectFwe.alpha=0.033), GradientBoostingClassifier.learning_rate=0.1, GradientBoostingClassifier.max_depth=2, GradientBoostingClassifier.max_features=0.8, GradientBoostingClassifier.min_samples_leaf=8, GradientBoostingClassifier.min_samples_split=3, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.3);None;;;new +ab3eac42-069e-4bf5-bf77-56c6319b47fb;11296;2020-06-23 11:29:18,477569;0.12957167625427246;0.125;(-1.5786095406709593, -2);MultinomialNB(SelectFwe(data, SelectFwe.alpha=0.01), alpha=100.0, fit_prior=True);None;4af27863-a8bf-4e4e-8dd4-502861172f53;;mut_insert +ba6ad5c6-e0cb-4246-93c9-968c0298412a;11296;2020-06-23 11:29:18,690142;0.12882161140441895;0.125;(-0.7227278612135112, -2);BernoulliNB(SelectFwe(data, SelectFwe.alpha=0.013000000000000001), alpha=0.1, 
fit_prior=True);None;acfab73c-619b-44e3-a2d9-6c11646cf248;;mut_replace_primitive bd0b7073-69a8-4908-bf8c-0bb184ac7a93;11296;2020-06-23 11:29:18,820993;3.414860725402832;3.421875;(-2.3026090965822363, -2);GradientBoostingClassifier(Nystroem(data, Nystroem.gamma=0.15000000000000002, Nystroem.kernel='sigmoid', Nystroem.n_components=6), GradientBoostingClassifier.learning_rate=0.1, GradientBoostingClassifier.max_depth=9, GradientBoostingClassifier.max_features=0.9500000000000001, GradientBoostingClassifier.min_samples_leaf=9, GradientBoostingClassifier.min_samples_split=12, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.8);None;f297beec-e504-40d0-bfa1-5f2de9fda475;;mut_replace_primitive 333b65af-dabb-4866-a984-948a971d20e0;11296;2020-06-23 11:29:22,578399;0.10260462760925293;0.109375;(-2.302585092994045, -2);MultinomialNB(Nystroem(data, Nystroem.gamma=0.05, Nystroem.kernel='sigmoid', Nystroem.n_components=1), alpha=0.001, fit_prior=False);None;25f62463-7019-4d08-83eb-b358eba6379b;c0933bda-9302-45d7-b955-8bb5da6b1a5e;cx 97af8a2b-0c9b-4b3a-bc2a-7eb17b94f3ce;11296;2020-06-23 11:29:22,691056;0.1619114875793457;0.15625;(-2.192129193297742, -2);DecisionTreeClassifier(FeatureAgglomeration(data, FeatureAgglomeration.affinity='manhattan', FeatureAgglomeration.linkage='complete'), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=1, min_samples_leaf=20, min_samples_split=16);None;25f62463-7019-4d08-83eb-b358eba6379b;;mut_replace_primitive @@ -85,7 +85,7 @@ f216023f-3715-4f46-a813-d54c3552eab7;11296;2020-06-23 11:29:30,001792;6.22617888 7be43d2c-d3b1-4df3-acfd-99e8ca1eae94;15332;2020-06-23 11:29:34,295898;2.5007877349853516;2.5;(-0.36775019985491075, -2);RandomForestClassifier(Normalizer(data, Normalizer.norm='max'), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None;a74661ca-5cc7-48ff-b344-e67c95e65cf2;;mut_replace_primitive cd09ec41-f0a0-4a47-932d-e1fe2192e05a;11296;2020-06-23 11:29:36,350332;0.8501160144805908;1.0625;(-0.7550461398497643, -3);KNeighborsClassifier(MaxAbsScaler(FastICA(data, FastICA.tol=1.0)), KNeighborsClassifier.n_neighbors=16, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None;3a988c08-b951-4bd5-ad9d-b387e98f16c8;;mut_replace_primitive 6a25fe90-1b6c-4d46-8539-8a528d57435c;15332;2020-06-23 11:29:36,867509;0.5889654159545898;0.59375;(-2.757675360428376, -2);KNeighborsClassifier(RBFSampler(data, RBFSampler.gamma=0.35000000000000003), KNeighborsClassifier.n_neighbors=39, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='distance');None;855b69be-464f-4ca8-8842-e90849731bc0;;mut_insert -a2570a5d-7cd7-4530-ad1e-db3c146a4d17;15332;2020-06-23 11:29:37,476667;0.595221996307373;0.75;(-0.2304312362617506, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004, SelectFwe.score_func=f_classif), LogisticRegression.C=15.0, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;2f02d715-31e5-43f4-9b07-40c406279d6f;3a8f5755-e11e-4476-b1f9-ba8c24ec7f4a;cx +a2570a5d-7cd7-4530-ad1e-db3c146a4d17;15332;2020-06-23 11:29:37,476667;0.595221996307373;0.75;(-0.2304312362617506, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004), LogisticRegression.C=15.0, LogisticRegression.dual=False, LogisticRegression.penalty='l2', 
LogisticRegression.solver='lbfgs');None;2f02d715-31e5-43f4-9b07-40c406279d6f;3a8f5755-e11e-4476-b1f9-ba8c24ec7f4a;cx fa23d921-efa5-47ae-98a8-2d7fff027c70;15332;2020-06-23 11:29:38,071889;0.13453054428100586;0.140625;(-2.874855901041313, -1);DecisionTreeClassifier(data, DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=10, min_samples_leaf=12, min_samples_split=9);None;855b69be-464f-4ca8-8842-e90849731bc0;;mut_replace_primitive 18c3f236-9e8e-4b47-a849-0973c210beee;15332;2020-06-23 11:29:38,214445;0.6050162315368652;1.0;(-0.28422204432606374, -1);LogisticRegression(data, LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None;2f02d715-31e5-43f4-9b07-40c406279d6f;;mut_shrink -e86f503d-e2cc-42be-9b50-a41427ea8dbb;15332;2020-06-23 11:29:38,819461;0.12485957145690918;0.125;(-3.9167388722110656, -2);GaussianNB(SelectFwe(data, SelectFwe.alpha=0.036000000000000004, SelectFwe.score_func=f_classif));None;d85b366f-08df-437b-bd86-a66c32dedf09;2f02d715-31e5-43f4-9b07-40c406279d6f;cx +e86f503d-e2cc-42be-9b50-a41427ea8dbb;15332;2020-06-23 11:29:38,819461;0.12485957145690918;0.125;(-3.9167388722110656, -2);GaussianNB(SelectFwe(data, SelectFwe.alpha=0.036000000000000004));None;d85b366f-08df-437b-bd86-a66c32dedf09;2f02d715-31e5-43f4-9b07-40c406279d6f;cx diff --git a/tests/data/RandomSearch/evaluations.log b/tests/data/RandomSearch/evaluations.log index 54a08467..7cbcb025 100644 --- a/tests/data/RandomSearch/evaluations.log +++ b/tests/data/RandomSearch/evaluations.log @@ -14,8 +14,8 @@ c87d1af8-d72f-4924-80d4-88847f3af143;17376;2020-06-23 11:37:29,231821;1.14003682 d3e9e4dd-eb14-47d8-92ca-7d8cbf3e63a0;4436;2020-06-23 11:37:31,780371;0.12316179275512695;0.125;(-9.75947539358216, -1);GaussianNB(data);None 316c6ee0-9732-4c00-80f1-78bb6c498e87;4436;2020-06-23 11:37:31,906526;0.2867724895477295;1.765625;(-inf, -3);MultinomialNB(FeatureAgglomeration(PCA(data, PCA.iterated_power=10, PCA.svd_solver='randomized'), FeatureAgglomeration.affinity='l1', FeatureAgglomeration.linkage='ward'), alpha=100.0, fit_prior=False);l1 was provided as affinity. Ward can only work with euclidean distances. 
4f1269a2-1a91-4f01-b9e9-e3173266dc18;17376;2020-06-23 11:37:31,164957;1.1394340991973877;1.140625;(-0.4656039309759262, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.15000000000000002, min_samples_leaf=5, min_samples_split=6, ExtraTreesClassifier.n_estimators=100);None -78e7ab64-fc68-407c-8327-92ba8d089635;17376;2020-06-23 11:37:32,487358;1.6162452697753906;1.609375;(-0.3869096541872462, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22, SelectPercentile.score_func=f_classif), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None -0b4547b6-cc75-4bc6-a648-ccd935b48e05;17376;2020-06-23 11:37:34,225728;0.6055412292480469;0.71875;(-0.2842039177186586, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004, SelectFwe.score_func=f_classif), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None +78e7ab64-fc68-407c-8327-92ba8d089635;17376;2020-06-23 11:37:32,487358;1.6162452697753906;1.609375;(-0.3869096541872462, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=22), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.45, RandomForestClassifier.min_samples_leaf=4, RandomForestClassifier.min_samples_split=9, RandomForestClassifier.n_estimators=100);None +0b4547b6-cc75-4bc6-a648-ccd935b48e05;17376;2020-06-23 11:37:34,225728;0.6055412292480469;0.71875;(-0.2842039177186586, -2);LogisticRegression(SelectFwe(data, SelectFwe.alpha=0.036000000000000004), LogisticRegression.C=0.001, LogisticRegression.dual=False, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');None 18293c9e-8b78-4379-819b-f4c1922141fc;17376;2020-06-23 11:37:34,834272;0.5966954231262207;0.59375;(-0.2610497596267842, -1);KNeighborsClassifier(data, KNeighborsClassifier.n_neighbors=41, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None 8981a517-e827-4bfc-8835-021c673f744f;4436;2020-06-23 11:37:32,194291;3.576796531677246;3.734375;(-0.9405601791681895, -3);ExtraTreesClassifier(MaxAbsScaler(FastICA(data, FastICA.tol=1.0)), ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.6000000000000001, min_samples_leaf=3, min_samples_split=12, ExtraTreesClassifier.n_estimators=100);None 7973c2c5-ff0a-44b6-867d-1359c56d3d14;17376;2020-06-23 11:37:35,475819;6.002191066741943;9.375;(-inf, -2);KNeighborsClassifier(FastICA(data, FastICA.tol=0.0), KNeighborsClassifier.n_neighbors=24, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform'); @@ -26,8 +26,8 @@ cdf6c1c4-e030-4be2-bf16-929000fdba71;17376;2020-06-23 11:37:41,479020;1.06334972 3b36b7c9-31f2-4691-be93-17738c50432f;17376;2020-06-23 11:37:42,635453;0.11999154090881348;0.125;(-0.6182026836322747, -1);BernoulliNB(data, alpha=100.0, fit_prior=False);None ecc328d8-dad9-4f3b-a470-18641aa0612d;17376;2020-06-23 11:37:42,758447;0.6054596900939941;2.484375;(-2.579317437977081, -3);GaussianNB(PCA(MinMaxScaler(data), PCA.iterated_power=5, PCA.svd_solver='randomized'));None 20927788-b7cf-401d-b227-91348ea9cb31;17376;2020-06-23 11:37:43,366910;0.05504941940307617;0.046875;(-inf, 
-3);ExtraTreesClassifier(FeatureAgglomeration(PolynomialFeatures(data, PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), FeatureAgglomeration.affinity='l2', FeatureAgglomeration.linkage='ward'), ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.7500000000000001, min_samples_leaf=3, min_samples_split=12, ExtraTreesClassifier.n_estimators=100);l2 was provided as affinity. Ward can only work with euclidean distances. -fb2cca07-0303-4ac9-bfa9-21480ba65be5;17376;2020-06-23 11:37:43,422961;0.16715216636657715;0.171875;(-2.224249850201862, -3);DecisionTreeClassifier(SelectFwe(StandardScaler(data), SelectFwe.alpha=0.041, SelectFwe.score_func=f_classif), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=5, min_samples_leaf=13, min_samples_split=15);None -28af84ac-c08a-40c7-95ad-b4dd5ecdecf9;17376;2020-06-23 11:37:43,592115;2.5859811305999756;2.59375;(-0.3760526224791315, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002, SelectFwe.score_func=f_classif), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None +fb2cca07-0303-4ac9-bfa9-21480ba65be5;17376;2020-06-23 11:37:43,422961;0.16715216636657715;0.171875;(-2.224249850201862, -3);DecisionTreeClassifier(SelectFwe(StandardScaler(data), SelectFwe.alpha=0.041), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=5, min_samples_leaf=13, min_samples_split=15);None +28af84ac-c08a-40c7-95ad-b4dd5ecdecf9;17376;2020-06-23 11:37:43,592115;2.5859811305999756;2.59375;(-0.3760526224791315, -3);RandomForestClassifier(VarianceThreshold(SelectFwe(data, SelectFwe.alpha=0.026000000000000002), VarianceThreshold.threshold=0.9500000000000001), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.4, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=16, RandomForestClassifier.n_estimators=100);None 4568c944-c4fb-42b7-9198-fc471a45b018;17376;2020-06-23 11:37:46,299216;0.020008325576782227;0.015625;(-inf, -2);MultinomialNB(RBFSampler(data, RBFSampler.gamma=0.7000000000000001), alpha=100.0, fit_prior=True);Negative values in data passed to MultinomialNB (input X) d32ac816-234c-465b-b78a-4b628fe6d7e0;17376;2020-06-23 11:37:46,322228;1.7080767154693604;1.703125;(-0.5831894678623234, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.8500000000000001, min_samples_leaf=16, min_samples_split=19, ExtraTreesClassifier.n_estimators=100);None 4ac48aec-dc85-4145-9c48-ddf56ab46111;4436;2020-06-23 11:37:42,094425;6.001939535140991;8.3125;(-inf, -3);GradientBoostingClassifier(FastICA(Binarizer(data, Binarizer.threshold=0.25), FastICA.tol=0.7000000000000001), GradientBoostingClassifier.learning_rate=0.01, GradientBoostingClassifier.max_depth=1, GradientBoostingClassifier.max_features=0.3, GradientBoostingClassifier.min_samples_leaf=6, GradientBoostingClassifier.min_samples_split=12, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.2); @@ -39,27 +39,27 @@ 
e10ae017-3183-47c0-9506-2db18e2b30dd;17376;2020-06-23 11:37:49,295382;0.12100100 4fc87032-f84b-4439-a2cd-c108fc2dcb9c;17376;2020-06-23 11:37:49,419376;0.14009952545166016;0.140625;(-1.8425345071690191, -3);DecisionTreeClassifier(StandardScaler(StandardScaler(data)), DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=3, min_samples_leaf=9, min_samples_split=2);None a4e9ded8-f0a4-4cfb-a214-ca7f2c6df0bc;17376;2020-06-23 11:37:49,561468;2.6484246253967285;2.65625;(-0.31526854209750077, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.9500000000000001, min_samples_leaf=2, min_samples_split=17, ExtraTreesClassifier.n_estimators=100);None ad128a9f-32e7-4d95-b692-64d1ea9ed56e;17376;2020-06-23 11:37:52,339020;0.055051565170288086;0.0625;(-inf, -3);MultinomialNB(PolynomialFeatures(RBFSampler(data, RBFSampler.gamma=0.2), PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), alpha=0.1, fit_prior=True);Negative values in data passed to MultinomialNB (input X) -08f19f62-fbed-43ea-856d-bd37c4d4f870;17376;2020-06-23 11:37:52,396062;0.1831672191619873;0.171875;(-2.993647904692584, -2);DecisionTreeClassifier(SelectPercentile(data, SelectPercentile.percentile=80, SelectPercentile.score_func=f_classif), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=6, min_samples_leaf=3, min_samples_split=12);None +08f19f62-fbed-43ea-856d-bd37c4d4f870;17376;2020-06-23 11:37:52,396062;0.1831672191619873;0.171875;(-2.993647904692584, -2);DecisionTreeClassifier(SelectPercentile(data, SelectPercentile.percentile=80), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=6, min_samples_leaf=3, min_samples_split=12);None 4a34f5c5-5b05-4c4e-a210-aa71ccbe4b3a;17376;2020-06-23 11:37:52,581232;1.833895206451416;1.828125;(-0.49587180513877654, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=True, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.9500000000000001, min_samples_leaf=10, min_samples_split=13, ExtraTreesClassifier.n_estimators=100);None -6d466479-5b09-4dee-b860-fb1184353101;4436;2020-06-23 11:37:48,749974;6.003244638442993;6.015625;(-inf, -2);GradientBoostingClassifier(SelectPercentile(data, SelectPercentile.percentile=15, SelectPercentile.score_func=f_classif), GradientBoostingClassifier.learning_rate=0.001, GradientBoostingClassifier.max_depth=8, GradientBoostingClassifier.max_features=0.6500000000000001, GradientBoostingClassifier.min_samples_leaf=3, GradientBoostingClassifier.min_samples_split=2, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.7500000000000001); +6d466479-5b09-4dee-b860-fb1184353101;4436;2020-06-23 11:37:48,749974;6.003244638442993;6.015625;(-inf, -2);GradientBoostingClassifier(SelectPercentile(data, SelectPercentile.percentile=15), GradientBoostingClassifier.learning_rate=0.001, GradientBoostingClassifier.max_depth=8, GradientBoostingClassifier.max_features=0.6500000000000001, GradientBoostingClassifier.min_samples_leaf=3, GradientBoostingClassifier.min_samples_split=2, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.7500000000000001); f3ab2c73-57fc-4bdc-9d88-b43be1e9032e;4436;2020-06-23 11:37:54,754210;0.652604341506958;0.640625;(-0.1749001012829964, -2);KNeighborsClassifier(Normalizer(data, Normalizer.norm='l2'), KNeighborsClassifier.n_neighbors=7, 
KNeighborsClassifier.p=2, KNeighborsClassifier.weights='distance');None c13bfe8b-dd8b-4041-b0e8-0202bc35bf5b;4436;2020-06-23 11:37:55,449853;0.11909890174865723;0.125;(-0.7812520494792912, -1);BernoulliNB(data, alpha=0.001, fit_prior=False);None e02fe7ca-4192-479c-9966-f4659c75eabd;4436;2020-06-23 11:37:55,570954;0.015013694763183594;0.015625;(-inf, -2);LogisticRegression(Binarizer(data, Binarizer.threshold=0.9), LogisticRegression.C=0.5, LogisticRegression.dual=True, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');Solver lbfgs supports only dual=False, got dual=True 357d0f1b-cd36-4720-9700-ef8a26933c56;4436;2020-06-23 11:37:55,586978;0.33331799507141113;0.328125;(-0.22762732629524313, -2);KNeighborsClassifier(VarianceThreshold(data, VarianceThreshold.threshold=0.4), KNeighborsClassifier.n_neighbors=5, KNeighborsClassifier.p=1, KNeighborsClassifier.weights='uniform');None 9e0d922f-1391-4147-b386-844466d70352;4436;2020-06-23 11:37:55,941315;0.10909080505371094;0.109375;(-1.912858407938557, -1);MultinomialNB(data, alpha=0.001, fit_prior=True);None dbc8656f-8e74-4572-9e81-0ee5aa3f305e;4436;2020-06-23 11:37:56,052411;0.17516541481018066;0.171875;(-2.0695848226531206, -2);GaussianNB(FeatureAgglomeration(data, FeatureAgglomeration.affinity='l2', FeatureAgglomeration.linkage='complete'));None -59e0b3c0-6c89-4ebc-9896-fa2968a73c6c;17376;2020-06-23 11:37:54,472963;5.07777214050293;5.078125;(-0.8395738815712319, -3);GradientBoostingClassifier(SelectFwe(Nystroem(data, Nystroem.gamma=0.6000000000000001, Nystroem.kernel='poly', Nystroem.n_components=5), SelectFwe.alpha=0.033, SelectFwe.score_func=f_classif), GradientBoostingClassifier.learning_rate=0.1, GradientBoostingClassifier.max_depth=2, GradientBoostingClassifier.max_features=0.8, GradientBoostingClassifier.min_samples_leaf=8, GradientBoostingClassifier.min_samples_split=3, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.3);None +59e0b3c0-6c89-4ebc-9896-fa2968a73c6c;17376;2020-06-23 11:37:54,472963;5.07777214050293;5.078125;(-0.8395738815712319, -3);GradientBoostingClassifier(SelectFwe(Nystroem(data, Nystroem.gamma=0.6000000000000001, Nystroem.kernel='poly', Nystroem.n_components=5), SelectFwe.alpha=0.033), GradientBoostingClassifier.learning_rate=0.1, GradientBoostingClassifier.max_depth=2, GradientBoostingClassifier.max_features=0.8, GradientBoostingClassifier.min_samples_leaf=8, GradientBoostingClassifier.min_samples_split=3, GradientBoostingClassifier.n_estimators=100, GradientBoostingClassifier.subsample=0.3);None 5eaf6856-fcc0-456e-bbd1-d87354741e79;17376;2020-06-23 11:37:59,801973;0.11626839637756348;0.109375;(-1.9237242964139376, -1);DecisionTreeClassifier(data, DecisionTreeClassifier.criterion='gini', DecisionTreeClassifier.max_depth=2, min_samples_leaf=1, min_samples_split=18);None c96f28e8-9157-49f1-8c27-d75e43d4c04a;4436;2020-06-23 11:37:56,228577;3.762730121612549;3.765625;(-0.3777479933694315, -2);RandomForestClassifier(MinMaxScaler(data), RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='gini', RandomForestClassifier.max_features=0.6000000000000001, RandomForestClassifier.min_samples_leaf=8, RandomForestClassifier.min_samples_split=10, RandomForestClassifier.n_estimators=100);None -372495d9-c4ca-461b-a502-73cadcbc5e8c;17376;2020-06-23 11:38:00,023327;2.4409291744232178;2.4375;(-0.42025874652083645, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=28, SelectPercentile.score_func=f_classif), 
RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='entropy', RandomForestClassifier.max_features=0.6500000000000001, RandomForestClassifier.min_samples_leaf=9, RandomForestClassifier.min_samples_split=8, RandomForestClassifier.n_estimators=100);None +372495d9-c4ca-461b-a502-73cadcbc5e8c;17376;2020-06-23 11:38:00,023327;2.4409291744232178;2.4375;(-0.42025874652083645, -2);RandomForestClassifier(SelectPercentile(data, SelectPercentile.percentile=28), RandomForestClassifier.bootstrap=True, RandomForestClassifier.criterion='entropy', RandomForestClassifier.max_features=0.6500000000000001, RandomForestClassifier.min_samples_leaf=9, RandomForestClassifier.min_samples_split=8, RandomForestClassifier.n_estimators=100);None c185c238-0183-47c2-bd02-c3088f0da366;17376;2020-06-23 11:38:02,523214;1.5693516731262207;10.140625;(-0.5943472369578043, -3);BernoulliNB(Binarizer(PCA(data, PCA.iterated_power=9, PCA.svd_solver='randomized'), Binarizer.threshold=0.65), alpha=0.01, fit_prior=False);None c02075fd-69bd-4717-8f02-cc45cb20f242;4436;2020-06-23 11:38:00,061361;4.123298168182373;4.125;(-0.45159053059345994, -1);RandomForestClassifier(data, RandomForestClassifier.bootstrap=False, RandomForestClassifier.criterion='entropy', RandomForestClassifier.max_features=0.6000000000000001, RandomForestClassifier.min_samples_leaf=18, RandomForestClassifier.min_samples_split=6, RandomForestClassifier.n_estimators=100);None -ad1d304f-3ea9-4367-a7c5-2980b9b327c0;4436;2020-06-23 11:38:04,239699;0.13112950325012207;0.125;(-2.0278122873887474, -2);DecisionTreeClassifier(SelectFwe(data, SelectFwe.alpha=0.016, SelectFwe.score_func=f_classif), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=1, min_samples_leaf=17, min_samples_split=6);None +ad1d304f-3ea9-4367-a7c5-2980b9b327c0;4436;2020-06-23 11:38:04,239699;0.13112950325012207;0.125;(-2.0278122873887474, -2);DecisionTreeClassifier(SelectFwe(data, SelectFwe.alpha=0.016), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=1, min_samples_leaf=17, min_samples_split=6);None b7791dd0-1c8f-4dbb-839e-093538d46132;17376;2020-06-23 11:38:04,096578;6.0024573802948;8.4375;(-inf, -3);DecisionTreeClassifier(FastICA(StandardScaler(data), FastICA.tol=0.35000000000000003), DecisionTreeClassifier.criterion='entropy', DecisionTreeClassifier.max_depth=8, min_samples_leaf=7, min_samples_split=19); 25071945-38e9-4509-a5bd-ecd9bd328aa0;17376;2020-06-23 11:38:10,101038;0.11623954772949219;0.109375;(-0.7250118551650611, -1);BernoulliNB(data, alpha=0.1, fit_prior=True);None 2f21e0ac-5192-4205-bf04-a029af60b353;17376;2020-06-23 11:38:10,219146;0.12323427200317383;0.125;(-0.725480653467304, -1);BernoulliNB(data, alpha=0.1, fit_prior=False);None 941f2710-1f46-4a68-afb2-16d732b01fff;4436;2020-06-23 11:38:04,374823;6.040501356124878;6.015625;(-inf, -2);ExtraTreesClassifier(PolynomialFeatures(data, PolynomialFeatures.degree=2, PolynomialFeatures.include_bias=False, PolynomialFeatures.interaction_only=False), ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='gini', ExtraTreesClassifier.max_features=0.15000000000000002, min_samples_leaf=17, min_samples_split=3, ExtraTreesClassifier.n_estimators=100); -5a4432cf-ed7b-4cc2-8d13-acd44a91f33c;4436;2020-06-23 11:38:10,416325;0.1341235637664795;0.140625;(-1.5135121684745734, -3);GaussianNB(SelectPercentile(VarianceThreshold(data, VarianceThreshold.threshold=0.7000000000000001), SelectPercentile.percentile=77, SelectPercentile.score_func=f_classif));None 
+5a4432cf-ed7b-4cc2-8d13-acd44a91f33c;4436;2020-06-23 11:38:10,416325;0.1341235637664795;0.140625;(-1.5135121684745734, -3);GaussianNB(SelectPercentile(VarianceThreshold(data, VarianceThreshold.threshold=0.7000000000000001), SelectPercentile.percentile=77));None
 7db5196b-f557-4a46-aa47-6bcf05c7604c;4436;2020-06-23 11:38:10,552449;1.7606029510498047;1.75;(-0.5498547616120083, -1);ExtraTreesClassifier(data, ExtraTreesClassifier.bootstrap=False, ExtraTreesClassifier.criterion='entropy', ExtraTreesClassifier.max_features=0.6500000000000001, min_samples_leaf=20, min_samples_split=20, ExtraTreesClassifier.n_estimators=100);None
 2d97a0b6-811d-4520-8612-e6cf1a037fe7;4436;2020-06-23 11:38:12,371104;0.016015291213989258;0.015625;(-inf, -1);LogisticRegression(data, LogisticRegression.C=0.01, LogisticRegression.dual=True, LogisticRegression.penalty='l2', LogisticRegression.solver='lbfgs');Solver lbfgs supports only dual=False, got dual=True
 a42dbc5d-9e66-4804-9bd1-35df0706e5e0;4436;2020-06-23 11:38:12,390123;0.12411284446716309;0.125;(-0.725480653467304, -1);BernoulliNB(data, alpha=0.1, fit_prior=False);None
diff --git a/tests/system/test_gamaclassifier.py b/tests/system/test_gamaclassifier.py
index b77d2b56..fb88c502 100644
--- a/tests/system/test_gamaclassifier.py
+++ b/tests/system/test_gamaclassifier.py
@@ -10,6 +10,9 @@
 from sklearn.metrics import accuracy_score, log_loss
 from sklearn.pipeline import Pipeline
 
+from gama.configuration.configuration_task_test import ClassifierConfigTest
+from gama.configuration.testconfiguration import config_space
+import ConfigSpace as cs
 from gama.postprocessing import EnsemblePostProcessing
 from gama.search_methods import AsynchronousSuccessiveHalving, AsyncEA, RandomSearch
 from gama.search_methods.base_search import BaseSearch
@@ -237,3 +240,34 @@ def test_missing_value_classification_arff():
 
 def test_missing_value_classification():
     """Binary classification, log loss (probabilities), missing values."""
     _test_dataset_problem(breast_cancer_missing, "neg_log_loss", missing_values=True)
+
+
+def test_wrong_meta_estimators_config_space_gc():
+    """Meta with wrong estimators"""
+    with pytest.raises(ValueError):
+        config_space.meta = {
+            # "gama_system_name": "current_configuration_name",
+            "dummy": "dummy",
+        }
+        GamaClassifier(
+            search_space=config_space,
+        )
+
+
+def test_wrong_meta_preprocessors_config_space_gc():
+    """Meta with wrong preprocessors"""
+    with pytest.raises(ValueError):
+        dummy_config_space = cs.ConfigurationSpace(
+            meta={
+                # "gama_system_name": "current_configuration_name",
+                "estimators": "classifiers",
+                "preprocessors": "dummy",
+            }
+        )
+
+        dummy_classifier_config = ClassifierConfigTest(dummy_config_space)
+        dummy_classifier_config.setup_classifiers()
+
+        GamaClassifier(
+            search_space=dummy_config_space,
+        )
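The two tests above rely on GAMA validating the search space's meta mapping up front. A minimal sketch of that contract (names taken from the tests; not part of the patch):

    import ConfigSpace as cs
    from gama import GamaClassifier

    # A space whose meta lacks the expected "estimators"/"preprocessors" keys is
    # assumed to be rejected with a ValueError before any search starts.
    bad_space = cs.ConfigurationSpace(meta={"dummy": "dummy"})
    try:
        GamaClassifier(search_space=bad_space)
    except ValueError as err:
        print("rejected:", err)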
diff --git a/tests/system/test_gamaregressor.py b/tests/system/test_gamaregressor.py
index 985178c8..3b0d1995 100644
--- a/tests/system/test_gamaregressor.py
+++ b/tests/system/test_gamaregressor.py
@@ -4,10 +4,15 @@
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error
 
+from gama.configuration.regression_task import RegressorConfig
+from gama.configuration.testconfiguration import config_space
+import ConfigSpace as cs
 from gama.postprocessing import EnsemblePostProcessing
 from gama.utilities.generic.stopwatch import Stopwatch
 from gama import GamaRegressor
 
+import pytest
+
 FIT_TIME_MARGIN = 1.1
 TOTAL_TIME_S = 60
 
@@ -74,3 +79,34 @@ def test_missing_value_regression():
         store="nothing",
     )
     _test_gama_regressor(gama, X_train, X_test, y_train, y_test, data, metric)
+
+
+def test_wrong_meta_estimators_config_space_gr():
+    """Meta with wrong estimators"""
+    with pytest.raises(ValueError):
+        config_space.meta = {
+            # "gama_system_name": "current_configuration_name",
+            "dummy": "dummy",
+        }
+        GamaRegressor(
+            search_space=config_space,
+        )
+
+
+def test_wrong_meta_preprocessors_config_space_gr():
+    """Meta with wrong preprocessors"""
+    with pytest.raises(ValueError):
+        dummy_config_space = cs.ConfigurationSpace(
+            meta={
+                # "gama_system_name": "current_configuration_name",
+                "estimators": "regressors",
+                "preprocessors": "dummy",
+            }
+        )
+
+        dummy_regressor_config = RegressorConfig(dummy_config_space)
+        dummy_regressor_config.setup_regressors()
+
+        GamaRegressor(
+            search_space=dummy_config_space,
+        )
diff --git a/tests/unit/test_configuration_parser.py b/tests/unit/test_configuration_parser.py
index 5618be54..2245788a 100644
--- a/tests/unit/test_configuration_parser.py
+++ b/tests/unit/test_configuration_parser.py
@@ -1,18 +1,31 @@
-from sklearn.naive_bayes import BernoulliNB, GaussianNB
+from gama.utilities.config_space import merge_configurations
 
-from gama.configuration.parser import merge_configurations
+from gama.configuration.testconfiguration import (
+    config_space as classification_config_space,
+)
+from gama.configuration.regression import config_space as regression_config_space
 
 
 def test_merge_configuration():
     """Test merging two simple configurations works as expected."""
-    one = {"alpha": [0, 1], BernoulliNB: {"fit_prior": [True, False]}}
-    two = {"alpha": [0, 2], GaussianNB: {"fit_prior": [True, False]}}
-    expected_merged = {
-        "alpha": [0, 1, 2],
-        GaussianNB: {"fit_prior": [True, False]},
-        BernoulliNB: {"fit_prior": [True, False]},
-    }
+    test_classification_config = classification_config_space
+    test_regression_config = regression_config_space
 
-    actual_merged = merge_configurations(one, two)
-    assert expected_merged == actual_merged
+    prefix = "merged"
+    delimiter = "_"
+
+    merged_config = merge_configurations(
+        test_classification_config,
+        test_regression_config,
+        prefix=prefix,
+        delimiter=delimiter,
+    )
+
+    assert (
+        test_classification_config.meta["estimators"]
+        in merged_config.get_hyperparameters_dict()
+    )
+    assert (
+        prefix + delimiter + test_regression_config.meta["estimators"]
+    ) in merged_config.get_hyperparameters_dict()
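The prefixing checked above exists to avoid name clashes when two spaces define hyperparameters with the same name. A hedged sketch of the resulting layout (module paths as imported in the test; the "regressors" key follows the regression config's meta):

    from gama.utilities.config_space import merge_configurations
    from gama.configuration.testconfiguration import config_space as clf_space
    from gama.configuration.regression import config_space as reg_space

    merged = merge_configurations(clf_space, reg_space, prefix="merged", delimiter="_")

    # The first space keeps its names; the second is re-registered under the prefix.
    hp = merged.get_hyperparameters_dict()
    assert clf_space.meta["estimators"] in hp               # e.g. "classifiers"
    assert "merged_" + reg_space.meta["estimators"] in hp   # e.g. "merged_regressors"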
diff --git a/tests/unit/test_ea_mutation.py b/tests/unit/test_ea_mutation.py
index 5440bfd8..8589094f 100644
--- a/tests/unit/test_ea_mutation.py
+++ b/tests/unit/test_ea_mutation.py
@@ -14,44 +14,50 @@
 from gama.genetic_programming.compilers.scikitlearn import compile_individual
 
 
-def test_mut_replace_terminal(ForestPipeline, pset):
+def test_mut_replace_terminal(ForestPipeline, config_space):
     """Tests if mut_replace_terminal replaces exactly one terminal."""
     _test_mutation(
         ForestPipeline,
         mut_replace_terminal,
         _mut_replace_terminal_is_applied,
-        pset,
+        config_space,
     )
 
 
-def test_mut_replace_terminal_none_available(GNB, pset):
+def test_mut_replace_terminal_none_available(GNB, config_space):
     """mut_replace_terminal raises an exception if no valid mutation is possible."""
     with pytest.raises(ValueError) as error:
-        mut_replace_terminal(GNB, pset)
+        mut_replace_terminal(GNB, config_space)
 
     assert "Individual has no terminals suitable for mutation." in str(error.value)
 
 
-def test_mut_replace_primitive_len_1(LinearSVC, pset):
+def test_mut_replace_primitive_len_1(LinearSVC, config_space):
     """mut_replace_primitive replaces exactly one primitive."""
     _test_mutation(
-        LinearSVC, mut_replace_primitive, _mut_replace_primitive_is_applied, pset
+        LinearSVC,
+        mut_replace_primitive,
+        _mut_replace_primitive_is_applied,
+        config_space,
     )
 
 
-def test_mut_replace_primitive_len_2(ForestPipeline, pset):
+def test_mut_replace_primitive_len_2(ForestPipeline, config_space):
     """mut_replace_primitive replaces exactly one primitive."""
     _test_mutation(
-        ForestPipeline, mut_replace_primitive, _mut_replace_primitive_is_applied, pset
+        ForestPipeline,
+        mut_replace_primitive,
+        _mut_replace_primitive_is_applied,
+        config_space,
     )
 
 
-def test_mut_insert(ForestPipeline, pset):
+def test_mut_insert(ForestPipeline, config_space):
     """mut_insert inserts at least one primitive."""
-    _test_mutation(ForestPipeline, mut_insert, _mut_insert_is_applied, pset)
+    _test_mutation(ForestPipeline, mut_insert, _mut_insert_is_applied, config_space)
 
 
-def test_random_valid_mutation_with_all(ForestPipeline, pset):
+def test_random_valid_mutation_with_all(ForestPipeline, config_space):
     """Test if a valid mutation is applied at random.
 
     I am honestly not sure of the best way to test this.
@@ -63,7 +69,7 @@ def test_random_valid_mutation_with_all(ForestPipeline, pset):
 
     for i in range(_min_trials(n_mutations=4)):
         ind_clone = ForestPipeline.copy_as_new()
-        random_valid_mutation_in_place(ind_clone, pset)
+        random_valid_mutation_in_place(ind_clone, config_space)
         if _mut_shrink_is_applied(ForestPipeline, ind_clone)[0]:
             applied_mutation["shrink"] += 1
         elif _mut_insert_is_applied(ForestPipeline, ind_clone)[0]:
@@ -78,7 +84,7 @@ def test_random_valid_mutation_with_all(ForestPipeline, pset):
     assert all([count > 0 for (mut, count) in applied_mutation.items()])
 
 
-def test_random_valid_mutation_without_shrink(LinearSVC, pset):
+def test_random_valid_mutation_without_shrink(LinearSVC, config_space):
     """Test if a valid mutation is applied at random.
 
     I am honestly not sure of the best way to test this.
@@ -90,7 +96,7 @@ def test_random_valid_mutation_without_shrink(LinearSVC, pset):
 
     for i in range(_min_trials(n_mutations=3)):
         ind_clone = LinearSVC.copy_as_new()
-        random_valid_mutation_in_place(ind_clone, pset)
+        random_valid_mutation_in_place(ind_clone, config_space)
         if _mut_insert_is_applied(LinearSVC, ind_clone)[0]:
             applied_mutation["insert"] += 1
         elif _mut_replace_terminal_is_applied(LinearSVC, ind_clone)[0]:
@@ -103,7 +109,7 @@ def test_random_valid_mutation_without_shrink(LinearSVC, pset):
     assert all([count > 0 for (mut, count) in applied_mutation.items()])
 
 
-def test_random_valid_mutation_without_terminal(GNB, pset):
+def test_random_valid_mutation_without_terminal(GNB, config_space):
     """Test if a valid mutation is applied at random.
 
     I am honestly not sure of the best way to test this.
@@ -116,7 +122,7 @@ def test_random_valid_mutation_without_terminal(GNB, pset):
 
     for i in range(_min_trials(n_mutations=2)):
         ind_clone = GNB.copy_as_new()
-        random_valid_mutation_in_place(ind_clone, pset)
+        random_valid_mutation_in_place(ind_clone, config_space)
         if _mut_insert_is_applied(GNB, ind_clone)[0]:
             applied_mutation["insert"] += 1
         elif _mut_replace_primitive_is_applied(GNB, ind_clone)[0]:
@@ -127,7 +133,7 @@ def test_random_valid_mutation_without_terminal(GNB, pset):
     assert all([count > 0 for (mut, count) in applied_mutation.items()])
 
 
-def test_random_valid_mutation_without_insert(ForestPipeline, pset):
+def test_random_valid_mutation_without_insert(ForestPipeline, config_space):
     """Test if a valid mutation is applied at random.
 
     I am honestly not sure of the best way to test this.
@@ -141,7 +147,7 @@ def test_random_valid_mutation_without_insert(ForestPipeline, pset):
 
     for i in range(_min_trials(n_mutations=3)):
         ind_clone = ForestPipeline.copy_as_new()
-        random_valid_mutation_in_place(ind_clone, pset, max_length=2)
+        random_valid_mutation_in_place(ind_clone, config_space, max_length=2)
         if _mut_shrink_is_applied(ForestPipeline, ind_clone)[0]:
             applied_mutation["shrink"] += 1
         elif _mut_replace_terminal_is_applied(ForestPipeline, ind_clone)[0]:
@@ -245,7 +251,7 @@ def _mut_replace_primitive_is_applied(original, mutated):
     return True, None
 
 
-def _test_mutation(individual: Individual, mutation, mutation_check, pset):
+def _test_mutation(individual: Individual, mutation, mutation_check, config_space):
     """Test if an individual mutated by `mutation` passes `mutation_check` and compiles.
 
     :param individual: The individual to be mutated.
@@ -255,10 +261,10 @@ def _test_mutation(individual: Individual, mutation, mutation_check, pset):
         see above functions.
     """
     ind_clone = individual.copy_as_new()
-    mutation(ind_clone, pset)
+    mutation(ind_clone, config_space)
 
     applied, message = mutation_check(individual, ind_clone)
     assert applied, message
 
     # Should be able to compile the individual, will raise an Exception if not.
-    compile_individual(ind_clone, pset)
+    compile_individual(ind_clone, config_space)

From 9dec917feef4e499fd8e33a28fb90abf85795e80 Mon Sep 17 00:00:00 2001
From: Provost Simon
Date: Mon, 4 Dec 2023 15:31:25 +0000
Subject: [PATCH 5/9] refactor(tests): update tests to be ConfigSpace compliant

---
 .../test_configuration_task/__init__.py      |   2 +
 .../test_configuration_task/classifiers.py   | 238 ++++++++++++++++++
 .../test_configuration_task/preprocessors.py | 190 ++++++++++++++
 3 files changed, 430 insertions(+)
 create mode 100644 gama/configuration/test_configuration_task/__init__.py
 create mode 100644 gama/configuration/test_configuration_task/classifiers.py
 create mode 100644 gama/configuration/test_configuration_task/preprocessors.py

diff --git a/gama/configuration/test_configuration_task/__init__.py b/gama/configuration/test_configuration_task/__init__.py
new file mode 100644
index 00000000..a7fdc527
--- /dev/null
+++ b/gama/configuration/test_configuration_task/__init__.py
@@ -0,0 +1,2 @@
+from .classifiers import TestClassifierConfig
+from .preprocessors import TestPreprocessorConfig
diff --git a/gama/configuration/test_configuration_task/classifiers.py b/gama/configuration/test_configuration_task/classifiers.py
new file mode 100644
index 00000000..b0a72f57
--- /dev/null
+++ b/gama/configuration/test_configuration_task/classifiers.py
@@ -0,0 +1,238 @@
+import ConfigSpace as cs
+import ConfigSpace.hyperparameters as csh
+
+
+class TestClassifierConfig:
+    def __init__(
+        self,
+        config_space: cs.ConfigurationSpace,
+    ):
+        if "estimators" not in config_space.meta:
+            raise ValueError("Expected 'estimators' key in meta of config_space")
+        self.config_space = config_space
+        self.classifiers_setup_map = {
+            "BernoulliNB": self.setup_bernoulliNB,
+            "MultinomialNB": self.setup_multinomialNB,
+            "GaussianNB": self.setup_gaussianNB,
+            "DecisionTreeClassifier": self.setup_decision_tree,
+            "ExtraTreesClassifier": self.setup_extra_trees,
+            "RandomForestClassifier": self.setup_random_forest,
+            "GradientBoostingClassifier": self.setup_gradient_boosting,
+            "KNeighborsClassifier": self.setup_k_neighbors,
+            "LinearSVC": self.setup_linear_svc,
+            "LogisticRegression": self.setup_logistic_regression,
+        }
+        self.cs_estimators_name = self.config_space.meta["estimators"]
+
+    @property
+    def shared_hyperparameters(self):
+        return {
+            "alpha": [1e-3, 1e-2, 1e-1, 1.0, 10.0, 100.0],
+            "fit_prior": [True, False],
+            "criterion": ["gini", "entropy"],
+            "max_depth": {"lower": 1, "upper": 11},
+            "min_samples_split": {"lower": 2, "upper": 21},
+            "min_samples_leaf": {"lower": 1, "upper": 21},
+            "max_features": {"lower": 0.05, "upper": 1.01, "default_value": 1.0},
+            "n_estimators": [100],
+            "bootstrap": [True, False],
+            "dual": [True, False],
+            "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
+        }
+
+    def setup_classifiers(self):
+        classifiers_choices = list(self.classifiers_setup_map.keys())
+
+        if not classifiers_choices:
+            raise ValueError("No classifiers to add to config space")
+
+        classifiers = csh.CategoricalHyperparameter(
+            name=self.cs_estimators_name,
+            choices=classifiers_choices,
+        )
+        self.config_space.add_hyperparameter(classifiers)
+
+        for classifier_name in classifiers_choices:
+            if setup_func := self.classifiers_setup_map.get(classifier_name):
+                setup_func(classifiers)
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, estimator_name: str
+    ):
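+        # Gather every Hyperparameter defined by the calling setup_* method via
+        # locals(), then condition each on `estimator_name` so it is only active
+        # when that classifier is the selected one.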
+        if "classifiers" not in local_vars or not isinstance(
+            local_vars["classifiers"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'classifiers' key with a "
+                "CategoricalHyperparameter in local_vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["classifiers"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(
+                hyperparameter, local_vars["classifiers"], estimator_name
+            )
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_bernoulliNB(self, classifiers: csh.CategoricalHyperparameter):
+        alpha_NB = csh.CategoricalHyperparameter(
+            "alpha__bernoulliNB", self.shared_hyperparameters["alpha"]
+        )
+        fit_prior = csh.CategoricalHyperparameter(
+            "fit_prior__bernoulliNB", self.shared_hyperparameters["fit_prior"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "BernoulliNB")
+
+    def setup_multinomialNB(self, classifiers: csh.CategoricalHyperparameter):
+        alpha_NB = csh.CategoricalHyperparameter(
+            "alpha__multinomialNB", self.shared_hyperparameters["alpha"]
+        )
+        fit_prior = csh.CategoricalHyperparameter(
+            "fit_prior__multinomialNB", self.shared_hyperparameters["fit_prior"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "MultinomialNB")
+
+    def setup_gaussianNB(self, classifiers: csh.CategoricalHyperparameter):
+        # GaussianNB has no hyperparameters
+        pass
+
+    def setup_decision_tree(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__decision_tree", self.shared_hyperparameters["criterion"]
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__decision_tree", **self.shared_hyperparameters["max_depth"]
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__decision_tree",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__decision_tree",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "DecisionTreeClassifier"
+        )
+
+    def setup_extra_trees(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__extra_trees", self.shared_hyperparameters["criterion"]
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__extra_trees", **self.shared_hyperparameters["max_depth"]
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split__extra_trees",
+            **self.shared_hyperparameters["min_samples_split"],
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf__extra_trees",
+            **self.shared_hyperparameters["min_samples_leaf"],
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__extra_trees", **self.shared_hyperparameters["max_features"]
+        )
+        n_estimators = csh.CategoricalHyperparameter(
+            "n_estimators__extra_trees", self.shared_hyperparameters["n_estimators"]
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap__extra_trees", self.shared_hyperparameters["bootstrap"]
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "ExtraTreesClassifier"
+        )
+
+    def setup_random_forest(self, classifiers: csh.CategoricalHyperparameter):
+        criterion = csh.CategoricalHyperparameter(
+            "criterion__random_forest", self.shared_hyperparameters["criterion"]
+        )
+        max_depth = csh.UniformIntegerHyperparameter(
+            "max_depth__random_forest",
+            **self.shared_hyperparameters["max_depth"]
+        )
+        min_samples_split = csh.UniformIntegerHyperparameter(
+            "min_samples_split", **self.shared_hyperparameters["min_samples_split"]
+        )
+        min_samples_leaf = csh.UniformIntegerHyperparameter(
+            "min_samples_leaf", **self.shared_hyperparameters["min_samples_leaf"]
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features", **self.shared_hyperparameters["max_features"]
+        )
+        n_estimators = csh.CategoricalHyperparameter(
+            "n_estimators__random_forest", self.shared_hyperparameters["n_estimators"]
+        )
+        bootstrap = csh.CategoricalHyperparameter(
+            "bootstrap", self.shared_hyperparameters["bootstrap"]
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "RandomForestClassifier"
+        )
+
+    def setup_gradient_boosting(self, classifiers: csh.CategoricalHyperparameter):
+        sub_sample = csh.CategoricalHyperparameter(
+            "subsample", [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
+        )
+        learning_rate = csh.CategoricalHyperparameter(
+            "learning_rate", [1e-3, 1e-2, 1e-1, 0.5, 1.0]
+        )
+        max_features = csh.UniformFloatHyperparameter(
+            "max_features__gradient_boosting",
+            **self.shared_hyperparameters["max_features"],
+        )
+        n_estimators = csh.CategoricalHyperparameter(
+            "n_estimators__gradient_boosting",
+            self.shared_hyperparameters["n_estimators"],
+        )
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "GradientBoostingClassifier"
+        )
+
+    def setup_k_neighbors(self, classifiers: csh.CategoricalHyperparameter):
+        n_neighbors = csh.UniformIntegerHyperparameter("n_neighbors", 1, 51)
+        weights = csh.CategoricalHyperparameter("weights", ["uniform", "distance"])
+        p = csh.UniformIntegerHyperparameter("p", 1, 2)
+        self._add_hyperparameters_and_equals_conditions(
+            locals(), "KNeighborsClassifier"
+        )
+
+    def setup_linear_svc(self, classifiers: csh.CategoricalHyperparameter):
+        loss = csh.CategoricalHyperparameter(
+            "loss__linear_svc", ["hinge", "squared_hinge"]
+        )
+        penalty = csh.CategoricalHyperparameter("penalty__linear_svc", ["l1", "l2"])
+        dual = csh.CategoricalHyperparameter(
+            "dual__svc", self.shared_hyperparameters["dual"]
+        )
+        tol = csh.CategoricalHyperparameter("tol__svc", [1e-5, 1e-4, 1e-3, 1e-2, 1e-1])
+        C = csh.CategoricalHyperparameter("C__svc", self.shared_hyperparameters["C"])
+        self._add_hyperparameters_and_equals_conditions(locals(), "LinearSVC")
+
+        # Forbidden clause: Penalty 'l1' cannot be used with loss 'hinge'
+        forbidden_penalty_loss = cs.ForbiddenAndConjunction(
+            cs.ForbiddenEqualsClause(self.config_space["penalty__linear_svc"], "l1"),
+            cs.ForbiddenEqualsClause(self.config_space["loss__linear_svc"], "hinge"),
+        )
+        self.config_space.add_forbidden_clause(forbidden_penalty_loss)
+
+    def setup_logistic_regression(self, classifiers: csh.CategoricalHyperparameter):
+        penalty = csh.CategoricalHyperparameter(
+            "penalty__logistic_regression", ["l1", "l2"]
+        )
+        C = csh.CategoricalHyperparameter(
+            "C__logistic_regression", self.shared_hyperparameters["C"]
+        )
+        dual = csh.CategoricalHyperparameter(
+            "dual__logistic_regression", self.shared_hyperparameters["dual"]
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "LogisticRegression")
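As a quick sanity check of the class above, the space can be built and sampled on its own. A minimal sketch (not part of the patch; classifier-only meta assumed):

    import ConfigSpace as cs

    from gama.configuration.test_configuration_task import TestClassifierConfig

    space = cs.ConfigurationSpace(meta={"estimators": "classifiers"})
    TestClassifierConfig(space).setup_classifiers()

    # Conditions keep inactive hyperparameters out of each sample, e.g.
    # alpha__bernoulliNB only appears when "classifiers" == "BernoulliNB".
    print(space.sample_configuration())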
diff --git a/gama/configuration/test_configuration_task/preprocessors.py b/gama/configuration/test_configuration_task/preprocessors.py
new file mode 100644
index 00000000..a80a5849
--- /dev/null
+++ b/gama/configuration/test_configuration_task/preprocessors.py
@@ -0,0 +1,190 @@
+import ConfigSpace as cs
+import ConfigSpace.hyperparameters as csh
+
+
+class TestPreprocessorConfig:
+    def __init__(
+        self,
+        config_space: cs.ConfigurationSpace,
+    ):
+        if "preprocessors" not in config_space.meta:
+            raise ValueError("Expected 'preprocessors' key in meta of config_space")
+        self.config_space = config_space
+        self.preprocessors_setup_map = {
+            "SelectFwe": self.setup_select_fwe,
+            "Binarizer": self.setup_binarizer,
+            "FastICA": self.setup_fast_ica,
+            "FeatureAgglomeration": self.setup_feature_agglomeration,
+            "MaxAbsScaler": self.setup_max_abs_scaler,
+            "MinMaxScaler": self.setup_min_max_scaler,
+            "Normalizer": self.setup_normalizer,
+            "Nystroem": self.setup_nystroem,
+            "PCA": self.setup_pca,
+            "PolynomialFeatures": self.setup_polynomial_features,
+            "RBFSampler": self.setup_rbf_sampler,
+            "RobustScaler": self.setup_robust_scaler,
+            "StandardScaler": self.setup_standard_scaler,
+            "SelectPercentile": self.setup_select_percentile,
+            "VarianceThreshold": self.setup_variance_threshold,
+        }
+        self.cs_preprocessors_name = config_space.meta["preprocessors"]
+
+    @property
+    def shared_hyperparameters(self):
+        return {
+            "gamma": {"lower": 0.01, "upper": 1.01, "default_value": 1.0},
+        }
+
+    def setup_preprocessors(self):
+        preprocessors_choices = list(self.preprocessors_setup_map.keys())
+
+        if not preprocessors_choices:
+            raise ValueError("No preprocessors to add to config space")
+
+        preprocessors = csh.CategoricalHyperparameter(
+            name=self.cs_preprocessors_name,
+            choices=preprocessors_choices,
+        )
+        self.config_space.add_hyperparameter(preprocessors)
+
+        for preprocessor_name in preprocessors_choices:
+            if setup_func := self.preprocessors_setup_map.get(preprocessor_name):
+                setup_func(preprocessors)
+
+    def _add_hyperparameters_and_equals_conditions(
+        self, local_vars: dict, preprocessor_name: str
+    ):
+        if "preprocessors" not in local_vars or not isinstance(
+            local_vars["preprocessors"], csh.CategoricalHyperparameter
+        ):
+            raise ValueError(
+                "Expected 'preprocessors' key with a "
+                "CategoricalHyperparameter in local_vars"
+            )
+
+        hyperparameters_to_add = [
+            hyperparameter
+            for hyperparameter in local_vars.values()
+            if isinstance(hyperparameter, csh.Hyperparameter)
+            and hyperparameter != local_vars["preprocessors"]
+        ]
+
+        conditions_to_add = [
+            cs.EqualsCondition(
+                hyperparameter, local_vars["preprocessors"], preprocessor_name
+            )
+            for hyperparameter in hyperparameters_to_add
+        ]
+
+        self.config_space.add_hyperparameters(hyperparameters_to_add)
+        self.config_space.add_conditions(conditions_to_add)
+
+    def setup_select_fwe(self, preprocessors: csh.CategoricalHyperparameter):
+        alpha = csh.UniformFloatHyperparameter(
+            "alpha__SelectFwe", 0, 0.05, default_value=0.05
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "SelectFwe")
+
+    def setup_binarizer(self, preprocessors: csh.CategoricalHyperparameter):
+        threshold = csh.UniformFloatHyperparameter(
+            "threshold__binarizer", 0.0, 1.01, default_value=0.05
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "Binarizer")
+
+    def setup_fast_ica(self, preprocessors: csh.CategoricalHyperparameter):
+        whiten = csh.CategoricalHyperparameter("whiten", ["unit-variance"])
+        tol = csh.UniformFloatHyperparameter(
+            "tol__fast_ica", 0.0, 1.01, default_value=0.05
+        )
+        self._add_hyperparameters_and_equals_conditions(locals(), "FastICA")
+
+    def setup_feature_agglomeration(self, preprocessors: csh.CategoricalHyperparameter):
+        linkage = csh.CategoricalHyperparameter(
+            "linkage__feature_agglomeration", ["ward", "complete", "average"]
+        )
+        affinity = csh.CategoricalHyperparameter(
"affinity__feature_agglomeration", + ["euclidean", "l1", "l2", "manhattan", "cosine", "precomputed"], + ) + self._add_hyperparameters_and_equals_conditions( + locals(), "FeatureAgglomeration" + ) + + # Forbidden clause: Linkage is different from 'ward' and affinity is 'euclidean' + forbidden_penalty_loss = cs.ForbiddenAndConjunction( + cs.ForbiddenInClause( + self.config_space["linkage__feature_agglomeration"], + ["complete", "average"], + ), + cs.ForbiddenEqualsClause( + self.config_space["affinity__feature_agglomeration"], "euclidean" + ), + ) + self.config_space.add_forbidden_clause(forbidden_penalty_loss) + + def setup_max_abs_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_min_max_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_normalizer(self, preprocessors: csh.CategoricalHyperparameter): + norm = csh.CategoricalHyperparameter("norm", ["l1", "l2", "max"]) + self._add_hyperparameters_and_equals_conditions(locals(), "Normalizer") + + def setup_nystroem(self, preprocessors: csh.CategoricalHyperparameter): + kernel = csh.CategoricalHyperparameter( + "kernel", + [ + "rbf", + "cosine", + "chi2", + "laplacian", + "polynomial", + "poly", + "linear", + "additive_chi2", + "sigmoid", + ], + ) + gamma = csh.UniformFloatHyperparameter( + "gamma__nystroem", **self.shared_hyperparameters["gamma"] + ) + n_components = csh.UniformIntegerHyperparameter("n_components", 1, 11) + self._add_hyperparameters_and_equals_conditions(locals(), "Nystroem") + + def setup_pca(self, preprocessors: csh.CategoricalHyperparameter): + svd_solver = csh.CategoricalHyperparameter("svd_solver", ["randomized"]) + iterated_power = csh.UniformIntegerHyperparameter("iterated_power", 1, 11) + self._add_hyperparameters_and_equals_conditions(locals(), "PCA") + + def setup_polynomial_features(self, preprocessors: csh.CategoricalHyperparameter): + include_bias = csh.CategoricalHyperparameter("include_bias", [False]) + interaction_only = csh.CategoricalHyperparameter("interaction_only", [False]) + self._add_hyperparameters_and_equals_conditions(locals(), "PolynomialFeatures") + + def setup_rbf_sampler(self, preprocessors: csh.CategoricalHyperparameter): + gamma = csh.UniformFloatHyperparameter( + "gamma__rbf_sampler", **self.shared_hyperparameters["gamma"] + ) + self._add_hyperparameters_and_equals_conditions(locals(), "RBFSampler") + + def setup_robust_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_standard_scaler(self, preprocessors: csh.CategoricalHyperparameter): + # No hyperparameters + pass + + def setup_select_percentile(self, preprocessors: csh.CategoricalHyperparameter): + percentile = csh.UniformIntegerHyperparameter("percentile", 1, 100) + self._add_hyperparameters_and_equals_conditions(locals(), "SelectPercentile") + + def setup_variance_threshold(self, preprocessors: csh.CategoricalHyperparameter): + threshold = csh.UniformFloatHyperparameter( + "threshold__variance_threshold", 0.05, 1.01, default_value=0.05 + ) + self._add_hyperparameters_and_equals_conditions(locals(), "VarianceThreshold") From a840863ca251a6e645531835f2590a24ca88bc0b Mon Sep 17 00:00:00 2001 From: Provost Simon Date: Sat, 9 Dec 2023 00:21:56 +0000 Subject: [PATCH 6/9] feat(evaluation_library): add is_evaluated candidate --- gama/gama.py | 1 + gama/genetic_programming/operator_set.py | 2 ++ gama/utilities/evaluation_library.py | 10 ++++++++++ 3 files changed, 13 insertions(+) diff 
diff --git a/gama/gama.py b/gama/gama.py
index 9b7e3e61..a18af752 100644
--- a/gama/gama.py
+++ b/gama/gama.py
@@ -339,6 +339,7 @@ def __init__(
             eliminate=eliminate_from_pareto,
             evaluate_callback=self._on_evaluation_completed,
             completed_evaluations=self._evaluation_library.lookup,
+            is_evaluated=self._evaluation_library.is_evaluated,
         )
 
     def cleanup(self, which="evaluations") -> None:
diff --git a/gama/genetic_programming/operator_set.py b/gama/genetic_programming/operator_set.py
index 995f7c17..a208d8ee 100644
--- a/gama/genetic_programming/operator_set.py
+++ b/gama/genetic_programming/operator_set.py
@@ -25,6 +25,7 @@ def __init__(
         evaluate_callback: Callable[[Evaluation], None],
         max_retry: int = 50,
         completed_evaluations: Optional[Dict[str, Evaluation]] = None,
+        is_evaluated: Optional[Callable[[Individual], bool]] = None,
     ):
         self._mutate = mutate
         self._mate = mate
@@ -37,6 +38,7 @@ def __init__(
         self._evaluate = None
         self._evaluate_callback = evaluate_callback
         self.evaluate: Optional[Callable[..., Evaluation]] = None
+        self.is_evaluated = is_evaluated
         self._completed_evaluations = completed_evaluations
diff --git a/gama/utilities/evaluation_library.py b/gama/utilities/evaluation_library.py
index ad9ce85f..fa18f9b2 100644
--- a/gama/utilities/evaluation_library.py
+++ b/gama/utilities/evaluation_library.py
@@ -262,3 +262,13 @@ def n_best(self, n: int = 5, with_pipelines=True) -> List[Evaluation]:
             return heapq.nlargest(n, self.top_evaluations)
         else:
             return list(reversed(sorted(self.evaluations)))[:n]
+
+    def is_evaluated(self, candidate: Union[Individual, None]) -> bool:
+        """Check if a candidate pipeline has already been evaluated."""
+        if candidate is None:
+            log.warning("Candidate to check is None. Returning False.")
+            return False
+        return any(
+            str(candidate.pipeline) == str(evaluation.individual.pipeline)
+            for evaluation in self.lookup.values()
+        )
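For intuition, a self-contained sketch of the duplicate test that `is_evaluated` performs. The `SimpleNamespace` stand-ins below are invented for illustration; in GAMA, `lookup` maps hashes to `Evaluation` objects whose `.individual.pipeline` is a real pipeline, and equality is decided by its string form.

from types import SimpleNamespace

# Stand-in for EvaluationLibrary.lookup: hash -> evaluation whose
# .individual.pipeline stringifies to the pipeline's identity.
lookup = {
    "6f1d": SimpleNamespace(
        individual=SimpleNamespace(pipeline="StandardScaler() -> LogisticRegression()")
    )
}

def is_evaluated(candidate) -> bool:
    # None-safe, string-based comparison, mirroring the diff above.
    if candidate is None:
        return False
    return any(
        str(candidate.pipeline) == str(ev.individual.pipeline)
        for ev in lookup.values()
    )

print(is_evaluated(SimpleNamespace(pipeline="StandardScaler() -> LogisticRegression()")))  # True
print(is_evaluated(SimpleNamespace(pipeline="PCA() -> LogisticRegression()")))  # False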
From 14cb0dc6c715a0b51994460981584c8d93425382 Mon Sep 17 00:00:00 2001
From: Provost Simon
Date: Sat, 9 Dec 2023 00:31:05 +0000
Subject: [PATCH 7/9] refactor(search_methods): update random search uniqueness

---
 gama/search_methods/random_search.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/gama/search_methods/random_search.py b/gama/search_methods/random_search.py
index ddf5ff60..e7fa0c63 100644
--- a/gama/search_methods/random_search.py
+++ b/gama/search_methods/random_search.py
@@ -33,8 +33,9 @@ def random_search(
     output: List[Individual],
     start_candidates: List[Individual],
     max_evaluations: Optional[int] = None,
+    max_attempts: int = 100000,
 ) -> List[Individual]:
-    """Perform random search over all possible pipelines.
+    """Perform random search over all possible pipelines
 
     Parameters
     ----------
     output: List[Individual]
         A list which contains the found individuals during search.
     start_candidates: List[Individual]
         A list with candidate individuals to evaluate first.
@@ -47,6 +48,9 @@
     max_evaluations: int, optional (default=None)
         If specified, only a maximum of `max_evaluations` individuals are evaluated.
         If None, the algorithm will be run indefinitely.
+    max_attempts: int (default=100000)
+        Maximum number of attempts to generate a unique individual, otherwise raise
+        an error.
 
     Returns
     -------
@@ -63,6 +67,20 @@
         future = operations.wait_next(async_)
         if future.result is not None:
             output.append(future.result.individual)
-        async_.submit(operations.evaluate, operations.individual())
+
+        attempts = 0
+        while (
+            new_individual := operations.individual()
+        ) and operations.is_evaluated(
+            new_individual
+        ):  # type: ignore
+            if attempts >= max_attempts:
+                raise ValueError(
+                    "Maximum attempts reached while trying to generate a "
+                    "unique individual."
+                )
+            attempts += 1
+
+        async_.submit(operations.evaluate, new_individual)
 
     return output
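The retry loop above, and the `generate_unique_individual` helper that the next patch factors out, is capped rejection sampling: keep drawing candidates until one is unseen, and fail after a fixed number of tries. A self-contained sketch under invented names (`make_candidate` and `is_seen` stand in for `operations.individual` and `operations.is_evaluated`):

import random

def generate_unique(make_candidate, is_seen, max_attempts: int = 100_000):
    """Draw candidates until one is unseen; give up after max_attempts."""
    attempts = 0
    while is_seen(candidate := make_candidate()):
        if attempts >= max_attempts:
            raise ValueError(
                "Maximum attempts reached while trying to generate a "
                "unique individual."
            )
        attempts += 1
    return candidate

# Toy run: integers stand in for individuals, a set for the evaluation library.
seen = {3, 7}
fresh = generate_unique(lambda: random.randint(0, 10), seen.__contains__)
assert fresh not in seen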
From 125051c9f75b8dc7dfe1b307422662be19e1bc7c Mon Sep 17 00:00:00 2001
From: Provost Simon
Date: Sat, 9 Dec 2023 01:05:48 +0000
Subject: [PATCH 8/9] refactor(search_methods): update EA uniqueness

---
 gama/search_methods/async_ea.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/gama/search_methods/async_ea.py b/gama/search_methods/async_ea.py
index b7d6bd6e..aa4ae009 100644
--- a/gama/search_methods/async_ea.py
+++ b/gama/search_methods/async_ea.py
@@ -72,6 +72,23 @@ def search(
     )
 
 
+def generate_unique_individual(
+    ops: OperatorSet, generator_function: Callable, max_attempts: int
+) -> Individual:
+    """Generate a unique individual using the given generator function."""
+    attempts = 0
+    while (new_individual := generator_function()) and ops.is_evaluated(
+        new_individual
+    ):  # type: ignore
+        if attempts >= max_attempts:
+            raise ValueError(
+                "Maximum attempts reached while trying to generate a "
+                "unique individual."
+            )
+        attempts += 1
+    return new_individual
+
+
 def async_ea(
     ops: OperatorSet,
     output: List[Individual],
@@ -79,6 +96,7 @@
     restart_callback: Optional[Callable[[], bool]] = None,
     max_n_evaluations: Optional[int] = None,
     population_size: int = 50,
+    max_attempts: int = 100000,
 ) -> List[Individual]:
     """Perform asynchronous evolutionary optimization with given operators.
 
@@ -97,6 +115,9 @@
         If None, the algorithm will be run indefinitely.
     population_size: int (default=50)
         Maximum number of individuals in the population at any time.
+    max_attempts: int (default=100000)
+        Maximum number of attempts to generate a unique individual, otherwise raise
+        an error.
 
     Returns
     -------
@@ -139,14 +160,21 @@
                 # Increasing the number decreases the risk of lost compute time,
                 # but also increases information lag. An offspring created too
                 # early might miss out on a better parent.
-                new_individual = ops.create(current_population, 1)[0]
+                new_individual = generate_unique_individual(
+                    ops, lambda: ops.create(current_population, 1)[0], max_attempts
+                )
                 async_.submit(ops.evaluate, new_individual)
 
             should_restart = restart_callback is not None and restart_callback()
             n_evaluated_individuals += 1
             if should_restart:
                 log.info("Restart criterion met. Creating new random population.")
-                start_candidates = [ops.individual() for _ in range(max_pop_size)]
+                start_candidates = [
+                    generate_unique_individual(
+                        ops, lambda: ops.individual(), max_attempts
+                    )
+                    for _ in range(max_pop_size)
+                ]
                 break
 
     return current_population
From d7310cd1fdecd36c6de7acaa99d1575939f3a166 Mon Sep 17 00:00:00 2001
From: Provost Simon
Date: Sat, 9 Dec 2023 01:32:20 +0000
Subject: [PATCH 9/9] refactor(search_methods): update ASHA uniqueness

---
 gama/search_methods/asha.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/gama/search_methods/asha.py b/gama/search_methods/asha.py
index 66ceb83a..049d43b3 100644
--- a/gama/search_methods/asha.py
+++ b/gama/search_methods/asha.py
@@ -89,6 +89,7 @@ def asha(
     maximum_resource: Union[int, float] = 1.0,
     minimum_early_stopping_rate: int = 0,
     max_full_evaluations: Optional[int] = None,
+    max_attempts: int = 100000,
 ) -> List[Individual]:
     """Asynchronous Halving Algorithm by Li et al.
 
@@ -115,6 +116,9 @@
     max_full_evaluations: Optional[int] (default=None)
         Maximum number of individuals to evaluate on the max rung (i.e. on all data).
         If None, the algorithm will be run indefinitely.
+    max_attempts: int (default=100000)
+        Maximum number of attempts to generate a unique individual, otherwise raise
+        an error.
 
     Returns
     -------
@@ -163,7 +167,18 @@
     def get_job():
         if start_candidates:
             return start_candidates.pop(), minimum_early_stopping_rate
-        return operations.individual(), minimum_early_stopping_rate
+
+        attempts = 0
+        while (new_individual := operations.individual()) and operations.is_evaluated(
+            new_individual
+        ):
+            if attempts >= max_attempts:
+                raise ValueError(
+                    "Maximum attempts reached while trying to generate a "
+                    "unique individual."
+                )
+            attempts += 1
+        return new_individual, minimum_early_stopping_rate
 
     try:
         with AsyncEvaluator() as async_: