diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 7c54a344c..c504928b7 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -24,9 +24,10 @@
 from .datautils import read_csv
 from .resources import get as rget, config as rconfig, output_dirs as routput_dirs
 from .results import ErrorResult, Scoreboard, TaskResult
-from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, json_dump, lazy_property, profile, repr_def, \
-    run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, system_memory_mb, system_volume_mb, touch
-
+from .utils import Namespace as ns, OSMonitoring, as_list, datetime_iso, flatten, \
+    json_dump, lazy_property, profile, repr_def, \
+    run_cmd, run_script, signal_handler, str2bool, str_sanitize, system_cores, \
+    system_memory_mb, system_volume_mb, touch, Namespace
 
 log = logging.getLogger(__name__)
 
@@ -371,9 +372,33 @@ def _is_task_enabled(task_def):
 
 class TaskConfig:
 
-    def __init__(self, name, fold, metrics, seed,
+    def __init__(self, *, name, fold, seed,
                  max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
-                 input_dir, output_dir):
+                 input_dir, output_dir,
+                 metrics: Union[list[str], str, None] = None,
+                 optimization_metrics: Union[list[str], str, None] = None,
+                 evaluation_metrics: Union[list[str], str, None] = None,
+                 ):
+
+        if metrics:
+            log.warning(
+                "WARNING: The `metric` field of the task definition is deprecated"
+                " and will not work in the future. Please specify the metric(s) to "
+                "optimize for with `optimization_metrics` and any additional metric(s) "
+                "used only for evaluation in `evaluation_metrics`."
+            )
+            if optimization_metrics:
+                raise ValueError(
+                    "Detected both `metric` and `optimization_metrics` for task "
+                    f"'{name}'. Aborting because the desired setup is unclear. "
+                    "Please only use `optimization_metrics`."
+                )
+            optimization_metrics = as_list(metrics)[:1]
+            evaluation_metrics = as_list(metrics)[1:]
+
+        self.optimization_metrics = optimization_metrics or []
+        self._evaluation_metrics = evaluation_metrics or []
+
         self.framework = None
         self.framework_params = None
         self.framework_version = None
@@ -391,16 +416,25 @@ def __init__(self, name, fold, metrics, seed,
         self.output_predictions_file = os.path.join(output_dir, "predictions.csv")
         self.ext = ns()  # used if frameworks require extra config points
 
+    @property
+    def evaluation_metrics(self) -> list[str]:
+        return list(set(self.optimization_metrics) | set(self._evaluation_metrics))
+
+    def load_default_metrics(self, *, dataset_type: str):
+        """ Sets `optimization/evaluation_metrics` based on defaults from config.yaml"""
+        self.optimization_metrics = as_list(rconfig().benchmarks.optimization_metrics[dataset_type])
+        self._evaluation_metrics = as_list(rconfig().benchmarks.evaluation_metrics[dataset_type])
+
     def __setattr__(self, name, value):
-        if name == 'metrics':
-            self.metric = value[0] if isinstance(value, list) else value
-        elif name == 'max_runtime_seconds':
-            self.job_timeout_seconds = min(value * 2,
-                                           value + rconfig().benchmarks.overhead_time_seconds)
+        if name == 'max_runtime_seconds':
+            self.job_timeout_seconds = min(
+                value * 2,
+                value + rconfig().benchmarks.overhead_time_seconds
+            )
         super().__setattr__(name, value)
 
     def __json__(self):
-        return self.__dict__
+        return self.__dict__ | {"evaluation_metrics": self.evaluation_metrics}
 
     def __repr__(self):
         return repr_def(self)
@@ -458,10 +492,13 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
         self.benchmark = benchmark
         self._task_def = task_def
         self.fold = fold
+
         self.task_config = TaskConfig(
             name=task_def.name,
             fold=fold,
             metrics=task_def.metric,
+            optimization_metrics=Namespace.get(task_def, "optimization_metrics"),
+            evaluation_metrics=Namespace.get(task_def, "evaluation_metrics"),
             seed=rget().seed(fold),
             max_runtime_seconds=task_def.max_runtime_seconds,
             cores=task_def.cores,
@@ -542,9 +579,8 @@ def run(self):
         task_config.output_predictions_file = results._predictions_file
         task_config.output_metadata_file = results._metadata_file
         touch(os.path.dirname(task_config.output_predictions_file), as_dir=True)
-        if task_config.metrics is None:
-            task_config.metrics = as_list(rconfig().benchmarks.metrics[self._dataset.type.name])
-            task_config.metric = task_config.metrics[0]
+        if not task_config.optimization_metrics:
+            task_config.load_default_metrics(dataset_type=self._dataset.type.name)
 
         result = meta_result = None
         try:
diff --git a/amlb/results.py b/amlb/results.py
index aaceb9fc8..4ed79972b 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -2,7 +2,7 @@
 **results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``),
 as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``).
""" -from functools import partial +import inspect import collections import io import logging @@ -10,7 +10,7 @@ import os import re import statistics -from typing import Union +from typing import Union, Callable import numpy as np from numpy import nan, sort @@ -130,24 +130,20 @@ def __init__(self, scores=None, framework_name=None, benchmark_name=None, task_n @cached def as_data_frame(self): - # index = ['task', 'framework', 'fold'] - index = [] df = (self.scores if is_data_frame(self.scores) else to_data_frame([dict(sc) for sc in self.scores])) if df.empty: # avoid dtype conversions during reindexing on empty frame return df - fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result', 'metric', 'mode', 'version', + fixed_cols = ['id', 'task', 'framework', 'constraint', 'fold', 'type', 'optimization_metrics', 'mode', 'version', 'params', 'app_version', 'utc', 'duration', 'training_duration', 'predict_duration', 'models_count', 'seed', 'info'] - fixed_cols = [col for col in fixed_cols if col not in index] - metrics_cols = [col for col in df.columns - if (col in dir(ClassificationResult) or col in dir(RegressionResult)) - and not col.startswith('_')] + metrics_cols = [ + col for col in df.columns + if col in ClassificationResult.metrics() + RegressionResult.metrics() + ] metrics_cols.sort() dynamic_cols = [col for col in df.columns - if col not in index - and col not in fixed_cols - and col not in metrics_cols] + if col not in fixed_cols + metrics_cols] dynamic_cols.sort() df = df.reindex(columns=[]+fixed_cols+metrics_cols+dynamic_cols) log.debug("Scores columns: %s.", df.columns) @@ -174,9 +170,12 @@ def as_printable_data_frame(self, verbosity=3): for col in high_precision_float_cols: df[col] = df[col].map("{:.6g}".format).astype(float) + unique_metrics = (set(metrics.split(",")) for metrics in df['optimization_metrics'].unique()) + optimized_metrics = set.union(*unique_metrics) + cols = ([] if verbosity == 0 - else ['task', 'fold', 'framework', 'constraint', 'result', 'metric', 'info'] if verbosity == 1 - else ['id', 'task', 'fold', 'framework', 'constraint', 'result', 'metric', + else ['task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics', 'info'] if verbosity == 1 + else ['id', 'task', 'fold', 'framework', 'constraint', *optimized_metrics, 'optimization_metrics', 'duration', 'seed', 'info'] if verbosity == 2 else slice(None)) return df.loc[:, cols] @@ -426,8 +425,8 @@ def compute_score(self, result=None, meta_result=None): seed=metadata.seed, app_version=rget().app_version, utc=datetime_iso(), - metric=metadata.metric, - duration=nan + optimization_metrics=metadata.optimization_metrics, + duration=nan, ) required_meta_res = ['training_duration', 'predict_duration', 'models_count'] for m in required_meta_res: @@ -435,29 +434,13 @@ def compute_score(self, result=None, meta_result=None): result = self.get_result() if result is None else result scoring_errors = [] - - def do_score(m): - score = result.evaluate(m) + for metric_ in metadata.evaluation_metrics: + score = result.evaluate(metric_) if 'message' in score: scoring_errors.append(score.message) - return score - - def set_score(score): - entry.metric = score.metric - entry.result = score.value - if score.higher_is_better is False: # if unknown metric, and higher_is_better is None, then no change - entry.metric = f"neg_{entry.metric}" - entry.result = - entry.result - - for metric in metadata.metrics or []: - sc = do_score(metric) - entry[metric] = sc.value - if metric == 
entry.metric: - set_score(sc) - - if 'result' not in entry: - set_score(do_score(entry.metric)) + entry[metric_] = score.value + entry.optimization_metrics = ','.join(entry.optimization_metrics) entry.info = result.info if scoring_errors: entry.info = "; ".join(filter(lambda it: it, [entry.info, *scoring_errors])) @@ -501,6 +484,14 @@ def evaluate(self, metric): eval_res += Namespace(value=nan, higher_is_better=None, message=f"Unsupported metric `{metric}` for {pb_type} problems") return eval_res + @classmethod + def metrics(cls) -> list[str]: + def has_metric_metadata(fn: Callable) -> bool: + return get_metadata(fn, "higher_is_better") is not None + return [ + name for name, _ in inspect.getmembers(cls, predicate=has_metric_metadata) + ] + class NoResult(Result): diff --git a/examples/custom/extensions/Stacking/exec.py b/examples/custom/extensions/Stacking/exec.py index d8c80879d..84449550b 100644 --- a/examples/custom/extensions/Stacking/exec.py +++ b/examples/custom/extensions/Stacking/exec.py @@ -33,8 +33,8 @@ def run(dataset, config): estimators_params = {e: config.framework_params.get(f'_{e}_params', {}) for e in ['rf', 'gbm', 'linear', 'svc', 'final']} log.info("Running Sklearn Stacking Ensemble with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs)) - log.warning("We completely ignore the requirement to stay within the time limit.") - log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric)) + log.warning("We ignore the requirement to stay within the time limit.") + log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.") if is_classification: diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py index 6fe76769b..eab3db1e1 100644 --- a/frameworks/AutoGluon/exec.py +++ b/frameworks/AutoGluon/exec.py @@ -38,10 +38,10 @@ def run(dataset, config): rmse=metrics.root_mean_squared_error, ) - perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + perf_metric = metrics_mapping.get(config.optimization_metrics[0]) if perf_metric is None: # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping - log.warning("Performance metric %s not supported.", config.metric) + log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.") is_classification = config.type == 'classification' training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py index ab7c4110f..eafdb4652 100644 --- a/frameworks/AutoGluon/exec_ts.py +++ b/frameworks/AutoGluon/exec_ts.py @@ -142,9 +142,9 @@ def get_eval_metric(config): rmse="RMSE", ) - eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + eval_metric = metrics_mapping.get(config.optimization_metrics[0]) if eval_metric is None: - log.warning("Performance metric %s not supported.", config.metric) + log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.") return eval_metric diff --git a/frameworks/AutoWEKA/exec.py b/frameworks/AutoWEKA/exec.py index 7b7534e0a..4727e40ae 100644 --- a/frameworks/AutoWEKA/exec.py +++ b/frameworks/AutoWEKA/exec.py @@ -24,9 +24,10 @@ def run(dataset: Dataset, config: TaskConfig): auc='areaUnderROC', logloss='kBInformation' ) - metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + metric = 
metrics_mapping.get(config.optimization_metrics[0]) if metric is None: - raise ValueError("Performance metric {} not supported.".format(config.metric)) + msg = f"Performance metric {config.optimization_metrics[0]} not supported." + raise ValueError(msg) train_file = dataset.train.path test_file = dataset.test.path diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py index e0880bf34..8d5860e03 100644 --- a/frameworks/GAMA/exec.py +++ b/frameworks/GAMA/exec.py @@ -43,9 +43,10 @@ def run(dataset, config): r2='r2', rmse='neg_mean_squared_error', ) - scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + scoring_metric = metrics_mapping.get(config.optimization_metrics[0]) if scoring_metric is None: - raise ValueError("Performance metric {} not supported.".format(config.metric)) + msg = f"Performance metric '{config.optimization_metrics[0]}' not supported." + raise ValueError(msg) training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} n_jobs = config.framework_params.get('_n_jobs', config.cores) # useful to disable multicore, regardless of the dataset config diff --git a/frameworks/H2OAutoML/exec.py b/frameworks/H2OAutoML/exec.py index 1d70b9bb2..9492d9f3e 100644 --- a/frameworks/H2OAutoML/exec.py +++ b/frameworks/H2OAutoML/exec.py @@ -43,10 +43,10 @@ def run(dataset, config): rmse='rmse', rmsle='rmsle' ) - sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + sort_metric = metrics_mapping.get(config.optimization_metrics[0]) if sort_metric is None: # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping - log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric) + log.warning(f"Performance metric {config.optimization_metrics[0]} not supported, defaulting to AUTO.") try: training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} diff --git a/frameworks/MLPlan/exec.py b/frameworks/MLPlan/exec.py index 839d94d23..2aea045c6 100644 --- a/frameworks/MLPlan/exec.py +++ b/frameworks/MLPlan/exec.py @@ -30,10 +30,10 @@ def run(dataset, config): rmsle='ROOT_MEAN_SQUARED_LOGARITHM_ERROR', mae='MEAN_ABSOLUTE_ERROR' ) - - metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None + metric = metrics_mapping.get(config.optimization_metrics[0]) if metric is None: - raise ValueError('Performance metric {} is not supported.'.format(config.metric)) + msg = f'Performance metric {config.optimization_metrics[0]} is not supported.' 
+        raise ValueError(msg)
 
     train_file = dataset.train.path
     test_file = dataset.test.path
diff --git a/frameworks/RandomForest/exec.py b/frameworks/RandomForest/exec.py
index 77bdc99ef..193667fb8 100644
--- a/frameworks/RandomForest/exec.py
+++ b/frameworks/RandomForest/exec.py
@@ -47,7 +47,7 @@ def run(dataset, config):
     memory_margin = config.framework_params.get('_memory_margin', 0.9)
 
     log.info("Running RandomForest with a maximum time of {}s on {} cores.".format(config.max_runtime_seconds, n_jobs))
-    log.warning("We completely ignore the advice to optimize towards metric: {}.".format(config.metric))
+    log.warning(f"We ignore the advice to optimize for: {config.optimization_metrics}.")
 
     estimator = RandomForestClassifier if is_classification else RandomForestRegressor
     rf = estimator(n_jobs=n_jobs,
diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py
index ce70cb7f9..5766f76c6 100644
--- a/frameworks/TPOT/exec.py
+++ b/frameworks/TPOT/exec.py
@@ -36,9 +36,10 @@ def run(dataset, config):
         r2='r2',
         rmse='neg_mean_squared_error',  # TPOT can score on mse, as app computes rmse independently on predictions
     )
-    scoring_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    scoring_metric = metrics_mapping.get(config.optimization_metrics[0])
     if scoring_metric is None:
-        raise ValueError("Performance metric {} not supported.".format(config.metric))
+        msg = f"Performance metric {config.optimization_metrics[0]} not supported."
+        raise ValueError(msg)
 
     X_train = dataset.train.X
     y_train = dataset.train.y
diff --git a/frameworks/TunedRandomForest/exec.py b/frameworks/TunedRandomForest/exec.py
index 7c7a7dc15..897636dd5 100644
--- a/frameworks/TunedRandomForest/exec.py
+++ b/frameworks/TunedRandomForest/exec.py
@@ -76,7 +76,11 @@ def run(dataset, config):
         mse='neg_mean_squared_error',
         r2='r2',
         rmse='neg_root_mean_squared_error',
-    )[config.metric]
+    ).get(config.optimization_metrics[0])
+
+    if not metric:
+        msg = f"TunedRandomForest doesn't support {config.optimization_metrics[0]}"
+        raise ValueError(msg)
 
     n_features = X_train.shape[1]
     default_value = max(1, int(math.sqrt(n_features)))
diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py
index 11690e09c..b612c8f6f 100644
--- a/frameworks/autosklearn/exec.py
+++ b/frameworks/autosklearn/exec.py
@@ -45,10 +45,10 @@ def run(dataset, config):
         rmse=metrics.mean_squared_error if askl_version < version.parse("0.10") else metrics.root_mean_squared_error,
         r2=metrics.r2
     )
-    perf_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
+    perf_metric = metrics_mapping.get(config.optimization_metrics[0])
     if perf_metric is None:
         # TODO: figure out if we are going to blindly pass metrics through, or if we use a strict mapping
-        log.warning("Performance metric %s not supported.", config.metric)
+        log.warning(f"Performance metric {config.optimization_metrics[0]} not supported.")
 
     # Set resources based on datasize
     log.info(
diff --git a/frameworks/flaml/exec.py b/frameworks/flaml/exec.py
index a8a5131af..e458c3faa 100644
--- a/frameworks/flaml/exec.py
+++ b/frameworks/flaml/exec.py
@@ -32,10 +32,12 @@ def run(dataset, config):
         rmse='rmse',
         r2='r2',
     )
-    perf_metric = metrics_mapping[
-        config.metric] if config.metric in metrics_mapping else 'auto'
-    if perf_metric is None:
-        log.warning("Performance metric %s not supported.", config.metric)
+    perf_metric = metrics_mapping.get(config.optimization_metrics[0], 'auto')
+    if perf_metric == 'auto' and config.optimization_metrics[0] != 'auto':
+        log.warning(
+            f"Performance metric '{config.optimization_metrics[0]}' not supported, "
+            f"using metric='auto' instead.",
+        )
 
     training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
diff --git a/frameworks/hyperoptsklearn/exec.py b/frameworks/hyperoptsklearn/exec.py
index c4179cae1..44857707b 100644
--- a/frameworks/hyperoptsklearn/exec.py
+++ b/frameworks/hyperoptsklearn/exec.py
@@ -36,7 +36,11 @@ def run(dataset, config):
         r2=(default, False),  # lambda y, pred: 1.0 - r2_score(y, pred)
         rmse=(mean_squared_error, False),
     )
-    loss_fn, continuous_loss_fn = metrics_to_loss_mapping[config.metric] if config.metric in metrics_to_loss_mapping else (None, False)
+
+    loss_fn, continuous_loss_fn = metrics_to_loss_mapping.get(
+        config.optimization_metrics[0],
+        (None, False)
+    )
     if loss_fn is None:
-        log.warning("Performance metric %s not supported: defaulting to %s.",
-                    config.metric, 'accuracy' if is_classification else 'r2')
+        log.warning("Performance metric %s not supported: defaulting to %s.",
+                    config.optimization_metrics[0], 'accuracy' if is_classification else 'r2')
diff --git a/frameworks/mljarsupervised/exec.py b/frameworks/mljarsupervised/exec.py
index 653d9cfd6..8ad65815a 100644
--- a/frameworks/mljarsupervised/exec.py
+++ b/frameworks/mljarsupervised/exec.py
@@ -24,7 +24,7 @@ def run(dataset, config):
         logloss='logloss',
         rmse='rmse'
     )
-    eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else "auto"
+    eval_metric = metrics_mapping.get(config.optimization_metrics[0], 'auto')
 
     # Mapping of benchmark task to MLJAR ML task
     problem_mapping = dict(
diff --git a/frameworks/oboe/exec.py b/frameworks/oboe/exec.py
index bed388dc7..f47e43941 100644
--- a/frameworks/oboe/exec.py
+++ b/frameworks/oboe/exec.py
@@ -66,7 +66,7 @@ def run(dataset, config):
     n_cores = config.framework_params.get('_n_cores', config.cores)
 
     log.info('Running oboe with a maximum time of {}s on {} cores.'.format(config.max_runtime_seconds, n_cores))
-    log.warning('We completely ignore the advice to optimize towards metric: {}.'.format(config.metric))
+    log.warning(f'We ignore the advice to optimize for: {config.optimization_metrics[0]}.')
 
     aml = AutoLearner(p_type='classification' if is_classification else 'regression',
                       n_cores=n_cores,
diff --git a/resources/config.yaml b/resources/config.yaml
index 4daf8087b..ca9ec6503 100644
--- a/resources/config.yaml
+++ b/resources/config.yaml
@@ -48,13 +48,17 @@ benchmarks:  # configuration namespace for the benchmarks def
   os_mem_size_mb: 2048          # the default amount of memory left to the OS when task assigned memory is computed automatically.
   os_vol_size_mb: 2048          # the default amount of volume left to the OS when task volume memory is verified.
   overhead_time_seconds: 3600   # amount of additional time allowed for the job to complete before sending an interruption signal
-  metrics:                      # default metrics by dataset type (as listed by amlb.data.DatasetType),
-                                # only the first metric is optimized by the frameworks,
-                                # the others are computed only for information purpose.
-    binary: ['auc', 'logloss', 'acc', 'balacc']   # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
-    multiclass: ['logloss', 'acc', 'balacc']      # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average.
-    regression: ['rmse', 'r2', 'mae']             # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2).
-    timeseries: ['mase', 'mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps']
+  optimization_metrics:         # default metrics to optimize for by dataset type (as listed by amlb.data.DatasetType)
+    binary: ['auc']             # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
+    multiclass: ['logloss']     # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average.
+    regression: ['rmse']        # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2).
+    timeseries: ['mase']
+  evaluation_metrics:           # default metrics to report by dataset type (as listed by amlb.data.DatasetType)
+    binary: ['logloss', 'acc', 'balacc']   # available metrics are identical to those described for `optimization_metrics`.
+    multiclass: ['acc', 'balacc']          # any `optimization_metrics` entry is also reported on; duplicating it here is not necessary.
+    regression: ['r2', 'mae']
+    timeseries: ['mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps']
+
   defaults:                     # the default constraints, usually overridden by a constraint.
     folds: 10                   # the amount of fold-runs executed for each dataset.
     max_runtime_seconds: 3600   # default time allocated to the framework to train a model.
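For reference, a task definition in a benchmark file can set the new fields directly. A minimal sketch, assuming the usual benchmark definition layout; the task name and `openml_task_id` value below are made up for illustration, only the `optimization_metrics`/`evaluation_metrics` keys come from this change:

- name: example_binary_task                # hypothetical task entry
  openml_task_id: 59                       # hypothetical dataset reference
  optimization_metrics: ['auc']            # metric(s) the framework is asked to optimize
  evaluation_metrics: ['acc', 'logloss']   # additional metrics computed only for reporting

For such a task, TaskConfig.optimization_metrics is ['auc'] and TaskConfig.evaluation_metrics is the union of both lists; definitions that still use the old `metric` field keep working for now, but emit the deprecation warning added in amlb/benchmark.py.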