diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 03a9123..92b3bab 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -2,6 +2,12 @@
 CHANGELOG
 =========

+-------------------------------------------------------------------------------
+March, 23, 2021 1.0.1
+-------------------------------------------------------------------------------
+
+- Add cross-validation (cv) capability to benchmark function.
+
 -------------------------------------------------------------------------------
 February, 1, 2021 1.0.0
 -------------------------------------------------------------------------------
diff --git a/README.md b/README.md
index 2b68d52..f677928 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ The library provides:

 * Automated task detection. No need to know what feature selection method works with what machine learning task

-* Benchmarking with multiple selectors
+* Benchmarking multiple selectors using cross-validation

 * Inspection of results and feature importance

@@ -91,7 +91,7 @@ selectors = {
 }

 # Benchmark
-score_df, selected_df, runtime_df = benchmark(selectors, data, label)
+score_df, selected_df, runtime_df = benchmark(selectors, data, label, cv=5)
 print(score_df, "\n\n", selected_df, "\n\n", runtime_df)

 # Get benchmark statistics by feature
@@ -125,6 +125,18 @@ plot_importance(df)

 Selective is available to install as `pip install selective`.

+## Source
+
+Alternatively, you can build a wheel package on your platform from scratch using the source code:
+
+```bash
+git clone https://github.com/fidelity/selective.git
+cd selective
+pip install setuptools wheel # if wheel is not installed
+python setup.py sdist bdist_wheel
+pip install dist/selective-X.X.X-py3-none-any.whl
+```
+
 ## Support

 Please submit bug reports and feature requests as [Issues](https://github.com/fidelity/selective/issues).
diff --git a/dist/selective-1.0.0-py3-none-any.whl b/dist/selective-1.0.0-py3-none-any.whl
deleted file mode 100644
index 9a653f5..0000000
Binary files a/dist/selective-1.0.0-py3-none-any.whl and /dev/null differ
diff --git a/feature/_version.py b/feature/_version.py
index 76221c0..6670214 100644
--- a/feature/_version.py
+++ b/feature/_version.py
@@ -2,4 +2,4 @@
 # Copyright FMR LLC
 # SPDX-License-Identifier: GNU GPLv3

-__version__ = "1.0.0"
+__version__ = "1.0.1"
diff --git a/feature/selector.py b/feature/selector.py
index 59510d6..2edf22b 100644
--- a/feature/selector.py
+++ b/feature/selector.py
@@ -22,6 +22,7 @@
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
 from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
+from sklearn.model_selection import KFold
 from xgboost import XGBClassifier, XGBRegressor

 from feature.base import _BaseDispatcher, _BaseSupervisedSelector, _BaseUnsupervisedSelector
@@ -475,9 +476,11 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
                                          SelectionMethod.Variance]],
               data: pd.DataFrame,
               labels: Optional[pd.Series] = None,
+              cv: Optional[int] = None,
               output_filename: Optional[str] = None,
               drop_zero_variance_features: Optional[bool] = True,
-              verbose: bool = False) \
+              verbose: bool = False,
+              seed: int = Constants.default_seed) \
         -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     Benchmark with a given set of feature selectors.
@@ -495,6 +498,8 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
         Data of shape (n_samples, n_features) used for feature selection.
     labels: pd.Series, optional (default=None)
         The target values (class labels in classification, real numbers in regression).
+    cv: int, optional (default=None)
+        Number of folds to use for cross-validation.
     output_filename: str, optional (default=None)
         If not None, benchmarking output is saved.
         If file exists, results are appended, otherwise file is created.
@@ -502,6 +507,81 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
         Whether to drop features with zero variance before running feature selector methods or not.
     verbose: bool, optional (default=False)
         Whether to print progress messages or not.
+    seed: int, optional (default=Constants.default_seed)
+        The random seed to initialize the random number generator.
+
+    Returns
+    -------
+    Tuple of data frames with scores, selected features and runtime for each method.
+    If cv is not None, the data frames will contain the concatenated results from each fold.
+    """
+
+    check_true(selectors is not None, ValueError("Benchmark selectors cannot be none."))
+    check_true(data is not None, ValueError("Benchmark data cannot be none."))
+
+    if cv is None:
+        return _bench(selectors=selectors,
+                      data=data,
+                      labels=labels,
+                      output_filename=output_filename,
+                      drop_zero_variance_features=drop_zero_variance_features,
+                      verbose=verbose)
+    else:
+
+        # Create K-Fold object
+        kf = KFold(n_splits=cv, shuffle=True, random_state=seed)
+
+        # Initialize variables
+        t0 = time()
+        train_labels, test_labels = None, None
+        score_df, selected_df, runtime_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
+        # Split data into cv-folds and run _bench for each fold
+        if verbose:
+            print("\n>>> Running")
+        for fold, (train_index, _) in enumerate(kf.split(data)):
+
+            if verbose:
+                print("\tFold", fold, "...")
+
+            # Split data, labels into folds
+            train_data = data.iloc[train_index]
+            if labels is not None:
+                train_labels = labels.iloc[train_index]
+
+            # Run benchmark
+            score_cv_df, selected_cv_df, runtime_cv_df = _bench(selectors=selectors,
+                                                                data=train_data,
+                                                                labels=train_labels,
+                                                                output_filename=output_filename,
+                                                                drop_zero_variance_features=drop_zero_variance_features,
+                                                                verbose=False)
+
+            # Concatenate data frames
+            score_df = pd.concat((score_df, score_cv_df))
+            selected_df = pd.concat((selected_df, selected_cv_df))
+            runtime_df = pd.concat((runtime_df, runtime_cv_df))
+
+        if verbose:
+            print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")
+
+        return score_df, selected_df, runtime_df
+
+
+def _bench(selectors: Dict[str, Union[SelectionMethod.Correlation,
+                                      SelectionMethod.Linear,
+                                      SelectionMethod.TreeBased,
+                                      SelectionMethod.Statistical,
+                                      SelectionMethod.Variance]],
+           data: pd.DataFrame,
+           labels: Optional[pd.Series] = None,
+           output_filename: Optional[str] = None,
+           drop_zero_variance_features: Optional[bool] = True,
+           verbose: bool = False) \
+        -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Benchmark with a given set of feature selectors.
+    Return a tuple of data frames with scores, runtime and selected features for each method.

     Returns
     -------
@@ -552,7 +632,7 @@ def benchmark(selectors: Dict[str, Union[SelectionMethod.Correlation,
     if verbose:
         print(f"<<< Done! Time taken: {(time() - t0) / 60:.2f} minutes")

-    # Convert to series
+    # Format
     runtime_df = pd.Series(method_to_runtime).to_frame("runtime").rename_axis("method").reset_index()

     return score_df, selected_df, runtime_df
@@ -561,15 +641,19 @@
 def calculate_statistics(scores: pd.DataFrame,
                          selected: pd.DataFrame,
                          columns: Optional[list] = None,
-                         ignore_constant: Optional[bool] = True):
-    """Calculate statistics for each feature using scores/selections from list of methods.
+                         ignore_constant: Optional[bool] = True) -> pd.DataFrame:
+    """
+    Calculate statistics for each feature using scores/selections from list of methods.
+    Returns data frame with calculated statistics for each feature.

     Parameters
     ----------
     scores: pd.DataFrame
         Data frame with scores for each feature (index) and selector (columns).
+        Each feature could have multiple rows from different cross-validation folds.
     selected: pd.DataFrame
         Data frame with selection flag for each feature (index) and selector (columns).
+        Each feature could have multiple rows from different cross-validation folds.
     columns: list (default=None)
         List of methods (columns) to include in statistics.
         If None, all methods (columns) will be used.
@@ -584,9 +668,9 @@
     check_true(isinstance(scores, pd.DataFrame), ValueError("scores must be a data frame."))
     check_true(isinstance(selected, pd.DataFrame), ValueError("selection must be a data frame."))
     check_true(scores.shape == selected.shape, ValueError("Shapes of scores and selected data frames must match."))
-    check_true(len(scores.index.intersection(selected.index)) == selected.shape[0],
+    check_true(np.all(scores.index == selected.index),
               ValueError("Index of score and selection data frames must match."))
-    check_true(len(scores.columns.intersection(selected.columns)) == selected.shape[1],
+    check_true(np.all(scores.columns == selected.columns),
               ValueError("Columns of score and selection data frames must match."))

     # Get columns to use
@@ -597,25 +681,25 @@
     scores_df = scores[columns].copy()
     selected_df = selected[columns].copy()

+    # Group by feature for CV results
+    scores_df = scores_df.groupby(scores_df.index).mean()
+    selected_df = selected_df.groupby(selected_df.index).mean()
+
     # Drop methods with constant scores
     if ignore_constant:
         mask = ~np.isclose(np.var(scores_df, axis=0), 0)
         scores_df = scores_df.loc[:, mask]
         selected_df = selected_df.loc[:, mask]

-    # Sort by index
-    scores_df.sort_index(inplace=True)
-    selected_df.sort_index(inplace=True)
-
     # Calculate statistics
-    stats_df = pd.DataFrame(index=scores.index)
-    stats_df["_score_mean"] = scores_df.mean(axis=1)
-    stats_df["_score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
-    stats_df["_selection_freq"] = selected_df.sum(axis=1)
-    stats_df["_selection_freq_norm"] = normalize_columns(selected_df).sum(axis=1)
+    stats_df = pd.DataFrame(index=scores_df.index)
+    stats_df["score_mean"] = scores_df.mean(axis=1)
+    stats_df["score_mean_norm"] = normalize_columns(scores_df).mean(axis=1)
+    stats_df["selection_freq"] = selected_df.sum(axis=1)
+    stats_df["selection_freq_norm"] = normalize_columns(selected_df).sum(axis=1)

     # Sort
-    stats_df.sort_values(by="_score_mean_norm", ascending=False, inplace=True)
+    stats_df.sort_values(by="score_mean_norm", ascending=False, inplace=True)

     return stats_df

@@ -632,6 +716,7 @@ def plot_importance(scores: pd.DataFrame,
     ----------
     scores: pd.DataFrame
         Data frame with scores for each feature (index) and method (columns).
+        Each feature could have multiple rows from different cross-validation folds.
     columns: list (default=None)
         List of methods (columns) to include in statistics.
         If None, all methods (columns) will be used.
@@ -663,6 +748,9 @@ def plot_importance(scores: pd.DataFrame,
     df = scores[columns].copy()
     df.fillna(0, inplace=True)

+    # Group by feature for CV results
+    df = df.groupby(df.index).mean()
+
     # Get normalized scores such that scores for each method sums to 1
     if normalize:
         df = normalize_columns(df)
diff --git a/feature/tree_based.py b/feature/tree_based.py
index 69ea6d5..d31ccf5 100644
--- a/feature/tree_based.py
+++ b/feature/tree_based.py
@@ -50,14 +50,14 @@ def dispatch_model(self, labels: pd.Series, *args):
         # Custom estimator should be compatible with the task
         if "classification_" in task_str:
             if isinstance(self.estimator, CatBoost):
-                if self.estimator._estimator_type is not 'classifier':
+                if self.estimator._estimator_type != 'classifier':
                     raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
             else:
                 if not isinstance(self.estimator, ClassifierMixin):
                     raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
         else:
             if isinstance(self.estimator, CatBoost):
-                if self.estimator._estimator_type is not 'regressor':
+                if self.estimator._estimator_type != 'regressor':
                     raise TypeError(str(self.estimator) + " cannot be used for task: " + task_str)
             else:
                 if not isinstance(self.estimator, RegressorMixin):
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 33511d1..2f69c0d 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -17,43 +17,42 @@

 class TestBenchmark(BaseTest):

-    def test_benchmark_regression(self):
+    num_features = 3
+    corr_threshold = 0.5
+    alpha = 1000
+    tree_params = {"random_state": 123, "n_estimators": 100}
+
+    selectors = {
+        "corr_pearson": SelectionMethod.Correlation(corr_threshold, method="pearson"),
+        "corr_kendall": SelectionMethod.Correlation(corr_threshold, method="kendall"),
+        "corr_spearman": SelectionMethod.Correlation(corr_threshold, method="spearman"),
+        "univ_anova": SelectionMethod.Statistical(num_features, method="anova"),
+        "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"),
+        "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"),
+        "linear": SelectionMethod.Linear(num_features, regularization="none"),
+        "lasso": SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha),
+        "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha),
+        "random_forest": SelectionMethod.TreeBased(num_features),
+        "xgboost_clf": SelectionMethod.TreeBased(num_features, estimator=XGBClassifier(**tree_params)),
+        "xgboost_reg": SelectionMethod.TreeBased(num_features, estimator=XGBRegressor(**tree_params)),
+        "extra_clf": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesClassifier(**tree_params)),
+        "extra_reg": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesRegressor(**tree_params)),
+        "lgbm_clf": SelectionMethod.TreeBased(num_features, estimator=LGBMClassifier(**tree_params)),
+        "lgbm_reg": SelectionMethod.TreeBased(num_features, estimator=LGBMRegressor(**tree_params)),
+        "gradient_clf": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingClassifier(**tree_params)),
+        "gradient_reg": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingRegressor(**tree_params)),
+        "adaboost_clf": SelectionMethod.TreeBased(num_features, estimator=AdaBoostClassifier(**tree_params)),
+        "adaboost_reg": SelectionMethod.TreeBased(num_features, estimator=AdaBoostRegressor(**tree_params)),
+        "catboost_clf": SelectionMethod.TreeBased(num_features, estimator=CatBoostClassifier(**tree_params, silent=True)),
+        "catboost_reg": SelectionMethod.TreeBased(num_features, estimator=CatBoostRegressor(**tree_params, silent=True))
+    }

+    def test_benchmark_regression(self):
         data, label = get_data_label(load_boston())
         data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])

-        num_features = 3
-        corr_threshold = 0.5
-        alpha = 1000
-        tree_params = {"random_state": 123, "n_estimators": 100}
-
-        selectors = {
-            "corr_pearson": SelectionMethod.Correlation(corr_threshold, method="pearson"),
-            "corr_kendall": SelectionMethod.Correlation(corr_threshold, method="kendall"),
-            "corr_spearman": SelectionMethod.Correlation(corr_threshold, method="spearman"),
-            "univ_anova": SelectionMethod.Statistical(num_features, method="anova"),
-            "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"),
-            "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"),
-            "linear": SelectionMethod.Linear(num_features, regularization="none"),
-            "lasso": SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha),
-            "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha),
-            "random_forest": SelectionMethod.TreeBased(num_features),
-            "xgboost_clf": SelectionMethod.TreeBased(num_features, estimator=XGBClassifier(**tree_params)),
-            "xgboost_reg": SelectionMethod.TreeBased(num_features, estimator=XGBRegressor(**tree_params)),
-            "extra_clf": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesClassifier(**tree_params)),
-            "extra_reg": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesRegressor(**tree_params)),
-            "lgbm_clf": SelectionMethod.TreeBased(num_features, estimator=LGBMClassifier(**tree_params)),
-            "lgbm_reg": SelectionMethod.TreeBased(num_features, estimator=LGBMRegressor(**tree_params)),
-            "gradient_clf": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingClassifier(**tree_params)),
-            "gradient_reg": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingRegressor(**tree_params)),
-            "adaboost_clf": SelectionMethod.TreeBased(num_features, estimator=AdaBoostClassifier(**tree_params)),
-            "adaboost_reg": SelectionMethod.TreeBased(num_features, estimator=AdaBoostRegressor(**tree_params)),
-            "catboost_clf": SelectionMethod.TreeBased(num_features, estimator=CatBoostClassifier(**tree_params, silent=True)),
-            "catboost_reg": SelectionMethod.TreeBased(num_features, estimator=CatBoostRegressor(**tree_params, silent=True))
-        }
-
         # Benchmark
-        score_df, selected_df, runtime_df = benchmark(selectors, data, label, output_filename=None)
+        score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, output_filename=None)
         _ = calculate_statistics(score_df, selected_df)

         self.assertListAlmostEqual([0.4787777784012165, 0.47170429073431874, 0.5596288196730658, 0.4400410275414326, 0.5674082968785575],
@@ -86,42 +85,61 @@ def test_benchmark_regression(self):
         self.assertListAlmostEqual([0.10947144861974874, 0.020211076089938374, 0.08416074180466389, 0.045604950489313435, 0.7405517829963355],
                                    score_df["random_forest"].to_list())

-    def test_benchmark_classification(self):
+    def test_benchmark_regression_cv(self):
+        data, label = get_data_label(load_boston())
+        data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])

-        data, label = get_data_label(load_iris())
+        # Benchmark
+        score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, cv=5, output_filename=None)
+        _ = calculate_statistics(score_df, selected_df)
+
+        # Aggregate scores from different cv-folds
+        score_df = score_df.groupby(score_df.index).mean()
+
+        self.assertListAlmostEqual(
+            [0.5598624197527886, 0.43999689309372514, 0.47947203347292133, 0.5677393697964164, 0.4718904343871402],
+            score_df["corr_pearson"].to_list())
+
+        self.assertListAlmostEqual(
+            [0.5133150872001859, 0.33830236220280874, 0.5355471187677026, 0.4944995007684703, 0.4812959438381611],
+            score_df["corr_kendall"].to_list())
+
+        self.assertListAlmostEqual(
+            [0.6266784101694156, 0.3922216387923788, 0.6538541627239243, 0.598348546553966, 0.5537572894805117],
+            score_df["corr_spearman"].to_list())

-        num_features = 3
-        corr_threshold = 0.5
-        alpha = 1000
-        tree_params = {"random_state": 123, "n_estimators": 100}
-
-        selectors = {
-            "corr_pearson": SelectionMethod.Correlation(corr_threshold, method="pearson"),
-            "corr_kendall": SelectionMethod.Correlation(corr_threshold, method="kendall"),
-            "corr_spearman": SelectionMethod.Correlation(corr_threshold, method="spearman"),
-            "univ_anova": SelectionMethod.Statistical(num_features, method="anova"),
-            "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"),
-            "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"),
-            "linear": SelectionMethod.Linear(num_features, regularization="none"),
-            "lasso": SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha),
-            "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha),
-            "random_forest": SelectionMethod.TreeBased(num_features),
-            "xgboost_clf": SelectionMethod.TreeBased(num_features, estimator=XGBClassifier(**tree_params)),
-            "xgboost_reg": SelectionMethod.TreeBased(num_features, estimator=XGBRegressor(**tree_params)),
-            "extra_clf": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesClassifier(**tree_params)),
-            "extra_reg": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesRegressor(**tree_params)),
-            "lgbm_clf": SelectionMethod.TreeBased(num_features, estimator=LGBMClassifier(**tree_params)),
-            "lgbm_reg": SelectionMethod.TreeBased(num_features, estimator=LGBMRegressor(**tree_params)),
-            "gradient_clf": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingClassifier(**tree_params)),
-            "gradient_reg": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingRegressor(**tree_params)),
-            "adaboost_clf": SelectionMethod.TreeBased(num_features, estimator=AdaBoostClassifier(**tree_params)),
-            "adaboost_reg": SelectionMethod.TreeBased(num_features, estimator=AdaBoostRegressor(**tree_params)),
-            "catboost_clf": SelectionMethod.TreeBased(num_features, estimator=CatBoostClassifier(**tree_params, silent=True)),
-            "catboost_reg": SelectionMethod.TreeBased(num_features, estimator=CatBoostRegressor(**tree_params, silent=True))
-        }
+        self.assertListAlmostEqual(
+            [66.9096213925407, 50.470199216622746, 71.84642313219175, 481.0566386481166, 60.5346993182466],
+            score_df["univ_anova"].to_list())
+
+        self.assertListAlmostEqual([0, 0, 0, 0, 0],
+                                   score_df["univ_chi_square"].to_list())
+
+        self.assertListAlmostEqual(
+            [0.31315151982855777, 0.16552049446241074, 0.3376809619388398, 0.681986210957143, 0.18450178283973817],
+            score_df["univ_mutual_info"].to_list())
+
+        self.assertListAlmostEqual(
+            [0.06157747888912044, 0.006445566885590223, 0.06693250180688959, 0.9576028432508157, 0.053796504696545476],
+            score_df["linear"].to_list())
+
+        self.assertListAlmostEqual(
+            [0.05329389111187177, 0.007117077997740284, 0.054563375238215125, 0.9260391103473467, 0.05071613235478144],
+            score_df["lasso"].to_list())
+
+        self.assertListAlmostEqual(
+            [0.061567603158881413, 0.006446613222308434, 0.06694625250225411, 0.9575175129470551, 0.05379855880797472],
+            score_df["ridge"].to_list())
+
+        self.assertListAlmostEqual(
+            [0.07819877553940296, 0.04385018441841779, 0.11432712180337742, 0.7401304941703286, 0.023493424068473153],
+            score_df["random_forest"].to_list())
+
+    def test_benchmark_classification(self):
+        data, label = get_data_label(load_iris())

         # Benchmark
-        score_df, selected_df, runtime_df = benchmark(selectors, data, label, output_filename=None)
+        score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, output_filename=None)
         _ = calculate_statistics(score_df, selected_df)

         self.assertListAlmostEqual([0.7018161715727902, 0.47803395524999537, 0.8157648279049796, 0.7867331225527027],
@@ -153,3 +171,43 @@ def test_benchmark_classification(self):

         self.assertListAlmostEqual([0.09210348279677849, 0.03045933928742506, 0.4257647994615192, 0.45167237845427727],
                                    score_df["random_forest"].to_list())
+
+    def test_benchmark_classification_cv(self):
+        data, label = get_data_label(load_iris())
+
+        # Benchmark
+        score_df, selected_df, runtime_df = benchmark(self.selectors, data, label, cv=5, output_filename=None)
+        _ = calculate_statistics(score_df, selected_df)
+
+        # Aggregate scores from different cv-folds
+        score_df = score_df.groupby(score_df.index).mean()
+
+        self.assertListAlmostEqual([0.8161221983271784, 0.7871883928143776, 0.7020705184086643, 0.4793198034473529],
+                                   score_df["corr_pearson"].to_list())
+
+        self.assertListAlmostEqual([0.6780266710547757, 0.6550828618428932, 0.6125815664695313, 0.35594860548691776],
+                                   score_df["corr_kendall"].to_list())
+
+        self.assertListAlmostEqual([0.78225620681015, 0.7652859083343029, 0.7201874607448919, 0.44222588698925963],
+                                   score_df["corr_spearman"].to_list())
+
+        self.assertListAlmostEqual([946.9891701851375, 781.7441886012473, 95.65931730842011, 39.49994604080157],
+                                   score_df["univ_anova"].to_list())
+
+        self.assertListAlmostEqual([92.9884264821005, 53.62326775665224, 8.659084856298207, 2.9711267637858163],
+                                   score_df["univ_chi_square"].to_list())
+
+        self.assertListAlmostEqual([0.994113677302704, 0.9907696444894937, 0.4998955427118911, 0.2298786031192229],
+                                   score_df["univ_mutual_info"].to_list())
+
+        self.assertListAlmostEqual([0.22327603204146848, 0.03543066514916661, 0.26254667473769594, 0.506591069316828],
+                                   score_df["linear"].to_list())
+
+        self.assertListAlmostEqual([0.280393459805252, 0.9489351779830099, 0.6627768115497065, 0.4761878539373159],
+                                   score_df["lasso"].to_list())
+
+        self.assertListAlmostEqual([1.1049393460379105e-15, 2.0872192862952944e-15, 6.504056552595708e-16, 4.218847493575594e-16],
+                                   score_df["ridge"].to_list())
+
+        self.assertListAlmostEqual([0.4185294825699565, 0.4472560913161835, 0.10091608418224696, 0.03329834193161316],
+                                   score_df["random_forest"].to_list())
\ No newline at end of file
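
For orientation only, a minimal usage sketch of the new `cv` option introduced by this patch, modeled on the README example and the tests above. Only `SelectionMethod`, `benchmark`, `calculate_statistics`, and the new `cv`/`seed` arguments come from this patch; the `feature.utils` import path, the dataset loader, and the small `selectors` dictionary are assumptions for illustration.

```python
# Sketch only, not part of the patch. Assumes get_data_label lives in feature.utils
# (mirroring the tests) and that load_boston is available in the installed sklearn.
from sklearn.datasets import load_boston

from feature.utils import get_data_label
from feature.selector import SelectionMethod, benchmark, calculate_statistics

data, label = get_data_label(load_boston())

# A small subset of the selectors used in the README/tests
selectors = {
    "corr_pearson": SelectionMethod.Correlation(0.5, method="pearson"),
    "univ_anova": SelectionMethod.Statistical(3, method="anova"),
    "random_forest": SelectionMethod.TreeBased(3),
}

# cv=5 runs every selector on 5 shuffled KFold training splits (reproducible via seed),
# so each returned frame stacks one block of rows per fold.
score_df, selected_df, runtime_df = benchmark(selectors, data, label, cv=5, seed=123)

# calculate_statistics averages the per-fold rows of each feature before ranking.
stats_df = calculate_statistics(score_df, selected_df)
print(stats_df)
```

This is also why `calculate_statistics` and `plot_importance` now group scores by feature index before aggregating: with `cv` set, each feature appears once per fold in the benchmark output.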