diff --git a/DESCRIPTION b/DESCRIPTION
index c79f5928..3f17652e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -41,6 +41,7 @@ Suggests:
     mlr3learners,
     mlr3pipelines,
     rpart,
+    fastVoteR,
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
 Config/testthat/parallel: true
@@ -74,6 +75,7 @@ Collate:
     'assertions.R'
     'auto_fselector.R'
     'bibentries.R'
+    'embedded_ensemble_fselect.R'
     'ensemble_fselect.R'
     'extract_inner_fselect_archives.R'
     'extract_inner_fselect_results.R'
diff --git a/NAMESPACE b/NAMESPACE
index 81cfe545..a0536eef 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -36,6 +36,7 @@ export(auto_fselector)
 export(callback_batch_fselect)
 export(clbk)
 export(clbks)
+export(embedded_ensemble_fselect)
 export(ensemble_fselect)
 export(extract_inner_fselect_archives)
 export(extract_inner_fselect_results)
diff --git a/NEWS.md b/NEWS.md
index c456706a..46b078bc 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # mlr3fselect (development version)
 
+* Use [fastVoteR](https://github.com/bblodfon/fastVoteR) for feature ranking in `EnsembleFSResult()` objects
+* Add embedded ensemble feature selection `embedded_ensemble_fselect()`
+* Refactor `ensemble_fselect()` and `EnsembleFSResult()`
+
 # mlr3fselect 1.2.1
 
 * compatibility: mlr3 0.22.0
diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R
index 4b792738..fed296f2 100644
--- a/R/EnsembleFSResult.R
+++ b/R/EnsembleFSResult.R
@@ -5,7 +5,8 @@
 #' @description
 #' The `EnsembleFSResult` stores the results of ensemble feature selection.
 #' It includes methods for evaluating the stability of the feature selection process and for ranking the selected features among others.
-#' The function [ensemble_fselect()] returns an object of this class.
+#'
+#' Both functions [ensemble_fselect()] and [embedded_ensemble_fselect()] return an object of this class.
 #'
 #' @section S3 Methods:
 #' * `as.data.table.EnsembleFSResult(x, benchmark_result = TRUE)`\cr
@@ -16,7 +17,7 @@
 #' Whether to add the learner, task and resampling information from the benchmark result.
 #'
 #' @references
-#' `r format_bib("das1999")`
+#' `r format_bib("das1999", "meinshausen2010")`
 #'
 #' @export
 #' @examples
@@ -27,7 +28,8 @@
 #'     learners = lrns(c("classif.rpart", "classif.featureless")),
 #'     init_resampling = rsmp("subsampling", repeats = 2),
 #'     inner_resampling = rsmp("cv", folds = 3),
-#'     measure = msr("classif.ce"),
+#'     inner_measure = msr("classif.ce"),
+#'     measure = msr("classif.acc"),
 #'     terminator = trm("none")
 #'   )
 #'
@@ -43,7 +45,16 @@
 #'   # returns a ranking of all features
 #'   head(efsr$feature_ranking())
 #'
-#'   # returns the empirical pareto front (nfeatures vs error)
+#'   # returns the empirical pareto front, i.e. n_features vs measure (error)
+#'   efsr$pareto_front()
+#'
+#'   # returns the knee points (optimal trade-off between n_features and performance)
+#'   efsr$knee_points()
+#'
+#'   # change to use the inner optimization measure
+#'   efsr$set_active_measure(which = "inner")
+#'
+#'   # Pareto front is calculated on the inner measure
+#'   efsr$pareto_front()
 #' }
 EnsembleFSResult = R6Class("EnsembleFSResult",
@@ -62,26 +73,52 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
     #'
     #' @param result ([data.table::data.table])\cr
     #'  The result of the ensemble feature selection.
-    #'  Column names should include `"resampling_iteration"`, `"learner_id"`, `"features"`
-    #'  and `"n_features"`.
+    #'  Mandatory column names should include `"resampling_iteration"`, `"learner_id"`,
+    #'  `"features"` and `"n_features"`.
+    #'  A column named `{measure$id}` (the scores on the test sets) must
+    #'  always be present as well.
+    #'  A column with the performance scores from the inner resampling of the train sets is optional;
+    #'  if present, it must be named `{inner_measure$id}_inner` to distinguish it from
+    #'  `{measure$id}`.
     #' @param features ([character()])\cr
     #'  The vector of features of the task that was used in the ensemble feature
     #'  selection.
     #' @param benchmark_result ([mlr3::BenchmarkResult])\cr
     #'  The benchmark result object.
-    #' @param measure_id (`character(1)`)\cr
-    #'  Column name of `"result"` that corresponds to the measure used.
-    #' @param minimize (`logical(1)`)\cr
-    #'  If `TRUE` (default), lower values of the measure correspond to higher performance.
-    initialize = function(result, features, benchmark_result = NULL, measure_id,
-                          minimize = TRUE) {
+    #' @param measure ([mlr3::Measure])\cr
+    #'  The performance measure used to evaluate the learners on the test sets generated
+    #'  during the ensemble feature selection process.
+    #'  By default, this serves as the 'active' measure for the methods of this object.
+    #'  The active measure can be updated using the `$set_active_measure()` method.
+    #' @param inner_measure ([mlr3::Measure])\cr
+    #'  The performance measure used to optimize and evaluate the learners during the inner resampling process of the training sets, generated as part of the ensemble feature selection procedure.
+    initialize = function(
+      result,
+      features,
+      benchmark_result = NULL,
+      measure,
+      inner_measure = NULL
+    ) {
       assert_data_table(result)
-      private$.measure_id = assert_string(measure_id, null.ok = FALSE)
-      mandatory_columns = c("resampling_iteration", "learner_id", "features", "n_features")
-      assert_names(names(result), must.include = c(mandatory_columns, measure_id))
+      private$.measure = assert_measure(measure)
+      private$.active_measure = "outer"
+      measure_ids = c(private$.measure$id)
+      if (!is.null(inner_measure)) {
+        private$.inner_measure = assert_measure(inner_measure)
+        # the inner measure id gets a special "_inner" suffix
+        measure_ids = c(measure_ids, sprintf("%s_inner", private$.inner_measure$id))
+      }
+
+      # the non-NULL measure ids must be present as columns in the result data.table
+      mandatory_columns = c("resampling_iteration", "learner_id", "features",
+        "n_features", measure_ids)
+      assert_names(names(result), must.include = mandatory_columns)
       private$.result = result
       private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE)
-      private$.minimize = assert_logical(minimize, null.ok = FALSE)
+
+      # check that all feature sets are subsets of the task features
+      assert_subset(unlist(result$features), private$.features)
+
       self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result)
       self$man = "mlr3fselect::ensemble_fs_result"
@@ -99,7 +136,8 @@
     #'
     #' @param ... (ignored).
     print = function(...) {
-      catf(format(self))
+      catf("%s with %s learners and %s initial resamplings",
+           format(self), self$n_learners, self$n_resamples)
       print(private$.result[, c("resampling_iteration", "learner_id", "n_features"), with = FALSE])
     },
 
@@ -110,43 +148,102 @@
     },
 
     #' @description
-    #' Calculates the feature ranking.
+    #' Use this function to change the active measure.
+    #'
+    #' @param which (`character(1)`)\cr
+    #'  Which [measure][mlr3::Measure] from the ensemble feature selection result
+    #'  to use in methods of this object.
+    #'  Should be either `"inner"` (optimization measure used in training sets)
+    #'  or `"outer"` (measure used in test sets, default value).
+    set_active_measure = function(which = "outer") {
+      assert_choice(which, c("inner", "outer"))
+
+      # check if `inner_measure` is an `mlr3::Measure`
+      if (which == "inner" && is.null(private$.inner_measure)) {
+        stop("No inner_measure was defined during initialization")
+      }
+
+      private$.active_measure = which
+    },
+
+    #' @description
+    #' Calculates the feature ranking via [fastVoteR::rank_candidates()].
     #'
     #' @details
-    #' The feature ranking process is built on the following framework: models act as voters, features act as candidates, and voters select certain candidates (features).
+    #' The feature ranking process is built on the following framework: models act as *voters*, features act as *candidates*, and voters select certain candidates (features).
     #' The primary objective is to compile these selections into a consensus ranked list of features, effectively forming a committee.
-    #' Currently, only `"approval_voting"` method is supported, which selects the candidates/features that have the highest approval score or selection frequency, i.e. appear the most often.
+    #'
+    #' For every feature a score is calculated, which depends on the `"method"` argument.
+    #' The higher the score, the higher the ranking of the feature.
+    #' Note that some methods output a feature ranking instead of a score per feature, so we always include **Borda's score**, which is method-agnostic, i.e. it can be used to compare the feature rankings across different methods.
+    #'
+    #' We shuffle the input candidates/features to enforce random tie-breaking.
+    #' Users should set the same `seed` for consistent comparison between the different feature ranking methods and for reproducibility.
     #'
     #' @param method (`character(1)`)\cr
-    #'  The method to calculate the feature ranking.
+    #'  The method to calculate the feature ranking. See [fastVoteR::rank_candidates()]
+    #'  for a complete list of available methods.
+    #'  Approval voting (`"av"`) is the default method.
+    #' @param use_weights (`logical(1)`)\cr
+    #'  The default value (`TRUE`) uses weights equal to the performance scores
+    #'  of each voter/model (or the inverse scores if the measure is minimized).
+    #'  If `FALSE`, we treat all voters as equal and assign them all a weight equal to 1.
+    #' @param committee_size (`integer(1)`)\cr
+    #'  Number of top selected features in the output ranking.
+    #'  This parameter can be used to speed up methods that build a committee sequentially
+    #'  (`"seq_pav"`), by requesting only the top N selected candidates/features
+    #'  and not the complete feature ranking.
+    #' @param shuffle_features (`logical(1)`)\cr
+    #'  Whether to shuffle the task features randomly before computing the ranking.
+    #'  Shuffling ensures consistent random tie-breaking across methods and prevents
+    #'  deterministic biases when features with equal scores are encountered.
+    #'  Default is `TRUE` and it's advised to set a seed before running this function.
+    #'  Set to `FALSE` if deterministic ordering of features is preferred (same as
+    #'  during initialization).
     #'
-    #' @return A [data.table::data.table] listing all the features, ordered by decreasing inclusion probability scores (depending on the `method`)
-    feature_ranking = function(method = "approval_voting") {
-      assert_choice(method, choices = "approval_voting")
-
-      # cached results
-      if (!is.null(private$.feature_ranking[[method]])) {
-        return(private$.feature_ranking[[method]])
+    #' @return A [data.table::data.table] listing all the features, ordered by decreasing scores (depending on the `"method"`). Columns are as follows:
+    #' - `"feature"`: Feature names.
+    #' - `"score"`: Scores assigned to each feature based on the selected method (if applicable).
+    #' - `"norm_score"`: Normalized scores (if applicable), scaled to the range \eqn{[0,1]}, which can be loosely interpreted as **selection probabilities** (Meinshausen et al. (2010)).
+    #' - `"borda_score"`: Borda scores for method-agnostic comparison, ranging in \eqn{[0,1]}, where the top feature receives a score of 1 and the lowest-ranked feature receives a score of 0.
+    #' This column is always included so that feature ranking methods that output only rankings also have a feature-wise score.
+    #'
+    feature_ranking = function(method = "av", use_weights = TRUE, committee_size = NULL, shuffle_features = TRUE) {
+      requireNamespace("fastVoteR")
+
+      # candidates => all features, voters => list of selected (best) feature sets
+      candidates = private$.features
+      voters = private$.result$features
+
+      # calculate weights
+      if (use_weights) {
+        # voter weights are the (inverse) scores
+        measure = self$measure # get active measure
+        measure_id = ifelse(private$.active_measure == "inner",
+          sprintf("%s_inner", measure$id),
+          measure$id)
+
+        scores = private$.result[, get(measure_id)]
+        weights = if (measure$minimize) 1 / scores else scores
+      } else {
+        # all voters are equal
+        weights = rep(1, length(voters))
       }
 
-      count_tbl = sort(table(unlist(private$.result$features)), decreasing = TRUE)
-      features_selected = names(count_tbl)
-      features_not_selected = setdiff(private$.features, features_selected)
-
-      res_fs = data.table(
-        feature = features_selected,
-        inclusion_probability = as.vector(count_tbl) / nrow(private$.result)
+      # get consensus feature ranking
+      res = fastVoteR::rank_candidates(
+        voters = voters,
+        candidates = candidates,
+        weights = weights,
+        committee_size = committee_size,
+        method = method,
+        borda_score = TRUE,
+        shuffle_candidates = shuffle_features
       )
 
-      res_fns = data.table(
-        feature = features_not_selected,
-        inclusion_probability = 0
-      )
+      setnames(res, "candidate", "feature")
 
-      res = rbindlist(list(res_fs, res_fns))
-
-      private$.feature_ranking[[method]] = res
-      private$.feature_ranking[[method]]
+      res
     },
 
     #' @description
@@ -222,8 +319,11 @@
     pareto_front = function(type = "empirical") {
       assert_choice(type, choices = c("empirical", "estimated"))
       result = private$.result
-      measure_id = private$.measure_id
-      minimize = private$.minimize
+      measure = self$measure # get active measure
+      measure_id = ifelse(private$.active_measure == "inner",
+        sprintf("%s_inner", measure$id),
+        measure$id)
+      minimize = measure$minimize
 
       # Keep only n_features and performance scores
       cols_to_keep = c("n_features", measure_id)
@@ -261,6 +361,8 @@
       # Transform the data (x => 1/x)
       n_features_inv = NULL
       pf[, n_features_inv := 1 / n_features]
+      # remove edge cases where no features were selected
+      pf = pf[n_features > 0]
 
       # Fit the linear model
       form = mlr3misc::formulate(lhs = measure_id, rhs = "n_features_inv")
@@ -298,8 +400,11 @@
     knee_points = function(method = "NBI", type = "empirical") {
       assert_choice(method, choices = c("NBI"))
       assert_choice(type, choices = c("empirical", "estimated"))
-      measure_id = private$.measure_id
-      minimize = private$.minimize
+      measure = self$measure # get active measure
+      measure_id = ifelse(private$.active_measure == "inner",
+        sprintf("%s_inner", measure$id),
+        measure$id)
+      minimize = measure$minimize
 
       pf = if (type == "empirical") self$pareto_front() else self$pareto_front(type = "estimated")
@@ -346,11 +451,36 @@
       uniqueN(private$.result$learner_id)
     },
 
-    #' @field measure (`character(1)`)\cr
-    #'  Returns the measure id used in the ensemble feature selection.
+    #' @field measure ([mlr3::Measure])\cr
+    #'  Returns the 'active' measure that is used in methods of this object.
     measure = function(rhs) {
      assert_ro_binding(rhs)
-      private$.measure_id
+
+      if (private$.active_measure == "outer") {
+        private$.measure
+      } else {
+        private$.inner_measure
+      }
+    },
+
+    #' @field active_measure (`character(1)`)\cr
+    #'  Indicates the type of the active performance measure.
+    #'
+    #'  During the ensemble feature selection process, the dataset is split into **multiple subsamples** (train/test splits) using an initial resampling scheme.
+    #'  So, performance can be evaluated using one of two measures:
+    #'
+    #'  - `"outer"`: measure used to evaluate the performance on the test sets.
+    #'  - `"inner"`: measure used for optimization and to compute performance during inner resampling on the training sets.
+    active_measure = function(rhs) {
+      assert_ro_binding(rhs)
+      private$.active_measure
+    },
+
+    #' @field n_resamples (`integer(1)`)\cr
+    #'  Returns the number of times the task was initially resampled in the ensemble feature selection process.
+    n_resamples = function(rhs) {
+      assert_ro_binding(rhs)
+      uniqueN(self$result$resampling_iteration)
    }
   ),
 
@@ -358,14 +488,14 @@
     .result = NULL, # with no R6 classes
     .stability_global = NULL,
     .stability_learner = NULL,
-    .feature_ranking = NULL,
     .features = NULL,
-    .measure_id = NULL,
-    .minimize = NULL
+    .measure = NULL,
+    .inner_measure = NULL,
+    .active_measure = NULL
   )
 )
 
 #' @export
-as.data.table.EnsembleFSResult = function(x, ...) {
+as.data.table.EnsembleFSResult = function(x, ...) {
   x$result
 }
diff --git a/R/bibentries.R b/R/bibentries.R
index d3787ad6..65a36a82 100644
--- a/R/bibentries.R
+++ b/R/bibentries.R
@@ -9,7 +9,6 @@ bibentries = c(
     title = "ecr 2.0",
     booktitle = "Proceedings of the Genetic and Evolutionary Computation Conference Companion"
   ),
-
   bergstra_2012 = bibentry("article",
     title = "Random Search for Hyper-Parameter Optimization",
     author = "James Bergstra and Yoshua Bengio",
     volume = "13",
     month = "2",
     year = "2012",
     journal = "Journal of Machine Learning Research",
     pages = "281--305",
     url = "https://jmlr.csail.mit.edu/papers/v13/bergstra12a.html"
   ),
-
-  thomas2017 = bibentry("article",
+  thomas2017 = bibentry("article",
     doi = "10.1155/2017/1421409",
     year = "2017",
     publisher = "Hindawi Limited",
     volume = "2017",
     author = "Janek Thomas and Tobias Hepp and Andreas Mayr and Bernd Bischl",
     title = "Probing for Sparse and Fast Variable Selection with Model-Based Boosting",
     journal = "Computational and Mathematical Methods in Medicine"
   ),
-
-  wu2007 = bibentry("article",
+  wu2007 = bibentry("article",
     doi = "10.1198/016214506000000843",
     year = "2007",
     month = "3",
     publisher = "Informa UK Limited",
     volume = "102",
     number = "477",
     pages = "235--243",
     author = "Yujun Wu and Dennis D. Boos and Leonard A. Stefanski",
     title = "Controlling Variable Selection by the Addition of Pseudovariables",
     journal = "Journal of the American Statistical Association"
   ),
-
-  guyon2002 = bibentry("article",
+  guyon2002 = bibentry("article",
     title = "Gene Selection for Cancer Classification using Support Vector Machines",
     volume = "46",
     issn = "1573-0565",
     doi = "10.1023/A:1012487302797",
     journal = "Machine Learning",
     pages = "389--422",
     author = "Isabelle Guyon and Jason Weston and Stephen Barnhill and Vladimir Vapnik",
     year = "2002"
   ),
-
   kuhn2013 = bibentry("Inbook",
     author = "Kuhn, Max and Johnson, Kjell",
     chapter = "Over-Fitting and Model Tuning",
     title = "Applied Predictive Modeling",
     publisher = "Springer New York",
     address = "New York, NY",
     pages = "61--92",
     isbn = "978-1-4614-6849-3"
   ),
-
   saeys2008 = bibentry("article",
     author = "Saeys, Yvan and Abeel, Thomas and Van De Peer, Yves",
     doi = "10.1007/978-3-540-87481-2_21",
     issn = "03029743",
     journal = "Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)",
     pages = "313--325",
     title = "Robust feature selection using ensemble feature selection techniques",
     volume = "5212 LNAI",
     year = "2008"
   ),
-
   abeel2010 = bibentry("article",
     author = "Abeel, Thomas and Helleputte, Thibault and Van de Peer, Yves and Dupont, Pierre and Saeys, Yvan",
     doi = "10.1093/BIOINFORMATICS/BTP630",
     issn = "13674803",
     journal = "Bioinformatics",
     number = "3",
     pages = "392--398",
     publisher = "Oxford Academic",
     title = "Robust biomarker identification for cancer diagnosis with ensemble feature selection methods",
     volume = "26",
     year = "2010"
   ),
-
   pes2020 = bibentry("article",
     author = "Pes, Barbara",
     doi = "10.1007/s00521-019-04082-3",
     issn = "14333058",
     journal = "Neural Computing and Applications",
     number = "10",
     pages = "5951--5973",
     publisher = "Springer",
     title = "Ensemble feature selection for high-dimensional data: a stability analysis across multiple domains",
     volume = "32",
     year = "2020"
   ),
-
   das1999 = bibentry("article",
     author = "Das, I",
     issn = "09344373",
     journal = "Structural Optimization",
     number = "1-2",
     pages = "107--115",
     publisher = "Springer",
     title = "On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection",
     volume = "18",
     year = "1999"
+  ),
+  meinshausen2010 = bibentry("article",
+    author = "Meinshausen, Nicolai and Buhlmann, Peter",
+    doi = "10.1111/J.1467-9868.2010.00740.X",
+    eprint = "0809.2932",
+    issn = "1369-7412",
+    journal = "Journal of the Royal Statistical Society Series B: Statistical Methodology",
+    month = "sep",
+    number = "4",
+    pages = "417--473",
+    publisher = "Oxford Academic",
+    title = "Stability Selection",
+    volume = "72",
+    year = "2010"
+  ),
+  hedou2024 = bibentry("article",
+    author = "Hedou, Julien and Maric, Ivana and Bellan, Gregoire and Einhaus, Jakob and Gaudilliere, Dyani K. and Ladant, Francois Xavier and Verdonk, Franck and Stelzer, Ina A. and Feyaerts, Dorien and Tsai, Amy S. and Ganio, Edward A. and Sabayev, Maximilian and Gillard, Joshua and Amar, Jonas and Cambriel, Amelie and Oskotsky, Tomiko T. and Roldan, Alennie and Golob, Jonathan L. and Sirota, Marina and Bonham, Thomas A. and Sato, Masaki and Diop, Maigane and Durand, Xavier and Angst, Martin S. and Stevenson, David K. and Aghaeepour, Nima and Montanari, Andrea and Gaudilliere, Brice", #nolint
+    doi = "10.1038/s41587-023-02033-x",
+    issn = "1546-1696",
+    journal = "Nature Biotechnology 2024",
+    month = "jan",
+    pages = "1--13",
+    publisher = "Nature Publishing Group",
+    title = "Discovery of sparse, reliable omic biomarkers with Stabl",
+    url = "https://www.nature.com/articles/s41587-023-02033-x",
+    year = "2024"
   )
 )
diff --git a/R/embedded_ensemble_fselect.R b/R/embedded_ensemble_fselect.R
new file mode 100644
index 00000000..8852d760
--- /dev/null
+++ b/R/embedded_ensemble_fselect.R
@@ -0,0 +1,112 @@
+#' @title Embedded Ensemble Feature Selection
+#'
+#' @include CallbackBatchFSelect.R
+#'
+#' @description
+#' Ensemble feature selection using multiple learners.
+#' The ensemble feature selection method is designed to identify the most predictive features from a given dataset by leveraging multiple machine learning models and resampling techniques.
+#' Returns an [EnsembleFSResult].
+#'
+#' @details
+#' The method begins by applying an initial resampling technique specified by the user, to create **multiple subsamples** from the original dataset (train/test splits).
+#' This resampling process helps in generating diverse subsets of data for robust feature selection.
+#'
+#' For each subsample (train set) generated in the previous step, the method trains learners
+#' that support **embedded feature selection**.
+#' The features selected during training are stored, and each trained model is
+#' then scored on its ability to predict on the resampled test set, for each
+#' combination of subsample and learner.
+#'
+#' Results are stored in an [EnsembleFSResult].
+#'
+#' @param learners (list of [mlr3::Learner])\cr
+#'  The learners to be used for feature selection.
+#'  All learners must have the `selected_features` property, i.e. implement
+#'  embedded feature selection (e.g. regularized models).
+#' @param init_resampling ([mlr3::Resampling])\cr
+#'  The initial resampling strategy of the data, from which each train set
+#'  will be passed on to the learners and each test set will be used for
+#'  prediction.
+#'  Can only be [mlr3::ResamplingSubsampling] or [mlr3::ResamplingBootstrap].
+#' @param measure ([mlr3::Measure])\cr
+#'  The measure used to score each learner on the test sets generated by
+#'  `init_resampling`.
+#'  If `NULL`, default measure is used.
+#' @param store_benchmark_result (`logical(1)`)\cr
+#'  Whether to store the benchmark result in [EnsembleFSResult] or not.
+#'
+#' @template param_task
+#'
+#' @returns an [EnsembleFSResult] object.
+#'
+#' @source
+#' `r format_bib("meinshausen2010", "hedou2024")`
+#' @export
+#' @examples
+#' \donttest{
+#'   eefsr = embedded_ensemble_fselect(
+#'     task = tsk("sonar"),
+#'     learners = lrns(c("classif.rpart", "classif.featureless")),
+#'     init_resampling = rsmp("subsampling", repeats = 5),
+#'     measure = msr("classif.ce")
+#'   )
+#'   eefsr
+#' }
+embedded_ensemble_fselect = function(
+  task,
+  learners,
+  init_resampling,
+  measure,
+  store_benchmark_result = TRUE
+  ) {
+  assert_task(task)
+  assert_learners(as_learners(learners), task = task, properties = "selected_features")
+  assert_resampling(init_resampling)
+  assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling"))
+  assert_measure(measure, task = task)
+  assert_flag(store_benchmark_result)
+
+  init_resampling$instantiate(task)
+
+  design = benchmark_grid(
+    tasks = task,
+    learners = learners,
+    resamplings = init_resampling
+  )
+
+  bmr = benchmark(design, store_models = TRUE)
+
+  trained_learners = bmr$score()$learner
+
+  # extract selected features
+  features = map(trained_learners, function(learner) {
+    learner$selected_features()
+  })
+
+  # extract n_features
+  n_features = map_int(features, length)
+
+  # extract scores on the test sets
+  scores = bmr$score(measure)
+
+  set(scores, j = "features", value = features)
+  set(scores, j = "n_features", value = n_features)
+  setnames(scores, "iteration", "resampling_iteration")
+
+  # remove R6 objects
+  set(scores, j = "learner", value = NULL)
+  set(scores, j = "task", value = NULL)
+  set(scores, j = "resampling", value = NULL)
+  set(scores, j = "prediction_test", value = NULL)
+  set(scores, j = "task_id", value = NULL)
+  set(scores, j = "nr", value = NULL)
+  set(scores, j = "resampling_id", value = NULL)
+  set(scores, j = "uhash", value = NULL)
+
+  EnsembleFSResult$new(
+    result = scores,
+    features = task$feature_names,
+    benchmark_result = if (store_benchmark_result) bmr,
+    measure = measure
+  )
+}
diff --git a/R/ensemble_fselect.R b/R/ensemble_fselect.R
index ed1aceb3..12be4636 100644
--- a/R/ensemble_fselect.R
+++ b/R/ensemble_fselect.R
@@ -1,39 +1,55 @@
-#' @title Ensemble Feature Selection
+#' @title Wrapper-based Ensemble Feature Selection
 #'
 #' @include CallbackBatchFSelect.R
 #'
 #' @description
 #' Ensemble feature selection using multiple learners.
-#' The ensemble feature selection method is designed to identify the most informative features from a given dataset by leveraging multiple machine learning models and resampling techniques.
+#' The ensemble feature selection method is designed to identify the most predictive features from a given dataset by leveraging multiple machine learning models and resampling techniques.
 #' Returns an [EnsembleFSResult].
 #'
 #' @details
-#' The method begins by applying an initial resampling technique specified by the user, to create **multiple subsamples** from the original dataset.
+#' The method begins by applying an initial resampling technique specified by the user, to create **multiple subsamples** from the original dataset (train/test splits).
 #' This resampling process helps in generating diverse subsets of data for robust feature selection.
 #'
-#' For each subsample generated in the previous step, the method performs **wrapped-based feature selection** ([auto_fselector]) using each provided learner, the given inner resampling method, performance measure and optimization algorithm.
-#' This process generates the best feature subset for each combination of subsample and learner.
+#' For each subsample (train set) generated in the previous step, the method performs **wrapper-based feature selection** ([auto_fselector]) using each provided learner, the given inner resampling method, inner performance measure and optimization algorithm.
+#' This process generates 1) the best feature subset and 2) a final trained model using these best features, for each combination of subsample and learner.
+#' The final models are then scored on their ability to predict on the resampled test sets.
+#'
 #' Results are stored in an [EnsembleFSResult].
 #'
+#' The result object also includes the performance scores calculated during the inner resampling of the training sets, using models with the best feature subsets.
+#' These scores are stored in a column named `{measure_id}_inner`.
+#'
+#' @section Note:
+#'
+#' The **active measure** of performance is the one applied to the test sets.
+#' This is preferred, as inner resampling scores on the training sets are likely to be overestimated when using the final models.
+#' Users can change the active measure by using the `set_active_measure()` method of the [EnsembleFSResult].
+#'
 #' @param learners (list of [mlr3::Learner])\cr
 #'  The learners to be used for feature selection.
 #' @param init_resampling ([mlr3::Resampling])\cr
 #'  The initial resampling strategy of the data, from which each train set
-#'  will be passed on to the learners.
+#'  will be passed on to the [auto_fselector] to optimize the learners and
+#'  perform feature selection.
+#'  Each test set will be used for prediction on the final models returned by [auto_fselector].
 #'  Can only be [mlr3::ResamplingSubsampling] or [mlr3::ResamplingBootstrap].
 #' @param inner_resampling ([mlr3::Resampling])\cr
 #'  The inner resampling strategy used by the [FSelector].
+#' @param inner_measure ([mlr3::Measure])\cr
+#'  The inner optimization measure used by the [FSelector].
+#' @param measure ([mlr3::Measure])\cr
+#'  Measure used to score each trained learner on the test sets generated by `init_resampling`.
 #' @param store_benchmark_result (`logical(1)`)\cr
 #'  Whether to store the benchmark result in [EnsembleFSResult] or not.
 #' @param store_models (`logical(1)`)\cr
 #'  Whether to store models in [auto_fselector] or not.
-#' @param callbacks (list of lists of [CallbackBatchFSelect])\cr
+#' @param callbacks (Named list of lists of [CallbackBatchFSelect])\cr
 #'  Callbacks to be used for each learner.
-#'  The lists must have the same length as the number of learners.
+#'  The lists must be named by the learner ids.
 #'
 #' @template param_fselector
 #' @template param_task
-#' @template param_measure
 #' @template param_terminator
 #'
 #' @returns an [EnsembleFSResult] object.
@@ -49,7 +65,8 @@
 #'     learners = lrns(c("classif.rpart", "classif.featureless")),
 #'     init_resampling = rsmp("subsampling", repeats = 2),
 #'     inner_resampling = rsmp("cv", folds = 3),
-#'     measure = msr("classif.ce"),
+#'     inner_measure = msr("classif.ce"),
+#'     measure = msr("classif.acc"),
 #'     terminator = trm("evals", n_evals = 10)
 #'   )
 #'   efsr
@@ -60,49 +77,43 @@ ensemble_fselect = function(
   learners,
   init_resampling,
   inner_resampling,
+  inner_measure,
   measure,
   terminator,
   callbacks = NULL,
   store_benchmark_result = TRUE,
-  store_models = TRUE
+  store_models = FALSE
   ) {
   assert_task(task)
   assert_learners(as_learners(learners), task = task)
   assert_resampling(init_resampling)
   assert_choice(class(init_resampling)[1], choices = c("ResamplingBootstrap", "ResamplingSubsampling"))
-  assert_list(callbacks, types = "list", len = length(learners), null.ok = TRUE)
+  assert_resampling(inner_resampling)
+  assert_measure(inner_measure, task = task)
+  assert_measure(measure, task = task)
+  callbacks = map(callbacks, function(callbacks) assert_callbacks(as_callbacks(callbacks)))
+  if (length(callbacks)) assert_names(names(callbacks), subset.of = map_chr(learners, "id"))
   assert_flag(store_benchmark_result)
+  assert_flag(store_models)
 
   # create auto_fselector for each learner
-  afss = imap(unname(learners), function(learner, i) {
+  afss = map(learners, function(learner) {
     auto_fselector(
       fselector = fselector,
       learner = learner,
       resampling = inner_resampling,
-      measure = measure,
+      measure = inner_measure,
       terminator = terminator,
       store_models = store_models,
-      callbacks = callbacks[[i]]
-    )
-  })
-
-  init_resampling$instantiate(task)
-  grid = map_dtr(seq(init_resampling$iters), function(i) {
-
-    # create task and resampling for each outer iteration
-    task_subset = task$clone()$filter(init_resampling$train_set(i))
-    resampling = rsmp("insample")$instantiate(task_subset)
-
-    data.table(
-      resampling_iteration = i,
-      learner_id = map_chr(learners, "id"),
-      learner = afss,
-      task = list(task_subset),
-      resampling = list(resampling)
+      callbacks = callbacks[[learner$id]]
     )
   })
 
-  design = grid[, c("learner", "task", "resampling"), with = FALSE]
+  design = benchmark_grid(
+    tasks = task,
+    learners = afss,
+    resamplings = init_resampling
+  )
 
   bmr = benchmark(design, store_models = TRUE)
 
@@ -118,31 +129,42 @@
     afs$fselect_result$n_features[[1]]
   })
 
-  # extract scores
-  scores = map_dbl(afss, function(afs) {
-    afs$fselect_instance$archive$best()[, measure$id, with = FALSE][[1]]
+  # extract inner scores
+  inner_scores = map_dbl(afss, function(afs) {
+    afs$fselect_instance$archive$best()[, inner_measure$id, with = FALSE][[1]]
   })
 
-  set(grid, j = "features", value = features)
-  set(grid, j = "n_features", value = n_features)
-  set(grid, j = measure$id, value = scores)
+  # extract scores on the test sets
+  scores = bmr$score(measure)
+
+  set(scores, j = "features", value = features)
+  set(scores, j = "n_features", value = n_features)
+  set(scores, j = sprintf("%s_inner", inner_measure$id), value = inner_scores)
+  setnames(scores, "iteration", "resampling_iteration")
+
+  # remove R6 objects
+  set(scores, j = "learner", value = NULL)
+  set(scores, j = "task", value = NULL)
+  set(scores, j = "resampling", value = NULL)
+  set(scores, j = "prediction_test", value = NULL)
+  set(scores, j = "task_id", value = NULL)
+  set(scores, j = "nr", value = NULL)
+  set(scores, j = "resampling_id", value = NULL)
+  set(scores, j = "uhash", value = NULL)
 
   # extract importance scores if RFE optimization was used
   if (class(fselector)[1] == "FSelectorBatchRFE") {
     imp_scores = map(afss, function(afs) {
       afs$fselect_result$importance[[1]]
     })
-    set(grid, j = "importance", value = imp_scores)
+    set(scores, j = "importance", value = imp_scores)
   }
 
-  set(grid, j = "learner", value = NULL)
-  set(grid, j = "task", value = NULL)
-  set(grid, j = "resampling", value = NULL)
 
   EnsembleFSResult$new(
-    result = grid,
+    result = scores,
     features = task$feature_names,
     benchmark_result = if (store_benchmark_result) bmr,
-    measure_id = measure$id,
-    minimize = measure$minimize
+    measure = measure,
+    inner_measure = inner_measure
   )
 }
diff --git a/man/embedded_ensemble_fselect.Rd b/man/embedded_ensemble_fselect.Rd
new file mode 100644
index 00000000..0f580583
--- /dev/null
+++ b/man/embedded_ensemble_fselect.Rd
@@ -0,0 +1,79 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/embedded_ensemble_fselect.R
+\name{embedded_ensemble_fselect}
+\alias{embedded_ensemble_fselect}
+\title{Embedded Ensemble Feature Selection}
+\source{
+Meinshausen, Nicolai, Buhlmann, Peter (2010).
+\dQuote{Stability Selection.}
+\emph{Journal of the Royal Statistical Society Series B: Statistical Methodology}, \bold{72}(4), 417--473.
+ISSN 1369-7412, \doi{10.1111/J.1467-9868.2010.00740.X}, 0809.2932.
+
+Hedou, Julien, Maric, Ivana, Bellan, Gregoire, Einhaus, Jakob, Gaudilliere, K. D, Ladant, Xavier F, Verdonk, Franck, Stelzer, A. I, Feyaerts, Dorien, Tsai, S. A, Ganio, A. E, Sabayev, Maximilian, Gillard, Joshua, Amar, Jonas, Cambriel, Amelie, Oskotsky, T. T, Roldan, Alennie, Golob, L. J, Sirota, Marina, Bonham, A. T, Sato, Masaki, Diop, Maigane, Durand, Xavier, Angst, S. M, Stevenson, K. D, Aghaeepour, Nima, Montanari, Andrea, Gaudilliere, Brice (2024).
+\dQuote{Discovery of sparse, reliable omic biomarkers with Stabl.}
+\emph{Nature Biotechnology 2024}, 1--13.
+ISSN 1546-1696, \doi{10.1038/s41587-023-02033-x}, \url{https://www.nature.com/articles/s41587-023-02033-x}.
+}
+\usage{
+embedded_ensemble_fselect(
+  task,
+  learners,
+  init_resampling,
+  measure,
+  store_benchmark_result = TRUE
+)
+}
+\arguments{
+\item{task}{(\link[mlr3:Task]{mlr3::Task})\cr
+Task to operate on.}
+
+\item{learners}{(list of \link[mlr3:Learner]{mlr3::Learner})\cr
+The learners to be used for feature selection.
+All learners must have the \code{selected_features} property, i.e. implement
+embedded feature selection (e.g. regularized models).}
+
+\item{init_resampling}{(\link[mlr3:Resampling]{mlr3::Resampling})\cr
+The initial resampling strategy of the data, from which each train set
+will be passed on to the learners and each test set will be used for
+prediction.
+Can only be \link[mlr3:mlr_resamplings_subsampling]{mlr3::ResamplingSubsampling} or \link[mlr3:mlr_resamplings_bootstrap]{mlr3::ResamplingBootstrap}.}
+
+\item{measure}{(\link[mlr3:Measure]{mlr3::Measure})\cr
+The measure used to score each learner on the test sets generated by
+\code{init_resampling}.
+If \code{NULL}, default measure is used.}
+
+\item{store_benchmark_result}{(\code{logical(1)})\cr
+Whether to store the benchmark result in \link{EnsembleFSResult} or not.}
+}
+\value{
+an \link{EnsembleFSResult} object.
+}
+\description{
+Ensemble feature selection using multiple learners.
+The ensemble feature selection method is designed to identify the most predictive features from a given dataset by leveraging multiple machine learning models and resampling techniques.
+Returns an \link{EnsembleFSResult}.
+}
+\details{
+The method begins by applying an initial resampling technique specified by the user, to create \strong{multiple subsamples} from the original dataset (train/test splits).
+This resampling process helps in generating diverse subsets of data for robust feature selection.
+
+For each subsample (train set) generated in the previous step, the method trains learners
+that support \strong{embedded feature selection}.
+The features selected during training are stored, and each trained model is
+then scored on its ability to predict on the resampled test set, for each
+combination of subsample and learner.
+
+Results are stored in an \link{EnsembleFSResult}.
+}
+\examples{
+\donttest{
+  eefsr = embedded_ensemble_fselect(
+    task = tsk("sonar"),
+    learners = lrns(c("classif.rpart", "classif.featureless")),
+    init_resampling = rsmp("subsampling", repeats = 5),
+    measure = msr("classif.ce")
+  )
+  eefsr
+}
+}
diff --git a/man/ensemble_fs_result.Rd b/man/ensemble_fs_result.Rd
index 56a0fe37..dd0239a3 100644
--- a/man/ensemble_fs_result.Rd
+++ b/man/ensemble_fs_result.Rd
@@ -7,7 +7,8 @@
 \description{
 The \code{EnsembleFSResult} stores the results of ensemble feature selection.
 It includes methods for evaluating the stability of the feature selection process and for ranking the selected features among others.
-The function \code{\link[=ensemble_fselect]{ensemble_fselect()}} returns an object of this class.
+
+Both functions \code{\link[=ensemble_fselect]{ensemble_fselect()}} and \code{\link[=embedded_ensemble_fselect]{embedded_ensemble_fselect()}} return an object of this class.
 }
 \section{S3 Methods}{
 
@@ -31,7 +32,8 @@ Whether to add the learner, task and resampling information from the benchmark r
   learners = lrns(c("classif.rpart", "classif.featureless")),
   init_resampling = rsmp("subsampling", repeats = 2),
   inner_resampling = rsmp("cv", folds = 3),
-  measure = msr("classif.ce"),
+  inner_measure = msr("classif.ce"),
+  measure = msr("classif.acc"),
   terminator = trm("none")
 )
 
@@ -47,7 +49,16 @@ Whether to add the learner, task and resampling information from the benchmark r
 # returns a ranking of all features
 head(efsr$feature_ranking())
 
-# returns the empirical pareto front (nfeatures vs error)
+# returns the empirical pareto front, i.e. n_features vs measure (error)
+efsr$pareto_front()
+
+# returns the knee points (optimal trade-off between n_features and performance)
+efsr$knee_points()
+
+# change to use the inner optimization measure
+efsr$set_active_measure(which = "inner")
+
+# Pareto front is calculated on the inner measure
 efsr$pareto_front()
 }
 }
@@ -56,6 +67,11 @@
 Das, I (1999).
 \dQuote{On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection.}
 \emph{Structural Optimization}, \bold{18}(1-2), 107--115.
 ISSN 09344373.
+
+Meinshausen, Nicolai, Buhlmann, Peter (2010).
+\dQuote{Stability Selection.}
+\emph{Journal of the Royal Statistical Society Series B: Statistical Methodology}, \bold{72}(4), 417--473.
+ISSN 1369-7412, \doi{10.1111/J.1467-9868.2010.00740.X}, 0809.2932.
 }
 \section{Public fields}{
 \if{html}{\out{