mlr-org · bblodfon · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -34,6 +34,7 @@ Imports:
     mlr3misc (>= 0.15.1),
     paradox (>= 1.0.0),
     R6,
+    Rcpp,
     stabm
 Suggests:
     e1071,
@@ -73,9 +74,11 @@ Collate:
     'FSelectorBatchShadowVariableSearch.R'
     'ObjectiveFSelect.R'
     'ObjectiveFSelectBatch.R'
+    'RcppExports.R'
     'assertions.R'
     'auto_fselector.R'
     'bibentries.R'
+    'embedded_ensemble_fselect.R'
     'ensemble_fselect.R'
     'extract_inner_fselect_archives.R'
     'extract_inner_fselect_results.R'
@@ -85,4 +88,7 @@ Collate:
     'mlr_callbacks.R'
     'reexports.R'
     'sugar.R'
+    'voting_methods.R'
     'zzz.R'
+LinkingTo: 
+    Rcpp
diff --git a/NAMESPACE b/NAMESPACE
@@ -36,6 +36,7 @@ export(auto_fselector)
 export(callback_batch_fselect)
 export(clbk)
 export(clbks)
+export(embedded_ensemble_fselect)
 export(ensemble_fselect)
 export(extract_inner_fselect_archives)
 export(extract_inner_fselect_results)
@@ -56,6 +57,7 @@ import(mlr3)
 import(mlr3misc)
 import(paradox)
 importFrom(R6,R6Class)
+importFrom(Rcpp,sourceCpp)
 importFrom(bbotk,mlr_terminators)
 importFrom(bbotk,trm)
 importFrom(bbotk,trms)
@@ -67,3 +69,4 @@ importFrom(utils,bibentry)
 importFrom(utils,combn)
 importFrom(utils,head)
 importFrom(utils,packageVersion)
+useDynLib(mlr3fselect, .registration = TRUE)
diff --git a/R/EnsembleFSResult.R b/R/EnsembleFSResult.R
@@ -16,7 +16,7 @@
 #'       Whether to add the learner, task and resampling information from the benchmark result.
 #'
 #' @references
-#' `r format_bib("das1999")`
+#' `r format_bib("das1999", "meinshausen2010")`
 #'
 #' @export
 #' @examples
@@ -82,6 +82,10 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
       private$.result = result
       private$.features = assert_character(features, any.missing = FALSE, null.ok = FALSE)
       private$.minimize = assert_logical(minimize, null.ok = FALSE)
+
+      # check that all feature sets are subsets of the task features
+      assert_subset(unlist(result$features), private$.features)
+
       self$benchmark_result = if (!is.null(benchmark_result)) assert_benchmark_result(benchmark_result)
 
       self$man = "mlr3fselect::ensemble_fs_result"
@@ -99,7 +103,8 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
     #'
     #' @param ... (ignored).
     print = function(...) {
-      catf(format(self))
+      catf("%s with %s learners and %s initial resamplings",
+           format(self), self$n_learners, self$n_resamples)
       print(private$.result[, c("resampling_iteration", "learner_id", "n_features"), with = FALSE])
     },
 
@@ -113,37 +118,85 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
     #' Calculates the feature ranking.
     #'
     #' @details
-    #' The feature ranking process is built on the following framework: models act as voters, features act as candidates, and voters select certain candidates (features).
+    #' The feature ranking process is built on the following framework: models act as *voters*, features act as *candidates*, and voters select certain candidates (features).
     #' The primary objective is to compile these selections into a consensus ranked list of features, effectively forming a committee.
-    #' Currently, only `"approval_voting"` method is supported, which selects the candidates/features that have the highest approval score or selection frequency, i.e. appear the most often.
+    #'
+    #' For every feature a score is calculated, which depends on the `"method"` argument.
+    #' The higher the score, the higher the rank of the feature.
+    #' Most methods have a `"*_weighted"` version that outputs a weighted score.
+    #' The weights used are equal to the performance scores of each voter/model (or the inverse scores if the measure is minimized).
+    #' The unweighted methods use the same weight for all voters (equal to 1).
+    #'
+    #' Note that some methods output a feature ranking instead of a score per feature.
+    #' Therefore we also calculate **Borda's score**:
+    #' \eqn{s_{borda} = (p-i)/(p-1)}, where \eqn{p} is the total number of features, and \eqn{i} is the feature ranking.
+    #' So the best feature gets a borda score of \eqn{1} and the worst-ranked feature a borda score of \eqn{0}.
+    #' This score is method-agnostic, i.e. it can be used to compare the feature rankings across different methods.
+    #'
+    #' We randomly shuffle the input candidates/features so that we enforce the same tie-breaking mechanism for all available methods.
+    #' Users should set the same random seed (e.g. via `set.seed()`) before calling this method, for consistent comparison between the different feature ranking methods and for reproducibility.
+    #'
+    #' The following methods are currently supported:
+    #'
+    #' - `"av"|"av_weighted"` (approval voting) selects the candidates that have the highest approval score, i.e. the features that appear the most often.
+    #' This is the default feature ranking method.
+    #' - `"sav"|"sav_weighted"` (satisfaction approval voting) selects the candidates that have a higher satisfaction score, in proportion to the size of the voters' approval sets.
+    #' Voters who approve more candidates contribute a lesser score to the individual approved candidates.
+    #' - `"seq_pav"|"seq_pav_weighted"` (sequential proportional approval voting) sequentially builds a committee by iteratively selecting the candidate that maximizes the PAV score when added, ensuring proportional representation.
+    #' The **PAV score** (Proportional Approval Voting score) is a metric that calculates the weighted sum of harmonic numbers corresponding to the number of elected candidates supported by each voter, reflecting the overall satisfaction of voters in a committee selection process.
+    #' - `"seq_phragmen"|"seq_phragmen_weighted"` (sequential Phragmen's rule) distributes "loads" equally among voters for each candidate added to the committee.
+    #' The rule iteratively selects the candidate that results in the smallest increase in voter load.
+    #' This approach is suitable for scenarios where a balanced representation is desired, as it seeks to evenly distribute the "burden" of representation among all voters.
     #'
     #' @param method (`character(1)`)\cr
     #' The method to calculate the feature ranking.
+    #' @param committee_size (`integer(1)`)\cr
+    #' Number of top selected features in the output ranking.
+    #' This parameter can be used to speed up methods that build a committee sequentially (`"seq_pav"`, `"seq_phragmen"`), by requesting only the top N selected candidates/features and not the complete feature ranking.
+    #'
+    #' @return A [data.table::data.table] listing all the features, ordered by decreasing scores (depends on the `"method"`).
+    #' An extra column `"norm_score"` is produced for methods for which the original scores (i.e. approval counts in the case of approval voting) can be normalized and interpreted as **selection probabilities**, see Meinshausen et al. (2010).
+    #' The `"borda_score"` column is always included to incorporate feature ranking methods that don't output per-feature scores but only rankings.
     #'
-    #' @return A [data.table::data.table] listing all the features, ordered by decreasing inclusion probability scores (depending on the `method`)
-    feature_ranking = function(method = "approval_voting") {
-      assert_choice(method, choices = "approval_voting")
+    feature_ranking = function(method = "av", committee_size = NULL) {
+      assert_choice(method, choices = c("av", "av_weighted", "sav", "sav_weighted",
+                                        "seq_pav", "seq_pav_weighted", "seq_phragmen",
+                                        "seq_phragmen_weighted"))
+      assert_int(committee_size, lower = 1, null.ok = TRUE)
 
       # cached results
       if (!is.null(private$.feature_ranking[[method]])) {
         return(private$.feature_ranking[[method]])
       }
 
-      count_tbl = sort(table(unlist(private$.result$features)), decreasing = TRUE)
-      features_selected = names(count_tbl)
-      features_not_selected = setdiff(private$.features, features_selected)
+      # candidates => all features, voters => list of selected (best) features sets
+      candidates = private$.features
+      voters = private$.result$features
 
-      res_fs = data.table(
-        feature = features_selected,
-        inclusion_probability = as.vector(count_tbl) / nrow(private$.result)
-      )
-
-      res_fns = data.table(
-        feature = features_not_selected,
-        inclusion_probability = 0
-      )
+      # calculate weights
+      use_weights = grepl(pattern = "weighted", x = method)
+      if (use_weights) {
+        # voter weights are the (inverse) scores
+        scores = private$.result[, get(private$.measure_id)]
+        weights = if (private$.minimize) 1 / scores else scores
+      } else {
+        # all voters are equal
+        weights = rep(1, length(voters))
+      }
 
-      res = rbindlist(list(res_fs, res_fns))
+      # shuffle candidates (force same tie-breaking between methods)
+      candidates = sample(candidates)
+
+      # calculate scores
+      if (startsWith(method, "av")) {
+        res = approval_voting(voters, candidates, weights)
+      } else if (startsWith(method, "sav")) {
+        res = satisfaction_approval_voting(voters, candidates, weights)
+      } else if (startsWith(method, "seq_pav")) {
+        res = seq_proportional_approval_voting(voters, candidates, weights, committee_size)
+      } else if (startsWith(method, "seq_phragmen")) {
+        res = seq_phragmen_rule(voters, candidates, weights, committee_size)
+      }
 
       private$.feature_ranking[[method]] = res
       private$.feature_ranking[[method]]
@@ -261,6 +314,8 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
         # Transform the data (x => 1/x)
         n_features_inv = NULL
         pf[, n_features_inv := 1 / n_features]
+        # remove edge cases where no features were selected
+        pf = pf[n_features > 0]
 
         # Fit the linear model
         form = mlr3misc::formulate(lhs = measure_id, rhs = "n_features_inv")
@@ -351,6 +406,13 @@ EnsembleFSResult = R6Class("EnsembleFSResult",
     measure = function(rhs) {
       assert_ro_binding(rhs)
       private$.measure_id
+    },
+
+    #' @field n_resamples (`integer(1)`)\cr
+    #' Returns the number of times the task was initially resampled in the ensemble feature selection.
+    n_resamples = function(rhs) {
+      assert_ro_binding(rhs)
+      uniqueN(self$result$resampling_iteration)
     }
   ),
 

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -0,0 +1,19 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+AV_rcpp <- function(voters, candidates, weights) {
+    .Call(`_mlr3fselect_AV_rcpp`, voters, candidates, weights)
+}
+
+seq_PAV_rcpp <- function(voters, candidates, weights, committee_size) {
+    .Call(`_mlr3fselect_seq_PAV_rcpp`, voters, candidates, weights, committee_size)
+}
+
+seq_Phragmen_rcpp <- function(voters, candidates, weights, committee_size) {
+    .Call(`_mlr3fselect_seq_Phragmen_rcpp`, voters, candidates, weights, committee_size)
+}
+
+SAV_rcpp <- function(voters, candidates, weights) {
+    .Call(`_mlr3fselect_SAV_rcpp`, voters, candidates, weights)
+}
+
diff --git a/R/bibentries.R b/R/bibentries.R
@@ -9,7 +9,6 @@ bibentries = c(
     title       = "ecr 2.0",
     booktitle   = "Proceedings of the Genetic and Evolutionary Computation Conference Companion"
   ),
-
   bergstra_2012 = bibentry("article",
     title       = "Random Search for Hyper-Parameter Optimization",
     author      = "James Bergstra and Yoshua Bengio",
@@ -20,8 +19,7 @@ bibentries = c(
     pages       = "281--305",
     url         = "https://jmlr.csail.mit.edu/papers/v13/bergstra12a.html"
   ),
-
-  thomas2017  = bibentry("article",
+  thomas2017 = bibentry("article",
     doi       = "10.1155/2017/1421409",
     year      = "2017",
     publisher = "Hindawi Limited",
@@ -31,8 +29,7 @@ bibentries = c(
     title     = "Probing for Sparse and Fast Variable Selection with Model-Based Boosting",
     journal   = "Computational and Mathematical Methods in Medicine"
   ),
-
-  wu2007      = bibentry("article",
+  wu2007 = bibentry("article",
     doi       = "10.1198/016214506000000843",
     year      = "2007",
     month     = "3",
@@ -44,8 +41,7 @@ bibentries = c(
     title     = "Controlling Variable Selection by the Addition of Pseudovariables",
     journal   = "Journal of the American Statistical Association"
   ),
-
-  guyon2002     = bibentry("article",
+  guyon2002 = bibentry("article",
     title       = "Gene Selection for Cancer Classification using Support Vector Machines",
     volume      = "46",
     issn        = "1573-0565",
@@ -56,7 +52,6 @@ bibentries = c(
     author      = "Isabelle Guyon and Jason Weston and Stephen Barnhill and Vladimir Vapnik",
     year        = "2002"
   ),
-
   kuhn2013 = bibentry("Inbook",
     author    = "Kuhn, Max and Johnson, Kjell",
     chapter   = "Over-Fitting and Model Tuning",
@@ -67,7 +62,6 @@ bibentries = c(
     pages     = "61--92",
     isbn      = "978-1-4614-6849-3"
   ),
-
   saeys2008 = bibentry("article",
     author      = "Saeys, Yvan and Abeel, Thomas and Van De Peer, Yves",
     doi         = "10.1007/978-3-540-87481-2_21",
@@ -79,7 +73,6 @@ bibentries = c(
     volume      = "5212 LNAI",
     year        = "2008"
   ),
-
   abeel2010 = bibentry("article",
     author    = "Abeel, Thomas and Helleputte, Thibault and Van de Peer, Yves and Dupont, Pierre and Saeys, Yvan",
     doi       = "10.1093/BIOINFORMATICS/BTP630",
@@ -92,7 +85,6 @@ bibentries = c(
     volume    = "26",
     year      = "2010"
   ),
-
   pes2020 = bibentry("article",
     author    = "Pes, Barbara",
     doi       = "10.1007/s00521-019-04082-3",
@@ -106,7 +98,6 @@ bibentries = c(
     volume    = "32",
     year      = "2020"
   ),
-
   das1999 = bibentry("article",
     author    = "Das, I",
     issn      = "09344373",
@@ -118,5 +109,31 @@ bibentries = c(
     title     = "On characterizing the 'knee' of the Pareto curve based on normal-boundary intersection",
     volume    = "18",
     year      = "1999"
+  ),
+  meinshausen2010 = bibentry("article",
+    author    = "Meinshausen, Nicolai and Buhlmann, Peter",
+    doi       = "10.1111/J.1467-9868.2010.00740.X",
+    eprint    = "0809.2932",
+    issn      = "1369-7412",
+    journal   = "Journal of the Royal Statistical Society Series B: Statistical Methodology",
+    month     = "sep",
+    number    = "4",
+    pages     = "417--473",
+    publisher = "Oxford Academic",
+    title     = "Stability Selection",
+    volume    = "72",
+    year      = "2010"
+  ),
+  hedou2024 = bibentry("article",
+    author = "Hedou, Julien and Maric, Ivana and Bellan, Gregoire and Einhaus, Jakob and Gaudilliere, Dyani K. and Ladant, Francois Xavier and Verdonk, Franck and Stelzer, Ina A. and Feyaerts, Dorien and Tsai, Amy S. and Ganio, Edward A. and Sabayev, Maximilian and Gillard, Joshua and Amar, Jonas and Cambriel, Amelie and Oskotsky, Tomiko T. and Roldan, Alennie and Golob, Jonathan L. and Sirota, Marina and Bonham, Thomas A. and Sato, Masaki and Diop, Maigane and Durand, Xavier and Angst, Martin S. and Stevenson, David K. and Aghaeepour, Nima and Montanari, Andrea and Gaudilliere, Brice", #nolint
+    doi = "10.1038/s41587-023-02033-x",
+    issn = "1546-1696",
+    journal = "Nature Biotechnology 2024",
+    month = "jan",
+    pages = "1--13",
+    publisher = "Nature Publishing Group",
+    title = "Discovery of sparse, reliable omic biomarkers with Stabl",
+    url = "https://www.nature.com/articles/s41587-023-02033-x",
+    year = "2024"
   )
 )