-
-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
FixFactors and CollapseFactors
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
#' @title PipeOpCollapseFactors | ||
#' | ||
#' @usage NULL | ||
#' @name mlr_pipeops_collapsefactors | ||
#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. | ||
#' | ||
#' @description | ||
#' Collapses factors of type `factor`, `ordered`: Collapses the rarest factors in the | ||
#' training samples, until `target_level_count` levels remain. Levels that have prevalence above `no_collapse_above_prevalence` | ||
#' are retained, however. For `factor` variables, these are collapsed to the next larger level, for `ordered` variables, | ||
#' rare variables are collapsed to the neighbouring class, whichever has fewer samples. | ||
#' | ||
#' Levels not seen during training are not touched during prediction; Therefore it is useful to combine this with the | ||
#' [`PipeOpFixFactors`]. | ||
#' | ||
#' @section Construction: | ||
#' ``` | ||
#' PipeOpCollapseFactors$new(id = "collapsefactors", param_vals = list()) | ||
#' ``` | ||
#' * `id` :: `character(1)`\cr | ||
#' Identifier of resulting object, default `"collapsefactors"`. | ||
#' * `param_vals` :: named `list`\cr | ||
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. | ||
#' | ||
#' @section Input and Output Channels: | ||
#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. | ||
#' | ||
#' The output is the input [`Task`][mlr3::Task] with rare affected `factor` and `ordered` feature levels collapsed. | ||
#' | ||
#' @section State: | ||
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: | ||
#' * `collapse_map` :: named `list` of named `list` of `character`\cr | ||
#' List of factor level maps. For each factor, `collapse_map` contains a named `list` that indicates what levels | ||
#' of the input task get mapped to what levels of the output task. If `collapse_map` has an entry `feat_1` with | ||
#' an entry `a = c("x", "y")`, it means that levels `"x"` and `"y"` get collapsed to level `"a"` in feature `"feat_1"`. | ||
#' | ||
#' @section Parameters: | ||
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: | ||
#' * `no_collapse_above_prevalence` :: `numeric(1)` \cr | ||
#' Fraction of samples below which factor levels get collapsed. Default is 1, which causes all levels | ||
#' to be collapsed until `target_level_count` remain. | ||
#' * `target_level_count` :: `integer(1)` \cr | ||
#' Number of levels to retain. Default is 2. | ||
#' | ||
#' @section Internals: | ||
#' Makes use of the fact that `levels(fact_var) = list(target1 = c("source1", "source2"), target2 = "source2")` causes | ||
#' renaming of level `"source1"` and `"source2"` both to `"target1"`, and also `"source2"` to `"target2"`. | ||
#' | ||
#' @section Methods: | ||
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. | ||
#' | ||
#' @family PipeOps | ||
#' @include PipeOpTaskPreproc.R | ||
#' @export | ||
#' @examples | ||
#' library("mlr3") | ||
PipeOpCollapseFactors = R6Class("PipeOpCollapseFactors", | ||
inherit = PipeOpTaskPreprocSimple, | ||
public = list( | ||
initialize = function(id = "collapsefactors", param_vals = list()) { | ||
ps = ParamSet$new(params = list( | ||
ParamDbl$new("no_collapse_above_prevalence", 0, 1, tags = c("train", "predict")), | ||
ParamInt$new("target_level_count", 2, tags = c("train", "predict")) | ||
)) | ||
ps$values = list(no_collapse_above_prevalence = 1, target_level_count = 2) | ||
super$initialize(id, param_set = ps, param_vals = param_vals) | ||
}, | ||
|
||
select_cols = function(task) { | ||
task$feature_types[get("type") %in% c("factor", "ordered"), get("id")] | ||
}, | ||
|
||
get_state = function(task) { | ||
# get the levels of the training task | ||
dt = task$data(cols = self$select_cols(task)) | ||
|
||
keep_fraction = self$param_set$values$no_collapse_above_prevalence | ||
target_count = self$param_set$values$target_level_count | ||
|
||
collapse_map = sapply(dt, function(d) { | ||
if (all(is.na(d))) { | ||
return(NULL) | ||
} | ||
if (length(levels(d)) <= target_count) { | ||
return(NULL) | ||
} | ||
dtable = table(d) | ||
fractions = sort(dtable, decreasing = TRUE) / sum(!is.na(d)) | ||
keep_fraction = names(fractions)[fractions >= keep_fraction] | ||
keep_count = names(fractions)[seq_len(target_count)] # at this point we know there are more levels than target_count | ||
keep = union(keep_fraction, keep_count) | ||
dont_keep = setdiff(levels(d), keep) | ||
if (is.ordered(d)) { | ||
cmap = setNames(as.list(levels(d)), levels(d)) | ||
for (eliminating in dont_keep) { | ||
position = match(eliminating, names(cmap)) | ||
if (position == 1) { | ||
cmap[[2]] = c(cmap[[2]], eliminating) | ||
} else if (position == length(cmap) || dtable[position - 1] < dtable[position + 1]) { | ||
cmap[[position - 1]] = c(cmap[[position - 1]], eliminating) | ||
} else { | ||
cmap[[position + 1]] = c(cmap[[position + 1]], eliminating) | ||
} | ||
dtable = dtable[-position] | ||
cmap[[position]] = NULL | ||
} | ||
} else { | ||
cmap = setNames(as.list(keep), keep) | ||
lowest_kept = keep[length(keep)] | ||
cmap[[lowest_kept]] = c(lowest_kept, dont_keep) | ||
} | ||
cmap | ||
}, simplify = FALSE) | ||
|
||
list(collapse_map = discard(collapse_map, is.null)) | ||
}, | ||
|
||
transform = function(task) { | ||
cmaplist = self$state$collapse_map | ||
dt = task$data(cols = names(cmaplist)) | ||
|
||
for (n in names(cmaplist)) { | ||
# don't touch unseen factor levels | ||
new_lvls = setdiff(levels(dt[[n]]), unlist(cmaplist[[n]], use.names = FALSE)) | ||
all_lvls = c(cmaplist[[n]], setNames(as.list(new_lvls), new_lvls)) | ||
levels(dt[[n]]) = c( | ||
all_lvls[intersect(levels(dt[[n]]), names(all_lvls))], # keep all levels in their order, if they were present before | ||
all_lvls[setdiff(names(all_lvls), levels(dt[[n]]))] # levels that are missing now get sorted to the back. | ||
) | ||
} | ||
task$select(setdiff(task$feature_names, names(cmaplist)))$cbind(dt) | ||
} | ||
) | ||
) | ||
|
||
mlr_pipeops$add("collapsefactors", PipeOpCollapseFactors) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
#' @title PipeOpFixFactors | ||
#' | ||
#' @usage NULL | ||
#' @name mlr_pipeops_fixfactors | ||
#' @format [`R6Class`] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. | ||
#' | ||
#' @description | ||
#' Fixes factors of type `factor`, `ordered`: Makes sure the factor levels | ||
#' during prediction are the same as during training; possibly dropping empty | ||
#' training factor levels before. | ||
#' | ||
#' Note this may introduce *missing values* during prediction if unseen factor levels are found. | ||
#' | ||
#' @section Construction: | ||
#' ``` | ||
#' PipeOpFixFactors$new(id = "fixfactors", param_vals = list()) | ||
#' ``` | ||
#' * `id` :: `character(1)`\cr | ||
#' Identifier of resulting object, default `"fixfactors"`. | ||
#' * `param_vals` :: named `list`\cr | ||
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. | ||
#' | ||
#' @section Input and Output Channels: | ||
#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. | ||
#' | ||
#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` feature levels fixed. | ||
#' | ||
#' @section State: | ||
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: | ||
#' * `levels` :: named `list` of `character`\cr | ||
#' List of factor levels of each affected `factor` or `ordered` feature that will be fixed. | ||
#' | ||
#' @section Parameters: | ||
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: | ||
#' * `droplevels` :: `logical(1)` \cr | ||
#' Whether to drop empty factor levels of the training task. Default `TRUE` | ||
#' | ||
#' @section Internals: | ||
#' Changes factor levels of columns and attaches them with a new `data.table` backend and the virtual `cbind()` backend. | ||
#' | ||
#' @section Methods: | ||
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. | ||
#' | ||
#' @family PipeOps | ||
#' @include PipeOpTaskPreproc.R | ||
#' @export | ||
#' @examples | ||
#' library("mlr3") | ||
PipeOpFixFactors = R6Class("PipeOpFixFactors", | ||
inherit = PipeOpTaskPreprocSimple, | ||
public = list( | ||
initialize = function(id = "fixfactors", param_vals = list()) { | ||
ps = ParamSet$new(params = list( | ||
ParamLgl$new("droplevels", tags = c("train", "predict")) | ||
)) | ||
ps$values = list(droplevels = TRUE) | ||
super$initialize(id, param_set = ps, param_vals = param_vals) | ||
}, | ||
|
||
select_cols = function(task) { | ||
task$feature_types[get("type") %in% c("factor", "ordered"), get("id")] | ||
}, | ||
|
||
get_state = function(task) { | ||
# get the levels of the training task | ||
dt = task$data(cols = self$select_cols(task)) | ||
if (self$param_set$values$droplevels) { | ||
dt = droplevels(dt) | ||
} | ||
list(levels = lapply(dt, function(x) levels(x))) # explicitly access the "levels" function | ||
}, | ||
|
||
transform = function(task) { | ||
dt = task$data(cols = names(self$state$levels)) | ||
|
||
# check which levels are actually different during training and prediction | ||
needs_adjustment = as.logical(imap(self$state$levels, function(lvx, id) { | ||
!identical(lvx, levels(dt[[id]])) | ||
})) | ||
|
||
if (!any(needs_adjustment)) { | ||
return(task) | ||
} | ||
|
||
changed_cols = as.data.table(imap(self$state$levels[needs_adjustment], function(lvx, id) { | ||
x = dt[[id]] | ||
if (is.ordered(x)) { | ||
ordered(x, levels = lvx) | ||
} else { | ||
factor(x, levels = lvx) | ||
} | ||
})) | ||
task$select(setdiff(task$feature_names, colnames(changed_cols)))$cbind(changed_cols) | ||
} | ||
) | ||
) | ||
|
||
mlr_pipeops$add("fixfactors", PipeOpFixFactors) | ||
|
||
# FIXME: from mlr3; should probably go to mlr3misc | ||
ujoin = function (x, y, key) { | ||
cn = setdiff(intersect(names(x), names(y)), key) | ||
expr = parse(text = paste0("`:=`(", paste0(sprintf("%1$s=i.%1$s", | ||
cn), collapse = ","), ")")) | ||
x[y, eval(expr), on = key] | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.