-
Notifications
You must be signed in to change notification settings - Fork 11
/
balance.R
109 lines (93 loc) · 3.1 KB
/
balance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#' Measures of class balance
#'
#' Classification task. These measures capture how unevenly the examples are
#' distributed among the classes of a dataset. When the differences are
#' severe, ML classification techniques may have trouble generalizing because
#' of the imbalance ratio.
#'
#' @family complexity-measures
#' @param x A data.frame containing only the input attributes.
#' @param y A factor response vector with one label for each row/component of
#'  x.
#' @param measures A list of measure names or \code{"all"} to include all of
#'  them.
#' @param formula A formula to define the class column.
#' @param data A data.frame dataset containing the input attributes and the
#'  class.
#' @param ... Not used.
#' @details
#'  The following measures are allowed for this method:
#'  \describe{
#'    \item{"B1"}{The entropy of class proportions (B1) captures the imbalance
#'      in a dataset based on the proportions of examples per class.}
#'    \item{"B2"}{The imbalance ratio (B2) is an index computed for measuring
#'      class balance. This is a version of the measure that is also suited
#'      for multiclass classification problems.}
#'  }
#' @return A list named by the requested class balance measures.
#'
#' @references
#'  Ana C Lorena, Ivan G Costa, Newton Spolaor and Marcilio C P Souto. (2012).
#'    Analysis of complexity indices for classification problems: Cancer gene
#'    expression data. Neurocomputing 75, 1, 33--42.
#'
#'  Ajay K Tanwani and Muddassar Farooq. (2010). Classification potential vs.
#'    classification accuracy: a comprehensive study of evolutionary
#'    algorithms with biomedical datasets. Learning Classifier Systems 6471,
#'    127--144.
#'
#' @examples
#' ## Extract all balance measures for classification task
#' data(iris)
#' balance(Species ~ ., iris)
#' @export
balance <- function(...) UseMethod("balance")
#' @rdname balance
#' @export
balance.default <- function(x, y, measures="all", ...) {
  # The attribute table must be a data.frame; the message names the argument
  # that is actually at fault (`x`, not `data`).
  if (!is.data.frame(x)) {
    stop("x argument must be a data.frame")
  }

  # balance.formula hands the response over as a one-column data.frame;
  # reduce it to that single column.
  if (is.data.frame(y)) {
    y <- y[, 1]
  }

  # Levels without any example are dropped: they would otherwise yield
  # 0 * log(0) = NaN in B1 and be counted as classes in B2.
  y <- droplevels(as.factor(y))

  if (nrow(x) != length(y)) {
    stop("x and y must have same number of rows")
  }

  # "all" expands to every available measure; anything else is validated
  # (several.ok = TRUE) against the known measure names.
  if (measures[1] == "all") {
    measures <- ls.balance()
  }
  measures <- match.arg(measures, ls.balance(), TRUE)

  # Each measure "<name>" is computed by the function "c.<name>", called with
  # the class vector; the result is a list named by measure.
  result <- lapply(measures, function(f) {
    do.call(paste0("c.", f), list(y = y))
  })
  names(result) <- measures
  result
}
#' @rdname balance
#' @export
balance.formula <- function(formula, data, measures="all", ...) {
  # Guard clauses: reject anything that is not a formula / data.frame pair.
  if (!inherits(formula, "formula")) stop("method is only for formula datas")
  if (!is.data.frame(data)) stop("data argument must be a data.frame")

  # Build the model frame and strip its "terms" attribute so that only the
  # plain columns remain.
  frame <- stats::model.frame(formula, data)
  attr(frame, "terms") <- NULL

  # The first column of the model frame is handed over as the class column,
  # every remaining column as an input attribute.
  attributes <- frame[, -1, drop = FALSE]
  response <- frame[, 1, drop = FALSE]
  balance.default(attributes, response, measures, ...)
}
# Names of the class balance measures that can be requested.
ls.balance <- function() {
  available <- c("B1", "B2")
  available
}
# Entropy of class proportions (B1) for a factor of class labels.
#
# Computes 1 - H / log(k), where H = -sum(p * log(p)) is the entropy of the
# class proportions p and k = nlevels(y). Evenly sized classes give 0; the
# value grows towards 1 as the proportions become more uneven.
# NOTE(review): with a single level log(k) is 0 and the result is NaN.
c.B1 <- function(y) {
  norm <- -1 / log(nlevels(y))
  props <- table(y) / length(y)
  # Classes without examples contribute nothing to the entropy (0 * log(0) is
  # taken as 0); left in place they would turn the whole sum into NaN.
  props <- props[props > 0]
  1 - norm * sum(props * log(props))
}
# Multiclass imbalance ratio (B2) for a factor of class labels.
#
# With per-class counts n_i, nc classes and n = length(y) it computes
#   IR = ((nc - 1) / nc) * sum(n_i / (n - n_i))
# and returns 1 - 1 / IR, which is 0 when all classes have the same size.
c.B2 <- function(y) {
  # summary() on a factor merges every level beyond `maxsum` (default 100)
  # into one "(Other)" entry, which corrupts both the class count and the
  # per-class counts for datasets with more than 100 classes. Raising the cap
  # above the number of levels keeps one count per class; the + 1 covers the
  # slot summary() reserves when y contains missing values.
  counts <- summary(y, maxsum = nlevels(y) + 1L)
  nc <- length(counts)
  ratio <- ((nc - 1) / nc) * sum(counts / (length(y) - counts))
  1 - (1 / ratio)
}