-
Notifications
You must be signed in to change notification settings - Fork 0
/
f.Diagnostics.R
138 lines (133 loc) · 5.46 KB
/
f.Diagnostics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#### Examine the deviances to assess goodness of fit ----
CompareDeviances <- function(models, modnames, debug=FALSE) {
  # models = a single model object, or a list of one or more model objects
  # modnames = vector of the names to apply to the model objects (row names)
  # debug = if TRUE, print diagnostic output while looping
  # Returns a data frame with one row per model: null and residual deviance,
  # their absolute and relative difference, the two degrees of freedom, and
  # the chi-squared p-value for the deviance reduction over the null model.
  # Normalize the input so a single model is handled like a one-element list
  # (inherits() is safer than class(x)[1] == "list")
  if (!inherits(models, "list")) {models <- list(models)}
  n <- length(models)
  if (debug) {print(paste("n =", n))}
  # Preallocate one slot per model instead of growing with rbind()
  rows <- vector("list", n)
  for (i in seq_len(n)) {
    m <- summary(models[[i]])
    # Deviance explained by the fitted model relative to the null model
    difference <- m$null.deviance - m$deviance
    rel.difference <- difference / m$null.deviance
    # Chi-squared test on the deviance drop; df = parameters added over null
    p <- pchisq(difference, df=m$df.null - m$df.residual, lower.tail=FALSE)
    rows[[i]] <- data.frame(null.dev = m$null.deviance,
                            dev = m$deviance,
                            diff = difference,
                            rel.diff = rel.difference,
                            df.p = m$df.null,
                            df.q = m$df.residual,
                            p = p)
    if (debug) {print(rows[[i]])}
  }
  out <- do.call(rbind, rows)
  rownames(out) <- modnames
  return(out)
}
#### Generate confusion matrix from a list of models ----
ConfusionMatrix <- function(models, testdata=NULL, modnames=NULL, threshold=.5, counts=TRUE, digits=2, debug=FALSE) {
  # models = a single model or a list of one or more model objects
  # testdata = a data frame, or list of data frames, to predict on;
  #            defaults to each model's own model frame
  # modnames = vector of the names to apply to the output list elements
  # threshold = classification threshold; predicted prob >= threshold is TRUE
  # counts = if set to FALSE, returns a proportion table instead of counts
  # digits = significant digits (NOT rounded) to include in proportion tables
  # Returns a list of confusion tables with margins added, one per model.
  out <- list()
  # Normalize a single model into a one-element list
  if (!inherits(models, "list")) {models <- list(models)}
  n <- length(models)
  if (debug) {print(paste("n =", n))}
  # If not given test data, use the model data frames themselves
  if (is.null(testdata)) {
    testdata <- lapply(models, model.frame)
  }
  # Normalize a single data frame into a one-element list.
  # inherits() is used deliberately: class(x) != "list" errors in R >= 4.2
  # when the class vector has length > 1, and is.list() would wrongly treat
  # a data.frame as already being a list of test sets.
  if (!inherits(testdata, "list")) {testdata <- list(testdata)}
  # Loop through models
  for (i in seq_len(n)) {
    m <- models[[i]]
    if (debug) {print(paste("length test data", length(testdata), "i", i))}
    # If there are fewer test sets than models, keep reusing the last one
    # (td rather than t, to avoid shadowing base::t)
    td <- testdata[[min(i, length(testdata))]]
    if (debug) {
      print(paste("Test data dimensions", nrow(td), "*", ncol(td), sep=" "))
      print(paste("Test data type", class(td)))
    }
    # Response column name is the LHS of the model formula
    truth <- td[, as.character(m$formula)[2]]
    pred <- predict(m, newdata=td, type="response") >= threshold
    # Drop NA cases PAIRWISE: calling na.omit() on truth and pred separately
    # (as the original did) can drop different rows and misalign the vectors
    keep <- !is.na(truth) & !is.na(pred)
    tabl <- table(truth=truth[keep], pred=pred[keep])
    if (counts) {out[[i]] <- addmargins(tabl)}
    if (!counts) {out[[i]] <- signif(addmargins(prop.table(tabl)), digits)}
  }
  names(out) <- modnames
  return(out)
}
#### Weighted 2-class confusion matrix, from probability and truth vectors ----
ConfusionMatrixWeighted <- function(preds, truth, dimnames=NULL, digits=2) {
  # preds = predicted probabilities of the TRUE class
  # truth = true class membership (logical vector, same length as preds)
  # dimnames = optional names for the two table dimensions (truth, prediction)
  # digits = decimal places to round each cell total to
  # Each case contributes its predicted probability to the TRUE column and
  # the complement to the FALSE column of the row for its true class.
  q <- 1 - preds
  # Construct the data; matrix() fills column-major, so the first pair is the
  # FALSE-prediction column and the second pair is the TRUE-prediction column
  out <- round(matrix(c(sum(q[truth == FALSE]),
                        sum(q[truth == TRUE]),
                        sum(preds[truth == FALSE]),
                        sum(preds[truth == TRUE])),
                      nrow=2, dimnames=list(c("FALSE","TRUE"), c("FALSE","TRUE"))), digits)
  # Coerce to table so it prints like the unweighted confusion matrices
  out <- as.table(out)
  # The dimnames argument shadows base::dimnames() inside this function, so
  # the dimension names must be set via attributes(), not dimnames(out)
  names(attributes(out)$dimnames) <- dimnames
  return(out)
}
#### Calculate Lp norm of a vector; defaults to L2 norm ----
LpNorm <- function(data, p=2) {
  # data = numeric vector
  # p = order of the norm to calculate (p > 0); p=Inf gives the maximum norm
  # The "L0 norm" is not a true norm; keep returning NA as a sentinel
  if (p == 0) {
    return(NA)
  }
  # Maximum (Chebyshev) norm as the p -> Inf limiting case
  if (is.infinite(p)) {
    return(max(abs(data)))
  }
  # BUG FIX: abs() must be applied BEFORE raising to p. The original
  # abs(data^p) returns NaN for negative entries with non-integer p,
  # whereas abs(data)^p is always well-defined.
  sum(abs(data)^p)^(1 / p)
}
#### Make a random forest error plot with tidyr and ggplot2 ----
PlotForest <- function(forest, title=NULL) {
  # forest = a fitted forest object exposing $ntree and an $err.rate matrix
  # title = optional plot title
  # Attach the tree index as the first column, then reshape the error-rate
  # matrix from wide (one column per error series) to long Label/Error pairs
  err <- data.frame(trees=seq_len(forest$ntree), forest$err.rate)
  err <- gather(err, "Label", "Error", 2:ncol(err))
  # One line per error series, with the y axis anchored at zero
  ggplot(err, aes(x=trees, y=Error, color=Label)) +
    geom_line() +
    labs(title=title) +
    scale_y_continuous(limits=c(0, max(err$Error)))
}
#### Function to calculate error rates per stage ----
CalcErrorRates <- function(confusion.matrices, model.name, modnames=c("Plan", "Clear", "Ask", "Oral")) {
  # confusion.matrices = named list of confusion tables; output from ConfusionMatrix
  # model.name = string; row label (Model column) for the current model
  # modnames = names of the four stage tables to pull from confusion.matrices
  # Returns a one-row data frame of misclassification counts per stage.
  # Error count for one stage = false positives + false negatives
  miscount <- function(cm) {cm["FALSE","TRUE"] + cm["TRUE","FALSE"]}
  output <- data.frame(
    Model = model.name,
    Plan = miscount(confusion.matrices[[modnames[1]]]),
    Clear = miscount(confusion.matrices[[modnames[2]]]),
    Ask = miscount(confusion.matrices[[modnames[3]]]),
    # Oral keeps a fallback for when the full miscount cannot be indexed
    # (presumably a table with no TRUE row — confirm against callers).
    # BUG FIX: the original indexed an undefined object `confuse`, so the
    # tryCatch swallowed that error and Oral ALWAYS used the fallback sum.
    Oral = tryCatch(miscount(confusion.matrices[[modnames[4]]]),
                    error=function(.) sum(confusion.matrices[[modnames[4]]]["FALSE","TRUE"]))
  )
  return(output)
}