-
Notifications
You must be signed in to change notification settings - Fork 45
/
MatchBalance.R
executable file
·153 lines (139 loc) · 5.86 KB
/
MatchBalance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#' Calculates balance statistics for matching.
#'
#' @param df data frame with data to match.
#' @param formu the formula used to estimate propensity scores using logistic
#' regression.
#' @param formu.Y (optional) the formula used to estimate a multiple regression model with
#' covariates. Used to estimate covariate importance.
#' @param index.treated a vector of integers corresponding to the rows in \code{df}
#' for the matched treated units. That is, \code{index.treated[i]} is matched
#' to \code{index.control[i]}.
#' @param index.control a vector of integers corresponding to the rows in \code{df}
#' for the matched control units. That is, \code{index.treated[i]} is matched
#' to \code{index.control[i]}.
#' @param exact.covs character vector of covariates to match exactly on. For
#' quantitative variables with more than two unique values, matches will
#' be made within quintiles.
#' @param tolerance the maximum standard deviation the difference between matches
#' on a quantitative covariate can be to be considered matched.
#' @param n.levels maximum number of unique values in a covariate to attempt to
#' match exactly. If the number of unique values is greater than \code{n.levels},
#' then the covariate will be cut using quantiles and exact matches will
#' be estimated within each quantile.
#' @param ... other parameters passed to \code{\link{Match}}.
#' @return An object of class \code{MatchBalance} with the following elements:
#' \describe{
#' \item{ps.out}{Results from the logistic regression model for treatment.}
#' \item{Y.out}{Results from the multiple regression model for Y.}
#' \item{covariate.matched}{a named numeric vector containing the percent
#' of observations with "perfect" match for each covariate.}
#' \item{covariate.balance}{a named numeric vector containing the percent
#' of covariates with "perfect" match for each observation.}
#' \item{covariate.importance}{a data frame with covariate importance.
#' Covariate importance is estimated using the sum of the absolute
#' t-statistic from both glm calls for treatment and outcome.}
#' \item{index.treated}{vector containing observation row number from original
#' data frame for treated observations.}
#' \item{index.control}{vector containing observatino row number from original
#' data frame for control observations.}
#' }
#' @export
MatchBalance <- function(df, formu, formu.Y, exact.covs,
index.treated, index.control,
tolerance = 0.25,
n.levels = 2, ...) {
n.strata <- 5
if(is.logical(class(df[,all.vars(formu)[1]]))) {
df[,all.vars(formu)[1]] <- as.integer(df[,all.vars(formu)[1]])
}
lr.out <- glm(formu, family=binomial, data=df)
glm.out <- NULL
if(!missing(formu.Y)) {
glm.out <- glm(formu.Y, data=lr.out$data)
}
var.imp <- data.frame(Treat=summary(lr.out)$coefficients[,'z value'])
if(!missing(formu.Y)) {
var.imp$Y <- summary(glm.out)$coefficients[,'t value']
var.imp$Total <- abs(var.imp$Treat) + abs(var.imp$Y)
} else {
var.imp$Total <- abs(var.imp$Treat)
}
var.imp <- var.imp[-1,] # Remove intercept
var.imp <- var.imp[order(var.imp$Total, decreasing=TRUE),,drop=FALSE]
if(!missing(index.treated) & !missing(index.control)) {
if(!missing(exact.covs)) {
stop('Specifying exact.covs is not supported when also specifying
index.treated and index.control')
}
if(length(index.treated) != length(index.control) &
length(index.treated) != nrow(df)) {
stop('length(index.treated) != length(index.control) != nrow(df)')
}
} else {
exact <- list()
if(!missing(exact.covs)) {
for(i in exact.covs) {
if(is.numeric(df[,i]) & length(unique(df[,i])) > n.levels) {
q <- quantile(df[,i], seq(0, 1, 1/n.strata))
if(length(unique(q)) != (n.strata+5)) {
# breaks would not be unique so we'll treat as a qualitative covariate
exact[[i]] <- df[,i]
} else {
exact[[i]] <- cut(df[,i], q, include.lowest=TRUE,
labels=letters[1:n.strata])
}
} else {
exact[[i]] <- df[,i]
}
}
}
if(length(exact) > 0) {
match.out <- Matchby(Tr = df[,all.vars(formu)[1]],
X = fitted(lr.out),
by = exact,
print.level = 0, ... )
} else {
match.out <- Match(Tr = df[,all.vars(formu)[1]],
X = fitted(lr.out),
... )
}
index.treated <- match.out$index.treated
index.control <- match.out$index.control
}
df.matrix <- model.matrix(formu, data=df)
df.matrix <- df.matrix[,-1] # Remove the intercept
df.cov.bal <- data.frame(row.names=1:length(index.treated))
for(i in seq_len(ncol(df.matrix))) {
if(is.factor(df.matrix[,i]) |
is.character(df.matrix[,i]) |
is.logical(df.matrix[,i])) {
# This should not occur. Left over from using data.frame
df.cov.bal[,colnames(df.matrix)[i]] <-
df.matrix[index.treated,i] == df.matrix[index.control,i]
} else if(is.numeric(df.matrix[,i])) {
diff <- abs(df.matrix[index.treated,i] - df.matrix[index.control,i])
df.cov.bal[,colnames(df.matrix)[i]] <- diff / sd(df.matrix[,i])
} else {
warning(paste0('Unknown column type for ', names(df)[i]))
}
}
col.percents <- apply(df.cov.bal[,1:(ncol(df.cov.bal))] < tolerance, 2, sum) /
nrow(df.cov.bal) * 100
row.percents <- apply(df.cov.bal[,1:(ncol(df.cov.bal))] < tolerance, 1, sum) /
ncol(df.cov.bal) * 100
results <- list(
df = df,
df.matrix = df.matrix,
ps.out = lr.out,
Y.out = glm.out,
covariate.matched = col.percents,
observation.matched = row.percents,
covariate.balance = df.cov.bal,
covariate.importance = var.imp,
index.treated = index.treated,
index.control = index.control,
tolerance = tolerance
)
class(results) <- 'MatchBalance'
return(results)
}