diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1da087112..5e10ea6be 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,7 +2,11 @@ on: push: branches: - master + - cranversion pull_request: + branches: + - master + - cranversion name: R-CMD-check diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index d4efff65e..92a57554d 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -2,9 +2,11 @@ on: push: branches: - master + - cranversion pull_request: branches: - master + - cranversion name: lint diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 5e9a81150..23924be88 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -1,7 +1,8 @@ on: push: - branches: master - + branches: + - master + - cranversion name: pkgdown jobs: diff --git a/CRAN-RELEASE b/CRAN-RELEASE new file mode 100644 index 000000000..92f8ef667 --- /dev/null +++ b/CRAN-RELEASE @@ -0,0 +1,2 @@ +This package was submitted to CRAN on 2021-01-21. +Once it is accepted, delete this file and tag the release (commit 8bad7333). diff --git a/DESCRIPTION b/DESCRIPTION index b8cdd3fe6..1126bf221 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: shapr -Version: 0.1.3 +Version: 0.1.4.9000 Title: Prediction Explanation with Dependence-Aware Shapley Values Description: Complex machine learning models are often hard to interpret. However, in many situations it is crucial to understand and explain why a model made a specific diff --git a/NAMESPACE b/NAMESPACE index 54896f88c..7e20d200b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,17 +6,17 @@ S3method(explain,ctree) S3method(explain,ctree_comb_mincrit) S3method(explain,empirical) S3method(explain,gaussian) -S3method(features,gam) -S3method(features,glm) -S3method(features,lm) -S3method(features,ranger) -S3method(features,xgb.Booster) -S3method(model_type,default) -S3method(model_type,gam) -S3method(model_type,glm) -S3method(model_type,lm) -S3method(model_type,ranger) -S3method(model_type,xgb.Booster) +S3method(get_model_specs,gam) +S3method(get_model_specs,glm) +S3method(get_model_specs,lm) +S3method(get_model_specs,ranger) +S3method(get_model_specs,xgb.Booster) +S3method(model_checker,default) +S3method(model_checker,gam) +S3method(model_checker,glm) +S3method(model_checker,lm) +S3method(model_checker,ranger) +S3method(model_checker,xgb.Booster) S3method(plot,shapr) S3method(predict_model,default) S3method(predict_model,gam) @@ -29,21 +29,25 @@ S3method(prepare_data,ctree) S3method(prepare_data,empirical) S3method(prepare_data,gaussian) export(aicc_full_single_cpp) +export(check_features) export(correction_matrix_cpp) export(create_ctree) export(explain) export(feature_combinations) export(feature_matrix_cpp) -export(features) +export(get_data_specs) +export(get_model_specs) export(hat_matrix_cpp) export(mahalanobis_distance_cpp) export(make_dummies) -export(model_type) +export(model_checker) export(observation_impute_cpp) export(predict_model) export(prepare_data) +export(preprocess_data) export(rss_cpp) export(shapr) +export(update_data) export(weight_matrix_cpp) importFrom(Rcpp,sourceCpp) importFrom(data.table,":=") @@ -65,9 +69,12 @@ importFrom(graphics,hist) importFrom(graphics,plot) importFrom(graphics,rect) importFrom(stats,as.formula) +importFrom(stats,contrasts) importFrom(stats,model.frame) importFrom(stats,model.matrix) importFrom(stats,predict) +importFrom(stats,setNames) importFrom(utils,head) +importFrom(utils,methods) importFrom(utils,tail) useDynLib(shapr, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index e6626e3cd..84ec247d7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,8 @@ +# shapr 0.1.4 + +* Patch to fulfill CRAN policy of using packages under Suggests conditionally (in tests and examples) + # shapr 0.1.3 * Fix installation error on Solaris diff --git a/R/explanation.R b/R/explanation.R index 58a28104e..1c663381c 100644 --- a/R/explanation.R +++ b/R/explanation.R @@ -16,11 +16,11 @@ #' #' @param ... Additional arguments passed to \code{\link{prepare_data}} #' -#' @details The most important thing to notice is that \code{shapr} has implemented three different +#' @details The most important thing to notice is that \code{shapr} has implemented four different #' approaches for estimating the conditional distributions of the data, namely \code{"empirical"}, -#' \code{"gaussian"} and \code{"copula"}. +#' \code{"gaussian"}, \code{"copula"} and \code{"ctree"}. #' -#' In addition to this the user will also have the option of combining the three approaches. +#' In addition, the user also has the option of combining the four approaches. #' E.g. if you're in a situation where you have trained a model the consists of 10 features, #' and you'd like to use the \code{"gaussian"} approach when you condition on a single feature, #' the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version @@ -60,9 +60,10 @@ #' #' @export #' -#' @author Camilla Lingjaerde, Nikolai Sellereite +#' @author Camilla Lingjaerde, Nikolai Sellereite, Martin Jullum, Annabelle Redelmeier #' #' @examples +#' if (requireNamespace("MASS", quietly = TRUE)) { #' # Load example data #' data("Boston", package = "MASS") #' @@ -99,19 +100,22 @@ #' print(explain1$dt) #' #' # Plot the results +#' if (requireNamespace("ggplot2", quietly = TRUE)) { #' plot(explain1) +#' } +#' } explain <- function(x, explainer, approach, prediction_zero, ...) { extras <- list(...) # Check input for x if (!is.matrix(x) & !is.data.frame(x)) { - stop("x should be a matrix or a dataframe.") + stop("x should be a matrix or a data.frame/data.table.") } # Check input for approach if (!(is.vector(approach) && is.atomic(approach) && - (length(approach) == 1 | length(approach) == length(explainer$feature_labels)) && + (length(approach) == 1 | length(approach) == length(explainer$feature_list$labels)) && all(is.element(approach, c("empirical", "gaussian", "copula", "ctree")))) ) { stop( @@ -123,16 +127,7 @@ explain <- function(x, explainer, approach, prediction_zero, ...) { ) } - # Check that x contains correct variables - if (!all(explainer$feature_labels %in% colnames(x))) { - stop( - paste0( - "\nThe test data, x, does not contain all features necessary for\n", - "generating predictions. Please modify x so that all labels given\n", - "by explainer$feature_labels is present in colnames(x)." - ) - ) - } + if (length(approach) > 1) { class(x) <- "combined" @@ -175,7 +170,7 @@ explain.empirical <- function(x, explainer, approach, prediction_zero, start_aicc = 0.1, w_threshold = 0.95, ...) { # Add arguments to explainer object - explainer$x_test <- explainer_x_test(x, explainer$feature_labels) + explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) explainer$approach <- approach explainer$type <- type explainer$fixed_sigma_vec <- fixed_sigma_vec @@ -207,8 +202,9 @@ explain.empirical <- function(x, explainer, approach, prediction_zero, #' @export explain.gaussian <- function(x, explainer, approach, prediction_zero, mu = NULL, cov_mat = NULL, ...) { + # Add arguments to explainer object - explainer$x_test <- explainer_x_test(x, explainer$feature_labels) + explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) explainer$approach <- approach # If mu is not provided directly, use mean of training data @@ -246,7 +242,7 @@ explain.gaussian <- function(x, explainer, approach, prediction_zero, mu = NULL, explain.copula <- function(x, explainer, approach, prediction_zero, ...) { # Setup - explainer$x_test <- explainer_x_test(x, explainer$feature_labels) + explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) explainer$approach <- approach # Prepare transformed data @@ -314,7 +310,7 @@ explain.ctree <- function(x, explainer, approach, prediction_zero, } # Add arguments to explainer object - explainer$x_test <- explainer_x_test_dt(x, explainer$feature_labels) + explainer$x_test <- preprocess_data(x, explainer$feature_list)$x_dt explainer$approach <- approach explainer$mincriterion <- mincriterion explainer$minsplit <- minsplit @@ -341,7 +337,7 @@ explain.combined <- function(x, explainer, approach, prediction_zero, # Get indices of combinations l <- get_list_approaches(explainer$X$n_features, approach) explainer$return <- TRUE - explainer$x_test <- explainer_x_test(x, explainer$feature_labels) + explainer$x_test <- as.matrix(preprocess_data(x, explainer$feature_list)$x_dt) dt_l <- list() for (i in seq_along(l)) { @@ -398,32 +394,6 @@ get_list_approaches <- function(n_features, approach) { return(l) } -#' @keywords internal -explainer_x_test <- function(x_test, feature_labels) { - - # Remove variables that were not used for training - x <- data.table::as.data.table(x_test) - cnms_remove <- setdiff(colnames(x), feature_labels) - if (length(cnms_remove) > 0) x[, (cnms_remove) := NULL] - data.table::setcolorder(x, feature_labels) - - return(as.matrix(x)) -} - -#' @keywords internal -explainer_x_test_dt <- function(x_test, feature_labels) { - - # Remove variables that were not used for training - # Same as explainer_x_test() but doesn't convert to a matrix - # Useful for ctree method which sometimes takes categorical features - x <- data.table::as.data.table(x_test) - cnms_remove <- setdiff(colnames(x), feature_labels) - if (length(cnms_remove) > 0) x[, (cnms_remove) := NULL] - data.table::setcolorder(x, feature_labels) - - return(x) -} - #' @rdname explain #' @name explain @@ -462,15 +432,3 @@ get_list_ctree_mincrit <- function(n_features, mincriterion) { } return(l) } - -#' @keywords internal -explainer_x_test <- function(x_test, feature_labels) { - - # Remove variables that were not used for training - x <- data.table::as.data.table(x_test) - cnms_remove <- setdiff(colnames(x), feature_labels) - if (length(cnms_remove) > 0) x[, (cnms_remove) := NULL] - data.table::setcolorder(x, feature_labels) - - return(as.matrix(x)) -} diff --git a/R/features.R b/R/features.R index e006dcbf1..615a74898 100644 --- a/R/features.R +++ b/R/features.R @@ -181,28 +181,23 @@ helper_feature <- function(m, feature_sample) { #' #' @return A list that contains the following entries: #' \describe{ -#' \item{obj}{List, Contains \describe{ -#' \item{features}{Vector. Contains the names of all the features in \code{data}.} -#' \item{factor_features}{Vector. Contains the names of all the factors in \code{data}.} -#' \item{factor_list}{List. Contains each factor and its vector of levels.} -#' \item{contrasts_list}{List. Contains all the contrasts of the factors.} -#' }} +#' \item{feature_list}{List. Output from \code{check_features}} #' \item{train_dummies}{A data.frame containing all of the factors in \code{traindata} as #' one-hot encoded variables.} #' \item{test_dummies}{A data.frame containing all of the factors in \code{testdata} as #' one-hot encoded variables.} #' \item{traindata_new}{Original traindata with correct column ordering and factor levels. To be passed to -#' \code{shapr()}.} +#' \code{\link[shapr:shapr]{shapr}.}} #' \item{testdata_new}{Original testdata with correct column ordering and factor levels. To be passed to -#' \code{explain()}.} +#' \code{\link[shapr:explain]{explain}.}} #' } #' #' @export #' -#' @author Annabelle Redelmeier +#' @author Annabelle Redelmeier, Martin Jullum #' #' @examples -#' +#' if (requireNamespace("MASS", quietly = TRUE)) { #' data("Boston", package = "MASS") #' x_var <- c("lstat", "rm", "dis", "indus") #' y_var <- "medv" @@ -215,171 +210,115 @@ helper_feature <- function(m, feature_sample) { #' x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm)) #' #' dummylist <- make_dummies(traindata = x_train, testdata = x_test) -#' +#'} make_dummies <- function(traindata, testdata) { - contrasts <- features <- factor_features <- NULL # due to NSE notes in R CMD check - # <- model.frame <- model.matrix - if (is.null(colnames(traindata))) { - stop("traindata must have column names.") - } - if (is.null(colnames(testdata))) { - stop("testdata must have column names.") - } - - traindata <- data.table::as.data.table(traindata) - testdata <- data.table::as.data.table(testdata) - - if (length(colnames(traindata)) != length(colnames(testdata))) { - stop("traindata and testdata must have the same number of columns.") - } - - if (!all(sort(colnames(traindata)) == sort(colnames(testdata)))) { - stop("traindata and testdata must have the same column names.") + if(all(is.null(colnames(traindata)))){ + stop(paste0("The traindata is missing column names")) } - features <- colnames(traindata) - # feature names must be unique - if (any(duplicated(features))) { - stop("Both traindata and testdata must have unique column names.") + if(all(is.null(colnames(testdata)))){ + stop(paste0("The testdata is missing column names")) } - # Check if any features have empty names i.e "" - if (any(features == "")) { - stop("One or more features is missing a name.") - } + train_dt <- data.table::as.data.table(traindata) + test_dt <- data.table::as.data.table(testdata) - # In case the testing data has a different column order than the training data: - testdata <- testdata[, features, with = FALSE] + feature_list_train <- get_data_specs(train_dt) + feature_list_test <- get_data_specs(test_dt) - # Check if the features all have class "integer", "numeric" or "factor - if (!all(sapply(traindata, class) %in% c("integer", "numeric", "factor"))) { - stop("All traindata must have class integer, numeric or factor.") - } - if (!all(sapply(testdata, class) %in% c("integer", "numeric", "factor"))) { - stop("All testdata must have class integer, numeric or factor.") - } - # Check if traindata and testdata have features with the same class - if (!all(sapply(traindata, class) == sapply(testdata, class))) { - stop("All traindata and testdata must have the same classes.") - } + feature_list_train$specs_type="traindata" + feature_list_test$specs_type="testdata" - # Check that traindata and testdata have the same levels for the factor features - is_factor <- sapply(traindata, is.factor) # check which features are factors - nb_factor <- sum(is_factor) + updater <- check_features(feature_list_train,feature_list_test,F) - list_levels_train <- lapply(traindata[, is_factor, with = FALSE], function(x) sort(levels(x))) - list_levels_test <- lapply(testdata[, is_factor, with = FALSE], function(x) sort(levels(x))) + # Reorderes factor levels so that they match each other + update_data(train_dt,updater) + update_data(test_dt,updater) - if (!identical(list_levels_train, list_levels_test)) { - stop("Levels of categorical variables in traindata and testdata must be the same.") - } + feature_list <- updater - # re-level traindata and testdata - for (i in names(list_levels_train)) { - traindata[[i]] <- factor(traindata[[i]], levels = list_levels_train[[i]]) - } - for (i in names(list_levels_test)) { - testdata[[i]] <- factor(testdata[[i]], levels = list_levels_test[[i]]) - } + # Extracts the components + factor_features <- feature_list$labels[updater$classes=="factor"] - if (nb_factor > 0) { - factor_features <- features[is_factor] - factor_list <- lapply(traindata[, factor_features, with = FALSE], levels) - } else { - factor_features <- NULL - factor_list <- NULL - } + if(length(factor_features)>0){ + factor_list <- feature_list$factor_levels[factor_features] + feature_list$contrasts_list <- lapply(train_dt[, factor_features, with = FALSE], contrasts, contrasts = FALSE) - contrasts_list <- lapply(traindata[, factor_features, with = FALSE], contrasts, contrasts = FALSE) + # get train dummies + m <- model.frame(data = train_dt, + xlev = factor_list) + train_dummies <- model.matrix(object = ~. + 0, + data = m, + contrasts.arg = feature_list$contrasts_list) - obj <- list(features = features, - factor_features = factor_features, - factor_list = factor_list, - contrasts_list = contrasts_list, - class_vector = sapply(traindata, class)) + # get test dummies + m <- model.frame(data = test_dt, + xlev = factor_list) + test_dummies <- model.matrix(object = ~. + 0, + data = m, + contrasts.arg = feature_list$contrasts_list) - # get train dummies - m <- model.frame(data = traindata, - xlev = obj$factor_list) - train_dummies <- model.matrix(object = ~. + 0, - data = m, - contrasts.arg = obj$contrasts_list) - # get test dummies - m <- model.frame(data = testdata, - xlev = obj$factor_list) - test_dummies <- model.matrix(object = ~. + 0, - data = m, - contrasts.arg = obj$contrasts_list) + } else { + train_dummies <- train_dt + test_dummies <- test_dt + } - return(list(obj = obj, train_dummies = train_dummies, test_dummies = test_dummies, traindata_new = traindata, - testdata_new = testdata)) + return(list(feature_list = feature_list, + train_dummies = train_dummies, test_dummies = test_dummies, traindata_new = train_dt, + testdata_new = test_dt)) } -#' Make dummy variables - this is an internal function intended only to be used in +#' Apply dummy variables - this is an internal function intended only to be used in #' predict_model.xgb.Booster() #' -#' @param obj List. Output of \code{make_dummies$obj}. +#' @param feature_list List. The \code{feature_list} object in the output object after running +#' \code{\link[shapr:make_dummies]{make_dummies}} #' #' @param testdata data.table or data.frame. New data that has the same -#' feature names, types, and levels as \code{obj$data}. +#' feature names, types, and levels as \code{feature_list}. #' -#' @return A data.frame containing all of the factors in \code{testdata} as -#' one-hot encoded variables. +#' @return A data.table with all features but where the factors in \code{testdata} are +#' one-hot encoded variables as specified in feature_list #' -#' @author Annabelle Redelmeier +#' @author Annabelle Redelmeier, Martin Jullum #' #' @keywords internal #' -apply_dummies <- function(obj, testdata) { +apply_dummies <- function(feature_list, testdata) { + - features <- NULL # due to NSE notes in R CMD check - if (is.null(colnames(testdata))) { - stop("testdata must have column names.") + if(all(is.null(colnames(testdata)))){ + stop(paste0("The testdata is missing column names")) } + test_dt <- data.table::as.data.table(testdata) - testdata <- data.table::as.data.table(testdata) - features <- obj$features + feature_list_test <- get_data_specs(test_dt) - if (length(features) != length(colnames(testdata))) { - stop("testdata must have the same number of columns as traindata.") - } - if (!all(sort(features) == sort(colnames(testdata)))) { - stop("testdata must have the same column names as traindata.") - } + feature_list_test$specs_type="testdata" - # in case the testing data has a different column order or more columns than the training data: - testdata <- testdata[, features, with = FALSE] + updater <- check_features(feature_list,feature_list_test,F) - if (!all(sapply(testdata, class) %in% c("integer", "numeric", "factor"))) { - stop("All testdata must have class integer, numeric or factor.") - } - if (!all(obj$class_vector == sapply(testdata, class))) { - stop("All traindata and testdata must have the same classes.") - } + # Reorderes factor levels so that they match + update_data(test_dt,updater) - # Check that traindata and testdata have the same levels for the factor features - is_factor <- obj$factor_features - list_levels_train <- obj$factor_list - list_levels_test <- lapply(testdata[, is_factor, with = FALSE], function(x) sort(levels(x))) + factor_features <- feature_list$labels[updater$classes=="factor"] # check which features are factors - if (!identical(list_levels_train, list_levels_test)) { - stop("Levels of categorical variables in traindata and testdata must be the same.") - } + if(length(factor_features)>0){ + factor_list <- feature_list$factor_levels[factor_features] - # re-level testdata - for (i in names(list_levels_test)) { - testdata[[i]] <- factor(testdata[[i]], levels = list_levels_test[[i]]) - } + m <- model.frame(data = test_dt, + xlev = factor_list) - m <- model.frame(data = testdata, - xlev = obj$factor_list) + x <- model.matrix(object = ~. + 0, + data = m, + contrasts.arg = feature_list$contrasts_list) + } else { + x <- test_dt + } - x <- model.matrix(object = ~. + 0, - data = m, - contrasts.arg = obj$contrasts_list) return(x) } diff --git a/R/models.R b/R/models.R index 82561bd7c..d82a274a6 100644 --- a/R/models.R +++ b/R/models.R @@ -26,8 +26,10 @@ #' If you have a binary classification model we'll always return the probability prediction #' for a single class. #' -#' For more details on how to use a custom model see the package vignette: \cr -#' \code{vignette("understanding_shapr", package = "shapr")} +#' For more details on how to explain other types of models (i.e. custom models), see the Advanced usage section +#' of the vignette: \cr +#' From R: \code{vignette("understanding_shapr", package = "shapr")} \cr +#' Web: \url{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#explain-custom-models} #' #' @return Numeric #' @@ -36,6 +38,7 @@ #' #' @author Martin Jullum #' @examples +#' if (requireNamespace("MASS", quietly = TRUE)) { #'# Load example data #' data("Boston", package = "MASS") #' # Split data into test- and training data @@ -46,6 +49,7 @@ #' #' # Predicting for a model with a standardized format #' predict_model(x = model, newdata = x_test) +#' } predict_model <- function(x, newdata) { UseMethod("predict_model", x) } @@ -96,9 +100,6 @@ predict_model.ranger <- function(x, newdata) { stop("The ranger package is required for predicting ranger models") } - # Test model type - model_type <- model_type(x) - if (x$treetype == "Probability estimation") { predict(x, newdata)$predictions[, 2] } else { @@ -114,14 +115,11 @@ predict_model.xgb.Booster <- function(x, newdata) { stop("The xgboost package is required for predicting xgboost models") } - # Test model type - model_type <- model_type(x) - - if (model_type %in% c("cat_regression", "cat_classification")) { - newdata_dummy <- apply_dummies(obj = x$dummylist, testdata = newdata) - predict(x, as.matrix(newdata_dummy)) - } else { + if (is.null(x$feature_list)) { predict(x, as.matrix(newdata)) + } else { + newdata_dummy <- apply_dummies(feature_list = x$feature_list, testdata = newdata) + predict(x, as.matrix(newdata_dummy)) } } @@ -145,24 +143,24 @@ predict_model.gam <- function(x, newdata) { } -#' Define type of model +#' Check that the type of model is supported by the explanation method #' -#' @description The function checks whether the model given by \code{x} is -#' supported, and if it is a regression- or a classification model. If \code{x} is -#' not a supported model the function will return an error message, otherwise it will -#' return either \code{"regression"} or \code{"classification"}. +#' @description The function checks whether the model given by \code{x} is supported. +#' If \code{x} is not a supported model the function will return an error message, otherwise it return NULL +#' (meaning all types of models with this class is supported) #' #' @inheritParams predict_model #' #' @details See \code{\link{predict_model}} for more information about #' what type of models \code{shapr} currently support. #' -#' @return Either \code{"classification"} or \code{"regression"}. +#' @return Error or NULL #' #' @export #' @keywords internal #' #' @examples +#' if (requireNamespace("MASS", quietly = TRUE)) { #' # Load example data #' data("Boston", package = "MASS") #' # Split data into test- and training data @@ -170,38 +168,35 @@ predict_model.gam <- function(x, newdata) { #' # Fit a linear model #' model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) #' -#' # Writing out the defined model type of the object -#' model_type(x = model) -model_type <- function(x) { - UseMethod("model_type") +#' # Checking the model object +#' model_checker(x = model) +#' } +model_checker <- function(x) { + UseMethod("model_checker", x) } -#' @rdname model_type +#' @rdname model_checker #' @export -model_type.default <- function(x) { - stop("The model you passed to shapr is currently not supported.") +model_checker.default <- function(x) { + stop("The model class you passed to shapr is currently not supported.") } -#' @rdname model_type +#' @rdname model_checker #' @export -model_type.lm <- function(x) { - "regression" +model_checker.lm <- function(x) { + NULL } -#' @rdname model_type +#' @rdname model_checker #' @export -model_type.glm <- function(x) { - ifelse( - x$family[[1]] == "binomial", - "classification", - "regression" - ) +model_checker.glm <- function(x) { + NULL } -#' @rdname model_type -#' @name model_type +#' @rdname model_checker +#' @name model_checker #' @export -model_type.ranger <- function(x) { +model_checker.ranger <- function(x) { if (x$treetype == "Classification") { stop( @@ -214,10 +209,16 @@ model_type.ranger <- function(x) { ) } - if (x$treetype == "Probability estimation") { - if (length(x$forest$levels) == 2) { - "classification" - } else { + if (x$treetype == "survival") { + stop( + paste0( + "\n", + "We currently don't support explanation of survival type of ranger models." + ) + ) + } + + if (x$treetype == "Probability estimation" & length(x$forest$levels) > 2) { stop( paste0( "\n", @@ -225,25 +226,31 @@ model_type.ranger <- function(x) { "where length(model$forest$levels) is greater than 2." ) ) - } - } else { - "regression" } + + # Additional check + if (is.null(x$forest)) { + stop( + paste0( + "\nIt looks like the model was fitted without saving the forest. Please set\n", + "write.forest = TRUE when fitting a model using ranger::ranger()." + ) + ) + } + + + return(NULL) } -#' @rdname model_type +#' @rdname model_checker #' @export -model_type.gam <- function(x) { - ifelse( - x$family[[1]] == "binomial", - "classification", - "regression" - ) +model_checker.gam <- function(x) { + NULL } -#' @rdname model_type +#' @rdname model_checker #' @export -model_type.xgb.Booster <- function(x) { +model_checker.xgb.Booster <- function(x) { if (!is.null(x$params$objective) && (x$params$objective == "multi:softmax" | x$params$objective == "multi:softprob") @@ -267,163 +274,218 @@ model_type.xgb.Booster <- function(x) { ) ) } - - ifelse( - !is.null(x$params$objective) && x$params$objective == "binary:logistic", - ifelse(is.null(x$dummylist), "classification", "cat_classification"), - ifelse(is.null(x$dummylist), "regression", "cat_regression") - ) + return(NULL) } -#' Fetches feature labels from a given model object +#' Fetches feature information from a given model object #' #' @inheritParams predict_model -#' @param cnms Character vector. Represents the names of the columns in the data used for training/explaining. -#' @param feature_labels Character vector. Represents the labels of the features used for prediction. #' -#' @keywords internal +#' @details This function is used to extract the feature information to be checked against data passed to \code{shapr} +#' and \code{explain}. The function is called from \code{preprocess_data}. +#' +#' @return A list with the following elements: +#' \describe{ +#' \item{labels}{character vector with the feature names to compute Shapley values for} +#' \item{classes}{a named character vector with the labels as names and the class type as elements} +#' \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements +#' (NULL if the feature is not a factor)} +#' } +#' +#' @author Martin Jullum #' -#' @export #' @keywords internal +#' @export #' #' @examples +#' if (requireNamespace("MASS", quietly = TRUE)) { #'# Load example data #' data("Boston", package = "MASS") #' # Split data into test- and training data -#' x_train <- head(Boston, -3) -#' # Fit a linear model -#' model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) -#' -#' cnms <- c("lstat", "rm", "dis", "indus") +#' x_train <- data.table::as.data.table(head(Boston)) +#' x_train[,rad:=as.factor(rad)] +#' model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) #' -#' # Checking that features used by the model corresponds to cnms -#' features(x = model, cnms = cnms, feature_labels = NULL) -features <- function(x, cnms, feature_labels = NULL) { - UseMethod("features", x) -} +#' get_model_specs(model) +#'} +get_model_specs <- function(x) { -#' @rdname features -features.default <- function(x, cnms, feature_labels = NULL) { + model_class <- NULL # Due to NSE notes in R CMD check - if (is.null(feature_labels)) { + required_model_objects <- "predict_model" + recommended_model_objects <- "get_model_specs" + + # Start with all checking for native models + model_info <- get_supported_models()[model_class==class(x)[1],] + available_model_objects <- names(which(unlist(model_info[,2:3]))) + + if(nrow(model_info)==0){ + stop( + "You passed a model to shapr which is not natively supported See ?shapr::shapr or the vignette\n", + "for more information on how to run shapr with custom models." + ) + } + + if(!(all(required_model_objects %in% available_model_objects))) + { + this_object_missing <- which(!(required_model_objects %in% available_model_objects)) stop( paste0( - "\nIt looks like you are using a custom model, and forgot to pass\n", - "a valid value for the argument feature_labels when calling shapr().\n", - "See ?shapr::shapr for more information about the argument." + "The following required model objects are not available for your custom model: ", + paste0(required_model_objects[this_object_missing],collapse = ", "),".\n", + "See the 'Advanced usage' section of the vignette:\n", + "vignette('understanding_shapr', package = 'shapr')\n", + "for more information.\n" ) ) } - if (!all(feature_labels %in% cnms)) { - stop( + if(!(all(recommended_model_objects %in% available_model_objects))) + { + this_object_missing <- which(!(recommended_model_objects %in% available_model_objects)) + message( paste0( - "\nThere is mismatch between the column names in x and\n", - "feature_labels. All elements in feature_labels should\n", - "be present in colnames(x)." + paste0(recommended_model_objects[this_object_missing],collapse = ", ")," is not available for your custom ", + "model. All feature consistency checking between model and data is disabled.\n", + "See the 'Advanced usage' section of the vignette:\n", + "vignette('understanding_shapr', package = 'shapr')\n", + "for more information.\n" ) ) } - feature_labels + + UseMethod("get_model_specs", x) } -#' @rdname features +#' @rdname get_model_specs +get_model_specs.default <- function(x) { + + # For custom models where there is no + return(list(labels = NA, classes = NA,factor_levels = NA)) +} + + +#' @rdname get_model_specs #' @export -features.lm <- function(x, cnms, feature_labels = NULL) { +get_model_specs.lm <- function(x) { + + model_checker(x) # Checking if the model is supported - if (!is.null(feature_labels)) message_features_labels() + feature_list = list() + feature_list$labels <- labels(x$terms) + m <- length(feature_list$labels) - nms <- tail(all.vars(x$terms), -1) - if (!all(nms %in% cnms) | is.null(nms)) error_feature_labels() + feature_list$classes <- attr(x$terms,"dataClasses")[-1] + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[names(x$xlevels)] <- x$xlevels - return(nms) + return(feature_list) } -#' @rdname features +#' @rdname get_model_specs #' @export -features.glm <- function(x, cnms, feature_labels = NULL) { +get_model_specs.glm <- function(x) { - if (!is.null(feature_labels)) message_features_labels() + model_checker(x) # Checking if the model is supported - nms <- tail(all.vars(x$terms), -1) - if (!all(nms %in% cnms) | is.null(nms)) error_feature_labels() + feature_list = list() + feature_list$labels <- labels(x$terms) + m <- length(feature_list$labels) - return(nms) + feature_list$classes <- attr(x$terms,"dataClasses")[-1] + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[names(x$xlevels)] <- x$xlevels + + return(feature_list) } -#' @rdname features +#' @rdname get_model_specs #' @export -features.ranger <- function(x, cnms, feature_labels = NULL) { +get_model_specs.gam <- function(x) { - if (!is.null(feature_labels)) message_features_labels() + model_checker(x) # Checking if the model is supported - nms <- x$forest$independent.variable.names + feature_list = list() + feature_list$labels <- labels(x$terms) + m <- length(feature_list$labels) - if (is.null(x$forest)) { - stop( - paste0( - "\nIt looks like the model was fitted without saving the forest. Please set\n", - "write.forest = TRUE when fitting a model using ranger::ranger()." - ) - ) - } - nms <- unique_features(nms) - - if (!all(nms %in% cnms) | is.null(nms)) error_feature_labels() + feature_list$classes <- attr(x$terms,"dataClasses")[-1] + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[names(x$xlevels)] <- x$xlevels - return(nms) + return(feature_list) } -#' @rdname features +#' @rdname get_model_specs #' @export -features.gam <- function(x, cnms, feature_labels = NULL) { +get_model_specs.ranger <- function(x) { - if (!is.null(feature_labels)) message_features_labels() + model_checker(x) # Checking if the model is supported - nms <- tail(all.vars(x$terms), -1) + feature_list = list() + feature_list$labels <- unique_features(x$forest$independent.variable.names) + m <- length(feature_list$labels) - if (!all(nms %in% cnms) | is.null(nms)) error_feature_labels() + feature_list$classes <- setNames(rep(NA, m),feature_list$labels) # Not supported + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[names(x$forest$covariate.levels)] <- x$forest$covariate.levels # Only provided when respect.unordered.factors == T - return(nms) + return(feature_list) } -#' @rdname features + +#' @rdname get_model_specs #' @export -features.xgb.Booster <- function(x, cnms, feature_labels = NULL) { - if (!is.null(feature_labels)) message_features_labels() +get_model_specs.xgb.Booster <- function(x) { + + model_checker(x) # Checking if the model is supported - nms <- x$feature_names + feature_list = list() + if (is.null(x[["feature_list"]])) { + feature_list$labels <- x$feature_names + m <- length(feature_list$labels) - if (!is.null(x[["dummylist"]])) { - return(cnms) + feature_list$classes <- setNames(rep(NA, m),feature_list$labels) # Not supported + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) } else { - if (!all(nms %in% cnms)) error_feature_labels() + feature_list <- x$feature_list } - return(nms) -} + return(feature_list) -#' @keywords internal -message_features_labels <- function() { - message( - paste0( - "\nYou have passed a supported model object, and therefore\n", - "features_labels is ignored. The argument is only applicable when\n", - "using a custom model. For more information see ?shapr::shapr." - ) - ) } -#' @keywords internal -error_feature_labels <- function() { - stop( - paste0( - "\nThere is mismatch between the column names in x and\n", - "the returned elements from features(model). All elements\n", - "from features(model) should be present in colnames(x),\n", - "and they cannot be NULL.\n", - "For more information see ?shapr::features" - ) - ) + + + +#' Provides a data.table with the supported models +#' +#'@keywords internal +get_supported_models <- function(){ + + # Fixing NSE notes in R CMD check + rn <- get_model_specs <- native_get_model_specs <- from <- NULL + predict_model <- native_predict_model <- NULL + native <- NULL + + DT_get_model_specs <- data.table::as.data.table(attr(methods(get_model_specs),"info"),keep.rownames = T) + #DT_get_model_specs <- data.table::as.data.table(attr(.S3methods(get_model_specs,envir=globalenv()),"info"),keep.rownames = T) + + DT_get_model_specs[,rn:=substring(as.character(rn),first=17)] + DT_get_model_specs[,get_model_specs:=1] + DT_get_model_specs[,c("visible","from","generic","isS4"):=NULL] + + DT_predict_model <- data.table::as.data.table(attr(methods(predict_model),"info"),keep.rownames = T) + DT_predict_model[,rn:=substring(as.character(rn),first=15)] + DT_predict_model[,predict_model:=1] + DT_predict_model[,c("visible","from","generic","isS4"):=NULL] + + DT <- merge(DT_get_model_specs,DT_predict_model,by="rn",all=T,allow.cartesian=T,nomatch=0) + DT[,(colnames(DT)[-1]):=lapply(.SD,data.table::nafill,fill=0),.SDcols=colnames(DT)[-1]] + DT[,(colnames(DT)[2:3]):=lapply(.SD,as.logical),.SDcols=colnames(DT)[2:3]] + data.table::setnames(DT,"rn","model_class") + return(DT) } + + diff --git a/R/plot.R b/R/plot.R index bd0b9ab9e..59a70cd63 100644 --- a/R/plot.R +++ b/R/plot.R @@ -20,6 +20,7 @@ #' #' @export #' @examples +#' if (requireNamespace("MASS", quietly = TRUE)) { #' #' # Load example data #' data("Boston", package = "MASS") #' @@ -43,8 +44,11 @@ #' prediction_zero = p, #' n_samples = 1e2) #' +#'if (requireNamespace("ggplot2", quietly = TRUE)) { #' # Plot the explantion (this function) #' plot(explanation) +#' } +#' } #' #' @author Martin Jullum plot.shapr <- function(x, diff --git a/R/preprocess_data.R b/R/preprocess_data.R new file mode 100644 index 000000000..6777eb30d --- /dev/null +++ b/R/preprocess_data.R @@ -0,0 +1,340 @@ +#' Fetches feature information from a given data set +#' +#' @param x matrix, data.frame or data.table The data to extract feature information from. +#' +#' @details This function is used to extract the feature information to be checked against the corresponding +#' information extracted from the model and other data sets. The function is called from +#' \code{\link[shapr:preprocess_data]{preprocess_data}} +#' and \code{\link[shapr:make_dummies]{make_dummies}} +#' +#' @return A list with the following elements: +#' \describe{ +#' \item{labels}{character vector with the feature names to compute Shapley values for} +#' \item{classes}{a named character vector with the labels as names and the class types as elements} +#' \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements +#' (NULL if the feature is not a factor)} +#' } +#' @author Martin Jullum +#' +#' @keywords internal +#' @export +#' +#' @examples +#'# Load example data +#' if (requireNamespace("MASS", quietly = TRUE)) { +#' data("Boston", package = "MASS") +#' # Split data into test- and training data +#' x_train <- data.table::as.data.table(head(Boston)) +#' x_train[,rad:=as.factor(rad)] +#' get_data_specs(x_train) +#' } +get_data_specs <- function(x){ + + x <- data.table::as.data.table(x) + + feature_list = list() + feature_list$labels <- names(x) + feature_list$classes <- unlist(lapply(x,class)) + feature_list$factor_levels = lapply(x,levels) + + # Defining all integer values as numeric + feature_list$classes[feature_list$classes=="integer"] <- "numeric" + + return(feature_list) +} + +#' Process (check and update) data according to specified feature list +#' +#' @param x matrix, data.frame or data.table. The data to check input for and update +#' according to the specification in \code{feature_list}. +#' @param feature_list List. Output from running \code{\link[shapr:get_data_specs]{get_data_specs}} or +#' \code{\link[shapr:get_model_specs]{get_model_specs}} +#' +#' @details This function takes care of all preprocessing and checking of the provided data in \code{x} against +#' the feature_list which is typically the output from \code{\link[shapr:get_model_specs]{get_model_specs}} +#' +#' @return List with two named elements: \code{x_dt}: Checked and updated data \code{x} in data.table format, and +#' \code{update_feature_list} the output from \code{\link[shapr:check_features]{check_features}} +#' +#' @author Martin Jullum +#' +#' @keywords internal +#' @export +#' +#' @examples +#' # Load example data +#' if (requireNamespace("MASS", quietly = TRUE)) { +#' data("Boston", package = "MASS") +#' # Split data into test- and training data +#' x_train <- data.table::as.data.table(head(Boston)) +#' x_train[,rad:=as.factor(rad)] +#' data_features <- get_data_specs(x_train) +#' model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) +#' +#' model_features <- get_model_specs(model) +#' preprocess_data(x_train,model_features) +#' } +preprocess_data = function(x,feature_list){ + if(all(is.null(colnames(x)))){ + stop(paste0("The data is missing column names")) + } + + x_dt <- data.table::as.data.table(x) + + feature_list_data <- get_data_specs(x_dt) + feature_list_data$specs_type <- "data" + + updater <- check_features(feature_list,feature_list_data, + use_1_as_truth = T) + update_data(x_dt,updater) # Updates x_dt by reference + + ret <- list(x_dt = x_dt, + updated_feature_list = updater) + + return(ret) +} + + +#' Checks that two extracted feature lists have exactly the same properites +#' +#' @param f_list_1,f_list_2 List. As extracted from either \code{get_data_specs} or \code{get_model_specs}. +#' @param use_1_as_truth Logical. If TRUE, \code{f_list_2} is compared to \code{f_list_1}, i.e. additional elements +#' is allowed in \code{f_list_2}, and if \code{f_list_1}'s feature classes contains NA's, feature class check is +#' ignored regardless of what is specified in \code{f_list_1}. If FALSE, \code{f_list_1} and \code{f_list_2} are +#' equated and they need to contain exactly the same elements. Set to TRUE when comparing a model and data, and FALSE +#' when comparing two data sets. +#' +#' @return List. The \code{f_list_1} is returned as inserted if there all check are carried out. If some info is +#' missing from \code{f_list_1}, the function continues consistency checking using \code{f_list_2} and returns that. +#' +#' @author Martin Jullum +#' +#' @keywords internal +#' @export +#' +#' @examples +#' # Load example data +#' if (requireNamespace("MASS", quietly = TRUE)) { +#' data("Boston", package = "MASS") +#' # Split data into test- and training data +#' x_train <- data.table::as.data.table(head(Boston)) +#' x_train[,rad:=as.factor(rad)] +#' data_features <- get_data_specs(x_train) +#' model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) +#' +#' model_features <- get_model_specs(model) +#' check_features(model_features,data_features) +#' } +check_features <- function(f_list_1,f_list_2, + use_1_as_truth=T){ + + + if(is.null(f_list_1$specs_type)){ + f_list_1$specs_type <- "model" + } + + if(is.null(f_list_2$specs_type)){ + f_list_2$specs_type <- "model" + } + + name_1 <- f_list_1$specs_type + name_2 <- f_list_2$specs_type + + if(name_1 == name_2){ # If used in explain after a model has NA-info during check in shapr + name_1 <- paste0(name_1,"_train") + name_2 <- paste0(name_2,"_test") + } + + #### Checking that labels exists if required, otherwise stop or switch #### + NULL_1 <- is.null(f_list_1$labels) + NULL_2 <- is.null(f_list_2$labels) + + if(NULL_2 | (NULL_1 & !use_1_as_truth)){ + stop(paste0("The ",name_1," or ",name_2," have missing column names. Handle that to proceed.")) + } + if(NULL_1 & use_1_as_truth){ + message(paste0("The specified ",name_1," provides NULL feature labels. ", + "The labels of ",name_2," are taken as the truth.")) + f_list_1 <- f_list_2 + } + + NA_1 <- any(is.na(f_list_1$labels)) + NA_2 <- any(is.na(f_list_2$labels)) + + if((NA_1 & NA_2) | ((NA_1 | NA_2) & !use_1_as_truth)){ + stop(paste0("The ",name_1," or ",name_2," have column names that are NA. Handle that to proceed.")) + } + if((NA_1 & use_1_as_truth)){ + message(paste0("The specified ",name_1," provides feature labels that are NA. ", + "The labels of ",name_2," are taken as the truth.")) + f_list_1 <- f_list_2 + } + + # feature names must be unique + if (any(duplicated(f_list_1$labels))) { + stop(paste0(name_1," must have unique column names.")) + } + + # feature names must be unique + if (any(duplicated(f_list_2$labels))) { + stop(paste0(name_2," must have unique column names.")) + } + + + feat_in_1_not_in_2 <- f_list_1$labels[!(f_list_1$labels %in% f_list_2$labels)] + feat_in_2_not_in_1 <- f_list_2$labels[!(f_list_2$labels %in% f_list_1$labels)] + + # Check that the features in 1 are in 2 + if (length(feat_in_1_not_in_2)>0) { + stop(paste0("Feature(s) ",paste0(feat_in_1_not_in_2,collapse=", ")," in ",name_1," is not in ",name_2,".")) + } + + # Also check that the features in 2 are in 1 + if(!use_1_as_truth){ + if (length(feat_in_2_not_in_1)>0) { + stop(paste0("Feature(s) ",paste0(feat_in_2_not_in_1,collapse=", ")," in ",name_2," is not in ",name_1,".")) + } + } + + # Check if any features have empty names i.e "" + if (any(f_list_1$labels == "")) { + stop("One or more features is missing a name.") + } + + # Order classes and factor levels in the same way as labels + # for f_list_1 + order_1 <- match(f_list_1$labels,names(f_list_1$classes)) + f_list_1$classes <- f_list_1$classes[order_1] + f_list_1$factor_levels <- f_list_1$factor_levels[order_1] + + # for f_list_2 + order_2 <- match(f_list_2$labels,names(f_list_2$classes)) + f_list_2$classes <- f_list_2$classes[order_2] + f_list_2$factor_levels <- f_list_2$factor_levels[order_2] + + # Reorder f_List_2 to match f_list_1, also removing anything in the former which is not in the latter #### + f_list_2_reordering = match(f_list_1$labels,f_list_2$labels) + + f_list_2$labels <- f_list_2$labels[f_list_2_reordering] + f_list_2$classes <- f_list_2$classes[f_list_2_reordering] + f_list_2$factor_levels <- f_list_2$factor_levels[f_list_2_reordering] + + # Sorts the factor levels for easier comparison below + f_list_1$sorted_factor_levels <- lapply(f_list_1$factor_levels,FUN=sort) + f_list_2$sorted_factor_levels <- lapply(f_list_2$factor_levels,FUN=sort) + + + #### Checking classes #### + if(any(is.na(f_list_1$classes)) & use_1_as_truth){ # Only relevant when f_list_1 is a model + message(paste0("The specified ",name_1," provides feature classes that are NA. ", + "The classes of ",name_2," are taken as the truth.")) + f_list_1 <- f_list_2 + + } + # Check if f_list_1 and f_list_2 have features with the same class + if (!identical(f_list_1$classes, f_list_2$classes)) { + stop(paste0("The features in ",name_1," and ",name_2," must have the same classes.")) + } + + # Check if the features all have class "integer", "numeric" or "factor + if (!all(f_list_1$classes %in% c("integer", "numeric", "factor"))) { + invalid_class <- which(!(f_list_1$classes %in% c("integer", "numeric", "factor"))) + stop(paste0("Feature(s) ",paste0(f_list_1$labels[invalid_class],collapse=", ")," in ",name_1," and ",name_2, + " is not of class integer, numeric or factor.")) + } + + #### Checking factor levels #### + factor_classes <- which(f_list_1$classes == "factor") + if(length(factor_classes)>0){ + relevant_factor_levels <- f_list_1$factor_levels[factor_classes] + is_NA <- any(is.na(relevant_factor_levels)) + is_NULL <- any(is.null(relevant_factor_levels)) + if((is_NA | is_NULL) & use_1_as_truth ){ + message(paste0("The specified ",name_1," provides factor feature levels that are NULL or NA. ", + "The factor levels of ",name_2," are taken as the truth.")) + f_list_1 <- f_list_2 # Always safe to switch as f_list_2 is based on data, and extracts correctly + } + } + + # Checking factor levels # + if (!identical(f_list_1$sorted_factor_levels, f_list_2$sorted_factor_levels)) { + stop(paste0("Some levels for factor features are not present in both ",name_1," and ",name_2,".")) + } + + f_list_1$sorted_factor_levels <- NULL # Not needed + + return(f_list_1) # + +} + +#' Updates data by reference according to the updater argument. +#' +#' @description \code{data} is updated, i.e. unused columns and factor levels are removed as described in +#' \code{updater}. This is done by reference, i.e. updates the object being passed to data even if nothing is +#' returned by the function itself. +#' +#' @param data data.table. Data that ought to be updated. +#' @param updater List. The object should be the output from +#' \code{\link[shapr:check_features]{check_features}}. +#' +#' +#' @return NULL. +#' +#' @author Martin Jullum +#' +#' @keywords internal +#' @export +#' +#' @examples +#' # Load example data +#' if (requireNamespace("MASS", quietly = TRUE)) { +#' data("Boston", package = "MASS") +#' # Split data into test- and training data +#' x_train <- data.table::as.data.table(head(Boston)) +#' x_train[,rad:=as.factor(rad)] +#' data_features <- get_data_specs(x_train) +#' model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) +#' +#' model_features <- get_model_specs(model) +#' updater <- check_features(model_features,data_features) +#' update_data(x_train,updater) +#' } +update_data = function(data,updater){ + # Operates on data by reference, so no copying of data here + + new_labels <- updater$labels + factor_levels <- updater$factor_levels + + # Reorder and delete unused columns + cnms_remove <- setdiff(colnames(data), new_labels) + if (length(cnms_remove) > 0) { + message( + paste0( + "The columns(s) ", + paste0(cnms_remove,collapse=", "), + " is not used by the model and thus removed from the data." + ) + ) + data[, (cnms_remove) := NULL] + } + data.table::setcolorder(data, new_labels) + + # Reorderes the factor levels + if(any(updater$classes=="factor")){ + org_factor_levels <- lapply(data,levels) + identical_levels <- mapply(FUN = "identical",org_factor_levels,factor_levels) + if(any(!identical_levels)){ + changed_levels <- which(!identical_levels) + message(paste0("Levels are reordered for the factor feature(s) ", + paste0(new_labels[changed_levels],collapse=", "),".")) + + for (i in changed_levels) { + data.table::set(data, + j=i, + value = factor(unlist(data[,new_labels[i],with=F],use.names = F), levels = factor_levels[[i]])) + } + } + } + + return(NULL) +} diff --git a/R/sampling.R b/R/sampling.R index ba82e6f94..e4875a545 100644 --- a/R/sampling.R +++ b/R/sampling.R @@ -167,7 +167,10 @@ sample_combinations <- function(ntrain, ntest, nsamples, joint_sampling = TRUE) #' #' @keywords internal #' +#' @author Annabelle Redelmeier +#' #' @examples +#' if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) { #' m <- 10 #' n <- 40 #' n_samples <- 50 @@ -189,8 +192,7 @@ sample_combinations <- function(ntrain, ntest, nsamples, joint_sampling = TRUE) #' tree <- list(tree = datact, given_ind = given_ind, dependent_ind = dependent_ind) #' shapr:::sample_ctree(tree = tree, n_samples = n_samples, x_test = x_test_dt, x_train = x_train, #' p = length(x_test), sample = TRUE) -#' -#' @author Annabelle Redelmeier +#' } sample_ctree <- function(tree, n_samples, x_test, @@ -281,8 +283,12 @@ sample_ctree <- function(tree, #' @return List with conditional inference tree and the variables conditioned/not conditioned on. #' #' @keywords internal +#' @author Annabelle Redelmeier, Martin Jullum +#' +#' @export #' #' @examples +#' if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) { #' m <- 10 #' n <- 40 #' n_samples <- 50 @@ -297,10 +303,7 @@ sample_ctree <- function(tree, #' create_ctree(given_ind = given_ind, x_train = x_train, #' mincriterion = mincriterion, minsplit = minsplit, #' minbucket = minbucket, use_partykit = "on_error") -#' -#' @author Annabelle Redelmeier, Martin Jullum -#' -#' @export +#' } create_ctree <- function(given_ind, x_train, mincriterion, diff --git a/R/shapley.R b/R/shapley.R index 2b8ccec97..e36c89e71 100644 --- a/R/shapley.R +++ b/R/shapley.R @@ -50,24 +50,23 @@ weight_matrix <- function(X, normalize_W_weights = TRUE) { #' Create an explainer object with Shapley weights for test data. #' -#' @param x Numeric matrix or data.frame. Contains the data used for training the model. +#' @param x Numeric matrix or data.frame/data.table. Contains the data used to estimate the (conditional) +#' distributions for the features needed to properly estimate the conditional expectations in the Shapley formula. #' -#' @param model The model whose predictions we want to explain. See \code{\link{predict_model}} -#' for more information about which models \code{shapr} supports natively. +#' @param model The model whose predictions we want to explain. Run +#' \code{\link[shapr:get_supported_models]{shapr:::get_supported_models()}} +#' for a table of which models \code{shapr} supports natively. #' #' @param n_combinations Integer. The number of feature combinations to sample. If \code{NULL}, #' the exact method is used and all combinations are considered. The maximum number of #' combinations equals \code{2^ncol(x)}. #' -#' @param feature_labels Character vector. The labels/names of the features used for training the model. -#' Only applicable if you are using a custom model. Otherwise the features in use are extracted from \code{model}. #' #' @return Named list that contains the following items: #' \describe{ #' \item{exact}{Boolean. Equals \code{TRUE} if \code{n_combinations = NULL} or #' \code{n_combinations < 2^ncol(x)}, otherwise \code{FALSE}.} #' \item{n_features}{Positive integer. The number of columns in \code{x}} -#' \item{model_type}{Character. Returned value after calling \code{model_type(model)}} #' \item{S}{Binary matrix. The number of rows equals the number of unique combinations, and #' the number of columns equals the total number of features. I.e. let's say we have a case with #' three features. In that case we have \code{2^3 = 8} unique combinations. If the j-th @@ -76,16 +75,17 @@ weight_matrix <- function(X, normalize_W_weights = TRUE) { #' \item{W}{Second item} #' \item{X}{data.table. Returned object from \code{\link{feature_combinations}}} #' \item{x_train}{data.table. Transformed \code{x} into a data.table.} +#' \item{feature_list}{List. The \code{updated_feature_list} output from \code{\link[shapr:preprocess_data]{preprocess_data}}} #' } #' -#' In addition to the items above \code{model}, \code{feature_labels} (updated with the names actually used by the -#' model) and \code{n_combinations} is also present in the returned object. +#' In addition to the items above, \code{model} and \code{n_combinations} are also present in the returned object. #' #' @export #' #' @author Nikolai Sellereite #' #' @examples +#' if (requireNamespace("MASS", quietly = TRUE)) { #' # Load example data #' data("Boston", package = "MASS") #' df <- Boston @@ -118,10 +118,10 @@ weight_matrix <- function(X, normalize_W_weights = TRUE) { #' #' print(nrow(explainer$X)) #' # 16 (which equals 2^4) +#' } shapr <- function(x, model, - n_combinations = NULL, - feature_labels = NULL) { + n_combinations = NULL) { # Checks input argument if (!is.matrix(x) & !is.data.frame(x)) { @@ -131,22 +131,31 @@ shapr <- function(x, # Setup explainer <- as.list(environment()) explainer$exact <- ifelse(is.null(n_combinations), TRUE, FALSE) - explainer$model_type <- model_type(model) - # Checks input argument - feature_labels <- features(model, colnames(x), feature_labels) - explainer$n_features <- length(feature_labels) - # Converts to data.table, otherwise copy to x_train -------------- - x_train <- data.table::as.data.table(x) + # Check features of training data against model specification + feature_list_model <- get_model_specs(model) + + processed_list <- preprocess_data(x = x, + feature_list = feature_list_model) - # Removes variables that are not included in model -------------- - cnms_remove <- setdiff(colnames(x), feature_labels) - if (length(cnms_remove) > 0) x_train[, (cnms_remove) := NULL] - data.table::setcolorder(x_train, feature_labels) + x_train <- processed_list$x_dt + updated_feature_list <- processed_list$updated_feature_list - # Checks model and features - explainer$p <- predict_model(model, head(x_train)) + explainer$n_features <- ncol(x_train) + + # Checking that the prediction function works + tmp <- predict_model(model, head(x_train,2)) + if(!(all(is.numeric(tmp)) & length(tmp)==2)){ + stop( + paste0( + "The predict_model function of class ",class(model)," is invalid.\n", + "See the 'Advanced usage' section of the vignette:\n", + "vignette('understanding_shapr', package = 'shapr')\n", + "for more information on running shapr with custom models.\n" + ) + ) + } # Get all combinations ---------------- dt_combinations <- feature_combinations( @@ -177,9 +186,8 @@ shapr <- function(x, explainer$W <- weighted_mat explainer$X <- dt_combinations explainer$x_train <- x_train - explainer$feature_labels <- feature_labels explainer$x <- NULL - explainer$p <- NULL + explainer$feature_list <- updated_feature_list attr(explainer, "class") <- c("explainer", "list") diff --git a/R/shapr-package.R b/R/shapr-package.R index 5b6440987..5337b9421 100644 --- a/R/shapr-package.R +++ b/R/shapr-package.R @@ -3,7 +3,7 @@ #' #' @importFrom graphics plot hist rect #' -#' @importFrom utils head tail +#' @importFrom utils head tail methods #' #' @importFrom stats predict #' @@ -13,9 +13,14 @@ #' #' @importFrom stats model.frame #' +#' @importFrom stats setNames +#' +#' @importFrom stats contrasts +#' #' @importFrom Rcpp sourceCpp #' #' @keywords internal #' #' @useDynLib shapr, .registration = TRUE NULL + diff --git a/cran-comments.md b/cran-comments.md index 128691182..cb34fc30b 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,37 +1,53 @@ -# Patch for shapr 0.1.2 +# Patch for shapr 0.1.3 + +* Got an email fran CRAN that shapr gave ERROR due to ERROR in package under Suggests (xgboost). Even though the xgboost issue was fixed in time, shapr was taken off CRAN due to packages under Suggests not being used conditionally in tests and examples. This patch fixes this issue using +``` +if(requireNamespace("pkgname", quietly = TRUE)) +``` +and checked to run fine without Suggested packages through +``` +devtools::check(vignettes = FALSE, env_vars=c(`_R_CHECK_DEPENDS_ONLY_` = "true")) +``` -* Fixes the installation error on Solaris for version 0.1.2 as described in -https://cran.r-project.org/web/checks/check_results_shapr.html. Checks using -rhub::check_on_solaris() indicates the issue is fixed in this release. ## Test environments -* GitHub Actions (macOS-latest): R 4.0.2 -* GitHub Actions (windows-latest): R 4.0.2 -* GitHub Actions (ubuntu-16.04): R 4.0.2, 3.6.3, 3.5.3 -* win-builder (x86_64-w64-mingw32): R 4.0.2, 3.6.3, R-devel (2020-08-23 r79071) -* local Ubuntu 18.04: R 3.6.3 -* local Windows 10: R 4.0.2, R-devel (2020-08-04 r78971) +* GitHub Actions (windows-latest): R 4.0 +* GitHub Actions (ubuntu-16.04): R 4.0, 3.6, 3.5 +* win-builder (x86_64-w64-mingw32): R 4.0, 3.6, R-devel +* local Ubuntu 18.04: R 3.6 +* local Windows 10: R 4.0 +* R-hub (windows-x86_64-devel): R-devel +* R-hub (ubuntu-gcc-release): R-release +* R-hub (fedora-gcc-devel): R-devel +* R-hub (macos-highsierra-release-cran): R-release ## R CMD check results There were no ERRORs or WARNINGs. -There was 2 NOTES: +There was 1 NOTE: * checking CRAN incoming feasibility ... NOTE -Maintainer: 'Martin Jullum ' +Maintainer: ‘Martin Jullum ’ + +New submission + +Package was archived on CRAN -Days since last update: 0 +Possibly mis-spelled words in DESCRIPTION: + Aas (9:16) + Jullum (9:21) + Løland (9:32) + Shapley (3:53, 6:15, 7:84, 10:72) -> This is a patch for the current installation error on Solaris +CRAN repository db overrides: + X-CRAN-Comment: Archived on 2021-01-20 as check problems were not + corrected in time. -* checking for future file timestamps ... NOTE - unable to verify current time +> This is a patch for the version that was taken off CRAN. -> This is a known issue with the worldclockapi.com currently being down - (https://stackoverflow.com/questions/63613301/r-cmd-check-note-unable-to-verify-current-time) ## Downstream dependencies There are currently no downstream dependencies for this package. diff --git a/inst/scripts/example_ctree_method.R b/inst/scripts/example_ctree_method.R index 2d4b728ee..712abe1c5 100644 --- a/inst/scripts/example_ctree_method.R +++ b/inst/scripts/example_ctree_method.R @@ -66,10 +66,10 @@ print(levels(x_train_cat$rm)) print(levels(x_test_cat$rm)) # -- special function when using categorical data + xgboost -make_dummies_list <- make_dummies(traindata = x_train_cat, testdata = x_test_cat) +dummylist <- make_dummies(traindata = x_train_cat, testdata = x_test_cat) -x_train_dummy <- make_dummies_list$train_dummies -x_test_dummy <- make_dummies_list$test_dummies +x_train_dummy <- dummylist$train_dummies +x_test_dummy <- dummylist$test_dummies # Fitting a basic xgboost model to the training data model_cat <- xgboost::xgboost( @@ -78,17 +78,17 @@ model_cat <- xgboost::xgboost( nround = 20, verbose = FALSE ) -model_cat$dummylist <- make_dummies_list$obj +model_cat$feature_list <- dummylist$feature_list -explainer_cat <- shapr(make_dummies_list$traindata_new, model_cat) +explainer_cat <- shapr(dummylist$traindata_new, model_cat) # Specifying the phi_0, i.e. the expected prediction without any features p0 <- mean(y_train) -# make_dummies_list$testdata_new$rm +# dummylist$testdata_new$rm explanation_cat <- explain( - make_dummies_list$testdata_new, + dummylist$testdata_new, approach = "ctree", explainer = explainer_cat, prediction_zero = p0 diff --git a/inst/scripts/example_custom_model.R b/inst/scripts/example_custom_model.R index 1974a2068..da6633fe8 100644 --- a/inst/scripts/example_custom_model.R +++ b/inst/scripts/example_custom_model.R @@ -8,37 +8,38 @@ data("Boston", package = "MASS") x_var <- c("lstat", "rm", "dis", "indus") y_var <- "medv" -xy_train <- tail(Boston, -6) -x_test <- head(Boston,6) +x_train <- as.matrix(Boston[-1:-6, x_var]) +y_train <- Boston[-1:-6, y_var] +x_test <- as.matrix(Boston[1:6, x_var]) form = as.formula(paste0(y_var,"~",paste0(x_var,collapse="+"))) +library(gbm) + +xy_train <- data.frame(x_train,medv = y_train) + + # Fitting a gbm model set.seed(825) model <- gbm::gbm( form, data = xy_train, - distribution="gaussian" + distribution = "gaussian" ) +#### Full feature versions of the three required model functions #### -# Create custom function of model_type for gbm -model_type.gbm <- function(x) { - ifelse( - x$distribution$name %in% c("bernoulli","adaboost"), - "classification", - "regression" - ) -} - -# Create custom function of predict_model for gbm predict_model.gbm <- function(x, newdata) { if (!requireNamespace('gbm', quietly = TRUE)) { stop('The gbm package is required for predicting train models') } - model_type <- model_type(x) + model_type <- ifelse( + x$distribution$name %in% c("bernoulli","adaboost"), + "classification", + "regression" + ) if (model_type == "classification") { predict(x, as.data.frame(newdata), type = "response",n.trees = x$n.trees) @@ -48,19 +49,46 @@ predict_model.gbm <- function(x, newdata) { } } +get_model_specs.gbm <- function(x){ + feature_list = list() + feature_list$labels <- labels(x$Terms) + m <- length(feature_list$labels) + + feature_list$classes <- attr(x$Terms,"dataClasses")[-1] + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[feature_list$classes=="factor"] <- NA # the model object doesn't contain factor levels info + + return(feature_list) +} + # Prepare the data for explanation set.seed(123) -explainer <- shapr(xy_train, model,feature_labels = x_var) - -# Spedifying the phi_0, i.e. the expected prediction without any features +explainer <- shapr(xy_train, model) p0 <- mean(xy_train[,y_var]) - -# Computing the actual Shapley values with kernelSHAP accounting for feature dependence using -# the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) +# Plot results +plot(explanation) + + +# Minimal version of the three required model functions +# Note: Working only for this exact version of the model class +# Avoiding to define get_model_specs skips all feature +# consistency checking between your data and model + +# Removing the previously defined functions to simulate a fresh start +rm(predict_model.gbm) +rm(get_model_specs.gbm) -# Printing the Shapley values for the test data -explanation$dt +predict_model.gbm <- function(x, newdata) { + predict(x, as.data.frame(newdata),n.trees = x$n.trees) +} + + +# Prepare the data for explanation +set.seed(123) +explainer <- shapr(x_train, model) +p0 <- mean(xy_train[,y_var]) +explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) # Plot results plot(explanation) diff --git a/man/apply_dummies.Rd b/man/apply_dummies.Rd index 5d388113a..315a9282c 100644 --- a/man/apply_dummies.Rd +++ b/man/apply_dummies.Rd @@ -2,26 +2,27 @@ % Please edit documentation in R/features.R \name{apply_dummies} \alias{apply_dummies} -\title{Make dummy variables - this is an internal function intended only to be used in +\title{Apply dummy variables - this is an internal function intended only to be used in predict_model.xgb.Booster()} \usage{ -apply_dummies(obj, testdata) +apply_dummies(feature_list, testdata) } \arguments{ -\item{obj}{List. Output of \code{make_dummies$obj}.} +\item{feature_list}{List. The \code{feature_list} object in the output object after running +\code{\link[shapr:make_dummies]{make_dummies}}} \item{testdata}{data.table or data.frame. New data that has the same -feature names, types, and levels as \code{obj$data}.} +feature names, types, and levels as \code{feature_list}.} } \value{ -A data.frame containing all of the factors in \code{testdata} as -one-hot encoded variables. +A data.table with all features but where the factors in \code{testdata} are +one-hot encoded variables as specified in feature_list } \description{ -Make dummy variables - this is an internal function intended only to be used in +Apply dummy variables - this is an internal function intended only to be used in predict_model.xgb.Booster() } \author{ -Annabelle Redelmeier +Annabelle Redelmeier, Martin Jullum } \keyword{internal} diff --git a/man/check_features.Rd b/man/check_features.Rd new file mode 100644 index 000000000..2634448d3 --- /dev/null +++ b/man/check_features.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess_data.R +\name{check_features} +\alias{check_features} +\title{Checks that two extracted feature lists have exactly the same properites} +\usage{ +check_features(f_list_1, f_list_2, use_1_as_truth = T) +} +\arguments{ +\item{f_list_1, f_list_2}{List. As extracted from either \code{get_data_specs} or \code{get_model_specs}.} + +\item{use_1_as_truth}{Logical. If TRUE, \code{f_list_2} is compared to \code{f_list_1}, i.e. additional elements +is allowed in \code{f_list_2}, and if \code{f_list_1}'s feature classes contains NA's, feature class check is +ignored regardless of what is specified in \code{f_list_1}. If FALSE, \code{f_list_1} and \code{f_list_2} are +equated and they need to contain exactly the same elements. Set to TRUE when comparing a model and data, and FALSE +when comparing two data sets.} +} +\value{ +List. The \code{f_list_1} is returned as inserted if there all check are carried out. If some info is +missing from \code{f_list_1}, the function continues consistency checking using \code{f_list_2} and returns that. +} +\description{ +Checks that two extracted feature lists have exactly the same properites +} +\examples{ +# Load example data +if (requireNamespace("MASS", quietly = TRUE)) { +data("Boston", package = "MASS") +# Split data into test- and training data +x_train <- data.table::as.data.table(head(Boston)) +x_train[,rad:=as.factor(rad)] +data_features <- get_data_specs(x_train) +model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) + +model_features <- get_model_specs(model) +check_features(model_features,data_features) +} +} +\author{ +Martin Jullum +} +\keyword{internal} diff --git a/man/create_ctree.Rd b/man/create_ctree.Rd index fcb84efe1..42b3ae582 100644 --- a/man/create_ctree.Rd +++ b/man/create_ctree.Rd @@ -41,6 +41,7 @@ List with conditional inference tree and the variables conditioned/not condition Make all conditional inference trees } \examples{ +if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) { m <- 10 n <- 40 n_samples <- 50 @@ -55,7 +56,7 @@ sample <- TRUE create_ctree(given_ind = given_ind, x_train = x_train, mincriterion = mincriterion, minsplit = minsplit, minbucket = minbucket, use_partykit = "on_error") - +} } \author{ Annabelle Redelmeier, Martin Jullum diff --git a/man/explain.Rd b/man/explain.Rd index ab7d81498..a4d0fe35b 100644 --- a/man/explain.Rd +++ b/man/explain.Rd @@ -158,11 +158,11 @@ such as the mean of the predictions in the training data are also reasonable. Explain the output of machine learning models with more accurately estimated Shapley values } \details{ -The most important thing to notice is that \code{shapr} has implemented three different +The most important thing to notice is that \code{shapr} has implemented four different approaches for estimating the conditional distributions of the data, namely \code{"empirical"}, -\code{"gaussian"} and \code{"copula"}. +\code{"gaussian"}, \code{"copula"} and \code{"ctree"}. -In addition to this the user will also have the option of combining the three approaches. +In addition, the user also has the option of combining the four approaches. E.g. if you're in a situation where you have trained a model the consists of 10 features, and you'd like to use the \code{"gaussian"} approach when you condition on a single feature, the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version @@ -172,6 +172,7 @@ if you condition on more than 5 features this can be done by simply passing when conditioning on \code{i} features. } \examples{ +if (requireNamespace("MASS", quietly = TRUE)) { # Load example data data("Boston", package = "MASS") @@ -208,8 +209,11 @@ explain5 <- explain(x_test, explainer, approach = approach, prediction_zero = p, print(explain1$dt) # Plot the results +if (requireNamespace("ggplot2", quietly = TRUE)) { plot(explain1) } +} +} \author{ -Camilla Lingjaerde, Nikolai Sellereite +Camilla Lingjaerde, Nikolai Sellereite, Martin Jullum, Annabelle Redelmeier } diff --git a/man/features.Rd b/man/features.Rd deleted file mode 100644 index ceb8a0ced..000000000 --- a/man/features.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/models.R -\name{features} -\alias{features} -\alias{features.default} -\alias{features.lm} -\alias{features.glm} -\alias{features.ranger} -\alias{features.gam} -\alias{features.xgb.Booster} -\title{Fetches feature labels from a given model object} -\usage{ -features(x, cnms, feature_labels = NULL) - -\method{features}{default}(x, cnms, feature_labels = NULL) - -\method{features}{lm}(x, cnms, feature_labels = NULL) - -\method{features}{glm}(x, cnms, feature_labels = NULL) - -\method{features}{ranger}(x, cnms, feature_labels = NULL) - -\method{features}{gam}(x, cnms, feature_labels = NULL) - -\method{features}{xgb.Booster}(x, cnms, feature_labels = NULL) -} -\arguments{ -\item{x}{Model object for the model to be explained.} - -\item{cnms}{Character vector. Represents the names of the columns in the data used for training/explaining.} - -\item{feature_labels}{Character vector. Represents the labels of the features used for prediction.} -} -\description{ -Fetches feature labels from a given model object -} -\examples{ -# Load example data -data("Boston", package = "MASS") -# Split data into test- and training data -x_train <- head(Boston, -3) -# Fit a linear model -model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) - -cnms <- c("lstat", "rm", "dis", "indus") - -# Checking that features used by the model corresponds to cnms -features(x = model, cnms = cnms, feature_labels = NULL) -} -\keyword{internal} diff --git a/man/get_data_specs.Rd b/man/get_data_specs.Rd new file mode 100644 index 000000000..9a7cb1e68 --- /dev/null +++ b/man/get_data_specs.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess_data.R +\name{get_data_specs} +\alias{get_data_specs} +\title{Fetches feature information from a given data set} +\usage{ +get_data_specs(x) +} +\arguments{ +\item{x}{matrix, data.frame or data.table The data to extract feature information from.} +} +\value{ +A list with the following elements: +\describe{ + \item{labels}{character vector with the feature names to compute Shapley values for} + \item{classes}{a named character vector with the labels as names and the class types as elements} + \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements + (NULL if the feature is not a factor)} +} +} +\description{ +Fetches feature information from a given data set +} +\details{ +This function is used to extract the feature information to be checked against the corresponding +information extracted from the model and other data sets. The function is called from +\code{\link[shapr:preprocess_data]{preprocess_data}} +and \code{\link[shapr:make_dummies]{make_dummies}} +} +\examples{ +# Load example data +if (requireNamespace("MASS", quietly = TRUE)) { +data("Boston", package = "MASS") +# Split data into test- and training data +x_train <- data.table::as.data.table(head(Boston)) +x_train[,rad:=as.factor(rad)] +get_data_specs(x_train) +} +} +\author{ +Martin Jullum +} +\keyword{internal} diff --git a/man/get_model_specs.Rd b/man/get_model_specs.Rd new file mode 100644 index 000000000..f60f0f84d --- /dev/null +++ b/man/get_model_specs.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/models.R +\name{get_model_specs} +\alias{get_model_specs} +\alias{get_model_specs.default} +\alias{get_model_specs.lm} +\alias{get_model_specs.glm} +\alias{get_model_specs.gam} +\alias{get_model_specs.ranger} +\alias{get_model_specs.xgb.Booster} +\title{Fetches feature information from a given model object} +\usage{ +get_model_specs(x) + +\method{get_model_specs}{default}(x) + +\method{get_model_specs}{lm}(x) + +\method{get_model_specs}{glm}(x) + +\method{get_model_specs}{gam}(x) + +\method{get_model_specs}{ranger}(x) + +\method{get_model_specs}{xgb.Booster}(x) +} +\arguments{ +\item{x}{Model object for the model to be explained.} +} +\value{ +A list with the following elements: +\describe{ + \item{labels}{character vector with the feature names to compute Shapley values for} + \item{classes}{a named character vector with the labels as names and the class type as elements} + \item{factor_levels}{a named list with the labels as names and character vectors with the factor levels as elements + (NULL if the feature is not a factor)} +} +} +\description{ +Fetches feature information from a given model object +} +\details{ +This function is used to extract the feature information to be checked against data passed to \code{shapr} +and \code{explain}. The function is called from \code{preprocess_data}. +} +\examples{ +if (requireNamespace("MASS", quietly = TRUE)) { +# Load example data +data("Boston", package = "MASS") +# Split data into test- and training data +x_train <- data.table::as.data.table(head(Boston)) +x_train[,rad:=as.factor(rad)] +model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) + +get_model_specs(model) +} +} +\author{ +Martin Jullum +} +\keyword{internal} diff --git a/man/get_supported_models.Rd b/man/get_supported_models.Rd new file mode 100644 index 000000000..7bcd24ab9 --- /dev/null +++ b/man/get_supported_models.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/models.R +\name{get_supported_models} +\alias{get_supported_models} +\title{Provides a data.table with the supported models} +\usage{ +get_supported_models() +} +\description{ +Provides a data.table with the supported models +} +\keyword{internal} diff --git a/man/make_dummies.Rd b/man/make_dummies.Rd index aaf8c0460..cc0b339f8 100644 --- a/man/make_dummies.Rd +++ b/man/make_dummies.Rd @@ -15,27 +15,22 @@ feature names, types, and levels as \code{traindata}.} \value{ A list that contains the following entries: \describe{ -\item{obj}{List, Contains \describe{ -\item{features}{Vector. Contains the names of all the features in \code{data}.} -\item{factor_features}{Vector. Contains the names of all the factors in \code{data}.} -\item{factor_list}{List. Contains each factor and its vector of levels.} -\item{contrasts_list}{List. Contains all the contrasts of the factors.} -}} +\item{feature_list}{List. Output from \code{check_features}} \item{train_dummies}{A data.frame containing all of the factors in \code{traindata} as one-hot encoded variables.} \item{test_dummies}{A data.frame containing all of the factors in \code{testdata} as one-hot encoded variables.} \item{traindata_new}{Original traindata with correct column ordering and factor levels. To be passed to -\code{shapr()}.} +\code{\link[shapr:shapr]{shapr}.}} \item{testdata_new}{Original testdata with correct column ordering and factor levels. To be passed to -\code{explain()}.} +\code{\link[shapr:explain]{explain}.}} } } \description{ Initiate the making of dummy variables } \examples{ - +if (requireNamespace("MASS", quietly = TRUE)) { data("Boston", package = "MASS") x_var <- c("lstat", "rm", "dis", "indus") y_var <- "medv" @@ -48,8 +43,8 @@ x_train$rm <- factor(round(x_train$rm)) x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm)) dummylist <- make_dummies(traindata = x_train, testdata = x_test) - +} } \author{ -Annabelle Redelmeier +Annabelle Redelmeier, Martin Jullum } diff --git a/man/model_checker.Rd b/man/model_checker.Rd new file mode 100644 index 000000000..57b638022 --- /dev/null +++ b/man/model_checker.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/models.R +\name{model_checker} +\alias{model_checker} +\alias{model_checker.default} +\alias{model_checker.lm} +\alias{model_checker.glm} +\alias{model_checker.ranger} +\alias{model_checker.gam} +\alias{model_checker.xgb.Booster} +\title{Check that the type of model is supported by the explanation method} +\usage{ +model_checker(x) + +\method{model_checker}{default}(x) + +\method{model_checker}{lm}(x) + +\method{model_checker}{glm}(x) + +\method{model_checker}{ranger}(x) + +\method{model_checker}{gam}(x) + +\method{model_checker}{xgb.Booster}(x) +} +\arguments{ +\item{x}{Model object for the model to be explained.} +} +\value{ +Error or NULL +} +\description{ +The function checks whether the model given by \code{x} is supported. +If \code{x} is not a supported model the function will return an error message, otherwise it return NULL +(meaning all types of models with this class is supported) +} +\details{ +See \code{\link{predict_model}} for more information about +what type of models \code{shapr} currently support. +} +\examples{ +if (requireNamespace("MASS", quietly = TRUE)) { +# Load example data +data("Boston", package = "MASS") +# Split data into test- and training data +x_train <- head(Boston, -3) +# Fit a linear model +model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) + +# Checking the model object +model_checker(x = model) +} +} +\keyword{internal} diff --git a/man/model_type.Rd b/man/model_type.Rd deleted file mode 100644 index d91492fd5..000000000 --- a/man/model_type.Rd +++ /dev/null @@ -1,54 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/models.R -\name{model_type} -\alias{model_type} -\alias{model_type.default} -\alias{model_type.lm} -\alias{model_type.glm} -\alias{model_type.ranger} -\alias{model_type.gam} -\alias{model_type.xgb.Booster} -\title{Define type of model} -\usage{ -model_type(x) - -\method{model_type}{default}(x) - -\method{model_type}{lm}(x) - -\method{model_type}{glm}(x) - -\method{model_type}{ranger}(x) - -\method{model_type}{gam}(x) - -\method{model_type}{xgb.Booster}(x) -} -\arguments{ -\item{x}{Model object for the model to be explained.} -} -\value{ -Either \code{"classification"} or \code{"regression"}. -} -\description{ -The function checks whether the model given by \code{x} is -supported, and if it is a regression- or a classification model. If \code{x} is -not a supported model the function will return an error message, otherwise it will -return either \code{"regression"} or \code{"classification"}. -} -\details{ -See \code{\link{predict_model}} for more information about -what type of models \code{shapr} currently support. -} -\examples{ -# Load example data -data("Boston", package = "MASS") -# Split data into test- and training data -x_train <- head(Boston, -3) -# Fit a linear model -model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) - -# Writing out the defined model type of the object -model_type(x = model) -} -\keyword{internal} diff --git a/man/plot.shapr.Rd b/man/plot.shapr.Rd index f9f666e14..ad591d649 100644 --- a/man/plot.shapr.Rd +++ b/man/plot.shapr.Rd @@ -41,6 +41,7 @@ See \code{vignette("understanding_shapr", package = "shapr")} for an example of how you should use the function. } \examples{ +if (requireNamespace("MASS", quietly = TRUE)) { #' # Load example data data("Boston", package = "MASS") @@ -64,8 +65,11 @@ explanation <- explain(x_test, prediction_zero = p, n_samples = 1e2) +if (requireNamespace("ggplot2", quietly = TRUE)) { # Plot the explantion (this function) plot(explanation) +} +} } \author{ diff --git a/man/predict_model.Rd b/man/predict_model.Rd index 9a80251a8..c74eb0825 100644 --- a/man/predict_model.Rd +++ b/man/predict_model.Rd @@ -57,10 +57,13 @@ The returned object \code{p} always satisfies the following properties: If you have a binary classification model we'll always return the probability prediction for a single class. -For more details on how to use a custom model see the package vignette: \cr -\code{vignette("understanding_shapr", package = "shapr")} +For more details on how to explain other types of models (i.e. custom models), see the Advanced usage section +of the vignette: \cr +From R: \code{vignette("understanding_shapr", package = "shapr")} \cr +Web: \url{https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#explain-custom-models} } \examples{ +if (requireNamespace("MASS", quietly = TRUE)) { # Load example data data("Boston", package = "MASS") # Split data into test- and training data @@ -72,6 +75,7 @@ model <- lm(medv ~ lstat + rm + dis + indus, data = x_train) # Predicting for a model with a standardized format predict_model(x = model, newdata = x_test) } +} \author{ Martin Jullum } diff --git a/man/preprocess_data.Rd b/man/preprocess_data.Rd new file mode 100644 index 000000000..b3e73b02a --- /dev/null +++ b/man/preprocess_data.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess_data.R +\name{preprocess_data} +\alias{preprocess_data} +\title{Process (check and update) data according to specified feature list} +\usage{ +preprocess_data(x, feature_list) +} +\arguments{ +\item{x}{matrix, data.frame or data.table. The data to check input for and update +according to the specification in \code{feature_list}.} + +\item{feature_list}{List. Output from running \code{\link[shapr:get_data_specs]{get_data_specs}} or +\code{\link[shapr:get_model_specs]{get_model_specs}}} +} +\value{ +List with two named elements: \code{x_dt}: Checked and updated data \code{x} in data.table format, and +\code{update_feature_list} the output from \code{\link[shapr:check_features]{check_features}} +} +\description{ +Process (check and update) data according to specified feature list +} +\details{ +This function takes care of all preprocessing and checking of the provided data in \code{x} against +the feature_list which is typically the output from \code{\link[shapr:get_model_specs]{get_model_specs}} +} +\examples{ +# Load example data +if (requireNamespace("MASS", quietly = TRUE)) { +data("Boston", package = "MASS") +# Split data into test- and training data +x_train <- data.table::as.data.table(head(Boston)) +x_train[,rad:=as.factor(rad)] +data_features <- get_data_specs(x_train) +model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) + +model_features <- get_model_specs(model) +preprocess_data(x_train,model_features) +} +} +\author{ +Martin Jullum +} +\keyword{internal} diff --git a/man/sample_ctree.Rd b/man/sample_ctree.Rd index c45d48560..54b8f035e 100644 --- a/man/sample_ctree.Rd +++ b/man/sample_ctree.Rd @@ -30,6 +30,7 @@ data.table with \code{n_samples} (conditional) Gaussian samples Sample ctree variables from a given conditional inference tree } \examples{ +if (requireNamespace("MASS", quietly = TRUE) & requireNamespace("party", quietly = TRUE)) { m <- 10 n <- 40 n_samples <- 50 @@ -51,7 +52,7 @@ mincriterion = 0.95)) tree <- list(tree = datact, given_ind = given_ind, dependent_ind = dependent_ind) shapr:::sample_ctree(tree = tree, n_samples = n_samples, x_test = x_test_dt, x_train = x_train, p = length(x_test), sample = TRUE) - +} } \author{ Annabelle Redelmeier diff --git a/man/shapr.Rd b/man/shapr.Rd index 3c4226854..0c90a58fa 100644 --- a/man/shapr.Rd +++ b/man/shapr.Rd @@ -4,20 +4,19 @@ \alias{shapr} \title{Create an explainer object with Shapley weights for test data.} \usage{ -shapr(x, model, n_combinations = NULL, feature_labels = NULL) +shapr(x, model, n_combinations = NULL) } \arguments{ -\item{x}{Numeric matrix or data.frame. Contains the data used for training the model.} +\item{x}{Numeric matrix or data.frame/data.table. Contains the data used to estimate the (conditional) +distributions for the features needed to properly estimate the conditional expectations in the Shapley formula.} -\item{model}{The model whose predictions we want to explain. See \code{\link{predict_model}} -for more information about which models \code{shapr} supports natively.} +\item{model}{The model whose predictions we want to explain. Run +\code{\link[shapr:get_supported_models]{shapr:::get_supported_models()}} +for a table of which models \code{shapr} supports natively.} \item{n_combinations}{Integer. The number of feature combinations to sample. If \code{NULL}, the exact method is used and all combinations are considered. The maximum number of combinations equals \code{2^ncol(x)}.} - -\item{feature_labels}{Character vector. The labels/names of the features used for training the model. -Only applicable if you are using a custom model. Otherwise the features in use are extracted from \code{model}.} } \value{ Named list that contains the following items: @@ -25,7 +24,6 @@ Named list that contains the following items: \item{exact}{Boolean. Equals \code{TRUE} if \code{n_combinations = NULL} or \code{n_combinations < 2^ncol(x)}, otherwise \code{FALSE}.} \item{n_features}{Positive integer. The number of columns in \code{x}} - \item{model_type}{Character. Returned value after calling \code{model_type(model)}} \item{S}{Binary matrix. The number of rows equals the number of unique combinations, and the number of columns equals the total number of features. I.e. let's say we have a case with three features. In that case we have \code{2^3 = 8} unique combinations. If the j-th @@ -34,15 +32,16 @@ Named list that contains the following items: \item{W}{Second item} \item{X}{data.table. Returned object from \code{\link{feature_combinations}}} \item{x_train}{data.table. Transformed \code{x} into a data.table.} + \item{feature_list}{List. The \code{updated_feature_list} output from \code{\link[shapr:preprocess_data]{preprocess_data}}} } -In addition to the items above \code{model}, \code{feature_labels} (updated with the names actually used by the -model) and \code{n_combinations} is also present in the returned object. +In addition to the items above, \code{model} and \code{n_combinations} are also present in the returned object. } \description{ Create an explainer object with Shapley weights for test data. } \examples{ +if (requireNamespace("MASS", quietly = TRUE)) { # Load example data data("Boston", package = "MASS") df <- Boston @@ -76,6 +75,7 @@ explainer <- shapr(df1, model, n_combinations = 1e3) print(nrow(explainer$X)) # 16 (which equals 2^4) } +} \author{ Nikolai Sellereite } diff --git a/man/update_data.Rd b/man/update_data.Rd new file mode 100644 index 000000000..ccf5585b6 --- /dev/null +++ b/man/update_data.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess_data.R +\name{update_data} +\alias{update_data} +\title{Updates data by reference according to the updater argument.} +\usage{ +update_data(data, updater) +} +\arguments{ +\item{data}{data.table. Data that ought to be updated.} + +\item{updater}{List. The object should be the output from +\code{\link[shapr:check_features]{check_features}}.} +} +\value{ +NULL. +} +\description{ +\code{data} is updated, i.e. unused columns and factor levels are removed as described in +\code{updater}. This is done by reference, i.e. updates the object being passed to data even if nothing is +returned by the function itself. +} +\examples{ +# Load example data +if (requireNamespace("MASS", quietly = TRUE)) { +data("Boston", package = "MASS") +# Split data into test- and training data +x_train <- data.table::as.data.table(head(Boston)) +x_train[,rad:=as.factor(rad)] +data_features <- get_data_specs(x_train) +model <- lm(medv ~ lstat + rm + rad + indus, data = x_train) + +model_features <- get_model_specs(model) +updater <- check_features(model_features,data_features) +update_data(x_train,updater) +} +} +\author{ +Martin Jullum +} +\keyword{internal} diff --git a/tests/testthat/manual_test_scripts/test_custom_models.R b/tests/testthat/manual_test_scripts/test_custom_models.R new file mode 100644 index 000000000..7cfd162fd --- /dev/null +++ b/tests/testthat/manual_test_scripts/test_custom_models.R @@ -0,0 +1,119 @@ +# Test custom models + +# Doing all testing from shapr +# Because new functions have to be created (to use gbm with shapr), we cannot use a classic testthat set up because +# shapr will not see the functions created inside of the test environment. Therefore we have to test these functions +# a bit differently (and more manual) than other tests. + +library(testthat) +library(shapr) +library(gbm) +library(MASS) + +# Data ----------- +data("Boston", package = "MASS") +y_var <- "medv" +x_train <- tail(Boston, -6) +y_train <- tail(Boston[, y_var], -6) +y_train_binary <- as.factor(tail((Boston[, y_var]>20)*1, -6)) + +# convert to factors for testing purposes +x_train$rad <- factor(round(x_train$rad)) +x_train$chas <- factor(round(x_train$chas)) + +train_df <- cbind(x_train, y_train,y_train_binary) + + +x_var_numeric <- c("lstat", "rm", "dis", "indus") +x_var_factor <- c("lstat", "rm", "dis", "indus", "rad", "chas") + +train_df_used_numeric <- x_train[,x_var_numeric] +train_df_used_factor <- x_train[,x_var_factor] + +formula_numeric <- as.formula(paste0("y_train ~ ",paste0(x_var_numeric,collapse="+"))) +formula_factor <- as.formula(paste0("y_train ~ ",paste0(x_var_factor,collapse="+"))) + +# Custom model with only numeric features +model_custom <- gbm::gbm(formula_numeric,data = train_df,distribution = "gaussian") +expect_error(shapr(train_df_used_numeric ,model_custom)) # Required model objects defined +get_model_specs.gbm <- function(x){ + feature_list = list() + feature_list$labels <- labels(x$Terms) + m <- length(feature_list$labels) + feature_list$classes <- attr(x$Terms,"dataClasses")[-1] + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[feature_list$classes=="factor"] <- NA # the model object doesn't contain factor levels info + return(feature_list) +} +expect_error(shapr(train_df_used_numeric ,model_custom)) # predict_model objects not defined + +predict_model.gbm <- function(x, newdata) { + if (!requireNamespace('gbm', quietly = TRUE)) { + stop('The gbm package is required for predicting train models') + } + model_type <- ifelse( + x$distribution$name %in% c("bernoulli","adaboost"), + "classification", + "regression" + ) + if (model_type == "classification") { + predict(x, as.data.frame(newdata), type = "response",n.trees = x$n.trees) + } else { + predict(x, as.data.frame(newdata),n.trees = x$n.trees) + } +} + +expect_silent(shapr(train_df_used_numeric ,model_custom)) # Both defined, so pass silently + +rm(get_model_specs.gbm) + +expect_message(shapr(train_df_used_numeric ,model_custom)) # Only predict_model defined, so warning +rm(predict_model.gbm) + + +# Custom model with factors +model_custom <- gbm::gbm(formula_factor,data = train_df,distribution = "gaussian") +expect_error(shapr(train_df_used_factor ,model_custom)) # Required model objects defined +get_model_specs.gbm <- function(x){ + feature_list = list() + feature_list$labels <- labels(x$Terms) + m <- length(feature_list$labels) + feature_list$classes <- attr(x$Terms,"dataClasses")[-1] + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[feature_list$classes=="factor"] <- NA # the model object doesn't contain factor levels info + return(feature_list) +} +expect_error(shapr(train_df_used_factor ,model_custom)) # predict_model objects not defined + +predict_model.gbm <- function(x, newdata) { + if (!requireNamespace('gbm', quietly = TRUE)) { + stop('The gbm package is required for predicting train models') + } + model_type <- ifelse( + x$distribution$name %in% c("bernoulli","adaboost"), + "classification", + "regression" + ) + if (model_type == "classification") { + predict(x, as.data.frame(newdata), type = "response",n.trees = x$n.trees) + } else { + predict(x, as.data.frame(newdata),n.trees = x$n.trees) + } +} +expect_message(shapr(train_df_used_factor ,model_custom)) # Both defined, so pass with message as factor_level is NA + +rm(get_model_specs.gbm) + +expect_message(shapr(train_df_used_factor ,model_custom)) # Only predict_model defined, so warning message returned + +rm(predict_model.gbm) + +predict_model.gbm <- function(x, newdata) NULL + +# Erroneous predict_model defined, so throw error + messages +expect_message(expect_error(shapr(train_df_used_factor ,model_custom))) + + +rm(predict_model.gbm) + + diff --git a/tests/testthat/test-a-shapley.R b/tests/testthat/test-a-shapley.R index 0f7c0051f..88fd46248 100644 --- a/tests/testthat/test-a-shapley.R +++ b/tests/testthat/test-a-shapley.R @@ -1,6 +1,3 @@ -library(testthat) -library(shapr) - context("test-shapley.R") RNGversion(vstr = "3.5.0") @@ -8,70 +5,125 @@ RNGversion(vstr = "3.5.0") test_that("Basic test functions in shapley.R", { # Load data ----------- - data("Boston", package = "MASS") - x_var <- c("lstat", "rm", "dis", "indus") - x_train <- tail(Boston[, x_var], 50) + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + x_var <- c("lstat", "rm", "dis", "indus") + x_train <- tail(Boston[, x_var], 50) - # Load premade lm model. Path needs to be relative to testthat directory in the package - model <- readRDS("model_objects/lm_model_object.rds") + # Load premade lm model. Path needs to be relative to testthat directory in the package + model <- readRDS("model_objects/lm_model_object.rds") - # Prepare the data for explanation - explainer <- shapr(x_train, model) + # Prepare the data for explanation + explainer <- shapr(x_train, model) - expect_known_value(explainer, file = "test_objects/shapley_explainer_obj.rds", - update = FALSE) + expect_known_value(explainer, file = "test_objects/shapley_explainer_obj.rds", + update = FALSE) + } }) test_that("Testing data input to shapr in shapley.R", { - data("Boston", package = "MASS") - - x_var <- c("lstat", "rm", "dis", "indus") - x_var_sub <- x_var[1:2] - not_x_var <- "crim" - not_even_var <- "not_a_column_name" - - x_train <- as.matrix(tail(Boston[, x_var], -6)) - xy_train_full_df <- tail(Boston[, ], -6) - xy_train_missing_lstat_df <- xy_train_full_df[, !(colnames(xy_train_full_df) == "lstat")] - xy_train_full_df_no_colnames <- xy_train_full_df - colnames(xy_train_full_df_no_colnames) <- NULL - - # Fitting models - formula <- as.formula(paste0("medv ~ ", paste0(x_var, collapse = "+"))) - - l <- list( - xgboost::xgboost( - data = x_train, - label = tail(Boston[, "medv"], -6), - nround = 3, - verbose = FALSE - ), - lm( - formula = formula, - data = xy_train_full_df - ), - ranger::ranger( - formula = formula, - data = xy_train_full_df, - num.trees = 50 - ) - ) - - for (i in seq_along(l)) { - - # Expect silent - expect_silent(shapr(xy_train_full_df, l[[i]])) - - # Expect message that feature_labels is ignored - expect_message(shapr(xy_train_full_df, l[[i]], feature_labels = x_var_sub)) - expect_message(shapr(xy_train_full_df, l[[i]], feature_labels = x_var)) - - # Expect error, giving error message that indicates that x misses columns used by the model - expect_error(shapr(xy_train_missing_lstat_df, l[[i]])) - - # Expect error when x_train don't have column names - expect_error(shapr(xy_train_full_df_no_colnames, l[[i]], feature_labels = x_var_sub)) + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + + y_var <- "medv" + x_train <- tail(Boston, -6) + y_train <- tail(Boston[, y_var], -6) + y_train_binary <- as.factor(tail((Boston[, y_var]>20)*1, -6)) + + # convert to factors for testing purposes + x_train$rad <- factor(round(x_train$rad)) + x_train$chas <- factor(round(x_train$chas)) + + train_df <- cbind(x_train, y_train,y_train_binary) + + + x_var_numeric <- c("lstat", "rm", "dis", "indus") + x_var_factor <- c("lstat", "rm", "dis", "indus", "rad", "chas") + + train_df_used_numeric <- x_train[,x_var_numeric] + train_df_used_factor <- x_train[,x_var_factor] + + formula_numeric <- as.formula(paste0("y_train ~ ",paste0(x_var_numeric,collapse="+"))) + formula_factor <- as.formula(paste0("y_train ~ ",paste0(x_var_factor,collapse="+"))) + + formula_binary_numeric <- as.formula(paste0("y_train_binary ~ ",paste0(x_var_numeric,collapse="+"))) + formula_binary_factor <- as.formula(paste0("y_train_binary ~ ",paste0(x_var_factor,collapse="+"))) + + dummylist <- make_dummies(traindata = x_train[, x_var_factor], testdata = x_train[, x_var_factor]) + + # List of models to run silently + l_numeric <- list( + stats::lm(formula_numeric, data = train_df), + stats::glm(formula_numeric, data = train_df)) + + if (requireNamespace("mgcv", quietly = TRUE)) { + l_numeric[[length(l_numeric) + 1]] <- mgcv::gam(formula_numeric, data = train_df) + } + + l_factor <- list( + stats::lm(formula_factor, data = train_df), + stats::glm(formula_factor, data = train_df)) + + if (requireNamespace("mgcv", quietly = TRUE)) { + l_factor[[length(l_factor) + 1]] <- mgcv::gam(formula_factor, data = train_df) + } + + if (requireNamespace("xgboost", quietly = TRUE)) { + l_factor[[length(l_factor) + 1]] <- xgboost::xgboost(data = dummylist$train_dummies, + label = y_train, + nrounds = 3, verbose = FALSE) + l_factor[[length(l_factor)]]$feature_list <- dummylist$feature_list + } + + + for (i in seq_along(l_numeric)) { + expect_silent(shapr(train_df_used_numeric,l_numeric[[i]])) # No modification + expect_message(shapr(train_df,l_numeric[[i]])) # Features dropped + } + + for (i in seq_along(l_factor)) { + expect_silent(shapr(train_df_used_factor,l_factor[[i]])) # No modification + expect_message(shapr(train_df,l_factor[[i]])) # Features dropped + } + + + # Testing errors on incompatible model and data + # Missing features + model <- stats::lm(formula_factor, data = train_df) + data_error <- train_df[,-3] + expect_error(shapr(data_error,model)) + + # Duplicated column names + data_error <- train_df_used_factor + data_error <- cbind(data_error,lstat = 1) + expect_error(shapr(data_error,model)) + + # Empty column names in data + data_error <- train_df + colnames(data_error) <- NULL + expect_error(shapr(data_error,model)) + + # Empty column names in model (ok if found in data -- and we trust it) + if (requireNamespace("xgboost", quietly = TRUE)) { + data_with_colnames <- data_without_colnames <- as.matrix(train_df_used_numeric) + colnames(data_without_colnames) <- NULL + + model_xgb <- xgboost::xgboost(data = data_without_colnames, label = y_train, + nrounds = 3, verbose = FALSE) + expect_message(shapr(data_with_colnames,model_xgb)) + } + + # Data feature with incorrect class + data_error <- train_df_used_factor + data_error$lstat <- as.logical(data_error$lstat>15) + expect_error(shapr(data_error,model)) + + # non-matching factor levels + data_error <- head(train_df_used_factor) + data_error$rad <- droplevels(data_error$rad) + expect_error(shapr(data_error,model)) } }) + diff --git a/tests/testthat/test-explanation.R b/tests/testthat/test-explanation.R index fdcab2e94..83e7899cb 100644 --- a/tests/testthat/test-explanation.R +++ b/tests/testthat/test-explanation.R @@ -1,6 +1,3 @@ -library(testthat) -library(shapr) - context("test-explanation.R") # For using same Random numer generator as CircelCI (R version 3.5.x) @@ -22,225 +19,234 @@ test_that("Test get_list_approaches", { test_that("Test functions in explanation.R", { # Load data ----------- - data("Boston", package = "MASS") - x_var <- c("lstat", "rm", "dis", "indus") - y_var <- "medv" - - y_train <- tail(Boston[, y_var], 50) - x_test <- as.matrix(head(Boston[, x_var], 2)) - - # Prepare the data for explanation. Path needs to be relative to testthat directory in the package - explainer <- readRDS(file = "test_objects/shapley_explainer_obj.rds") - - # Creating list with lots of different explainer objects - p0 <- mean(y_train) - ex_list <- list() - - # Ex 1: Explain predictions (gaussian) - ex_list[[1]] <- explain(x_test, explainer, approach = "gaussian", prediction_zero = p0) + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + x_var <- c("lstat", "rm", "dis", "indus") + y_var <- "medv" - # Ex 2: Explain predictions (copula) - ex_list[[2]] <- explain(x_test, explainer, approach = "copula", prediction_zero = p0) + y_train <- tail(Boston[, y_var], 50) + x_test <- as.matrix(head(Boston[, x_var], 2)) - # Ex 3: Explain predictions (empirical, independence): - ex_list[[3]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "independence") + # Prepare the data for explanation. Path needs to be relative to testthat directory in the package + explainer <- readRDS(file = "test_objects/shapley_explainer_obj.rds") - # Ex 4: Explain predictions (empirical, fixed sigma) - ex_list[[4]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "fixed_sigma") + # Creating list with lots of different explainer objects + p0 <- mean(y_train) + ex_list <- list() - # Ex 5: Explain predictions (empirical, AICc) - ex_list[[5]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "AICc_each_k") + # Ex 1: Explain predictions (gaussian) + ex_list[[1]] <- explain(x_test, explainer, approach = "gaussian", prediction_zero = p0) - # Ex 6: Explain predictions (empirical, AICc full) - ex_list[[6]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "AICc_full") + # Ex 2: Explain predictions (copula) + ex_list[[2]] <- explain(x_test, explainer, approach = "copula", prediction_zero = p0) - # Ex 7: Explain combined - empirical and gaussian - ex_list[[7]] <- explain(x_test, explainer, approach = c("empirical", rep("gaussian", 3)), prediction_zero = p0) + # Ex 3: Explain predictions (empirical, independence): + ex_list[[3]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "independence") - # Ex 8: Explain combined II - all gaussian - ex_list[[8]] <- explain(x_test, explainer, approach = c(rep("gaussian", 4)), prediction_zero = p0) + # Ex 4: Explain predictions (empirical, fixed sigma) + ex_list[[4]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "fixed_sigma") - # Ex 9: Explain combined III - all copula - ex_list[[9]] <- explain(x_test, explainer, approach = rep("copula", 4), prediction_zero = p0) + # Ex 5: Explain predictions (empirical, AICc) + ex_list[[5]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "AICc_each_k") - # Ex 10: gaussian and copula XX (works with seed) - approach <- c(rep("gaussian", 2), rep("copula", 2)) - ex_list[[10]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 6: Explain predictions (empirical, AICc full) + ex_list[[6]] <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0, type = "AICc_full") - # Ex 11: empirical and gaussian - approach <- c(rep("empirical", 2), rep("gaussian", 2)) - ex_list[[11]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 7: Explain combined - empirical and gaussian + ex_list[[7]] <- explain(x_test, explainer, approach = c("empirical", rep("gaussian", 3)), prediction_zero = p0) - # Ex 12: empirical and copula - approach <- c(rep("empirical", 2), rep("copula", 2)) - ex_list[[12]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 8: Explain combined II - all gaussian + ex_list[[8]] <- explain(x_test, explainer, approach = c(rep("gaussian", 4)), prediction_zero = p0) - # Ex 13: copula and empirical XX (works now) - approach <- c(rep("copula", 2), rep("empirical", 2)) - ex_list[[13]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 9: Explain combined III - all copula + ex_list[[9]] <- explain(x_test, explainer, approach = rep("copula", 4), prediction_zero = p0) - # Ex 14: gaussian and copula XX (works with seed) - approach <- c(rep("gaussian", 1), rep("copula", 3)) - ex_list[[14]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 10: gaussian and copula XX (works with seed) + approach <- c(rep("gaussian", 2), rep("copula", 2)) + ex_list[[10]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 15: empirical and copula - approach <- c(rep("empirical", 1), rep("copula", 3)) - ex_list[[15]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 11: empirical and gaussian + approach <- c(rep("empirical", 2), rep("gaussian", 2)) + ex_list[[11]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 16: gaussian and empirical XX (works now) - approach <- c(rep("gaussian", 1), rep("empirical", 3)) - ex_list[[16]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 12: empirical and copula + approach <- c(rep("empirical", 2), rep("copula", 2)) + ex_list[[12]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 17: gaussian and empirical XX (works now!) - approach <- c(rep("gaussian", 2), rep("empirical", 2)) - ex_list[[17]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 13: copula and empirical XX (works now) + approach <- c(rep("copula", 2), rep("empirical", 2)) + ex_list[[13]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 18: Explain combined II - all empirical - approach <- c(rep("empirical", 4)) - ex_list[[18]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) + # Ex 14: gaussian and copula XX (works with seed) + approach <- c(rep("gaussian", 1), rep("copula", 3)) + ex_list[[14]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 19: Explain predictions (ctree, sample = FALSE, default parameters) - ex_list[[19]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE) + # Ex 15: empirical and copula + approach <- c(rep("empirical", 1), rep("copula", 3)) + ex_list[[15]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 20: Explain predictions (ctree, sample = TRUE, default parameters) - ex_list[[20]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE) + # Ex 16: gaussian and empirical XX (works now) + approach <- c(rep("gaussian", 1), rep("empirical", 3)) + ex_list[[16]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 21: Explain predictions (ctree, sample = FALSE, other ctree parameters) - ex_list[[21]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, - mincriterion = 0.9, minsplit = 20, minbucket = 25) + # Ex 17: gaussian and empirical XX (works now!) + approach <- c(rep("gaussian", 2), rep("empirical", 2)) + ex_list[[17]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 22: Explain predictions (ctree, sample = TRUE, other ctree parameters) - ex_list[[22]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mincriterion = 0.9, minsplit = 20, minbucket = 25) + # Ex 18: Explain combined II - all empirical + approach <- c(rep("empirical", 4)) + ex_list[[18]] <- explain(x_test, explainer, approach = approach, prediction_zero = p0) - # Ex 23: Explain combined - ctree and gaussian, sample = FALSE - ex_list[[23]] <- explain(x_test, explainer, approach = c("ctree", rep("gaussian", 3)), - prediction_zero = p0, sample = FALSE) + if (requireNamespace("party", quietly = TRUE)) { - # Ex 24: Explain combined II - ctree and gaussian, sample = FALSE - ex_list[[24]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("gaussian", 2)), - prediction_zero = p0, sample = FALSE) + # Ex 19: Explain predictions (ctree, sample = FALSE, default parameters) + ex_list[[19]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE) - # Ex 25: Explain combined III - ctree and gaussian, sample = FALSE - ex_list[[25]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("gaussian", 1)), - prediction_zero = p0, sample = FALSE) + # Ex 20: Explain predictions (ctree, sample = TRUE, default parameters) + ex_list[[20]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE) - # Ex 26: Explain combined IV - ctree all, sample = FALSE - ex_list[[26]] <- explain(x_test, explainer, approach = c(rep("ctree", 4)), - prediction_zero = p0, sample = FALSE) + # Ex 21: Explain predictions (ctree, sample = FALSE, other ctree parameters) + ex_list[[21]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, + mincriterion = 0.9, minsplit = 20, minbucket = 25) - # Ex 27: Explain combined - ctree and empirical, sample = FALSE - ex_list[[27]] <- explain(x_test, explainer, approach = c("ctree", rep("empirical", 3)), - prediction_zero = p0, sample = FALSE) + # Ex 22: Explain predictions (ctree, sample = TRUE, other ctree parameters) + ex_list[[22]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mincriterion = 0.9, minsplit = 20, minbucket = 25) - # Ex 28: Explain combined II - ctree and empirical, sample = FALSE - ex_list[[28]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("empirical", 2)), - prediction_zero = p0, sample = FALSE) + # Ex 23: Explain combined - ctree and gaussian, sample = FALSE + ex_list[[23]] <- explain(x_test, explainer, approach = c("ctree", rep("gaussian", 3)), + prediction_zero = p0, sample = FALSE) - # Ex 29: Explain combined III - ctree and empirical, sample = FALSE - ex_list[[29]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("empirical", 1)), - prediction_zero = p0, sample = FALSE) + # Ex 24: Explain combined II - ctree and gaussian, sample = FALSE + ex_list[[24]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("gaussian", 2)), + prediction_zero = p0, sample = FALSE) - # Ex 30: Explain combined - ctree and gaussian, sample = TRUE - ex_list[[30]] <- explain(x_test, explainer, approach = c("ctree", rep("gaussian", 3)), - prediction_zero = p0, sample = TRUE) + # Ex 25: Explain combined III - ctree and gaussian, sample = FALSE + ex_list[[25]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("gaussian", 1)), + prediction_zero = p0, sample = FALSE) - # Ex 31: Explain combined II - ctree and gaussian, sample = TRUE - ex_list[[31]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("gaussian", 2)), - prediction_zero = p0, sample = TRUE) + # Ex 26: Explain combined IV - ctree all, sample = FALSE + ex_list[[26]] <- explain(x_test, explainer, approach = c(rep("ctree", 4)), + prediction_zero = p0, sample = FALSE) - # Ex 32: Explain combined III - ctree and gaussian, sample = TRUE - ex_list[[32]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("gaussian", 1)), - prediction_zero = p0, sample = TRUE) + # Ex 27: Explain combined - ctree and empirical, sample = FALSE + ex_list[[27]] <- explain(x_test, explainer, approach = c("ctree", rep("empirical", 3)), + prediction_zero = p0, sample = FALSE) - # Ex 33: Explain combined IV - ctree all, sample = TRUE - ex_list[[33]] <- explain(x_test, explainer, approach = c(rep("ctree", 4)), - prediction_zero = p0, sample = TRUE) + # Ex 28: Explain combined II - ctree and empirical, sample = FALSE + ex_list[[28]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("empirical", 2)), + prediction_zero = p0, sample = FALSE) - # Ex 34: Explain combined - ctree and empirical, sample = TRUE - ex_list[[34]] <- explain(x_test, explainer, approach = c("ctree", rep("empirical", 3)), - prediction_zero = p0, sample = TRUE) + # Ex 29: Explain combined III - ctree and empirical, sample = FALSE + ex_list[[29]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("empirical", 1)), + prediction_zero = p0, sample = FALSE) - # Ex 35: Explain combined II - ctree and empirical, sample = TRUE - ex_list[[35]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("empirical", 2)), - prediction_zero = p0, sample = TRUE) + # Ex 30: Explain combined - ctree and gaussian, sample = TRUE + ex_list[[30]] <- explain(x_test, explainer, approach = c("ctree", rep("gaussian", 3)), + prediction_zero = p0, sample = TRUE) - # Ex 36: Explain combined III - ctree and empirical, sample = TRUE - ex_list[[36]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("empirical", 1)), - prediction_zero = p0, sample = TRUE) + # Ex 31: Explain combined II - ctree and gaussian, sample = TRUE + ex_list[[31]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("gaussian", 2)), + prediction_zero = p0, sample = TRUE) - # Ex 37: Explain different ctree mincriterion for different number of dependent variables, sample = TRUE - ex_list[[37]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mincriterion = c(0.05, 0.05, 0.95, 0.95)) + # Ex 32: Explain combined III - ctree and gaussian, sample = TRUE + ex_list[[32]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("gaussian", 1)), + prediction_zero = p0, sample = TRUE) - # Ex 38: Explain different ctree mincriterion for different number of dependent variables, sample = TRUE - ex_list[[38]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mincriterion = rep(0.95, 4)) + # Ex 33: Explain combined IV - ctree all, sample = TRUE + ex_list[[33]] <- explain(x_test, explainer, approach = c(rep("ctree", 4)), + prediction_zero = p0, sample = TRUE) - # Ex 38: Test that ctree with mincriterion equal to same probability four times gives the same as only passing one - # probability to mincriterion - expect_equal( - (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mincriterion = rep(0.95, 4)))$dt, - (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mincriterion = 0.95))$dt - ) + # Ex 34: Explain combined - ctree and empirical, sample = TRUE + ex_list[[34]] <- explain(x_test, explainer, approach = c("ctree", rep("empirical", 3)), + prediction_zero = p0, sample = TRUE) + # Ex 35: Explain combined II - ctree and empirical, sample = TRUE + ex_list[[35]] <- explain(x_test, explainer, approach = c(rep("ctree", 2), rep("empirical", 2)), + prediction_zero = p0, sample = TRUE) - # Ex 39: Test that ctree with the same mincriterion repeated four times is the same as passing mincriterion only once - expect_equal( - (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, - mincriterion = c(rep(0.95, 2), rep(0.95, 2))))$dt, - (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, - mincriterion = 0.95))$dt - ) + # Ex 36: Explain combined III - ctree and empirical, sample = TRUE + ex_list[[36]] <- explain(x_test, explainer, approach = c(rep("ctree", 3), rep("empirical", 1)), + prediction_zero = p0, sample = TRUE) - # Checking that explanations with different paralellizations gives the same result (only unix systems!) + # Ex 37: Explain different ctree mincriterion for different number of dependent variables, sample = TRUE + ex_list[[37]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mincriterion = c(0.05, 0.05, 0.95, 0.95)) - if (.Platform$OS.type == "unix") { - explain_base_nosample <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE) + # Ex 38: Explain different ctree mincriterion for different number of dependent variables, sample = TRUE + ex_list[[38]] <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mincriterion = rep(0.95, 4)) - multicore <- 2 - - expect_equal( - explain_base_nosample, - explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, - mc_cores = multicore) - ) - - expect_equal( - explain_base_nosample, - explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, - mc_cores_create_ctree = 1, mc_cores_sample_ctree = multicore) - ) - - expect_equal( - explain_base_nosample, - explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, - mc_cores_create_ctree = multicore, mc_cores_sample_ctree = 1) - ) + # Ex 39: Test that ctree with mincriterion equal to same probability four times gives the same as only passing one + # probability to mincriterion + expect_equal( + (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mincriterion = rep(0.95, 4)))$dt, + (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mincriterion = 0.95))$dt + ) - explain_base_sample <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE) - # Seed consistent when only paralellizing create_ctree, and not sample_ctree - expect_equal( - explain_base_sample, - explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mc_cores_create_ctree = multicore, mc_cores_sample_ctree = 1) - ) + # Ex 40: Test that ctree with the same mincriterion repeated four times is the same as passing mincriterion only once + expect_equal( + (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, + mincriterion = c(rep(0.95, 2), rep(0.95, 2))))$dt, + (explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, + mincriterion = 0.95))$dt + ) - # Seed consistent, when run twice with same seed - expect_equal( - explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mc_cores = multicore), - explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, - mc_cores = multicore) - ) - } + # Checking that explanations with different paralellizations gives the same result (only unix systems!) + + if (.Platform$OS.type == "unix") { + explain_base_nosample <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE) + + multicore <- 2 + + expect_equal( + explain_base_nosample, + explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, + mc_cores = multicore) + ) + + expect_equal( + explain_base_nosample, + explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, + mc_cores_create_ctree = 1, mc_cores_sample_ctree = multicore) + ) + + expect_equal( + explain_base_nosample, + explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = FALSE, + mc_cores_create_ctree = multicore, mc_cores_sample_ctree = 1) + ) + + explain_base_sample <- explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE) + + # Consistent results when only paralellizing create_ctree, and not sample_ctree + expect_equal( + explain_base_sample, + explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mc_cores_create_ctree = multicore, mc_cores_sample_ctree = 1) + ) + + # Consistent results when ran twice with same seed + expect_equal( + explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mc_cores = multicore), + explain(x_test, explainer, approach = "ctree", prediction_zero = p0, sample = TRUE, + mc_cores = multicore) + ) + } + # Checking that all explain objects produce the same as before + expect_known_value(ex_list, file = "test_objects/explanation_explain_obj_list.rds", + update = F) + + } else { + # Tests using only the first 17 elements of explanation_explain_obj_list.rds + expect_known_value(ex_list, file = "test_objects/explanation_explain_obj_list_no_ctree.rds", + update = F) + } - # Checking that all explain objects produce the same as before - expect_known_value(ex_list, file = "test_objects/explanation_explain_obj_list.rds", - update = F) ### Additional test to test that only the produced shapley values are the same as before fixed_explain_obj_list <- readRDS("test_objects/explanation_explain_obj_list_fixed.rds") @@ -249,147 +255,221 @@ test_that("Test functions in explanation.R", { } - # Checks that an error is returned - expect_error( - explain(1, explainer, approach = "gaussian", prediction_zero = p0) - ) - expect_error( - explain(list(), explainer, approach = "gaussian", prediction_zero = p0) - ) - expect_error( - explain(x_test, explainer, approach = "Gaussian", prediction_zero = p0) - ) - expect_error( - explain(x_test, explainer, approach = rep("gaussian", ncol(x_test) + 1), prediction_zero = p0) - ) + # Checks that an error is returned + expect_error( + explain(1, explainer, approach = "gaussian", prediction_zero = p0) + ) + expect_error( + explain(list(), explainer, approach = "gaussian", prediction_zero = p0) + ) + expect_error( + explain(x_test, explainer, approach = "Gaussian", prediction_zero = p0) + ) + expect_error( + explain(x_test, explainer, approach = rep("gaussian", ncol(x_test) + 1), prediction_zero = p0) + ) + } }) test_that("Testing data input to explain in explanation.R", { # Setup for training data and explainer object - data("Boston", package = "MASS") - x_var <- c("lstat", "rm", "dis", "indus") - y_var <- "medv" - - # Training data - x_train <- as.matrix(tail(Boston[, x_var], -6)) - y_train <- tail(Boston[, y_var], -6) - xy_train_full_df <- tail(Boston[, ], -6) - - # Test data - x_test <- as.matrix(head(Boston[, x_var], 6)) - x_test_full <- as.matrix(head(Boston[, ], 6)) - x_test_reordered <- as.matrix(head(Boston[, rev(x_var)], 6)) - xy_test_full_df <- head(Boston[, ], 6) - xy_test_missing_lstat_df <- xy_test_full_df[, !(colnames(xy_test_full_df) == "lstat")] - xy_test_full_df_no_colnames <- xy_test_full_df - colnames(xy_test_full_df_no_colnames) <- NULL - - # Fitting models - formula <- as.formula(paste0("medv ~ ", paste0(x_var, collapse = "+"))) - model1 <- xgboost::xgboost( - data = x_train, - label = y_train, - nround = 5, - verbose = FALSE - ) - - model2 <- lm( - formula = formula, - data = xy_train_full_df - ) - - model3 <- ranger::ranger( - formula = formula, - data = xy_train_full_df, - num.trees = 50 - ) - - p0 <- mean(y_train) - - # Get explainer objects - all_explainers <- lapply(list(model1, model2, model3), shapr, x = x_train) - - # Test data - all_test_data <- list( - x_test, - x_test_reordered, - x_test_full - ) - - # Expect silent for explainer 1, using correct, reordered and full data set, then identical results - l <- list() - for (i in seq_along(all_test_data)) { - l[[i]] <- expect_silent( + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + x_var <- c("lstat", "rm", "dis", "indus") + y_var <- "medv" + + # Training data + x_train <- as.matrix(tail(Boston[, x_var], -6)) + y_train <- tail(Boston[, y_var], -6) + xy_train_full_df <- tail(Boston[, ], -6) + + # Test data + x_test <- as.matrix(head(Boston[, x_var], 6)) + x_test_full <- as.matrix(head(Boston[, ], 6)) + x_test_reordered <- as.matrix(head(Boston[, rev(x_var)], 6)) + xy_test_full_df <- head(Boston[, ], 6) + xy_test_missing_lstat_df <- xy_test_full_df[, !(colnames(xy_test_full_df) == "lstat")] + xy_test_full_df_no_colnames <- xy_test_full_df + colnames(xy_test_full_df_no_colnames) <- NULL + + formula <- as.formula(paste0("medv ~ ", paste0(x_var, collapse = "+"))) + p0 <- mean(y_train) + + # Test data + all_test_data <- list( + x_test, + x_test_reordered, + x_test_full + ) + + # Linear model + list_models <- list( + lm( + formula = formula, + data = xy_train_full_df + ) + ) + + all_explainers <- list( + shapr(x_train, list_models[[1]]) + ) + + # explainer 1 + # Expect message due to no label/factor checking + list_explanation <- list() + list_explanation[[1]] <- expect_silent( explain( - all_test_data[[i]], + all_test_data[[1]], all_explainers[[1]], approach = "empirical", prediction_zero = p0, n_samples = 1e2 ) ) - } - for (i in 2:length(l)) { - expect_equal(l[[i - 1]], l[[i]]) - } - - # Expect silent for explainer 2, using correct, reordered and bigger data set, then identical results - l <- list() - for (i in seq_along(all_test_data)) { - l[[i]] <- expect_silent( + # Expect message due to no label/factor checking + list_explanation[[2]] <- expect_silent( explain( - all_test_data[[i]], - all_explainers[[2]], + all_test_data[[2]], + all_explainers[[1]], approach = "empirical", prediction_zero = p0, n_samples = 1e2 ) ) - } - for (i in 2:length(l)) { - expect_equal(l[[i - 1]], l[[i]]) - } - - # Expect silent for explainer 3, using correct, reordered and bigger data set, then identical results - l <- list() - for (i in seq_along(all_test_data)) { - l[[i]] <- expect_silent( + # Expect message due to removal of data + list_explanation[[3]] <- expect_message( explain( - all_test_data[[i]], - all_explainers[[3]], + all_test_data[[3]], + all_explainers[[1]], approach = "empirical", prediction_zero = p0, n_samples = 1e2 ) ) - } - for (i in 2:length(l)) { - expect_equal(l[[i - 1]], l[[i]]) - } + for (i in 2:length(list_explanation)) { + expect_equal(list_explanation[[i - 1]], list_explanation[[i]]) + } - for (i in seq_along(all_explainers)) { - # Expect error when test data misses used variable - expect_error( - explain( - xy_test_missing_lstat_df, - all_explainers[[i]], - approach = "empirical", - prediction_zero = p0, - n_samples = 1e2 + if (requireNamespace("xgboost", quietly = TRUE)) { + list_models[[length(list_models) + 1]] <- xgboost::xgboost( + data = x_train, + label = y_train, + nround = 5, + verbose = FALSE ) - ) - # Expect error when test data misses column names - expect_error( - explain( - xy_test_full_df_no_colnames, - all_explainers[[i]], - approach = "empirical", - prediction_zero = p0, - n_samples = 1e2 + all_explainers[[length(all_explainers) + 1]] <- shapr(x_train, list_models[[length(list_models)]]) + + # explainer 2 + # Expect silent + list_explanation <- list() + list_explanation[[1]] <- expect_silent( + explain( + all_test_data[[1]], + all_explainers[[length(all_explainers)]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) ) - ) + # Expect silent + list_explanation[[2]] <- expect_silent( + explain( + all_test_data[[2]], + all_explainers[[length(all_explainers)]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) + ) + # Expect message due to removal of data + list_explanation[[3]] <- expect_message( + explain( + all_test_data[[3]], + all_explainers[[length(all_explainers)]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) + ) + for (i in 2:length(list_explanation)) { + expect_equal(list_explanation[[i - 1]], list_explanation[[i]]) + } + } + + + if (requireNamespace("ranger", quietly = TRUE)) { + list_models[[length(list_models) + 1]] <- ranger::ranger( + formula = formula, + data = xy_train_full_df, + num.trees = 50 + ) + + all_explainers[[length(all_explainers) + 1]] <- shapr(x_train, list_models[[length(list_models)]]) + + # explainer 3 + # Expect silent + list_explanation <- list() + list_explanation[[1]] <- expect_silent( + explain( + all_test_data[[1]], + all_explainers[[length(all_explainers)]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) + ) + # Expect silent + list_explanation[[2]] <- expect_silent( + explain( + all_test_data[[2]], + all_explainers[[length(all_explainers)]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) + ) + # Expect message due removal of data + list_explanation[[3]] <- expect_message( + explain( + all_test_data[[3]], + all_explainers[[length(all_explainers)]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) + ) + for (i in 2:length(list_explanation)) { + expect_equal(list_explanation[[i - 1]], list_explanation[[i]]) + } + + } + + for (i in seq_along(all_explainers)) { + + # Expect error when test data misses used variable + expect_error( + explain( + xy_test_missing_lstat_df, + all_explainers[[i]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) + ) + + # Expect error when test data misses column names + expect_error( + explain( + xy_test_full_df_no_colnames, + all_explainers[[i]], + approach = "empirical", + prediction_zero = p0, + n_samples = 1e2 + ) + ) + } } }) diff --git a/tests/testthat/test-features.R b/tests/testthat/test-features.R index 22e4fe7c3..7a03be06e 100644 --- a/tests/testthat/test-features.R +++ b/tests/testthat/test-features.R @@ -1,5 +1,3 @@ -library(shapr) - context("test-features.R") test_that("Test feature_combinations", { @@ -169,205 +167,207 @@ test_that("Test helper_feature", { test_that("Test make_dummies", { - data("Boston", package = "MASS") - x_var <- c("lstat", "rm", "dis", "indus") - y_var <- "medv" - - x_train <- as.data.frame(Boston[401:411, x_var]) - y_train <- Boston[401:408, y_var] - x_test <- as.data.frame(Boston[1:4, x_var]) - - # convert to factors for illustrational purpose - x_train$rm <- factor(round(x_train$rm)) - x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm)) - - factor_feat <- sapply(x_train, is.factor) - nb_factor_feat <- sum(factor_feat) - - dummylist <- make_dummies(traindata = x_train, testdata = x_train) - - # Tests - expect_type(dummylist, "list") - - expect_equal(length(dummylist$obj$contrasts_list), nb_factor_feat) - - expect_equal(length(dummylist$obj$features), ncol(x_train)) - - expect_equal(length(dummylist$obj$factor_features), nb_factor_feat) - - expect_equal(ncol(dummylist$obj$contrasts_list$rm), length(levels(x_train$rm))) - - # 1) What if train has more features than test but features in test are contained in train - x_train1 <- cbind(x_train, 1) - x_test1 <- x_test - expect_error(make_dummies(traindata = x_train1, testdata = x_test1)) - - # 2) What if train has different feature types than test - x_train2 <- x_train - x_test2 <- x_test - x_test2$rm <- as.numeric(x_test2$rm) - expect_error(make_dummies(traindata = x_train2, testdata = x_test2)) - - # 3) What if test has more features than train but features in train are contained in test - x_train3 <- x_train - x_test3 <- cbind(x_test, 1) - expect_error(make_dummies(traindata = x_train3, testdata = x_test3)) - - # 4) What if train and test only have numerical features - x_train4 <- x_train - x_train4$rm <- as.numeric(x_train4$rm) - x_test4 <- x_test - x_test4$rm <- as.numeric(x_test4$rm) - expect_type(make_dummies(traindata = x_train4, testdata = x_test4), "list") - - # 5) What if train and test only have categorical features - x_train5 <- x_train - x_train5 <- x_train5[, "rm", drop = FALSE] - x_test5 <- x_test - x_test5 <- x_test5[, "rm", drop = FALSE] - expect_type(make_dummies(traindata = x_train5, testdata = x_test5), "list") - - # 6) What if test has the same levels as train but random ordering of levels - x_train6 <- x_train - x_train6$rm <- factor(x_train6$rm, levels = 4:9) - x_test6 <- x_test - x_test6$rm <- factor(x_test6$rm, levels = c(8, 9, 7, 4, 5, 6)) - expect_type(make_dummies(traindata = x_train6, testdata = x_test6), "list") - - # 7) What if test has different levels than train - x_train7 <- x_train - x_train7$rm <- factor(x_train7$rm, levels = 4:9) - x_test7 <- x_test - x_test7$rm <- factor(x_test7$rm, levels = 6:8) - expect_error(make_dummies(traindata = x_train7, testdata = x_test7)) - - # 8) What if train and test have different feature names - x_train8 <- x_train - x_test8 <- x_test - names(x_test8) <- c("lstat2", "rm2", "dis2", "indus2") - expect_error(make_dummies(traindata = x_train8, testdata = x_test8)) - - # 9) What if one variables has an empty name - x_train9 <- x_train - colnames(x_train9) <- c("", "rm", "dis", "indus") - x_test9 <- x_test - colnames(x_test9) <- c("", "rm", "dis", "indus") - expect_error(make_dummies(traindata = x_train9, testdata = x_test9)) - - # 10) What if traindata has a column that repeats - x_train10 <- cbind(x_train, lstat = x_train$lstat) - x_test10 <- cbind(x_test, lstat = x_test$lstat) - expect_error(make_dummies(traindata = x_train10, testdata = x_test10)) - - # 11) What if traindata has no column names - x_train11 <- x_train - colnames(x_train11) <- NULL - x_test11 <- x_test - colnames(x_test11) <- NULL - expect_error(make_dummies(traindata = x_train11, testdata = x_test11)) - - # 12 Test that traindata_new and testdata_new will be the same as the original - # x_train and x_test. The only time this is different is if the levels of train - # and test are different. See below. - dummylist12 <- make_dummies(traindata = x_train, testdata = x_test) - # - expect_true(all(data.frame(dummylist12$traindata_new) == x_train)) - expect_true(all(levels(dummylist12$traindata_new$rm) == levels(x_train$rm))) - expect_true(all(data.frame(dummylist12$testdata_new) == x_test)) - expect_true(all(levels(dummylist12$testdata_new$rm) == levels(x_test$rm))) - - - # 13 Different levels same as check # 12 - # - x_train13 <- x_train - x_train13$rm <- factor(x_train13$rm, levels = 4:9) - x_test13 <- x_test - x_test13$rm <- factor(x_test13$rm, levels = c(8, 9, 7, 4, 5, 6)) - dummylist13 <- make_dummies(traindata = x_train13, testdata = x_test13) - # - expect_true(all(data.frame(dummylist13$traindata_new) == x_train13)) - expect_true(all(levels(dummylist13$traindata_new$rm) == levels(x_train13$rm))) - expect_true(all(data.frame(dummylist13$testdata_new) == x_test13)) - # Important !!!! - expect_false(all(levels(dummylist13$testdata_new$rm) == levels(x_test13$rm))) - + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + x_var <- c("lstat", "rm", "dis", "indus") + y_var <- "medv" + + x_train <- as.data.frame(Boston[401:411, x_var]) + y_train <- Boston[401:408, y_var] + x_test <- as.data.frame(Boston[1:4, x_var]) + + # convert to factors for illustrational purpose + x_train$rm <- factor(round(x_train$rm)) + x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm)) + + factor_feat <- sapply(x_train, is.factor) + nb_factor_feat <- sum(factor_feat) + + dummylist <- make_dummies(traindata = x_train, testdata = x_train) + + # Tests + expect_type(dummylist, "list") + + expect_equal(length(dummylist$feature_list$contrasts_list), nb_factor_feat) + + expect_equal(length(dummylist$feature_list$labels), ncol(x_train)) + + expect_equal(sum(dummylist$feature_list$classes=="factor"), nb_factor_feat) + + expect_equal(ncol(dummylist$feature_list$contrasts_list$rm), length(levels(x_train$rm))) + + # 1) What if train has more features than test but features in test are contained in train + x_train1 <- cbind(x_train, 1) + x_test1 <- x_test + expect_error(make_dummies(traindata = x_train1, testdata = x_test1)) + + # 2) What if train has different feature types than test + x_train2 <- x_train + x_test2 <- x_test + x_test2$rm <- as.numeric(x_test2$rm) + expect_error(make_dummies(traindata = x_train2, testdata = x_test2)) + + # 3) What if test has more features than train but features in train are contained in test + x_train3 <- x_train + x_test3 <- cbind(x_test, 1) + expect_error(make_dummies(traindata = x_train3, testdata = x_test3)) + + # 4) What if train and test only have numerical features + x_train4 <- x_train + x_train4$rm <- as.numeric(x_train4$rm) + x_test4 <- x_test + x_test4$rm <- as.numeric(x_test4$rm) + expect_type(make_dummies(traindata = x_train4, testdata = x_test4), "list") + + # 5) What if train and test only have categorical features + x_train5 <- x_train + x_train5 <- x_train5[, "rm", drop = FALSE] + x_test5 <- x_test + x_test5 <- x_test5[, "rm", drop = FALSE] + expect_type(make_dummies(traindata = x_train5, testdata = x_test5), "list") + + # 6) What if test has the same levels as train but random ordering of levels + x_train6 <- x_train + x_train6$rm <- factor(x_train6$rm, levels = 4:9) + x_test6 <- x_test + x_test6$rm <- factor(x_test6$rm, levels = c(8, 9, 7, 4, 5, 6)) + expect_type(make_dummies(traindata = x_train6, testdata = x_test6), "list") + + # 7) What if test has different levels than train + x_train7 <- x_train + x_train7$rm <- factor(x_train7$rm, levels = 4:9) + x_test7 <- x_test + x_test7$rm <- factor(x_test7$rm, levels = 6:8) + expect_error(make_dummies(traindata = x_train7, testdata = x_test7)) + + # 8) What if train and test have different feature names + x_train8 <- x_train + x_test8 <- x_test + names(x_test8) <- c("lstat2", "rm2", "dis2", "indus2") + expect_error(make_dummies(traindata = x_train8, testdata = x_test8)) + + # 9) What if one variables has an empty name + x_train9 <- x_train + colnames(x_train9) <- c("", "rm", "dis", "indus") + x_test9 <- x_test + colnames(x_test9) <- c("", "rm", "dis", "indus") + expect_error(make_dummies(traindata = x_train9, testdata = x_test9)) + + # 10) What if traindata has a column that repeats + x_train10 <- cbind(x_train, lstat = x_train$lstat) + x_test10 <- cbind(x_test, lstat = x_test$lstat) + expect_error(make_dummies(traindata = x_train10, testdata = x_test10)) + + # 11) What if traindata has no column names + x_train11 <- x_train + colnames(x_train11) <- NULL + x_test11 <- x_test + colnames(x_test11) <- NULL + expect_error(make_dummies(traindata = x_train11, testdata = x_test11)) + + # 12 Test that traindata_new and testdata_new will be the same as the original + # x_train and x_test. The only time this is different is if the levels of train + # and test are different. See below. + dummylist12 <- make_dummies(traindata = x_train, testdata = x_test) + # + expect_true(all(data.frame(dummylist12$traindata_new) == x_train)) + expect_true(all(levels(dummylist12$traindata_new$rm) == levels(x_train$rm))) + expect_true(all(data.frame(dummylist12$testdata_new) == x_test)) + expect_true(all(levels(dummylist12$testdata_new$rm) == levels(x_test$rm))) + + + # 13 Different levels same as check # 12 + # + x_train13 <- x_train + x_train13$rm <- factor(x_train13$rm, levels = 4:9) + x_test13 <- x_test + x_test13$rm <- factor(x_test13$rm, levels = c(8, 9, 7, 4, 5, 6)) + dummylist13 <- make_dummies(traindata = x_train13, testdata = x_test13) + # + expect_true(all(data.frame(dummylist13$traindata_new) == x_train13)) + expect_true(all(levels(dummylist13$traindata_new$rm) == levels(x_train13$rm))) + expect_true(all(data.frame(dummylist13$testdata_new) == x_test13)) + # Important !!!! + expect_false(all(levels(dummylist13$testdata_new$rm) == levels(x_test13$rm))) + } }) test_that("Test apply_dummies", { - data("Boston", package = "MASS") - x_var <- c("lstat", "rm", "dis", "indus") - y_var <- "medv" + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + x_var <- c("lstat", "rm", "dis", "indus") + y_var <- "medv" - x_train <- as.data.frame(Boston[401:411, x_var]) - y_train <- Boston[401:408, y_var] - x_test <- as.data.frame(Boston[1:4, x_var]) + x_train <- as.data.frame(Boston[401:411, x_var]) + y_train <- Boston[401:408, y_var] + x_test <- as.data.frame(Boston[1:4, x_var]) - # convert to factors for illustrational purpose - x_train$rm <- factor(round(x_train$rm)) - x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm)) + # convert to factors for illustrational purpose + x_train$rm <- factor(round(x_train$rm)) + x_test$rm <- factor(round(x_test$rm), levels = levels(x_train$rm)) - numeric_feat <- !sapply(x_train, is.factor) - nb_numeric_feat <- sum(numeric_feat) + numeric_feat <- !sapply(x_train, is.factor) + nb_numeric_feat <- sum(numeric_feat) - dummylist <- make_dummies(traindata = x_train, testdata = x_test) + dummylist <- make_dummies(traindata = x_train, testdata = x_test) - x_test_dummies <- apply_dummies(obj = dummylist$obj, testdata = x_test) + x_test_dummies <- apply_dummies(feature_list = dummylist$feature_list, testdata = x_test) - # Tests - expect_type(x_test_dummies, "double") + # Tests + expect_type(x_test_dummies, "double") - expect_equal(ncol(x_test_dummies), - nb_numeric_feat + length(dummylist$obj$factor_list$rm)) + expect_equal(ncol(x_test_dummies), + nb_numeric_feat + ncol(dummylist$feature_list$contrasts_list$rm)) - # Test that make_dummies() and apply_dummies() gives the same output - # for a given traindata and testdata - expect_true(all(dummylist$test_dummies == x_test_dummies)) + # Test that make_dummies() and apply_dummies() gives the same output + # for a given traindata and testdata + expect_true(all(dummylist$test_dummies == x_test_dummies)) - # 1) What if you re-arrange the columns in x_train - x_test1 <- x_test[, c(2, 3, 1, 4)] - diff_column_placements <- apply_dummies(obj = dummylist$obj, testdata = x_test1) - expect_equal(colnames(diff_column_placements), colnames(x_test_dummies)) + # 1) What if you re-arrange the columns in x_train + x_test1 <- x_test[, c(2, 3, 1, 4)] + diff_column_placements <- apply_dummies(dummylist$feature_list, testdata = x_test1) + expect_equal(colnames(diff_column_placements), colnames(x_test_dummies)) - # 2) What if you put in less features then the original traindata - x_test2 <- x_test[, c(2, 1)] - expect_error(apply_dummies(dummylist$obj, testdata = x_test2)) + # 2) What if you put in less features then the original traindata + x_test2 <- x_test[, c(2, 1)] + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test2)) - # 3) What if you change the feature types of testdata - x_test3 <- sapply(x_test, as.numeric) - expect_error(apply_dummies(dummylist$obj, testdata = x_test3)) + # 3) What if you change the feature types of testdata + x_test3 <- sapply(x_test, as.numeric) + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test3)) - # 4) What if you add a feature - x_test4 <- cbind(x_train[, c(1, 2)], new_var = x_train[, 2], x_train[, c(3, 4)]) - expect_error(apply_dummies(dummylist$obj, testdata = x_test4)) + # 4) What if you add a feature + x_test4 <- cbind(x_train[, c(1, 2)], new_var = x_train[, 2], x_train[, c(3, 4)]) + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test4)) - # 6) What if test has the same levels as train but random ordering of levels - x_test6 <- x_test - x_test6$rm <- factor(x_test6$rm, levels = c(8, 9, 7, 4, 5, 6)) - expect_error(apply_dummies(dummylist$obj, testdata = x_test6)) + # 6) What if test has the same levels as train but random ordering of levels + x_test6 <- x_test + x_test6$rm <- factor(x_test6$rm, levels = c(8, 9, 7, 4, 5, 6)) + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test6)) - # 7) What if test has different levels than train - x_test7 <- x_test - x_test7$rm <- factor(x_test7$rm, levels = 6:8) - expect_error(apply_dummies(dummylist$obj, testdata = x_test7)) + # 7) What if test has different levels than train + x_test7 <- x_test + x_test7$rm <- factor(x_test7$rm, levels = 6:8) + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test7)) - # 8) What if train and test have different feature names - x_test8 <- x_test - names(x_test8) <- c("lstat2", "rm2", "dis2", "indus2") - expect_error(apply_dummies(dummylist$obj, testdata = x_test8)) + # 8) What if train and test have different feature names + x_test8 <- x_test + names(x_test8) <- c("lstat2", "rm2", "dis2", "indus2") + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test8)) - # 9) What if one variables has an empty name - x_test9 <- x_test - colnames(x_test9) <- c("", "rm", "dis", "indus") - expect_error(apply_dummies(dummylist$obj, testdata = x_test9)) + # 9) What if one variables has an empty name + x_test9 <- x_test + colnames(x_test9) <- c("", "rm", "dis", "indus") + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test9)) - # 10) What if traindata has a column that repeats - x_test10 <- cbind(x_test, lstat = x_test$lstat) - expect_error(apply_dummies(dummylist$obj, testdata = x_test10)) - - # 11) What if testdata has no column names - x_test11 <- x_test - colnames(x_test11) <- NULL - expect_error(apply_dummies(dummylist$obj, testdata = x_test11)) + # 10) What if traindata has a column that repeats + x_test10 <- cbind(x_test, lstat = x_test$lstat) + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test10)) + # 11) What if testdata has no column names + x_test11 <- x_test + colnames(x_test11) <- NULL + expect_error(apply_dummies(dummylist$feature_list, testdata = x_test11)) + } }) diff --git a/tests/testthat/test-models.R b/tests/testthat/test-models.R index c4a5ba911..45e811ac1 100644 --- a/tests/testthat/test-models.R +++ b/tests/testthat/test-models.R @@ -1,351 +1,491 @@ -library(shapr) -library(testthat) - context("test-models.R") test_that("Test predict_model (regression)", { # Data ----------- - data("Boston", package = "MASS") - x_var <- c("lstat", "rm", "dis", "indus") - y_var <- "medv" - x_train <- tail(Boston[, x_var], -6) - y_train <- tail(Boston[, y_var], -6) - x_test <- head(Boston[, x_var], 6) - str_formula <- "y_train ~ lstat + rm + dis + indus" - train_df <- cbind(y_train, x_train) - - # List of models - l <- list( - stats::lm(str_formula, data = train_df), - stats::glm(str_formula, data = train_df), - ranger::ranger(str_formula, data = train_df), - xgboost::xgboost(data = as.matrix(x_train), label = y_train, nrounds = 3, verbose = FALSE), - mgcv::gam(as.formula(str_formula), data = train_df) - ) + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + x_var <- c("lstat", "rm", "dis", "indus") + y_var <- "medv" + x_train <- tail(Boston[, x_var], -6) + y_train <- tail(Boston[, y_var], -6) + x_test <- head(Boston[, x_var], 6) + str_formula <- "y_train ~ lstat + rm + dis + indus" + train_df <- cbind(y_train, x_train) + + # List of models + l <- list( + stats::lm(str_formula, data = train_df), + stats::glm(str_formula, data = train_df)) + + if (requireNamespace("ranger", quietly = TRUE)) { + l[[length(l) + 1]] <- ranger::ranger(str_formula, data = train_df) + } + if (requireNamespace("xgboost", quietly = TRUE)) { + l[[length(l) + 1]] <- xgboost::xgboost(data = as.matrix(x_train), label = y_train, nrounds = 3, verbose = FALSE) + } + if (requireNamespace("mgcv", quietly = TRUE)) { + l[[length(l) + 1]] <- mgcv::gam(as.formula(str_formula), data = train_df) + } + + # Tests + for (i in seq_along(l)) { + + # Input equals data.frame + expect_true( + is.vector(predict_model(l[[i]], x_test)) + ) + expect_true( + is.atomic(predict_model(l[[i]], x_test)) + ) + expect_true( + is.double(predict_model(l[[i]], x_test)) + ) + expect_true( + length(predict_model(l[[i]], x_test)) == nrow(x_test) + ) + + # Input equals matrix + expect_true( + is.double(predict_model(l[[i]], as.matrix(x_test))) + ) + expect_true( + is.atomic(predict_model(l[[i]], as.matrix(x_test))) + ) + expect_true( + is.vector(predict_model(l[[i]], as.matrix(x_test))) + ) + expect_true( + length(predict_model(l[[i]], as.matrix(x_test))) == nrow(x_test) + ) + } + } +}) - # Tests - for (i in seq_along(l)) { +test_that("Test predict_model (binary classification)", { - # Input equals data.frame - expect_true( - is.vector(predict_model(l[[i]], x_test)) - ) - expect_true( - is.atomic(predict_model(l[[i]], x_test)) - ) - expect_true( - is.double(predict_model(l[[i]], x_test)) - ) - expect_true( - length(predict_model(l[[i]], x_test)) == nrow(x_test) - ) + # Data ----------- - # Input equals matrix - expect_true( - is.double(predict_model(l[[i]], as.matrix(x_test))) - ) - expect_true( - is.atomic(predict_model(l[[i]], as.matrix(x_test))) - ) - expect_true( - is.vector(predict_model(l[[i]], as.matrix(x_test))) - ) - expect_true( - length(predict_model(l[[i]], as.matrix(x_test))) == nrow(x_test) - ) + if (requireNamespace("datasets", quietly = TRUE)) { + data("iris", package = "datasets") + x_var <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") + y_var <- "Species" + iris$Species <- as.character(iris$Species) + iris <- iris[which(iris$Species != "virginica"), ] + iris$Species <- as.factor(iris$Species) + x_train <- tail(iris[, x_var], -6) + y_train <- tail(iris[, y_var], -6) + x_test <- head(iris[, x_var], 6) + str_formula <- "y_train ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width" + train_df <- cbind(y_train, x_train) + + # List of models + l <- list( + suppressWarnings(stats::glm(str_formula, data = train_df, family = "binomial")) + ) + + if (requireNamespace("mgcv", quietly = TRUE)) { + l[[length(l) + 1]] <- suppressWarnings(mgcv::gam(as.formula(str_formula), data = train_df, family = "binomial")) + } + if (requireNamespace("ranger", quietly = TRUE)) { + l[[length(l) + 1]] <- ranger::ranger(str_formula, data = train_df, probability = TRUE) + } + if (requireNamespace("xgboost", quietly = TRUE)) { + l[[length(l) + 1]] <- xgboost::xgboost( + data = as.matrix(x_train), + label = as.integer(y_train) - 1, + nrounds = 2, + verbose = FALSE, + objective = "binary:logistic", + eval_metric = "error" + ) + } + + # Tests + for (i in seq_along(l)) { + + # Input equals data.frame + expect_true( + is.vector(predict_model(l[[i]], x_test)) + ) + expect_true( + is.atomic(predict_model(l[[i]], x_test)) + ) + expect_true( + is.double(predict_model(l[[i]], x_test)) + ) + expect_true( + length(predict_model(l[[i]], x_test)) == nrow(x_test) + ) + expect_true( + all(data.table::between(predict_model(l[[i]], x_test), 0, 1)) + ) + + # Input equals matrix + expect_true( + is.double(predict_model(l[[i]], as.matrix(x_test))) + ) + expect_true( + is.atomic(predict_model(l[[i]], as.matrix(x_test))) + ) + expect_true( + is.vector(predict_model(l[[i]], as.matrix(x_test))) + ) + expect_true( + length(predict_model(l[[i]], as.matrix(x_test))) == nrow(x_test) + ) + expect_true( + all(data.table::between(predict_model(l[[i]], as.matrix(x_test)), 0, 1)) + ) + + # Check that output is equal + expect_equal( + predict_model(l[[i]], x_test), predict_model(l[[i]], as.matrix(x_test)) + ) + + } + + # Errors + l <- list() + + if (requireNamespace("ranger", quietly = TRUE)) { + l[[length(l) + 1]] <- ranger::ranger(str_formula, data = train_df) + } + if (requireNamespace("xgboost", quietly = TRUE)) { + l[[length(l) + 1]] <- xgboost::xgboost( + data = as.matrix(x_train), + label = as.integer(y_train) - 1, + nrounds = 2, + verbose = FALSE, + objective = "reg:logistic") + } + + # Tests + for (i in seq_along(l)) { + + # Input equals data.frame + expect_error( + get_model_specs(l[[i]]) + ) + + # Input equals matrix + expect_error( + get_model_specs(l[[i]]) + ) + } + } +}) - # Check that output is equal - expect_equal( - predict_model(l[[i]], x_test), predict_model(l[[i]], as.matrix(x_test)) - ) +test_that("Test predict_model (multi-classification)", { - # Check model type - expect_equal( - model_type(l[[i]]), "regression" - ) + # Data ----------- + if (requireNamespace("datasets", quietly = TRUE)) { + data("iris", package = "datasets") + x_var <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") + y_var <- "Species" + x_train <- tail(iris[, x_var], -6) + y_train <- tail(iris[, y_var], -6) + x_test <- head(iris[, x_var], 6) + str_formula <- "y_train ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width" + train_df <- cbind(y_train, x_train) + + # List of models + l <- list() + + if (requireNamespace("ranger", quietly = TRUE)) { + l[[length(l) + 1]] <- ranger::ranger( + str_formula, + data = train_df + ) + l[[length(l) + 1]] <- ranger::ranger( + str_formula, + data = train_df, + probability = TRUE + ) + } + if (requireNamespace("xgboost", quietly = TRUE)) { + l[[length(l) + 1]] <- xgboost::xgboost( + as.matrix(x_train), + label = as.integer(y_train) - 1, + nrounds = 2, + verbose = FALSE, + objective = "multi:softprob", + eval_metric = "merror", + num_class = 3 + ) + l[[length(l) + 1]] <- xgboost::xgboost( + as.matrix(x_train), + label = as.integer(y_train) - 1, + nrounds = 2, + verbose = FALSE, + objective = "multi:softmax", + eval_metric = "merror", + num_class = 3 + ) + } + + + # Tests + for (i in seq_along(l)) { + + # Input equals data.frame + expect_error( + get_model_specs(l[[i]], x_test) + ) + + # Input equals matrix + expect_error( + get_model_specs(l[[i]], as.matrix(x_test)) + ) + } } }) -test_that("Test predict_model (binary classification)", { +test_that("Test check_features + update_data", { # Data ----------- - data("iris", package = "datasets") - x_var <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") - y_var <- "Species" - iris$Species <- as.character(iris$Species) - iris <- iris[which(iris$Species != "virginica"), ] - iris$Species <- as.factor(iris$Species) - x_train <- tail(iris[, x_var], -6) - y_train <- tail(iris[, y_var], -6) - x_test <- head(iris[, x_var], 6) - str_formula <- "y_train ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width" - train_df <- cbind(y_train, x_train) - - # List of models - l <- list( - suppressWarnings(stats::glm(str_formula, data = train_df, family = "binomial")), - suppressWarnings(mgcv::gam(as.formula(str_formula), data = train_df, family = "binomial")), - ranger::ranger(str_formula, data = train_df, probability = TRUE), - xgboost::xgboost( - data = as.matrix(x_train), - label = as.integer(y_train) - 1, - nrounds = 2, - verbose = FALSE, - objective = "binary:logistic", - eval_metric = "error" - ) - ) + if (requireNamespace("MASS", quietly = TRUE)) { + data("Boston", package = "MASS") + y_var <- "medv" + x_train <- tail(Boston, -6) + y_train <- tail(Boston[, y_var], -6) + y_train_binary <- as.factor(tail((Boston[, y_var]>20)*1, -6)) - # Tests - for (i in seq_along(l)) { + # convert to factors for testing purposes + x_train$rad <- factor(round(x_train$rad)) + x_train$chas <- factor(round(x_train$chas)) - # Input equals data.frame - expect_true( - is.vector(predict_model(l[[i]], x_test)) - ) - expect_true( - is.atomic(predict_model(l[[i]], x_test)) - ) - expect_true( - is.double(predict_model(l[[i]], x_test)) - ) - expect_true( - length(predict_model(l[[i]], x_test)) == nrow(x_test) - ) - expect_true( - all(data.table::between(predict_model(l[[i]], x_test), 0, 1)) - ) + train_df <- cbind(x_train, y_train,y_train_binary) - # Input equals matrix - expect_true( - is.double(predict_model(l[[i]], as.matrix(x_test))) - ) - expect_true( - is.atomic(predict_model(l[[i]], as.matrix(x_test))) - ) - expect_true( - is.vector(predict_model(l[[i]], as.matrix(x_test))) - ) - expect_true( - length(predict_model(l[[i]], as.matrix(x_test))) == nrow(x_test) - ) - expect_true( - all(data.table::between(predict_model(l[[i]], as.matrix(x_test)), 0, 1)) - ) + x_var_numeric <- c("lstat", "rm", "dis", "indus") + x_var_factor <- c("lstat", "rm", "dis", "indus", "rad", "chas") - # Check that output is equal - expect_equal( - predict_model(l[[i]], x_test), predict_model(l[[i]], as.matrix(x_test)) - ) + formula_numeric <- as.formula(paste0("y_train ~ ",paste0(x_var_numeric,collapse="+"))) + formula_factor <- as.formula(paste0("y_train ~ ",paste0(x_var_factor,collapse="+"))) - # Check model type - expect_equal( - model_type(l[[i]]), "classification" - ) + formula_binary_numeric <- as.formula(paste0("y_train_binary ~ ",paste0(x_var_numeric,collapse="+"))) + formula_binary_factor <- as.formula(paste0("y_train_binary ~ ",paste0(x_var_factor,collapse="+"))) + + dummylist <- make_dummies(traindata = x_train[, x_var_factor], testdata = x_train[, x_var_factor]) + + # List of models to run silently + l_silent <- list( + stats::lm(formula_numeric, data = train_df), + stats::lm(formula_factor, data = train_df), + stats::glm(formula_numeric, data = train_df), + stats::glm(formula_factor, data = train_df), + stats::glm(formula_binary_numeric, data = train_df, family = "binomial"), + stats::glm(formula_binary_factor, data = train_df, family = "binomial") + ) + l_message <- list() + + + if (requireNamespace("mgcv", quietly = TRUE)) { + l_silent[[length(l_silent) + 1]] <- mgcv::gam(formula_numeric, data = train_df) + l_silent[[length(l_silent) + 1]] <- mgcv::gam(formula_factor, data = train_df) + l_silent[[length(l_silent) + 1]] <- mgcv::gam(formula_binary_numeric, data = train_df, family = "binomial") + l_silent[[length(l_silent) + 1]] <- mgcv::gam(formula_binary_factor, data = train_df, family = "binomial") } - # Errors - l <- list( - ranger::ranger( - str_formula, - data = train_df - ), - xgboost::xgboost( - data = as.matrix(x_train), - label = as.integer(y_train) - 1, - nrounds = 2, + if (requireNamespace("xgboost", quietly = TRUE)) { + l_silent[[length(l_silent) + 1]] <- xgboost::xgboost(data = dummylist$train_dummies, label = y_train, + nrounds = 3, verbose = FALSE) + l_silent[[length(l_silent)]]$feature_list <- dummylist$feature_list + + l_silent[[length(l_silent) + 1]] <- xgboost::xgboost( + data = dummylist$train_dummies, + label = as.integer(y_train_binary)-1, + nrounds = 3, verbose = FALSE, - objective = "reg:logistic" - ) - ) + objective = "binary:logistic", + eval_metric = "error" + ) + l_silent[[length(l_silent)]]$feature_list <- dummylist$feature_list - # Tests - for (i in seq_along(l)) { + l_message[[length(l_message) + 1]] <- xgboost::xgboost(data = as.matrix(x_train[, x_var_numeric]), + label = y_train, nrounds = 3, verbose = FALSE) + } - # Input equals data.frame - expect_error( - predict_model(l[[i]], x_test) - ) + if (requireNamespace("ranger", quietly = TRUE)) { + l_message[[length(l_message) + 1]] <- ranger::ranger(formula_numeric, data = train_df) + l_message[[length(l_message) + 1]] <- ranger::ranger(formula_factor, data = train_df) + l_message[[length(l_message) + 1]] <- ranger::ranger(formula_binary_numeric, data = train_df, probability = TRUE) + l_message[[length(l_message) + 1]] <- ranger::ranger(formula_binary_factor, data = train_df, probability = TRUE) + } - # Input equals matrix - expect_error( - predict_model(l[[i]], as.matrix(x_test)) - ) + data_features <- get_data_specs(train_df) + for (i in seq_along(l_silent)) { + model_features <- get_model_specs(l_silent[[i]]) + expect_silent(check_features(model_features,data_features)) + } - # Check model type - expect_error( - model_type(l[[i]]) - ) + for (i in seq_along(l_message)) { + model_features <- get_model_specs(l_message[[i]]) + expect_message(check_features(model_features,data_features)) } -}) -test_that("Test predict_model (multi-classification)", { - # Data ----------- - data("iris", package = "datasets") - x_var <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") - y_var <- "Species" - x_train <- tail(iris[, x_var], -6) - y_train <- tail(iris[, y_var], -6) - x_test <- head(iris[, x_var], 6) - str_formula <- "y_train ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width" - train_df <- cbind(y_train, x_train) - - # List of models - l <- list( - ranger::ranger( - str_formula, - data = train_df - ), - ranger::ranger( - str_formula, - data = train_df, - probability = TRUE - ), - xgboost::xgboost( - as.matrix(x_train), - label = as.integer(y_train) - 1, - nrounds = 2, - verbose = FALSE, - objective = "multi:softprob", - eval_metric = "merror", - num_class = 3 - ), - xgboost::xgboost( - as.matrix(x_train), - label = as.integer(y_train) - 1, - nrounds = 2, - verbose = FALSE, - objective = "multi:softmax", - eval_metric = "merror", - num_class = 3 - ) - ) + # Checking all stops in check_features + data_features_ok <- get_data_specs(train_df) - # Tests - for (i in seq_along(l)) { + # Non-matching labels + data_features_error <- get_data_specs(train_df) + data_features_error$labels <- NULL + expect_error(check_features(data_features_ok,data_features_error)) + expect_message(check_features(data_features_error,data_features_ok,use_1_as_truth = T)) + expect_error(check_features(data_features_error,data_features_ok,use_1_as_truth = F)) - # Input equals data.frame - expect_error( - predict_model(l[[i]], x_test) - ) - # Input equals matrix - expect_error( - predict_model(l[[i]], as.matrix(x_test)) - ) + # Missing features + data_features_error <- get_data_specs(train_df[,-3]) + expect_error(check_features(data_features_ok,data_features_error)) + expect_error(check_features(data_features_error,data_features_ok,use_1_as_truth = F)) - # Check model type - expect_error( - model_type(l[[i]]) - ) - } -}) + # Duplicated column names + data_features_error <- get_data_specs(cbind(crim=train_df[,1],train_df)) + expect_error(check_features(data_features_error,data_features_error)) -test_that("Test features (regression)", { + # Empty column names + train_df_0 <- train_df + names(train_df_0)[1] = "" + data_features_error <- get_data_specs(train_df_0) + expect_error(check_features(data_features_error,data_features_error)) - # Data ----------- - data("Boston", package = "MASS") - x_var <- c("lstat", "rm", "dis", "indus") - y_var <- "medv" - x_train <- tail(Boston, -6) - y_train <- tail(Boston[, y_var], -6) - str_formula <- "y_train ~ lstat + rm + dis + indus" - train_df <- cbind(y_train, x_train) - - # List of models - l <- list( - stats::lm(str_formula, data = train_df), - stats::glm(str_formula, data = train_df), - ranger::ranger(str_formula, data = train_df), - xgboost::xgboost(data = as.matrix(x_train[, x_var]), label = y_train, nrounds = 3, verbose = FALSE), - mgcv::gam(as.formula(str_formula), data = train_df) - ) + # feature class is NA + data_features_error <- data_features_ok + data_features_error$classes <- rep(NA,length(data_features_error$classes)) + expect_message(check_features(data_features_error,data_features_ok)) - for (i in seq_along(l)) { - expect_equal(features(l[[i]], cnms = colnames(train_df)), x_var) - } + # feature classes are different + data_features_error <- data_features_ok + data_features_error$classes <- rev(data_features_error$classes) + names(data_features_error$classes) <- names(data_features_ok$classes) + expect_error(check_features(data_features_ok,data_features_error)) -}) + # invalid feature class + data_features_error <- data_features_ok + data_features_error$classes[1] <- "logical" + expect_error(check_features(data_features_error,data_features_error)) -test_that("Test features (binary classification)", { + # non-matching factor levels + data_features_error <- data_features_ok + data_features_error$factor_levels$chas <- c(data_features_error$factor_levels$chas,"2") + expect_error(check_features(data_features_ok,data_features_error)) - # Data ----------- - data("iris", package = "datasets") - x_var <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") - y_var <- "Species" - iris$Species <- as.character(iris$Species) - iris <- iris[which(iris$Species != "virginica"), ] - iris$Species <- as.factor(iris$Species) - x_train <- tail(iris, -6) - y_train <- tail(iris[, y_var], -6) - str_formula <- "y_train ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width" - train_df <- cbind(y_train, x_train) - - # List of models - l <- list( - suppressWarnings(stats::glm(str_formula, data = train_df, family = "binomial")), - suppressWarnings(mgcv::gam(as.formula(str_formula), data = train_df, family = "binomial")), - ranger::ranger(str_formula, data = train_df, probability = TRUE), - xgboost::xgboost( - data = as.matrix(x_train[, x_var]), - label = as.integer(y_train) - 1, - nrounds = 2, - verbose = FALSE, - objective = "binary:logistic", - eval_metric = "error", - ) - ) + #### Now turning to update_data tests #### - for (i in seq_along(l)) { - expect_equal(features(l[[i]], cnms = colnames(train_df)), x_var) - } + model_features_ok <- get_model_specs(l_silent[[2]]) + # Checking null output and message to remove features + train_dt <- as.data.table(train_df) + data_to_update <- copy(train_dt) + expect_message(expect_null(update_data(data_to_update,model_features_ok))) + # Checking that features are indeed removed + expect_equal(names(data_to_update),model_features_ok$labels) + + # Second call with same input should do nothing + expect_silent(expect_null(update_data(data_to_update,model_features_ok))) + + # Checking null output and message to shuffle factor levels + data_to_update_2 <- head(copy(train_dt),20) + data_to_update_2$rad <- droplevels(data_to_update_2$rad) + org_levels_rad <- levels(data_to_update_2$rad) + + expect_message(expect_null(update_data(data_to_update_2,model_features_ok))) + + # Checking that levels are indeed updated + expect_true(length(org_levels_rad) 0) { - S[i, features[[i]]] <- 1L + # Example ----------- + m <- 3 + n_combinations <- 2^m + mtcars <- mtcars[1:15, seq(m)] + ntrain <- 14 + xtrain <- mtcars[seq(ntrain), ] + xtest <- mtcars[-seq(ntrain), , drop = FALSE] + S <- matrix(0L, n_combinations, m) + features <- list( + integer(), 1, 2, 3, c(1, 2), c(1, 3), c(2, 3), c(1, 2, 3) + ) + for (i in seq_along(features)) { + feature_i <- features[[i]] + if (length(feature_i) > 0) { + S[i, features[[i]]] <- 1L + } } - } - # Tests (invalid input) ----------- - expect_error( - observation_impute_cpp( - index_xtrain = c(1, 2), - index_s = c(1, 2, 3), - xtrain = xtrain, - xtest = xtest, - S = S + # Tests (invalid input) ----------- + expect_error( + observation_impute_cpp( + index_xtrain = c(1, 2), + index_s = c(1, 2, 3), + xtrain = xtrain, + xtest = xtest, + S = S + ) + ) + expect_error( + observation_impute_cpp( + index_xtrain = c(1, 2), + index_s = c(2, 3), + xtrain = xtrain[, 1:2], + xtest = xtest, + S = S + ) ) - ) - expect_error( - observation_impute_cpp( - index_xtrain = c(1, 2), - index_s = c(2, 3), - xtrain = xtrain[, 1:2], + + # Tests (valid input) ----------- + index_xtrain <- c(1, 2) + index_s <- c(4, 5) + x <- observation_impute_cpp( + index_xtrain = index_xtrain, + index_s = index_s, + xtrain = xtrain, xtest = xtest, S = S ) - ) - - # Tests (valid input) ----------- - index_xtrain <- c(1, 2) - index_s <- c(4, 5) - x <- observation_impute_cpp( - index_xtrain = index_xtrain, - index_s = index_s, - xtrain = xtrain, - xtest = xtest, - S = S - ) - expect_equal(nrow(x), length(index_s)) - expect_equal(ncol(x), ncol(xtrain)) - expect_true(is.matrix(x)) - expect_true(is.double(x)) + expect_equal(nrow(x), length(index_s)) + expect_equal(ncol(x), ncol(xtrain)) + expect_true(is.matrix(x)) + expect_true(is.double(x)) - for (i in 1:nrow(x)) { - feature_i <- features[[index_s[i]]] + for (i in 1:nrow(x)) { + feature_i <- features[[index_s[i]]] - for (j in seq(m)) { + for (j in seq(m)) { - if (j %in% feature_i) { - expect_equal(x[i, j], unname(xtest[1, j])) - } else { - expect_equal(x[i, j], unname(xtrain[index_xtrain[i], j])) + if (j %in% feature_i) { + expect_equal(x[i, j], unname(xtest[1, j])) + } else { + expect_equal(x[i, j], unname(xtrain[index_xtrain[i], j])) + } } } } diff --git a/tests/testthat/test-src_weighted_matrix.R b/tests/testthat/test-src_weighted_matrix.R index b308f5db4..e11ad1a88 100644 --- a/tests/testthat/test-src_weighted_matrix.R +++ b/tests/testthat/test-src_weighted_matrix.R @@ -1,5 +1,3 @@ -library(shapr) - context("test-src_weighted_matrix.R") test_that("Test weight_matrix_cpp", { diff --git a/tests/testthat/test-transformation.R b/tests/testthat/test-transformation.R index 66aad20b7..af5ee06ed 100644 --- a/tests/testthat/test-transformation.R +++ b/tests/testthat/test-transformation.R @@ -1,5 +1,3 @@ -library(shapr) - context("test-transformation.R") test_that("Test inv_gaussian_transform", { diff --git a/tests/testthat/test_objects/explanation_explain_obj_list_no_ctree.rds b/tests/testthat/test_objects/explanation_explain_obj_list_no_ctree.rds new file mode 100644 index 000000000..35b895dda Binary files /dev/null and b/tests/testthat/test_objects/explanation_explain_obj_list_no_ctree.rds differ diff --git a/tests/testthat/test_objects/shapley_explainer_obj.rds b/tests/testthat/test_objects/shapley_explainer_obj.rds index ce0d7585e..ec938fd6c 100644 Binary files a/tests/testthat/test_objects/shapley_explainer_obj.rds and b/tests/testthat/test_objects/shapley_explainer_obj.rds differ diff --git a/vignettes/understanding_shapr.Rmd b/vignettes/understanding_shapr.Rmd index 595d97aa7..66c32edea 100644 --- a/vignettes/understanding_shapr.Rmd +++ b/vignettes/understanding_shapr.Rmd @@ -344,10 +344,10 @@ y_train <- Boston[-1:-6, y_var] x_test_cat <- Boston[1:6, x_var_cat] # -- special function when using categorical data + xgboost -make_dummies_list <- make_dummies(traindata = x_train_cat, testdata = x_test_cat) +dummylist <- make_dummies(traindata = x_train_cat, testdata = x_test_cat) -x_train_dummy <- make_dummies_list$train_dummies -x_test_dummy <- make_dummies_list$test_dummies +x_train_dummy <- dummylist$train_dummies +x_test_dummy <- dummylist$test_dummies # Fitting a basic xgboost model to the training data model_cat <- xgboost::xgboost( @@ -356,22 +356,22 @@ model_cat <- xgboost::xgboost( nround = 20, verbose = FALSE ) -model_cat$dummylist <- make_dummies_list$obj +model_cat$feature_list <- dummylist$feature_list -explainer_cat <- shapr(make_dummies_list$traindata_new, model_cat) +explainer_cat <- shapr(dummylist$traindata_new, model_cat) p <- mean(y_train) explanation_cat <- explain( - make_dummies_list$testdata_new, + dummylist$testdata_new, approach = "ctree", explainer = explainer_cat, prediction_zero = p ) -# Plot the resulting explanations for observations 1 and 6, excluding +# Plot the resulting explanations for observations 1 and 6, excluding # the no-covariate effect -plot(explanation_ctree, plot_phi0 = FALSE, index_x_test = c(1, 6)) +plot(explanation_cat, plot_phi0 = FALSE, index_x_test = c(1, 6)) ``` @@ -548,7 +548,7 @@ plot(explanation_categorical, plot_phi0 = FALSE, index_x_test = c(1, 6)) ## Explain custom models -`shapr` currently supports explanation of predictions from models fitted with the following functions: +`shapr` currently natively supports explanation of predictions from models fitted with the following functions: * `stats::lm` * `stats::glm` @@ -558,22 +558,38 @@ plot(explanation_categorical, plot_phi0 = FALSE, index_x_test = c(1, 6)) Any continuous response regression model or binary classification model of these model classes, can be explained with the package directly as exemplified above. -Moreover, essentially any feature dependent prediction model can be explained by the package by specifying two simple -additional functions to the class your model belongs to. -The first class function is `model_type`, taking the model object as input, and outputting a character indicating -whether the model is a (continuous response) `"regression"` model, or a (binary) `"classification"` model. -The second function is `predict_model`, taking the model and data (as a `matrix` or `data.frame`) as input and -outputting the corresponding prediction as a numeric vector. -Once these two class functions are created, one can explain predictions from this model class as before. -Below we exemplify this process by explaining a `gbm` model using the `gbm` package, fitted to the same Boston data -set as used above. +Moreover, essentially any feature dependent prediction model can be explained by the package by specifying two (or one) +simple additional functions to the class your model belongs to. + +*Note: The below procedure for specifying custom models was changed in shapr v0.2.0* +The first class function is `predict_model`, taking the model and data (as a `matrix` or `data.frame/data.table`) as +input and outputting the corresponding prediction as a numeric vector. +The second (optional, but highly recommended) class function is `get_model_specs`, taking the model as input and +outputting a list with the following elements: +*labels* (vector with the feature names to compute Shapley values for), +*classes* (a named vector with the labels as names and the class type as elements), +*factor_levels* (a named list with the labels as names and vectors with the factor levels as elements (NULL if the +feature is not a factor)). +The `get_model_specs` function is used to check that the format of the data passed to `shapr` and `explain` have the +correct format in terms of the necessary feature columns being available and having the correct class/attributes. +It is highly recommended to do such checks in order to ensure correct usage of `shapr` and `explain`. +If, for some reason, such checking is not desirable, one does not have to provide the `get_model_specs` function class. +This will, however, throw a warning that all feature consistency checking against the model is disabled. + +Once the above class functions are created, one can explain predictions from this model class as before. +These functions **can** be made general enough to handle all supported model types of that class, or they can be made +minimal, possibly only allowing explanation of the specific version of the model class at hand. +Below we give examples of both full support versions of these functions and a minimal version which skips the +`get_model_specs` function. +We do this for the `gbm` model class from the `gbm` package, fitted to the same Boston data set as used above. ```{r} library(gbm) -form <- as.formula(paste0(y_var, "~", paste0(x_var, collapse = "+"))) -xy_train <- data.frame(x_train, medv = y_train) +xy_train <- data.frame(x_train,medv = y_train) + +form <- as.formula(paste0(y_var,"~",paste0(x_var,collapse="+"))) # Fitting a gbm model set.seed(825) @@ -581,56 +597,73 @@ model <- gbm::gbm( form, data = xy_train, distribution = "gaussian" - ) +) -# Create custom function of model_type for gbm -model_type.gbm <- function(x) { - ifelse( - x$distribution$name %in% c("bernoulli", "adaboost"), - "classification", - "regression" - ) -} +#### Full feature versions of the three required model functions #### -# Create custom function of predict_model for gbm predict_model.gbm <- function(x, newdata) { - + if (!requireNamespace('gbm', quietly = TRUE)) { stop('The gbm package is required for predicting train models') } - model_type <- model_type(x) + model_type <- ifelse( + x$distribution$name %in% c("bernoulli","adaboost"), + "classification", + "regression" + ) if (model_type == "classification") { - predict(x, as.data.frame(newdata), type = "response", n.trees = x$n.trees) + predict(x, as.data.frame(newdata), type = "response",n.trees = x$n.trees) } else { - predict(x, as.data.frame(newdata), n.trees = x$n.trees) + predict(x, as.data.frame(newdata),n.trees = x$n.trees) } } +get_model_specs.gbm <- function(x){ + feature_list = list() + feature_list$labels <- labels(x$Terms) + m <- length(feature_list$labels) + + feature_list$classes <- attr(x$Terms,"dataClasses")[-1] + feature_list$factor_levels <- setNames(vector("list", m), feature_list$labels) + feature_list$factor_levels[feature_list$classes=="factor"] <- NA # the model object doesn't contain factor levels info + + return(feature_list) +} + # Prepare the data for explanation set.seed(123) -explainer <- shapr(xy_train, model, feature_labels = x_var) +explainer <- shapr(xy_train, model) +p0 <- mean(xy_train[,y_var]) +explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) +# Plot results +plot(explanation) -# Spedifying the phi_0, i.e. the expected prediction without any features -p0 <- mean(xy_train[, y_var]) +#### Minimal version of the three required model functions #### +# Note: Working only for this exact version of the model class +# Avoiding to define get_model_specs skips all feature +# consistency checking between your data and model + +# Removing the previously defined functions to simulate a fresh start +rm(predict_model.gbm) +rm(get_model_specs.gbm) + +predict_model.gbm <- function(x, newdata) { + predict(x, as.data.frame(newdata),n.trees = x$n.trees) +} + +# Prepare the data for explanation +set.seed(123) +explainer <- shapr(x_train, model) +p0 <- mean(xy_train[,y_var]) +explanation <- explain(x_test, explainer, approach = "empirical", prediction_zero = p0) +# Plot results +plot(explanation) -# Computing the actual Shapley values with kernelSHAP accounting for feature dependence using -# the empirical (conditional) distribution approach with -# bandwidth parameter sigma = 0.1 (default) -explanation <- explain( - x_test, - explainer, - approach = "empirical", - prediction_zero = p0 -) -# Plot the resulting explanations for observations 1 and 6, excluding -# the no-covariate effect. -plot(explanation_combined, plot_phi0 = FALSE, index_x_test = c(1, 6)) ``` -Note that this explains a different (but similar) model than the above xgboost model!