Asymmetric causal Shapley values with adaptive sampling #400

Merged
merged 290 commits on Oct 13, 2024
Changes from 250 commits
Commits
290 commits
151bba7
lintr
martinju Aug 3, 2024
361d8f9
.
martinju Aug 3, 2024
12ca661
apply name changes to test files
martinju Aug 5, 2024
a5c666b
rename regular output name
martinju Aug 5, 2024
86fed31
adding setup adaptive ++
martinju Aug 5, 2024
410c05d
update regular tests
martinju Aug 5, 2024
ae29313
bugfix, improve printing and init adaptive tests
martinju Aug 6, 2024
34c0905
update test files
martinju Aug 6, 2024
970d08d
div
martinju Aug 6, 2024
4e319e7
remove timing arg and add hidden testing arg
martinju Aug 6, 2024
a81a22b
fixing broken testing objects after updates
martinju Aug 6, 2024
8a4f0db
update tests with testing = TRUE, and remove timing = FALSE
martinju Aug 6, 2024
bb4e385
rds files
martinju Aug 7, 2024
643e5f0
styler
martinju Aug 7, 2024
24ae4d4
[skip actions] .
martinju Aug 7, 2024
d0a1ad6
move functions to appropriate files
martinju Aug 7, 2024
854fee7
[skip actions] doc + temporary and hiddenly adding unique_sampling
martinju Aug 7, 2024
17c94ef
add timing + experiment with improved bootstrapping code
martinju Aug 8, 2024
c7f3e2b
[skip actions] fix non-unique sampling
martinju Aug 8, 2024
f95e7ef
init moving to max_n_combinations
martinju Aug 9, 2024
5c5f436
add feature_samples to iter_list in setup for convenience
martinju Aug 9, 2024
21c43dc
simplifying explain view + improve max_n_combinations sets and checks
martinju Aug 9, 2024
2e7e864
man
martinju Aug 9, 2024
699bde0
Merge commit '2e7e86450686f61d6f4c7f63ac87c5857ff0094e' into convergence
martinju Aug 9, 2024
c9b679e
.
martinju Aug 9, 2024
95dda97
[skip actions] remaining stuff of max_n_combinations. Works, i think
martinju Aug 9, 2024
01b017e
[skip actions] remaining stuff of max_n_combinations. Works, i think
martinju Aug 9, 2024
d296e87
new bootstrap introduced with tests
martinju Aug 9, 2024
6dbcaff
making tests work
martinju Aug 9, 2024
303e323
tests OK
martinju Aug 10, 2024
fb6d050
some more ok tests. Forecast dont work as of now
martinju Aug 10, 2024
db6c221
apply the feature_combination stuff also to groups
martinju Aug 10, 2024
63281a6
Not 100% sure this actually works as it should
martinju Aug 10, 2024
1d6fb63
add and fix group tests
martinju Aug 12, 2024
5bc4efe
new
martinju Aug 12, 2024
6086d2a
adaptive OK
martinju Aug 16, 2024
11df7de
all tests pass
martinju Aug 16, 2024
ca58ce4
styler
martinju Aug 16, 2024
6b4931d
man
martinju Aug 16, 2024
803a181
fix checks
martinju Aug 16, 2024
14f360e
Disable rcpp approx solve warnings
martinju Aug 16, 2024
38be193
temporary fix forecast (not adaptive yet)
martinju Aug 16, 2024
45a657e
tests
martinju Aug 16, 2024
28acb00
combinations -> coalitions and merging all features/groups-code
martinju Aug 16, 2024
886f93b
add features to coalition table for both features and groups
martinju Aug 19, 2024
7ba0408
bugfix groups + some plot test updates
martinju Aug 19, 2024
48d6b81
more fixing
martinju Aug 19, 2024
b9bd1fe
adaptive tests
martinju Aug 19, 2024
8b0bd2c
remaining tests
martinju Aug 19, 2024
8bdb6be
forcast tests (something is up with forecast grouping, though)
martinju Aug 20, 2024
3de91ff
[skip actions] style
martinju Aug 20, 2024
7d747b8
adding reweighting strategy on all cond
martinju Aug 21, 2024
8f46e38
[skip actions] add reweighting strategies + non-unique paired sampling
martinju Aug 23, 2024
9daf94e
n_samples -> n_MC_samples
martinju Aug 23, 2024
b4125d2
tests OK
martinju Aug 23, 2024
cd7b0a8
fix iterative with paired sampling
martinju Sep 5, 2024
e3373d3
update tests after bootstrap change + .Rprofile for smoother testing
martinju Sep 5, 2024
fd7734f
add intermediate saving
martinju Sep 6, 2024
9a35c14
working version of continue training
martinju Sep 6, 2024
a064b9c
moves prev_shapr_object handling to setup and add validity test
martinju Sep 6, 2024
e46822a
working
martinju Sep 6, 2024
55434c1
[skip actions] Working
martinju Sep 6, 2024
f4e7931
man + testthat
martinju Sep 9, 2024
7f18f91
new adaptive output testfiles
martinju Sep 9, 2024
681d197
Fix cutting of coalition list per horizon in ```shapley_setup_forecas…
jonlachmann Sep 10, 2024
ccc39af
Merge remote-tracking branch 'jonlachmann/convergence' into convergence
martinju Sep 11, 2024
23016dc
update OK forecast test files
martinju Sep 11, 2024
db7c15d
extra forecast test file update
martinju Sep 11, 2024
a4d02fb
add max_batch_size og min_n_batches
martinju Sep 19, 2024
c04571c
apply the new n_batches settings in practice
martinju Sep 19, 2024
2118826
remove all traces of n_batches in the older code
martinju Sep 26, 2024
5bee4b9
adaptive tests ok
martinju Sep 26, 2024
56fa77b
more tests
martinju Sep 26, 2024
cba572c
adding checks for adaptive argument formats
martinju Sep 27, 2024
1e481e2
update tests
martinju Sep 27, 2024
c780b70
regression
martinju Sep 27, 2024
33301c3
temporary disabling the forecast tests
martinju Sep 27, 2024
2e0e09d
new test files
martinju Sep 27, 2024
cbd01f4
move reweighting and set new defaults
martinju Sep 27, 2024
cc737aa
moving towards new defaults
martinju Sep 27, 2024
e76d068
adpative-output at least OK
martinju Sep 27, 2024
0e4ba15
[skip tests] new test files
martinju Sep 27, 2024
99864b9
[skip actions] other tests ok
martinju Sep 30, 2024
882ed16
Merge remote-tracking branch 'origin/convergence' into convergence
martinju Sep 30, 2024
75ae9a4
[skip actions] .
martinju Sep 30, 2024
92f45c1
[skip actions] documenting explain
martinju Sep 30, 2024
f7f5a49
more documentation
martinju Sep 30, 2024
9095c64
.
martinju Sep 30, 2024
d6bc603
[skip actions] Slight restructure + update of main vignette
martinju Sep 30, 2024
1789783
checks for the adaptive argument
martinju Oct 1, 2024
1a5d034
NSE warnings
martinju Oct 1, 2024
35cd82f
test updates
martinju Oct 1, 2024
59e1c23
man and zzz
martinju Oct 1, 2024
dadbce2
man + tests
martinju Oct 1, 2024
231c862
tmp
martinju Oct 1, 2024
08a9e87
deal with cont estimation for non-adaptive
martinju Oct 1, 2024
3627112
first vignette
martinju Oct 1, 2024
21a2c29
vaeac also need X in setup
martinju Oct 1, 2024
7cb617b
vaeac vignette works
martinju Oct 1, 2024
55f5219
[skip actions] init update of regression vignette
martinju Oct 1, 2024
f56eb49
+ regression
martinju Oct 1, 2024
f093fa0
fix docs
martinju Oct 1, 2024
3f222b3
style
martinju Oct 2, 2024
61336bf
linting
martinju Oct 2, 2024
a02c9c9
fix man
martinju Oct 2, 2024
0d96221
fix vignette
martinju Oct 2, 2024
91de6ec
remove (>= 3.0.0) for testthat for tesitng
martinju Oct 2, 2024
fde5a3e
Merge branch 'verbose' into convergence
martinju Oct 2, 2024
54aa94b
Merge branch 'convergence' into verbose
martinju Oct 2, 2024
92936a4
replacing the old verbose syntax in vaeac and regresion
martinju Oct 2, 2024
971637b
move everything to string verbose
martinju Oct 2, 2024
bb2092f
playing around with cli progress
martinju Oct 2, 2024
26c0cff
more testing
martinju Oct 3, 2024
851a2b6
more work
martinju Oct 3, 2024
06c878a
working OK for now
martinju Oct 3, 2024
d581017
more work
martinju Oct 3, 2024
b8f4331
separate regression done
martinju Oct 3, 2024
4108811
also regression_surrogate done
martinju Oct 3, 2024
8f89af3
fixed no testthat package?
martinju Oct 4, 2024
941b95e
consider myself done for now
martinju Oct 4, 2024
31a9203
vignettes
martinju Oct 4, 2024
996a261
testfile updates
martinju Oct 4, 2024
2676501
styler
martinju Oct 4, 2024
758725f
lint and some checks
martinju Oct 4, 2024
16ff5b8
Merge branch 'verbose' into convergence
martinju Oct 4, 2024
e91963e
hoping to avoid the missing testthat package error on GHA
martinju Oct 4, 2024
2e75796
Added the cpp files
LHBO Oct 4, 2024
332bbb4
Added all files to vignettes. Still need to make the vignette runnabl…
LHBO Oct 4, 2024
10c0b7f
Added all test files.
LHBO Oct 4, 2024
bc3931e
Added references
LHBO Oct 4, 2024
81c02aa
Added documentation to explain.
LHBO Oct 4, 2024
25e98e4
Updated Gaussian
LHBO Oct 5, 2024
ff0c435
Updated setup
LHBO Oct 5, 2024
e55cd94
Updated shapley setup. Need to discuss sampling with Martin.
LHBO Oct 5, 2024
a90a6d7
Added file with all the asymmetric and causal functions
LHBO Oct 5, 2024
bb87601
Updated copula
LHBO Oct 5, 2024
6fb3185
Typos in gaussian
LHBO Oct 5, 2024
e4c93bb
Typo in documentation of categorical
LHBO Oct 5, 2024
f358c75
Added categorical
LHBO Oct 5, 2024
0e0fbf0
Updated vaeac
LHBO Oct 5, 2024
8767548
Updated compute_vS to support stepwise causal sampling
LHBO Oct 5, 2024
4fb2491
Added todo comment
LHBO Oct 5, 2024
47f05b3
Forgot to add `causal_sampling` to compute_vS
LHBO Oct 5, 2024
eb96c45
combination -> coalition typos in gaussian and copula
LHBO Oct 5, 2024
fc72cd2
stylr
LHBO Oct 5, 2024
6a41efb
Updated cli with basic information about asymmetric and causal Shaple…
LHBO Oct 5, 2024
70023a1
combination -> coalition
LHBO Oct 6, 2024
d345d30
combination -> coalition, and n_samples -> n_MC_samples
LHBO Oct 6, 2024
ad9e589
n_samples -> n_MC_samples
LHBO Oct 6, 2024
4e7c2a6
Removed `vS_details` verbose from asymmetric/causal Shapley values af…
LHBO Oct 6, 2024
2975b05
Added variable so that error messages distinguish between feature-wis…
LHBO Oct 6, 2024
68adb8d
Updated the setup testfiles and ran them
LHBO Oct 6, 2024
ac75a85
n_samples -> n_MC_samples in C++ for consistency
LHBO Oct 6, 2024
ccc6f15
Bike dataset for the vignette. Copied from Heskes PR.
LHBO Oct 6, 2024
14d8c5c
Simulation study justifying chaning the categorical prepare data func…
LHBO Oct 6, 2024
5a2efce
File where (symmetric conditional) regular and causal Shapley values …
LHBO Oct 6, 2024
bdf892f
File used to compare with Heskes implementation on https://gitlab.sci…
LHBO Oct 6, 2024
42f01bc
styler
LHBO Oct 6, 2024
496f800
updated test output file
LHBO Oct 6, 2024
ad38546
Updated the plot functions to support plotting the average feature va…
LHBO Oct 6, 2024
a0c20cc
Update variable name
LHBO Oct 6, 2024
bb361ec
Update to adaptive
LHBO Oct 6, 2024
308b32f
Delete not needed functions
LHBO Oct 7, 2024
f973dcf
Name change: legit -> valid.
LHBO Oct 7, 2024
58b7ece
Changed so valid coalitions are now a data table
LHBO Oct 7, 2024
0bad63e
Add test for unique sampling. Add suport for sampling of causal coali…
LHBO Oct 7, 2024
62ae312
fixes to causal
LHBO Oct 7, 2024
4544abc
Plot updates
LHBO Oct 7, 2024
7220e51
Restructures tests
LHBO Oct 7, 2024
a0cb6ee
Updates to vignette
LHBO Oct 7, 2024
3f6d77d
Accept changes in files I have not touched
LHBO Oct 7, 2024
65cd09e
timing
LHBO Oct 7, 2024
ce1b679
documentation
LHBO Oct 7, 2024
54c738a
explain_forecast
LHBO Oct 7, 2024
f4d15ae
compute_estimates
LHBO Oct 7, 2024
190807b
shapr-package
LHBO Oct 7, 2024
ef62b6e
REFERENCES
LHBO Oct 7, 2024
fa4d95f
adaptive test snaps
LHBO Oct 7, 2024
9d7ea1d
vignette figures
LHBO Oct 7, 2024
5e7d585
Accept changes to vignettes
LHBO Oct 7, 2024
abffdf5
output snaps and tests
LHBO Oct 7, 2024
4e4b22a
approach timeseries
LHBO Oct 7, 2024
217c14f
check convergence
LHBO Oct 7, 2024
fbed1c3
print_iter
LHBO Oct 7, 2024
f815cd1
finalize explanation
LHBO Oct 7, 2024
8a341ed
Shapley setup
LHBO Oct 7, 2024
4372e9d
Explain
LHBO Oct 7, 2024
0c34b6f
VAEAC
LHBO Oct 7, 2024
3b19930
regression_separate
LHBO Oct 7, 2024
fa8fba1
Setup
LHBO Oct 7, 2024
4f7f1fc
typo
LHBO Oct 7, 2024
849edd0
new manuals
LHBO Oct 7, 2024
ad0fcb0
manuals
LHBO Oct 7, 2024
73e23ce
delete categorical old
LHBO Oct 7, 2024
b6aaa78
roxygen
LHBO Oct 7, 2024
1b80550
vaeac manuals
LHBO Oct 7, 2024
37abdf5
verbose
LHBO Oct 7, 2024
e8455d1
styler + lintr
LHBO Oct 7, 2024
70c4720
YAML since I do not have the rights to change it
LHBO Oct 7, 2024
53041a3
The same again
LHBO Oct 7, 2024
aa00e3d
Remove causal printouts
LHBO Oct 7, 2024
270a070
Test snap update
LHBO Oct 7, 2024
176a0d7
typo
LHBO Oct 7, 2024
e44cb9d
Test asym / caus output
LHBO Oct 7, 2024
fb3c0aa
vignette cache + logical error
LHBO Oct 7, 2024
9a5cbfd
To wide printout
LHBO Oct 7, 2024
1bb38ba
rerun test with new width output
LHBO Oct 7, 2024
26c99b5
redundant docu in copula
LHBO Oct 7, 2024
e67c5ad
rcpp update
LHBO Oct 7, 2024
5be36bb
built vignette
LHBO Oct 7, 2024
0c03611
man
martinju Oct 9, 2024
1ce2c6e
update vignettes
martinju Oct 9, 2024
bc490c9
update test files
martinju Oct 9, 2024
83f4eeb
check updates
martinju Oct 9, 2024
ac3810e
rename test files
martinju Oct 9, 2024
653b57e
avoid mvnfast dependency ++
martinju Oct 9, 2024
306ae41
remove sort_feature_list
martinju Oct 9, 2024
861f442
[skip actions] run on GHA
martinju Oct 9, 2024
6cf1c75
checks
martinju Oct 10, 2024
0b57d0f
styler and lint
martinju Oct 10, 2024
2698dd8
doc
martinju Oct 10, 2024
fb6f101
test updates
martinju Oct 10, 2024
d774ca2
remove check_coalitiotns_respect_order
martinju Oct 10, 2024
05f89c5
update test files
martinju Oct 10, 2024
f5c9276
dontrun internal examples functions
martinju Oct 10, 2024
6c5a197
Remove douplicates in finalize_explantaion.R
LHBO Oct 10, 2024
3fe8661
Increased readability in approach vaeac
LHBO Oct 10, 2024
236cecf
Delete old `batch_prepare_vS_MC_auxiliary` function and rename `batch…
LHBO Oct 10, 2024
d385d99
[skip actions] fixing GHA notes
martinju Oct 10, 2024
5204a15
Updated the format of the explain documentations, corrected some typo…
LHBO Oct 10, 2024
9ce8106
Refactored approach_gaussian.R for readability
LHBO Oct 10, 2024
5c6b135
Refactored approach_copula.R for readability
LHBO Oct 10, 2024
972c060
Removed todo note
LHBO Oct 10, 2024
ab81de8
Updated documentation. Some missing parameters
LHBO Oct 10, 2024
be26384
Removed check function not used
LHBO Oct 10, 2024
a9cef22
typo in explain
LHBO Oct 10, 2024
05d5ee8
typo in explain
LHBO Oct 10, 2024
58195b9
update documentation
LHBO Oct 10, 2024
b14129d
Forgot to update batch_prepare_vS_MC_auxiliary function call
LHBO Oct 10, 2024
ba7383a
n_MC_samples_updated -> n_MC_samples
LHBO Oct 10, 2024
32b0e2e
Update docu about dt_valid_causal_coalitions
LHBO Oct 10, 2024
789927c
documentation update
LHBO Oct 10, 2024
cf5596b
Merge Martin's updates into my branch
LHBO Oct 10, 2024
110c5de
updates to asym vignette
martinju Oct 10, 2024
6ec1c8f
Logical error in approach gaussiand and copula after refactoring. Ver…
LHBO Oct 11, 2024
4b4dad0
vignette
martinju Oct 11, 2024
503fb9b
Merge remote-tracking branch 'LHBO/CausalShapleyNew' into CausalShapl…
martinju Oct 11, 2024
0f9f668
.
martinju Oct 11, 2024
8c33ac1
fixed timing
martinju Oct 11, 2024
cbd2889
bugfix includegraphics
martinju Oct 11, 2024
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -80,8 +80,11 @@ export(plot_MSEv_eval_crit)
export(plot_SV_several_approaches)
export(predict_model)
export(prepare_data)
export(prepare_data_causal)
export(prepare_data_copula_cpp)
export(prepare_data_copula_cpp_caus)
export(prepare_data_gaussian_cpp)
export(prepare_data_gaussian_cpp_caus)
export(prepare_next_iteration)
export(print_iter)
export(regression.train_model)
@@ -136,5 +139,6 @@ importFrom(utils,capture.output)
importFrom(utils,head)
importFrom(utils,methods)
importFrom(utils,modifyList)
importFrom(utils,relist)
importFrom(utils,tail)
useDynLib(shapr, .registration = TRUE)
54 changes: 54 additions & 0 deletions R/RcppExports.R
@@ -138,6 +138,36 @@ prepare_data_copula_cpp <- function(MC_samples_mat, x_explain_mat, x_explain_gau
.Call(`_shapr_prepare_data_copula_cpp`, MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat)
}

#' Generate (Gaussian) Copula MC samples for the causal setup with a single MC sample for each explicand
#'
#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing samples from the
#' univariate standard normal. The i'th row will be applied to the i'th row in `x_explain_mat`.
#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations to
#' explain on the original scale. The MC sample for the i'th explicand is based on the i'th row in `MC_samples_mat`.
#' @param x_explain_gaussian_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the
#' observations to explain after being transformed using the Gaussian transform, i.e., the samples have been
#' transformed to a standardized normal distribution.
#' @param x_train_mat arma::mat. Matrix of dimension (`n_train`, `n_features`) containing the training observations.
#' @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
#' This is not a problem internally in shapr as the empty and grand coalitions are treated differently.
#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature after being transformed
#' using the Gaussian transform, i.e., the samples have been transformed to a standardized normal distribution.
#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
#' between all pairs of features after being transformed using the Gaussian transform, i.e., the samples have been
#' transformed to a standardized normal distribution.
#'
#' @return An arma::mat/2D array of dimension (`n_explain` * `n_coalitions`, `n_features`),
#' where the rows (n_explain * S_ind, n_explain * (S_ind + 1) - 1) contains the single
#' conditional Gaussian MC samples for each explicand and `S_ind` coalition.
#'
#' @export
#' @keywords internal
#' @author Lars Henry Berge Olsen
prepare_data_copula_cpp_caus <- function(MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat) {
.Call(`_shapr_prepare_data_copula_cpp_caus`, MC_samples_mat, x_explain_mat, x_explain_gaussian_mat, x_train_mat, S, mu, cov_mat)
}
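
A minimal usage sketch (not part of the diff): since `prepare_data_copula_cpp_caus()` is exported in the NAMESPACE above, it can be called directly with toy inputs to see the expected shapes. Everything below (data, seed, dimensions) is an illustrative assumption; in the package, `prepare_data.copula()` builds these matrices internally.

```r
library(shapr)
set.seed(123)
n_explain <- 3; n_train <- 50; n_features <- 4

# Toy data: pretend the Gaussian transform has already been applied,
# so the Gaussian-scale explicand matrix is simply the explicand matrix here.
x_train_mat <- matrix(rnorm(n_train * n_features), n_train, n_features)
x_explain_mat <- matrix(rnorm(n_explain * n_features), n_explain, n_features)
x_explain_gaussian_mat <- x_explain_mat
MC_samples_mat <- matrix(rnorm(n_explain * n_features), n_explain, n_features) # one N(0, 1) draw per explicand

S <- matrix(c(1, 0, 1, 0), nrow = 1) # a single coalition; neither empty nor grand
mu <- rep(0, n_features)
cov_mat <- diag(n_features)

dt <- prepare_data_copula_cpp_caus(
  MC_samples_mat = MC_samples_mat,
  x_explain_mat = x_explain_mat,
  x_explain_gaussian_mat = x_explain_gaussian_mat,
  x_train_mat = x_train_mat,
  S = S,
  mu = mu,
  cov_mat = cov_mat
)
dim(dt) # (n_explain * n_coalitions) x n_features = 3 x 4
```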

#' Generate Gaussian MC samples
#'
#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_MC_samples`, `n_features`) containing samples from the
@@ -162,6 +192,30 @@ prepare_data_gaussian_cpp <- function(MC_samples_mat, x_explain_mat, S, mu, cov_
.Call(`_shapr_prepare_data_gaussian_cpp`, MC_samples_mat, x_explain_mat, S, mu, cov_mat)
}

#' Generate Gaussian MC samples for the causal setup with a single MC sample for each explicand
#'
#' @param MC_samples_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing samples from the
#' univariate standard normal. The i'th row will be applied to the i'th row in `x_explain_mat`.
#' @param x_explain_mat arma::mat. Matrix of dimension (`n_explain`, `n_features`) containing the observations
#' to explain. The MC sample for the i'th explicand is based on the i'th row in `MC_samples_mat`
#' @param S arma::mat. Matrix of dimension (`n_coalitions`, `n_features`) containing binary representations of
#' the used coalitions. S cannot contain the empty or grand coalition, i.e., a row containing only zeros or ones.
#' This is not a problem internally in shapr as the empty and grand coalitions are treated differently.
#' @param mu arma::vec. Vector of length `n_features` containing the mean of each feature.
#' @param cov_mat arma::mat. Matrix of dimension (`n_features`, `n_features`) containing the pairwise covariance
#' between all pairs of features.
#'
#' @return An arma::mat/2D array of dimension (`n_explain` * `n_coalitions`, `n_features`),
#' where the rows (n_explain * S_ind, n_explain * (S_ind + 1) - 1) contains the single
#' conditional Gaussian MC samples for each explicand and `S_ind` coalition.
#'
#' @export
#' @keywords internal
#' @author Lars Henry Berge Olsen
prepare_data_gaussian_cpp_caus <- function(MC_samples_mat, x_explain_mat, S, mu, cov_mat) {
.Call(`_shapr_prepare_data_gaussian_cpp_caus`, MC_samples_mat, x_explain_mat, S, mu, cov_mat)
}
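
Similarly, a hedged sketch for the Gaussian counterpart (toy values and dimensions, not from the PR); the Gaussian approach's `prepare_data` method normally supplies these arguments:

```r
library(shapr)
set.seed(123)
n_explain <- 3; n_features <- 4

x_explain_mat <- matrix(rnorm(n_explain * n_features), n_explain, n_features)
MC_samples_mat <- matrix(rnorm(n_explain * n_features), n_explain, n_features) # one N(0, 1) draw per explicand
S <- matrix(c(1, 1, 0, 0), nrow = 1) # a single coalition; neither empty nor grand
mu <- rep(0, n_features)
cov_mat <- diag(n_features)

dt <- prepare_data_gaussian_cpp_caus(
  MC_samples_mat = MC_samples_mat,
  x_explain_mat = x_explain_mat,
  S = S,
  mu = mu,
  cov_mat = cov_mat
)
dim(dt) # (n_explain * n_coalitions) x n_features = 3 x 4
```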

#' (Generalized) Mahalanobis distance
#'
#' Used to get the Euclidean distance as well by setting \code{mcov} = \code{diag(m)}.
122 changes: 84 additions & 38 deletions R/approach_categorical.R
@@ -7,7 +7,7 @@
#'
#' @param categorical.epsilon Numeric value. (Optional)
#' If \code{joint_probability_dt} is not supplied, probabilities/frequencies are
#' estimated using `x_train`. If certain observations occur in `x_train` and NOT in `x_explain`,
#' estimated using `x_train`. If certain observations occur in `x_explain` and NOT in `x_train`,
#' then epsilon is used as the proportion of times that these observations occur in the training data.
#' In theory, this proportion should be zero, but this causes an error later in the Shapley computation.
#'
@@ -36,35 +44,44 @@ setup_approach.categorical <- function(internal,

# estimate joint_prob_dt if it is not passed to the function
if (is.null(joint_probability_dt)) {
# Get the frequency of the unique feature value combinations in the training data
joint_prob_dt0 <- x_train[, .N, eval(feature_names)]

explain_not_in_train <- data.table::setkeyv(data.table::setDT(x_explain), feature_names)[!x_train]
# Get the feature value combinations in the explicands that are NOT in the training data and their frequency
explain_not_in_train <- data.table::setkeyv(data.table::setDT(data.table::copy(x_explain)), feature_names)[!x_train]
N_explain_not_in_train <- nrow(unique(explain_not_in_train))

# Add these feature value combinations, and their corresponding frequency, to joint_prob_dt0
if (N_explain_not_in_train > 0) {
joint_prob_dt0 <- rbind(joint_prob_dt0, cbind(explain_not_in_train, N = categorical.epsilon))
}

# Compute the joint probability for each feature value combination
joint_prob_dt0[, joint_prob := N / .N]
joint_prob_dt0[, joint_prob := joint_prob / sum(joint_prob)]
data.table::setkeyv(joint_prob_dt0, feature_names)

# Remove the frequency column and add an id column
joint_probability_dt <- joint_prob_dt0[, N := NULL][, id_all := .I]
} else {
# The `joint_probability_dt` is passed to explain by the user, and we do some checks.
for (i in colnames(x_explain)) {
# Check that feature name is present
is_error <- !(i %in% names(joint_probability_dt))

if (is_error > 0) {
stop(paste0(i, " is in x_explain but not in joint_probability_dt."))
}

# Check that the feature has the same levels
is_error <- !all(levels(x_explain[[i]]) %in% levels(joint_probability_dt[[i]]))

if (is_error > 0) {
stop(paste0(i, " in x_explain has different factor levels than in joint_probability_dt."))
}
}

# Check that dt contains a `joint_prob` column, that all entries are probabilities between 0 and 1 (inclusive), and that they sum to 1.
is_error <- !("joint_prob" %in% names(joint_probability_dt)) |
!all(joint_probability_dt$joint_prob <= 1) |
!all(joint_probability_dt$joint_prob >= 0) |
@@ -76,9 +85,11 @@
sum(joint_prob) must equal to 1.')
}

# Add an id column
joint_probability_dt <- joint_probability_dt[, id_all := .I]
}

# Store the `joint_probability_dt` data table
internal$parameters$categorical.joint_prob_dt <- joint_probability_dt

return(internal)
@@ -90,24 +101,12 @@
#' @rdname prepare_data
#' @export
#' @keywords internal
#' @author Annabelle Redelmeier and Lars Henry Berge Olsen
prepare_data.categorical <- function(internal, index_features = NULL, ...) {
x_train <- internal$data$x_train
x_explain <- internal$data$x_explain

joint_probability_dt <- internal$parameters$categorical.joint_prob_dt

iter <- length(internal$iter_list)

X <- internal$iter_list[[iter]]$X
S <- internal$iter_list[[iter]]$S


if (is.null(index_features)) { # 2,3
features <- X$features # list of [1], [2], [2, 3]
} else {
features <- X$features[index_features] # list of [1],
# Use a faster function when index_features contains only a single coalition, as in causal Shapley values.
if (length(index_features) == 1) {
return(prepare_data_single_coalition(internal, index_features))
}
feature_names <- internal$parameters$feature_names

# 3 id columns: id, id_coalition, and id_all
# id: for each x_explain observation
@@ -116,19 +115,25 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
# the training data (not necessarily the ones in the explain data)


# Extract the needed objects/variables
x_explain <- internal$data$x_explain
joint_probability_dt <- internal$parameters$categorical.joint_prob_dt
feature_names <- internal$parameters$feature_names
feature_conditioned <- paste0(feature_names, "_conditioned")
feature_conditioned_id <- c(feature_conditioned, "id")

S_dt <- data.table::data.table(S)
# Extract from iterative list
iter <- length(internal$iter_list)
S <- internal$iter_list[[iter]]$S
S_dt <- data.table::data.table(S[index_features, , drop = FALSE])
S_dt[S_dt == 0] <- NA
S_dt[, id_coalition := seq_len(nrow(S_dt))]

S_dt[, id_coalition := index_features]
data.table::setnames(S_dt, c(feature_conditioned, "id_coalition"))

# (1) Compute marginal probabilities

# multiply table of probabilities nrow(S) times
joint_probability_mult <- joint_probability_dt[rep(id_all, nrow(S))]
# multiply table of probabilities length(index_features) times
joint_probability_mult <- joint_probability_dt[rep(id_all, length(index_features))]

data.table::setkeyv(joint_probability_mult, "id_all")
j_S_dt <- cbind(joint_probability_mult, S_dt) # combine joint probability and S matrix
@@ -156,14 +161,10 @@

cond_dt <- j_S_all_feat[marg_dt, on = feature_conditioned]
cond_dt[, cond_prob := joint_prob / marg_prob]
cond_dt[id_coalition == 1, marg_prob := 0]
cond_dt[id_coalition == 1, cond_prob := 1]

# check marginal probabilities
cond_dt_unique <- unique(cond_dt, by = feature_conditioned)
check <- cond_dt_unique[id_coalition != 1][, .(sum_prob = sum(marg_prob)),
by = "id_coalition"
][["sum_prob"]]
check <- cond_dt_unique[id_coalition != 1][, .(sum_prob = sum(marg_prob)), by = "id_coalition"][["sum_prob"]]
if (!all(round(check) == 1)) {
print("Warning - not all marginal probabilities sum to 1. There could be a problem
with the joint probabilities. Consider checking.")
Expand All @@ -181,9 +182,7 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
dt <- cond_dt[dt_explain_just_conditioned, on = feature_conditioned, allow.cartesian = TRUE]

# check conditional probabilities
check <- dt[id_coalition != 1][, .(sum_prob = sum(cond_prob)),
by = c("id_coalition", "id")
][["sum_prob"]]
check <- dt[id_coalition != 1][, .(sum_prob = sum(cond_prob)), by = c("id_coalition", "id")][["sum_prob"]]
if (!all(round(check) == 1)) {
print("Warning - not all conditional probabilities sum to 1. There could be a problem
with the joint probabilities. Consider checking.")
Expand All @@ -192,11 +191,58 @@ prepare_data.categorical <- function(internal, index_features = NULL, ...) {
setnames(dt, "cond_prob", "w")
data.table::setkeyv(dt, c("id_coalition", "id"))

# here we merge so that we only return the combinations found in our actual explain data
# this merge does not change the number of rows in dt
# dt <- merge(dt, x$X[, .(id_coalition, n_features)], by = "id_coalition")
# dt[n_features %in% c(0, ncol(x_explain)), w := 1.0]
dt[id_coalition %in% c(1, 2^ncol(x_explain)), w := 1.0]
ret_col <- c("id_coalition", "id", feature_names, "w")
return(dt[id_coalition %in% index_features, mget(ret_col)])
# Return the relevant columns
return(dt[, mget(c("id_coalition", "id", feature_names, "w"))])
}

#' Compute the conditional probabilities for a single coalition for the categorical approach
#'
#' The [shapr::prepare_data.categorical()] function is slow when evaluated for a single coalition.
#' This is a bottleneck for Causal Shapley values, which repeatedly call that function with single coalitions.
#'
#' @inheritParams default_doc
#'
#' @keywords internal
#' @author Lars Henry Berge Olsen
prepare_data_single_coalition <- function(internal, index_features) {
# if (length(index_features) != 1) stop("`index_features` must be single integer.")

# Extract the needed objects
x_explain <- internal$data$x_explain
feature_names <- internal$parameters$feature_names
joint_probability_dt <- internal$parameters$categorical.joint_prob_dt

# Extract from iterative list
iter <- length(internal$iter_list)
S <- internal$iter_list[[iter]]$S

# Add an id column to x_explain (copy as this changes `x_explain` outside the function)
x_explain_copy <- data.table::copy(x_explain)[, id := .I]

# Extract the feature names of the features we are to condition on
cond_cols <- feature_names[S[index_features, ] == 1]
cond_cols_with_id <- c("id", cond_cols)

# Extract the feature values to condition and including the id column
dt_conditional_feature_values <- x_explain_copy[, cond_cols_with_id, with = FALSE]

# Merge (right outer join) the joint_probability_dt data with the conditional feature values
results_id_coalition <- data.table::merge.data.table(joint_probability_dt,
dt_conditional_feature_values,
by = cond_cols,
allow.cartesian = TRUE
)

# Get the weights/conditional probabilities for each valid X_sbar conditioned on X_s for all explicands
results_id_coalition[, w := joint_prob / sum(joint_prob), by = id]
results_id_coalition[, c("id_all", "joint_prob") := NULL]

# Set the index_features to their correct value
results_id_coalition[, id_coalition := index_features]

# Set id_coalition and id to be the keys and the two first columns for consistency with other approaches
data.table::setkeyv(results_id_coalition, c("id_coalition", "id"))
data.table::setcolorder(results_id_coalition, c("id_coalition", "id", feature_names))

return(results_id_coalition)
}
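
To make the conditioning step in `prepare_data_single_coalition()` concrete, here is a small standalone data.table sketch of the same idea (toy data and hypothetical feature names, not from the PR): join the explicands' conditioned feature values onto a joint-probability table and renormalise the joint probabilities within each explicand.

```r
library(data.table)

# Toy joint probability table over two categorical features A and B
joint_probability_dt <- data.table(
  A = factor(c("a", "a", "b", "b")),
  B = factor(c("x", "y", "x", "y")),
  joint_prob = c(0.4, 0.1, 0.2, 0.3),
  id_all = 1:4
)

# Two explicands; the single coalition conditions on feature A only
x_explain <- data.table(A = factor(c("a", "b")), B = factor(c("y", "x")))[, id := .I]
cond_cols <- "A"

# Right outer join on the conditioned columns, then renormalise per explicand
res <- merge(joint_probability_dt, x_explain[, c("id", cond_cols), with = FALSE],
             by = cond_cols, allow.cartesian = TRUE)
res[, w := joint_prob / sum(joint_prob), by = id]
res[] # w holds P(B | A = observed value) for each explicand id
```
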
33 changes: 28 additions & 5 deletions R/approach_copula.R
@@ -57,18 +57,41 @@ prepare_data.copula <- function(internal, index_features, ...) {
copula.mu <- internal$parameters$copula.mu
copula.cov_mat <- internal$parameters$copula.cov_mat
copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian)
causal_sampling <- internal$parameters$causal_sampling

# Update the number of MC samples for causal Shapley values not in the first step
causal_first_step <- isTRUE(internal$parameters$causal_first_step) # Only set when called from `prepare_data_causal`
n_MC_samples_updated <- if (causal_sampling && !causal_first_step) n_explain else n_MC_samples

# Update the `copula.x_explain_gaussian_mat` for causal Shapley values not in the first step
if (causal_sampling && !causal_first_step) {
copula.x_explain_gaussian <- apply(
X = rbind(x_explain_mat, x_train_mat),
MARGIN = 2,
FUN = gaussian_transform_separate,
n_y = nrow(x_explain_mat)
)
if (is.null(dim(copula.x_explain_gaussian))) copula.x_explain_gaussian <- t(as.matrix(copula.x_explain_gaussian))
copula.x_explain_gaussian_mat <- as.matrix(copula.x_explain_gaussian)
}

iter <- length(internal$iter_list)

S <- internal$iter_list[[iter]]$S[index_features, , drop = FALSE]

# Generate the MC samples from N(0, 1)
MC_samples_mat <- matrix(rnorm(n_MC_samples * n_features), nrow = n_MC_samples, ncol = n_features)
MC_samples_mat <- matrix(rnorm(n_MC_samples_updated * n_features), nrow = n_MC_samples_updated, ncol = n_features)

# Determine which copula data generating function to use
prepare_data_copula <-
if (causal_sampling && !causal_first_step) prepare_data_copula_cpp_caus else prepare_data_copula_cpp

# Use C++ to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}), for all coalitions and explicands,
# and then transforming them back to the original scale using the inverse Gaussian transform in C++.
# The object `dt` is a 3D array of dimension (n_MC_samples, n_explain * n_coalitions, n_features).
dt <- prepare_data_copula_cpp(
# The `dt` object is a 3D array of dimension (n_MC_samples, n_explain * n_coalitions, n_features) for regular
# Shapley and in the first step for causal Shapley values. For later steps in the causal Shapley value framework,
# the `dt` object is a matrix of dimension (n_explain * n_coalitions, n_features).
dt <- prepare_data_copula(
MC_samples_mat = MC_samples_mat,
x_explain_mat = x_explain_mat,
x_explain_gaussian_mat = copula.x_explain_gaussian_mat,
@@ -78,8 +101,8 @@ cov_mat = copula.cov_mat
cov_mat = copula.cov_mat
)

# Reshape `dt` to a 2D array of dimension (n_MC_samples * n_explain * n_coalitions, n_features).
dim(dt) <- c(n_coalitions_now * n_explain * n_MC_samples, n_features)
# Reshape `dt` to a 2D array of dimension (n_MC_samples * n_explain * n_coalitions, n_features) when needed
if (!causal_sampling || causal_first_step) dim(dt) <- c(n_coalitions_now * n_explain * n_MC_samples, n_features)

# Convert to a data.table and add extra identification columns
dt <- data.table::as.data.table(dt)
Expand Down