DoubleML · PhilippBach · Jul 20, 2021 · Jul 20, 2021 · Jul 20, 2021 · Jul 21, 2021
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,7 @@ export(fetch_401k)
 export(fetch_bonus)
 export(make_iivm_data)
 export(make_irm_data)
+export(make_pliv_BCCH2012)
 export(make_pliv_CHS2015)
 export(make_pliv_multiway_cluster_CKMS2021)
 export(make_plr_CCDDHNR2018)

diff --git a/R/datasets.R b/R/datasets.R
@@ -552,6 +552,155 @@ make_pliv_CHS2015 = function(n_obs, alpha = 1, dim_x = 200, dim_z = 150,
   return(data)
 }
 
+#' @title Generates data from a partially linear IV regression model used in
+#' Belloni et al. (2012).
+#'
+#' @description
+#' Generates data from a linear IV regression model used in
+#' Belloni et al. (2012). The data generating process
+#' is defined as
+#'
+#' \eqn{y_i = \beta d_i + e_i,}
+#'
+#' \eqn{d_i = z_i'\Pi + v_i,}
+#'
+#' with i.i.d.
+#'
+#' \eqn{(e_i, v_i)  \sim  \mathcal{N} \left(0, \left( \begin{array}{cc}
+#' \sigma^2_e & \sigma_{ev} \\ \sigma_{ev} & \sigma^2_v\end{array}
+#' \right) \right),}
+#'
+#' with \eqn{\beta} being the parameter of interest and
+#' \eqn{\Pi = C \cdot \left(\pi_0^0, \pi_0^1, \pi_0^2 \ldots, \pi_0^{p_z - 1}
+#' \right)}, instrumental variables \eqn{z_i = (z_{i1}, \ldots, z_{ip_z})} drawn
+#' from a normal distribution \eqn{N(0,\Sigma_Z)} with covariance matrix
+#' \eqn{\Sigma_Z} and \eqn{E[z^2_{ih}]=\sigma^2_z} and \eqn{Corr(z_{ih},
+#' z_{ij})=\rho^{|j-h|}}.
+#' A positive sparsity index `s` sets all coefficients in \eqn{\Pi} with index
+#' \eqn{j>s} exactly to zero (`s = 0` keeps all coefficients), i.e.,
+#' \eqn{\Pi = C \cdot \left(\pi_0^0, \pi_0^1, \pi_0^2 \ldots, \pi_0^{s-1}, 0,
+#' \ldots , 0 \right)}. The constant \eqn{C} is calibrated internally such that
+#' the concentration parameter \eqn{\mu^2} is set to a specific value specified
+#' via `mu2`.
+#'
+#' Default values are set to \eqn{\rho = 0.5}, \eqn{\sigma^2_e = 1},
+#' \eqn{\sigma^2_z = 1} and \eqn{Corr(e,v) = 0.6}. For the coefficient vectors
+#' defaults are set such that \eqn{\beta = 1}, \eqn{\mu^2 = 30} and
+#' \eqn{\pi_0 = 0.7}.
+#'
+#' @references Belloni, A., Chen, D., Chernozhukov, V., and Hansen, C. (2012),
+#' Sparse Models and Methods for Optimal Instruments with an Application to
+#' Eminent Domain. Econometrica, 80 (6): 2369-2429.
+#'
+#' @param n_obs (`integer(1)`) \cr
+#' The number of observations to simulate.
+#'
+#' @param beta (`numeric(1)`) \cr
+#' The value of the causal parameter.
+#'
+#' @param dim_z (`integer(1)`) \cr
+#' The number of instruments.
+#'
+#' @param pi_0 (`numeric(1)`) \cr
+#' Base \eqn{\pi_0} of the geometrically decaying first-stage coefficients.
+#'
+#' @param s (`integer(1)`) \cr
+#' Sparsity index.
+#'
+#' @param mu2 (`numeric(1)`) \cr
+#' Value of concentration parameter used for calibration of constant \eqn{C}.
+#'
+#' @param rho (`numeric(1)`) \cr
+#' Coefficient determining correlation between instruments.
+#'
+#' @param sigma_z (`numeric(1)`) \cr
+#' Standard deviation of instruments.
+#'
+#' @param corr (`numeric(1)`) \cr
+#' Correlation between errors \eqn{e} and \eqn{v}.
+#'
+#' @param sigma_e (`numeric(1)`) \cr
+#' Standard deviation for error \eqn{e}.
+#'
+#' @param return_type (`character(1)`) \cr
+#' If `"DoubleMLData"`, returns a `DoubleMLData` object.
+#' If `"data.frame"` returns a `data.frame()`.
+#' If `"data.table"` returns a `data.table()`.
+#' If `"matrix"` a named `list()` with entries `y`, `d` and
+#' `z` is returned.
+#' Every entry in the list is a `matrix()` object.  Default is `"DoubleMLData"`.
+#'
+#' @return A data object according to the choice of `return_type`.
+#'
+#' @export
+make_pliv_BCCH2012 = function(n_obs = 100, beta = 1, dim_z = 100, pi_0 = 0.7,
+  s = 0, mu2 = 30,
+  rho = 0.5, sigma_z = 1,
+  corr = 0.6, sigma_e = 1,
+  return_type = "DoubleMLData") {
+  # based on https://www.econometricsociety.org/content/supplement-sparse-models-and-methods-optimal-instruments-application-eminent-domain-1 and
+  # http://qed.econ.queensu.ca/jae/datasets/spindler001/
+
+  assert_count(n_obs)
+  assert_numeric(beta, len = 1)
+  assert_count(dim_z, positive = TRUE)
+  assert_numeric(pi_0, len = 1)
+  assert_count(s, positive = FALSE)
+  assert_numeric(mu2, len = 1)
+  assert_numeric(rho, len = 1)
+  assert_numeric(sigma_z, len = 1)
+  assert_numeric(corr, len = 1)
+  assert_numeric(sigma_e, len = 1)
+  assert_choice(
+    return_type,
+    c("data.table", "matrix", "data.frame", "DoubleMLData"))
+
+  # Instrument covariance: Toeplitz correlation rho^|j-h| scaled by sigma_z^2.
+  # A separate name keeps the `sigma_z` argument from being overwritten.
+  cov_z = sigma_z^2 * toeplitz(rho^(0:(dim_z - 1)))
+  z = rmvnorm(n = n_obs, mean = rep(0, dim_z), sigma = cov_z)
+  pi = pi_0^(0:(dim_z - 1))
+
+  # Calibrate C (`scale`) from mu2 using the dense coefficient vector and
+  # choose sigma_v such that scale^2 * pi' cov_z pi + sigma_v^2 = 1.
+  pi_quad = c(t(pi) %*% cov_z %*% pi)
+  scale = sqrt(mu2 / ((n_obs + mu2) * pi_quad))
+  sigma_v = sqrt(1 - (scale^2) * pi_quad)
+  sev = corr * sigma_e * sigma_v
+
+  sigma_e_v = matrix(c(sigma_e^2, sev, sev, sigma_v^2), ncol = 2)
+  e_v = rmvnorm(n = n_obs, mean = rep(0, 2), sigma = sigma_e_v)
+  e = e_v[, 1]
+  v = e_v[, 2]
+
+  # Sparsity is applied after calibration; for s >= dim_z there is nothing to
+  # zero out, and (s + 1):dim_z would count downwards and corrupt `pi`.
+  if (s > 0 && s < dim_z) {
+    pi[(s + 1):dim_z] = 0
+  }
+  d = scale * z %*% pi + v
+  y = beta * d + e
+
+  if (return_type == "matrix") {
+    return(list("y" = y, "d" = d, "z" = z))
+  }
+
+  colnames(z) = paste0("Z", seq_len(dim_z))
+  colnames(y) = "y"
+  colnames(d) = "d"
+
+  if (return_type == "data.frame") {
+    return(data.frame(y, d, z))
+  }
+  if (return_type == "data.table") {
+    return(data.table(y, d, z))
+  }
+  # Remaining choice (guaranteed by assert_choice above): "DoubleMLData".
+  dt = data.table(y, d, z)
+  return(DoubleMLData$new(dt,
+    y_col = "y", d_cols = "d", x_cols = NULL, z_cols = colnames(z)))
+}
+
 
 #' @title Generates data from a interactive regression (IRM) model.
 #'

diff --git a/R/double_ml_pliv.R b/R/double_ml_pliv.R
@@ -257,7 +257,6 @@ DoubleMLPLIV = R6Class("DoubleMLPLIV",
 
       return(res)
     },
-
     ml_nuisance_and_score_elements_partialX = function(smpls, ...) {
 
       g_hat = dml_cv_predict(self$learner$ml_g,
@@ -447,28 +446,74 @@ DoubleMLPLIV = R6Class("DoubleMLPLIV",
     ml_nuisance_and_score_elements_partialZ = function(smpls, ...) {
 
       # nuisance r
-
-      r_hat = dml_cv_predict(self$learner$ml_r,
-        c(
-          self$data$x_cols,
-          self$data$other_treat_cols,
-          self$data$z_cols),
-        self$data$treat_col,
-        self$data$data_model,
-        nuisance_id = "nuis_r",
-        smpls = smpls,
-        est_params = self$get_params("ml_r"),
-        return_train_preds = FALSE,
-        learner_class = private$learner_class$ml_r,
-        fold_specific_params = private$fold_specific_params)
-
       d = self$data$data_model[[self$data$treat_col]]
       y = self$data$data_model[[self$data$y_col]]
 
+      if (test_character(self$data$x_cols, len = 0)) {
+        r_hat = dml_cv_predict(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          self$data$treat_col,
+          self$data$data_model,
+          nuisance_id = "nuis_r",
+          smpls = smpls,
+          est_params = self$get_params("ml_r"),
+          return_train_preds = FALSE,
+          learner_class = private$learner_class$ml_r,
+          fold_specific_params =
+            private$fold_specific_params)
+      } else {
+        # Partial out Xs from y and d by using linear regression
+        task_part_y = initiate_task("lm_part_out_y", self$data$data_model,
+          target = self$data$y_col,
+          select_cols = c(
+            self$data$x_cols,
+            self$data$other_treat_cols),
+          "LearnerRegr")
+        learner_lm = LearnerRegrLM$new()
+        resampling_part_y = rsmp("insample")$instantiate(task_part_y)
+        r_part_y = resample(task_part_y, learner_lm, resampling_part_y,
+          store_models = TRUE)
+        y_tilde = y - as.data.table(r_part_y$prediction())$response
+
+        task_part_d = initiate_task("lm_part_out_d", self$data$data_model,
+          target = self$data$treat_col,
+          select_cols = c(
+            self$data$x_cols,
+            self$data$other_treat_cols),
+          "LearnerRegr")
+        resampling_part_d = rsmp("insample")$instantiate(task_part_d)
+        r_part_d = resample(task_part_d, learner_lm, resampling_part_d,
+          store_models = TRUE)
+        d_tilde = d - as.data.table(r_part_d$prediction())$response
+
+        data_aux = data.table(self$data$data_model, "d_tilde" = d_tilde)
+        r_hat = dml_cv_predict(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          "d_tilde",
+          data_aux,
+          nuisance_id = "nuis_r",
+          smpls = smpls,
+          est_params = self$get_params("ml_r"),
+          return_train_preds = FALSE,
+          learner_class = private$learner_class$ml_r,
+          fold_specific_params =
+            private$fold_specific_params)
+      }
       if (is.character(self$score)) {
         if (self$score == "partialling out") {
-          psi_a = -r_hat * d
-          psi_b = r_hat * y
+          if (test_character(self$data$x_cols, len = 0)) {
+            psi_a = -r_hat * d
+            psi_b = r_hat * y
+          } else {
+            psi_a = -r_hat * d_tilde
+            psi_b = r_hat * y_tilde
+          }
         }
         res = list(psi_a = psi_a, psi_b = psi_b)
       } else if (is.function(self$score)) {
@@ -656,27 +701,61 @@ DoubleMLPLIV = R6Class("DoubleMLPLIV",
           params = tuning_result_r$params))
       return(tuning_result)
     },
-
     ml_nuisance_tuning_partialZ = function(smpls, param_set,
       tune_settings, tune_on_folds, ...) {
-      if (!tune_on_folds) {
-        data_tune_list = list(self$data$data_model)
+      if (test_character(self$data$x_cols, len = 0)) {
+        if (!tune_on_folds) {
+          data_tune_list = list(self$data$data_model)
+        } else {
+          data_tune_list = lapply(
+            smpls$train_ids,
+            function(x) extract_training_data(self$data$data_model, x))
+        }
+        tuning_result_r = dml_tune(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          self$data$treat_col, data_tune_list,
+          nuisance_id = "nuis_r",
+          param_set$ml_r, tune_settings,
+          tune_settings$measure$ml_r,
+          private$learner_class$ml_r)
       } else {
-        data_tune_list = lapply(
-          smpls$train_ids,
-          function(x) extract_training_data(self$data$data_model, x))
-      }
+        # Partial out Xs from d by using linear regression
+        task_part_d = initiate_task("lm_part_out_d", self$data$data_model,
+          target = self$data$treat_col,
+          select_cols = c(
+            self$data$x_cols,
+            self$data$other_treat_cols),
+          "LearnerRegr")
+        resampling_part_d = rsmp("insample")$instantiate(task_part_d)
+        learner_lm = LearnerRegrLM$new()
+        r_part_d = resample(task_part_d, learner_lm, resampling_part_d,
+          store_models = TRUE)
+        d_tilde = self$data$data_model[[self$data$treat_col]] -
+          as.data.table(r_part_d$prediction())$response
+        data_aux = data.table(self$data$data_model, "d_tilde" = d_tilde)
 
-      tuning_result_r = dml_tune(self$learner$ml_r,
-        c(
-          self$data$x_cols,
-          self$data$other_treat_cols,
-          self$data$z_cols),
-        self$data$treat_col, data_tune_list,
-        nuisance_id = "nuis_r",
-        param_set$ml_r, tune_settings,
-        tune_settings$measure$ml_r,
-        private$learner_class$ml_r)
+        if (!tune_on_folds) {
+          data_tune_list = list(data_aux)
+        } else {
+          data_tune_list = lapply(
+            smpls$train_ids,
+            function(x) extract_training_data(data_aux, x))
+        }
+
+        tuning_result_r = dml_tune(self$learner$ml_r,
+          c(
+            self$data$x_cols,
+            self$data$other_treat_cols,
+            self$data$z_cols),
+          "d_tilde", data_tune_list,
+          nuisance_id = "nuis_r",
+          param_set$ml_r, tune_settings,
+          tune_settings$measure$ml_r,
+          private$learner_class$ml_r)
+      }
 
       tuning_result = list("ml_r" = list(tuning_result_r,
         params = tuning_result_r$params))

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -62,6 +62,7 @@ reference:
   - title: Datasets generators
     contents:
       - make_plr_CCDDHNR2018
+      - make_pliv_BCCH2012
       - make_pliv_CHS2015
       - make_irm_data
       - make_iivm_data