Merge branch 'main' into 0.7.X

DoubleML · Feb 2, 2024 · 46c6cba · 46c6cba
2 parents baa145e + 8431daf
commit 46c6cba
Show file tree

Hide file tree

Showing 46 changed files with 3,283 additions and 769 deletions.
diff --git a/.gitignore b/.gitignore
@@ -29,3 +29,4 @@ share/python-wheels/
 MANIFEST
 *.idea
 *.vscode
+.flake8
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 [![Conda Version](https://img.shields.io/conda/vn/conda-forge/doubleml.svg)](https://anaconda.org/conda-forge/doubleml)
 [![codecov](https://codecov.io/gh/DoubleML/doubleml-for-py/branch/main/graph/badge.svg?token=0BjlFPgdGk)](https://codecov.io/gh/DoubleML/doubleml-for-py)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/1c08ec7d782c451784293c996537de14)](https://www.codacy.com/gh/DoubleML/doubleml-for-py/dashboard?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=DoubleML/doubleml-for-py&amp;utm_campaign=Badge_Grade)
-[![Python version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue)](https://www.python.org/)
+[![Python version](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue)](https://www.python.org/)
 
 The Python package **DoubleML** provides an implementation of the double / debiased machine learning framework of
 [Chernozhukov et al. (2018)](https://doi.org/10.1111/ectj.12097).

diff --git a/doc/conf.py b/doc/conf.py
@@ -22,7 +22,7 @@
 author = 'Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M.'
 
 # The full version, including alpha/beta/rc tags
-release = '0.7.0'
+release = '0.7.1'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/doubleml/_utils.py b/doubleml/_utils.py
@@ -333,3 +333,9 @@ def _var_est(psi, psi_deriv, apply_cross_fitting, smpls, is_cluster_data,
     sigma2_hat = np.multiply(scaling, gamma_hat)
 
     return sigma2_hat, var_scaling_factor
+
+
+def _cond_targets(target, cond_sample):
+    """
+    Mask a target outside of a conditioning sample.
+
+    Parameters
+    ----------
+    target :
+        Object providing ``astype`` (e.g. a numpy array) holding the target values.
+        The masking is applied to the result of ``target.astype(float)``.
+    cond_sample :
+        Indicator passed to ``np.invert``; entries where it is not set are masked
+        (presumably a boolean array aligned with ``target`` -- confirm with callers).
+
+    Returns
+    -------
+    masked :
+        Float version of ``target`` with ``np.nan`` at all entries outside of ``cond_sample``.
+    """
+    masked = target.astype(float)
+    outside_sample = np.invert(cond_sample)
+    masked[outside_sample] = np.nan
+    return masked
diff --git a/doubleml/_utils_checks.py b/doubleml/_utils_checks.py
@@ -226,3 +226,53 @@ def _check_benchmarks(benchmarks):
                 raise TypeError('benchmarks name must be of string type. '
                                 f'{str(benchmarks["name"][i])} of type {str(type(benchmarks["name"][i]))} was passed.')
     return
+
+
+def _check_weights(weights, score, n_obs, n_rep):
+    """
+    Validate the ``weights`` argument of a weighted score.
+
+    Parameters
+    ----------
+    weights : None, numpy.ndarray or dict
+        ``None`` skips all checks. An array has to be one-dimensional of length ``n_obs``,
+        non-negative and not identically zero. A dictionary has to contain exactly the keys
+        ``'weights'`` (shape ``(n_obs,)``) and ``'weights_bar'`` (shape ``(n_obs, n_rep)``),
+        both non-negative and not identically zero.
+    score :
+        Score the weights are used with. For ``'ATTE'`` only binary arrays are accepted,
+        dictionaries are only accepted for ``'ATE'``.
+    n_obs : int
+        Number of observations.
+    n_rep : int
+        Number of repetitions (second dimension of ``weights_bar``).
+
+    Raises
+    ------
+    TypeError
+        If ``weights`` is neither an array nor a dictionary, or if a dictionary is combined
+        with a score other than ``'ATE'``.
+    ValueError
+        If shapes, keys, signs or the binary restriction for ``'ATTE'`` are violated.
+    """
+    if weights is None:
+        return
+
+    # check general type
+    if not isinstance(weights, (np.ndarray, dict)):
+        raise TypeError("weights must be a numpy array or dictionary. "
+                        f"weights of type {str(type(weights))} was passed.")
+
+    # check shape
+    if isinstance(weights, np.ndarray):
+        if (weights.ndim != 1) or weights.shape[0] != n_obs:
+            raise ValueError(f"weights must have shape ({n_obs},). "
+                             f"weights of shape {weights.shape} was passed.")
+        if not np.all(0 <= weights):
+            raise ValueError("All weights values must be greater or equal 0.")
+        if weights.sum() == 0:
+            raise ValueError("At least one weight must be non-zero.")
+
+    # check special form for ATTE score
+    if score == "ATTE":
+        if not isinstance(weights, np.ndarray):
+            raise TypeError("weights must be a numpy array for ATTE score. "
+                            f"weights of type {str(type(weights))} was passed.")
+
+        # w is binary iff w**2 == w holds elementwise
+        is_binary = np.all((np.power(weights, 2) - weights) == 0)
+        if not is_binary:
+            raise ValueError("weights must be binary for ATTE score.")
+
+    # check general form for ATE score
+    if isinstance(weights, dict):
+        # explicit raise instead of an assert: asserts are removed under ``python -O`` and
+        # a dictionary would then be silently accepted for unsupported scores
+        if score != "ATE":
+            raise TypeError("weights must be a numpy array for scores other than ATE. "
+                            f"weights of type {str(type(weights))} was passed.")
+
+        expected_keys = ["weights", "weights_bar"]
+        if not set(weights.keys()) == set(expected_keys):
+            raise ValueError(f"weights must have keys {expected_keys}. "
+                             f"keys {str(weights.keys())} were passed.")
+
+        expected_shapes = [(n_obs,), (n_obs, n_rep)]
+        if weights["weights"].shape != expected_shapes[0]:
+            raise ValueError(f"weights must have shape {expected_shapes[0]}. "
+                             f"weights of shape {weights['weights'].shape} was passed.")
+        if weights["weights_bar"].shape != expected_shapes[1]:
+            raise ValueError(f"weights_bar must have shape {expected_shapes[1]}. "
+                             f"weights_bar of shape {weights['weights_bar'].shape} was passed.")
+        if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)):
+            raise ValueError("All weights values must be greater or equal 0.")
+        if (weights["weights"].sum() == 0) or (weights["weights_bar"].sum() == 0):
+            raise ValueError("At least one weight must be non-zero.")
+    return
diff --git a/doubleml/_utils_resampling.py b/doubleml/_utils_resampling.py
@@ -4,6 +4,15 @@
 from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold
 
 
+# Remove warnings in future versions
+def deprication_apply_cross_fitting():
+    """Issue the ``DeprecationWarning`` announcing the removal of the ``apply_cross_fitting`` argument."""
+    message = ('The apply_cross_fitting argument is deprecated and will be removed in future versions. '
+               'In the future, crossfitting is applied by default. '
+               'To rely on sample splitting please use external predictions.')
+    warnings.warn(message, category=DeprecationWarning)
+
+
 class DoubleMLResampling:
     def __init__(self,
                  n_folds,
@@ -14,6 +23,8 @@ def __init__(self,
         self.n_folds = n_folds
         self.n_rep = n_rep
         self.n_obs = n_obs
+        if not apply_cross_fitting:
+            deprication_apply_cross_fitting()
         self.apply_cross_fitting = apply_cross_fitting
         self.stratify = stratify
         if (self.n_folds == 1) & self.apply_cross_fitting:

diff --git a/doubleml/datasets.py b/doubleml/datasets.py
@@ -1228,3 +1228,120 @@ def f_g(beta_a):
                 'oracle_values': oracle_values}
 
     return res_dict
+
+
+def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False):
+    """
+    Generates a simple synthetic data set with heterogeneous treatment effects.
+    The data generating process follows the Monte Carlo simulation of Oprescu et al. (2019).
+
+    Outcome and treatment are generated as
+
+    .. math::
+
+        Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i
+
+        D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i,
+
+    with :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i
+    \\sim\\mathcal{U}[-1,1]`.
+    For a binary treatment, the treatment is generated as
+
+    .. math::
+        D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}.
+
+    The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` share the same small random support,
+    whose entries are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`,
+    respectively.
+    The conditional treatment effect :math:`\\theta_0(x)` depends on the dimension of the heterogeneity.
+
+    For univariate heterogeneity the conditional treatment effect is
+
+    .. math::
+            \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0),
+
+    whereas in the two-dimensional case it is
+
+    .. math::
+        \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1).
+
+    Parameters
+    ----------
+    n_obs : int
+        Number of observations to simulate.
+        Default is ``200``.
+
+    p : int
+        Dimension of covariates.
+        Default is ``30``.
+
+    support_size : int
+        Number of relevant (confounding) covariates. Must not exceed ``p``.
+        Default is ``5``.
+
+    n_x : int
+        Dimension of the heterogeneity. Can be either ``1`` or ``2``.
+        Default is ``1``.
+
+    binary_treatment : bool
+        Indicates whether the treatment is binary.
+        Default is ``False``.
+
+    Returns
+    -------
+    res_dict : dictionary
+       Dictionary with entries ``data`` (data frame with columns ``y``, ``d`` and ``X_0``, ..., ``X_{p-1}``),
+       ``effects`` (conditional treatment effect evaluated at the simulated covariates) and
+       ``treatment_effect`` (the function :math:`\\theta_0`).
+
+    """
+    # simple input checks
+    assert n_x in [1, 2], 'n_x must be either 1 or 2.'
+    assert support_size <= p, 'support_size must be smaller than p.'
+    assert isinstance(binary_treatment, bool), 'binary_treatment must be a boolean.'
+
+    # the sine term depends on the first covariate for univariate heterogeneity, on the second one otherwise
+    sine_column = 0 if n_x == 1 else 1
+
+    def treatment_effect(x):
+        return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, sine_column])
+
+    # common support of outcome and treatment equation with separate coefficients
+    # (the order of the random draws is kept fixed to obtain reproducible results for a given seed)
+    support = np.random.choice(np.arange(p), size=support_size, replace=False)
+    gamma = np.random.uniform(0, 1, size=support_size)
+    beta = np.random.uniform(0, 0.3, size=support_size)
+
+    # noise
+    epsilon = np.random.uniform(-1, 1, size=n_obs)
+    eta = np.random.uniform(-1, 1, size=n_obs)
+
+    # covariates and heterogeneous treatment effects
+    x = np.random.uniform(0, 1, size=(n_obs, p))
+    te = treatment_effect(x)
+
+    # treatment and outcome
+    x_relevant = x[:, support]
+    d_index = np.dot(x_relevant, beta)
+    if binary_treatment:
+        d = 1.0 * (d_index >= eta)
+    else:
+        d = d_index + eta
+    y = te * d + np.dot(x_relevant, gamma) + epsilon
+
+    # collect outcome, treatment and covariates in a single data frame
+    x_df = pd.DataFrame(
+        data=x,
+        index=np.arange(x.shape[0]),
+        columns=[f'X_{i}' for i in range(x.shape[1])]
+    )
+    data = pd.concat([pd.DataFrame({'y': y}), pd.DataFrame({'d': d}), x_df], axis=1)
+
+    return {
+        'data': data,
+        'effects': te,
+        'treatment_effect': treatment_effect}