Skip to content

Commit

Permalink
Merge branch 'main' into 0.7.X
Browse files Browse the repository at this point in the history
  • Loading branch information
SvenKlaassen committed Feb 2, 2024
2 parents baa145e + 8431daf commit 46c6cba
Show file tree
Hide file tree
Showing 46 changed files with 3,283 additions and 769 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ share/python-wheels/
MANIFEST
*.idea
*.vscode
.flake8
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[![Conda Version](https://img.shields.io/conda/vn/conda-forge/doubleml.svg)](https://anaconda.org/conda-forge/doubleml)
[![codecov](https://codecov.io/gh/DoubleML/doubleml-for-py/branch/main/graph/badge.svg?token=0BjlFPgdGk)](https://codecov.io/gh/DoubleML/doubleml-for-py)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/1c08ec7d782c451784293c996537de14)](https://www.codacy.com/gh/DoubleML/doubleml-for-py/dashboard?utm_source=github.com&utm_medium=referral&utm_content=DoubleML/doubleml-for-py&utm_campaign=Badge_Grade)
[![Python version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue)](https://www.python.org/)
[![Python version](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue)](https://www.python.org/)

The Python package **DoubleML** provides an implementation of the double / debiased machine learning framework of
[Chernozhukov et al. (2018)](https://doi.org/10.1111/ectj.12097).
Expand Down
2 changes: 1 addition & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
author = 'Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M.'

# The full version, including alpha/beta/rc tags
release = '0.7.0'
release = '0.7.1'


# -- General configuration ---------------------------------------------------
Expand Down
6 changes: 6 additions & 0 deletions doubleml/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,9 @@ def _var_est(psi, psi_deriv, apply_cross_fitting, smpls, is_cluster_data,
sigma2_hat = np.multiply(scaling, gamma_hat)

return sigma2_hat, var_scaling_factor


def _cond_targets(target, cond_sample):
cond_target = target.astype(float)
cond_target[np.invert(cond_sample)] = np.nan
return cond_target
50 changes: 50 additions & 0 deletions doubleml/_utils_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,53 @@ def _check_benchmarks(benchmarks):
raise TypeError('benchmarks name must be of string type. '
f'{str(benchmarks["name"][i])} of type {str(type(benchmarks["name"][i]))} was passed.')
return


def _check_weights(weights, score, n_obs, n_rep):
if weights is not None:

# check general type
if (not isinstance(weights, np.ndarray)) and (not isinstance(weights, dict)):
raise TypeError("weights must be a numpy array or dictionary. "
f"weights of type {str(type(weights))} was passed.")

# check shape
if isinstance(weights, np.ndarray):
if (weights.ndim != 1) or weights.shape[0] != n_obs:
raise ValueError(f"weights must have shape ({n_obs},). "
f"weights of shape {weights.shape} was passed.")
if not np.all(0 <= weights):
raise ValueError("All weights values must be greater or equal 0.")
if weights.sum() == 0:
raise ValueError("At least one weight must be non-zero.")

# check special form for ATTE score
if score == "ATTE":
if not isinstance(weights, np.ndarray):
raise TypeError("weights must be a numpy array for ATTE score. "
f"weights of type {str(type(weights))} was passed.")

is_binary = np.all((np.power(weights, 2) - weights) == 0)
if not is_binary:
raise ValueError("weights must be binary for ATTE score.")

# check general form for ATE score
if isinstance(weights, dict):
assert score == "ATE"
expected_keys = ["weights", "weights_bar"]
if not set(weights.keys()) == set(expected_keys):
raise ValueError(f"weights must have keys {expected_keys}. "
f"keys {str(weights.keys())} were passed.")

expected_shapes = [(n_obs,), (n_obs, n_rep)]
if weights["weights"].shape != expected_shapes[0]:
raise ValueError(f"weights must have shape {expected_shapes[0]}. "
f"weights of shape {weights['weights'].shape} was passed.")
if weights["weights_bar"].shape != expected_shapes[1]:
raise ValueError(f"weights_bar must have shape {expected_shapes[1]}. "
f"weights_bar of shape {weights['weights_bar'].shape} was passed.")
if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)):
raise ValueError("All weights values must be greater or equal 0.")
if (weights["weights"].sum() == 0) or (weights["weights_bar"].sum() == 0):
raise ValueError("At least one weight must be non-zero.")
return
11 changes: 11 additions & 0 deletions doubleml/_utils_resampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@
from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold


# Remove warnings in future versions
def deprication_apply_cross_fitting():
    """Emit a ``DeprecationWarning`` about the ``apply_cross_fitting`` argument."""
    notice = (
        'The apply_cross_fitting argument is deprecated and will be removed in future versions. '
        'In the future, crossfitting is applied by default. '
        'To rely on sample splitting please use external predictions.'
    )
    warnings.warn(notice, category=DeprecationWarning)


class DoubleMLResampling:
def __init__(self,
n_folds,
Expand All @@ -14,6 +23,8 @@ def __init__(self,
self.n_folds = n_folds
self.n_rep = n_rep
self.n_obs = n_obs
if not apply_cross_fitting:
deprication_apply_cross_fitting()
self.apply_cross_fitting = apply_cross_fitting
self.stratify = stratify
if (self.n_folds == 1) & self.apply_cross_fitting:
Expand Down
117 changes: 117 additions & 0 deletions doubleml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1228,3 +1228,120 @@ def f_g(beta_a):
'oracle_values': oracle_values}

return res_dict


def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False):
    """
    Generate a simple synthetic data set with heterogeneous treatment effects.

    The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019).
    Outcome and treatment are generated as

    .. math::

        Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i

        D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i,

    with :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i\\sim\\mathcal{U}[-1,1]`.
    For a binary treatment, the treatment is instead generated as

    .. math::

        D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}.

    The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` share the same small random support;
    their non-zero entries are drawn independently from :math:`\\mathcal{U}[0,1]` and
    :math:`\\mathcal{U}[0,0.3]`, respectively.

    The conditional treatment effect :math:`\\theta_0(x)` depends on the dimension of the heterogeneity.
    In the univariate case it is

    .. math::

        \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0),

    and in the two-dimensional case

    .. math::

        \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1).

    Parameters
    ----------
    n_obs : int
        Number of observations to simulate.
        Default is ``200``.

    p : int
        Dimension of covariates.
        Default is ``30``.

    support_size : int
        Number of relevant (confounding) covariates.
        Default is ``5``.

    n_x : int
        Dimension of the heterogeneity. Can be either ``1`` or ``2``.
        Default is ``1``.

    binary_treatment : bool
        Indicates whether the treatment is binary.
        Default is ``False``.

    Returns
    -------
    res_dict : dictionary
        Dictionary with entries ``data`` (data frame with columns ``y``, ``d`` and ``X_0``, ...),
        ``effects`` (treatment effect evaluated at the simulated covariates) and
        ``treatment_effect`` (the function :math:`\\theta_0`).
    """
    # simple input checks
    assert n_x in [1, 2], 'n_x must be either 1 or 2.'
    assert support_size <= p, 'support_size must be smaller than p.'
    assert isinstance(binary_treatment, bool), 'binary_treatment must be a boolean.'

    # the sine term uses the first covariate for n_x == 1 and the second one for n_x == 2
    sine_col = 0 if n_x == 1 else 1

    def treatment_effect(x):
        return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, sine_col])

    # shared support of outcome and treatment equation, followed by the respective coefficients
    # (the order of the random draws is kept fixed so that seeded results do not change)
    support = np.random.choice(np.arange(p), size=support_size, replace=False)
    gamma = np.random.uniform(0, 1, size=support_size)
    beta = np.random.uniform(0, 0.3, size=support_size)

    # noise terms of outcome and treatment equation
    epsilon = np.random.uniform(-1, 1, size=n_obs)
    eta = np.random.uniform(-1, 1, size=n_obs)

    # covariates and the implied heterogeneous treatment effects
    x = np.random.uniform(0, 1, size=(n_obs, p))
    te = treatment_effect(x)

    # treatment: thresholded index in the binary case, index plus noise otherwise
    treatment_index = np.dot(x[:, support], beta)
    if binary_treatment:
        d = 1.0 * (treatment_index >= eta)
    else:
        d = treatment_index + eta

    # outcome
    y = te * d + np.dot(x[:, support], gamma) + epsilon

    # assemble the data set: outcome, treatment, covariates
    covariate_names = [f'X_{j}' for j in range(x.shape[1])]
    frames = [
        pd.DataFrame({'y': y}),
        pd.DataFrame({'d': d}),
        pd.DataFrame(data=x, index=np.arange(len(x)), columns=covariate_names),
    ]
    data = pd.concat(frames, axis=1)

    return {
        'data': data,
        'effects': te,
        'treatment_effect': treatment_effect}
Loading

0 comments on commit 46c6cba

Please sign in to comment.