From 9cbde04fac818b309dd07dabcfbd0fcb435511e9 Mon Sep 17 00:00:00 2001
From: Kin Ho Lo
Date: Sat, 6 Apr 2024 11:48:08 +0200
Subject: [PATCH 1/9] Enable external_predictions for short model in benchmarks

---
 doubleml/double_ml.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index b47a5ace..320320da 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -1735,7 +1735,7 @@ def sensitivity_plot(self, idx_treatment=0, value='theta', include_scenario=True
                                 fill=fill)
         return fig
 
-    def sensitivity_benchmark(self, benchmarking_set):
+    def sensitivity_benchmark(self, benchmarking_set, fit_args={}):
         """
         Computes a benchmark for a given set of features.
         Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates.
@@ -1757,12 +1757,15 @@ def sensitivity_benchmark(self, benchmarking_set):
         if not set(benchmarking_set) <= set(x_list_long):
             raise ValueError(f"benchmarking_set must be a subset of features {str(self._dml_data.x_cols)}. "
                              f'{str(benchmarking_set)} was passed.')
+        if not isinstance(fit_args, dict):
+            raise TypeError('fit_args must be a dict. '
+                            f'{str(fit_args)} of type {type(fit_args)} was passed.')
 
         # refit short form of the model
         x_list_short = [x for x in x_list_long if x not in benchmarking_set]
         dml_short = copy.deepcopy(self)
         dml_short._dml_data.x_cols = x_list_short
-        dml_short.fit()
+        dml_short.fit(**fit_args)
 
         benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short)
         df_benchmark = pd.DataFrame(benchmark_dict, index=self._dml_data.d_cols)

From 8b7d9380508e20a2e923903668eb4dbc04b6ced6 Mon Sep 17 00:00:00 2001
From: Kin Ho Lucien Lo
Date: Mon, 8 Apr 2024 14:29:49 +0200
Subject: [PATCH 2/9] Added unit test
 test_sensitivity_benchmark_external_prediction_exception in
 test_exceptions_ext_preds.py

---
 doubleml/tests/test_exceptions_ext_preds.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/doubleml/tests/test_exceptions_ext_preds.py b/doubleml/tests/test_exceptions_ext_preds.py
index 395d8bf5..d72cd45b 100644
--- a/doubleml/tests/test_exceptions_ext_preds.py
+++ b/doubleml/tests/test_exceptions_ext_preds.py
@@ -1,8 +1,10 @@
 import pytest
-from doubleml import DoubleMLCVAR, DoubleMLQTE, DoubleMLData
+from doubleml import DoubleMLCVAR, DoubleMLQTE, DoubleMLIRM, DoubleMLData
 from doubleml.datasets import make_irm_data
 from doubleml.utils import DMLDummyRegressor, DMLDummyClassifier
 
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+
 df_irm = make_irm_data(n_obs=10, dim_x=2, theta=0.5, return_type="DataFrame")
 ext_predictions = {"d": {}}
 
@@ -21,3 +23,13 @@ def test_qte_external_prediction_exception():
     with pytest.raises(NotImplementedError, match=msg):
         qte = DoubleMLQTE(DoubleMLData(df_irm, "y", "d"), DMLDummyClassifier(), DMLDummyClassifier())
         qte.fit(external_predictions=ext_predictions)
+
+@pytest.mark.ci
+def test_sensitivity_benchmark_external_prediction_exception():
+    msg = "fit_args must be a dict. "
+    with pytest.raises(TypeError, match=msg):
+        fit_args = []
+        irm = DoubleMLIRM(DoubleMLData(df_irm, "y", "d"), RandomForestRegressor(), RandomForestClassifier())
+        irm.fit()
+        irm.sensitivity_analysis()
+        irm.sensitivity_benchmark(benchmarking_set=["X1"], fit_args=fit_args)

From 519bae63d05b5935e48bc63f00d15e85b2b22a30 Mon Sep 17 00:00:00 2001
From: Kin Ho Lucien Lo
Date: Mon, 8 Apr 2024 23:19:59 +0200
Subject: [PATCH 3/9] Change default value of fit_args to be None

---
 doubleml/double_ml.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 320320da..8ea773a4 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -1735,7 +1735,7 @@ def sensitivity_plot(self, idx_treatment=0, value='theta', include_scenario=True
                                 fill=fill)
         return fig
 
-    def sensitivity_benchmark(self, benchmarking_set, fit_args={}):
+    def sensitivity_benchmark(self, benchmarking_set, fit_args=None):
         """
         Computes a benchmark for a given set of features.
         Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates.
@@ -1757,7 +1757,7 @@ def sensitivity_benchmark(self, benchmarking_set, fit_args={}):
         if not set(benchmarking_set) <= set(x_list_long):
             raise ValueError(f"benchmarking_set must be a subset of features {str(self._dml_data.x_cols)}. "
                              f'{str(benchmarking_set)} was passed.')
-        if not isinstance(fit_args, dict):
+        if fit_args is not None and not isinstance(fit_args, dict):
             raise TypeError('fit_args must be a dict. '
                             f'{str(fit_args)} of type {type(fit_args)} was passed.')
 
@@ -1765,7 +1765,10 @@ def sensitivity_benchmark(self, benchmarking_set, fit_args={}):
         x_list_short = [x for x in x_list_long if x not in benchmarking_set]
         dml_short = copy.deepcopy(self)
         dml_short._dml_data.x_cols = x_list_short
-        dml_short.fit(**fit_args)
+        if fit_args is not None:
+            dml_short.fit(**fit_args)
+        else:
+            dml_short.fit()
 
         benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short)
         df_benchmark = pd.DataFrame(benchmark_dict, index=self._dml_data.d_cols)

From d996fdad7e54bf368240ce0155f1e93796707449 Mon Sep 17 00:00:00 2001
From: Kin Ho Lucien Lo
Date: Tue, 9 Apr 2024 16:23:50 +0200
Subject: [PATCH 4/9] Added test_dml_benchmark_fixture in test_sensitivity.py

---
 doubleml/tests/test_sensitivity.py | 49 ++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py
index b1277b78..8bb73936 100644
--- a/doubleml/tests/test_sensitivity.py
+++ b/doubleml/tests/test_sensitivity.py
@@ -1,12 +1,21 @@
 import pytest
+import math
 import numpy as np
+import copy
 
 import doubleml as dml
+from doubleml import DoubleMLIRM, DoubleMLData
+from doubleml.datasets import make_irm_data
 from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 
 from ._utils_doubleml_sensitivity_manual import doubleml_sensitivity_manual, \
     doubleml_sensitivity_benchmark_manual
 
+@pytest.fixture(scope="module", params=[["X1"],["X2"],["X3"]])
+def benchmarking_set(request):
+    return request.param
+
 
 @pytest.fixture(scope='module',
                 params=[1, 3])
@@ -99,3 +108,43 @@ def test_dml_sensitivity_benchmark(dml_sensitivity_multitreat_fixture):
     assert all(dml_sensitivity_multitreat_fixture['benchmark'].index ==
                dml_sensitivity_multitreat_fixture['d_cols'])
     assert dml_sensitivity_multitreat_fixture['benchmark'].equals(dml_sensitivity_multitreat_fixture['benchmark_manual'])
+
+@pytest.fixture(scope="module")
+def test_dml_benchmark_fixture(benchmarking_set,n_rep):
+
+    random_state = 42
+
+    x, y, d = make_irm_data(n_obs=10, dim_x=5, theta=0.5, return_type="np.array")
+
+    classifier_class = RandomForestClassifier
+    regressor_class = RandomForestRegressor
+
+    np.random.seed(3141)
+    dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d)
+    x_list_long = copy.deepcopy(dml_data.x_cols)
+    dml_int = DoubleMLIRM(dml_data, ml_m=classifier_class(random_state=random_state), ml_g=regressor_class(random_state=random_state), n_folds=2)
+    dml_int.fit(store_predictions=True)
+    dml_int.sensitivity_analysis()
+    dml_ext = copy.deepcopy(dml_int)
+    df_bm = dml_int.sensitivity_benchmark(benchmarking_set=benchmarking_set)
+
+    np.random.seed(3141)
+    dml_short = copy.deepcopy(dml_ext)
+    dml_data_short = DoubleMLData.from_arrays(x=x, y=y, d=d)
+    dml_data_short.x_cols = [x for x in x_list_long if x not in benchmarking_set]
+    dml_short = DoubleMLIRM(dml_data_short, ml_m=classifier_class(random_state=random_state), ml_g=regressor_class(random_state=random_state), n_folds=2)
+    dml_short.fit(store_predictions=True)
+    fit_args = {"external_predictions": {"d": {"ml_m": dml_short.predictions["ml_m"][:, :, 0],
+                                               "ml_g0": dml_short.predictions["ml_g0"][:, :, 0],
+                                               "ml_g1": dml_short.predictions["ml_g1"][:, :, 0],}},}
+    dml_ext.sensitivity_analysis()
+    df_bm_ext = dml_ext.sensitivity_benchmark(benchmarking_set=benchmarking_set,fit_args=fit_args)
+
+    res_dict = {"default_benchmark": df_bm.loc["d","delta_theta"],
+                "external_benchmark": df_bm_ext.loc["d","delta_theta"]}
+
+    return res_dict
+
+@pytest.mark.ci
+def test_dml_sensitivity_external_predictions(test_dml_benchmark_fixture):
+    assert math.isclose(test_dml_benchmark_fixture["default_benchmark"], test_dml_benchmark_fixture["external_benchmark"], rel_tol=1e-9, abs_tol=1e-4)
From d68143eb2c028a0250c67c325aa7e52e86ad8ca4 Mon Sep 17 00:00:00 2001
From: Kin Ho Lucien Lo
Date: Tue, 9 Apr 2024 16:31:07 +0200
Subject: [PATCH 5/9] Remove trailing line in test_sensitivity.py

---
 doubleml/tests/test_sensitivity.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py
index 8bb73936..e67ab6ae 100644
--- a/doubleml/tests/test_sensitivity.py
+++ b/doubleml/tests/test_sensitivity.py
@@ -110,8 +110,7 @@ def test_dml_sensitivity_benchmark(dml_sensitivity_multitreat_fixture):
     assert dml_sensitivity_multitreat_fixture['benchmark'].equals(dml_sensitivity_multitreat_fixture['benchmark_manual'])
 
 @pytest.fixture(scope="module")
-def test_dml_benchmark_fixture(benchmarking_set,n_rep):
-
+def test_dml_benchmark_fixture(benchmarking_set,n_rep): 
     random_state = 42
 
     x, y, d = make_irm_data(n_obs=10, dim_x=5, theta=0.5, return_type="np.array")
From 96f5d4c989c759b778f1de5886d2a89910f2a7dd Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 11 Apr 2024 14:49:36 +0200
Subject: [PATCH 6/9] remove trailing whitespaces from test_sensitivity

---
 doubleml/tests/test_sensitivity.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py
index e67ab6ae..6424158d 100644
--- a/doubleml/tests/test_sensitivity.py
+++ b/doubleml/tests/test_sensitivity.py
@@ -110,9 +110,8 @@ def test_dml_sensitivity_benchmark(dml_sensitivity_multitreat_fixture):
     assert dml_sensitivity_multitreat_fixture['benchmark'].equals(dml_sensitivity_multitreat_fixture['benchmark_manual'])
 
 @pytest.fixture(scope="module")
-def test_dml_benchmark_fixture(benchmarking_set,n_rep): 
+def test_dml_benchmark_fixture(benchmarking_set,n_rep):
     random_state = 42
-
     x, y, d = make_irm_data(n_obs=10, dim_x=5, theta=0.5, return_type="np.array")
 
     classifier_class = RandomForestClassifier

From 3160b7532c60f811abfb4305f6e5dba4268eb25b Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 11 Apr 2024 16:36:25 +0200
Subject: [PATCH 7/9] format files

---
 doubleml/tests/test_exceptions_ext_preds.py |  1 +
 doubleml/tests/test_sensitivity.py          | 37 +++++++++++++++------
 2 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/doubleml/tests/test_exceptions_ext_preds.py b/doubleml/tests/test_exceptions_ext_preds.py
index d72cd45b..4a61361d 100644
--- a/doubleml/tests/test_exceptions_ext_preds.py
+++ b/doubleml/tests/test_exceptions_ext_preds.py
@@ -24,6 +24,7 @@ def test_qte_external_prediction_exception():
         qte = DoubleMLQTE(DoubleMLData(df_irm, "y", "d"), DMLDummyClassifier(), DMLDummyClassifier())
         qte.fit(external_predictions=ext_predictions)
 
+
 @pytest.mark.ci
 def test_sensitivity_benchmark_external_prediction_exception():
     msg = "fit_args must be a dict. "
diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py
index 6424158d..c994eda2 100644
--- a/doubleml/tests/test_sensitivity.py
+++ b/doubleml/tests/test_sensitivity.py
@@ -12,7 +12,8 @@
 from ._utils_doubleml_sensitivity_manual import doubleml_sensitivity_manual, \
     doubleml_sensitivity_benchmark_manual
 
-@pytest.fixture(scope="module", params=[["X1"],["X2"],["X3"]])
+
+@pytest.fixture(scope="module", params=[["X1"], ["X2"], ["X3"]])
 def benchmarking_set(request):
     return request.param
 
@@ -109,40 +110,54 @@ def test_dml_sensitivity_benchmark(dml_sensitivity_multitreat_fixture):
                dml_sensitivity_multitreat_fixture['d_cols'])
     assert dml_sensitivity_multitreat_fixture['benchmark'].equals(dml_sensitivity_multitreat_fixture['benchmark_manual'])
 
+
 @pytest.fixture(scope="module")
-def test_dml_benchmark_fixture(benchmarking_set,n_rep):
+def test_dml_benchmark_fixture(benchmarking_set, n_rep):
     random_state = 42
     x, y, d = make_irm_data(n_obs=10, dim_x=5, theta=0.5, return_type="np.array")
 
     classifier_class = RandomForestClassifier
     regressor_class = RandomForestRegressor
-
+
     np.random.seed(3141)
     dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d)
     x_list_long = copy.deepcopy(dml_data.x_cols)
-    dml_int = DoubleMLIRM(dml_data, ml_m=classifier_class(random_state=random_state), ml_g=regressor_class(random_state=random_state), n_folds=2)
+    dml_int = DoubleMLIRM(dml_data,
+                          ml_m=classifier_class(random_state=random_state),
+                          ml_g=regressor_class(random_state=random_state),
+                          n_folds=2)
     dml_int.fit(store_predictions=True)
     dml_int.sensitivity_analysis()
     dml_ext = copy.deepcopy(dml_int)
     df_bm = dml_int.sensitivity_benchmark(benchmarking_set=benchmarking_set)
-
+
    np.random.seed(3141)
     dml_short = copy.deepcopy(dml_ext)
     dml_data_short = DoubleMLData.from_arrays(x=x, y=y, d=d)
     dml_data_short.x_cols = [x for x in x_list_long if x not in benchmarking_set]
-    dml_short = DoubleMLIRM(dml_data_short, ml_m=classifier_class(random_state=random_state), ml_g=regressor_class(random_state=random_state), n_folds=2)
+    dml_short = DoubleMLIRM(dml_data_short,
+                            ml_m=classifier_class(random_state=random_state),
+                            ml_g=regressor_class(random_state=random_state),
+                            n_folds=2)
     dml_short.fit(store_predictions=True)
     fit_args = {"external_predictions": {"d": {"ml_m": dml_short.predictions["ml_m"][:, :, 0],
                                                "ml_g0": dml_short.predictions["ml_g0"][:, :, 0],
-                                               "ml_g1": dml_short.predictions["ml_g1"][:, :, 0],}},}
+                                               "ml_g1": dml_short.predictions["ml_g1"][:, :, 0],
+                                               }
+                                         },
+                }
     dml_ext.sensitivity_analysis()
-    df_bm_ext = dml_ext.sensitivity_benchmark(benchmarking_set=benchmarking_set,fit_args=fit_args)
+    df_bm_ext = dml_ext.sensitivity_benchmark(benchmarking_set=benchmarking_set, fit_args=fit_args)
 
-    res_dict = {"default_benchmark": df_bm.loc["d","delta_theta"],
-                "external_benchmark": df_bm_ext.loc["d","delta_theta"]}
+    res_dict = {"default_benchmark": df_bm.loc["d", "delta_theta"],
+                "external_benchmark": df_bm_ext.loc["d", "delta_theta"]}
 
     return res_dict
 
+
 @pytest.mark.ci
 def test_dml_sensitivity_external_predictions(test_dml_benchmark_fixture):
-    assert math.isclose(test_dml_benchmark_fixture["default_benchmark"], test_dml_benchmark_fixture["external_benchmark"], rel_tol=1e-9, abs_tol=1e-4)
+    assert math.isclose(test_dml_benchmark_fixture["default_benchmark"],
+                        test_dml_benchmark_fixture["external_benchmark"],
+                        rel_tol=1e-9,
+                        abs_tol=1e-4)

From c0cbf41a6a1e339dd9d29c189853f3e7dee7b829 Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 11 Apr 2024 16:40:45 +0200
Subject: [PATCH 8/9] remove additional dml_short definition

---
 doubleml/tests/test_sensitivity.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py
index c994eda2..d4a379f4 100644
--- a/doubleml/tests/test_sensitivity.py
+++ b/doubleml/tests/test_sensitivity.py
@@ -132,7 +132,6 @@ def test_dml_benchmark_fixture(benchmarking_set, n_rep):
     df_bm = dml_int.sensitivity_benchmark(benchmarking_set=benchmarking_set)
 
     np.random.seed(3141)
-    dml_short = copy.deepcopy(dml_ext)
     dml_data_short = DoubleMLData.from_arrays(x=x, y=y, d=d)
     dml_data_short.x_cols = [x for x in x_list_long if x not in benchmarking_set]
     dml_short = DoubleMLIRM(dml_data_short,

From d9807a85ebf46c65537587c78b0cf6a016401c9c Mon Sep 17 00:00:00 2001
From: Sven Klaassen <47529404+SvenKlaassen@users.noreply.github.com>
Date: Thu, 11 Apr 2024 16:56:15 +0200
Subject: [PATCH 9/9] extend external predictions benchmarking to multiple
 repetitions

---
 doubleml/tests/test_sensitivity.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/doubleml/tests/test_sensitivity.py b/doubleml/tests/test_sensitivity.py
index d4a379f4..9c9ca9f3 100644
--- a/doubleml/tests/test_sensitivity.py
+++ b/doubleml/tests/test_sensitivity.py
@@ -1,13 +1,11 @@
 import pytest
-import math
 import numpy as np
 import copy
 
 import doubleml as dml
 from doubleml import DoubleMLIRM, DoubleMLData
 from doubleml.datasets import make_irm_data
-from sklearn.linear_model import LinearRegression
-from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from sklearn.linear_model import LinearRegression, LogisticRegression
 
 from ._utils_doubleml_sensitivity_manual import doubleml_sensitivity_manual, \
     doubleml_sensitivity_benchmark_manual
@@ -114,18 +112,19 @@ def test_dml_sensitivity_benchmark(dml_sensitivity_multitreat_fixture):
 @pytest.fixture(scope="module")
 def test_dml_benchmark_fixture(benchmarking_set, n_rep):
     random_state = 42
-    x, y, d = make_irm_data(n_obs=10, dim_x=5, theta=0.5, return_type="np.array")
+    x, y, d = make_irm_data(n_obs=50, dim_x=5, theta=0, return_type="np.array")
 
-    classifier_class = RandomForestClassifier
-    regressor_class = RandomForestRegressor
+    classifier_class = LogisticRegression
+    regressor_class = LinearRegression
 
     np.random.seed(3141)
     dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d)
     x_list_long = copy.deepcopy(dml_data.x_cols)
     dml_int = DoubleMLIRM(dml_data,
                           ml_m=classifier_class(random_state=random_state),
-                          ml_g=regressor_class(random_state=random_state),
-                          n_folds=2)
+                          ml_g=regressor_class(),
+                          n_folds=2,
+                          n_rep=n_rep)
     dml_int.fit(store_predictions=True)
     dml_int.sensitivity_analysis()
     dml_ext = copy.deepcopy(dml_int)
@@ -136,8 +135,9 @@ def test_dml_benchmark_fixture(benchmarking_set, n_rep):
     dml_data_short.x_cols = [x for x in x_list_long if x not in benchmarking_set]
     dml_short = DoubleMLIRM(dml_data_short,
                             ml_m=classifier_class(random_state=random_state),
-                            ml_g=regressor_class(random_state=random_state),
-                            n_folds=2)
+                            ml_g=regressor_class(),
+                            n_folds=2,
+                            n_rep=n_rep)
     dml_short.fit(store_predictions=True)
     fit_args = {"external_predictions": {"d": {"ml_m": dml_short.predictions["ml_m"][:, :, 0],
                                                "ml_g0": dml_short.predictions["ml_g0"][:, :, 0],
@@ -148,15 +148,15 @@ def test_dml_benchmark_fixture(benchmarking_set, n_rep):
     dml_ext.sensitivity_analysis()
     df_bm_ext = dml_ext.sensitivity_benchmark(benchmarking_set=benchmarking_set, fit_args=fit_args)
 
-    res_dict = {"default_benchmark": df_bm.loc["d", "delta_theta"],
-                "external_benchmark": df_bm_ext.loc["d", "delta_theta"]}
+    res_dict = {"default_benchmark": df_bm,
+                "external_benchmark": df_bm_ext}
 
     return res_dict
 
 
 @pytest.mark.ci
 def test_dml_sensitivity_external_predictions(test_dml_benchmark_fixture):
-    assert math.isclose(test_dml_benchmark_fixture["default_benchmark"],
-                        test_dml_benchmark_fixture["external_benchmark"],
-                        rel_tol=1e-9,
-                        abs_tol=1e-4)
+    assert np.allclose(test_dml_benchmark_fixture["default_benchmark"],
+                       test_dml_benchmark_fixture["external_benchmark"],
+                       rtol=1e-9,
+                       atol=1e-4)
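
For reviewers: a minimal end-to-end sketch of the workflow this series enables, namely benchmarking against a short model whose nuisance predictions are supplied through the new fit_args parameter of sensitivity_benchmark. The sketch is illustrative and not part of any patch; the learners, the sample size, and the benchmarked feature "X1" are assumptions, while the API calls mirror the tests above.

# Illustrative sketch, not part of the patch series. Learners, sample size,
# and the benchmarked feature "X1" are assumptions; calls mirror the tests.
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

from doubleml import DoubleMLIRM, DoubleMLData
from doubleml.datasets import make_irm_data

np.random.seed(3141)
x, y, d = make_irm_data(n_obs=500, dim_x=5, theta=0.5, return_type="np.array")
dml_data = DoubleMLData.from_arrays(x=x, y=y, d=d)

# Long model: fit with stored predictions and run the sensitivity analysis.
dml_long = DoubleMLIRM(dml_data, ml_g=LinearRegression(),
                       ml_m=LogisticRegression(), n_folds=2)
dml_long.fit(store_predictions=True)
dml_long.sensitivity_analysis()

# Short model: drop the benchmarked feature and fit it externally.
# Re-seeding aligns the fold split with the long model, as in the tests.
np.random.seed(3141)
dml_data_short = DoubleMLData.from_arrays(x=x, y=y, d=d)
dml_data_short.x_cols = [c for c in dml_data.x_cols if c != "X1"]
dml_short = DoubleMLIRM(dml_data_short, ml_g=LinearRegression(),
                        ml_m=LogisticRegression(), n_folds=2)
dml_short.fit(store_predictions=True)

# Hand the short model's cross-fitted predictions to sensitivity_benchmark
# via fit_args, so the internal refit reuses them instead of re-estimating.
fit_args = {"external_predictions": {"d": {"ml_m": dml_short.predictions["ml_m"][:, :, 0],
                                           "ml_g0": dml_short.predictions["ml_g0"][:, :, 0],
                                           "ml_g1": dml_short.predictions["ml_g1"][:, :, 0]}}}
df_bm = dml_long.sensitivity_benchmark(benchmarking_set=["X1"], fit_args=fit_args)
print(df_bm)

This is the behaviour test_dml_sensitivity_external_predictions pins down: the benchmark computed from externally supplied predictions should agree with the default internal refit up to numerical tolerance.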