DoubleML
diff --git a/‎doubleml/double_ml.py
Lines changed: 6 additions & 42 deletions b/‎doubleml/double_ml.py
Lines changed: 6 additions & 42 deletions
diff --git a/‎doubleml/irm/irm.py
Lines changed: 42 additions & 32 deletions b/‎doubleml/irm/irm.py
Lines changed: 42 additions & 32 deletions
diff --git a/‎doubleml/irm/tests/_utils_irm_manual.py
Lines changed: 5 additions & 1 deletion b/‎doubleml/irm/tests/_utils_irm_manual.py
Lines changed: 5 additions & 1 deletion
diff --git a/‎doubleml/irm/tests/test_irm.py
Lines changed: 1 addition & 1 deletion b/‎doubleml/irm/tests/test_irm.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎doubleml/tests/_utils_dml_cv_predict.py
Lines changed: 27 additions & 11 deletions b/‎doubleml/tests/_utils_dml_cv_predict.py
Lines changed: 27 additions & 11 deletions
diff --git a/‎doubleml/tests/test_exceptions.py
Lines changed: 10 additions & 4 deletions b/‎doubleml/tests/test_exceptions.py
Lines changed: 10 additions & 4 deletions
@@ -19,7 +19,7 @@
 from .utils._checks import _check_in_zero_one, _check_integer, _check_float, _check_bool, _check_is_partition, \
     _check_all_smpls, _check_smpl_split, _check_smpl_split_tpl, _check_benchmarks, _check_external_predictions
 from .utils._plots import _sensitivity_contour_plot
-
+from .utils.gain_statistics import gain_statistics
 
 _implemented_data_backends = ['DoubleMLData', 'DoubleMLClusterData']
 
@@ -272,7 +272,8 @@ def params_names(self):
     @property
     def predictions(self):
         """
-        The predictions of the nuisance models.
+        The predictions of the nuisance models in form of a dictionary.
+        Each key refers to a nuisance element with an array of values of shape ``(n_obs, n_rep, n_coefs)``.
         """
         return self._predictions
 
@@ -354,6 +355,7 @@ def psi(self):
         Values of the score function after calling :meth:`fit`;
         For models (e.g., PLR, IRM, PLIV, IIVM) with linear score (in the parameter)
         :math:`\\psi(W; \\theta, \\eta) = \\psi_a(W; \\eta) \\theta + \\psi_b(W; \\eta)`.
+        The shape is ``(n_obs, n_rep, n_coefs)``.
         """
         return self._psi
 
@@ -364,6 +366,7 @@ def psi_deriv(self):
         after calling :meth:`fit`;
         For models (e.g., PLR, IRM, PLIV, IIVM) with linear score (in the parameter)
         :math:`\\psi_a(W; \\eta)`.
+        The shape is ``(n_obs, n_rep, n_coefs)``.
         """
         return self._psi_deriv
 
@@ -1966,45 +1969,6 @@ def sensitivity_benchmark(self, benchmarking_set):
         dml_short._dml_data.x_cols = x_list_short
         dml_short.fit()
 
-        # save elements for readability
-        var_y = np.var(self._dml_data.y)
-        var_y_residuals_long = np.squeeze(self.sensitivity_elements['sigma2'], axis=0)
-        nu2_long = np.squeeze(self.sensitivity_elements['nu2'], axis=0)
-        var_y_residuals_short = np.squeeze(dml_short.sensitivity_elements['sigma2'], axis=0)
-        nu2_short = np.squeeze(dml_short.sensitivity_elements['nu2'], axis=0)
-
-        # compute nonparametric R2
-        R2_y_long = 1.0 - np.divide(var_y_residuals_long, var_y)
-        R2_y_short = 1.0 - np.divide(var_y_residuals_short, var_y)
-        R2_riesz = np.divide(nu2_short, nu2_long)
-
-        # Gain statistics
-        all_cf_y_benchmark = np.clip(np.divide((R2_y_long - R2_y_short), (1.0 - R2_y_long)), 0, 1)
-        all_cf_d_benchmark = np.clip(np.divide((1.0 - R2_riesz), R2_riesz), 0, 1)
-        cf_y_benchmark = np.median(all_cf_y_benchmark, axis=0)
-        cf_d_benchmark = np.median(all_cf_d_benchmark, axis=0)
-
-        # change in estimates (slightly different to paper)
-        all_delta_theta = np.transpose(dml_short.all_coef - self.all_coef)
-        delta_theta = np.median(all_delta_theta, axis=0)
-
-        # degree of adversity
-        var_g = var_y_residuals_short - var_y_residuals_long
-        var_riesz = nu2_long - nu2_short
-        denom = np.sqrt(np.multiply(var_g, var_riesz), out=np.zeros_like(var_g), where=(var_g > 0) & (var_riesz > 0))
-        rho_sign = np.sign(all_delta_theta)
-        rho_values = np.clip(np.divide(np.absolute(all_delta_theta),
-                                       denom,
-                                       out=np.ones_like(all_delta_theta),
-                                       where=denom != 0),
-                             0.0, 1.0)
-        all_rho_benchmark = np.multiply(rho_values, rho_sign)
-        rho_benchmark = np.median(all_rho_benchmark, axis=0)
-        benchmark_dict = {
-            "cf_y": cf_y_benchmark,
-            "cf_d": cf_d_benchmark,
-            "rho": rho_benchmark,
-            "delta_theta": delta_theta,
-        }
+        benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short)
         df_benchmark = pd.DataFrame(benchmark_dict, index=self._dml_data.d_cols)
         return df_benchmark
@@ -221,12 +221,26 @@ def _initialize_weights(self, weights):
             assert isinstance(weights, dict)
             self._weights = weights
 
-    def _get_weights(self):
-        weights = self._weights['weights']
-        if 'weights_bar' not in self._weights.keys():
-            weights_bar = self._weights['weights']
+    def _get_weights(self, m_hat=None):
+        # standard case for ATE
+        if self.score == 'ATE':
+            weights = self._weights['weights']
+            if 'weights_bar' not in self._weights.keys():
+                weights_bar = self._weights['weights']
+            else:
+                weights_bar = self._weights['weights_bar'][:, self._i_rep]
         else:
-            weights_bar = self._weights['weights_bar'][:, self._i_rep]
+            # special case for ATTE
+            assert self.score == 'ATTE'
+            assert m_hat is not None
+            subgroup = self._weights['weights'] * self._dml_data.d
+            subgroup_probability = np.mean(subgroup)
+            weights = np.divide(subgroup, subgroup_probability)
+
+            weights_bar = np.divide(
+                np.multiply(m_hat, self._weights['weights']),
+                subgroup_probability)
+
         return weights, weights_bar
 
     def _check_data(self, obj_dml_data):
@@ -280,8 +294,13 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
                                      f'predictions obtained with the ml_g learner {str(self._learner["ml_g"])} are also '
                                      'observed to be binary with values 0 and 1. Make sure that for classifiers '
                                      'probabilities and not labels are predicted.')
+        if self.score == 'ATTE':
+            # skip g_hat1 estimation
+            g_hat1 = {'preds': None,
+                      'targets': None,
+                      'models': None}
 
-        if g1_external:
+        elif g1_external:
             # use external predictions
             g_hat1 = {'preds': external_predictions['ml_g1'],
                       'targets': None,
@@ -294,7 +313,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
             # adjust target values to consider only compatible subsamples
             g_hat1['targets'] = _cond_targets(g_hat1['targets'], cond_sample=(d == 1))
 
-        if self._dml_data.binary_outcome:
+        if self._dml_data.binary_outcome & (self.score != 'ATTE'):
             binary_preds = (type_of_target(g_hat1['preds']) == 'binary')
             zero_one_preds = np.all((np.power(g_hat1['preds'], 2) - g_hat1['preds']) == 0)
             if binary_preds & zero_one_preds:
@@ -338,11 +357,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa
 
     def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, smpls):
 
-        # fraction of treated for ATTE
-        p_hat = None
-        if self.score == 'ATTE':
-            p_hat = np.mean(d)
-
         m_hat_adj = np.full_like(m_hat, np.nan, dtype='float64')
         if self.normalize_ipw:
             if self.dml_procedure == 'dml1':
@@ -355,24 +369,21 @@ def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, smpls):
 
         # compute residuals
         u_hat0 = y - g_hat0
-        u_hat1 = None
-        if self.score == 'ATE':
-            u_hat1 = y - g_hat1
-
-        if isinstance(self.score, str):
+        if self.score == 'ATTE':
+            g_hat1 = y
+        u_hat1 = y - g_hat1
+
+        if (self.score == 'ATE') or (self.score == 'ATTE'):
+            weights, weights_bar = self._get_weights(m_hat=m_hat_adj)
+            psi_b = weights * (g_hat1 - g_hat0) \
+                + weights_bar * (
+                    np.divide(np.multiply(d, u_hat1), m_hat_adj)
+                    - np.divide(np.multiply(1.0-d, u_hat0), 1.0 - m_hat_adj))
             if self.score == 'ATE':
-                weights, weights_bar = self._get_weights()
-                psi_b = weights * (g_hat1 - g_hat0) \
-                    + weights_bar * (
-                        np.divide(np.multiply(d, u_hat1), m_hat_adj)
-                        - np.divide(np.multiply(1.0-d, u_hat0), 1.0 - m_hat_adj))
                 psi_a = np.full_like(m_hat_adj, -1.0)
             else:
                 assert self.score == 'ATTE'
-                psi_b = np.divide(np.multiply(d, u_hat0), p_hat) \
-                    - np.divide(np.multiply(m_hat_adj, np.multiply(1.0-d, u_hat0)),
-                                np.multiply(p_hat, (1.0 - m_hat_adj)))
-                psi_a = - np.divide(d, p_hat)
+                psi_a = -1.0 * weights
         else:
             assert callable(self.score)
             psi_a, psi_b = self.score(y=y, d=d,
@@ -388,15 +399,14 @@ def _sensitivity_element_est(self, preds):
 
         m_hat = preds['predictions']['ml_m']
         g_hat0 = preds['predictions']['ml_g0']
-        g_hat1 = preds['predictions']['ml_g1']
-
-        # use weights make this extendable
         if self.score == 'ATE':
-            weights, weights_bar = self._get_weights()
+            g_hat1 = preds['predictions']['ml_g1']
         else:
             assert self.score == 'ATTE'
-            weights = np.divide(d, np.mean(d))
-            weights_bar = np.divide(m_hat, np.mean(d))
+            g_hat1 = y
+
+        # use weights to make this extendable
+        weights, weights_bar = self._get_weights(m_hat=m_hat)
 
         sigma2_score_element = np.square(y - np.multiply(d, g_hat1) - np.multiply(1.0-d, g_hat0))
         sigma2 = np.mean(sigma2_score_element)
 
@@ -298,7 +298,11 @@ def fit_sensitivity_elements_irm(y, d, all_coef, predictions, score, n_rep):
 
         m_hat = predictions['ml_m'][:, i_rep, 0]
         g_hat0 = predictions['ml_g0'][:, i_rep, 0]
-        g_hat1 = predictions['ml_g1'][:, i_rep, 0]
+        if score == 'ATE':
+            g_hat1 = predictions['ml_g1'][:, i_rep, 0]
+        else:
+            assert score == 'ATTE'
+            g_hat1 = y
 
         if score == 'ATE':
             weights = np.ones_like(d)
 
@@ -278,7 +278,7 @@ def dml_irm_weights_fixture(n_rep, dml_procedure):
 
     # First stage estimation
     ml_g = LinearRegression()
-    ml_m = LogisticRegression(penalty='none', random_state=42)
+    ml_m = LogisticRegression(penalty='l2', random_state=42)
 
     # ATE with and without weights
     dml_irm_obj_ate_no_weights = dml.DoubleMLIRM(
 
@@ -8,6 +8,21 @@
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection._validation import _fit_and_predict, _check_is_permutation
 
+# Adapt _fit_and_predict for earlier sklearn versions, which take an extra ``verbose``
+# argument. The signature is inspected instead of comparing version strings, because
+# ``distutils`` (LooseVersion) is deprecated (PEP 632) and removed in Python 3.12.
+from inspect import signature
+
+_FIT_AND_PREDICT_HAS_VERBOSE = 'verbose' in signature(_fit_and_predict).parameters
+
+
+def _fit_and_predict_adapted(estimator, x, y, train, test, fit_params, method):
+    """Call sklearn's ``_fit_and_predict``, passing ``verbose=0`` only if it is accepted."""
+    if _FIT_AND_PREDICT_HAS_VERBOSE:
+        return _fit_and_predict(estimator, x, y, train, test,
+                                verbose=0, fit_params=fit_params, method=method)
+    return _fit_and_predict(estimator, x, y, train, test, fit_params, method)
+
 
 def _dml_cv_predict_ut_version(estimator, x, y, smpls=None,
                                n_jobs=None, est_params=None, method='predict'):
@@ -22,18 +37,19 @@ def _dml_cv_predict_ut_version(estimator, x, y, smpls=None,
         train_index, test_index = smpls[0]
         # set some defaults aligned with cross_val_predict
         fit_params = None
-        verbose = 0
         if method == 'predict_proba':
             predictions = np.full((len(y), 2), np.nan)
         else:
             predictions = np.full(len(y), np.nan)
         if est_params is None:
-            xx = _fit_and_predict(clone(estimator),
-                                  x, y, train_index, test_index, verbose, fit_params, method)
+            xx = _fit_and_predict_adapted(
+                clone(estimator),
+                x, y, train_index, test_index, fit_params, method)
         else:
             assert isinstance(est_params, dict)
-            xx = _fit_and_predict(clone(estimator).set_params(**est_params),
-                                  x, y, train_index, test_index, verbose, fit_params, method)
+            xx = _fit_and_predict_adapted(
+                clone(estimator).set_params(**est_params),
+                x, y, train_index, test_index, fit_params, method)
 
         # implementation is (also at other parts) restricted to a sorted set of test_indices, but this could be fixed
         # inv_test_indices = np.argsort(test_indices)
@@ -61,22 +77,22 @@ def _dml_cv_predict_ut_version(estimator, x, y, smpls=None,
                         pre_dispatch=pre_dispatch)
     # FixMe: Find a better way to handle the different combinations of paramters and smpls_is_partition
     if est_params is None:
-        prediction_blocks = parallel(delayed(_fit_and_predict)(
+        prediction_blocks = parallel(delayed(_fit_and_predict_adapted)(
             estimator,
-            x, y, train_index, test_index, verbose, fit_params, method)
+            x, y, train_index, test_index, fit_params, method)
                                      for idx, (train_index, test_index) in enumerate(smpls))
     elif isinstance(est_params, dict):
         # if no fold-specific parameters we redirect to the standard method
         # warnings.warn("Using the same (hyper-)parameters for all folds")
-        prediction_blocks = parallel(delayed(_fit_and_predict)(
+        prediction_blocks = parallel(delayed(_fit_and_predict_adapted)(
             clone(estimator).set_params(**est_params),
-            x, y, train_index, test_index, verbose, fit_params, method)
+            x, y, train_index, test_index, fit_params, method)
                                      for idx, (train_index, test_index) in enumerate(smpls))
     else:
         assert len(est_params) == len(smpls), 'provide one parameter setting per fold'
-        prediction_blocks = parallel(delayed(_fit_and_predict)(
+        prediction_blocks = parallel(delayed(_fit_and_predict_adapted)(
             clone(estimator).set_params(**est_params[idx]),
-            x, y, train_index, test_index, verbose, fit_params, method)
+            x, y, train_index, test_index, fit_params, method)
             for idx, (train_index, test_index) in enumerate(smpls))
 
     # Concatenate the predictions
 
@@ -428,16 +428,17 @@ def test_doubleml_exception_trimming_rule():
 
 @pytest.mark.ci
 def test_doubleml_exception_weights():
-    msg = "weights can only be set for score type 'ATE'. ATTE was passed."
-    with pytest.raises(NotImplementedError, match=msg):
-        _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(),
-                        score='ATTE', weights=np.ones_like(dml_data_irm.d))
+
     msg = "weights must be a numpy array or dictionary. weights of type <class 'int'> was passed."
     with pytest.raises(TypeError, match=msg):
         _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), weights=1)
     msg = r"weights must have keys \['weights', 'weights_bar'\]. keys dict_keys\(\['d'\]\) were passed."
     with pytest.raises(ValueError, match=msg):
         _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(), weights={'d': [1, 2, 3]})
+    msg = "weights must be a numpy array for ATTE score. weights of type <class 'dict'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(),
+                        score='ATTE', weights={'weights': np.ones_like(dml_data_irm.d)})
 
     # shape checks
     msg = rf"weights must have shape \({n},\). weights of shape \(1,\) was passed."
@@ -485,6 +486,11 @@ def test_doubleml_exception_weights():
                         weights={'weights': np.ones((dml_data_irm.d.shape[0], )),
                                  'weights_bar': np.zeros((dml_data_irm.d.shape[0], 1))})
 
+    msg = "weights must be binary for ATTE score."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLIRM(dml_data_irm, Lasso(), LogisticRegression(),
+                        score='ATTE', weights=np.random.choice([0, 0.2], dml_data_irm.d.shape[0]))
+
 
 @pytest.mark.ci
 def test_doubleml_exception_quantiles():