DoubleML
diff --git a/‎.coverage
68 KB b/‎.coverage
68 KB
diff --git a/‎doubleml/tests/_utils_lpq_manual.py
Lines changed: 7 additions & 6 deletions b/‎doubleml/tests/_utils_lpq_manual.py
Lines changed: 7 additions & 6 deletions
diff --git a/‎doubleml/tests/test_blp.py
Lines changed: 8 additions & 2 deletions b/‎doubleml/tests/test_blp.py
Lines changed: 8 additions & 2 deletions
diff --git a/‎doubleml/tests/test_cvar_tune.py
Lines changed: 0 additions & 3 deletions b/‎doubleml/tests/test_cvar_tune.py
Lines changed: 0 additions & 3 deletions
diff --git a/‎doubleml/tests/test_dml_data.py
Lines changed: 29 additions & 1 deletion b/‎doubleml/tests/test_dml_data.py
Lines changed: 29 additions & 1 deletion
diff --git a/‎doubleml/tests/test_doubleml_exceptions.py
Lines changed: 21 additions & 2 deletions b/‎doubleml/tests/test_doubleml_exceptions.py
Lines changed: 21 additions & 2 deletions
diff --git a/‎doubleml/tests/test_doubleml_return_types.py
Lines changed: 19 additions & 9 deletions b/‎doubleml/tests/test_doubleml_return_types.py
Lines changed: 19 additions & 9 deletions
diff --git a/‎doubleml/tests/test_lpq.py
Lines changed: 61 additions & 25 deletions b/‎doubleml/tests/test_lpq.py
Lines changed: 61 additions & 25 deletions
@@ -11,6 +11,7 @@ def fit_lpq(y, x, d, z, quantile,
             learner_g, learner_m, all_smpls, treatment, dml_procedure, n_rep=1,
             trimming_rule='truncate',
             trimming_threshold=1e-2,
+            kde=_default_kde,
             normalize_ipw=True, m_z_params=None,
             m_d_z0_params=None, m_d_z1_params=None,
             g_du_z0_params=None, g_du_z1_params=None):
@@ -37,10 +38,10 @@ def fit_lpq(y, x, d, z, quantile,
                                                                    g_du_z1_params=g_du_z1_params)
         if dml_procedure == 'dml1':
             lpqs[i_rep], ses[i_rep] = lpq_dml1(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
-                                               treatment, quantile, ipw_vec, coef_bounds, smpls)
+                                               treatment, quantile, ipw_vec, coef_bounds, smpls, kde)
         else:
             lpqs[i_rep], ses[i_rep] = lpq_dml2(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
-                                               treatment, quantile, ipw_vec, coef_bounds)
+                                               treatment, quantile, ipw_vec, coef_bounds, kde)
 
     lpq = np.median(lpqs)
     se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(lpqs - lpq, 2)) / n_obs)
@@ -200,7 +201,7 @@ def ipw_score(theta):
     return m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat, ipw_vec, coef_bounds
 
 
-def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls):
+def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls, kde):
     thetas = np.zeros(len(smpls))
     n_obs = len(y)
     ipw_est = ipw_vec.mean()
@@ -211,17 +212,17 @@ def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw
 
     theta_hat = np.mean(thetas)
 
-    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
+    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
 
     return theta_hat, se
 
 
-def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds):
+def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, kde):
+    # Manual DML2 reference: the point estimate comes from a single lpq_est call on the full
+    # arrays, the standard error is the square root of lpq_var_est; kde is forwarded to lpq_var_est.
     n_obs = len(y)
     ipw_est = ipw_vec.mean()
     theta_hat = lpq_est(m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, ipw_est, coef_bounds)
 
-    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
+    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
 
     return theta_hat, se
 
 
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import copy
 
 import doubleml as dml
 
@@ -26,7 +27,10 @@ def dml_blp_fixture(ci_joint, ci_level):
     random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 3)))
     random_signal = np.random.normal(0, 1, size=(n, ))
 
-    blp = dml.DoubleMLBLP(random_signal, random_basis).fit()
+    blp = dml.DoubleMLBLP(random_signal, random_basis)
+
+    blp_obj = copy.copy(blp)
+    blp.fit()
     blp_manual = fit_blp(random_signal, random_basis)
 
     np.random.seed(42)
@@ -47,7 +51,8 @@ def dml_blp_fixture(ci_joint, ci_level):
                 'ci_1': ci_1,
                 'ci_2': ci_2,
                 'ci_manual': ci_manual,
-                'blp_model': blp}
+                'blp_model': blp,
+                'unfitted_blp_model': blp_obj}
 
     return res_dict
 
@@ -91,6 +96,7 @@ def test_dml_blp_ci_2(dml_blp_fixture):
 def test_dml_blp_return_types(dml_blp_fixture):
     assert isinstance(dml_blp_fixture['blp_model'].__str__(), str)
     assert isinstance(dml_blp_fixture['blp_model'].summary, pd.DataFrame)
+    # the copy taken before fit() must expose summary as a DataFrame as well
+    assert isinstance(dml_blp_fixture['unfitted_blp_model'].summary, pd.DataFrame)
 
 
 @pytest.mark.ci
 
@@ -58,9 +58,6 @@ def tune_on_folds(request):
 def get_par_grid(learner):
+    # Return the tuning grid for `learner`; only random-forest learners have a grid here.
     if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]:
         par_grid = {'n_estimators': [5, 10, 15, 20]}
     else:
-        assert learner.__class__ in [LogisticRegression]
-        par_grid = {'C': np.logspace(-4, 2, 10)}
+        # without an else branch par_grid would be unbound here and the return below
+        # would fail with an UnboundLocalError for every other learner
+        raise ValueError(f'No tuning grid defined for learner {learner!r}.')
     return par_grid
 
 
 
@@ -5,9 +5,27 @@
 from doubleml import DoubleMLData, DoubleMLPLR, DoubleMLClusterData, DoubleMLDIDCS
 from doubleml.datasets import make_plr_CCDDHNR2018, _make_pliv_data, make_pliv_CHS2015,\
     make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020
+from doubleml.double_ml_data import DoubleMLBaseData
+
 from sklearn.linear_model import Lasso, LogisticRegression
 
 
+class DummyDataClass(DoubleMLBaseData):
+    """Subclass of DoubleMLBaseData that only adds an ``n_coefs`` property."""
+
+    def __init__(self, data):
+        # hand the data straight to the base-class initializer
+        super().__init__(data)
+
+    @property
+    def n_coefs(self):
+        """Constant coefficient count reported by this dummy class."""
+        return 1
+
+
+@pytest.mark.ci
+def test_doubleml_basedata():
+    """Check d_cols and n_treat of a DummyDataClass built from an all-zero 100x10 frame."""
+    zero_frame = pd.DataFrame(np.zeros((100, 10)))
+    base_data = DummyDataClass(zero_frame)
+    assert base_data.d_cols[0] == 'theta'
+    assert base_data.n_treat == 1
+
+
 @pytest.fixture(scope="module")
 def dml_data_fixture(generate_data1):
     data = generate_data1
@@ -157,12 +175,22 @@ def test_dml_data_no_instr_no_time():
 
 
 @pytest.mark.ci
-def test_dml_cluster_summary_with_time():
+def test_dml_summary_with_time():
+    # string/summary output of a DIDCS model, then the data summary string of cluster data with t_col
     dml_data_did_cs = make_did_SZ2020(n_obs=200, cross_sectional_data=True)
     dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression())
     assert isinstance(dml_did_cs.__str__(), str)
     assert isinstance(dml_did_cs.summary, pd.DataFrame)
 
+    dml_data = make_plr_CCDDHNR2018(n_obs=100)
+    df = dml_data.data.copy().iloc[:, :11]
+    # relabel the 11 retained columns: X1-X5 covariates, X6/X7 cluster variables, X8 time variable
+    df.columns = [f'X{i + 1}' for i in range(8)] + ['y', 'd1', 'd2']
+    dml_data = DoubleMLClusterData(df, 'y', ['d1', 'd2'],
+                                   cluster_cols=['X6', 'X7'],
+                                   x_cols=[f'X{i + 1}' for i in range(5)],
+                                   t_col='X8')
+    assert isinstance(dml_data._data_summary_str(), str)
+
 
 @pytest.mark.ci
 def test_x_cols_setter_defaults():
 
@@ -178,8 +178,11 @@ def test_doubleml_exception_data():
     df_iivm = dml_data_iivm.data.copy()
     df_iivm['z'] = df_iivm['z'] * 2
     with pytest.raises(ValueError, match=msg):
+        # no instrument Z for LPQ
+        _ = DoubleMLLPQ(DoubleMLData(df_iivm, 'y', 'd', x_cols=['z']),
+                        LogisticRegression(), LogisticRegression(), treatment=1)
+    # separate context manager: a shared block is left at the first raise, so the
+    # non-binary check below would never be executed
+    with pytest.raises(ValueError, match=msg):
         # non-binary Z for LPQ
-        _ = DoubleMLLPQ(DoubleMLData(df_iivm, 'y', 'd', 'z'),
+        _ = DoubleMLLPQ(DoubleMLData(df_iivm, 'y', 'd', z_cols=['z']),
                         LogisticRegression(), LogisticRegression(), treatment=1)
 
     # CVAR with IV
@@ -470,10 +473,12 @@ def test_doubleml_exception_kde():
         _ = DoubleMLPQ(dml_data_irm, ml_g, ml_m, treatment=1, kde="0.1")
     with pytest.raises(TypeError, match=msg):
         _ = DoubleMLLPQ(dml_data_iivm, ml_g, ml_m, treatment=1, kde="0.1")
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLQTE(dml_data_irm, ml_g, ml_m, kde="0.1")
 
 
 @pytest.mark.ci
-def test_doubleml_exception_normalization():
+def test_doubleml_exception_ipw_normalization():
     msg = "Normalization indicator has to be boolean. Object of type <class 'int'> passed."
     with pytest.raises(TypeError, match=msg):
         _ = DoubleMLIRM(dml_data_irm, ml_g, LogisticRegression(), normalize_ipw=1)
@@ -485,6 +490,8 @@ def test_doubleml_exception_normalization():
         _ = DoubleMLQTE(dml_data_irm, ml_g, ml_m, normalize_ipw=1)
     with pytest.raises(TypeError, match=msg):
         _ = DoubleMLLPQ(dml_data_iivm, ml_g, ml_m, treatment=1, normalize_ipw=1)
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLCVAR(dml_data_irm, Lasso(), LogisticRegression(), treatment=1, normalize_ipw=1)
 
     # DID models in_sample_normalization
     msg = "in_sample_normalization indicator has to be boolean. Object of type <class 'int'> passed."
@@ -869,6 +876,18 @@ def test_doubleml_exception_learner():
     with pytest.raises(ValueError, match=msg):
         _ = DoubleMLIIVM(dml_data_iivm, LogisticRegression(), LogisticRegression(), LogisticRegression())
 
+    # classifiers for ml_g are only allowed for binary outcome variables in DID
+    msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier '
+           'but the outcome variable is not binary with values 0 and 1.')
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLDID(dml_data_did, LogisticRegression(), LogisticRegression())
+
+    # classifiers for ml_g are only allowed for binary outcome variables in DIDCS
+    msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier '
+           'but the outcome variable is not binary with values 0 and 1.')
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLDIDCS(dml_data_did_cs, LogisticRegression(), LogisticRegression())
+
     # construct a classifier which is not identifiable as classifier via is_classifier by sklearn
     # it then predicts labels and therefore an exception will be thrown
     log_reg = LogisticRegression()
 
@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 
-from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLClusterData, \
+from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData, \
     DoubleMLCVAR, DoubleMLPQ, DoubleMLLPQ, DoubleMLDID, DoubleMLDIDCS
 from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data,\
     make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020
@@ -12,13 +12,18 @@
 from sklearn.svm import LinearSVR
 
 np.random.seed(3141)
-dml_data_plr = make_plr_CCDDHNR2018(n_obs=200)
-dml_data_pliv = make_pliv_CHS2015(n_obs=200, dim_z=1)
-dml_data_irm = make_irm_data(n_obs=200)
-dml_data_iivm = make_iivm_data(n_obs=200)
+n_obs = 200
+dml_data_plr = make_plr_CCDDHNR2018(n_obs=n_obs)
+dml_data_pliv = make_pliv_CHS2015(n_obs=n_obs, dim_z=1)
+dml_data_irm = make_irm_data(n_obs=n_obs)
+dml_data_iivm = make_iivm_data(n_obs=n_obs)
 dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
-dml_data_did = make_did_SZ2020(n_obs=200)
-dml_data_did_cs = make_did_SZ2020(n_obs=200, cross_sectional_data=True)
+dml_data_did = make_did_SZ2020(n_obs=n_obs)
+dml_data_did_cs = make_did_SZ2020(n_obs=n_obs, cross_sectional_data=True)
+# DID data whose outcome is replaced by a random 0/1 draw; the generated y is not used below
 (x, y, d, t) = make_did_SZ2020(n_obs=n_obs, cross_sectional_data=True, return_type='array')
+binary_outcome = np.random.binomial(n=1, p=0.5, size=n_obs)
+dml_data_did_binary_outcome = DoubleMLData.from_arrays(x, binary_outcome, d)
+dml_data_did_cs_binary_outcome = DoubleMLData.from_arrays(x, binary_outcome, d, t=t)
 
 dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
 dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
@@ -29,7 +34,9 @@
 dml_pq = DoubleMLPQ(dml_data_irm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier())
 dml_lpq = DoubleMLLPQ(dml_data_iivm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier())
 dml_did = DoubleMLDID(dml_data_did, Lasso(), LogisticRegression())
+dml_did_binary_outcome = DoubleMLDID(dml_data_did_binary_outcome, LogisticRegression(), LogisticRegression())
 dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression())
+dml_did_cs_binary_outcome = DoubleMLDIDCS(dml_data_did_cs_binary_outcome, LogisticRegression(), LogisticRegression())
 
 
 @pytest.mark.ci
@@ -43,7 +50,9 @@
                           (dml_pq, DoubleMLPQ),
                           (dml_lpq, DoubleMLLPQ),
                           (dml_did, DoubleMLDID),
-                          (dml_did_cs, DoubleMLDIDCS)])
+                          (dml_did_binary_outcome, DoubleMLDID),
+                          (dml_did_cs, DoubleMLDIDCS),
+                          (dml_did_cs_binary_outcome, DoubleMLDIDCS)])
 def test_return_types(dml_obj, cls):
     # ToDo: A second test case with multiple treatment variables would be helpful
     assert isinstance(dml_obj.__str__(), str)
@@ -130,7 +139,8 @@ def test_return_types(dml_obj, cls):
 
 @pytest.mark.ci
 @pytest.mark.parametrize('dml_obj',
-                         [plr_dml1, pliv_dml1,  irm_dml1,  iivm_dml1, cvar_dml1, pq_dml1, lpq_dml1, did_dml1, did_cs_dml1])
+                         [plr_dml1, pliv_dml1,  irm_dml1,  iivm_dml1, cvar_dml1, pq_dml1, lpq_dml1,
+                          did_dml1, did_cs_dml1])
 def test_property_types_and_shapes(dml_obj):
     # not checked: apply_cross_fitting, dml_procedure, learner, learner_names, params, params_names, score
     # already checked: summary
 
@@ -7,9 +7,18 @@
 from sklearn.base import clone
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier
+from statsmodels.nonparametric.kde import KDEUnivariate
 
 from ._utils import draw_smpls
 from ._utils_lpq_manual import fit_lpq
+from .._utils import _default_kde
+
+
+def custom_kde(u, weights):
+    """Weighted kernel density estimate of ``u`` ('epa' kernel, Silverman bandwidth) evaluated at 0."""
+    estimator = KDEUnivariate(u)
+    # NOTE(review): statsmodels documents weights / non-Gaussian kernels as requiring fft=False
+    estimator.fit(kernel='epa', bw='silverman', weights=weights, fft=False)
+    return estimator.evaluate(0)
 
 
 @pytest.fixture(scope='module',
@@ -19,14 +28,13 @@ def treatment(request):
 
 
 @pytest.fixture(scope='module',
-                params=[0.25, 0.5, 0.75])
+                params=[0.25, 0.75])
 def quantile(request):
+    # quantile level handed to dml_lpq_fixture, one run per parameter
     return request.param
 
 
 @pytest.fixture(scope='module',
-                params=[RandomForestClassifier(max_depth=2, n_estimators=5, random_state=42),
-                        LogisticRegression()])
+                params=[LogisticRegression()])
 def learner(request):
+    # estimator instance that dml_lpq_fixture clones before passing it on
     return request.param
 
@@ -44,14 +52,20 @@ def normalize_ipw(request):
 
 
 @pytest.fixture(scope='module',
-                params=[0.01, 0.05])
+                params=[0.05])
 def trimming_threshold(request):
+    # trimming threshold passed to both DoubleMLLPQ and the manual fit_lpq in dml_lpq_fixture
     return request.param
 
 
+@pytest.fixture(scope='module', params=['default', custom_kde])
+def kde(request):
+    """Module-scoped fixture yielding either the string 'default' or the custom_kde callable."""
+    return request.param
+
+
 @pytest.fixture(scope="module")
 def dml_lpq_fixture(generate_data_local_quantiles, treatment, quantile, learner,
-                    dml_procedure, normalize_ipw, trimming_threshold):
+                    dml_procedure, normalize_ipw, trimming_threshold, kde):
     n_folds = 3
 
     # collect data
@@ -63,26 +77,48 @@ def dml_lpq_fixture(generate_data_local_quantiles, treatment, quantile, learner,
     all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=strata)
 
     np.random.seed(42)
+    # 'default': leave kde out so DoubleMLLPQ falls back to its own default, and give the
+    # manual reference _default_kde; any other value is passed unchanged to both
+    kde_kwargs = {} if kde == 'default' else {'kde': kde}
+    kde_manual = _default_kde if kde == 'default' else kde
+
     dml_lpq_obj = dml.DoubleMLLPQ(obj_dml_data,
                                   clone(learner), clone(learner),
                                   treatment=treatment,
                                   quantile=quantile,
                                   n_folds=n_folds,
                                   n_rep=1,
                                   dml_procedure=dml_procedure,
                                   normalize_ipw=normalize_ipw,
                                   trimming_threshold=trimming_threshold,
-                                  draw_sample_splitting=False)
+                                  draw_sample_splitting=False,
+                                  **kde_kwargs)
 
     # synchronize the sample splitting
     dml_lpq_obj.set_sample_splitting(all_smpls=all_smpls)
     dml_lpq_obj.fit()
 
     np.random.seed(42)
     res_manual = fit_lpq(y, x, d, z, quantile, clone(learner), clone(learner),
                          all_smpls, treatment, dml_procedure,
-                         normalize_ipw=normalize_ipw,
+                         normalize_ipw=normalize_ipw, kde=kde_manual,
                          n_rep=1, trimming_threshold=trimming_threshold)
 
     res_dict = {'coef': dml_lpq_obj.coef,
                 'coef_manual': res_manual['lpq'],