Skip to content

Commit 050fa7c

Browse files
committed
extend unit tests
1 parent a671b61 commit 050fa7c

11 files changed

+156
-58
lines changed

.coverage

68 KB
Binary file not shown.

doubleml/tests/_utils_lpq_manual.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ def fit_lpq(y, x, d, z, quantile,
1111
learner_g, learner_m, all_smpls, treatment, dml_procedure, n_rep=1,
1212
trimming_rule='truncate',
1313
trimming_threshold=1e-2,
14+
kde=_default_kde,
1415
normalize_ipw=True, m_z_params=None,
1516
m_d_z0_params=None, m_d_z1_params=None,
1617
g_du_z0_params=None, g_du_z1_params=None):
@@ -37,10 +38,10 @@ def fit_lpq(y, x, d, z, quantile,
3738
g_du_z1_params=g_du_z1_params)
3839
if dml_procedure == 'dml1':
3940
lpqs[i_rep], ses[i_rep] = lpq_dml1(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
40-
treatment, quantile, ipw_vec, coef_bounds, smpls)
41+
treatment, quantile, ipw_vec, coef_bounds, smpls, kde)
4142
else:
4243
lpqs[i_rep], ses[i_rep] = lpq_dml2(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
43-
treatment, quantile, ipw_vec, coef_bounds)
44+
treatment, quantile, ipw_vec, coef_bounds, kde)
4445

4546
lpq = np.median(lpqs)
4647
se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(lpqs - lpq, 2)) / n_obs)
@@ -200,7 +201,7 @@ def ipw_score(theta):
200201
return m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat, ipw_vec, coef_bounds
201202

202203

203-
def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls):
204+
def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls, kde):
204205
thetas = np.zeros(len(smpls))
205206
n_obs = len(y)
206207
ipw_est = ipw_vec.mean()
@@ -211,17 +212,17 @@ def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw
211212

212213
theta_hat = np.mean(thetas)
213214

214-
se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
215+
se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
215216

216217
return theta_hat, se
217218

218219

219-
def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds):
220+
def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, kde):
220221
n_obs = len(y)
221222
ipw_est = ipw_vec.mean()
222223
theta_hat = lpq_est(m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, ipw_est, coef_bounds)
223224

224-
se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
225+
se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
225226

226227
return theta_hat, se
227228

doubleml/tests/test_blp.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22
import pandas as pd
33
import pytest
4+
import copy
45

56
import doubleml as dml
67

@@ -26,7 +27,10 @@ def dml_blp_fixture(ci_joint, ci_level):
2627
random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 3)))
2728
random_signal = np.random.normal(0, 1, size=(n, ))
2829

29-
blp = dml.DoubleMLBLP(random_signal, random_basis).fit()
30+
blp = dml.DoubleMLBLP(random_signal, random_basis)
31+
32+
blp_obj = copy.copy(blp)
33+
blp.fit()
3034
blp_manual = fit_blp(random_signal, random_basis)
3135

3236
np.random.seed(42)
@@ -47,7 +51,8 @@ def dml_blp_fixture(ci_joint, ci_level):
4751
'ci_1': ci_1,
4852
'ci_2': ci_2,
4953
'ci_manual': ci_manual,
50-
'blp_model': blp}
54+
'blp_model': blp,
55+
'unfitted_blp_model': blp_obj}
5156

5257
return res_dict
5358

@@ -91,6 +96,7 @@ def test_dml_blp_ci_2(dml_blp_fixture):
9196
def test_dml_blp_return_types(dml_blp_fixture):
9297
assert isinstance(dml_blp_fixture['blp_model'].__str__(), str)
9398
assert isinstance(dml_blp_fixture['blp_model'].summary, pd.DataFrame)
99+
assert isinstance(dml_blp_fixture['unfitted_blp_model'].summary, pd.DataFrame)
94100

95101

96102
@pytest.mark.ci

doubleml/tests/test_cvar_tune.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,6 @@ def tune_on_folds(request):
5858
def get_par_grid(learner):
5959
if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]:
6060
par_grid = {'n_estimators': [5, 10, 15, 20]}
61-
else:
62-
assert learner.__class__ in [LogisticRegression]
63-
par_grid = {'C': np.logspace(-4, 2, 10)}
6461
return par_grid
6562

6663

doubleml/tests/test_dml_data.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,27 @@
55
from doubleml import DoubleMLData, DoubleMLPLR, DoubleMLClusterData, DoubleMLDIDCS
66
from doubleml.datasets import make_plr_CCDDHNR2018, _make_pliv_data, make_pliv_CHS2015,\
77
make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020
8+
from doubleml.double_ml_data import DoubleMLBaseData
9+
810
from sklearn.linear_model import Lasso, LogisticRegression
911

1012

13+
class DummyDataClass(DoubleMLBaseData):
14+
def __init__(self, data):
15+
DoubleMLBaseData.__init__(self, data)
16+
17+
@property
18+
def n_coefs(self):
19+
return 1
20+
21+
22+
@pytest.mark.ci
23+
def test_doubleml_basedata():
24+
dummy_dml_data = DummyDataClass(pd.DataFrame(np.zeros((100, 10))))
25+
assert dummy_dml_data.d_cols[0] == 'theta'
26+
assert dummy_dml_data.n_treat == 1
27+
28+
1129
@pytest.fixture(scope="module")
1230
def dml_data_fixture(generate_data1):
1331
data = generate_data1
@@ -157,12 +175,22 @@ def test_dml_data_no_instr_no_time():
157175

158176

159177
@pytest.mark.ci
160-
def test_dml_cluster_summary_with_time():
178+
def test_dml_summary_with_time():
161179
dml_data_did_cs = make_did_SZ2020(n_obs=200, cross_sectional_data=True)
162180
dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression())
163181
assert isinstance(dml_did_cs.__str__(), str)
164182
assert isinstance(dml_did_cs.summary, pd.DataFrame)
165183

184+
dml_data = make_plr_CCDDHNR2018(n_obs=100)
185+
df = dml_data.data.copy().iloc[:, :11]
186+
df.columns = [f'X{i + 1}' for i in np.arange(8)] + ['y', 'd1', 'd2']
187+
print(df)
188+
dml_data = DoubleMLClusterData(df, 'y', ['d1', 'd2'],
189+
cluster_cols=[f'X{i + 1}' for i in [5, 6]],
190+
x_cols=[f'X{i + 1}' for i in np.arange(5)],
191+
t_col='X8')
192+
assert isinstance(dml_data._data_summary_str(), str)
193+
166194

167195
@pytest.mark.ci
168196
def test_x_cols_setter_defaults():

doubleml/tests/test_doubleml_exceptions.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,11 @@ def test_doubleml_exception_data():
178178
df_iivm = dml_data_iivm.data.copy()
179179
df_iivm['z'] = df_iivm['z'] * 2
180180
with pytest.raises(ValueError, match=msg):
181+
# no instrument Z for LPQ
182+
_ = DoubleMLLPQ(DoubleMLData(df_iivm, 'y', 'd', x_cols=['z']),
183+
LogisticRegression(), LogisticRegression(), treatment=1)
181184
# non-binary Z for LPQ
182-
_ = DoubleMLLPQ(DoubleMLData(df_iivm, 'y', 'd', 'z'),
185+
_ = DoubleMLLPQ(DoubleMLData(df_iivm, 'y', 'd', z_cols=['z']),
183186
LogisticRegression(), LogisticRegression(), treatment=1)
184187

185188
# CVAR with IV
@@ -470,10 +473,12 @@ def test_doubleml_exception_kde():
470473
_ = DoubleMLPQ(dml_data_irm, ml_g, ml_m, treatment=1, kde="0.1")
471474
with pytest.raises(TypeError, match=msg):
472475
_ = DoubleMLLPQ(dml_data_iivm, ml_g, ml_m, treatment=1, kde="0.1")
476+
with pytest.raises(TypeError, match=msg):
477+
_ = DoubleMLQTE(dml_data_irm, ml_g, ml_m, kde="0.1")
473478

474479

475480
@pytest.mark.ci
476-
def test_doubleml_exception_normalization():
481+
def test_doubleml_exception_ipw_normalization():
477482
msg = "Normalization indicator has to be boolean. Object of type <class 'int'> passed."
478483
with pytest.raises(TypeError, match=msg):
479484
_ = DoubleMLIRM(dml_data_irm, ml_g, LogisticRegression(), normalize_ipw=1)
@@ -485,6 +490,8 @@ def test_doubleml_exception_normalization():
485490
_ = DoubleMLQTE(dml_data_irm, ml_g, ml_m, normalize_ipw=1)
486491
with pytest.raises(TypeError, match=msg):
487492
_ = DoubleMLLPQ(dml_data_iivm, ml_g, ml_m, treatment=1, normalize_ipw=1)
493+
with pytest.raises(TypeError, match=msg):
494+
_ = DoubleMLCVAR(dml_data_irm, Lasso(), LogisticRegression(), treatment=1, normalize_ipw=1)
488495

489496
# DID models in_sample_normalization
490497
msg = "in_sample_normalization indicator has to be boolean. Object of type <class 'int'> passed."
@@ -869,6 +876,18 @@ def test_doubleml_exception_learner():
869876
with pytest.raises(ValueError, match=msg):
870877
_ = DoubleMLIIVM(dml_data_iivm, LogisticRegression(), LogisticRegression(), LogisticRegression())
871878

879+
# we allow classifiers for ml_g only for binary outcome variables (values 0 and 1) in DID
880+
msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier '
881+
'but the outcome variable is not binary with values 0 and 1.')
882+
with pytest.raises(ValueError, match=msg):
883+
_ = DoubleMLDID(dml_data_did, LogisticRegression(), LogisticRegression())
884+
885+
# we allow classifiers for ml_g only for binary outcome variables (values 0 and 1) in DIDCS
886+
msg = (r'The ml_g learner LogisticRegression\(\) was identified as classifier '
887+
'but the outcome variable is not binary with values 0 and 1.')
888+
with pytest.raises(ValueError, match=msg):
889+
_ = DoubleMLDIDCS(dml_data_did_cs, LogisticRegression(), LogisticRegression())
890+
872891
# construct a classifier which is not identifiable as classifier via is_classifier by sklearn
873892
# it then predicts labels and therefore an exception will be thrown
874893
log_reg = LogisticRegression()

doubleml/tests/test_doubleml_return_types.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pandas as pd
33
import numpy as np
44

5-
from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLClusterData, \
5+
from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData, \
66
DoubleMLCVAR, DoubleMLPQ, DoubleMLLPQ, DoubleMLDID, DoubleMLDIDCS
77
from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data,\
88
make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020
@@ -12,13 +12,18 @@
1212
from sklearn.svm import LinearSVR
1313

1414
np.random.seed(3141)
15-
dml_data_plr = make_plr_CCDDHNR2018(n_obs=200)
16-
dml_data_pliv = make_pliv_CHS2015(n_obs=200, dim_z=1)
17-
dml_data_irm = make_irm_data(n_obs=200)
18-
dml_data_iivm = make_iivm_data(n_obs=200)
15+
n_obs = 200
16+
dml_data_plr = make_plr_CCDDHNR2018(n_obs=n_obs)
17+
dml_data_pliv = make_pliv_CHS2015(n_obs=n_obs, dim_z=1)
18+
dml_data_irm = make_irm_data(n_obs=n_obs)
19+
dml_data_iivm = make_iivm_data(n_obs=n_obs)
1920
dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
20-
dml_data_did = make_did_SZ2020(n_obs=200)
21-
dml_data_did_cs = make_did_SZ2020(n_obs=200, cross_sectional_data=True)
21+
dml_data_did = make_did_SZ2020(n_obs=n_obs)
22+
dml_data_did_cs = make_did_SZ2020(n_obs=n_obs, cross_sectional_data=True)
23+
(x, y, d, t) = make_did_SZ2020(n_obs=n_obs, cross_sectional_data=True, return_type='array')
24+
binary_outcome = np.random.binomial(n=1, p=0.5, size=n_obs)
25+
dml_data_did_binary_outcome = DoubleMLData.from_arrays(x, binary_outcome, d)
26+
dml_data_did_cs_binary_outcome = DoubleMLData.from_arrays(x, binary_outcome, d, t=t)
2227

2328
dml_plr = DoubleMLPLR(dml_data_plr, Lasso(), Lasso())
2429
dml_pliv = DoubleMLPLIV(dml_data_pliv, Lasso(), Lasso(), Lasso())
@@ -29,7 +34,9 @@
2934
dml_pq = DoubleMLPQ(dml_data_irm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier())
3035
dml_lpq = DoubleMLLPQ(dml_data_iivm, ml_g=RandomForestClassifier(), ml_m=RandomForestClassifier())
3136
dml_did = DoubleMLDID(dml_data_did, Lasso(), LogisticRegression())
37+
dml_did_binary_outcome = DoubleMLDID(dml_data_did_binary_outcome, LogisticRegression(), LogisticRegression())
3238
dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression())
39+
dml_did_cs_binary_outcome = DoubleMLDIDCS(dml_data_did_cs_binary_outcome, LogisticRegression(), LogisticRegression())
3340

3441

3542
@pytest.mark.ci
@@ -43,7 +50,9 @@
4350
(dml_pq, DoubleMLPQ),
4451
(dml_lpq, DoubleMLLPQ),
4552
(dml_did, DoubleMLDID),
46-
(dml_did_cs, DoubleMLDIDCS)])
53+
(dml_did_binary_outcome, DoubleMLDID),
54+
(dml_did_cs, DoubleMLDIDCS),
55+
(dml_did_cs_binary_outcome, DoubleMLDIDCS)])
4756
def test_return_types(dml_obj, cls):
4857
# ToDo: A second test case with multiple treatment variables would be helpful
4958
assert isinstance(dml_obj.__str__(), str)
@@ -130,7 +139,8 @@ def test_return_types(dml_obj, cls):
130139

131140
@pytest.mark.ci
132141
@pytest.mark.parametrize('dml_obj',
133-
[plr_dml1, pliv_dml1, irm_dml1, iivm_dml1, cvar_dml1, pq_dml1, lpq_dml1, did_dml1, did_cs_dml1])
142+
[plr_dml1, pliv_dml1, irm_dml1, iivm_dml1, cvar_dml1, pq_dml1, lpq_dml1,
143+
did_dml1, did_cs_dml1])
134144
def test_property_types_and_shapes(dml_obj):
135145
# not checked: apply_cross_fitting, dml_procedure, learner, learner_names, params, params_names, score
136146
# already checked: summary

doubleml/tests/test_lpq.py

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,18 @@
77
from sklearn.base import clone
88
from sklearn.linear_model import LogisticRegression
99
from sklearn.ensemble import RandomForestClassifier
10+
from statsmodels.nonparametric.kde import KDEUnivariate
1011

1112
from ._utils import draw_smpls
1213
from ._utils_lpq_manual import fit_lpq
14+
from .._utils import _default_kde
15+
16+
17+
def custom_kde(u, weights):
18+
dens = KDEUnivariate(u)
19+
dens.fit(kernel='epa', bw='silverman', weights=weights, fft=False)
20+
21+
return dens.evaluate(0)
1322

1423

1524
@pytest.fixture(scope='module',
@@ -19,14 +28,13 @@ def treatment(request):
1928

2029

2130
@pytest.fixture(scope='module',
22-
params=[0.25, 0.5, 0.75])
31+
params=[0.25, 0.75])
2332
def quantile(request):
2433
return request.param
2534

2635

2736
@pytest.fixture(scope='module',
28-
params=[RandomForestClassifier(max_depth=2, n_estimators=5, random_state=42),
29-
LogisticRegression()])
37+
params=[LogisticRegression()])
3038
def learner(request):
3139
return request.param
3240

@@ -44,14 +52,20 @@ def normalize_ipw(request):
4452

4553

4654
@pytest.fixture(scope='module',
47-
params=[0.01, 0.05])
55+
params=[0.05])
4856
def trimming_threshold(request):
4957
return request.param
5058

5159

60+
@pytest.fixture(scope='module',
61+
params=['default', custom_kde])
62+
def kde(request):
63+
return request.param
64+
65+
5266
@pytest.fixture(scope="module")
5367
def dml_lpq_fixture(generate_data_local_quantiles, treatment, quantile, learner,
54-
dml_procedure, normalize_ipw, trimming_threshold):
68+
dml_procedure, normalize_ipw, trimming_threshold, kde):
5569
n_folds = 3
5670

5771
# collect data
@@ -63,26 +77,48 @@ def dml_lpq_fixture(generate_data_local_quantiles, treatment, quantile, learner,
6377
all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=strata)
6478

6579
np.random.seed(42)
66-
dml_lpq_obj = dml.DoubleMLLPQ(obj_dml_data,
67-
clone(learner), clone(learner),
68-
treatment=treatment,
69-
quantile=quantile,
70-
n_folds=n_folds,
71-
n_rep=1,
72-
dml_procedure=dml_procedure,
73-
normalize_ipw=normalize_ipw,
74-
trimming_threshold=trimming_threshold,
75-
draw_sample_splitting=False)
76-
77-
# synchronize the sample splitting
78-
dml_lpq_obj.set_sample_splitting(all_smpls=all_smpls)
79-
dml_lpq_obj.fit()
80-
81-
np.random.seed(42)
82-
res_manual = fit_lpq(y, x, d, z, quantile, clone(learner), clone(learner),
83-
all_smpls, treatment, dml_procedure,
84-
normalize_ipw=normalize_ipw,
85-
n_rep=1, trimming_threshold=trimming_threshold)
80+
if kde == 'default':
81+
dml_lpq_obj = dml.DoubleMLLPQ(obj_dml_data,
82+
clone(learner), clone(learner),
83+
treatment=treatment,
84+
quantile=quantile,
85+
n_folds=n_folds,
86+
n_rep=1,
87+
dml_procedure=dml_procedure,
88+
normalize_ipw=normalize_ipw,
89+
trimming_threshold=trimming_threshold,
90+
draw_sample_splitting=False)
91+
# synchronize the sample splitting
92+
dml_lpq_obj.set_sample_splitting(all_smpls=all_smpls)
93+
dml_lpq_obj.fit()
94+
95+
np.random.seed(42)
96+
res_manual = fit_lpq(y, x, d, z, quantile, clone(learner), clone(learner),
97+
all_smpls, treatment, dml_procedure,
98+
normalize_ipw=normalize_ipw, kde=_default_kde,
99+
n_rep=1, trimming_threshold=trimming_threshold)
100+
else:
101+
dml_lpq_obj = dml.DoubleMLLPQ(obj_dml_data,
102+
clone(learner), clone(learner),
103+
treatment=treatment,
104+
quantile=quantile,
105+
n_folds=n_folds,
106+
n_rep=1,
107+
dml_procedure=dml_procedure,
108+
normalize_ipw=normalize_ipw,
109+
kde=kde,
110+
trimming_threshold=trimming_threshold,
111+
draw_sample_splitting=False)
112+
113+
# synchronize the sample splitting
114+
dml_lpq_obj.set_sample_splitting(all_smpls=all_smpls)
115+
dml_lpq_obj.fit()
116+
117+
np.random.seed(42)
118+
res_manual = fit_lpq(y, x, d, z, quantile, clone(learner), clone(learner),
119+
all_smpls, treatment, dml_procedure,
120+
normalize_ipw=normalize_ipw, kde=kde,
121+
n_rep=1, trimming_threshold=trimming_threshold)
86122

87123
res_dict = {'coef': dml_lpq_obj.coef,
88124
'coef_manual': res_manual['lpq'],

0 commit comments

Comments
 (0)