Extend unit tests and bug fixes #202

Merged: 5 commits, Jun 2, 2023
Binary file added .coverage
Binary file not shown.
13 changes: 8 additions & 5 deletions doubleml/double_ml.py
@@ -1064,7 +1064,7 @@ def evaluate_learners(self, learners=None, metric=_rmse):
             where ``n`` specifies the number of observations. Remark that some models like IRM are
             not able to provide all values for ``y_true`` for all learners and might contain
             some ``nan`` values in the target vector.
-            Default is the euclidean distance.
+            Default is the root-mean-square error.
 
         Returns
         -------
@@ -1085,10 +1085,13 @@ def evaluate_learners(self, learners=None, metric=_rmse):
         >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
         >>> dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m)
         >>> dml_irm_obj.fit()
-        >>> dml_irm_obj.evaluate_learners(metric=mean_absolute_error)
-        {'ml_g0': array([[1.13318973]]),
-         'ml_g1': array([[0.91659939]]),
-         'ml_m': array([[0.36350912]])}
+        >>> def mae(y_true, y_pred):
+        ...     subset = np.logical_not(np.isnan(y_true))
+        ...     return mean_absolute_error(y_true[subset], y_pred[subset])
+        >>> dml_irm_obj.evaluate_learners(metric=mae)
+        {'ml_g0': array([[0.85974356]]),
+         'ml_g1': array([[0.85280376]]),
+         'ml_m': array([[0.35365143]])}
         """
         # if no learners are provided try to evaluate all learners
         if learners is None:
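The docstring example now wraps mean_absolute_error because, for models like IRM, the target vector handed to the metric contains nan entries (each outcome regression only has observed targets in its own treatment arm, as the docstring notes), and plain sklearn metrics raise on nan input. A minimal self-contained sketch of the filtering idea, with made-up arrays for illustration:

    import numpy as np

    # Hypothetical target/prediction pair; the second target is unobserved (nan).
    y_true = np.array([1.0, np.nan, 2.0])
    y_pred = np.array([0.9, 1.5, 2.2])

    # Restrict the metric to observed entries, as in the new docstring example.
    subset = np.logical_not(np.isnan(y_true))
    mae = np.mean(np.abs(y_true[subset] - y_pred[subset]))
    print(mae)  # approximately 0.15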
46 changes: 0 additions & 46 deletions doubleml/double_ml_pliv.py
@@ -281,17 +281,6 @@ def _check_data(self, obj_dml_data):
                              'use DoubleMLPLR instead of DoubleMLPLIV.')
         return
 
-    # To be removed in version 0.6.0
-    def set_ml_nuisance_params(self, learner, treat_var, params):
-        if isinstance(self.score, str) & (self.score == 'partialling out') & (learner == 'ml_g'):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the argument learner accordingly. "
-                           "The provided parameters are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            learner = 'ml_l'
-        super(DoubleMLPLIV, self).set_ml_nuisance_params(learner, treat_var, params)
-
     def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
         if self.partialX & (not self.partialZ):
             psi_elements, preds = self._nuisance_est_partial_x(smpls, n_jobs_cv, return_models)
@@ -523,41 +512,6 @@ def _nuisance_est_partial_xz(self, smpls, n_jobs_cv, return_models=False):
 
         return psi_elements, preds
 
-    # To be removed in version 0.6.0
-    def tune(self,
-             param_grids,
-             tune_on_folds=False,
-             scoring_methods=None,  # if None the estimator's score method is used
-             n_folds_tune=5,
-             search_mode='grid_search',
-             n_iter_randomized_search=100,
-             n_jobs_cv=None,
-             set_as_params=True,
-             return_tune_res=False):
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (param_grids is not None) and \
-                ('ml_g' in param_grids) and ('ml_l' not in param_grids):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of param_grids accordingly. "
-                           "The provided param_grids for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            param_grids['ml_l'] = param_grids.pop('ml_g')
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (scoring_methods is not None) and \
-                ('ml_g' in scoring_methods) and ('ml_l' not in scoring_methods):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of scoring_methods accordingly. "
-                           "The provided scoring_methods for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            scoring_methods['ml_l'] = scoring_methods.pop('ml_g')
-
-        tune_res = super(DoubleMLPLIV, self).tune(param_grids, tune_on_folds, scoring_methods, n_folds_tune,
-                                                  search_mode, n_iter_randomized_search, n_jobs_cv, set_as_params,
-                                                  return_tune_res)
-        return tune_res
-
     def _nuisance_tuning_partial_x(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv,
                                    search_mode, n_iter_randomized_search):
         x, y = check_X_y(self._dml_data.x, self._dml_data.y,
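These deletions (mirrored for DoubleMLPLR in the next file) remove the temporary redirection that mapped the legacy learner name ml_g to ml_l for the 'partialling out' score, as the "To be removed in version 0.6.0" comments announced. Callers must now key parameters and grids by the new name directly. A hedged migration sketch, assuming a DoubleMLPLR with random-forest learners on the package's example data (the parameter values and grids here are illustrative, not from this PR):

    from sklearn.ensemble import RandomForestRegressor
    import doubleml as dml
    from doubleml.datasets import make_plr_CCDDHNR2018

    dml_data = make_plr_CCDDHNR2018(n_obs=100)
    dml_obj = dml.DoubleMLPLR(dml_data, RandomForestRegressor(), RandomForestRegressor())

    # Before: parameters keyed by the old name 'ml_g' were silently
    # redirected to 'ml_l' with a DeprecationWarning.
    # dml_obj.set_ml_nuisance_params('ml_g', 'd', {'n_estimators': 100})

    # Now the new learner name must be used directly:
    dml_obj.set_ml_nuisance_params('ml_l', 'd', {'n_estimators': 100})
    dml_obj.tune(param_grids={'ml_l': {'n_estimators': [25, 50]},
                              'ml_m': {'n_estimators': [25, 50]}})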
45 changes: 0 additions & 45 deletions doubleml/double_ml_plr.py
@@ -163,17 +163,6 @@ def _check_data(self, obj_dml_data):
                              'To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR.')
         return
 
-    # To be removed in version 0.6.0
-    def set_ml_nuisance_params(self, learner, treat_var, params):
-        if isinstance(self.score, str) & (self.score == 'partialling out') & (learner == 'ml_g'):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the argument learner accordingly. "
-                           "The provided parameters are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            learner = 'ml_l'
-        super(DoubleMLPLR, self).set_ml_nuisance_params(learner, treat_var, params)
-
     def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
         x, y = check_X_y(self._dml_data.x, self._dml_data.y,
                          force_all_finite=False)
@@ -252,40 +241,6 @@ def _score_elements(self, y, d, l_hat, m_hat, g_hat, smpls):
 
         return psi_a, psi_b
 
-    # To be removed in version 0.6.0
-    def tune(self,
-             param_grids,
-             tune_on_folds=False,
-             scoring_methods=None,  # if None the estimator's score method is used
-             n_folds_tune=5,
-             search_mode='grid_search',
-             n_iter_randomized_search=100,
-             n_jobs_cv=None,
-             set_as_params=True,
-             return_tune_res=False):
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (param_grids is not None) and \
-                ('ml_g' in param_grids) and ('ml_l' not in param_grids):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of param_grids accordingly. "
-                           "The provided param_grids for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            param_grids['ml_l'] = param_grids.pop('ml_g')
-
-        if isinstance(self.score, str) and (self.score == 'partialling out') and (scoring_methods is not None) and \
-                ('ml_g' in scoring_methods) and ('ml_l' not in scoring_methods):
-            warnings.warn(("Learner ml_g was renamed to ml_l. "
-                           "Please adapt the key of scoring_methods accordingly. "
-                           "The provided scoring_methods for ml_g are set for ml_l. "
-                           "The redirection will be removed in a future version."),
-                          DeprecationWarning, stacklevel=2)
-            scoring_methods['ml_l'] = scoring_methods.pop('ml_g')
-
-        tune_res = super(DoubleMLPLR, self).tune(param_grids, tune_on_folds, scoring_methods, n_folds_tune, search_mode,
-                                                 n_iter_randomized_search, n_jobs_cv, set_as_params, return_tune_res)
-        return tune_res
-
     def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv,
                          search_mode, n_iter_randomized_search):
         x, y = check_X_y(self._dml_data.x, self._dml_data.y,
13 changes: 7 additions & 6 deletions doubleml/tests/_utils_lpq_manual.py
@@ -11,6 +11,7 @@ def fit_lpq(y, x, d, z, quantile,
             learner_g, learner_m, all_smpls, treatment, dml_procedure, n_rep=1,
             trimming_rule='truncate',
             trimming_threshold=1e-2,
+            kde=_default_kde,
             normalize_ipw=True, m_z_params=None,
             m_d_z0_params=None, m_d_z1_params=None,
             g_du_z0_params=None, g_du_z1_params=None):
@@ -37,10 +38,10 @@ def fit_lpq(y, x, d, z, quantile,
                                       g_du_z1_params=g_du_z1_params)
         if dml_procedure == 'dml1':
             lpqs[i_rep], ses[i_rep] = lpq_dml1(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
-                                               treatment, quantile, ipw_vec, coef_bounds, smpls)
+                                               treatment, quantile, ipw_vec, coef_bounds, smpls, kde)
         else:
             lpqs[i_rep], ses[i_rep] = lpq_dml2(y, d, z, m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat,
-                                               treatment, quantile, ipw_vec, coef_bounds)
+                                               treatment, quantile, ipw_vec, coef_bounds, kde)
 
     lpq = np.median(lpqs)
     se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(lpqs - lpq, 2)) / n_obs)
@@ -200,7 +201,7 @@ def ipw_score(theta):
     return m_z_hat, g_du_z0_hat, g_du_z1_hat, comp_prob_hat, ipw_vec, coef_bounds
 
 
-def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls):
+def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, smpls, kde):
     thetas = np.zeros(len(smpls))
     n_obs = len(y)
     ipw_est = ipw_vec.mean()
@@ -211,17 +212,17 @@ def lpq_dml1(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw
 
     theta_hat = np.mean(thetas)
 
-    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
+    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
 
     return theta_hat, se
 
 
-def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds):
+def lpq_dml2(y, d, z, m_z, g_du_z0, g_du_z1, comp_prob, treatment, quantile, ipw_vec, coef_bounds, kde):
     n_obs = len(y)
     ipw_est = ipw_vec.mean()
     theta_hat = lpq_est(m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, ipw_est, coef_bounds)
 
-    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs))
+    se = np.sqrt(lpq_var_est(theta_hat, m_z, g_du_z0, g_du_z1, comp_prob, d, y, z, treatment, quantile, n_obs, kde))
 
     return theta_hat, se
 
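The kde argument added here is threaded through to lpq_var_est, where the variance of the quantile-type score needs a density estimate of the score residuals. The exact contract of the callable is an assumption on my part (inferred from the call sites and the _default_kde default): it takes residuals u and weights and returns a weighted density estimate. A minimal compatible sketch:

    import numpy as np

    def gaussian_kde_at_zero(u, weights):
        # Hypothetical kde callable: weighted Gaussian kernel density of the
        # residuals u, evaluated at zero, with a Silverman-style bandwidth.
        u = np.asarray(u, dtype=float)
        w = np.asarray(weights, dtype=float)
        w = w / np.sum(w)
        bw = max(1.06 * np.std(u) * len(u) ** (-0.2), 1e-12)
        return np.sum(w * np.exp(-0.5 * (u / bw) ** 2)) / (bw * np.sqrt(2.0 * np.pi))

Any callable with this signature could then be passed to fit_lpq via the new kde parameter in place of _default_kde.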
10 changes: 8 additions & 2 deletions doubleml/tests/test_blp.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import copy
 
 import doubleml as dml
 
@@ -26,7 +27,10 @@ def dml_blp_fixture(ci_joint, ci_level):
     random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 3)))
     random_signal = np.random.normal(0, 1, size=(n, ))
 
-    blp = dml.DoubleMLBLP(random_signal, random_basis).fit()
+    blp = dml.DoubleMLBLP(random_signal, random_basis)
+
+    blp_obj = copy.copy(blp)
+    blp.fit()
     blp_manual = fit_blp(random_signal, random_basis)
 
     np.random.seed(42)
@@ -47,7 +51,8 @@ def dml_blp_fixture(ci_joint, ci_level):
                 'ci_1': ci_1,
                 'ci_2': ci_2,
                 'ci_manual': ci_manual,
-                'blp_model': blp}
+                'blp_model': blp,
+                'unfitted_blp_model': blp_obj}
 
     return res_dict
 
@@ -91,6 +96,7 @@ def test_dml_blp_ci_2(dml_blp_fixture):
 def test_dml_blp_return_types(dml_blp_fixture):
     assert isinstance(dml_blp_fixture['blp_model'].__str__(), str)
     assert isinstance(dml_blp_fixture['blp_model'].summary, pd.DataFrame)
+    assert isinstance(dml_blp_fixture['unfitted_blp_model'].summary, pd.DataFrame)
 
 
 @pytest.mark.ci
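One detail of the fixture change: the shallow copy is taken before fit(), so the suite can assert that accessors such as summary also behave on a never-fitted model. A generic sketch of the pattern, with a class invented for illustration:

    import copy

    class Estimator:
        def __init__(self):
            self.coef_ = None

        def fit(self):
            self.coef_ = 1.0
            return self

    est = Estimator()
    snapshot = copy.copy(est)  # shallow copy preserves the pre-fit state
    est.fit()
    print(est.coef_, snapshot.coef_)  # 1.0 None

Caveat: copy.copy is shallow, so if fit() mutated a shared mutable attribute in place (rather than rebinding it), the change would show up in the snapshot as well.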
5 changes: 0 additions & 5 deletions doubleml/tests/test_cvar_tune.py
@@ -3,8 +3,6 @@
 import math
 
 from sklearn.base import clone
-
-from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 
 import doubleml as dml
@@ -58,9 +56,6 @@ def tune_on_folds(request):
 def get_par_grid(learner):
     if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]:
         par_grid = {'n_estimators': [5, 10, 15, 20]}
-    else:
-        assert learner.__class__ in [LogisticRegression]
-        par_grid = {'C': np.logspace(-4, 2, 10)}
     return par_grid
 
 
30 changes: 29 additions & 1 deletion doubleml/tests/test_dml_data.py
@@ -5,9 +5,27 @@
 from doubleml import DoubleMLData, DoubleMLPLR, DoubleMLClusterData, DoubleMLDIDCS
 from doubleml.datasets import make_plr_CCDDHNR2018, _make_pliv_data, make_pliv_CHS2015,\
     make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020
+from doubleml.double_ml_data import DoubleMLBaseData
 
 from sklearn.linear_model import Lasso, LogisticRegression
 
+
+class DummyDataClass(DoubleMLBaseData):
+    def __init__(self, data):
+        DoubleMLBaseData.__init__(self, data)
+
+    @property
+    def n_coefs(self):
+        return 1
+
+
+@pytest.mark.ci
+def test_doubleml_basedata():
+    dummy_dml_data = DummyDataClass(pd.DataFrame(np.zeros((100, 10))))
+    assert dummy_dml_data.d_cols[0] == 'theta'
+    assert dummy_dml_data.n_treat == 1
+    assert dummy_dml_data.n_coefs == 1
+
 @pytest.fixture(scope="module")
 def dml_data_fixture(generate_data1):
     data = generate_data1
@@ -157,12 +175,22 @@ def test_dml_data_no_instr_no_time():
 
 
 @pytest.mark.ci
-def test_dml_cluster_summary_with_time():
+def test_dml_summary_with_time():
     dml_data_did_cs = make_did_SZ2020(n_obs=200, cross_sectional_data=True)
     dml_did_cs = DoubleMLDIDCS(dml_data_did_cs, Lasso(), LogisticRegression())
     assert isinstance(dml_did_cs.__str__(), str)
     assert isinstance(dml_did_cs.summary, pd.DataFrame)
 
+    dml_data = make_plr_CCDDHNR2018(n_obs=100)
+    df = dml_data.data.copy().iloc[:, :11]
+    df.columns = [f'X{i + 1}' for i in np.arange(8)] + ['y', 'd1', 'd2']
+    print(df)
+    dml_data = DoubleMLClusterData(df, 'y', ['d1', 'd2'],
+                                   cluster_cols=[f'X{i + 1}' for i in [5, 6]],
+                                   x_cols=[f'X{i + 1}' for i in np.arange(5)],
+                                   t_col='X8')
+    assert isinstance(dml_data._data_summary_str(), str)
+
 
 @pytest.mark.ci
 def test_x_cols_setter_defaults():