Add RMSEs and targets #182

Merged: 14 commits, Jan 25, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -25,6 +25,7 @@ share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
*.vscode
MANIFEST
*.idea
*.vscode
14 changes: 13 additions & 1 deletion doubleml/_utils.py
@@ -111,6 +111,7 @@ def _dml_cv_predict(estimator, x, y, smpls=None,
res['preds'] = preds[:, 1]
else:
res['preds'] = preds
res['targets'] = y
else:
if not smpls_is_partition:
assert not fold_specific_target, 'combination of fold-specific y and no cross-fitting not implemented yet'
@@ -150,7 +151,9 @@ def _dml_cv_predict(estimator, x, y, smpls=None,
for idx, (train_index, test_index) in enumerate(smpls))

preds = np.full(n_obs, np.nan)
targets = np.full(n_obs, np.nan)
train_preds = list()
train_targets = list()
Member comment:

As far as I see, we're not using the train_targets right now, right? I guess that's alright and we can leave an in-sample (fold-wise cross-validated) vs. out-of-sample (cross-fitted) evaluation for later. I'm wondering how we can implement this in a clever way... 🤔
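Not part of this PR, just a rough sketch of how the stored train_preds/train_targets could later feed such an in-sample vs. out-of-sample comparison; it assumes the result dict returned by _dml_cv_predict, and the helper name is hypothetical:

import numpy as np

def insample_vs_oos_rmse(res):
    # out-of-sample (cross-fitted) RMSE from the stacked test-fold predictions
    oos_rmse = np.sqrt(np.nanmean(np.power(res['targets'] - res['preds'], 2)))
    # in-sample RMSE per fold from the training predictions, averaged over folds
    in_rmse = np.mean([np.sqrt(np.mean(np.power(t - p, 2)))
                       for p, t in zip(res['train_preds'], res['train_targets'])])
    return {'in_sample': in_rmse, 'out_of_sample': oos_rmse}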

for idx, (train_index, test_index) in enumerate(smpls):
assert idx == fitted_models[idx][1]
pred_fun = getattr(fitted_models[idx][0], method)
@@ -159,12 +162,21 @@ def _dml_cv_predict(estimator, x, y, smpls=None,
else:
preds[test_index] = pred_fun(x[test_index, :])

if fold_specific_target:
# targets not available for fold specific target
targets = None
else:
targets[test_index] = y[test_index]

if return_train_preds:
train_preds.append(pred_fun(x[train_index, :]))
train_targets.append(y[train_index])

res['preds'] = preds
res['targets'] = targets
if return_train_preds:
res['train_preds'] = train_preds
res['train_targets'] = train_targets
if return_models:
fold_ids = [xx[1] for xx in fitted_models]
if not np.alltrue(fold_ids == np.arange(len(smpls))):
@@ -222,4 +234,4 @@ def _check_is_propensity(preds, learner, learner_name, smpls, eps=1e-12):
if any((preds[test_indices] < eps) | (preds[test_indices] > 1 - eps)):
warnings.warn(f'Propensity predictions from learner {str(learner)} for'
f' {learner_name} are close to zero or one (eps={eps}).')
return
return
120 changes: 114 additions & 6 deletions doubleml/double_ml.py
@@ -3,6 +3,7 @@
import warnings

from sklearn.base import is_regressor, is_classifier
from sklearn.metrics import mean_squared_error

from scipy.stats import norm

@@ -45,8 +46,10 @@ def __init__(self,
self._learner = None
self._params = None

# initialize predictions to None which are only stored if method fit is called with store_predictions=True
# initialize predictions and targets to None which are only stored if method fit is called with store_predictions=True
self._predictions = None
self._nuisance_targets = None
self._rmses = None

# initialize models to None which are only stored if method fit is called with store_models=True
self._models = None
@@ -129,6 +132,11 @@ def __str__(self):
learner_info = ''
for key, value in self.learner.items():
learner_info += f'Learner {key}: {str(value)}\n'
if self.rmses is not None:
learner_info += 'Out-of-sample Performance:\n'
for learner in self.params_names:
learner_info += f'Learner {learner} RMSE: {self.rmses[learner]}\n'

if self._is_cluster_data:
resampling_info = f'No. folds per cluster: {self._n_folds_per_cluster}\n' \
f'No. folds: {self.n_folds}\n' \
@@ -231,6 +239,20 @@ def predictions(self):
"""
return self._predictions

@property
def nuisance_targets(self):
"""
The outcome of the nuisance models.
"""
return self._nuisance_targets

@property
def rmses(self):
"""
The root-mean-squared-errors of the nuisance models.
"""
return self._rmses

@property
def models(self):
"""
@@ -434,7 +456,7 @@ def __psi_deriv(self):
def __all_se(self):
return self._all_se[self._i_treat, self._i_rep]

def fit(self, n_jobs_cv=None, store_predictions=False, store_models=False):
def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
Member comment:

Thanks @SvenKlaassen - I think it's a good idea to set the default for store_predictions to True 👍

"""
Estimate DoubleML models.

@@ -471,8 +493,11 @@ def fit(self, n_jobs_cv=None, store_predictions=False, store_models=False):
raise TypeError('store_models must be True or False. '
f'Got {str(store_models)}.')

# initialize rmse arrays for nuisance functions evaluation
self._initialize_rmses()

if store_predictions:
self._initialize_predictions()
self._initialize_predictions_and_targets()

if store_models:
self._initialize_models()
@@ -491,8 +516,10 @@ def fit(self, n_jobs_cv=None, store_predictions=False, store_models=False):

self._set_score_elements(score_elements, self._i_rep, self._i_treat)

# calculate rmses and store predictions and targets of the nuisance models
self._calc_rmses(preds['predictions'], preds['targets'])
Member comment:

Hi @SvenKlaassen,

thanks for this PR! I think adding some diagnostics is really nice.

I think it should be possible to make this a bit more general by using (maybe only a subset of) sklearn's metrics either by letting users pass a callable for evaluation of the nuisance predictions or by directly supporting the measures. For example, in case $D$ or $Y$ are binary, classification error or cross-entropy loss might be more relevant than the RMSE. What do you think about this?

Member comment:

I guess that sklearn's measures have some built-in methods to handle exceptions that might occur ...

Member Author comment (@SvenKlaassen, Jan 6, 2023):

Thanks @PhilippBach, that would be helpful. I think I can add this, but keeping RMSE as the default seems useful, since otherwise one would have to specify a different metric for each learner, and the RMSE is still informative for classification tasks.
Another option would be a separate method that evaluates the nuisance functions with a user-supplied metric, while keeping the RMSE as the default for the summary.

if store_predictions:
self._store_predictions(preds['predictions'])
self._store_predictions_and_targets(preds['predictions'], preds['targets'])
if store_models:
self._store_models(preds['models'])

@@ -990,22 +1017,103 @@ def _initialize_boot_arrays(self, n_rep_boot):
boot_t_stat = np.full((self._dml_data.n_coefs, n_rep_boot * self.n_rep), np.nan)
return n_rep_boot, boot_coef, boot_t_stat

def _initialize_predictions(self):
def _initialize_predictions_and_targets(self):
self._predictions = {learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan)
for learner in self.params_names}
self._nuisance_targets = {learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan)
for learner in self.params_names}

def _initialize_rmses(self):
self._rmses = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan)
for learner in self.params_names}

def _initialize_models(self):
self._models = {learner: {treat_var: [None] * self.n_rep for treat_var in self._dml_data.d_cols}
for learner in self.params_names}

def _store_predictions(self, preds):
def _store_predictions_and_targets(self, preds, targets):
for learner in self.params_names:
self._predictions[learner][:, self._i_rep, self._i_treat] = preds[learner]
self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner]

def _calc_rmses(self, preds, targets):
for learner in self.params_names:
if targets[learner] is None:
self._rmses[learner][self._i_rep, self._i_treat] = np.nan
else:
sq_error = np.power(targets[learner] - preds[learner], 2)
self._rmses[learner][self._i_rep, self._i_treat] = np.sqrt(np.mean(sq_error, 0))

def _store_models(self, models):
for learner in self.params_names:
self._models[learner][self._dml_data.d_cols[self._i_treat]][self._i_rep] = models[learner]

def evaluate_learners(self, learners=None, metric=mean_squared_error):
"""
Evaluate fitted learners for DoubleML models on cross-validated predictions.

Parameters
----------
learners : list
A list of strings which correspond to the nuisance functions of the model.

metric : callable
A callable function with inputs ``y_pred`` and ``y_true`` of shape ``(1, n)``,
where ``n`` specifies the number of observations.
Default is the mean squared error (``sklearn.metrics.mean_squared_error``).

Returns
-------
dist : dict
A dictionary containing the evaluated metric for each learner.

Examples
--------
>>> import numpy as np
>>> import doubleml as dml
>>> from sklearn.metrics import mean_absolute_error
>>> from doubleml.datasets import make_irm_data
>>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
>>> np.random.seed(3141)
>>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
>>> ml_m = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
>>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame')
>>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
>>> dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m)
>>> dml_irm_obj.fit()
>>> dml_irm_obj.evaluate_learners(metric=mean_absolute_error)
Member comment:

I'm afraid we get a little problem if we use callables for classification here, for example if we run this instead:

import numpy as np
import doubleml as dml
from sklearn.metrics import log_loss
from doubleml.datasets import make_irm_data
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
np.random.seed(3141)
ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
ml_m = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame')
obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m)
dml_irm_obj.fit()
dml_irm_obj.evaluate_learners(metric=log_loss)

Member comment:

I think we'd have to check whether a learner is a regression or classification learner and then (optionally) pass two callables, one for the regression tasks and one for the classification tasks. Alternatively, one could pass a keyword referring to the learner_name, but that would probably lead to a messy interface.

I think the default can still be RMSE (or another regression measure) for all nuisance parts, but the option to use a classification measure is probably reasonable. What do you think?
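A minimal sketch of that dispatch, not part of this PR; it assumes the metric is chosen from the fitted nuisance learner via sklearn's is_classifier, and the helper name is illustrative:

from sklearn.base import is_classifier
from sklearn.metrics import mean_squared_error, log_loss

def _pick_metric(learner, reg_metric=mean_squared_error, clf_metric=log_loss):
    # classification metric only for classifiers, regression metric otherwise
    return clf_metric if is_classifier(learner) else reg_metric

evaluate_learners could then look up the metric per nuisance part instead of applying one callable to all of them.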

{'ml_g0': array([[1.13318973]]),
'ml_g1': array([[0.91659939]]),
'ml_m': array([[0.36350912]])}
"""
# if no learners are provided try to evaluate all learners
if learners is None:
learners = self.params_names

# check metric
if not callable(metric):
raise TypeError('metric should be a callable. '
'%r was passed.' % metric)

if all(learner in self.params_names for learner in learners):
if self.nuisance_targets is None:
raise ValueError('Apply fit() before evaluate_learners().')
else:
dist = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan)
for learner in learners}
for learner in learners:
for rep in range(self.n_rep):
for coef_idx in range(self._dml_data.n_coefs):
res = metric(y_pred=self.predictions[learner][:, rep, coef_idx].reshape(1, -1),
y_true=self.nuisance_targets[learner][:, rep, coef_idx].reshape(1, -1))
if not np.isfinite(res):
raise ValueError(f'Evaluation from learner {str(learner)} is not finite.')
dist[learner][rep, coef_idx] = res
return dist
else:
raise ValueError(f'The learners have to be a subset of {str(self.params_names)}. '
f'Learners {str(learners)} provided.')

def draw_sample_splitting(self):
"""
Draw sample splitting for DoubleML models.
13 changes: 9 additions & 4 deletions doubleml/double_ml_iivm.py
@@ -61,7 +61,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML):

trimming_threshold : float
The threshold used for trimming.
Default is ``1e-12``.
Default is ``1e-2``.

draw_sample_splitting : bool
Indicates whether the sample splitting should be drawn during initialization of the object.
Expand Down Expand Up @@ -129,7 +129,7 @@ def __init__(self,
subgroups=None,
dml_procedure='dml2',
trimming_rule='truncate',
trimming_threshold=1e-12,
trimming_threshold=1e-2,
draw_sample_splitting=True,
apply_cross_fitting=True):
super().__init__(obj_dml_data,
Expand Down Expand Up @@ -282,15 +282,15 @@ def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
est_params=self._get_params('ml_r0'), method=self._predict_method['ml_r'],
return_models=return_models)
else:
r_hat0 = {'preds': np.zeros_like(d), 'models': None}
r_hat0 = {'preds': np.zeros_like(d), 'targets': np.zeros_like(d), 'models': None}
_check_finite_predictions(r_hat0['preds'], self._learner['ml_r'], 'ml_r', smpls)

if self.subgroups['never_takers']:
r_hat1 = _dml_cv_predict(self._learner['ml_r'], x, d, smpls=smpls_z1, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_r1'), method=self._predict_method['ml_r'],
return_models=return_models)
else:
r_hat1 = {'preds': np.ones_like(d), 'models': None}
r_hat1 = {'preds': np.ones_like(d), 'targets': np.ones_like(d), 'models': None}
_check_finite_predictions(r_hat1['preds'], self._learner['ml_r'], 'ml_r', smpls)

psi_a, psi_b = self._score_elements(y, z, d,
@@ -303,6 +303,11 @@ def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
'ml_m': m_hat['preds'],
'ml_r0': r_hat0['preds'],
'ml_r1': r_hat1['preds']},
'targets': {'ml_g0': g_hat0['targets'],
'ml_g1': g_hat1['targets'],
'ml_m': m_hat['targets'],
'ml_r0': r_hat0['targets'],
'ml_r1': r_hat1['targets']},
'models': {'ml_g0': g_hat0['models'],
'ml_g1': g_hat1['models'],
'ml_m': m_hat['models'],
9 changes: 6 additions & 3 deletions doubleml/double_ml_irm.py
@@ -55,7 +55,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML):

trimming_threshold : float
The threshold used for trimming.
Default is ``1e-12``.
Default is ``1e-2``.

draw_sample_splitting : bool
Indicates whether the sample splitting should be drawn during initialization of the object.
@@ -114,7 +114,7 @@ def __init__(self,
score='ATE',
dml_procedure='dml2',
trimming_rule='truncate',
trimming_threshold=1e-12,
trimming_threshold=1e-2,
draw_sample_splitting=True,
apply_cross_fitting=True):
super().__init__(obj_dml_data,
@@ -206,7 +206,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
'observed to be binary with values 0 and 1. Make sure that for classifiers '
'probabilities and not labels are predicted.')

g_hat1 = {'preds': None, 'models': None}
g_hat1 = {'preds': None, 'targets': None, 'models': None}
if (self.score == 'ATE') | callable(self.score):
g_hat1 = _dml_cv_predict(self._learner['ml_g'], x, y, smpls=smpls_d1, n_jobs=n_jobs_cv,
est_params=self._get_params('ml_g1'), method=self._predict_method['ml_g'],
@@ -237,6 +237,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, return_models=False):
preds = {'predictions': {'ml_g0': g_hat0['preds'],
'ml_g1': g_hat1['preds'],
'ml_m': m_hat['preds']},
'targets': {'ml_g0': g_hat0['targets'],
'ml_g1': g_hat1['targets'],
'ml_m': m_hat['targets']},
'models': {'ml_g0': g_hat0['models'],
'ml_g1': g_hat1['models'],
'ml_m': m_hat['models']}