@@ -3,6 +3,7 @@
 import warnings
 
 from sklearn.base import is_regressor, is_classifier
+from sklearn.metrics import mean_squared_error
 
 from scipy.stats import norm
 
@@ -45,8 +46,10 @@ def __init__(self,
         self._learner = None
         self._params = None
 
-        # initialize predictions to None which are only stored if method fit is called with store_predictions=True
+        # initialize predictions and targets to None which are only stored if method fit is called with store_predictions=True
         self._predictions = None
+        self._nuisance_targets = None
+        self._rmses = None
 
         # initialize models to None which are only stored if method fit is called with store_models=True
         self._models = None
@@ -129,6 +132,11 @@ def __str__(self):
         learner_info = ''
         for key, value in self.learner.items():
             learner_info += f'Learner {key}: {str(value)}\n'
+        if self.rmses is not None:
+            learner_info += 'Out-of-sample Performance:\n'
+            for learner in self.params_names:
+                learner_info += f'Learner {learner} RMSE: {self.rmses[learner]}\n'
+
         if self._is_cluster_data:
             resampling_info = f'No. folds per cluster: {self._n_folds_per_cluster}\n' \
                               f'No. folds: {self.n_folds}\n' \
@@ -231,6 +239,20 @@ def predictions(self):
         """
         return self._predictions
 
+    @property
+    def nuisance_targets(self):
+        """
+        The outcome of the nuisance models.
+        """
+        return self._nuisance_targets
+
+    @property
+    def rmses(self):
+        """
+        The root-mean-squared-errors of the nuisance models.
+        """
+        return self._rmses
+
     @property
     def models(self):
         """
@@ -434,7 +456,7 @@ def __psi_deriv(self):
     def __all_se(self):
         return self._all_se[self._i_treat, self._i_rep]
 
-    def fit(self, n_jobs_cv=None, store_predictions=False, store_models=False):
+    def fit(self, n_jobs_cv=None, store_predictions=True, store_models=False):
         """
         Estimate DoubleML models.
 
@@ -471,8 +493,11 @@ def fit(self, n_jobs_cv=None, store_predictions=False, store_models=False):
             raise TypeError('store_models must be True or False. '
                             f'Got {str(store_models)}.')
 
+        # initialize rmse arrays for nuisance functions evaluation
+        self._initialize_rmses()
+
         if store_predictions:
-            self._initialize_predictions()
+            self._initialize_predictions_and_targets()
 
         if store_models:
             self._initialize_models()
@@ -491,8 +516,10 @@ def fit(self, n_jobs_cv=None, store_predictions=False, store_models=False):
 
                 self._set_score_elements(score_elements, self._i_rep, self._i_treat)
 
+                # calculate rmses and store predictions and targets of the nuisance models
+                self._calc_rmses(preds['predictions'], preds['targets'])
                 if store_predictions:
-                    self._store_predictions(preds['predictions'])
+                    self._store_predictions_and_targets(preds['predictions'], preds['targets'])
                 if store_models:
                     self._store_models(preds['models'])
 
@@ -990,22 +1017,103 @@ def _initialize_boot_arrays(self, n_rep_boot):
         boot_t_stat = np.full((self._dml_data.n_coefs, n_rep_boot * self.n_rep), np.nan)
         return n_rep_boot, boot_coef, boot_t_stat
 
-    def _initialize_predictions(self):
+    def _initialize_predictions_and_targets(self):
         self._predictions = {learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan)
                              for learner in self.params_names}
+        self._nuisance_targets = {learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan)
+                                  for learner in self.params_names}
+
+    def _initialize_rmses(self):
+        self._rmses = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan)
+                       for learner in self.params_names}
 
     def _initialize_models(self):
         self._models = {learner: {treat_var: [None] * self.n_rep for treat_var in self._dml_data.d_cols}
                         for learner in self.params_names}
 
-    def _store_predictions(self, preds):
+    def _store_predictions_and_targets(self, preds, targets):
         for learner in self.params_names:
             self._predictions[learner][:, self._i_rep, self._i_treat] = preds[learner]
+            self._nuisance_targets[learner][:, self._i_rep, self._i_treat] = targets[learner]
+
+    def _calc_rmses(self, preds, targets):
+        for learner in self.params_names:
+            if targets[learner] is None:
+                self._rmses[learner][self._i_rep, self._i_treat] = np.nan
+            else:
+                sq_error = np.power(targets[learner] - preds[learner], 2)
+                self._rmses[learner][self._i_rep, self._i_treat] = np.sqrt(np.mean(sq_error, 0))
 
     def _store_models(self, models):
         for learner in self.params_names:
             self._models[learner][self._dml_data.d_cols[self._i_treat]][self._i_rep] = models[learner]
 
+    def evaluate_learners(self, learners=None, metric=mean_squared_error):
+        """
+        Evaluate fitted learners for DoubleML models on cross-validated predictions.
+
+        Parameters
+        ----------
+        learners : list
+            A list of strings which correspond to the nuisance functions of the model.
+
+        metric : callable
+            A callable function with inputs ``y_pred`` and ``y_true`` of shape ``(1, n)``,
+            where ``n`` specifies the number of observations.
+            Default is the mean squared error.
+
+        Returns
+        -------
+        dist : dict
+            A dictionary containing the evaluated metric for each learner.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import doubleml as dml
+        >>> from sklearn.metrics import mean_absolute_error
+        >>> from doubleml.datasets import make_irm_data
+        >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+        >>> np.random.seed(3141)
+        >>> ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+        >>> ml_m = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+        >>> data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame')
+        >>> obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
+        >>> dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m)
+        >>> dml_irm_obj.fit()
+        >>> dml_irm_obj.evaluate_learners(metric=mean_absolute_error)
+        {'ml_g0': array([[1.13318973]]),
+         'ml_g1': array([[0.91659939]]),
+         'ml_m': array([[0.36350912]])}
+        """
+        # if no learners are provided try to evaluate all learners
+        if learners is None:
+            learners = self.params_names
+
+        # check metric
+        if not callable(metric):
+            raise TypeError('metric should be a callable. '
+                            '%r was passed.' % metric)
+
+        if all(learner in self.params_names for learner in learners):
+            if self.nuisance_targets is None:
+                raise ValueError('Apply fit() before evaluate_learners().')
+            else:
+                dist = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan)
+                        for learner in learners}
+                for learner in learners:
+                    for rep in range(self.n_rep):
+                        for coef_idx in range(self._dml_data.n_coefs):
+                            res = metric(y_pred=self.predictions[learner][:, rep, coef_idx].reshape(1, -1),
+                                         y_true=self.nuisance_targets[learner][:, rep, coef_idx].reshape(1, -1))
+                            if not np.isfinite(res):
+                                raise ValueError(f'Evaluation from learner {str(learner)} is not finite.')
+                            dist[learner][rep, coef_idx] = res
+                return dist
+        else:
+            raise ValueError(f'The learners have to be a subset of {str(self.params_names)}. '
+                             f'Learners {str(learners)} provided.')
+
     def draw_sample_splitting(self):
         """
         Draw sample splitting for DoubleML models.
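# -----------------------------------------------------------------------------
# Minimal usage sketch (not part of the diff above): how the new nuisance
# evaluation API is expected to be used. It reuses the setup from the
# `evaluate_learners` docstring example; the learner keys ('ml_g0', 'ml_g1',
# 'ml_m') and array shapes are taken from the added code, not verified output.
# -----------------------------------------------------------------------------
import numpy as np
import doubleml as dml
from doubleml.datasets import make_irm_data
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error

np.random.seed(3141)
ml_g = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
ml_m = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
data = make_irm_data(theta=0.5, n_obs=500, dim_x=20, return_type='DataFrame')
obj_dml_data = dml.DoubleMLData(data, 'y', 'd')

dml_irm_obj = dml.DoubleMLIRM(obj_dml_data, ml_g, ml_m)
dml_irm_obj.fit()  # store_predictions now defaults to True

# cross-fitted predictions and targets, one array per learner,
# each of shape (n_obs, n_rep, n_coefs)
print(dml_irm_obj.predictions['ml_m'].shape)
print(dml_irm_obj.nuisance_targets['ml_m'].shape)

# root-mean-squared errors per learner, shape (n_rep, n_coefs);
# also reported in the out-of-sample performance block of str(dml_irm_obj)
print(dml_irm_obj.rmses)

# evaluate the learners with a custom metric taking y_pred/y_true of shape (1, n)
print(dml_irm_obj.evaluate_learners(metric=mean_absolute_error))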