1
1
import numpy as np
2
2
import pandas as pd
3
- from scipy .stats import norm
4
3
5
4
from sklearn .base import clone
6
5
12
11
from .cvar import DoubleMLCVAR
13
12
from ..double_ml_framework import concat
14
13
15
- from ..utils ._estimation import _draw_weights , _default_kde
14
+ from ..utils ._estimation import _default_kde
16
15
from ..utils .resampling import DoubleMLResampling
17
16
from ..utils ._checks import _check_score , _check_trimming , _check_zero_one_treatment
18
17
@@ -155,13 +154,6 @@ def __init__(self,
155
154
# initialize all models
156
155
self ._modellist_0 , self ._modellist_1 = self ._initialize_models ()
157
156
158
- # initialize arrays according to obj_dml_data and the resampling settings
159
- self ._psi0 , self ._psi1 , self ._psi0_deriv , self ._psi1_deriv , \
160
- self ._coef , self ._se , self ._all_coef , self ._all_se = self ._initialize_arrays ()
161
-
162
- # also initialize bootstrap arrays with the default number of bootstrap replications
163
- self ._n_rep_boot , self ._boot_coef , self ._boot_t_stat = self ._initialize_boot_arrays (n_rep_boot = 500 )
164
-
165
157
def __str__ (self ):
166
158
class_name = self .__class__ .__name__
167
159
header = f'================== { class_name } Object ==================\n '
@@ -276,52 +268,69 @@ def trimming_threshold(self):
276
268
@property
277
269
def coef (self ):
278
270
"""
279
- Estimates for the causal parameter(s) after calling :meth:`fit`.
271
+ Estimates for the causal parameter(s) after calling :meth:`fit` (shape (``n_quantiles``,)) .
280
272
"""
281
- return self ._coef
282
-
283
- @ coef . setter
284
- def coef ( self , value ):
285
- self . _coef = value
273
+ if self ._framework is None :
274
+ coef = None
275
+ else :
276
+ coef = self . framework . thetas
277
+ return coef
286
278
287
279
@property
288
280
def all_coef (self ):
289
281
"""
290
- Estimates of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit`.
282
+ Estimates of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit`
283
+ (shape (``n_quantiles``, ``n_rep``)).
291
284
"""
292
- return self ._all_coef
285
+ if self ._framework is None :
286
+ all_coef = None
287
+ else :
288
+ all_coef = self .framework .all_thetas
289
+ return all_coef
293
290
294
291
@property
295
292
def se (self ):
296
293
"""
297
- Standard errors for the causal parameter(s) after calling :meth:`fit`.
294
+ Standard errors for the causal parameter(s) after calling :meth:`fit` (shape (``n_quantiles``,)) .
298
295
"""
299
- return self ._se
296
+ if self ._framework is None :
297
+ se = None
298
+ else :
299
+ se = self .framework .ses
300
+ return se
300
301
301
- @se .setter
302
- def se (self , value ):
303
- self ._se = value
302
+ @property
303
+ def all_se (self ):
304
+ """
305
+ Standard errors of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit`
306
+ (shape (``n_quantiles``, ``n_rep``)).
307
+ """
308
+ if self ._framework is None :
309
+ all_se = None
310
+ else :
311
+ all_se = self .framework .all_ses
312
+ return all_se
304
313
305
314
@property
306
315
def t_stat (self ):
307
316
"""
308
- t-statistics for the causal parameter(s) after calling :meth:`fit`.
317
+ t-statistics for the causal parameter(s) after calling :meth:`fit` (shape (``n_quantiles``,)) .
309
318
"""
310
319
t_stat = self .coef / self .se
311
320
return t_stat
312
321
313
322
@property
314
323
def pval (self ):
315
324
"""
316
- p-values for the causal parameter(s) after calling :meth:`fit` .
325
+ p-values for the causal parameter(s) (shape (``n_quantiles``,)) .
317
326
"""
318
- pval = 2 * norm .cdf (- np .abs (self .t_stat ))
319
- return pval
327
+ return self .framework .pvals
320
328
321
329
@property
322
330
def boot_t_stat (self ):
323
331
"""
324
- Bootstrapped t-statistics for the causal parameter(s) after calling :meth:`fit` and :meth:`bootstrap`.
332
+ Bootstrapped t-statistics for the causal parameter(s) after calling :meth:`fit` and :meth:`bootstrap`
333
+ (shape (``n_rep_boot``, ``n_quantiles``, ``n_rep``)).
325
334
"""
326
335
if self ._framework is None :
327
336
boot_t_stat = None
@@ -362,30 +371,6 @@ def summary(self):
362
371
df_summary = df_summary .join (ci )
363
372
return df_summary
364
373
365
- # The private properties with __ always deliver the single treatment, single (cross-fitting) sample subselection.
366
- # The slicing is based on the two properties self._i_quant, the index of the quantile, and
367
- # self._i_rep, the index of the cross-fitting sample.
368
-
369
- @property
370
- def __psi0 (self ):
371
- return self ._psi0 [:, self ._i_rep , self ._i_quant ]
372
-
373
- @property
374
- def __psi0_deriv (self ):
375
- return self ._psi0_deriv [:, self ._i_rep , self ._i_quant ]
376
-
377
- @property
378
- def __psi1 (self ):
379
- return self ._psi1 [:, self ._i_rep , self ._i_quant ]
380
-
381
- @property
382
- def __psi1_deriv (self ):
383
- return self ._psi1_deriv [:, self ._i_rep , self ._i_quant ]
384
-
385
- @property
386
- def __all_se (self ):
387
- return self ._all_se [self ._i_quant , self ._i_rep ]
388
-
389
374
def fit (self , n_jobs_models = None , n_jobs_cv = None , store_predictions = True , store_models = False , external_predictions = None ):
390
375
"""
391
376
Estimate DoubleMLQTE models.
@@ -436,26 +421,6 @@ def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_
436
421
framework_list [self ._i_quant ] = self ._modellist_1 [self ._i_quant ].framework - \
437
422
self ._modellist_0 [self ._i_quant ].framework
438
423
439
- # treatment Effects
440
- self ._all_coef [self ._i_quant , :] = self .modellist_1 [self ._i_quant ].all_coef - \
441
- self .modellist_0 [self ._i_quant ].all_coef
442
-
443
- # save scores and derivatives
444
- self ._psi0 [:, :, self ._i_quant ] = np .squeeze (self .modellist_0 [i_quant ].psi , 2 )
445
- self ._psi1 [:, :, self ._i_quant ] = np .squeeze (self .modellist_1 [i_quant ].psi , 2 )
446
-
447
- self ._psi0_deriv [:, :, self ._i_quant ] = np .squeeze (self .modellist_0 [i_quant ].psi_deriv , 2 )
448
- self ._psi1_deriv [:, :, self ._i_quant ] = np .squeeze (self .modellist_1 [i_quant ].psi_deriv , 2 )
449
-
450
- # Estimate the variance
451
- for i_rep in range (self .n_rep ):
452
- self ._i_rep = i_rep
453
-
454
- self ._all_se [self ._i_quant , self ._i_rep ] = self ._se_causal_pars ()
455
-
456
- # aggregated parameter estimates and standard errors from repeated cross-fitting
457
- self ._agg_cross_fit ()
458
-
459
424
# aggregate all frameworks
460
425
self ._framework = concat (framework_list )
461
426
@@ -478,33 +443,10 @@ def bootstrap(self, method='normal', n_rep_boot=500):
478
443
-------
479
444
self : object
480
445
"""
481
- if np . isnan ( self .coef ). all () :
446
+ if self ._framework is None :
482
447
raise ValueError ('Apply fit() before bootstrap().' )
448
+ self ._framework .bootstrap (method = method , n_rep_boot = n_rep_boot )
483
449
484
- if (not isinstance (method , str )) | (method not in ['Bayes' , 'normal' , 'wild' ]):
485
- raise ValueError ('Method must be "Bayes", "normal" or "wild". '
486
- f'Got { str (method )} .' )
487
-
488
- if not isinstance (n_rep_boot , int ):
489
- raise TypeError ('The number of bootstrap replications must be of int type. '
490
- f'{ str (n_rep_boot )} of type { str (type (n_rep_boot ))} was passed.' )
491
- if n_rep_boot < 1 :
492
- raise ValueError ('The number of bootstrap replications must be positive. '
493
- f'{ str (n_rep_boot )} was passed.' )
494
-
495
- self ._n_rep_boot , self ._boot_coef , self ._boot_t_stat = self ._initialize_boot_arrays (n_rep_boot )
496
-
497
- for i_rep in range (self .n_rep ):
498
- self ._i_rep = i_rep
499
-
500
- n_obs = self ._dml_data .n_obs
501
- weights = _draw_weights (method , n_rep_boot , n_obs )
502
- for i_quant in range (self .n_quantiles ):
503
- self ._i_quant = i_quant
504
- i_start = self ._i_rep * self .n_rep_boot
505
- i_end = (self ._i_rep + 1 ) * self .n_rep_boot
506
- self ._boot_coef [self ._i_quant , i_start :i_end ], self ._boot_t_stat [self ._i_quant , i_start :i_end ] = \
507
- self ._compute_bootstrap (weights )
508
450
return self
509
451
510
452
def draw_sample_splitting (self ):
@@ -526,15 +468,6 @@ def draw_sample_splitting(self):
526
468
527
469
return self
528
470
529
- def _compute_bootstrap (self , weights ):
530
- J0 = np .mean (self .__psi0_deriv )
531
- J1 = np .mean (self .__psi1_deriv )
532
- scaled_score = self .__psi1 / J1 - self .__psi0 / J0
533
-
534
- boot_coef = np .matmul (weights , scaled_score ) / self ._dml_data .n_obs
535
- boot_t_stat = np .matmul (weights , scaled_score ) / (self ._dml_data .n_obs * self .__all_se )
536
- return boot_coef , boot_t_stat
537
-
538
471
def confint (self , joint = False , level = 0.95 ):
539
472
"""
540
473
Confidence intervals for DoubleML models.
@@ -555,97 +488,49 @@ def confint(self, joint=False, level=0.95):
555
488
A data frame with the confidence interval(s).
556
489
"""
557
490
558
- if not isinstance (joint , bool ):
559
- raise TypeError ('joint must be True or False. '
560
- f'Got { str (joint )} .' )
561
-
562
- if not isinstance (level , float ):
563
- raise TypeError ('The confidence level must be of float type. '
564
- f'{ str (level )} of type { str (type (level ))} was passed.' )
565
- if (level <= 0 ) | (level >= 1 ):
566
- raise ValueError ('The confidence level must be in (0,1). '
567
- f'{ str (level )} was passed.' )
568
-
569
- a = (1 - level )
570
- ab = np .array ([a / 2 , 1. - a / 2 ])
571
- if joint :
572
- if np .isnan (self .boot_coef ).all ():
573
- raise ValueError ('Apply fit() & bootstrap() before confint(joint=True).' )
574
- sim = np .amax (np .abs (self .boot_t_stat ), 0 )
575
- hatc = np .quantile (sim , 1 - a )
576
- ci = np .vstack ((self .coef - self .se * hatc , self .coef + self .se * hatc )).T
577
- else :
578
- if np .isnan (self .coef ).all ():
579
- raise ValueError ('Apply fit() before confint().' )
580
- fac = norm .ppf (ab )
581
- ci = np .vstack ((self .coef + self .se * fac [0 ], self .coef + self .se * fac [1 ])).T
582
-
583
- df_ci = pd .DataFrame (ci ,
584
- columns = ['{:.1f} %' .format (i * 100 ) for i in ab ],
585
- index = self ._quantiles )
586
- return df_ci
587
-
588
- def _fit_quantile (self , i_quant , n_jobs_cv = None , store_predictions = True , store_models = False ):
589
-
590
- model_0 = self .modellist_0 [i_quant ]
591
- model_1 = self .modellist_1 [i_quant ]
592
-
593
- model_0 .fit (n_jobs_cv = n_jobs_cv , store_predictions = store_predictions , store_models = store_models )
594
- model_1 .fit (n_jobs_cv = n_jobs_cv , store_predictions = store_predictions , store_models = store_models )
595
-
596
- return model_0 , model_1
491
+ if self .framework is None :
492
+ raise ValueError ('Apply fit() before confint().' )
597
493
598
- def _agg_cross_fit (self ):
599
- # aggregate parameters from the repeated cross-fitting
600
- # don't use the getter (always for one treatment variable and one sample), but the private variable
601
- self .coef = np .median (self ._all_coef , 1 )
494
+ df_ci = self .framework .confint (joint = joint , level = level )
495
+ df_ci .set_index (pd .Index (self ._quantiles ), inplace = True )
602
496
603
- # TODO: In the documentation of standard errors we need to cleary state what we return here, i.e.,
604
- # the asymptotic variance sigma_hat/N and not sigma_hat (which sometimes is also called the asympt var)!
605
- # TODO: In the edge case of repeated no-cross-fitting, the test sets might have different size and therefore
606
- # it would note be valid to always use the same self._var_scaling_factor
607
- xx = np .tile (self .coef .reshape (- 1 , 1 ), self .n_rep )
608
- self .se = np .sqrt (np .divide (np .median (np .multiply (np .power (self ._all_se , 2 ), self ._var_scaling_factor ) +
609
- np .power (self ._all_coef - xx , 2 ), 1 ), self ._var_scaling_factor ))
497
+ return df_ci
610
498
611
- def _var_est (self ):
612
- """
613
- Estimate the standard errors of the structural parameter
499
+ def p_adjust (self , method = 'romano-wolf' ):
614
500
"""
615
- J0 = self ._psi0_deriv [:, self ._i_rep , self ._i_quant ].mean ()
616
- J1 = self ._psi1_deriv [:, self ._i_rep , self ._i_quant ].mean ()
617
- score0 = self ._psi0 [:, self ._i_rep , self ._i_quant ]
618
- score1 = self ._psi1 [:, self ._i_rep , self ._i_quant ]
619
- omega = score1 / J1 - score0 / J0
501
+ Multiple testing adjustment for DoubleML models.
620
502
621
- self ._var_scaling_factor = self ._dml_data .n_obs
622
- sigma2_hat = 1 / self ._var_scaling_factor * np .mean (np .power (omega , 2 ))
503
+ Parameters
504
+ ----------
505
+ method : str
506
+ A str (``'romano-wolf''``, ``'bonferroni'``, ``'holm'``, etc) specifying the adjustment method.
507
+ In addition to ``'romano-wolf''``, all methods implemented in
508
+ :py:func:`statsmodels.stats.multitest.multipletests` can be applied.
509
+ Default is ``'romano-wolf'``.
623
510
624
- return sigma2_hat
511
+ Returns
512
+ -------
513
+ p_val : pd.DataFrame
514
+ A data frame with adjusted p-values.
515
+ """
625
516
626
- def _se_causal_pars (self ):
627
- se = np .sqrt (self ._var_est ())
628
- return se
517
+ if self .framework is None :
518
+ raise ValueError ('Apply fit() before p_adjust().' )
629
519
630
- def _initialize_arrays (self ):
631
- psi0 = np .full ((self ._dml_data .n_obs , self .n_rep , self .n_quantiles ), np .nan )
632
- psi0_deriv = np .full ((self ._dml_data .n_obs , self .n_rep , self .n_quantiles ), np .nan )
520
+ p_val , _ = self .framework .p_adjust (method = method )
521
+ p_val .set_index (pd .Index (self ._quantiles ), inplace = True )
633
522
634
- psi1 = np .full ((self ._dml_data .n_obs , self .n_rep , self .n_quantiles ), np .nan )
635
- psi1_deriv = np .full ((self ._dml_data .n_obs , self .n_rep , self .n_quantiles ), np .nan )
523
+ return p_val
636
524
637
- coef = np .full (self .n_quantiles , np .nan )
638
- se = np .full (self .n_quantiles , np .nan )
525
+ def _fit_quantile (self , i_quant , n_jobs_cv = None , store_predictions = True , store_models = False ):
639
526
640
- all_coef = np . full (( self .n_quantiles , self . n_rep ), np . nan )
641
- all_se = np . full (( self .n_quantiles , self . n_rep ), np . nan )
527
+ model_0 = self .modellist_0 [ i_quant ]
528
+ model_1 = self .modellist_1 [ i_quant ]
642
529
643
- return psi0 , psi1 , psi0_deriv , psi1_deriv , coef , se , all_coef , all_se
530
+ model_0 .fit (n_jobs_cv = n_jobs_cv , store_predictions = store_predictions , store_models = store_models )
531
+ model_1 .fit (n_jobs_cv = n_jobs_cv , store_predictions = store_predictions , store_models = store_models )
644
532
645
- def _initialize_boot_arrays (self , n_rep_boot ):
646
- boot_coef = np .full ((self .n_quantiles , n_rep_boot * self .n_rep ), np .nan )
647
- boot_t_stat = np .full ((self .n_quantiles , n_rep_boot * self .n_rep ), np .nan )
648
- return n_rep_boot , boot_coef , boot_t_stat
533
+ return model_0 , model_1
649
534
650
535
def _check_data (self , obj_dml_data ):
651
536
if not isinstance (obj_dml_data , DoubleMLData ):
0 commit comments