@@ -1345,3 +1345,91 @@ def treatment_effect(x):
1345
1345
'effects' : te ,
1346
1346
'treatment_effect' : treatment_effect }
1347
1347
return res_dict
1348
+
1349
+
1350
def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleMLData'):
    """
    Generates data from a sample selection model (SSM).
    The data generating process is defined as

    .. math::

        y_i &= \\theta d_i + x_i' \\beta + u_i,

        s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace,

        d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace,

    with :math:`y_i` being observed only if :math:`s_i = 1` (outcomes of unselected observations are
    set to ``0`` in the returned data) and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where
    :math:`\\Sigma^2_x` is a matrix with entries
    :math:`\\Sigma_{kj} = 0.5^{|j-k|}`.
    :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}`,
    :math:`z_i \\sim \\mathcal{N}(0, 1)`,
    :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`,
    :math:`w_i \\sim \\mathcal{N}(0, 1)`.

    If ``mar=True``, :math:`\\gamma = 0` and :math:`\\Sigma^2_{u,v}` is the identity matrix.
    Otherwise :math:`\\gamma = 1` and :math:`u_i` and :math:`v_i` have unit variance and covariance ``0.8``.

    The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia,
    Huber and Lafférs (2023).

    Parameters
    ----------
    n_obs :
        The number of observations to simulate.
    dim_x :
        The number of covariates.
    theta :
        The value of the causal parameter.
    mar :
        Boolean. Indicates whether missingness at random holds.
    return_type :
        If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.

        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.

        If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``.

    Returns
    -------
    Depending on ``return_type``, either the tuple of arrays ``(x, y, d, z, s)``, a ``pd.DataFrame`` with the
    columns ``X1, ..., X{dim_x}, y, d, s`` (plus ``z`` before ``s`` if ``mar=False``), or a ``DoubleMLData``
    object built from that data frame. The instrument ``z`` is always part of the array output but is only
    included in the data frame / ``DoubleMLData`` object if ``mar=False``.

    Raises
    ------
    ValueError
        If ``return_type`` is not one of the supported aliases.

    References
    ----------
    Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models,
    Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071
    """
    # Reject an unsupported return_type before any (potentially expensive) simulation is done.
    return_arrays = return_type in _array_alias
    if not return_arrays and return_type not in _data_frame_alias + _dml_data_alias:
        raise ValueError('Invalid return_type.')

    if mar:
        # Independent errors and no instrument effect on selection.
        sigma = np.array([[1, 0], [0, 1]])
        gamma = 0
    else:
        # Correlated errors make selection depend on unobservables; z then enters the selection equation.
        sigma = np.array([[1, 0.8], [0.8, 1]])
        gamma = 1

    # The order of the random draws (errors, covariates, treatment noise, instrument) is kept fixed so
    # that results for a given seed do not change.
    e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T

    cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)])
    x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=n_obs)

    beta = [0.4 / (k ** 2) for k in range(1, dim_x + 1)]
    # The linear index x'beta is shared by the treatment, selection and outcome equations.
    x_beta = np.dot(x, beta)

    d = np.where(x_beta + np.random.randn(n_obs) > 0, 1, 0)
    z = np.random.randn(n_obs)
    # e[0] is the selection error, e[1] the outcome error.
    s = np.where(x_beta + d + gamma * z + e[0] > 0, 1, 0)

    y = x_beta + theta * d + e[1]
    # Outcomes of unselected observations are not observed and are encoded as 0.
    y[s == 0] = 0

    if return_arrays:
        return x, y, d, z, s

    x_cols = [f'X{i + 1}' for i in range(dim_x)]
    if mar:
        z_cols = None
        data = pd.DataFrame(np.column_stack((x, y, d, s)),
                            columns=x_cols + ['y', 'd', 's'])
    else:
        z_cols = 'z'
        data = pd.DataFrame(np.column_stack((x, y, d, z, s)),
                            columns=x_cols + ['y', 'd', 'z', 's'])

    if return_type in _data_frame_alias:
        return data

    # NOTE(review): arguments are passed positionally as in the original code; confirm that the fifth
    # and sixth positional parameters of DoubleMLData are the instrument and the selection column.
    return DoubleMLData(data, 'y', 'd', x_cols, z_cols, 's')
0 commit comments