Skip to content

Commit 7dfeea8

Browse files
authored
Merge pull request #231 from mychaelka/causalweight_impl
Implementation of sample selection estimators
2 parents 8431daf + 6115f25 commit 7dfeea8

10 files changed

+1419
-7
lines changed

.coverage

-16 KB
Binary file not shown.

doubleml/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from .double_ml_lpq import DoubleMLLPQ
1414
from .double_ml_cvar import DoubleMLCVAR
1515
from .double_ml_policytree import DoubleMLPolicyTree
16+
from .double_ml_ssm import DoubleMLSSM
1617

1718
__all__ = ['DoubleMLPLR',
1819
'DoubleMLPLIV',
@@ -27,6 +28,7 @@
2728
'DoubleMLQTE',
2829
'DoubleMLLPQ',
2930
'DoubleMLCVAR',
30-
'DoubleMLPolicyTree']
31+
'DoubleMLPolicyTree',
32+
'DoubleMLSSM']
3133

3234
__version__ = get_distribution('doubleml').version

doubleml/datasets.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1345,3 +1345,91 @@ def treatment_effect(x):
13451345
'effects': te,
13461346
'treatment_effect': treatment_effect}
13471347
return res_dict
1348+
1349+
1350+
def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleMLData'):
    """
    Generates data from a sample selection model (SSM).
    The data generating process is defined as

    .. math::

        y_i &= \\theta d_i + x_i' \\beta + u_i,

        s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace,

        d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace,

    where the outcome :math:`y_i` is only observed if :math:`s_i = 1` (in the generated data,
    :math:`y_i` is set to zero whenever :math:`s_i = 0`), with covariates
    :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where :math:`\\Sigma^2_x` is a matrix with entries
    :math:`\\Sigma_{kj} = 0.5^{|j-k|}`.
    :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}`,
    :math:`z_i \\sim \\mathcal{N}(0, 1)`,
    :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`,
    :math:`w_i \\sim \\mathcal{N}(0, 1)`.

    If ``mar=True``, :math:`\\Sigma^2_{u,v}` is the identity matrix and :math:`\\gamma = 0`.
    If ``mar=False``, :math:`\\Sigma^2_{u,v}` has unit variances and covariance :math:`0.8`, and
    :math:`\\gamma = 1`.

    The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia,
    Huber and Lafférs (2023).

    Parameters
    ----------
    n_obs :
        The number of observations to simulate.
    dim_x :
        The number of covariates.
    theta :
        The value of the causal parameter.
    mar :
        Boolean. Indicates whether missingness at random holds. If ``True``, the error terms of the outcome
        and the selection equation are uncorrelated and the variable ``z`` does not enter the selection
        equation; ``z`` is then omitted from the ``pd.DataFrame`` / ``DoubleMLData`` output.
    return_type :
        If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.

        If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.

        If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``.

    Returns
    -------
    The simulated data in the format selected via ``return_type``. The array output always contains ``z``;
    the ``pd.DataFrame`` and ``DoubleMLData`` outputs contain the column ``'z'`` only if ``mar=False``.

    Raises
    ------
    ValueError
        If ``return_type`` is not one of the supported aliases.

    References
    ----------
    Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models,
    Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071
    """
    # Fail fast: reject an unsupported return_type before simulating any data.
    as_array = return_type in _array_alias
    if not as_array and return_type not in _data_frame_alias + _dml_data_alias:
        raise ValueError('Invalid return_type.')

    if mar:
        # Independent errors and no effect of z on selection.
        sigma = np.array([[1, 0], [0, 1]])
        gamma = 0
    else:
        # Correlated errors; z enters the selection equation.
        sigma = np.array([[1, 0.8], [0.8, 1]])
        gamma = 1

    # Row 0 of e enters the selection equation, row 1 the outcome equation.
    e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T

    cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)])
    x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])

    beta = [0.4 / (k**2) for k in range(1, dim_x + 1)]
    # The linear index x'beta is shared by the treatment, selection and outcome equations.
    x_beta = np.dot(x, beta)

    d = np.where(x_beta + np.random.randn(n_obs) > 0, 1, 0)
    z = np.random.randn(n_obs)
    s = np.where(x_beta + d + gamma * z + e[0] > 0, 1, 0)

    y = x_beta + theta * d + e[1]
    # The outcome is unobserved for non-selected units; it is encoded as 0.
    y[s == 0] = 0

    if as_array:
        return x, y, d, z, s

    x_cols = [f'X{i + 1}' for i in np.arange(dim_x)]
    if mar:
        data = pd.DataFrame(np.column_stack((x, y, d, s)),
                            columns=x_cols + ['y', 'd', 's'])
    else:
        data = pd.DataFrame(np.column_stack((x, y, d, z, s)),
                            columns=x_cols + ['y', 'd', 'z', 's'])

    if return_type in _data_frame_alias:
        return data

    # NOTE(review): the 5th and 6th positional arguments are presumably the instrument and the
    # selection-indicator columns of DoubleMLData -- confirm against the DoubleMLData signature.
    if mar:
        return DoubleMLData(data, 'y', 'd', x_cols, None, 's')
    return DoubleMLData(data, 'y', 'd', x_cols, 'z', 's')

0 commit comments

Comments
 (0)