Skip to content

Commit b095e93

Browse files
author
Riera Auge
committed
updated synthetic example to make use of class method
1 parent 9ab8e93 commit b095e93

File tree

1 file changed

+34
-249
lines changed

1 file changed

+34
-249
lines changed

synthetic_example.py

Lines changed: 34 additions & 249 deletions
Original file line numberDiff line numberDiff line change
@@ -18,209 +18,14 @@
1818
import matplotlib.pyplot as plt
1919

2020

21-
def polynomial_fit(X: np.array, y: np.array, max_degree: int) -> tuple[RegressionResults, int]:
22-
"""
23-
Polynomial regression of y with respect to X. CV is implemented to test polynomials from degree 1 up to max_degree.
24-
Only the degree with minimal residuals is returned.
25-
26-
X in this case should be the feature for which to calculate the CATE and y the robust score described in
27-
Semenova 2.2
28-
29-
Parameters
30-
----------
31-
X: features of the regression (variable for which we want to calculate the CATE)
32-
y: target variable (robust score)
33-
max_degree: maximum degree of the polynomials
34-
35-
Returns
36-
-------
37-
fitted model and degree of the polynomials
38-
"""
39-
# todo: assert correct dimensions of the parameters
40-
# todo: is it really important that the polynomials be orthogonal?
41-
# todo: allow for multiple variables in X
42-
# First we find which is the degree with minimal error
43-
cv_errors = np.zeros(max_degree)
44-
for degree in range(1, max_degree + 1):
45-
pf = PolynomialFeatures(degree=degree, include_bias=True)
46-
X_poly = pf.fit_transform(X)
47-
model = sm.OLS(y, X_poly)
48-
results = model.fit()
49-
influence = results.get_influence()
50-
leverage = influence.hat_matrix_diag # this is what semenova uses (leverage)
51-
cv_errors[degree - 1] = np.sum((results.resid / (1 - leverage)) ** 2)
52-
53-
# Degree chosen by cross-validation (we add one because degree zero is not included)
54-
cv_degree = np.argmin(cv_errors) + 1
55-
56-
# Estimate coefficients
57-
pf = PolynomialFeatures(degree=cv_degree, include_bias=True)
58-
x_poly = pf.fit_transform(X)
59-
model = sm.OLS(y, x_poly)
60-
results = model.fit()
61-
return results, cv_degree
62-
63-
64-
def splines_fit(X: np.array, y: np.array, max_knots: int, degree: int = 3) -> tuple[RegressionResults, int]:
65-
"""
66-
Splines interpolation of y with respect to X. CV is implemented to test splines of number of knots
67-
from knots 2 up to max_knots. Only the knots with minimal residuals is returned.
68-
69-
X in this case should be the feature for which to calculate the CATE and y the robust score described in
70-
Semenova 2.2
71-
72-
Parameters
73-
----------
74-
X: features of the regression (variable for which we want to calculate the CATE)
75-
y: target variable (robust score)
76-
max_knots: maximum number of knots in the splines interpolation
77-
degree: degree of the splines polynomials to be used
78-
79-
Returns
80-
-------
81-
fitted model and degree of the polynomials
82-
"""
83-
# todo: assert correct dimensions of the parameters
84-
# todo: is it really important that the polynomials be orthogonal?
85-
# todo: allow for multiple variables in X
86-
# First we find which is the n_knots with minimal error
87-
cv_errors = np.zeros(max_knots)
88-
for n_knots in range(2, max_knots + 1):
89-
breaks = np.quantile(X, q=np.array(range(0, n_knots + 1)) / n_knots)
90-
X_splines = patsy.bs(X, knots=breaks[:-1], degree=degree)
91-
model = sm.OLS(y, X_splines)
92-
results = model.fit()
93-
influence = results.get_influence()
94-
leverage = influence.hat_matrix_diag # this is what semenova uses (leverage)
95-
cv_errors[n_knots - 2] = np.sum((results.resid / (1 - leverage)) ** 2)
96-
97-
# Degree chosen by cross-validation (we add one because degree zero is not included)
98-
cv_knots = np.argmin(cv_errors) + 1
99-
100-
# Estimate coefficients
101-
breaks = np.quantile(X, q=np.array(range(0, cv_knots + 1)) / cv_knots)
102-
X_splines = patsy.bs(X, knots=breaks[:-1], degree=degree)
103-
model = sm.OLS(y, X_splines)
104-
results = model.fit()
105-
return results, cv_knots
106-
107-
108-
def calculate_bootstrap_tstat(regressors_grid: np.array, omega_hat: np.array, alpha: float, n_samples_bootstrap: int) \
109-
-> float:
110-
"""
111-
This function calculates the critical value of the confidence bands of the bootstrapped t-statistics
112-
from def. 2.7 in Semenova.
113-
114-
Parameters
115-
----------
116-
regressors_grid: support of the variable of interested for which to calculate the t-statistics
117-
omega_hat: covariance matrix
118-
alpha: p-value
119-
n_samples_bootstrap: number of samples to generate for the normal distribution draw
120-
121-
Returns
122-
-------
123-
float with the critical value of the t-statistic
124-
"""
125-
# don't need sqrt(N) because it cancels out with the numerator
126-
numerator_grid = regressors_grid @ sqrtm(omega_hat)
127-
# we take the diagonal because in the paper the multiplication is p(x)'*Omega*p(x),
128-
# where p(x) is the vector of basis functions
129-
denominator_grid = np.sqrt(np.diag(regressors_grid @ omega_hat @ np.transpose(regressors_grid)))
130-
131-
norm_numerator_grid = numerator_grid.copy()
132-
for k in range(numerator_grid.shape[0]):
133-
norm_numerator_grid[k, :] = numerator_grid[k, :] / denominator_grid[k]
134-
135-
t_maxs = np.amax(np.abs(norm_numerator_grid @ np.random.normal(size=numerator_grid.shape[1] * n_samples_bootstrap)
136-
.reshape(numerator_grid.shape[1], n_samples_bootstrap)), axis=1)
137-
return np.quantile(t_maxs, q=1 - alpha)
138-
139-
140-
def second_stage_estimation(X: np.array, y: np.array, method: str, max_degree: int, max_knots: int, degree: int,
141-
alpha: float, n_nodes: int, n_samples_bootstrap: int) -> dict:
142-
"""
143-
Calculates the CATE with respect to variable X by polynomial or splines approximation.
144-
It calculates it on an equidistant grid of n_nodes points over the range of X
145-
146-
Parameters
147-
----------
148-
X: variable whose CATE is calculated
149-
y: robust dependent variable, as defined in 2.2 in Semenova
150-
method: "poly" or "splines", chooses which method to approximate with
151-
max_degree: maximum degree of the polynomial approximation
152-
max_knots: max knots for the splines approximation
153-
degree: degree of the polynomials used in the splines
154-
alpha: p-value for the confidence intervals
155-
n_nodes: number of points of X for which we wish to calculate the CATE
156-
n_samples_bootstrap: how many samples to use to calculate the t-statistics described in 2.6
157-
158-
Returns
159-
-------
160-
A dictionary containing the estimated CATE (g_hat), with upper and lower confidence bounds (both simultaneous as
161-
well as pointwise), fitted linear model and grid of X for which the CATE was calculated
162-
163-
"""
164-
x_grid = np.linspace(np.min(X), np.max(X), n_nodes).reshape(-1, 1)
165-
166-
if method == "poly":
167-
fitted_model, poly_degree = polynomial_fit(X=X, y=y, max_degree=max_degree)
168-
169-
# Build the set of datapoints in X that will be used for prediction
170-
pf = PolynomialFeatures(degree=poly_degree, include_bias=True)
171-
regressors_grid = pf.fit_transform(x_grid)
172-
173-
elif method == "splines":
174-
fitted_model, n_knots = splines_fit(X=X, y=y, degree=degree, max_knots=max_knots)
175-
176-
# Build the set of datapoints in X that will be used for prediction
177-
breaks = np.quantile(X, q=np.array(range(0, n_knots + 1)) / n_knots)
178-
regressors_grid = patsy.bs(x_grid, knots=breaks[:-1], degree=degree)
179-
180-
else:
181-
raise NotImplementedError("The specified method is not implemented. Please use 'poly' or 'splines'")
182-
183-
g_hat = regressors_grid @ fitted_model.params
184-
# we can get the HCO matrix directly from the model object
185-
hcv_coeff = fitted_model.cov_HC0
186-
standard_error = np.sqrt(np.diag(regressors_grid @ hcv_coeff @ np.transpose(regressors_grid)))
187-
# Lower pointwise CI
188-
g_hat_lower_point = g_hat + norm.ppf(q=alpha / 2) * standard_error
189-
# Upper pointwise CI
190-
g_hat_upper_point = g_hat + norm.ppf(q=1 - alpha / 2) * standard_error
191-
192-
max_t_stat = calculate_bootstrap_tstat(regressors_grid=regressors_grid,
193-
omega_hat=hcv_coeff,
194-
alpha=alpha,
195-
n_samples_bootstrap=n_samples_bootstrap)
196-
# Lower simultaneous CI
197-
g_hat_lower = g_hat - max_t_stat * standard_error
198-
# Upper simultaneous CI
199-
g_hat_upper = g_hat + max_t_stat * standard_error
200-
results_dict = {
201-
"g_hat": g_hat,
202-
"g_hat_lower": g_hat_lower,
203-
"g_hat_upper": g_hat_upper,
204-
"g_hat_lower_point": g_hat_lower_point,
205-
"g_hat_upper_point": g_hat_upper_point,
206-
"x_grid": x_grid,
207-
"fitted_model": fitted_model
208-
}
209-
return results_dict
210-
211-
21221
# Reproducing the same results as in https://github.com/microsoft/EconML/blob/master/notebooks/Double%20Machine%20Learning%20Examples.ipynb
21322

21423
# Treatment effect function
21524
def transform_te(x):
216-
return np.exp(4 + 2*x[0])
25+
return np.exp(4 + 2 * x[0])
21726
# return 3
21827

21928

220-
# DGP constants
221-
np.random.seed(123)
222-
223-
22429
def create_synthetic_data(n: int, n_w: int, support_size: int, n_x: int):
22530
"""
22631
Creates a synthetic example based on example 2 of https://github.com/microsoft/EconML/blob/master/notebooks/Double%20Machine%20Learning%20Examples.ipynb
@@ -292,65 +97,45 @@ def plot_results(dict_results: dict, n, n_w, support_size, n_x, method):
29297
plt.plot(df_plot["x_grid"], df_plot["True_g"], "red", linestyle="solid", label="True effect")
29398
plt.legend(loc="upper left")
29499
plt.title(f"CATE with {method}, n={n}, n_w={n_w}, sup_size={support_size}, n_x={n_x},")
295-
plt.savefig(
296-
rf"\\ad.uni-hamburg.de\redir\redir0101\BBB1675\Documents\CATE\figures\CATE_{method}_n{n}_nw{n_w}_sup{support_size}_nx{n_x}.png")
297-
100+
plt.savefig(rf".\CATE_{method}_n{n}_nw{n_w}_sup{support_size}_nx{n_x}.png")
298101

102+
# DGP constants
103+
np.random.seed(123)
299104
n = 2000
300105
n_w = 30
301106
support_size = 5
302107
n_x = 1
303108

304-
for n in [2000, 3000, 4000]:
305-
for n_w in [5, 10, 20, 30]:
306-
for support_size in [5, 10, 20, 30]:
307-
if support_size <= n_w:
308-
for method in ["poly", "splines"]:
309-
print(f"CATE with {method}, n={n}, n_w={n_w}, sup_size={support_size}, n_x={n_x},")
109+
# Create data
110+
data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x)
111+
data_dml_base = dml.DoubleMLData(data,
112+
y_col='y',
113+
d_cols='t',
114+
x_cols=covariates)
310115

311-
# Create data
312-
data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x)
313-
data_dml_base = dml.DoubleMLData(data,
314-
y_col='y',
315-
d_cols='t',
316-
x_cols=covariates)
116+
# First stage estimation
117+
# Lasso regression
118+
lasso_reg = LassoCV()
119+
randomForest_class = RandomForestClassifier(n_estimators=500)
317120

318-
# First stage estimation
319-
# Lasso regression
320-
lasso_reg = LassoCV()
321-
randomForest_class = RandomForestClassifier(n_estimators=500)
322-
323-
np.random.seed(123)
324-
325-
dml_irm = dml.DoubleMLIRM(data_dml_base,
326-
ml_g=lasso_reg,
327-
ml_m=randomForest_class,
328-
trimming_threshold=0.01,
329-
n_folds=5
330-
)
331-
print("Training first stage")
332-
dml_irm.fit(store_predictions=True)
333-
334-
# Psi_b is the part of the score that we are interested in (It is the robust Y in Semenova)
335-
y_robust = dml_irm.psi_b # Q: why is y with dim 3?
336-
# We reshape it so that it is a nx1 vector
337-
y_robust = y_robust.reshape(-1, 1)
121+
np.random.seed(123)
338122

339-
X = np.array(data['x'])
340-
X = X.reshape(-1, 1)
341-
max_degree = 3
342-
degree = 3
343-
max_knots = 20
344-
alpha = 0.05
345-
n_nodes = 90
346-
n_samples_bootstrap = 10000
347-
print("Training second stage")
348-
dict_results = second_stage_estimation(X=X, y=y_robust, method=method,
349-
max_degree=max_degree,
350-
degree=degree,
351-
max_knots=max_knots,
352-
alpha=alpha,
353-
n_nodes=n_nodes,
354-
n_samples_bootstrap=n_samples_bootstrap)
355-
print("Saving results")
356-
plot_results(dict_results, n, n_w=n_w, support_size=support_size, n_x=n_x, method=method)
123+
dml_irm = dml.DoubleMLIRM(data_dml_base,
124+
ml_g=lasso_reg,
125+
ml_m=randomForest_class,
126+
trimming_threshold=0.01,
127+
n_folds=5
128+
)
129+
print("Training first stage")
130+
dml_irm.fit(store_predictions=True)
131+
132+
dict_results = dml_irm.cate(cate_var="x",
133+
method="splines",
134+
alpha=0.05,
135+
n_grid_nodes=90,
136+
n_samples_bootstrap=10000,
137+
cv=False,
138+
splines_knots=30,
139+
splines_degree=3)
140+
141+
plot_results(dict_results, n, n_w=n_w, support_size=support_size, n_x=n_x, method="splines")

0 commit comments

Comments
 (0)