|
18 | 18 | import matplotlib.pyplot as plt
|
19 | 19 |
|
20 | 20 |
|
21 |
| -def polynomial_fit(X: np.array, y: np.array, max_degree: int) -> tuple[RegressionResults, int]: |
22 |
| - """ |
23 |
| - Polynomial regression of y with respect to X. CV is implemented to test polynomials from degree 1 up to max_degree. |
24 |
| - Only the degree with minimal residuals is returned. |
25 |
| -
|
26 |
| - X in this case should be the feature for which to calculate the CATE and y the robust score described in |
27 |
| - Semenova 2.2 |
28 |
| -
|
29 |
| - Parameters |
30 |
| - ---------- |
31 |
| - X: features of the regression (variable for which we want to calculate the CATE) |
32 |
| - y: target variable (robust score) |
33 |
| - max_degree: maximum degree of the polynomials |
34 |
| -
|
35 |
| - Returns |
36 |
| - ------- |
37 |
| - fitted model and degree of the polynomials |
38 |
| - """ |
39 |
| - # todo: assert correct dimensions of the parameters |
40 |
| - # todo: is it really important that the polynomials be orthogonal? |
41 |
| - # todo: allow for multiple variables in X |
42 |
| - # First we find which is the degree with minimal error |
43 |
| - cv_errors = np.zeros(max_degree) |
44 |
| - for degree in range(1, max_degree + 1): |
45 |
| - pf = PolynomialFeatures(degree=degree, include_bias=True) |
46 |
| - X_poly = pf.fit_transform(X) |
47 |
| - model = sm.OLS(y, X_poly) |
48 |
| - results = model.fit() |
49 |
| - influence = results.get_influence() |
50 |
| - leverage = influence.hat_matrix_diag # this is what semenova uses (leverage) |
51 |
| - cv_errors[degree - 1] = np.sum((results.resid / (1 - leverage)) ** 2) |
52 |
| - |
53 |
| - # Degree chosen by cross-validation (we add one because degree zero is not included) |
54 |
| - cv_degree = np.argmin(cv_errors) + 1 |
55 |
| - |
56 |
| - # Estimate coefficients |
57 |
| - pf = PolynomialFeatures(degree=cv_degree, include_bias=True) |
58 |
| - x_poly = pf.fit_transform(X) |
59 |
| - model = sm.OLS(y, x_poly) |
60 |
| - results = model.fit() |
61 |
| - return results, cv_degree |
62 |
| - |
63 |
| - |
64 |
| -def splines_fit(X: np.array, y: np.array, max_knots: int, degree: int = 3) -> tuple[RegressionResults, int]: |
65 |
| - """ |
66 |
| - Splines interpolation of y with respect to X. CV is implemented to test splines of number of knots |
67 |
| - from knots 2 up to max_knots. Only the knots with minimal residuals is returned. |
68 |
| -
|
69 |
| - X in this case should be the feature for which to calculate the CATE and y the robust score described in |
70 |
| - Semenova 2.2 |
71 |
| -
|
72 |
| - Parameters |
73 |
| - ---------- |
74 |
| - X: features of the regression (variable for which we want to calculate the CATE) |
75 |
| - y: target variable (robust score) |
76 |
| - max_knots: maximum number of knots in the splines interpolation |
77 |
| - degree: degree of the splines polynomials to be used |
78 |
| -
|
79 |
| - Returns |
80 |
| - ------- |
81 |
| - fitted model and degree of the polynomials |
82 |
| - """ |
83 |
| - # todo: assert correct dimensions of the parameters |
84 |
| - # todo: is it really important that the polynomials be orthogonal? |
85 |
| - # todo: allow for multiple variables in X |
86 |
| - # First we find which is the n_knots with minimal error |
87 |
| - cv_errors = np.zeros(max_knots) |
88 |
| - for n_knots in range(2, max_knots + 1): |
89 |
| - breaks = np.quantile(X, q=np.array(range(0, n_knots + 1)) / n_knots) |
90 |
| - X_splines = patsy.bs(X, knots=breaks[:-1], degree=degree) |
91 |
| - model = sm.OLS(y, X_splines) |
92 |
| - results = model.fit() |
93 |
| - influence = results.get_influence() |
94 |
| - leverage = influence.hat_matrix_diag # this is what semenova uses (leverage) |
95 |
| - cv_errors[n_knots - 2] = np.sum((results.resid / (1 - leverage)) ** 2) |
96 |
| - |
97 |
| - # Degree chosen by cross-validation (we add one because degree zero is not included) |
98 |
| - cv_knots = np.argmin(cv_errors) + 1 |
99 |
| - |
100 |
| - # Estimate coefficients |
101 |
| - breaks = np.quantile(X, q=np.array(range(0, cv_knots + 1)) / cv_knots) |
102 |
| - X_splines = patsy.bs(X, knots=breaks[:-1], degree=degree) |
103 |
| - model = sm.OLS(y, X_splines) |
104 |
| - results = model.fit() |
105 |
| - return results, cv_knots |
106 |
| - |
107 |
| - |
108 |
| -def calculate_bootstrap_tstat(regressors_grid: np.array, omega_hat: np.array, alpha: float, n_samples_bootstrap: int) \ |
109 |
| - -> float: |
110 |
| - """ |
111 |
| - This function calculates the critical value of the confidence bands of the bootstrapped t-statistics |
112 |
| - from def. 2.7 in Semenova. |
113 |
| -
|
114 |
| - Parameters |
115 |
| - ---------- |
116 |
| - regressors_grid: support of the variable of interested for which to calculate the t-statistics |
117 |
| - omega_hat: covariance matrix |
118 |
| - alpha: p-value |
119 |
| - n_samples_bootstrap: number of samples to generate for the normal distribution draw |
120 |
| -
|
121 |
| - Returns |
122 |
| - ------- |
123 |
| - float with the critical value of the t-statistic |
124 |
| - """ |
125 |
| - # don't need sqrt(N) because it cancels out with the numerator |
126 |
| - numerator_grid = regressors_grid @ sqrtm(omega_hat) |
127 |
| - # we take the diagonal because in the paper the multiplication is p(x)'*Omega*p(x), |
128 |
| - # where p(x) is the vector of basis functions |
129 |
| - denominator_grid = np.sqrt(np.diag(regressors_grid @ omega_hat @ np.transpose(regressors_grid))) |
130 |
| - |
131 |
| - norm_numerator_grid = numerator_grid.copy() |
132 |
| - for k in range(numerator_grid.shape[0]): |
133 |
| - norm_numerator_grid[k, :] = numerator_grid[k, :] / denominator_grid[k] |
134 |
| - |
135 |
| - t_maxs = np.amax(np.abs(norm_numerator_grid @ np.random.normal(size=numerator_grid.shape[1] * n_samples_bootstrap) |
136 |
| - .reshape(numerator_grid.shape[1], n_samples_bootstrap)), axis=1) |
137 |
| - return np.quantile(t_maxs, q=1 - alpha) |
138 |
| - |
139 |
| - |
140 |
| -def second_stage_estimation(X: np.array, y: np.array, method: str, max_degree: int, max_knots: int, degree: int, |
141 |
| - alpha: float, n_nodes: int, n_samples_bootstrap: int) -> dict: |
142 |
| - """ |
143 |
| - Calculates the CATE with respect to variable X by polynomial or splines approximation. |
144 |
| - It calculates it on an equidistant grid of n_nodes points over the range of X |
145 |
| -
|
146 |
| - Parameters |
147 |
| - ---------- |
148 |
| - X: variable whose CATE is calculated |
149 |
| - y: robust dependent variable, as defined in 2.2 in Semenova |
150 |
| - method: "poly" or "splines", chooses which method to approximate with |
151 |
| - max_degree: maximum degree of the polynomial approximation |
152 |
| - max_knots: max knots for the splines approximation |
153 |
| - degree: degree of the polynomials used in the splines |
154 |
| - alpha: p-value for the confidence intervals |
155 |
| - n_nodes: number of points of X for which we wish to calculate the CATE |
156 |
| - n_samples_bootstrap: how many samples to use to calculate the t-statistics described in 2.6 |
157 |
| -
|
158 |
| - Returns |
159 |
| - ------- |
160 |
| - A dictionary containing the estimated CATE (g_hat), with upper and lower confidence bounds (both simultaneous as |
161 |
| - well as pointwise), fitted linear model and grid of X for which the CATE was calculated |
162 |
| -
|
163 |
| - """ |
164 |
| - x_grid = np.linspace(np.min(X), np.max(X), n_nodes).reshape(-1, 1) |
165 |
| - |
166 |
| - if method == "poly": |
167 |
| - fitted_model, poly_degree = polynomial_fit(X=X, y=y, max_degree=max_degree) |
168 |
| - |
169 |
| - # Build the set of datapoints in X that will be used for prediction |
170 |
| - pf = PolynomialFeatures(degree=poly_degree, include_bias=True) |
171 |
| - regressors_grid = pf.fit_transform(x_grid) |
172 |
| - |
173 |
| - elif method == "splines": |
174 |
| - fitted_model, n_knots = splines_fit(X=X, y=y, degree=degree, max_knots=max_knots) |
175 |
| - |
176 |
| - # Build the set of datapoints in X that will be used for prediction |
177 |
| - breaks = np.quantile(X, q=np.array(range(0, n_knots + 1)) / n_knots) |
178 |
| - regressors_grid = patsy.bs(x_grid, knots=breaks[:-1], degree=degree) |
179 |
| - |
180 |
| - else: |
181 |
| - raise NotImplementedError("The specified method is not implemented. Please use 'poly' or 'splines'") |
182 |
| - |
183 |
| - g_hat = regressors_grid @ fitted_model.params |
184 |
| - # we can get the HCO matrix directly from the model object |
185 |
| - hcv_coeff = fitted_model.cov_HC0 |
186 |
| - standard_error = np.sqrt(np.diag(regressors_grid @ hcv_coeff @ np.transpose(regressors_grid))) |
187 |
| - # Lower pointwise CI |
188 |
| - g_hat_lower_point = g_hat + norm.ppf(q=alpha / 2) * standard_error |
189 |
| - # Upper pointwise CI |
190 |
| - g_hat_upper_point = g_hat + norm.ppf(q=1 - alpha / 2) * standard_error |
191 |
| - |
192 |
| - max_t_stat = calculate_bootstrap_tstat(regressors_grid=regressors_grid, |
193 |
| - omega_hat=hcv_coeff, |
194 |
| - alpha=alpha, |
195 |
| - n_samples_bootstrap=n_samples_bootstrap) |
196 |
| - # Lower simultaneous CI |
197 |
| - g_hat_lower = g_hat - max_t_stat * standard_error |
198 |
| - # Upper simultaneous CI |
199 |
| - g_hat_upper = g_hat + max_t_stat * standard_error |
200 |
| - results_dict = { |
201 |
| - "g_hat": g_hat, |
202 |
| - "g_hat_lower": g_hat_lower, |
203 |
| - "g_hat_upper": g_hat_upper, |
204 |
| - "g_hat_lower_point": g_hat_lower_point, |
205 |
| - "g_hat_upper_point": g_hat_upper_point, |
206 |
| - "x_grid": x_grid, |
207 |
| - "fitted_model": fitted_model |
208 |
| - } |
209 |
| - return results_dict |
210 |
| - |
211 |
| - |
212 | 21 | # Reproducing the same results as in https://github.com/microsoft/EconML/blob/master/notebooks/Double%20Machine%20Learning%20Examples.ipynb
|
213 | 22 |
|
214 | 23 | # Treatment effect function
|
215 | 24 | def transform_te(x):
|
216 |
| - return np.exp(4 + 2*x[0]) |
| 25 | + return np.exp(4 + 2 * x[0]) |
217 | 26 | # return 3
|
218 | 27 |
|
219 | 28 |
|
220 |
| -# DGP constants |
221 |
| -np.random.seed(123) |
222 |
| - |
223 |
| - |
224 | 29 | def create_synthetic_data(n: int, n_w: int, support_size: int, n_x: int):
|
225 | 30 | """
|
226 | 31 | Creates a synthetic example based on example 2 of https://github.com/microsoft/EconML/blob/master/notebooks/Double%20Machine%20Learning%20Examples.ipynb
|
@@ -292,65 +97,45 @@ def plot_results(dict_results: dict, n, n_w, support_size, n_x, method):
|
292 | 97 | plt.plot(df_plot["x_grid"], df_plot["True_g"], "red", linestyle="solid", label="True effect")
|
293 | 98 | plt.legend(loc="upper left")
|
294 | 99 | plt.title(f"CATE with {method}, n={n}, n_w={n_w}, sup_size={support_size}, n_x={n_x},")
|
295 |
| - plt.savefig( |
296 |
| - rf"\\ad.uni-hamburg.de\redir\redir0101\BBB1675\Documents\CATE\figures\CATE_{method}_n{n}_nw{n_w}_sup{support_size}_nx{n_x}.png") |
297 |
| - |
| 100 | + plt.savefig(rf".\CATE_{method}_n{n}_nw{n_w}_sup{support_size}_nx{n_x}.png") |
298 | 101 |
|
| 102 | +# DGP constants |
| 103 | +np.random.seed(123) |
299 | 104 | n = 2000
|
300 | 105 | n_w = 30
|
301 | 106 | support_size = 5
|
302 | 107 | n_x = 1
|
303 | 108 |
|
304 |
| -for n in [2000, 3000, 4000]: |
305 |
| - for n_w in [5, 10, 20, 30]: |
306 |
| - for support_size in [5, 10, 20, 30]: |
307 |
| - if support_size <= n_w: |
308 |
| - for method in ["poly", "splines"]: |
309 |
| - print(f"CATE with {method}, n={n}, n_w={n_w}, sup_size={support_size}, n_x={n_x},") |
| 109 | +# Create data |
| 110 | +data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x) |
| 111 | +data_dml_base = dml.DoubleMLData(data, |
| 112 | + y_col='y', |
| 113 | + d_cols='t', |
| 114 | + x_cols=covariates) |
310 | 115 |
|
311 |
| - # Create data |
312 |
| - data, covariates = create_synthetic_data(n=n, n_w=n_w, support_size=support_size, n_x=n_x) |
313 |
| - data_dml_base = dml.DoubleMLData(data, |
314 |
| - y_col='y', |
315 |
| - d_cols='t', |
316 |
| - x_cols=covariates) |
| 116 | +# First stage estimation |
| 117 | +# Lasso regression |
| 118 | +lasso_reg = LassoCV() |
| 119 | +randomForest_class = RandomForestClassifier(n_estimators=500) |
317 | 120 |
|
318 |
| - # First stage estimation |
319 |
| - # Lasso regression |
320 |
| - lasso_reg = LassoCV() |
321 |
| - randomForest_class = RandomForestClassifier(n_estimators=500) |
322 |
| - |
323 |
| - np.random.seed(123) |
324 |
| - |
325 |
| - dml_irm = dml.DoubleMLIRM(data_dml_base, |
326 |
| - ml_g=lasso_reg, |
327 |
| - ml_m=randomForest_class, |
328 |
| - trimming_threshold=0.01, |
329 |
| - n_folds=5 |
330 |
| - ) |
331 |
| - print("Training first stage") |
332 |
| - dml_irm.fit(store_predictions=True) |
333 |
| - |
334 |
| - # Psi_b is the part of the score that we are interested in (It is the robust Y in Semenova) |
335 |
| - y_robust = dml_irm.psi_b # Q: why is y with dim 3? |
336 |
| - # We reshape it so that it is a nx1 vector |
337 |
| - y_robust = y_robust.reshape(-1, 1) |
| 121 | +np.random.seed(123) |
338 | 122 |
|
339 |
| - X = np.array(data['x']) |
340 |
| - X = X.reshape(-1, 1) |
341 |
| - max_degree = 3 |
342 |
| - degree = 3 |
343 |
| - max_knots = 20 |
344 |
| - alpha = 0.05 |
345 |
| - n_nodes = 90 |
346 |
| - n_samples_bootstrap = 10000 |
347 |
| - print("Training second stage") |
348 |
| - dict_results = second_stage_estimation(X=X, y=y_robust, method=method, |
349 |
| - max_degree=max_degree, |
350 |
| - degree=degree, |
351 |
| - max_knots=max_knots, |
352 |
| - alpha=alpha, |
353 |
| - n_nodes=n_nodes, |
354 |
| - n_samples_bootstrap=n_samples_bootstrap) |
355 |
| - print("Saving results") |
356 |
| - plot_results(dict_results, n, n_w=n_w, support_size=support_size, n_x=n_x, method=method) |
| 123 | +dml_irm = dml.DoubleMLIRM(data_dml_base, |
| 124 | + ml_g=lasso_reg, |
| 125 | + ml_m=randomForest_class, |
| 126 | + trimming_threshold=0.01, |
| 127 | + n_folds=5 |
| 128 | + ) |
| 129 | +print("Training first stage") |
| 130 | +dml_irm.fit(store_predictions=True) |
| 131 | + |
| 132 | +dict_results = dml_irm.cate(cate_var="x", |
| 133 | + method="splines", |
| 134 | + alpha=0.05, |
| 135 | + n_grid_nodes=90, |
| 136 | + n_samples_bootstrap=10000, |
| 137 | + cv=False, |
| 138 | + splines_knots=30, |
| 139 | + splines_degree=3) |
| 140 | + |
| 141 | +plot_results(dict_results, n, n_w=n_w, support_size=support_size, n_x=n_x, method="splines") |
0 commit comments