@@ -19,6 +19,9 @@ trunc_probs <- function(probs, trunc = 0.01) {
19
19
}
20
20
21
21
.filter_eval_time <- function (eval_time , fail = TRUE ) {
22
+ if (! is.null(eval_time )) {
23
+ eval_time <- as.numeric(eval_time )
24
+ }
22
25
# will still propagate nulls:
23
26
eval_time <- eval_time [! is.na(eval_time )]
24
27
eval_time <- unique(eval_time )
@@ -32,3 +35,199 @@ trunc_probs <- function(probs, trunc = 0.01) {
32
35
}
33
36
eval_time
34
37
}
38
+
39
+ add_dot_row_to_weights <- function (dat , rows = NULL ) {
40
+ if (is.null(rows )) {
41
+ dat <- add_rowindex(dat )
42
+ } else {
43
+ m <- length(rows )
44
+ n <- nrow(dat )
45
+ if (m != n ) {
46
+ rlang :: abort(
47
+ glue :: glue(
48
+ " The length of 'rows' ({m}) should be equal to the number of rows in 'data' ({n})"
49
+ )
50
+ )
51
+ }
52
+ dat $ .row <- rows
53
+ }
54
+ dat
55
+ }
56
+
57
+ .check_censor_model <- function (x ) {
58
+ nms <- names(x )
59
+ if (! any(nms == " censor_probs" )) {
60
+ rlang :: abort(" Please refit the model with parsnip version 1.0.4 or greater." )
61
+ }
62
+ invisible (NULL )
63
+ }
64
+
65
+ # nocov start
66
+ # these are tested in extratests
67
+ # ------------------------------------------------------------------------------
68
+ # Brier score helpers. Most of this is based off of Graf, E., Schmoor, C.,
69
+ # Sauerbrei, W. and Schumacher, M. (1999), Assessment and comparison of
70
+ # prognostic classification schemes for survival data. _Statist. Med._, 18:
71
+ # 2529-2545.
72
+
73
+ # We need to use the time of analysis to determine what time to use to evaluate
74
+ # the IPCWs.
75
+
76
+ graf_weight_time <- function (surv_obj , eval_time , rows = NULL , eps = 10 ^- 10 ) {
77
+ event_time <- .extract_surv_time(surv_obj )
78
+ status <- .extract_surv_status(surv_obj )
79
+ is_event_before_t <- event_time < = eval_time & status == 1
80
+ is_censored <- event_time > eval_time
81
+
82
+ # Three possible contributions to the statistic from Graf 1999
83
+
84
+ # Censoring time before eval_time, no contribution (Graf category 3)
85
+ weight_time <- rep(NA_real_ , length(event_time ))
86
+
87
+ # A real event prior to eval_time (Graf category 1)
88
+ weight_time [is_event_before_t ] <- event_time [is_event_before_t ] - eps
89
+
90
+ # Observed time greater than eval_time (Graf category 2)
91
+ weight_time [is_censored ] <- eval_time - eps
92
+
93
+ weight_time <- ifelse(weight_time < 0 , 0 , weight_time )
94
+
95
+ res <- tibble :: tibble(surv = surv_obj , weight_time = weight_time , eval_time )
96
+ add_dot_row_to_weights(res , rows )
97
+ }
98
+
99
+ # ------------------------------------------------------------------------------
100
+ # ' Calculations for inverse probability of censoring weights (IPCW)
101
+ # '
102
+ # ' The method of Graf _et al_ (1999) is used to compute weights at specific
103
+ # ' evaluation times that can be used to help measure a model's time-dependent
104
+ # ' performance (e.g. the time-dependent Brier score or the area under the ROC
105
+ # ' curve).
106
+ # ' @param data A data frame with a column containing a [survival::Surv()] object.
107
+ # ' @param predictors Not currently used. A potential future slot for models with
108
+ # ' informative censoring based on columns in `data`.
109
+ # ' @param rows An optional integer vector with length equal to the number of
110
+ # ' rows in `data` that is used to index the original data. The default is to
111
+ # ' use a fresh index on data (i.e. `1:nrow(data)`).
112
+ # ' @param eval_time A vector of finite, non-negative times at which to
113
+ # ' compute the probability of censoring and the corresponding weights.
114
+ # ' @param object A fitted parsnip model object or fitted workflow with a mode
115
+ # ' of "censored regression".
116
+ # ' @param trunc A potential lower bound for the probability of censoring to avoid
117
+ # ' very large weight values.
118
+ # ' @param eps A small value that is subtracted from the evaluation time when
119
+ # ' computing the censoring probabilities. See Details below.
120
+ # ' @return A tibble with columns `.row`, `eval_time`, `.prob_cens` (the
121
+ # ' probability of being censored just prior to the evaluation time), and
122
+ # ' `.weight_cens` (the inverse probability of censoring weight).
123
+ # ' @details
124
+ # '
125
+ # ' A probability that the data are censored immediately prior to a specific
126
+ # ' time is computed. To do this, we must determine what time to
127
+ # ' make the prediction. There are two time values for each row of the data set:
128
+ # ' the observed time (either censored or not) and the time that the model is
129
+ # ' being evaluated at (e.g. the survival function prediction at some time point),
130
+ # ' which is constant across rows. .
131
+ # '
132
+ # ' From Graf _et al_ (1999) there are three cases:
133
+ # '
134
+ # ' - If the observed time is a censoring time and that is before the
135
+ # ' evaluation time, the data point should make no contribution to the
136
+ # ' performance metric (their "category 3"). These values have a missing
137
+ # ' value for their probability estimate (and also for their weight column).
138
+ # '
139
+ # ' - If the observed time corresponds to an actual event, and that time is
140
+ # ' prior to the evaluation time (category 1), the probability of being
141
+ # ' censored is predicted at the observed time (minus an epsilon).
142
+ # '
143
+ # ' - If the observed time is _after_ the evaluation time (category 2), regardless of
144
+ # ' the status, the probability of being censored is predicted at the evaluation
145
+ # ' time (minus an epsilon).
146
+ # '
147
+ # ' The epsilon is used since, we would not have actual information at time `t`
148
+ # ' for a data point being predicted at time `t` (only data prior to time `t`
149
+ # ' should be available).
150
+ # '
151
+ # ' After the censoring probability is computed, the `trunc` option is used to
152
+ # ' avoid using numbers pathologically close to zero. After this, the weight is
153
+ # ' computed by inverting the censoring probability.
154
+ # '
155
+ # ' The `eps` argument is used to avoid information leakage when computing the
156
+ # ' censoring probability. Subtracting a small number avoids using data that
157
+ # ' would not be known at the time of prediction. For example, if we are making
158
+ # ' survival probability predictions at `eval_time = 3.0`, we would not know the
159
+ # ' about the probability of being censored at that exact time (since it has not
160
+ # ' occurred yet).
161
+ # '
162
+ # ' Note that if there are `n` rows in `data` and `t` time points, the resulting
163
+ # ' data has `n * t` rows. Computations will not easily scale well as `t` becomes
164
+ # ' large.
165
+ # ' @references Graf, E., Schmoor, C., Sauerbrei, W. and Schumacher, M. (1999),
166
+ # ' Assessment and comparison of prognostic classification schemes for survival
167
+ # ' data. _Statist. Med._, 18: 2529-2545.
168
+ # ' @export
169
+ # ' @name censoring_weights
170
+ # ' @keywords internal
171
+ .censoring_weights_graf <- function (object , ... ) {
172
+ UseMethod(" .censoring_weights_graf" )
173
+ }
174
+
175
+ # ' @export
176
+ # ' @rdname censoring_weights
177
+ .censoring_weights_graf.default <- function (object , ... ) {
178
+ cls <- paste0(" '" , class(object ), " '" , collapse = " , " )
179
+ msg <- paste(" There is no `.censoring_weights_graf()` method for objects with class(es):" ,
180
+ cls )
181
+ rlang :: abort(msg )
182
+ }
183
+
184
+
185
+ # ' @export
186
+ # ' @rdname censoring_weights
187
+ .censoring_weights_graf.workflow <- function (object ,
188
+ data ,
189
+ eval_time ,
190
+ rows = NULL ,
191
+ predictors = NULL ,
192
+ trunc = 0.05 , eps = 10 ^- 10 , ... ) {
193
+ if (is.null(object $ fit $ fit )) {
194
+ rlang :: abort(" The workflow does not have a model fit object." , call = FALSE )
195
+ }
196
+ .censoring_weights_graf(object $ fit $ fit , data , eval_time , rows , predictors , trunc , eps )
197
+ }
198
+
199
+ # ' @export
200
+ # ' @rdname censoring_weights
201
+ .censoring_weights_graf.model_fit <- function (object ,
202
+ data ,
203
+ eval_time ,
204
+ rows = NULL ,
205
+ predictors = NULL ,
206
+ trunc = 0.05 , eps = 10 ^- 10 , ... ) {
207
+ rlang :: check_dots_empty()
208
+ .check_censor_model(object )
209
+ if (! is.null(predictors )) {
210
+ rlang :: warn(" The 'predictors' argument to the survival weighting function is not currently used." , call = FALSE )
211
+ }
212
+ eval_time <- .filter_eval_time(eval_time )
213
+
214
+ truth <- object $ preproc $ y_var
215
+ if (length(truth ) != 1 ) {
216
+ # check_outcome() tests that the outcome column is a Surv object
217
+ rlang :: abort(" The event time data should be in a single column with class 'Surv'" , call = FALSE )
218
+ }
219
+ surv_data <- dplyr :: select(data , dplyr :: all_of(!! truth )) %> % setNames(" surv" )
220
+ .check_censored_right(surv_data $ surv )
221
+
222
+ purrr :: map(eval_time ,
223
+ ~ graf_weight_time(surv_data $ surv , .x , eps = eps , rows = rows )) %> %
224
+ purrr :: list_rbind() %> %
225
+ dplyr :: mutate(
226
+ .prob_cens = predict(object $ censor_probs , time = weight_time , as_vector = TRUE ),
227
+ .prob_cens = trunc_probs(.prob_cens , trunc ),
228
+ .weight_cens = 1 / .prob_cens
229
+ ) %> %
230
+ dplyr :: select(.row , eval_time , .prob_cens , .weight_cens )
231
+ }
232
+
233
+ # nocov end
0 commit comments