tidymodels · topepo · Feb 14, 2023 · Jan 12, 2023 · Jan 12, 2023 · Jan 13, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: parsnip
 Title: A Common API to Modeling and Analysis Functions
-Version: 1.0.3.9000
+Version: 1.0.3.9001
 Authors@R: c(
     person("Max", "Kuhn", , "[email protected]", role = c("aut", "cre")),
     person("Davis", "Vaughan", , "[email protected]", role = "aut"),
@@ -54,6 +54,7 @@ Suggests:
     mgcv,
     modeldata,
     nlme,
+    prodlim,
     ranger (>= 0.12.0),
     remotes,
     rmarkdown,
@@ -75,4 +76,4 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.1.9000
+RoxygenNote: 7.2.3
diff --git a/NAMESPACE b/NAMESPACE
@@ -29,6 +29,7 @@ S3method(nullmodel,default)
 S3method(predict,"_elnet")
 S3method(predict,"_lognet")
 S3method(predict,"_multnet")
+S3method(predict,censoring_model_reverse_km)
 S3method(predict,model_fit)
 S3method(predict,model_spec)
 S3method(predict,nullmodel)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,7 @@
 # parsnip (development version)
 
+* For censored regression models, a "reverse Kaplan-Meier" curve is computed for the censoring distribution. This can be used when evaluating this type of model (#855).
+
 # parsnip 1.0.3
 
 * Adds documentation and tuning infrastructure for the new `flexsurvspline` engine for the `survival_reg()` model specification from the `censored` package (@mattwarkentin, #831).

diff --git a/R/censoring_probs.R b/R/censoring_probs.R
@@ -0,0 +1,107 @@
+# nocov start
+# tested in the extratests repo
+
+# Constructor for censoring-model objects.
+#
+# Bundles the formula, the fitted object, a label, and the packages required
+# for prediction into a classed list. The class vector is
+# `censoring_model_<label>`, then `censoring_model`, then any `extra_cls`.
+new_censoring_model <- function(formula,
+                                object,
+                                pkgs = character(0),
+                                label = character(0),
+                                extra_cls = character(0)) {
+  structure(
+    list(
+      formula = formula,
+      fit = object,
+      label = label,
+      required_pkgs = pkgs
+    ),
+    class = c(paste0("censoring_model_", label), "censoring_model", extra_cls)
+  )
+}
+
+# ------------------------------------------------------------------------------
+# estimate the reverse km curve for censored regression models
+
+# Build the censoring-probability model that is stored alongside a fit.
+#
+# `obj` is the model specification; `eval_env` is the environment used for
+# fitting, from which `formula`, `data`, and (optionally) `weights` are read.
+#
+# When the mode is not "censored regression", an empty list is returned.
+# Otherwise a reverse Kaplan-Meier curve is estimated with prodlim::prodlim()
+# and wrapped by new_censoring_model(). The fit is best-effort: if prodlim
+# fails, the "try-error" object is stored in the `fit` element instead.
+make_cens_prob_model <- function(obj, eval_env) {
+  if (obj$mode != "censored regression") {
+    return(list())
+  }
+  rlang::check_installed("prodlim")
+
+  # Note: even when fit_xy() is called, eval_env will still have
+  # objects data and formula in them
+  f <- eval_env$formula
+  # Keep the outcome but drop all predictors (intercept-only right-hand side)
+  km_form <- stats::update(f, ~ 1)
+  cl <-
+    rlang::call2(
+      "prodlim",
+      formula = km_form,
+      .ns = "prodlim",
+      reverse = TRUE,
+      type = "surv",
+      x = FALSE,
+      data = rlang::expr(eval_env$data)
+    )
+
+  if (!is.null(eval_env$weights)) {
+    cl <- rlang::call_modify(cl, caseweights = rlang::expr(eval_env$weights))
+  }
+  rkm <- try(rlang::eval_tidy(cl), silent = TRUE)
+  if (!inherits(rkm, "try-error")) {
+    # Point both stored formulas at the base environment instead of the one
+    # they captured (presumably to avoid carrying the fitting environment
+    # around with the object). This must stay inside the guard: a "try-error"
+    # is an atomic vector, so `rkm$formula` would itself throw.
+    attr(f, ".Environment") <- rlang::base_env()
+    attr(rkm$formula, ".Environment") <- rlang::base_env()
+  }
+  new_censoring_model(f, object = rkm, label = "reverse_km", pkgs = "prodlim")
+}
+
+# ------------------------------------------------------------------------------
+# Basic S3 methods
+
+# Print method: a one-line description built from the model's label.
+# Returns its input invisibly.
+print.censoring_model <- function(x, ...) {
+  cat(
+    x$label,
+    "model for predicting the probability of censoring\n",
+    sep = " "
+  )
+  invisible(x)
+}
+
+# Fallback predict method for censoring models that have no specific
+# implementation: always signals an error naming the model's label.
+predict.censoring_model <- function(object, ...) {
+  # rlang::abort() never returns, so nothing follows it
+  rlang::abort(
+    paste("Don't know how to predict with a censoring model of type:", object$label)
+  )
+}
+
+# Predict the probability of censoring from a reverse Kaplan-Meier fit.
+#
+# `object`    a censoring model whose `fit` element is passed to predict().
+# `new_data`  optional data forwarded as `newdata`; omitted when NULL.
+# `time`      vector of time points; NA entries yield NA in the result.
+# `as_vector` if FALSE (the default) the result is a one-column tibble named
+#             `.prob_censored`; if TRUE the bare vector is returned.
+#
+# Predicted probabilities that are exactly zero are replaced by a small
+# positive value.
+#' @export
+predict.censoring_model_reverse_km <- function(object,
+                                               new_data = NULL,
+                                               time,
+                                               as_vector = FALSE,
+                                               ...) {
+  rlang::check_installed("prodlim")
+
+  # Some time values might be NA (for Graf category 2); they are left as NA
+  # in the output and are not passed to predict()
+  observed <- !is.na(time)
+  res <- rep(NA_real_, length(time))
+  eval_time <- time[observed]
+
+  if (length(eval_time) > 0) {
+    predict_one <-
+      if (is.null(new_data)) {
+        function(.t) predict(object$fit, times = .t, type = "surv")
+      } else {
+        function(.t) {
+          predict(object$fit, newdata = new_data, times = .t, type = "surv")
+        }
+      }
+    probs <- purrr::map_dbl(eval_time, predict_one)
+
+    is_zero <- !is.na(probs) & probs == 0
+    if (any(is_zero)) {
+      # Don't want censoring probabilities of zero so add an epsilon
+      # Either use 1/n or half of the minimum survival probability
+      surv <- object$fit$surv
+      n <- max(object$fit$n.risk)
+      half_min_surv_prob <- min(surv[surv > 0]) / 2
+      probs[is_zero] <- min(1 / n, half_min_surv_prob)
+    }
+
+    if (all(observed)) {
+      # No missing times: the predictions are the result as-is
+      res <- probs
+    } else {
+      res[observed] <- probs
+    }
+  }
+
+  if (as_vector) {
+    return(res)
+  }
+  tibble::tibble(.prob_censored = unname(res))
+}
+
+# nocov end
diff --git a/R/fit.R b/R/fit.R
@@ -54,6 +54,12 @@
 #' `options(contrasts = c(unordered = "contr.helmert", ordered = "contr.poly"))`.
 #' See the help page for [stats::contr.treatment()] for more possible contrast
 #' types.
+#'
+#' For models with `"censored regression"` modes, an additional computation is
+#' executed and saved in the parsnip object. The `censor_probs` element contains
+#' a "reverse Kaplan-Meier" curve that models the probability of censoring. This
+#' may be used later to compute inverse probability censoring weights for
+#' performance measures.
 #' @examples
 #' # Although `glm()` only has a formula interface, different
 #' # methods for specifying the model can be used
@@ -206,6 +212,7 @@ fit.model_spec <-
 
         rlang::abort(glue::glue("{interfaces} is unknown."))
       )
+    res$censor_probs <- make_cens_prob_model(object, eval_env)
     model_classes <- class(res$fit)
     class(res) <- c(paste0("_", model_classes[1]), "model_fit")
     res
@@ -317,6 +324,7 @@ fit_xy.model_spec <-
           ),
         rlang::abort(glue::glue("{interfaces} is unknown."))
       )
+    res$censor_probs <- make_cens_prob_model(object, eval_env)
     model_classes <- class(res$fit)
     class(res) <- c(paste0("_", model_classes[1]), "model_fit")
     res

diff --git a/R/fit_helpers.R b/R/fit_helpers.R
@@ -197,5 +197,3 @@ xy_form <- function(object, env, control, ...) {
   res$preproc <- data_obj[c("x_var", "y_var")]
   res
 }
-
-
diff --git a/man/fit.Rd b/man/fit.Rd
diff --git a/tests/testthat/README.md b/tests/testthat/README.md
@@ -0,0 +1,2 @@
+Note that some functionality in parsnip is tested outside of the package. Due to a high degree of dependencies, many additional tests are in the [extratests](https://github.com/tidymodels/extratests/tree/main/tests/testthat) repository. These are run nightly with CRAN and GitHub versions of parsnip as well as other tidymodels packages.
+
Original file line number	Diff line number	Diff line change
Expand Up		@@ -197,5 +197,3 @@ xy_form <- function(object, env, control, ...) {
		res$preproc <- data_obj[c("x_var", "y_var")]
		res
		}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Note that some functionality in parsnip is tested outside of the package. Due to a high degree of dependencies, many additional tests are in the [extratests](https://github.com/tidymodels/extratests/tree/main/tests/testthat) repository. These are run nightly with CRAN and GitHub versions of parsnip as well as other tidymodels packages.