case weights #692


Merged
45 commits merged on Jun 6, 2022
Changes shown are from 12 of the 45 commits.

Commits
b927ffe
initial case weight work for xy models
topepo Mar 23, 2022
3bd5f37
make the env arg of eval_tidy more explicit
topepo Mar 23, 2022
73eae0b
conversion of weights class to some numeric type
topepo Mar 23, 2022
1d8507b
changes for formula call formation
topepo Mar 23, 2022
b13af41
check to see if the model can use case weights
topepo Mar 23, 2022
b72c7ce
change envir vector name to weights
topepo Mar 23, 2022
3f5f6a7
update model defs for non-standard case weight arg names
topepo Mar 23, 2022
ee75da5
could it possibly be this easy?
topepo Mar 24, 2022
0391906
better approach to handling model.frame() issues in lm()
topepo Mar 24, 2022
7f4676b
no case weights for LiblineaR (they are class weights)
topepo Mar 24, 2022
6ebce7b
add and update unit tests
topepo Mar 25, 2022
450dd48
version updates and remotes
topepo Mar 25, 2022
3bcab7a
Apply suggestions from code review
topepo Mar 28, 2022
6e036fe
function for grouped binomial data
topepo Mar 28, 2022
b7b5765
changes based on reviewer feedback
topepo Mar 28, 2022
c7cc287
re-export hardhat functions
topepo Mar 28, 2022
f2d0159
changes based on reviewer feedback
topepo Mar 28, 2022
95cd8cd
test non-standard argument names
topepo Mar 28, 2022
2929d53
temp bypass for r-devel
topepo Mar 28, 2022
2100e8f
pass case weights to xgboost
topepo Mar 29, 2022
99d1bc3
update tests for xgboost/boost_tree args
topepo Mar 29, 2022
fe2b184
add case weight summary to show_model_info()
topepo Mar 29, 2022
a4facca
added glm_grouped to pkgdown
topepo Mar 30, 2022
48c30be
more unit tests
topepo Mar 30, 2022
a2b1c1a
spark support for case weights
topepo Mar 30, 2022
118c09d
updates to documentation for case weights
topepo Mar 30, 2022
ca3e6c8
add missing topic
topepo Mar 30, 2022
ceb9c0b
more engine doc updates
topepo Mar 31, 2022
8a6f61c
added more notes in engine docs
topepo Mar 31, 2022
eb81af5
added more notes in engine docs
topepo Mar 31, 2022
69c7c14
gam weights
topepo Mar 31, 2022
33079a3
Merge branch 'main' into feature/case-weights
topepo Apr 12, 2022
c75ed66
doc update
topepo Apr 13, 2022
bc81160
revert nnet case weights
topepo Apr 13, 2022
0698b6d
S3 method to convert hardhat format to numeric
topepo Apr 14, 2022
2aaee25
Merge branch 'main' into feature/case-weights
topepo Apr 21, 2022
7c70d26
Ensure that `fit_xy()` patches the formula environment with weights (…
DavisVaughan Apr 21, 2022
2f18332
updated for latest roxygen2
topepo Apr 21, 2022
68e5c97
get xgb to stop being so chatty
topepo Apr 21, 2022
284252b
update snapshots
topepo Apr 21, 2022
64634a0
Merge branch 'main' into feature/case-weights
topepo May 19, 2022
4cb16cf
doc update
topepo May 19, 2022
f2f24a0
missing doc entry
topepo May 19, 2022
205af75
Merge branch 'main' into feature/case-weights
topepo Jun 2, 2022
a6e7849
remove convert_case_weights
topepo Jun 6, 2022
32 changes: 9 additions & 23 deletions DESCRIPTION
@@ -21,11 +21,11 @@ Depends:
Imports:
cli,
dplyr (>= 0.8.0.1),
generics (>= 0.1.0.9000),
generics (>= 0.1.2),
ggplot2,
globals,
glue,
hardhat (>= 0.1.6.9001),
hardhat (>= 0.2.0.9000),
lifecycle,
magrittr,
prettyunits,
@@ -40,9 +40,8 @@ Imports:
Suggests:
C50,
covr,
dials (>= 0.0.10.9001),
dials (>= 0.1.0),
earth,
tensorflow,
ggrepel,
keras,
kernlab,
@@ -60,30 +59,17 @@ Suggests:
rpart,
sparklyr (>= 1.0.0),
survival,
tensorflow,
testthat,
xgboost (>= 1.5.0.1)
Remotes:
tidymodels/hardhat
VignetteBuilder:
knitr
ByteCompile: true
Config/Needs/website:
C50,
dbarts,
earth,
glmnet,
keras,
kernlab,
kknn,
LiblineaR,
mgcv,
nnet,
parsnip,
randomForest,
ranger,
rpart,
rstanarm,
tidymodels/tidymodels,
tidyverse/tidytemplate,
rstudio/reticulate,
Config/Needs/website: C50, dbarts, earth, glmnet, keras, kernlab, kknn,
LiblineaR, mgcv, nnet, parsnip, randomForest, ranger, rpart, rstanarm,
tidymodels/tidymodels, tidyverse/tidytemplate, rstudio/reticulate,
xgboost
Config/rcmdcheck/ignore-inconsequential-notes: true
Encoding: UTF-8
22 changes: 21 additions & 1 deletion R/arguments.R
@@ -149,10 +149,14 @@ make_call <- function(fun, ns, args, ...) {

make_form_call <- function(object, env = NULL) {
fit_args <- object$method$fit$args
uses_weights <- !is.null(env$weights)

# Get the arguments related to data:
if (is.null(object$method$fit$data)) {
data_args <- c(formula = "formula", data = "data")
if (uses_weights) {
data_args["weights"] <- "weights"
}
} else {
data_args <- object$method$fit$data
}
@@ -165,6 +169,13 @@ make_form_call <- function(object, env = NULL) {
# sub in actual formula
fit_args[[ unname(data_args["formula"]) ]] <- env$formula

# Add in case weights symbol
if (uses_weights) {
fit_args[[ unname(data_args["weights"]) ]] <- rlang::expr(weights)
}


# TODO remove weights col from data?
if (object$engine == "spark") {
env$x <- env$data
}
@@ -177,12 +188,17 @@ make_form_call <- function(object, env = NULL) {
fit_call
}
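For the default formula mapping above, the constructed call takes roughly this shape. The snippet below is only a sketch using rlang directly, with stats::lm() chosen for illustration; it is not generated parsnip output:

library(rlang)

# The formula, data, and weights symbols are later resolved in the
# environment that fit() builds (see R/fit.R below).
fit_call <- call2(
  "lm",
  formula = expr(formula),
  data = expr(data),
  weights = expr(weights),
  .ns = "stats"
)
fit_call
#> stats::lm(formula = formula, data = data, weights = weights)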

make_xy_call <- function(object, target) {
# TODO we need something to indicate that case weights are being used.
make_xy_call <- function(object, target, env) {
fit_args <- object$method$fit$args
uses_weights <- !is.null(env$weights)

# Get the arguments related to data:
if (is.null(object$method$fit$data)) {
data_args <- c(x = "x", y = "y")
if (uses_weights) {
data_args["weights"] <- "weights"
}
} else {
data_args <- object$method$fit$data
}
@@ -196,6 +212,9 @@ make_xy_call <- function(object, target) {
matrix = rlang::expr(maybe_matrix(x)),
rlang::abort(glue::glue("Invalid data type target: {target}."))
)
if (uses_weights) {
object$method$fit$args[[ unname(data_args["weights"]) ]] <- rlang::expr(weights)
}

fit_call <- make_call(
fun = object$method$fit$func["fun"],
@@ -268,3 +287,4 @@ min_rows <- function(num_rows, source, offset = 0) {

as.integer(num_rows)
}
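Likewise for the xy interface, here is a sketch of the call built when weights are supplied, using ranger's argument mapping from this PR (x to x, y to y, weights to case.weights). The maybe_data_frame() helper is assumed to be parsnip's coercion function for the data.frame target, analogous to the maybe_matrix() shown above:

library(rlang)

xy_call <- call2(
  "ranger",
  x = expr(maybe_data_frame(x)),
  y = expr(y),
  case.weights = expr(weights),
  .ns = "ranger"
)
xy_call
#> ranger::ranger(x = maybe_data_frame(x), y = y, case.weights = weights)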

39 changes: 33 additions & 6 deletions R/fit.R
@@ -18,6 +18,8 @@
#' below). A data frame containing all relevant variables (e.g.
#' outcome(s), predictors, case weights, etc). Note: when needed, a
#' \emph{named argument} should be used.
#' @param case_weights A vector of numeric case weights with an underlying class
#' of "`hardhat_case_weights`". See [hardhat::frequency_weights()] for an example.
#' @param control A named list with elements `verbosity` and
#' `catch`. See [control_parsnip()].
#' @param ... Not currently used; values passed here will be
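A minimal usage sketch of the new argument (hypothetical data; assumes hardhat >= 0.2.0, which provides frequency_weights()):

library(parsnip)

# `n` records how many times each row was observed (hypothetical data).
dat <- data.frame(
  y = c(1.1, 2.3, 3.8, 5.2, 6.0),
  x = 1:5,
  n = c(1L, 2L, 1L, 3L, 1L)
)

linear_reg() %>%
  set_engine("lm") %>%
  fit(y ~ x, data = dat, case_weights = hardhat::frequency_weights(dat$n))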
@@ -101,6 +103,7 @@ fit.model_spec <-
function(object,
formula,
data,
case_weights = NULL,
control = control_parsnip(),
...
) {
@@ -110,6 +113,8 @@ fit.model_spec <-
if (!identical(class(control), class(control_parsnip()))) {
rlang::abort("The 'control' argument should have class 'control_parsnip'.")
}
check_case_weights(case_weights, object)

dots <- quos(...)

if (length(possible_engines(object)) == 0) {
@@ -129,15 +134,31 @@
}
}

if (all(c("x", "y") %in% names(dots)))
if (all(c("x", "y") %in% names(dots))) {
rlang::abort("`fit.model_spec()` is for the formula methods. Use `fit_xy()` instead.")
}
cl <- match.call(expand.dots = TRUE)
# Create an environment with the evaluated argument objects. This will be
# used when a model call is made later.
eval_env <- rlang::env()

wts <- weights_to_numeric(case_weights)

# `lm()`, `glm()`, and other functions use the original model call to
# construct a call to `model.frame()`. That call will normally fail because
# the formula carries its own environment (usually the global environment),
# and `model.frame()` will look there for a vector named 'weights'. To make
# the weights (and the data) discoverable, we clone the formula's environment,
# stash those objects in the clone, and reattach it to the formula.
fenv <- rlang::env_clone(environment(formula))
fenv$data <- data
fenv$weights <- wts
environment(formula) <- fenv

eval_env$data <- data
eval_env$formula <- formula
eval_env$weights <- wts

fit_interface <-
check_interface(eval_env$formula, eval_env$data, cl, object)
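The pattern described in the comment above can be illustrated outside of parsnip. A standalone sketch follows, in which fit_with_weights() is a hypothetical wrapper and not part of this PR:

# stats::lm() resolves the `weights` symbol via model.frame(), which looks in
# the data and then in the environment attached to the formula, not in the
# wrapper's own frame. Cloning the formula environment and stashing the
# vector there makes the lookup succeed.
fit_with_weights <- function(formula, data, wts) {
  f_env <- rlang::env_clone(environment(formula))
  f_env$weights <- wts
  environment(formula) <- f_env
  stats::lm(formula, data = data, weights = weights)
}

d <- data.frame(y = rnorm(20), x = rnorm(20))
fit_with_weights(y ~ x, d, runif(20))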

@@ -206,6 +227,7 @@ fit_xy.model_spec <-
function(object,
x,
y,
case_weights = NULL,
control = control_parsnip(),
...
) {
@@ -223,6 +245,8 @@
if (is.null(colnames(x))) {
rlang::abort("'x' should have column names.")
}
check_case_weights(case_weights, object)

object <- check_mode(object, levels(y))
dots <- quos(...)
if (is.null(object$engine)) {
@@ -245,6 +269,9 @@
eval_env <- rlang::env()
eval_env$x <- x
eval_env$y <- y
eval_env$weights <- weights_to_numeric(case_weights)

# TODO case weights: pass in eval_env not individual elements
fit_interface <- check_xy_interface(eval_env$x, eval_env$y, cl, object)
Review comment on lines +271 to 272 (Member):

Idk, I feel like passing in the minimal amount of information required makes it easier to understand at a glance what check_xy_interface() is actually checking. So, to me, if this function needs to check the case weights in some way, it should get a case_weights argument so that readers of the code can see that it checks them.


if (object$engine == "spark")
@@ -306,18 +333,18 @@ fit_xy.model_spec <-

# ------------------------------------------------------------------------------

eval_mod <- function(e, capture = FALSE, catch = FALSE, ...) {
eval_mod <- function(e, capture = FALSE, catch = FALSE, envir = NULL, ...) {
if (capture) {
if (catch) {
junk <- capture.output(res <- try(eval_tidy(e, ...), silent = TRUE))
junk <- capture.output(res <- try(eval_tidy(e, env = envir, ...), silent = TRUE))
} else {
junk <- capture.output(res <- eval_tidy(e, ...))
junk <- capture.output(res <- eval_tidy(e, env = envir, ...))
}
} else {
if (catch) {
res <- try(eval_tidy(e, ...), silent = TRUE)
res <- try(eval_tidy(e, env = envir, ...), silent = TRUE)
} else {
res <- eval_tidy(e, ...)
res <- eval_tidy(e, env = envir, ...)
Review comment on lines -312 to +344 (Member):

Ok, since we are already touching this I have another comment:

eval_tidy() has a signature of eval_tidy(expr, data = NULL, env = caller_env()), so unless the ... contain data, they will never be used. Which makes me think: can we remove the ... entirely?

  • If we do happen to pass the data through, we should replace the ... with an explicit data argument in eval_mod() and pass that through instead.

  • If we never use the data argument of eval_tidy(), we should just remove the dots.

}
}
res
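A sketch of the dots-free eval_mod() suggested in the review comment above (hypothetical; the PR as shown keeps the dots):

eval_mod <- function(e, capture = FALSE, catch = FALSE, envir = NULL) {
  if (capture) {
    if (catch) {
      junk <- utils::capture.output(
        res <- try(rlang::eval_tidy(e, env = envir), silent = TRUE)
      )
    } else {
      junk <- utils::capture.output(res <- rlang::eval_tidy(e, env = envir))
    }
  } else {
    if (catch) {
      res <- try(rlang::eval_tidy(e, env = envir), silent = TRUE)
    } else {
      res <- rlang::eval_tidy(e, env = envir)
    }
  }
  res
}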
46 changes: 41 additions & 5 deletions R/fit_helpers.R
@@ -39,7 +39,7 @@ form_form <-
fit_call,
capture = control$verbosity == 0,
catch = control$catch,
env = env,
envir = env,
...
Review comment (Member):

If we remove the ... from eval_mod(), that means we shouldn't pass the dots through here, which I think makes sense? I don't currently see any reason why we do this.

(We should check all other uses of eval_mod(); I think we pass the dots through in more places than just this one.)

),
gcFirst = FALSE
@@ -49,7 +49,7 @@
fit_call,
capture = control$verbosity == 0,
catch = control$catch,
env = env,
envir = env,
...
)
elapsed <- list(elapsed = NA_real_)
@@ -88,7 +88,7 @@ xy_xy <- function(object, env, control, target = "none", ...) {
# sub in arguments to actual syntax for corresponding engine
object <- translate(object, engine = object$engine)

fit_call <- make_xy_call(object, target)
fit_call <- make_xy_call(object, target, env)

res <- list(lvl = levels(env$y), spec = object)

@@ -98,7 +98,7 @@
fit_call,
capture = control$verbosity == 0,
catch = control$catch,
env = env,
envir = env,
...
),
gcFirst = FALSE
@@ -108,7 +108,7 @@
fit_call,
capture = control$verbosity == 0,
catch = control$catch,
env = env,
envir = env,
...
)
elapsed <- list(elapsed = NA_real_)
@@ -200,3 +200,39 @@ xy_form <- function(object, env, control, ...) {
res
}


weights_to_numeric <- function(x) {
if (is.null(x)) {
return(NULL)
}

to_int <- c("hardhat_frequency_weights")
if (inherits(x, to_int)) {
x <- as.integer(x)
} else {
x <- as.numeric(x)
}
Review comment (Member):

I would either use:

if (hardhat::is_frequency_weights(x)) {
  x <- as.integer(x)
} else if (hardhat::is_importance_weights(x)) {
  x <- as.double(x)
} else {
  abort("Unknown type of case weights.", .internal = TRUE)
}

Or:

if (hardhat::is_frequency_weights(x)) {
  x <- as.integer(x)
} else {
  x <- as.double(x)
}

The first one is if we want to be very selective about the types of case weights that parsnip supports.

The second one is if we generally just want to try to convert any kind of case weights to double, with a special case for frequency weights.

Mainly I want to avoid hardcoding "hardhat_frequency_weights" anywhere; that's what the is_frequency_weights() helper is for.

x
}
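Roughly, the helper's behavior for the two hardhat classes (an illustrative sketch; weights_to_numeric() is internal, so this is written as if evaluated inside the package namespace):

fw <- hardhat::frequency_weights(c(1L, 2L, 3L))
iw <- hardhat::importance_weights(c(0.5, 1, 2))

weights_to_numeric(fw)    # bare integer vector: 1L 2L 3L
weights_to_numeric(iw)    # bare double vector: 0.5 1.0 2.0
weights_to_numeric(NULL)  # NULL, so "no weights" flows through untouched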

case_weights_allowed <- function(spec) {
mod_type <- class(spec)[1]
mod_eng <- spec$engine
mod_mode <- spec$mode

model_info <-
get_from_env(paste0(mod_type, "_fit")) %>%
dplyr::filter(engine == mod_eng & mode == mod_mode)
if (nrow(model_info) != 1) {
rlang::abort(
glue::glue(
"Error in getting model information for model {mod_type} with engine {mod_eng} and mode {mod_mode}."
)
)
}
# If weights are used, they are protected data arguments with the canonical
# name 'weights' (although this may not be the model function's argument name).
data_args <- model_info$value[[1]]$protect
any(data_args == "weights")
}
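A hypothetical check against two of the engines touched in this PR (illustrative only; the results depend on the registered engine metadata):

# lm() exposes a `weights` argument, so the protected args include "weights":
case_weights_allowed(linear_reg() %>% set_engine("lm"))
#> [1] TRUE

# LiblineaR's `wi` argument sets class weights, not case weights, and was
# removed from the protected arguments in this PR:
case_weights_allowed(svm_linear(mode = "regression") %>% set_engine("LiblineaR"))
#> [1] FALSE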

2 changes: 1 addition & 1 deletion R/logistic_reg_data.R
@@ -220,7 +220,7 @@ set_fit(
mode = "classification",
value = list(
interface = "matrix",
protect = c("x", "y", "wi"),
protect = c("x", "y"),
data = c(x = "data", y = "target"),
func = c(pkg = "LiblineaR", fun = "LiblineaR"),
defaults = list(verbose = FALSE)
15 changes: 15 additions & 0 deletions R/misc.R
@@ -385,3 +385,18 @@ stan_conf_int <- function(object, newdata) {

penalty
}


check_case_weights <- function(x, spec) {
if (is.null(x)) {
return(invisible(NULL))
}
if (!inherits(x, "hardhat_case_weights")) {
rlang::abort("'case_weights' should be a single numeric vector of class 'hardhat_case_weights'.")
}
allowed <- case_weights_allowed(spec)
if (!allowed) {
rlang::abort("Case weights are not enabled by the underlying model implementation.")
}
invisible(NULL)
}
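The corresponding failure modes, roughly (a sketch; the messages come from the code above):

# Rejected up front: not a hardhat case-weights vector.
check_case_weights(c(1, 2, 3), linear_reg() %>% set_engine("lm"))

# Valid weights, but the engine's protected arguments have no `weights` entry,
# so this aborts with "Case weights are not enabled by the underlying model
# implementation."
check_case_weights(
  hardhat::frequency_weights(c(1L, 2L, 3L)),
  svm_linear(mode = "regression") %>% set_engine("LiblineaR")
)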
2 changes: 1 addition & 1 deletion R/parsnip-package.R
@@ -41,7 +41,7 @@ utils::globalVariables(
"sub_neighbors", ".pred_class", "x", "y", "predictor_indicators",
"compute_intercept", "remove_intercept", "estimate", "term",
"call_info", "component", "component_id", "func", "tunable", "label",
"pkg", ".order", "item", "tunable", "has_ext"
"pkg", ".order", "item", "tunable", "has_ext", "weights"
)
)

6 changes: 4 additions & 2 deletions R/rand_forest_data.R
@@ -122,7 +122,8 @@ set_fit(
mode = "classification",
value = list(
interface = "data.frame",
protect = c("x", "y", "case.weights"),
data = c(x = "x", y = "y", weights = "case.weights"),
protect = c("x", "y", "weights"),
func = c(pkg = "ranger", fun = "ranger"),
defaults =
list(
@@ -151,7 +152,8 @@ set_fit(
mode = "regression",
value = list(
interface = "data.frame",
protect = c("x", "y", "case.weights"),
data = c(x = "x", y = "y", weights = "case.weights"),
protect = c("x", "y", "weights"),
func = c(pkg = "ranger", fun = "ranger"),
defaults =
list(
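Putting the ranger mapping together, an end-to-end sketch (hypothetical; assumes the ranger and hardhat packages are installed):

library(parsnip)

dat <- mtcars
wts <- hardhat::importance_weights(runif(nrow(dat)))

rand_forest(mode = "regression", trees = 100) %>%
  set_engine("ranger") %>%
  fit(mpg ~ ., data = dat, case_weights = wts)
# parsnip's canonical `weights` is handed to ranger's `case.weights`
# argument via the `data` mapping above.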
5 changes: 3 additions & 2 deletions R/svm_linear_data.R
@@ -33,7 +33,7 @@ set_fit(
mode = "regression",
value = list(
interface = "matrix",
protect = c("x", "y", "wi"),
protect = c("x", "y"),
data = c(x = "data", y = "target"),
func = c(pkg = "LiblineaR", fun = "LiblineaR"),
defaults = list(type = 11)
@@ -47,7 +47,8 @@
value = list(
interface = "matrix",
data = c(x = "data", y = "target"),
protect = c("x", "y", "wi"),
protect = c("x", "y"),
data = c(x = "data", y = "target"),
func = c(pkg = "LiblineaR", fun = "LiblineaR"),
defaults = list(type = 1)
)