Add argument for one hot encoding to parsnip #332

Merged · 31 commits · Jul 2, 2020
Commits (31)
8a3b4b7
Add one hot option to encoding options
juliasilge Jun 18, 2020
3a3743e
one_hot = FALSE for almost all models, one_hot = TRUE for glmnet models
juliasilge Jun 18, 2020
1f4c40b
changed one_hot to logical; less confusing
topepo Jun 24, 2020
4c6641c
revert glmnet encodings to one_hot
topepo Jun 24, 2020
e169f8f
Switch from logical to none/traditional/one_hot
juliasilge Jun 26, 2020
51c18ad
Update predictor_indicators in model infrastructure
juliasilge Jun 26, 2020
3420156
change objective function name for xgboost regression
topepo Jun 30, 2020
2430518
more encoding updates related to intercepts
topepo Jun 30, 2020
00e3180
set defaults for parsnip objects with no encoding information
topepo Jun 30, 2020
91cc98d
"one-hot" not "one_hot"
topepo Jun 30, 2020
cb68875
apply encoding changes to form_xy and xy_form paths
topepo Jun 30, 2020
3ebd066
fully export contrast function
topepo Jun 30, 2020
c30a50a
"one_hot" not "one-hot"
topepo Jun 30, 2020
9c7df98
fixed a few bugs
topepo Jul 1, 2020
164c4d3
revert xgboost change (in another PR)
topepo Jul 1, 2020
ac2aa17
updated news
topepo Jul 1, 2020
856c829
two more global variable false positives
topepo Jul 1, 2020
8503ae1
updates for how many engines handle dummy variables (if at all)
topepo Jul 1, 2020
c76ec17
details on encoding options
topepo Jul 1, 2020
d7eee45
one_hot documentation
topepo Jul 1, 2020
a2308d9
Update R/aaa_models.R
topepo Jul 1, 2020
7318d7f
Update R/aaa_models.R
topepo Jul 1, 2020
334f01c
Update R/aaa_models.R
topepo Jul 1, 2020
110ca67
Update R/aaa_models.R
topepo Jul 1, 2020
d70e414
Update R/aaa_models.R
topepo Jul 1, 2020
ea3ec8c
Update R/contr_one_hot.R
topepo Jul 1, 2020
fc4f165
Update man/rmd/one-hot.Rmd
topepo Jul 1, 2020
9a11306
Update man/rmd/one-hot.Rmd
topepo Jul 1, 2020
ccd52bb
documentation updates for one-hot
topepo Jul 1, 2020
d04b892
Update man/rmd/one-hot.Rmd
topepo Jul 2, 2020
f164247
Update man/rmd/one-hot.Rmd
topepo Jul 2, 2020
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -101,6 +101,7 @@ export(add_rowindex)
export(boost_tree)
export(check_empty_ellipse)
export(check_final_param)
export(contr_one_hot)
export(control_parsnip)
export(convert_stan_interval)
export(decision_tree)
Expand Down
10 changes: 8 additions & 2 deletions NEWS.md
@@ -1,14 +1,20 @@
# parsnip (development version)

## Breaking Changes

* `parsnip` now has options to set specific types of predictor encodings for different models. For example, `ranger` models run using `parsnip` and `workflows` do the same thing by _not_ creating indicator variables. These encodings can be overridden using the `blueprint` options in `workflows`. As a consequence, it is possible to get a different model fit than previous versions of `parsnip`. More details about specific encoding changes are below. (#326)

## Other Changes

* `tidyr` >= 1.0.0 is now required.

* SVM models produced by `kernlab` now use the formula method. This change was due to how `ksvm()` made indicator variables for factor predictors (with one-hot encodings). Since the ordinary formula method did not do this, the data are passed as-is to `ksvm()` so that the results are closer to what one would get if `ksvm()` were called directly.
* SVM models produced by `kernlab` now use the formula method (see breaking change notice above). This change was due to how `ksvm()` made indicator variables for factor predictors (with one-hot encodings). Since the ordinary formula method did not do this, the data are passed as-is to `ksvm()` so that the results are closer to what one would get if `ksvm()` were called directly.

* MARS models produced by `earth` now use the formula method.

* Under-the-hood changes were made so that non-standard data arguments in the modeling packages can be accomodated. (#315)
* For `xgboost`, a one-hot encoding is used when indicator variables are created.

* Under-the-hood changes were made so that non-standard data arguments in the modeling packages can be accommodated. (#315)

## New Features

Expand Down
3 changes: 2 additions & 1 deletion R/aaa.R
Expand Up @@ -39,7 +39,8 @@ utils::globalVariables(
'lab', 'original', 'predicted_label', 'prediction', 'value', 'type',
"neighbors", ".submodels", "has_submodel", "max_neighbor", "max_penalty",
"max_terms", "max_tree", "model", "name", "num_terms", "penalty", "trees",
"sub_neighbors", ".pred_class", "x", "y", "predictor_indicators")
"sub_neighbors", ".pred_class", "x", "y", "predictor_indicators",
"compute_intercept", "remove_intercept")
)

# nocov end
65 changes: 55 additions & 10 deletions R/aaa_models.R
Expand Up @@ -323,11 +323,8 @@ check_interface_val <- function(x) {
#' below, depending on context.
#' @param pre,post Optional functions for pre- and post-processing of prediction
#' results.
#' @param options A list of options for engine-specific encodings. Currently,
#' the option implemented is `predictor_indicators` which tells `parsnip`
#' whether the pre-processing should make indicator/dummy variables from factor
#' predictors. This only affects cases when [fit.model_spec()] is used and the
#' underlying model has an x/y interface.
#' @param options A list of options for engine-specific preprocessing encodings.
#' See Details below.
#' @param ... Optional arguments that should be passed into the `args` slot for
#' prediction objects.
#' @keywords internal
Expand All @@ -347,6 +344,36 @@ check_interface_val <- function(x) {
#' already been registered. `check_model_doesnt_exist()` checks the model value
#' and also checks to see if it is novel in the environment.
#'
#' The options for engine-specific encodings dictate how the predictors should be
#' handled. These options ensure that the data that `parsnip` gives to the
#' underlying model allow for a model fit that is as similar as possible to
#' what would have been produced had the engine been called directly.
#'
#' For example, if `fit()` is used to fit a model that does not have
#' a formula interface, typically some predictor preprocessing must
#' be conducted. `glmnet` is a good example of this.
#'
#' There are three options that can be used for the encodings:
#'
#' `predictor_indicators` describes whether and how to create indicator/dummy
#' variables from factor predictors. There are three options: `"none"` (do not
#' expand factor predictors), `"traditional"` (apply the standard
#' `model.matrix()` encodings), and `"one_hot"` (create the complete set
#' including the baseline level for all factors). This encoding only affects
#' cases when [fit.model_spec()] is used and the underlying model has an x/y
#' interface.
#'
#' Another option is `compute_intercept`; this controls whether `model.matrix()`
#' should include the intercept in its formula. This affects more than the
#' inclusion of an intercept column. With an intercept, `model.matrix()`
#' computes dummy variables for all but one factor level. Without an
#' intercept, `model.matrix()` computes a full set of indicators for the
#' _first_ factor variable, but an incomplete set for the remainder.
#'
#' Finally, the option `remove_intercept` will remove the intercept column
#' _after_ `model.matrix()` is finished. This can be useful if the model
#' function (e.g. `lm()`) automatically generates an intercept.
#'
#' @references "Making a parsnip model from scratch"
#' \url{https://tidymodels.github.io/parsnip/articles/articles/Scratch.html}
#' @examples
Expand Down Expand Up @@ -791,7 +818,9 @@ check_encodings <- function(x) {
if (!is.list(x)) {
rlang::abort("`values` should be a list.")
}
req_args <- list(predictor_indicators = TRUE)
req_args <- list(predictor_indicators = rlang::na_chr,
compute_intercept = rlang::na_lgl,
remove_intercept = rlang::na_lgl)

missing_args <- setdiff(names(req_args), names(x))
if (length(missing_args) > 0) {
Expand Down Expand Up @@ -834,9 +863,12 @@ set_encoding <- function(model, mode, eng, options) {
current <- get_from_env(nm)
dup_check <-
current %>%
dplyr::inner_join(new_values, by = c("model", "engine", "mode", "predictor_indicators"))
dplyr::inner_join(
new_values,
by = c("model", "engine", "mode", "predictor_indicators")
)
if (nrow(dup_check)) {
rlang::abort(glue::glue("Engine '{eng}' and mode '{mode}' already have defined encodings."))
rlang::abort(glue::glue("Engine '{eng}' and mode '{mode}' already have defined encodings for model '{model}'."))
}

} else {
Expand All @@ -856,6 +888,19 @@ set_encoding <- function(model, mode, eng, options) {
get_encoding <- function(model) {
check_model_exists(model)
nm <- paste0(model, "_encoding")
rlang::env_get(get_model_env(), nm)
res <- try(get_from_env(nm), silent = TRUE)
if (inherits(res, "try-error")) {
# for objects made before encodings were specified in parsnip
res <-
get_from_env(model) %>%
dplyr::mutate(
model = model,
predictor_indicators = "traditional",
compute_intercept = TRUE,
remove_intercept = TRUE
) %>%
dplyr::select(model, engine, mode, predictor_indicators,
compute_intercept, remove_intercept)
}
res
}

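The `compute_intercept` behavior documented above is plain base-R `model.matrix()` behavior and can be checked without parsnip. A minimal sketch (the data frame and factor names below are illustrative only, not from the PR):

```r
# With an intercept, model.matrix() encodes k - 1 dummy columns per factor;
# without one, only the *first* factor gets a full set of k indicator columns
# while later factors still get k - 1.
df <- data.frame(
  f1 = factor(c("a", "b", "c")),
  f2 = factor(c("x", "y", "x"))
)

with_intercept    <- model.matrix(~ f1 + f2, df)
without_intercept <- model.matrix(~ f1 + f2 - 1, df)

colnames(with_intercept)     # "(Intercept)" "f1b" "f1c" "f2y"
colnames(without_intercept)  # "f1a" "f1b" "f1c" "f2y"
```

This is why `remove_intercept` is a separate option: dropping the `"(Intercept)"` column after the fact is not the same as fitting `model.matrix()` without one.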
30 changes: 25 additions & 5 deletions R/boost_tree_data.R
Expand Up @@ -91,7 +91,11 @@ set_encoding(
model = "boost_tree",
eng = "xgboost",
mode = "regression",
options = list(predictor_indicators = TRUE)
options = list(
predictor_indicators = "one_hot",
compute_intercept = FALSE,
remove_intercept = TRUE
)
)

set_pred(
Expand Down Expand Up @@ -136,7 +140,11 @@ set_encoding(
model = "boost_tree",
eng = "xgboost",
mode = "classification",
options = list(predictor_indicators = TRUE)
options = list(
predictor_indicators = "one_hot",
compute_intercept = FALSE,
remove_intercept = TRUE
)
)

set_pred(
Expand Down Expand Up @@ -239,7 +247,11 @@ set_encoding(
model = "boost_tree",
eng = "C5.0",
mode = "classification",
options = list(predictor_indicators = FALSE)
options = list(
predictor_indicators = "none",
compute_intercept = FALSE,
remove_intercept = FALSE
)
)

set_pred(
Expand Down Expand Up @@ -369,7 +381,11 @@ set_encoding(
model = "boost_tree",
eng = "spark",
mode = "regression",
options = list(predictor_indicators = TRUE)
options = list(
predictor_indicators = "none",
compute_intercept = FALSE,
remove_intercept = FALSE
)
)

set_fit(
Expand All @@ -389,7 +405,11 @@ set_encoding(
model = "boost_tree",
eng = "spark",
mode = "classification",
options = list(predictor_indicators = TRUE)
options = list(
predictor_indicators = "none",
compute_intercept = FALSE,
remove_intercept = FALSE
)
)

set_pred(
Expand Down
47 changes: 47 additions & 0 deletions R/contr_one_hot.R
@@ -0,0 +1,47 @@
#' Contrast function for one-hot encodings
#'
#' This contrast function produces a model matrix with indicator columns for
#' each level of each factor.
#'
#' @param n A vector of character factor levels or the number of unique levels.
#' @param contrasts This argument is for backwards compatibility and only the
#' default of `TRUE` is supported.
#' @param sparse This argument is for backwards compatibility and only the
#' default of `FALSE` is supported.
#'
#' @includeRmd man/rmd/one-hot.Rmd details
#'
#' @return A diagonal matrix that is `n`-by-`n`.
#'
#' @export
contr_one_hot <- function(n, contrasts = TRUE, sparse = FALSE) {
if (sparse) {
rlang::warn("`sparse = TRUE` not implemented for `contr_one_hot()`.")
}

if (!contrasts) {
rlang::warn("`contrasts = FALSE` not implemented for `contr_one_hot()`.")
}

if (is.character(n)) {
names <- n
n <- length(names)
} else if (is.numeric(n)) {
n <- as.integer(n)

if (length(n) != 1L) {
rlang::abort("`n` must have length 1 when an integer is provided.")
}

names <- as.character(seq_len(n))
} else {
rlang::abort("`n` must be a character vector or an integer of size 1.")
}

out <- diag(n)

rownames(out) <- names
colnames(out) <- names

out
}
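A quick sanity check of the new contrast function: `contr_one_hot()` is reproduced inline here, exactly as added in this diff, so the snippet runs without installing the development version of parsnip (the `rlang` calls are only reached on the backwards-compatibility branches):

```r
# contr_one_hot(), copied from R/contr_one_hot.R above:
contr_one_hot <- function(n, contrasts = TRUE, sparse = FALSE) {
  if (sparse) {
    rlang::warn("`sparse = TRUE` not implemented for `contr_one_hot()`.")
  }
  if (!contrasts) {
    rlang::warn("`contrasts = FALSE` not implemented for `contr_one_hot()`.")
  }
  if (is.character(n)) {
    names <- n
    n <- length(names)
  } else if (is.numeric(n)) {
    n <- as.integer(n)
    if (length(n) != 1L) {
      rlang::abort("`n` must have length 1 when an integer is provided.")
    }
    names <- as.character(seq_len(n))
  } else {
    rlang::abort("`n` must be a character vector or an integer of size 1.")
  }
  out <- diag(n)
  rownames(out) <- names
  colnames(out) <- names
  out
}

# An identity matrix, one indicator column per level -- no baseline is dropped:
m <- contr_one_hot(c("low", "med", "high"))
m
#       low med high
# low     1   0    0
# med     0   1    0
# high    0   0    1
```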
44 changes: 34 additions & 10 deletions R/convert_data.R
Expand Up @@ -20,8 +20,9 @@ convert_form_to_xy_fit <- function(
data,
...,
na.action = na.omit,
indicators = TRUE,
composition = "data.frame"
indicators = "traditional",
composition = "data.frame",
remove_intercept = TRUE
) {
if (!(composition %in% c("data.frame", "matrix")))
rlang::abort("`composition` should be either 'data.frame' or 'matrix'.")
Expand Down Expand Up @@ -72,8 +73,16 @@ convert_form_to_xy_fit <- function(
)
}

if (indicators) {
if (indicators != "none") {
if (indicators == "one_hot") {
old_contr <- options("contrasts")$contrasts
on.exit(options(contrasts = old_contr))
new_contr <- old_contr
new_contr["unordered"] <- "contr_one_hot"
options(contrasts = new_contr)
}
x <- model.matrix(mod_terms, mod_frame, contrasts)

} else {
# this still ignores -vars in formula
x <- model.frame(mod_terms, data)
Expand All @@ -82,14 +91,15 @@ convert_form_to_xy_fit <- function(
x <- x[,-y_cols, drop = FALSE]
}

## TODO maybe an option not to do this?
x <- x[, colnames(x) != "(Intercept)", drop = FALSE]

if (remove_intercept) {
x <- x[, colnames(x) != "(Intercept)", drop = FALSE]
}
options <-
list(
indicators = indicators,
composition = composition,
contrasts = contrasts
contrasts = contrasts,
remove_intercept = remove_intercept
)

if (composition == "data.frame") {
Expand Down Expand Up @@ -165,12 +175,21 @@ convert_form_to_xy_new <- function(object, new_data, na.action = na.pass,
if (!is.null(cl))
.checkMFClasses(cl, new_data)

if(object$options$indicators) {
if(object$options$indicators != "none") {
if (object$options$indicators == "one_hot") {
old_contr <- options("contrasts")$contrasts
on.exit(options(contrasts = old_contr))
new_contr <- old_contr
new_contr["unordered"] <- "contr_one_hot"
options(contrasts = new_contr)
}
new_data <-
model.matrix(mod_terms, new_data, contrasts.arg = object$contrasts)
}

new_data <- new_data[, colnames(new_data) != "(Intercept)", drop = FALSE]
if(object$options$remove_intercept) {
new_data <- new_data[, colnames(new_data) != "(Intercept)", drop = FALSE]
}

if (composition == "data.frame")
new_data <- as.data.frame(new_data)
Expand All @@ -188,10 +207,15 @@ convert_form_to_xy_new <- function(object, new_data, na.action = na.pass,

#' @importFrom dplyr bind_cols
# TODO slots for other roles
convert_xy_to_form_fit <- function(x, y, weights = NULL, y_name = "..y") {
convert_xy_to_form_fit <- function(x, y, weights = NULL, y_name = "..y",
remove_intercept = TRUE) {
if (is.vector(x))
rlang::abort("`x` cannot be a vector.")

if(remove_intercept) {
x <- x[, colnames(x) != "(Intercept)", drop = FALSE]
}

rn <- rownames(x)

if (!is.data.frame(x))
Expand Down
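The `options(contrasts = ...)` swap that `convert_form_to_xy_fit()` performs above can be exercised directly. This is a sketch, not parsnip's code path: it inlines a minimal stand-in for `contr_one_hot()` and restores the option explicitly rather than via `on.exit()`, which only works inside a function:

```r
# Minimal stand-in for parsnip's contr_one_hot(), so the snippet is standalone:
contr_one_hot <- function(n, contrasts = TRUE, sparse = FALSE) {
  if (is.character(n)) {
    nms <- n
    n <- length(nms)
  } else {
    n <- as.integer(n)
    nms <- as.character(seq_len(n))
  }
  out <- diag(n)
  dimnames(out) <- list(nms, nms)
  out
}

df <- data.frame(y = 1:3, f = factor(c("a", "b", "c")))

# Temporarily point the "unordered" contrast at contr_one_hot(), mirroring the
# swap in convert_form_to_xy_fit():
old_contr <- options("contrasts")$contrasts
new_contr <- old_contr
new_contr["unordered"] <- "contr_one_hot"
options(contrasts = new_contr)

x <- model.matrix(y ~ f, df)

options(contrasts = old_contr)  # restore, as on.exit() does in the PR

colnames(x)  # "(Intercept)" "fa" "fb" "fc" -- a full one-hot set of indicators
```

With `remove_intercept = TRUE`, the fit path would then drop the `"(Intercept)"` column before handing `x` to the engine.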