tidymodels
diff --git a/‎.github/workflows/R-CMD-check.yaml
Lines changed: 19 additions & 8 deletions b/‎.github/workflows/R-CMD-check.yaml
Lines changed: 19 additions & 8 deletions
diff --git a/‎.github/workflows/test-coverage.yaml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/test-coverage.yaml
Lines changed: 2 additions & 0 deletions
diff --git a/‎NAMESPACE
Lines changed: 1 addition & 0 deletions b/‎NAMESPACE
Lines changed: 1 addition & 0 deletions
diff --git a/‎NEWS.md
Lines changed: 8 additions & 2 deletions b/‎NEWS.md
Lines changed: 8 additions & 2 deletions
diff --git a/‎R/aaa.R
Lines changed: 2 additions & 1 deletion b/‎R/aaa.R
Lines changed: 2 additions & 1 deletion
diff --git a/‎R/aaa_models.R
Lines changed: 55 additions & 10 deletions b/‎R/aaa_models.R
Lines changed: 55 additions & 10 deletions
diff --git a/‎R/boost_tree.R
Lines changed: 2 additions & 2 deletions b/‎R/boost_tree.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/boost_tree_data.R
Lines changed: 25 additions & 5 deletions b/‎R/boost_tree_data.R
Lines changed: 25 additions & 5 deletions
diff --git a/‎R/contr_one_hot.R
Lines changed: 47 additions & 0 deletions b/‎R/contr_one_hot.R
Lines changed: 47 additions & 0 deletions
@@ -15,11 +15,13 @@ jobs:
       fail-fast: false
       matrix:
         config:
-        - { os: windows-latest, r: '3.6'}
-        - { os: windows-latest, r: '4.0'}
-        - { os: windows-latest, r: 'devel'}
-        - { os: ubuntu-16.04, r: '3.5', cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
-        - { os: ubuntu-16.04, r: '3.6', cran: "https://demo.rstudiopm.com/all/__linux__/xenial/latest"}
+          - {os: macOS-latest,   r: 'devel'}
+          - {os: macOS-latest,   r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: windows-latest, r: '3.6'}
+          - {os: ubuntu-16.04,   r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
+          - {os: ubuntu-16.04,   r: 'oldrel',  rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
+          - {os: ubuntu-16.04,   r: '3.5',     rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"}
 
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
@@ -65,11 +67,20 @@ jobs:
           remotes::install_cran("rcmdcheck")
         shell: Rscript {0}
 
-      - name: Install TensorFlow
+      - name: Install Miniconda
         run: |
+          Rscript -e "remotes::install_github('rstudio/reticulate')"
           Rscript -e "reticulate::install_miniconda()"
-          Rscript -e "reticulate::conda_create('r-reticulate', packages = 'python==3.6.9')"
-          Rscript -e "tensorflow::install_tensorflow(version='1.14.0')"
+
+      - name: Find Miniconda on macOS
+        if: runner.os == 'macOS'
+        run: echo "options(reticulate.conda_binary = reticulate:::miniconda_conda())" >> .Rprofile
+
+      - name: Install TensorFlow
+        run: |
+          reticulate::conda_create('r-reticulate', packages = c('python==3.6.9'))
+          tensorflow::install_tensorflow(version='1.14.0')
+        shell: Rscript {0}
 
       - name: Session info
         run: |
 
@@ -43,7 +43,9 @@ jobs:
 
       - name: Install TensorFlow
         run: |
+          Rscript -e "remotes::install_github('rstudio/reticulate')"
           Rscript -e "reticulate::install_miniconda()"
+          echo "options(reticulate.conda_binary = reticulate:::miniconda_conda())" >> .Rprofile
           Rscript -e "reticulate::conda_create('r-reticulate', packages = 'python==3.6.9')"
           Rscript -e "tensorflow::install_tensorflow(version='1.14.0')"
 
 
@@ -101,6 +101,7 @@ export(add_rowindex)
 export(boost_tree)
 export(check_empty_ellipse)
 export(check_final_param)
+export(contr_one_hot)
 export(control_parsnip)
 export(convert_stan_interval)
 export(decision_tree)
 
@@ -1,14 +1,20 @@
 # parsnip (development version)
 
+## Breaking Changes
+
+ * `parsnip` now has options to set specific types of predictor encodings for different models. For example, `ranger` models run using `parsnip` and `workflows` do the same thing by _not_ creating indicator variables. These encodings can be overridden using the `blueprint` options in `workflows`. As a consequence, it is possible to get a different model fit than previous versions of `parsnip`. More details about specific encoding changes are below. (#326)
+
 ## Other Changes
 
  * `tidyr` >= 1.0.0 is now required. 
 
- * SVM models produced by `kernlab` now use the formula method. This change was due to how `ksvm()` made indicator variables for factor predictors (with one-hot encodings). Since the ordinary formula method did not do this, the data are passed as-is to `ksvm()` so that the results are closer to what one would get if `ksmv()` were called directly. 
+ * SVM models produced by `kernlab` now use the formula method (see breaking change notice above). This change was due to how `ksvm()` made indicator variables for factor predictors (with one-hot encodings). Since the ordinary formula method did not do this, the data are passed as-is to `ksvm()` so that the results are closer to what one would get if `ksvm()` were called directly. 
 
  * MARS models produced by `earth` now use the formula method. 
 
- * Under-the-hood changes were made so that non-standard data arguments in the modeling packages can be accomodated. (#315)
+ * For `xgboost`, a one-hot encoding is used when indicator variables are created. 
+ 
+ * Under-the-hood changes were made so that non-standard data arguments in the modeling packages can be accommodated. (#315)
 
 ## New Features
 
 
@@ -39,7 +39,8 @@ utils::globalVariables(
     'lab', 'original', 'predicted_label', 'prediction', 'value', 'type',
     "neighbors", ".submodels", "has_submodel", "max_neighbor", "max_penalty",
     "max_terms", "max_tree", "model", "name", "num_terms", "penalty", "trees",
-    "sub_neighbors", ".pred_class", "x", "y", "predictor_indicators")
+    "sub_neighbors", ".pred_class", "x", "y", "predictor_indicators",
+    "compute_intercept", "remove_intercept")
 )
 
 # nocov end
@@ -323,11 +323,8 @@ check_interface_val <- function(x) {
 #'  below, depending on context.
 #' @param pre,post Optional functions for pre- and post-processing of prediction
 #'  results.
-#' @param options A list of options for engine-specific encodings. Currently,
-#' the option implemented is `predictor_indicators` which tells `parsnip`
-#' whether the pre-processing should make indicator/dummy variables from factor
-#' predictors. This only affects cases when [fit.model_spec()] is used and the
-#' underlying model has an x/y interface.
+#' @param options A list of options for engine-specific preprocessing encodings.
+#'  See Details below.
 #' @param ... Optional arguments that should be passed into the `args` slot for
 #'  prediction objects.
 #' @keywords internal
@@ -347,6 +344,36 @@ check_interface_val <- function(x) {
 #'  already been registered. `check_model_doesnt_exist()` checks the model value
 #'  and also checks to see if it is novel in the environment.
 #'
+#'  The options for engine-specific encodings dictate how the predictors should be
+#'  handled. These options ensure that the data
+#'  that `parsnip` gives to the underlying model allows for a model fit that is
+#'  as similar as possible to what it would have produced directly.
+#'
+#'  For example, if `fit()` is used to fit a model that does not have
+#'  a formula interface, typically some predictor preprocessing must
+#'  be conducted. `glmnet` is a good example of this.
+#'
+#'   There are three options that can be used for the encodings:
+#'
+#'  `predictor_indicators` describes whether and how to create indicator/dummy
+#'  variables from factor predictors. There are three options: `"none"` (do not
+#'  expand factor predictors), `"traditional"` (apply the standard
+#'  `model.matrix()` encodings), and `"one_hot"` (create the complete set
+#'  including the baseline level for all factors). This encoding only affects
+#'  cases when [fit.model_spec()] is used and the underlying model has an x/y
+#'  interface.
+#'
+#' Another option is `compute_intercept`; this controls whether `model.matrix()`
+#'  should include the intercept in its formula. This affects more than the
+#'  inclusion of an intercept column. With an intercept, `model.matrix()`
+#'  computes dummy variables for all but one of the factor levels. Without an
+#'  intercept, `model.matrix()` computes a full set of indicators for the
+#'  _first_ factor variable, but an incomplete set for the remainder.
+#'
+#'  Finally, the option `remove_intercept` will remove the intercept column
+#'  _after_ `model.matrix()` is finished. This can be useful if the model
+#'  function (e.g. `lm()`) automatically generates an intercept.
+#'
 #' @references "Making a parsnip model from scratch"
 #'  \url{https://tidymodels.github.io/parsnip/articles/articles/Scratch.html}
 #' @examples
@@ -791,7 +818,9 @@ check_encodings <- function(x) {
   if (!is.list(x)) {
     rlang::abort("`values` should be a list.")
   }
-  req_args <- list(predictor_indicators = TRUE)
+  req_args <- list(predictor_indicators = rlang::na_chr,
+                   compute_intercept = rlang::na_lgl,
+                   remove_intercept = rlang::na_lgl)
 
   missing_args <- setdiff(names(req_args), names(x))
   if (length(missing_args) > 0) {
@@ -834,9 +863,12 @@ set_encoding <- function(model, mode, eng, options) {
     current <- get_from_env(nm)
     dup_check <-
       current %>%
-      dplyr::inner_join(new_values, by = c("model", "engine", "mode", "predictor_indicators"))
+      dplyr::inner_join(
+        new_values,
+        by = c("model", "engine", "mode", "predictor_indicators")
+      )
     if (nrow(dup_check)) {
-      rlang::abort(glue::glue("Engine '{eng}' and mode '{mode}' already have defined encodings."))
+      rlang::abort(glue::glue("Engine '{eng}' and mode '{mode}' already have defined encodings for model '{model}'."))
     }
 
   } else {
@@ -856,6 +888,19 @@ set_encoding <- function(model, mode, eng, options) {
 get_encoding <- function(model) {
   check_model_exists(model)
   nm <- paste0(model, "_encoding")
-  rlang::env_get(get_model_env(), nm)
+  res <- try(get_from_env(nm), silent = TRUE)
+  if (inherits(res, "try-error")) {
+    # for objects made before encodings were specified in parsnip
+    res <-
+      get_from_env(model) %>%
+      dplyr::mutate(
+        model = model,
+        predictor_indicators = "traditional",
+        compute_intercept = TRUE,
+        remove_intercept = TRUE
+      ) %>%
+      dplyr::select(model, engine, mode, predictor_indicators,
+                    compute_intercept, remove_intercept)
+  }
+  res
 }
-
 
@@ -301,7 +301,7 @@ xgb_train <- function(
 
 
   if (is.numeric(y)) {
-    loss <- "reg:linear"
+    loss <- "reg:squarederror"
   } else {
     lvl <- levels(y)
     y <- as.numeric(y) - 1
@@ -399,7 +399,7 @@ xgb_pred <- function(object, newdata, ...) {
 
   x = switch(
     object$params$objective,
-    "reg:linear" = , "reg:logistic" = , "binary:logistic" = res,
+    "reg:squarederror" = , "reg:logistic" = , "binary:logistic" = res,
     "binary:logitraw" = stats::binomial()$linkinv(res),
     "multi:softprob" = matrix(res, ncol = object$params$num_class, byrow = TRUE),
     res
 
@@ -91,7 +91,11 @@ set_encoding(
   model = "boost_tree",
   eng = "xgboost",
   mode = "regression",
-  options = list(predictor_indicators = TRUE)
+  options = list(
+    predictor_indicators = "one_hot",
+    compute_intercept = FALSE,
+    remove_intercept = TRUE
+  )
 )
 
 set_pred(
@@ -136,7 +140,11 @@ set_encoding(
   model = "boost_tree",
   eng = "xgboost",
   mode = "classification",
-  options = list(predictor_indicators = TRUE)
+  options = list(
+    predictor_indicators = "one_hot",
+    compute_intercept = FALSE,
+    remove_intercept = TRUE
+  )
 )
 
 set_pred(
@@ -239,7 +247,11 @@ set_encoding(
   model = "boost_tree",
   eng = "C5.0",
   mode = "classification",
-  options = list(predictor_indicators = FALSE)
+  options = list(
+    predictor_indicators = "none",
+    compute_intercept = FALSE,
+    remove_intercept = FALSE
+  )
 )
 
 set_pred(
@@ -369,7 +381,11 @@ set_encoding(
   model = "boost_tree",
   eng = "spark",
   mode = "regression",
-  options = list(predictor_indicators = TRUE)
+  options = list(
+    predictor_indicators = "none",
+    compute_intercept = FALSE,
+    remove_intercept = FALSE
+  )
 )
 
 set_fit(
@@ -389,7 +405,11 @@ set_encoding(
   model = "boost_tree",
   eng = "spark",
   mode = "classification",
-  options = list(predictor_indicators = TRUE)
+  options = list(
+    predictor_indicators = "none",
+    compute_intercept = FALSE,
+    remove_intercept = FALSE
+  )
 )
 
 set_pred(
 
@@ -0,0 +1,47 @@
+#' Contrast function for one-hot encodings
+#'
+#' Builds the contrast matrix for a one-hot encoding: one indicator column for
+#' every factor level, with no level dropped as a baseline.
+#'
+#' @param n A character vector of factor levels, or a single number of levels.
+#' @param contrasts Kept for backwards compatibility; only the default of
+#'   `TRUE` is supported (`FALSE` triggers a warning and is otherwise ignored).
+#' @param sparse Kept for backwards compatibility; only the default of
+#'   `FALSE` is supported (`TRUE` triggers a warning and is otherwise ignored).
+#'
+#' @includeRmd man/rmd/one-hot.Rmd details
+#'
+#' @return An `n`-by-`n` identity matrix named by level (or `1` to `n`).
+#'
+#' @export
+contr_one_hot <- function(n, contrasts = TRUE, sparse = FALSE) {
+  if (sparse) {
+    rlang::warn("`sparse = TRUE` not implemented for `contr_one_hot()`.")
+  }
+
+  if (!contrasts) {
+    rlang::warn("`contrasts = FALSE` not implemented for `contr_one_hot()`.")
+  }
+
+  if (is.character(n)) {
+    names <- n
+    n <- length(names)
+  } else if (is.numeric(n)) {
+    n <- as.integer(n)
+
+    if (length(n) != 1L) {
+      rlang::abort("`n` must have length 1 when an integer is provided.")
+    }
+
+    names <- as.character(seq_len(n))  # no levels given: label them "1".."n"
+  } else {
+    rlang::abort("`n` must be a character vector or an integer of size 1.")
+  }
+
+  out <- diag(n)  # identity matrix: one indicator column per level
+
+  rownames(out) <- names
+  colnames(out) <- names
+
+  out
+}
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,8 @@ utils::globalVariables(`
`39`	`39`	`'lab', 'original', 'predicted_label', 'prediction', 'value', 'type',`
`40`	`40`	`"neighbors", ".submodels", "has_submodel", "max_neighbor", "max_penalty",`
`41`	`41`	`"max_terms", "max_tree", "model", "name", "num_terms", "penalty", "trees",`
`42`		`- "sub_neighbors", ".pred_class", "x", "y", "predictor_indicators")`
	`42`	`+ "sub_neighbors", ".pred_class", "x", "y", "predictor_indicators",`
	`43`	`+ "compute_intercept", "remove_intercept")`
`43`	`44`	`)`
`44`	`45`
`45`	`46`	`# nocov end`