
Commit 154c1ab

juliasilge and topepo authored
Add LiblineaR engine to logistic_reg() (#429)
* Add LiblineaR engine for logistic_reg()
* Update docs, tests, NEWS for LiblineaR logistic_reg()
* Update NEWS
* Add docs about LiblineaR regularizing intercept
* Change test to engine arg of bias
* Update man/rmd/logistic-reg.Rmd

Co-authored-by: Max Kuhn <[email protected]>

* Redocument
* Test logistic_reg for varying() penalty

Co-authored-by: Max Kuhn <[email protected]>
1 parent ce24784 commit 154c1ab

File tree

7 files changed: 374 additions, 16 deletions

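For orientation, here is an editorial sketch (not part of the commit) of what this change enables: a LiblineaR-backed logistic regression specified and used through the usual parsnip interface. The penalty/mixture values are arbitrary, and the `two_class_dat` example data from the modeldata package is an assumption used only for illustration.

```r
library(parsnip)

# Illustrative values: with the LiblineaR engine, mixture must be exactly
# 0 (ridge) or 1 (lasso), and penalty must be strictly positive.
spec <- logistic_reg(penalty = 0.1, mixture = 1) %>%
  set_engine("LiblineaR") %>%
  set_mode("classification")

# Assumes the modeldata package is installed for its example data.
fitted <- fit(spec, Class ~ A + B, data = modeldata::two_class_dat)

predict(fitted, new_data = head(modeldata::two_class_dat))                 # hard classes
predict(fitted, new_data = head(modeldata::two_class_dat), type = "prob")  # class probabilities
```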

DESCRIPTION

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ Imports:
     prettyunits,
     vctrs (>= 0.2.0)
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1.9000
+RoxygenNote: 7.1.1.9001
 Suggests:
     testthat,
     knitr,

NEWS.md

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@

 * The `liquidSVM` engine for `svm_rbf()` was deprecated due to that package's removal from CRAN. (#425)

-* A new linear SVM model `svm_linear()` is now available with the `LiblineaR` engine. (#424)
+* A new linear SVM model `svm_linear()` is now available with the `LiblineaR` engine (#424), and the `LiblineaR` engine is available for `logistic_reg()` as well (#429).

 # parsnip 0.1.5

R/logistic_reg.R

Lines changed: 63 additions & 5 deletions
@@ -20,21 +20,22 @@
 #' @param mode A single character string for the type of model.
 #' The only possible value for this model is "classification".
 #' @param penalty A non-negative number representing the total
-#' amount of regularization (`glmnet`, `keras`, and `spark` only).
+#' amount of regularization (`glmnet`, `LiblineaR`, `keras`, and `spark` only).
 #' For `keras` models, this corresponds to purely L2 regularization
-#' (aka weight decay) while the other models can be a combination
+#' (aka weight decay) while the other models can be either or a combination
 #' of L1 and L2 (depending on the value of `mixture`).
 #' @param mixture A number between zero and one (inclusive) that is the
 #' proportion of L1 regularization (i.e. lasso) in the model. When
 #' `mixture = 1`, it is a pure lasso model while `mixture = 0` indicates that
-#' ridge regression is being used. (`glmnet` and `spark` only).
+#' ridge regression is being used. (`glmnet`, `LiblineaR`, and `spark` only).
+#' For `LiblineaR` models, `mixture` must be exactly 0 or 1 only.
 #' @details
 #' For `logistic_reg()`, the mode will always be "classification".
 #'
 #' The model can be created using the `fit()` function using the
 #' following _engines_:
 #' \itemize{
-#' \item \pkg{R}: `"glm"` (the default) or `"glmnet"`
+#' \item \pkg{R}: `"glm"` (the default), `"glmnet"`, or `"LiblineaR"`
 #' \item \pkg{Stan}: `"stan"`
 #' \item \pkg{Spark}: `"spark"`
 #' \item \pkg{keras}: `"keras"`
@@ -101,7 +102,45 @@ print.logistic_reg <- function(x, ...) {
 }

 #' @export
-translate.logistic_reg <- translate.linear_reg
+translate.logistic_reg <- function(x, engine = x$engine, ...) {
+  x <- translate.default(x, engine, ...)
+
+  # slightly cleaner code using
+  arg_vals <- x$method$fit$args
+  arg_names <- names(arg_vals)
+
+
+  if (engine == "glmnet") {
+    # See discussion in https://github.com/tidymodels/parsnip/issues/195
+    arg_vals$lambda <- NULL
+    # Since the `fit` information is gone for the penalty, we need to have an
+    # evaluated value for the parameter.
+    x$args$penalty <- rlang::eval_tidy(x$args$penalty)
+  }
+
+  if (engine == "LiblineaR") {
+    # convert parameter arguments
+    new_penalty <- rlang::eval_tidy(x$args$penalty)
+    if (is.numeric(new_penalty))
+      arg_vals$cost <- rlang::new_quosure(1 / new_penalty, env = rlang::empty_env())
+
+    if (any(arg_names == "type")) {
+      if (is.numeric(quo_get_expr(arg_vals$type)))
+        if (quo_get_expr(x$args$mixture) == 0) {
+          arg_vals$type <- 0 ## ridge
+        } else if (quo_get_expr(x$args$mixture) == 1) {
+          arg_vals$type <- 6 ## lasso
+        } else {
+          rlang::abort("For the LiblineaR engine, mixture must be 0 or 1.")
+        }
+    }
+
+  }
+
+  x$method$fit$args <- arg_vals
+
+  x
+}

 # ------------------------------------------------------------------------------
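In this translation, parsnip's `penalty` becomes LiblineaR's `cost` via `cost = 1/penalty`, and `mixture` selects LiblineaR's `type` (0 for ridge, 6 for lasso). A rough sketch of that arithmetic with illustrative values (editorial example, not part of the diff):

```r
# Illustrative values only
penalty <- 0.5
cost <- 1 / penalty   # 2: LiblineaR's `cost` is the inverse of parsnip's `penalty`

mixture <- 0          # must be exactly 0 or 1 for this engine
type <- if (mixture == 0) 0 else if (mixture == 1) 6 else
  stop("For the LiblineaR engine, mixture must be 0 or 1.")
type                  # here 0 (L2-regularized/ridge); mixture = 1 would give type 6 (L1/lasso)
```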

@@ -169,6 +208,16 @@ check_args.logistic_reg <- function(object) {
   if (is.numeric(args$mixture) && length(args$mixture) > 1)
     rlang::abort("Only one value of `mixture` is allowed.")

+  if (object$engine == "LiblineaR") {
+    if(is.numeric(args$mixture) && !args$mixture %in% 0:1)
+      rlang::abort(c("For the LiblineaR engine, mixture must be 0 or 1.",
+                     "Choose a pure ridge model with `mixture = 0`.",
+                     "Choose a pure lasso model with `mixture = 1`.",
+                     "The Liblinear engine does not support other values."))
+    if(all(is.numeric(args$penalty)) && !all(args$penalty > 0))
+      rlang::abort("For the LiblineaR engine, penalty must be > 0.")
+  }
+
   invisible(object)
 }
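With these checks in place, invalid LiblineaR specifications fail early. A hedged sketch of the expected behaviour (editorial example; the messages are taken from the aborts above, not from a captured run):

```r
library(parsnip)

# Expected to abort with "For the LiblineaR engine, mixture must be 0 or 1."
bad_mixture <- logistic_reg(penalty = 0.1, mixture = 0.5) %>%
  set_engine("LiblineaR")

# Expected to abort with "For the LiblineaR engine, penalty must be > 0."
bad_penalty <- logistic_reg(penalty = 0, mixture = 0) %>%
  set_engine("LiblineaR")

# The aborts are raised when the model is fitted (check_args() runs then), e.g.:
# try(fit(bad_mixture, Class ~ A + B, data = modeldata::two_class_dat))
```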

@@ -346,3 +395,12 @@ predict_raw._lognet <- function(object, new_data, opts = list(), ...) {
   predict_raw.model_fit(object, new_data = new_data, opts = opts, ...)
 }

+# ------------------------------------------------------------------------------
+
+liblinear_preds <- function(results, object) {
+  results$predictions
+}
+
+liblinear_probs <- function(results, object) {
+  as_tibble(results$probabilities)
+}
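These helpers only extract the relevant pieces of what `LiblineaR`'s `predict()` method returns: a list with a `predictions` element and, when `proba = TRUE`, a `probabilities` matrix. A stand-in sketch of that post-processing (editorial illustration with made-up values):

```r
library(tibble)

# `res` mimics the shape of LiblineaR's predict() output
res <- list(
  predictions   = factor(c("Class1", "Class2")),
  probabilities = matrix(c(0.8, 0.2, 0.3, 0.7), nrow = 2, byrow = TRUE,
                         dimnames = list(NULL, c("Class1", "Class2")))
)

res$predictions               # what liblinear_preds() pulls out (hard classes)
as_tibble(res$probabilities)  # what liblinear_probs() returns (probability tibble)
```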

R/logistic_reg_data.R

Lines changed: 98 additions & 0 deletions
@@ -233,6 +233,104 @@ set_pred(

 # ------------------------------------------------------------------------------

+set_model_engine("logistic_reg", "classification", "LiblineaR")
+set_dependency("logistic_reg", "LiblineaR", "LiblineaR")
+
+set_fit(
+  model = "logistic_reg",
+  eng = "LiblineaR",
+  mode = "classification",
+  value = list(
+    interface = "matrix",
+    protect = c("x", "y", "wi"),
+    data = c(x = "data", y = "target"),
+    func = c(pkg = "LiblineaR", fun = "LiblineaR"),
+    defaults = list(verbose = FALSE)
+  )
+)
+
+set_encoding(
+  model = "logistic_reg",
+  eng = "LiblineaR",
+  mode = "classification",
+  options = list(
+    predictor_indicators = "none",
+    compute_intercept = FALSE,
+    remove_intercept = FALSE,
+    allow_sparse_x = FALSE
+  )
+)
+
+set_model_arg(
+  model = "logistic_reg",
+  eng = "LiblineaR",
+  parsnip = "penalty",
+  original = "cost",
+  func = list(pkg = "dials", fun = "penalty"),
+  has_submodel = TRUE
+)
+
+set_model_arg(
+  model = "logistic_reg",
+  eng = "LiblineaR",
+  parsnip = "mixture",
+  original = "type",
+  func = list(pkg = "dials", fun = "mixture"),
+  has_submodel = FALSE
+)
+
+set_pred(
+  model = "logistic_reg",
+  eng = "LiblineaR",
+  mode = "classification",
+  type = "class",
+  value = list(
+    pre = NULL,
+    post = liblinear_preds,
+    func = c(fun = "predict"),
+    args =
+      list(
+        object = quote(object$fit),
+        newx = expr(as.matrix(new_data))
+      )
+  )
+)
+
+set_pred(
+  model = "logistic_reg",
+  eng = "LiblineaR",
+  mode = "classification",
+  type = "prob",
+  value = list(
+    pre = NULL,
+    post = liblinear_probs,
+    func = c(fun = "predict"),
+    args =
+      list(
+        object = quote(object$fit),
+        newx = expr(as.matrix(new_data)),
+        proba = TRUE
+      )
+  )
+)
+
+set_pred(
+  model = "logistic_reg",
+  eng = "LiblineaR",
+  mode = "classification",
+  type = "raw",
+  value = list(
+    pre = NULL,
+    post = NULL,
+    func = c(fun = "predict"),
+    args = list(
+      object = quote(object$fit),
+      newx = quote(new_data))
+  )
+)
+
+# ------------------------------------------------------------------------------
+
 set_model_engine("logistic_reg", "classification", "spark")
 set_dependency("logistic_reg", "spark", "sparklyr")

man/logistic_reg.Rd

Lines changed: 32 additions & 8 deletions
(Generated file; diff not rendered here.)

man/rmd/logistic-reg.Rmd

Lines changed: 17 additions & 1 deletion
@@ -33,6 +33,22 @@ multiple penalties, the `multi_predict()` function can be used. It returns a
 tibble with a list column called `.pred` that contains a tibble with all of the
 penalty results.

+## LiblineaR
+
+```{r liblinear-reg}
+logistic_reg() %>%
+  set_engine("LiblineaR") %>%
+  set_mode("classification") %>%
+  translate()
+```
+
+For `LiblineaR` models, the value for `mixture` can either be 0 (for ridge) or 1
+(for lasso) but not other intermediate values. In the `LiblineaR` documentation,
+these correspond to types 0 (L2-regularized) and 6 (L1-regularized).
+
+Be aware that the `LiblineaR` engine regularizes the intercept. Other
+regularized regression models do not, which will result in different parameter estimates.
+
 ## stan

 ```{r stan-reg}
@@ -81,11 +97,11 @@ get_defaults_logistic_reg <- function() {
   tibble::tribble(
     ~model, ~engine, ~parsnip, ~original, ~default,
     "logistic_reg", "glmnet",    "mixture", "alpha",             get_arg("glmnet", "glmnet", "alpha"),
+    "logistic_reg", "LiblineaR", "mixture", "type",              "0",
     "logistic_reg", "spark",     "penalty", "reg_param",         get_arg("sparklyr", "ml_logistic_regression", "reg_param"),
     "logistic_reg", "spark",     "mixture", "elastic_net_param", get_arg("sparklyr", "ml_logistic_regression", "elastic_net_param"),
     "logistic_reg", "keras",     "penalty", "penalty",           get_arg("parsnip", "keras_mlp", "penalty"),
   )
 }
 convert_args("logistic_reg")
 ```
-
