Skip to content

Allow sparse tibbles in fit() and fit_xy() #1165

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

* `fit_xy()` can now take dgCMatrix input for `x` argument (#1121).

* `fit()` and `fit_xy()` can now take sparse tibbles as data values (#1165).

* Transitioned package errors and warnings to use cli (#1147 and #1148 by
@shum461, #1153 by @RobLBaker and @wright13, #1154 by @JamesHWade, #1160,
#1161, #1081).
Expand Down
1 change: 1 addition & 0 deletions R/arguments.R
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ make_xy_call <- function(object, target, env) {
none = rlang::expr(x),
data.frame = rlang::expr(maybe_data_frame(x)),
matrix = rlang::expr(maybe_matrix(x)),
dgCMatrix = rlang::expr(maybe_sparse_matrix(x)),
cli::cli_abort("Invalid data type target: {target}.")
)
if (uses_weights) {
Expand Down
23 changes: 20 additions & 3 deletions R/convert_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,10 @@
indicators = "traditional",
composition = "data.frame",
remove_intercept = TRUE) {
if (!(composition %in% c("data.frame", "matrix"))) {
if (!(composition %in% c("data.frame", "matrix", "dgCMatrix"))) {
cli::cli_abort(
"{.arg composition} should be either {.val data.frame} or {.val matrix}."
"{.arg composition} should be either {.val data.frame}, {.val matrix}, or
{.val dgCMatrix}."
)
}

Expand Down Expand Up @@ -122,6 +123,18 @@
xlevels = .getXlevels(mod_terms, mod_frame),
options = options
)
} else if (composition == "dgCMatrix") {
x <- sparsevctrs::coerce_to_sparse_matrix(data)
res <-
list(
x = x,
y = y,
weights = w,
offset = offset,
terms = mod_terms,
xlevels = .getXlevels(mod_terms, mod_frame),
options = options
)
} else {
# Since a matrix is requested, try to convert y but check
# to see if it is possible
Expand Down Expand Up @@ -389,7 +402,11 @@ maybe_matrix <- function(x) {
}

maybe_sparse_matrix <- function(x) {
if (any(vapply(x, sparsevctrs::is_sparse_vector, logical(1)))) {
if (methods::is(x, "sparseMatrix")) {
return(x)
}

if (is_sparse_tibble(x)) {
res <- sparsevctrs::coerce_to_sparse_matrix(x)
} else {
res <- as.matrix(x)
Expand Down
2 changes: 2 additions & 0 deletions R/fit.R
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ fit.model_spec <-
eval_env$formula <- formula
eval_env$weights <- wts

data <- materialize_sparse_tibble(data, object, "data")

fit_interface <-
check_interface(eval_env$formula, eval_env$data, cl, object)

Expand Down
5 changes: 5 additions & 0 deletions R/fit_helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ form_xy <- function(object, control, env,

indicators <- encoding_info %>% dplyr::pull(predictor_indicators)
remove_intercept <- encoding_info %>% dplyr::pull(remove_intercept)
allow_sparse_x <- encoding_info %>% dplyr::pull(allow_sparse_x)

if (allow_sparse_x && is_sparse_tibble(env$data)) {
target <- "dgCMatrix"
}

data_obj <- .convert_form_to_xy_fit(
formula = env$formula,
Expand Down
21 changes: 21 additions & 0 deletions R/sparsevctrs.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,27 @@ to_sparse_data_frame <- function(x, object) {
"{.arg x} is a sparse matrix, but {.fn {class(object)[1]}} with
engine {.code {object$engine}} doesn't accept that.")
}
} else if (is.data.frame(x)) {
x <- materialize_sparse_tibble(x, object, "x")
}
x
}

# Report whether `x` holds at least one sparse column.
#
# Every element of `x` is checked with sparsevctrs::is_sparse_vector();
# the result is a single logical that is TRUE as soon as any element is
# flagged as sparse (and FALSE when `x` has no elements).
is_sparse_tibble <- function(x) {
  sparse_flags <- vapply(
    X = x,
    FUN = sparsevctrs::is_sparse_vector,
    FUN.VALUE = logical(1)
  )
  any(sparse_flags)
}

# Convert the sparse columns of `x` to regular (dense) vectors when the
# model specification cannot work with sparse data.
#
# This logic is kept in its own helper because it is needed in more than
# one place (for the `data` argument of fit() and the `x` argument of
# fit_xy()).
#
# Arguments:
#   x      - a data frame / tibble that may contain sparse columns.
#   object - the model specification; passed to allow_sparse() and used to
#            name the model function and engine in the warning.
#   input  - name of the user-facing argument (e.g. "data" or "x"), used
#            only in the warning message.
#
# Returns `x`. If `object` does not allow sparse data and `x` is a sparse
# tibble, a warning is raised and every column is materialized first;
# otherwise `x` is returned untouched.
materialize_sparse_tibble <- function(x, object, input) {
  if ((!allow_sparse(object)) && is_sparse_tibble(x)) {
    cli::cli_warn(
      "{.arg {input}} is a sparse tibble, but {.fn {class(object)[1]}} with
      engine {.code {object$engine}} doesn't accept that. Converting to
      non-sparse."
    )
    # seq_len(), not seq_along(): ncol(x) is a single number, so
    # seq_along(ncol(x)) would always be just `1` and only the first
    # column would be converted.
    for (i in seq_len(ncol(x))) {
      # materialize with []
      x[[i]] <- x[[i]][]
    }
  }
  x
}
16 changes: 16 additions & 0 deletions tests/testthat/_snaps/sparsevctrs.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# sparse tibble can be passed to `fit()

Code
lm_fit <- fit(spec, avg_price_per_room ~ ., data = hotel_data[1:100, ])
Condition
Warning:
`data` is a sparse tibble, but `linear_reg()` with engine `lm` doesn't accept that. Converting to non-sparse.

# sparse tibble can be passed to `fit_xy()

Code
lm_fit <- fit_xy(spec, x = hotel_data[1:100, -1], y = hotel_data[1:100, 1])
Condition
Warning:
`x` is a sparse tibble, but `linear_reg()` with engine `lm` doesn't accept that. Converting to non-sparse.

# sparse matrices can be passed to `fit_xy()

Code
Expand Down
48 changes: 47 additions & 1 deletion tests/testthat/test-sparsevctrs.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,49 @@
# fit() with a sparse tibble as `data`: an engine that is expected to take
# the data without complaint (xgboost) and one whose recorded snapshot shows
# a "Converting to non-sparse" warning (lm).
test_that("sparse tibble can be passed to `fit()", {
skip_if_not_installed("xgboost")

# Build the fixture and turn it into a tibble with sparse columns.
hotel_data <- sparse_hotel_rates()
hotel_data <- sparsevctrs::coerce_to_sparse_tibble(hotel_data)

spec <- boost_tree() %>%
set_mode("regression") %>%
set_engine("xgboost")

# With the xgboost engine the fit must complete without an error.
expect_no_error(
lm_fit <- fit(spec, avg_price_per_room ~ ., data = hotel_data)
)

spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")

# With the lm engine the snapshot captures the warning raised for `data`.
# Only the first 100 rows are used for this fit.
expect_snapshot(
lm_fit <- fit(spec, avg_price_per_room ~ ., data = hotel_data[1:100, ])
)
})

# fit_xy() with a sparse tibble as `x`: an engine that is expected to take
# the data without complaint (xgboost) and one whose recorded snapshot shows
# a "Converting to non-sparse" warning (lm).
test_that("sparse tibble can be passed to `fit_xy()", {
skip_if_not_installed("xgboost")

# Build the fixture and turn it into a tibble with sparse columns.
hotel_data <- sparse_hotel_rates()
hotel_data <- sparsevctrs::coerce_to_sparse_tibble(hotel_data)

spec <- boost_tree() %>%
set_mode("regression") %>%
set_engine("xgboost")

# The first column is passed as the outcome, the remaining columns as
# predictors; with the xgboost engine the fit must complete without an error.
expect_no_error(
lm_fit <- fit_xy(spec, x = hotel_data[, -1], y = hotel_data[, 1])
)

spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")

# With the lm engine the snapshot captures the warning raised for `x`.
# Only the first 100 rows are used for this fit.
expect_snapshot(
lm_fit <- fit_xy(spec, x = hotel_data[1:100, -1], y = hotel_data[1:100, 1])
)
})

test_that("sparse matrices can be passed to `fit_xy()", {
skip_if_not_installed("xgboost")

Expand Down Expand Up @@ -66,7 +112,7 @@ test_that("maybe_sparse_matrix() is used correctly", {

local_mocked_bindings(
maybe_sparse_matrix = function(x) {
if (any(vapply(x, sparsevctrs::is_sparse_vector, logical(1)))) {
if (is_sparse_tibble(x)) {
stop("sparse vectors detected")
} else {
stop("no sparse vectors detected")
Expand Down
Loading