Skip to content

Commit e942833

Browse files
authored
Weighted eCDF (#5119)
* Add weighted ecdf * Update docs * Add tests for weighted ecdf * Add NEWS.md bullet * Fix typo * Don't fallback to `ecdf()` * Add clarification in docs * use wecdf * cleanup news
1 parent 9202a47 commit e942833

File tree

4 files changed

+164
-5
lines changed

4 files changed

+164
-5
lines changed

NEWS.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# ggplot2 (development version)
22

3-
* A function can be provided to `labs(alt = <...>)` that takes the plot as input
4-
and returns text as output (@teunbrand, #4795).
3+
* `stat_ecdf()` now has an optional `weight` aesthetic (@teunbrand, #5058).
54
* Position scales combined with `coord_sf()` can now use functions in the
65
`breaks` argument. In addition, `n.breaks` works as intended and
76
`breaks = NULL` removes grid lines and axes (@teunbrand, #4622).

R/stat-ecdf.R

Lines changed: 81 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
#' and one of them must be unused. The ECDF will be calculated on the given aesthetic
1313
#' and will be output on the unused one.
1414
#'
15+
#' If the `weight` aesthetic is provided, a weighted ECDF will be computed. In
16+
#' this case, the ECDF is incremented by `weight / sum(weight)` instead of
17+
#' `1 / length(x)` for each observation.
18+
#'
1519
#' @inheritParams layer
1620
#' @inheritParams geom_point
1721
#' @param na.rm If `FALSE` (the default), removes missing values with
@@ -20,10 +24,16 @@
2024
#' of points to interpolate with.
2125
#' @param pad If `TRUE`, pad the ecdf with additional points (-Inf, 0)
2226
#' and (Inf, 1)
27+
#' @eval rd_aesthetics("stat", "ecdf")
2328
#' @eval rd_computed_vars(
2429
#' ecdf = "Cumulative density corresponding to `x`.",
2530
#' y = "`r lifecycle::badge('superseded')` For backward compatibility."
2631
#' )
32+
#' @section Dropped variables:
33+
#' \describe{
34+
#' \item{weight}{After calculation, weights of individual observations (if
35+
#' supplied), are no longer available.}
36+
#' }
2737
#' @export
2838
#' @examples
2939
#' set.seed(1)
@@ -41,6 +51,17 @@
4151
#' # Multiple ECDFs
4252
#' ggplot(df, aes(x, colour = g)) +
4353
#' stat_ecdf()
54+
#'
55+
#' # Using weighted eCDF
56+
#' weighted <- data.frame(x = 1:10, weights = c(1:5, 5:1))
57+
#' plain <- data.frame(x = rep(weighted$x, weighted$weights))
58+
#'
59+
#' ggplot(plain, aes(x)) +
60+
#' stat_ecdf(linewidth = 1) +
61+
#' stat_ecdf(
62+
#' aes(weight = weights),
63+
#' data = weighted, colour = "green"
64+
#' )
4465
stat_ecdf <- function(mapping = NULL, data = NULL,
4566
geom = "step", position = "identity",
4667
...,
@@ -74,7 +95,7 @@ stat_ecdf <- function(mapping = NULL, data = NULL,
7495
StatEcdf <- ggproto("StatEcdf", Stat,
7596
required_aes = c("x|y"),
7697

77-
default_aes = aes(x = after_stat(ecdf), y = after_stat(ecdf)),
98+
default_aes = aes(x = after_stat(ecdf), y = after_stat(ecdf), weight = NULL),
7899

79100
setup_params = function(self, data, params) {
80101
params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE)
@@ -100,7 +121,7 @@ StatEcdf <- ggproto("StatEcdf", Stat,
100121
if (pad) {
101122
x <- c(-Inf, x, Inf)
102123
}
103-
data_ecdf <- stats::ecdf(data$x)(x)
124+
data_ecdf <- wecdf(data$x, data$weight)(x)
104125

105126
df_ecdf <- data_frame0(
106127
x = x,
@@ -110,6 +131,63 @@ StatEcdf <- ggproto("StatEcdf", Stat,
110131
)
111132
df_ecdf$flipped_aes <- flipped_aes
112133
flip_data(df_ecdf, flipped_aes)
113-
}
134+
},
135+
136+
dropped_aes = "weight"
114137
)
115138

139+
# Weighted eCDF function
140+
wecdf <- function(x, weights = NULL) {
141+
142+
weights <- weights %||% 1
143+
weights <- vec_recycle(weights, length(x))
144+
145+
# Sort vectors
146+
ord <- order(x, na.last = NA)
147+
x <- x[ord]
148+
weights <- weights[ord]
149+
150+
if (any(!is.finite(weights))) {
151+
cli::cli_warn(c(paste0(
152+
"The {.field weight} aesthetic does not support non-finite or ",
153+
"{.code NA} values."
154+
), "i" = "These weights were replaced by {.val 0}."))
155+
weights[!is.finite(weights)] <- 0
156+
}
157+
158+
# `total` replaces `length(x)`
159+
total <- sum(weights)
160+
161+
if (abs(total) < 1000 * .Machine$double.eps) {
162+
if (total == 0) {
163+
cli::cli_abort(paste0(
164+
"Cannot compute eCDF when the {.field weight} aesthetic sums up to ",
165+
"{.val 0}."
166+
))
167+
}
168+
cli::cli_warn(c(
169+
"The sum of the {.field weight} aesthetic is close to {.val 0}.",
170+
"i" = "Computed eCDF might be unstable."
171+
))
172+
}
173+
174+
# Link each observation to unique value
175+
vals <- unique0(x)
176+
matched <- match(x, vals)
177+
178+
# Instead of tabulating `matched`, as we would for unweighted `ecdf(x)`,
179+
# we sum weights per unique value of `x`
180+
agg_weights <- vapply(
181+
split(weights, matched),
182+
sum, numeric(1)
183+
)
184+
185+
# Like `ecdf(x)`, we return an approx function
186+
approxfun(
187+
vals,
188+
cumsum(agg_weights) / total,
189+
method = "constant",
190+
yleft = 0, yright = 1,
191+
f = 0, ties = "ordered"
192+
)
193+
}

man/stat_ecdf.Rd

Lines changed: 34 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-stat-ecdf.R

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,54 @@ test_that("stat_ecdf works in both directions", {
1515
expect_snapshot_error(ggplot_build(p))
1616
})
1717

18+
test_that("weighted ecdf computes sensible results", {
19+
20+
set.seed(42)
21+
x <- rpois(100, 5)
22+
ux <- sort(unique0(x))
23+
24+
# Absent weights should be the same as the original
25+
expect_equal(
26+
ecdf(x)(ux),
27+
wecdf(x, NULL)(ux)
28+
)
29+
30+
# Uniform weights should be the same as the original
31+
expect_equal(
32+
ecdf(x)(ux),
33+
wecdf(x, pi)(ux)
34+
)
35+
36+
# Tabulated weights should be the same as the original
37+
tab <- as.data.frame(table(x), stringsAsFactors = FALSE)
38+
tab$x <- as.numeric(tab$x)
39+
expect_equal(
40+
ecdf(x)(ux),
41+
wecdf(tab$x, tab$Freq)(ux)
42+
)
43+
})
44+
45+
test_that("weighted ecdf warns about weird weights", {
46+
47+
# Should warn when provided with illegal weights
48+
expect_warning(
49+
wecdf(1:10, c(NA, rep(1, 9))),
50+
"does not support non-finite"
51+
)
52+
53+
# Should warn when provided with near-0 weights
54+
expect_warning(
55+
wecdf(1:10, .Machine$double.eps),
56+
"might be unstable"
57+
)
58+
59+
# Should error when weights sum to 0
60+
expect_error(
61+
wecdf(1:10, rep(c(-1, 1), 5)),
62+
"Cannot compute eCDF"
63+
)
64+
})
65+
1866
# See #5113 and #5112
1967
test_that("stat_ecdf responds to axis transformations", {
2068
n <- 4

0 commit comments

Comments
 (0)