Skip to content

Commit

Permalink
categorize() add labels="range" option (#549)
Browse files Browse the repository at this point in the history
* categorize() add labels="range" option
Fixes #548

* add test, docs

* lintr

* add option to decide on exclusive/inclusive breaks

* docs
  • Loading branch information
strengejacke authored Oct 2, 2024
1 parent ea2f16f commit 77d67f4
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 23 deletions.
69 changes: 52 additions & 17 deletions R/categorize.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,18 @@
#' for numeric variables, the minimum of the original input is preserved. For
#' factors, the default minimum is `1`. For `split = "equal_range"`, the
#' default minimum is always `1`, unless specified otherwise in `lowest`.
#' @param breaks Character, indicating whether breaks for categorizing data are
#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
#' group or interval to begin). Use `labels = "range"` to make this behaviour
#' easier to see.
#' @param labels Character vector of value labels. If not `NULL`, `categorize()`
#' will returns factors instead of numeric variables, with `labels` used
#' for labelling the factor levels. Can also be `"mean"` or `"median"` for a
#' factor with labels as the mean/median of each groups.
#' for labelling the factor levels. Can also be `"mean"`, `"median"`,
#' `"range"` or `"observed"` for a factor with labels as the mean/median,
#' the requested range (even if not all values of that range are present in
#' the data) or observed range (range of the actual recoded values) of each
#' group. See 'Examples'.
#' @param append Logical or string. If `TRUE`, recoded or converted variables
#' get new column names and are appended (column bind) to `x`, thus returning
#' both the original and the recoded variables. The new columns get a suffix,
Expand All @@ -53,7 +61,7 @@
#'
#' # Splits and breaks (cut-off values)
#'
#' Breaks are in general _exclusive_, this means that these values indicate
#' Breaks are by default _exclusive_, this means that these values indicate
#' the lower bound of the next group or interval to begin. Take a simple
#' example, a numeric variable with values from 1 to 9. The median would be 5,
#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
Expand All @@ -63,6 +71,9 @@
#' from 1 to 3 belong to the first interval and are recoded into 1 (because
#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
#'
#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
#' case
#'
#' # Recoding into groups with equal size or range
#'
#' `split = "equal_length"` and `split = "equal_range"` try to divide the
Expand Down Expand Up @@ -119,6 +130,13 @@
#' x <- sample(1:10, size = 30, replace = TRUE)
#' categorize(x, "equal_length", n_groups = 3, labels = "mean")
#' categorize(x, "equal_length", n_groups = 3, labels = "median")
#'
#' # cut numeric into groups with the requested range as a label name
#' # each category has the same range, and labels indicate this range
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
#' # in this example, each category has the same range, but labels only refer
#' # to the ranges of the actual values (present in the data) inside each group
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
#' @export
categorize <- function(x, ...) {
UseMethod("categorize")
Expand All @@ -142,6 +160,7 @@ categorize.numeric <- function(x,
n_groups = NULL,
range = NULL,
lowest = 1,
breaks = "exclusive",
labels = NULL,
verbose = TRUE,
...) {
Expand All @@ -152,6 +171,9 @@ categorize.numeric <- function(x,
if (identical(split, "equal_length")) split <- "length"
if (identical(split, "equal_range")) split <- "range"

# check for valid values
breaks <- match.arg(breaks, c("exclusive", "inclusive"))

# save
original_x <- x

Expand All @@ -169,9 +191,9 @@ categorize.numeric <- function(x,
}

if (is.numeric(split)) {
breaks <- split
category_splits <- split
} else {
breaks <- switch(split,
category_splits <- switch(split,
median = stats::median(x),
mean = mean(x),
length = n_groups,
Expand All @@ -182,15 +204,18 @@ categorize.numeric <- function(x,
}

# complete ranges, including minimum and maximum
if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x)))
if (!identical(split, "length")) {
category_splits <- unique(c(min(x), category_splits, max(x)))
}

# recode into groups
out <- droplevels(cut(
x,
breaks = breaks,
breaks = category_splits,
include.lowest = TRUE,
right = FALSE
right = identical(breaks, "inclusive")
))
cut_result <- out
levels(out) <- 1:nlevels(out)

# fix lowest value, add back into original vector
Expand All @@ -201,7 +226,7 @@ categorize.numeric <- function(x,
original_x[!is.na(original_x)] <- out

# turn into factor?
.original_x_to_factor(original_x, x, labels, out, verbose, ...)
.original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...)
}


Expand All @@ -223,6 +248,7 @@ categorize.data.frame <- function(x,
n_groups = NULL,
range = NULL,
lowest = 1,
breaks = "exclusive",
labels = NULL,
append = FALSE,
ignore_case = FALSE,
Expand Down Expand Up @@ -260,6 +286,7 @@ categorize.data.frame <- function(x,
n_groups = n_groups,
range = range,
lowest = lowest,
breaks = breaks,
labels = labels,
verbose = verbose,
...
Expand All @@ -276,6 +303,7 @@ categorize.grouped_df <- function(x,
n_groups = NULL,
range = NULL,
lowest = 1,
breaks = "exclusive",
labels = NULL,
append = FALSE,
ignore_case = FALSE,
Expand Down Expand Up @@ -319,6 +347,7 @@ categorize.grouped_df <- function(x,
n_groups = n_groups,
range = range,
lowest = lowest,
breaks = breaks,
labels = labels,
select = select,
exclude = exclude,
Expand Down Expand Up @@ -375,20 +404,26 @@ categorize.grouped_df <- function(x,
}


.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) {
.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) {
if (!is.null(labels)) {
if (length(labels) == length(unique(out))) {
original_x <- as.factor(original_x)
levels(original_x) <- labels
} else if (length(labels) == 1 && labels %in% c("mean", "median")) {
} else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) {
original_x <- as.factor(original_x)
no_na_x <- original_x[!is.na(original_x)]
if (labels == "mean") {
labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
} else {
labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
}
levels(original_x) <- insight::format_value(labels, ...)
out <- switch(labels,
mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x,
median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x,
# labels basically like what "cut()" returns
range = levels(cut_result),
# range based on the values that are actually present in the data
{
temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x
apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")"))
}
)
levels(original_x) <- insight::format_value(out, ...)
} else if (isTRUE(verbose)) {
insight::format_warning(
"Argument `labels` and levels of the recoded variable are not of the same length.",
Expand Down
27 changes: 24 additions & 3 deletions man/categorize.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 47 additions & 0 deletions tests/testthat/_snaps/categorize.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# categorize labelling ranged

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5)
Output
[1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3

---

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
Output
[1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8)
[7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8)
[13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9]
[19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1)
[25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8)
[31] [10.4,15.1) [19.8,24.5)
Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9]

---

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
Output
[1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7)
[7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7)
[13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9)
[19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15)
[25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7)
[31] (10.4-15) (21-24.4)
Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)

---

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed",
breaks = "inclusive")
Output
[1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7)
[7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7)
[13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9)
[19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15)
[25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7)
[31] (10.4-15) (21-24.4)
Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)

24 changes: 21 additions & 3 deletions tests/testthat/test-categorize.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
set.seed(123)
d <- sample(1:10, size = 500, replace = TRUE)
d <- sample.int(10, size = 500, replace = TRUE)

test_that("recode median", {
expect_identical(categorize(d), ifelse(d >= median(d), 2, 1))
Expand All @@ -22,7 +22,7 @@ test_that("recode quantile", {
})

set.seed(123)
d <- sample(1:100, size = 1000, replace = TRUE)
d <- sample.int(100, size = 1000, replace = TRUE)

test_that("recode range", {
expect_error(categorize(d, split = "range"))
Expand Down Expand Up @@ -84,7 +84,7 @@ test_that("recode length", {
})

set.seed(123)
x <- sample(1:10, size = 30, replace = TRUE)
x <- sample.int(10, size = 30, replace = TRUE)
test_that("recode factor labels", {
expect_type(categorize(x, "equal_length", n_groups = 3), "double")
expect_s3_class(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")), "factor")
Expand Down Expand Up @@ -232,3 +232,21 @@ test_that("categorize regex", {
categorize(mtcars, select = "mpg")
)
})


# select helpers ------------------------------
test_that("categorize labelling ranged", {
data(mtcars)
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5))
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range"))
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed"))
})

test_that("categorize breaks", {
data(mtcars)
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive"))
expect_error(
categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"),
regex = "should be one of"
)
})

0 comments on commit 77d67f4

Please sign in to comment.