Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

categorize() add labels="range" option #549

Merged
merged 6 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 52 additions & 17 deletions R/categorize.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,18 @@
#' for numeric variables, the minimum of the original input is preserved. For
#' factors, the default minimum is `1`. For `split = "equal_range"`, the
#' default minimum is always `1`, unless specified otherwise in `lowest`.
#' @param breaks Character, indicating whether breaks for categorizing data are
#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or
#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_
#' group or interval to begin). Use `labels = "range"` to make this behaviour
#' easier to see.
#' @param labels Character vector of value labels. If not `NULL`, `categorize()`
#' will returns factors instead of numeric variables, with `labels` used
#' for labelling the factor levels. Can also be `"mean"` or `"median"` for a
#' factor with labels as the mean/median of each groups.
#' for labelling the factor levels. Can also be `"mean"`, `"median"`,
#' `"range"` or `"observed"` for a factor with labels as the mean/median,
#' the requested range (even if not all values of that range are present in
#' the data) or observed range (range of the actual recoded values) of each
#' group. See 'Examples'.
#' @param append Logical or string. If `TRUE`, recoded or converted variables
#' get new column names and are appended (column bind) to `x`, thus returning
#' both the original and the recoded variables. The new columns get a suffix,
Expand All @@ -53,7 +61,7 @@
#'
#' # Splits and breaks (cut-off values)
#'
#' Breaks are in general _exclusive_, this means that these values indicate
#' Breaks are by default _exclusive_, this means that these values indicate
#' the lower bound of the next group or interval to begin. Take a simple
#' example, a numeric variable with values from 1 to 9. The median would be 5,
#' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9
Expand All @@ -63,6 +71,9 @@
#' from 1 to 3 belong to the first interval and are recoded into 1 (because
#' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3.
#'
#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which
#' case
#'
#' # Recoding into groups with equal size or range
#'
#' `split = "equal_length"` and `split = "equal_range"` try to divide the
Expand Down Expand Up @@ -119,6 +130,13 @@
#' x <- sample(1:10, size = 30, replace = TRUE)
#' categorize(x, "equal_length", n_groups = 3, labels = "mean")
#' categorize(x, "equal_length", n_groups = 3, labels = "median")
#'
#' # cut numeric into groups with the requested range as a label name
#' # each category has the same range, and labels indicate this range
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
#' # in this example, each category has the same range, but labels only refer
#' # to the ranges of the actual values (present in the data) inside each group
#' categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
#' @export
categorize <- function(x, ...) {
UseMethod("categorize")
Expand All @@ -142,6 +160,7 @@ categorize.numeric <- function(x,
n_groups = NULL,
range = NULL,
lowest = 1,
breaks = "exclusive",
labels = NULL,
verbose = TRUE,
...) {
Expand All @@ -152,6 +171,9 @@ categorize.numeric <- function(x,
if (identical(split, "equal_length")) split <- "length"
if (identical(split, "equal_range")) split <- "range"

# check for valid values
breaks <- match.arg(breaks, c("exclusive", "inclusive"))

# save
original_x <- x

Expand All @@ -169,9 +191,9 @@ categorize.numeric <- function(x,
}

if (is.numeric(split)) {
breaks <- split
category_splits <- split
} else {
breaks <- switch(split,
category_splits <- switch(split,
median = stats::median(x),
mean = mean(x),
length = n_groups,
Expand All @@ -182,15 +204,18 @@ categorize.numeric <- function(x,
}

# complete ranges, including minimum and maximum
if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x)))
if (!identical(split, "length")) {
category_splits <- unique(c(min(x), category_splits, max(x)))
}

# recode into groups
out <- droplevels(cut(
x,
breaks = breaks,
breaks = category_splits,
include.lowest = TRUE,
right = FALSE
right = identical(breaks, "inclusive")
))
cut_result <- out
levels(out) <- 1:nlevels(out)

# fix lowest value, add back into original vector
Expand All @@ -201,7 +226,7 @@ categorize.numeric <- function(x,
original_x[!is.na(original_x)] <- out

# turn into factor?
.original_x_to_factor(original_x, x, labels, out, verbose, ...)
.original_x_to_factor(original_x, x, cut_result, labels, out, verbose, ...)
}


Expand All @@ -223,6 +248,7 @@ categorize.data.frame <- function(x,
n_groups = NULL,
range = NULL,
lowest = 1,
breaks = "exclusive",
labels = NULL,
append = FALSE,
ignore_case = FALSE,
Expand Down Expand Up @@ -260,6 +286,7 @@ categorize.data.frame <- function(x,
n_groups = n_groups,
range = range,
lowest = lowest,
breaks = breaks,
labels = labels,
verbose = verbose,
...
Expand All @@ -276,6 +303,7 @@ categorize.grouped_df <- function(x,
n_groups = NULL,
range = NULL,
lowest = 1,
breaks = "exclusive",
labels = NULL,
append = FALSE,
ignore_case = FALSE,
Expand Down Expand Up @@ -319,6 +347,7 @@ categorize.grouped_df <- function(x,
n_groups = n_groups,
range = range,
lowest = lowest,
breaks = breaks,
labels = labels,
select = select,
exclude = exclude,
Expand Down Expand Up @@ -375,20 +404,26 @@ categorize.grouped_df <- function(x,
}


.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) {
.original_x_to_factor <- function(original_x, x, cut_result, labels, out, verbose, ...) {
if (!is.null(labels)) {
if (length(labels) == length(unique(out))) {
original_x <- as.factor(original_x)
levels(original_x) <- labels
} else if (length(labels) == 1 && labels %in% c("mean", "median")) {
} else if (length(labels) == 1 && labels %in% c("mean", "median", "range", "observed")) {
original_x <- as.factor(original_x)
no_na_x <- original_x[!is.na(original_x)]
if (labels == "mean") {
labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
} else {
labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
}
levels(original_x) <- insight::format_value(labels, ...)
out <- switch(labels,
mean = stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x,
median = stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x,
# labels basically like what "cut()" returns
range = levels(cut_result),
# range based on the values that are actually present in the data
{
temp <- stats::aggregate(x, list(no_na_x), FUN = range, na.rm = TRUE)$x
apply(temp, 1, function(i) paste0("(", paste(as.vector(i), collapse = "-"), ")"))
}
)
levels(original_x) <- insight::format_value(out, ...)
} else if (isTRUE(verbose)) {
insight::format_warning(
"Argument `labels` and levels of the recoded variable are not of the same length.",
Expand Down
27 changes: 24 additions & 3 deletions man/categorize.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 47 additions & 0 deletions tests/testthat/_snaps/categorize.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# categorize labelling ranged

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5)
Output
[1] 3 3 3 3 2 2 1 3 3 2 2 2 2 2 1 1 1 5 5 5 3 2 2 1 2 4 4 5 2 2 1 3

---

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")
Output
[1] [19.8,24.5) [19.8,24.5) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8)
[7] [10.4,15.1) [19.8,24.5) [19.8,24.5) [15.1,19.8) [15.1,19.8) [15.1,19.8)
[13] [15.1,19.8) [15.1,19.8) [10.4,15.1) [10.4,15.1) [10.4,15.1) [29.2,33.9]
[19] [29.2,33.9] [29.2,33.9] [19.8,24.5) [15.1,19.8) [15.1,19.8) [10.4,15.1)
[25] [15.1,19.8) [24.5,29.2) [24.5,29.2) [29.2,33.9] [15.1,19.8) [15.1,19.8)
[31] [10.4,15.1) [19.8,24.5)
Levels: [10.4,15.1) [15.1,19.8) [19.8,24.5) [24.5,29.2) [29.2,33.9]

---

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")
Output
[1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7)
[7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7)
[13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9)
[19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15)
[25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7)
[31] (10.4-15) (21-24.4)
Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)

---

Code
categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed",
breaks = "inclusive")
Output
[1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7)
[7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7)
[13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9)
[19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15)
[25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7)
[31] (10.4-15) (21-24.4)
Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9)

24 changes: 21 additions & 3 deletions tests/testthat/test-categorize.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
set.seed(123)
d <- sample(1:10, size = 500, replace = TRUE)
d <- sample.int(10, size = 500, replace = TRUE)

test_that("recode median", {
expect_identical(categorize(d), ifelse(d >= median(d), 2, 1))
Expand All @@ -22,7 +22,7 @@
})

set.seed(123)
d <- sample(1:100, size = 1000, replace = TRUE)
d <- sample.int(100, size = 1000, replace = TRUE)

test_that("recode range", {
expect_error(categorize(d, split = "range"))
Expand Down Expand Up @@ -84,7 +84,7 @@
})

set.seed(123)
x <- sample(1:10, size = 30, replace = TRUE)
x <- sample.int(10, size = 30, replace = TRUE)
test_that("recode factor labels", {
expect_type(categorize(x, "equal_length", n_groups = 3), "double")
expect_s3_class(categorize(x, "equal_length", n_groups = 3, labels = c("low", "mid", "high")), "factor")
Expand Down Expand Up @@ -127,7 +127,7 @@

expect_warning(
expect_warning(
out <- categorize(x, split = "median", select = c("sepal.Length", "sepal.Width"), ignore_case = FALSE),

Check warning on line 130 in tests/testthat/test-categorize.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=tests/testthat/test-categorize.R,line=130,col=7,[implicit_assignment_linter] Avoid implicit assignments in function calls. For example, instead of `if (x <- 1L) { ... }`, write `x <- 1L; if (x) { ... }`.
"not found"
),
"not found"
Expand Down Expand Up @@ -186,14 +186,14 @@
test_that("recode all NA", {
x <- rep(NA, 10)
expect_message(
y <- categorize(x),

Check warning on line 189 in tests/testthat/test-categorize.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=tests/testthat/test-categorize.R,line=189,col=5,[implicit_assignment_linter] Avoid implicit assignments in function calls. For example, instead of `if (x <- 1L) { ... }`, write `x <- 1L; if (x) { ... }`.
"can't be recoded"
)
expect_identical(y, x)

x <- rep(NA_real_, 10)
expect_message(
y <- categorize(x),

Check warning on line 196 in tests/testthat/test-categorize.R

View workflow job for this annotation

GitHub Actions / lint-changed-files / lint-changed-files

file=tests/testthat/test-categorize.R,line=196,col=5,[implicit_assignment_linter] Avoid implicit assignments in function calls. For example, instead of `if (x <- 1L) { ... }`, write `x <- 1L; if (x) { ... }`.
"only missing values"
)
expect_identical(y, x)
Expand Down Expand Up @@ -232,3 +232,21 @@
categorize(mtcars, select = "mpg")
)
})


# select helpers ------------------------------
test_that("categorize labelling ranged", {
data(mtcars)
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5))
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range"))
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed"))
})

test_that("categorize breaks", {
data(mtcars)
expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive"))
expect_error(
categorize(mtcars$mpg, "equal_length", n_groups = 5, breaks = "something"),
regex = "should be one of"
)
})
Loading