diff --git a/R/categorize.R b/R/categorize.R index cf5db484e..53e287a0a 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -31,6 +31,10 @@ #' for numeric variables, the minimum of the original input is preserved. For #' factors, the default minimum is `1`. For `split = "equal_range"`, the #' default minimum is always `1`, unless specified otherwise in `lowest`. +#' @param breaks Character, indicating whether breaks for categorizing data are +#' `"inclusive"` (values indicate the _upper_ bound of the _previous_ group or +#' interval) or `"exclusive"` (values indicate the _lower_ bound of the _next_ +#' group or interval to begin.) #' @param labels Character vector of value labels. If not `NULL`, `categorize()` #' will returns factors instead of numeric variables, with `labels` used #' for labelling the factor levels. Can also be `"mean"`, `"median"`, @@ -56,7 +60,7 @@ #' #' # Splits and breaks (cut-off values) #' -#' Breaks are in general _exclusive_, this means that these values indicate +#' Breaks are by default _exclusive_, this means that these values indicate #' the lower bound of the next group or interval to begin. Take a simple #' example, a numeric variable with values from 1 to 9. The median would be 5, #' thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -66,6 +70,9 @@ #' from 1 to 3 belong to the first interval and are recoded into 1 (because #' the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. #' +#' The opposite behaviour can be achieved using `breaks = "inclusive"`, in which +#' case values indicate the upper bound of the previous group or interval. +#' #' # Recoding into groups with equal size or range #' #' `split = "equal_length"` and `split = "equal_range"` try to divide the @@ -152,6 +159,7 @@ categorize.numeric <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ...) 
{ @@ -162,6 +170,9 @@ categorize.numeric <- function(x, if (identical(split, "equal_length")) split <- "length" if (identical(split, "equal_range")) split <- "range" + # check for valid values + breaks <- match.arg(breaks, c("exclusive", "inclusive")) + # save original_x <- x @@ -179,9 +190,9 @@ categorize.numeric <- function(x, } if (is.numeric(split)) { - breaks <- split + category_splits <- split } else { - breaks <- switch(split, + category_splits <- switch(split, median = stats::median(x), mean = mean(x), length = n_groups, @@ -192,14 +203,16 @@ categorize.numeric <- function(x, } # complete ranges, including minimum and maximum - if (!identical(split, "length")) breaks <- unique(c(min(x), breaks, max(x))) + if (!identical(split, "length")) { + category_splits <- unique(c(min(x), category_splits, max(x))) + } # recode into groups out <- droplevels(cut( x, - breaks = breaks, + breaks = category_splits, include.lowest = TRUE, - right = FALSE + right = identical(breaks, "inclusive") )) cut_result <- out levels(out) <- 1:nlevels(out) @@ -234,6 +247,7 @@ categorize.data.frame <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -271,6 +285,7 @@ categorize.data.frame <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, verbose = verbose, ... @@ -287,6 +302,7 @@ categorize.grouped_df <- function(x, n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -330,6 +346,7 @@ categorize.grouped_df <- function(x, n_groups = n_groups, range = range, lowest = lowest, + breaks = breaks, labels = labels, select = select, exclude = exclude, diff --git a/man/categorize.Rd b/man/categorize.Rd index 8d3ffcd8e..097a5ef8d 100644 --- a/man/categorize.Rd +++ b/man/categorize.Rd @@ -14,6 +14,7 @@ categorize(x, ...) 
n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, verbose = TRUE, ... @@ -27,6 +28,7 @@ categorize(x, ...) n_groups = NULL, range = NULL, lowest = 1, + breaks = "exclusive", labels = NULL, append = FALSE, ignore_case = FALSE, @@ -67,6 +69,11 @@ for numeric variables, the minimum of the original input is preserved. For factors, the default minimum is \code{1}. For \code{split = "equal_range"}, the default minimum is always \code{1}, unless specified otherwise in \code{lowest}.} +\item{breaks}{Character, indicating whether breaks for categorizing data are +\code{"inclusive"} (values indicate the \emph{upper} bound of the \emph{previous} group or +interval) or \code{"exclusive"} (values indicate the \emph{lower} bound of the \emph{next} +group or interval to begin.)} + \item{labels}{Character vector of value labels. If not \code{NULL}, \code{categorize()} will returns factors instead of numeric variables, with \code{labels} used for labelling the factor levels. Can also be \code{"mean"}, \code{"median"}, @@ -148,7 +155,7 @@ It is basically a wrapper around base R's \code{cut()}, providing a simplified and more accessible way to define the interval breaks (cut-off values). } \section{Splits and breaks (cut-off values)}{ -Breaks are in general \emph{exclusive}, this means that these values indicate +Breaks are by default \emph{exclusive}, this means that these values indicate the lower bound of the next group or interval to begin. Take a simple example, a numeric variable with values from 1 to 9. The median would be 5, thus the first interval ranges from 1-4 and is recoded into 1, while 5-9 @@ -157,6 +164,9 @@ using \code{split = "quantile"} and \code{n_groups = 3} would define breaks at 3 and 6.33 (see \code{quantile(1:9, probs = c(1/3, 2/3))}), which means that values from 1 to 3 belong to the first interval and are recoded into 1 (because the next interval starts at 3.67), 4 to 6 into 2 and 7 to 9 into 3. 
+ +The opposite behaviour can be achieved using \code{breaks = "inclusive"}, in which +case values indicate the upper bound of the previous group or interval. } \section{Recoding into groups with equal size or range}{ diff --git a/tests/testthat/_snaps/categorize.md b/tests/testthat/_snaps/categorize.md index 1b8f9fd64..d08c14c4d 100644 --- a/tests/testthat/_snaps/categorize.md +++ b/tests/testthat/_snaps/categorize.md @@ -31,3 +31,17 @@ [31] (10.4-15) (21-24.4) Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) +--- + + Code + categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", + breaks = "inclusive") + Output + [1] (21-24.4) (21-24.4) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) + [7] (10.4-15) (21-24.4) (21-24.4) (15.2-19.7) (15.2-19.7) (15.2-19.7) + [13] (15.2-19.7) (15.2-19.7) (10.4-15) (10.4-15) (10.4-15) (30.4-33.9) + [19] (30.4-33.9) (30.4-33.9) (21-24.4) (15.2-19.7) (15.2-19.7) (10.4-15) + [25] (15.2-19.7) (26-27.3) (26-27.3) (30.4-33.9) (15.2-19.7) (15.2-19.7) + [31] (10.4-15) (21-24.4) + Levels: (10.4-15) (15.2-19.7) (21-24.4) (26-27.3) (30.4-33.9) + diff --git a/tests/testthat/test-categorize.R b/tests/testthat/test-categorize.R index a8a7c1171..714b9dbac 100644 --- a/tests/testthat/test-categorize.R +++ b/tests/testthat/test-categorize.R @@ -240,4 +240,5 @@ test_that("categorize labelling ranged", { expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5)) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "range")) expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed")) + expect_snapshot(categorize(mtcars$mpg, "equal_length", n_groups = 5, labels = "observed", breaks = "inclusive")) })