From 31c5803382bd3fafaf4203248dcee4a69321b866 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 24 Nov 2024 21:10:54 +0100 Subject: [PATCH] `data_arrange()` gets a `by` argument --- DESCRIPTION | 2 +- NEWS.md | 3 ++ R/data_arrange.R | 80 ++++++++++++++++++++---------- R/data_summary.R | 46 +++++++++-------- man/data_arrange.Rd | 16 ++++-- man/data_summary.Rd | 6 +-- tests/testthat/test-data_arrange.R | 42 ++++++++++++++++ 7 files changed, 141 insertions(+), 54 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2325c062d..be41e0f6f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.13 +Version: 0.13.0.14 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index 663efa310..40d9b489c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,9 @@ CHANGES * `data_modify()` now recognizes `n()`, for example to create an index for data groups with `1:n()` (#535). +* `data_arrange()` gets a `by` argument, to arrange data grouped by values or + levels of certain variables. + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/data_arrange.R b/R/data_arrange.R index b64876a24..75739e3d1 100644 --- a/R/data_arrange.R +++ b/R/data_arrange.R @@ -1,15 +1,20 @@ -#' Arrange rows by column values +#' @title Arrange rows by column values +#' @name data_arrange #' +#' @description #' `data_arrange()` orders the rows of a data frame by the values of selected #' columns. #' -#' @param data A data frame, or an object that can be coerced to a data frame. +#' @param data A (grouped) data frame, or an object that can be coerced to a +#' data frame. #' @param select Character vector of column names. Use a dash just before column -#' name to arrange in decreasing order, for example `"-x1"`. +#' name to arrange in decreasing order, for example `"-x1"`. #' @param safe Do not throw an error if one of the variables specified doesn't -#' exist. +#' exist. +#' @param ... Currently not used. +#' @inheritParams data_summary #' -#' @return A data frame. +#' @return A data frame, where rows are sorted according to the selected columns. #' #' @examples #' @@ -22,14 +27,55 @@ #' # Throw an error if one of the variables specified doesn't exist #' try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE)) #' @export -data_arrange <- function(data, select = NULL, safe = TRUE) { +data_arrange <- function(data, ...) { UseMethod("data_arrange") } +#' @rdname data_arrange +#' @export +data_arrange.default <- function(data, select = NULL, by = NULL, safe = TRUE, ...) { + if (!is.null(by)) { + # check "by" argument for valid names + .sanitize_by_argument(data, by) + split_data <- split(data, data[by], drop = TRUE) + # we remove names, else rownames are not correct - these would be prefixed + # by the values for each list-element + names(split_data) <- NULL + out <- lapply(split_data, function(x) { + .data_arrange(x, select = select, safe = safe) + }) + out <- do.call(rbind, out) + # remove rownames if original data had none + if (!insight::object_has_rownames(data)) { + rownames(out) <- NULL + } + } else { + out <- .data_arrange(data, select = select, safe = safe) + } + out +} + #' @export -data_arrange.default <- function(data, select = NULL, safe = TRUE) { +data_arrange.grouped_df <- function(data, select = NULL, by = NULL, safe = TRUE, ...) { + # extract group variables + grps <- attr(data, "groups", exact = TRUE) + group_variables <- data_remove(grps, ".rows") + # if "by" is not supplied, use group variables + if (is.null(by)) { + by <- colnames(group_variables) + } + # remove information specific to grouped df's + attr(data, "groups") <- NULL + class(data) <- "data.frame" + data_arrange(data = data, select = select, by = by, safe = safe, ...) +} + + +# utilities ---------------------- + +.data_arrange <- function(data, select = NULL, safe = TRUE) { if (is.null(select) || length(select) == 0) { return(data) } @@ -98,23 +144,3 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) { out } - - - -#' @export -data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) { - grps <- attr(data, "groups", exact = TRUE) - grps <- grps[[".rows"]] - - out <- lapply(grps, function(x) { - data_arrange.default(data[x, ], select = select, safe = safe) - }) - - out <- do.call(rbind, out) - - if (!insight::object_has_rownames(data)) { - rownames(out) <- NULL - } - - out -} diff --git a/R/data_summary.R b/R/data_summary.R index 7662d0c94..a63f3d2b3 100644 --- a/R/data_summary.R +++ b/R/data_summary.R @@ -5,9 +5,9 @@ #' data frame or a matrix. #' #' @param x A (grouped) data frame. -#' @param by Optional character string, indicating the name of a variable in `x`. -#' If supplied, the data will be split by this variable and summary statistics -#' will be computed for each group. +#' @param by Optional character string, indicating the names of one or more +#' variables in the data frame. If supplied, the data will be split by these +#' variables and summary statistics will be computed for each group. #' @param remove_na Logical. If `TRUE`, missing values are omitted from the #' grouping variable. If `FALSE` (default), missing values are included as a #' level in the grouping variable. @@ -84,23 +84,8 @@ data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) { out <- data.frame(summarise) colnames(out) <- vapply(summarise, names, character(1)) } else { - # sanity check - is "by" a character string? - if (!is.character(by)) { - insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.") - } - # is "by" in the data? - if (!all(by %in% colnames(x))) { - by_not_found <- by[!by %in% colnames(x)] - insight::format_error( - paste0( - "Variable", - ifelse(length(by_not_found) > 1, "s ", " "), - text_concatenate(by_not_found, enclose = "\""), - " not found in the data." - ), - .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?") - ) - } + # check "by" argument for valid names + .sanitize_by_argument(x, by) # split data, add NA levels, if requested l <- lapply(x[by], function(i) { if (remove_na || !anyNA(i)) { @@ -207,6 +192,27 @@ data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) { } +.sanitize_by_argument <- function(x, by) { + # sanity check - is "by" a character string? + if (!is.character(by)) { + insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.") + } + # is "by" in the data? + if (!all(by %in% colnames(x))) { + by_not_found <- by[!by %in% colnames(x)] + insight::format_error( + paste0( + "Variable", + ifelse(length(by_not_found) > 1, "s ", " "), + text_concatenate(by_not_found, enclose = "\""), + " not found in the data." + ), + .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?") + ) + } +} + + # methods ---------------------------------------------------------------------- #' @export diff --git a/man/data_arrange.Rd b/man/data_arrange.Rd index 3e87c1abb..bd876b30e 100644 --- a/man/data_arrange.Rd +++ b/man/data_arrange.Rd @@ -2,21 +2,31 @@ % Please edit documentation in R/data_arrange.R \name{data_arrange} \alias{data_arrange} +\alias{data_arrange.default} \title{Arrange rows by column values} \usage{ -data_arrange(data, select = NULL, safe = TRUE) +data_arrange(data, ...) + +\method{data_arrange}{default}(data, select = NULL, by = NULL, safe = TRUE, ...) } \arguments{ -\item{data}{A data frame, or an object that can be coerced to a data frame.} +\item{data}{A (grouped) data frame, or an object that can be coerced to a +data frame.} + +\item{...}{Currently not used.} \item{select}{Character vector of column names. Use a dash just before column name to arrange in decreasing order, for example \code{"-x1"}.} +\item{by}{Optional character string, indicating the names of one or more +variables in the data frame. If supplied, the data will be split by these +variables and summary statistics will be computed for each group.} + \item{safe}{Do not throw an error if one of the variables specified doesn't exist.} } \value{ -A data frame. +A data frame, where rows are sorted according to the selected columns. } \description{ \code{data_arrange()} orders the rows of a data frame by the values of selected diff --git a/man/data_summary.Rd b/man/data_summary.Rd index 24cfa1a9f..82d154175 100644 --- a/man/data_summary.Rd +++ b/man/data_summary.Rd @@ -18,9 +18,9 @@ and the function to compute the summary statistic. Example: as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}. The summary function \code{n()} can be used to count the number of observations.} -\item{by}{Optional character string, indicating the name of a variable in \code{x}. -If supplied, the data will be split by this variable and summary statistics -will be computed for each group.} +\item{by}{Optional character string, indicating the names of one or more +variables in the data frame. If supplied, the data will be split by these +variables and summary statistics will be computed for each group.} \item{remove_na}{Logical. If \code{TRUE}, missing values are omitted from the grouping variable. If \code{FALSE} (default), missing values are included as a diff --git a/tests/testthat/test-data_arrange.R b/tests/testthat/test-data_arrange.R index fcd769a86..b20189626 100644 --- a/tests/testthat/test-data_arrange.R +++ b/tests/testthat/test-data_arrange.R @@ -97,6 +97,48 @@ test_that("data_arrange works with grouped df", { ) }) +test_that("data_arrange works with by", { + set.seed(123) + x <- mtcars[sample(seq_len(nrow(mtcars)), 10, replace = TRUE), c("cyl", "mpg")] + + expected <- data.frame( + cyl = c(4, 4, 4, 6, 6, 8, 8, 8, 8, 8), + mpg = c(22.8, 30.4, 32.4, 17.8, 19.2, 10.4, 15, 15.2, 15.5, 18.7) + ) + rownames(expected) <- c( + "Datsun 710", "Honda Civic", "Fiat 128", "Merc 280C", "Merc 280", + "Cadillac Fleetwood", "Maserati Bora", "Merc 450SLC", "Dodge Challenger", + "Hornet Sportabout" + ) + + expect_identical( + data_arrange(x, "mpg", by = "cyl"), + expected, + ignore_attr = TRUE + ) + + # works for df without rownames + set.seed(123) + x <- iris[sample(seq_len(nrow(iris)), 10, replace = TRUE), c("Sepal.Width", "Species")] + rownames(x) <- NULL + + expected <- data.frame( + Sepal.Width = c(3, 3, 3.2, 3.3, 2.5, 2.6, 2.6, 3, 3.8, 3.8), + Species = structure( + c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L), + levels = c("setosa", "versicolor", "virginica"), + class = "factor" + ) + ) + rownames(expected) <- NULL + + expect_identical( + data_arrange(x, "Sepal.Width", by = "Species"), + expected, + ignore_attr = TRUE + ) +}) + test_that("data_arrange works with NA", { # without groups