data_arrange() gets a by argument

easystats · Nov 24, 2024 · 31c5803 · 31c5803
1 parent 2741cdc
commit 31c5803
Show file tree

Hide file tree

Showing 7 changed files with 141 additions and 54 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.13
+Version: 0.13.0.14
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NEWS.md b/NEWS.md
@@ -22,6 +22,9 @@ CHANGES
 * `data_modify()` now recognizes `n()`, for example to create an index for data groups
   with `1:n()` (#535).
 
+* `data_arrange()` gets a `by` argument, to arrange data grouped by values or
+  levels of certain variables.
+
 BUG FIXES
 
 * `describe_distribution()` no longer errors if the sample was too sparse to compute

diff --git a/R/data_arrange.R b/R/data_arrange.R
@@ -1,15 +1,20 @@
-#' Arrange rows by column values
+#' @title Arrange rows by column values
+#' @name data_arrange
 #'
+#' @description
 #' `data_arrange()` orders the rows of a data frame by the values of selected
 #' columns.
 #'
-#' @param data A data frame, or an object that can be coerced to a data frame.
+#' @param data A (grouped) data frame, or an object that can be coerced to a
+#' data frame.
 #' @param select Character vector of column names. Use a dash just before column
-#'   name to arrange in decreasing order, for example `"-x1"`.
+#' name to arrange in decreasing order, for example `"-x1"`.
 #' @param safe Do not throw an error if one of the variables specified doesn't
-#'   exist.
+#' exist.
+#' @param ... Currently not used.
+#' @inheritParams data_summary
 #'
-#' @return A data frame.
+#' @return A data frame, where rows are sorted according to the selected columns.
 #'
 #' @examples
 #'
@@ -22,14 +27,55 @@
 #' # Throw an error if one of the variables specified doesn't exist
 #' try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE))
 #' @export
-data_arrange <- function(data, select = NULL, safe = TRUE) {
+data_arrange <- function(data, ...) {
   UseMethod("data_arrange")
 }
 
 
+#' @rdname data_arrange
+#' @export
+data_arrange.default <- function(data, select = NULL, by = NULL, safe = TRUE, ...) {
+  if (!is.null(by)) {
+    # check "by" argument for valid names
+    .sanitize_by_argument(data, by)
+    split_data <- split(data, data[by], drop = TRUE)
+    # we remove names, else rownames are not correct - these would be prefixed
+    # by the values for each list-element
+    names(split_data) <- NULL
+    out <- lapply(split_data, function(x) {
+      .data_arrange(x, select = select, safe = safe)
+    })
+    out <- do.call(rbind, out)
+    # remove rownames if original data had none
+    if (!insight::object_has_rownames(data)) {
+      rownames(out) <- NULL
+    }
+  } else {
+    out <- .data_arrange(data, select = select, safe = safe)
+  }
+  out
+}
+
 
 #' @export
-data_arrange.default <- function(data, select = NULL, safe = TRUE) {
+data_arrange.grouped_df <- function(data, select = NULL, by = NULL, safe = TRUE, ...) {
+  # extract group variables
+  grps <- attr(data, "groups", exact = TRUE)
+  group_variables <- data_remove(grps, ".rows")
+  # if "by" is not supplied, use group variables
+  if (is.null(by)) {
+    by <- colnames(group_variables)
+  }
+  # remove information specific to grouped df's
+  attr(data, "groups") <- NULL
+  class(data) <- "data.frame"
+  data_arrange(data = data, select = select, by = by, safe = safe, ...)
+}
+
+
+# utilities ----------------------
+
+.data_arrange <- function(data, select = NULL, safe = TRUE) {
   if (is.null(select) || length(select) == 0) {
     return(data)
   }
@@ -98,23 +144,3 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
 
   out
 }
-
-
-
-#' @export
-data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) {
-  grps <- attr(data, "groups", exact = TRUE)
-  grps <- grps[[".rows"]]
-
-  out <- lapply(grps, function(x) {
-    data_arrange.default(data[x, ], select = select, safe = safe)
-  })
-
-  out <- do.call(rbind, out)
-
-  if (!insight::object_has_rownames(data)) {
-    rownames(out) <- NULL
-  }
-
-  out
-}
diff --git a/R/data_summary.R b/R/data_summary.R
@@ -5,9 +5,9 @@
 #' data frame or a matrix.
 #'
 #' @param x A (grouped) data frame.
-#' @param by Optional character string, indicating the name of a variable in `x`.
-#' If supplied, the data will be split by this variable and summary statistics
-#' will be computed for each group.
+#' @param by Optional character string, indicating the names of one or more
+#' variables in the data frame. If supplied, the data will be split by these
+#' variables and summary statistics will be computed for each group.
 #' @param remove_na Logical. If `TRUE`, missing values are omitted from the
 #' grouping variable. If `FALSE` (default), missing values are included as a
 #' level in the grouping variable.
@@ -84,23 +84,8 @@ data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) {
     out <- data.frame(summarise)
     colnames(out) <- vapply(summarise, names, character(1))
   } else {
-    # sanity check - is "by" a character string?
-    if (!is.character(by)) {
-      insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.")
-    }
-    # is "by" in the data?
-    if (!all(by %in% colnames(x))) {
-      by_not_found <- by[!by %in% colnames(x)]
-      insight::format_error(
-        paste0(
-          "Variable",
-          ifelse(length(by_not_found) > 1, "s ", " "),
-          text_concatenate(by_not_found, enclose = "\""),
-          " not found in the data."
-        ),
-        .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
-      )
-    }
+    # check "by" argument for valid names
+    .sanitize_by_argument(x, by)
     # split data, add NA levels, if requested
     l <- lapply(x[by], function(i) {
       if (remove_na || !anyNA(i)) {
@@ -207,6 +192,27 @@ data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) {
 }
 
 
+.sanitize_by_argument <- function(x, by) {
+  # sanity check - is "by" a character string?
+  if (!is.character(by)) {
+    insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.")
+  }
+  # is "by" in the data?
+  if (!all(by %in% colnames(x))) {
+    by_not_found <- by[!by %in% colnames(x)]
+    insight::format_error(
+      paste0(
+        "Variable",
+        ifelse(length(by_not_found) > 1, "s ", " "),
+        text_concatenate(by_not_found, enclose = "\""),
+        " not found in the data."
+      ),
+      .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
+    )
+  }
+}
+
+
 # methods ----------------------------------------------------------------------
 
 #' @export

diff --git a/man/data_arrange.Rd b/man/data_arrange.Rd
diff --git a/man/data_summary.Rd b/man/data_summary.Rd
diff --git a/tests/testthat/test-data_arrange.R b/tests/testthat/test-data_arrange.R
@@ -97,6 +97,48 @@ test_that("data_arrange works with grouped df", {
   )
 })
 
+test_that("data_arrange works with by", {
+  set.seed(123)
+  x <- mtcars[sample(seq_len(nrow(mtcars)), 10, replace = TRUE), c("cyl", "mpg")]
+
+  expected <- data.frame(
+    cyl = c(4, 4, 4, 6, 6, 8, 8, 8, 8, 8),
+    mpg = c(22.8, 30.4, 32.4, 17.8, 19.2, 10.4, 15, 15.2, 15.5, 18.7)
+  )
+  rownames(expected) <- c(
+    "Datsun 710", "Honda Civic", "Fiat 128", "Merc 280C", "Merc 280",
+    "Cadillac Fleetwood", "Maserati Bora", "Merc 450SLC", "Dodge Challenger",
+    "Hornet Sportabout"
+  )
+
+  expect_identical(
+    data_arrange(x, "mpg", by = "cyl"),
+    expected,
+    ignore_attr = TRUE
+  )
+
+  # works for df without rownames
+  set.seed(123)
+  x <- iris[sample(seq_len(nrow(iris)), 10, replace = TRUE), c("Sepal.Width", "Species")]
+  rownames(x) <- NULL
+
+  expected <- data.frame(
+    Sepal.Width = c(3, 3, 3.2, 3.3, 2.5, 2.6, 2.6, 3, 3.8, 3.8),
+    Species = structure(
+      c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L),
+      levels = c("setosa", "versicolor", "virginica"),
+      class = "factor"
+    )
+  )
+  rownames(expected) <- NULL
+
+  expect_identical(
+    data_arrange(x, "Sepal.Width", by = "Species"),
+    expected,
+    ignore_attr = TRUE
+  )
+})
+
 test_that("data_arrange works with NA", {
   # without groups