Skip to content

Commit

Permalink
data_arrange() gets a by argument
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke committed Nov 24, 2024
1 parent 2741cdc commit 31c5803
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 54 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.13.0.13
Version: 0.13.0.14
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531")),
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ CHANGES
* `data_modify()` now recognizes `n()`, for example to create an index for data groups
with `1:n()` (#535).

* `data_arrange()` gets a `by` argument, to arrange data grouped by values or
levels of certain variables.

BUG FIXES

* `describe_distribution()` no longer errors if the sample was too sparse to compute
Expand Down
80 changes: 53 additions & 27 deletions R/data_arrange.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
#' Arrange rows by column values
#' @title Arrange rows by column values
#' @name data_arrange
#'
#' @description
#' `data_arrange()` orders the rows of a data frame by the values of selected
#' columns.
#'
#' @param data A data frame, or an object that can be coerced to a data frame.
#' @param data A (grouped) data frame, or an object that can be coerced to a
#' data frame.
#' @param select Character vector of column names. Use a dash just before column
#' name to arrange in decreasing order, for example `"-x1"`.
#' name to arrange in decreasing order, for example `"-x1"`.
#' @param safe Do not throw an error if one of the variables specified doesn't
#' exist.
#' exist.
#' @param ... Currently not used.
#' @inheritParams data_summary
#'
#' @return A data frame.
#' @return A data frame, where rows are sorted according to the selected columns.
#'
#' @examples
#'
Expand All @@ -22,14 +27,55 @@
#' # Throw an error if one of the variables specified doesn't exist
#' try(data_arrange(head(mtcars), c("gear", "foo"), safe = FALSE))
#' @export
data_arrange <- function(data, select = NULL, safe = TRUE) {
data_arrange <- function(data, ...) {
UseMethod("data_arrange")
}


#' @rdname data_arrange
#' @export
data_arrange.default <- function(data, select = NULL, by = NULL, safe = TRUE, ...) {
if (!is.null(by)) {
# check "by" argument for valid names
.sanitize_by_argument(data, by)
split_data <- split(data, data[by], drop = TRUE)
# we remove names, else rownames are not correct - these would be prefixed
# by the values for each list-element
names(split_data) <- NULL
out <- lapply(split_data, function(x) {
.data_arrange(x, select = select, safe = safe)
})
out <- do.call(rbind, out)
# remove rownames if original data had none
if (!insight::object_has_rownames(data)) {
rownames(out) <- NULL
}
} else {
out <- .data_arrange(data, select = select, safe = safe)
}
out
}


#' @export
data_arrange.default <- function(data, select = NULL, safe = TRUE) {
data_arrange.grouped_df <- function(data, select = NULL, by = NULL, safe = TRUE, ...) {
# extract group variables
grps <- attr(data, "groups", exact = TRUE)
group_variables <- data_remove(grps, ".rows")
# if "by" is not supplied, use group variables
if (is.null(by)) {
by <- colnames(group_variables)
}
# remove information specific to grouped df's
attr(data, "groups") <- NULL
class(data) <- "data.frame"
data_arrange(data = data, select = select, by = by, safe = safe, ...)
}


# utilities ----------------------

.data_arrange <- function(data, select = NULL, safe = TRUE) {
if (is.null(select) || length(select) == 0) {
return(data)
}
Expand Down Expand Up @@ -98,23 +144,3 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {

out
}



#' @export
data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) {
grps <- attr(data, "groups", exact = TRUE)
grps <- grps[[".rows"]]

out <- lapply(grps, function(x) {
data_arrange.default(data[x, ], select = select, safe = safe)
})

out <- do.call(rbind, out)

if (!insight::object_has_rownames(data)) {
rownames(out) <- NULL
}

out
}
46 changes: 26 additions & 20 deletions R/data_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
#' data frame or a matrix.
#'
#' @param x A (grouped) data frame.
#' @param by Optional character string, indicating the name of a variable in `x`.
#' If supplied, the data will be split by this variable and summary statistics
#' will be computed for each group.
#' @param by Optional character string, indicating the names of one or more
#' variables in the data frame. If supplied, the data will be split by these
#' variables and summary statistics will be computed for each group.
#' @param remove_na Logical. If `TRUE`, missing values are omitted from the
#' grouping variable. If `FALSE` (default), missing values are included as a
#' level in the grouping variable.
Expand Down Expand Up @@ -84,23 +84,8 @@ data_summary.data.frame <- function(x, ..., by = NULL, remove_na = FALSE) {
out <- data.frame(summarise)
colnames(out) <- vapply(summarise, names, character(1))
} else {
# sanity check - is "by" a character string?
if (!is.character(by)) {
insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.")
}
# is "by" in the data?
if (!all(by %in% colnames(x))) {
by_not_found <- by[!by %in% colnames(x)]
insight::format_error(
paste0(
"Variable",
ifelse(length(by_not_found) > 1, "s ", " "),
text_concatenate(by_not_found, enclose = "\""),
" not found in the data."
),
.misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
)
}
# check "by" argument for valid names
.sanitize_by_argument(x, by)
# split data, add NA levels, if requested
l <- lapply(x[by], function(i) {
if (remove_na || !anyNA(i)) {
Expand Down Expand Up @@ -207,6 +192,27 @@ data_summary.grouped_df <- function(x, ..., by = NULL, remove_na = FALSE) {
}


.sanitize_by_argument <- function(x, by) {
# sanity check - is "by" a character string?
if (!is.character(by)) {
insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.")
}
# is "by" in the data?
if (!all(by %in% colnames(x))) {
by_not_found <- by[!by %in% colnames(x)]
insight::format_error(
paste0(
"Variable",
ifelse(length(by_not_found) > 1, "s ", " "),
text_concatenate(by_not_found, enclose = "\""),
" not found in the data."
),
.misspelled_string(colnames(x), by_not_found, "Possibly misspelled?")
)
}
}


# methods ----------------------------------------------------------------------

#' @export
Expand Down
16 changes: 13 additions & 3 deletions man/data_arrange.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/data_summary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 42 additions & 0 deletions tests/testthat/test-data_arrange.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,48 @@ test_that("data_arrange works with grouped df", {
)
})

test_that("data_arrange works with by", {
set.seed(123)
x <- mtcars[sample(seq_len(nrow(mtcars)), 10, replace = TRUE), c("cyl", "mpg")]

expected <- data.frame(
cyl = c(4, 4, 4, 6, 6, 8, 8, 8, 8, 8),
mpg = c(22.8, 30.4, 32.4, 17.8, 19.2, 10.4, 15, 15.2, 15.5, 18.7)
)
rownames(expected) <- c(
"Datsun 710", "Honda Civic", "Fiat 128", "Merc 280C", "Merc 280",
"Cadillac Fleetwood", "Maserati Bora", "Merc 450SLC", "Dodge Challenger",
"Hornet Sportabout"
)

expect_identical(
data_arrange(x, "mpg", by = "cyl"),
expected,
ignore_attr = TRUE
)

# works for df without rownames
set.seed(123)
x <- iris[sample(seq_len(nrow(iris)), 10, replace = TRUE), c("Sepal.Width", "Species")]
rownames(x) <- NULL

expected <- data.frame(
Sepal.Width = c(3, 3, 3.2, 3.3, 2.5, 2.6, 2.6, 3, 3.8, 3.8),
Species = structure(
c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L),
levels = c("setosa", "versicolor", "virginica"),
class = "factor"
)
)
rownames(expected) <- NULL

expect_identical(
data_arrange(x, "Sepal.Width", by = "Species"),
expected,
ignore_attr = TRUE
)
})

test_that("data_arrange works with NA", {
# without groups

Expand Down

0 comments on commit 31c5803

Please sign in to comment.