diff --git a/.dev/test-value_at.R b/.dev/test-value_at.R new file mode 100644 index 000000000..3c8272101 --- /dev/null +++ b/.dev/test-value_at.R @@ -0,0 +1,12 @@ +test_that("value_at", { + data(efc, package = "datawizard") + expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRUE) + expect_equal(value_at(efc$c12hour, 4), NA_real_, ignore_attr = TRUE) + expect_equal(value_at(efc$c12hour, 4, remove_na = TRUE), 168, ignore_attr = TRUE) + expect_equal(value_at(efc$c12hour, 5:7), efc$c12hour[5:7], ignore_attr = TRUE) + expect_equal(value_at(efc$e42dep, 123456, default = 55), 55, ignore_attr = TRUE) + expect_null(value_at(efc$e42dep, 123456)) + expect_null(value_at(efc$e42dep, NULL)) + expect_error(value_at(efc$e42dep, NA), regex = "`position` can't") + expect_error(value_at(efc$e42dep, c(3, NA)), regex = "`position` can't") +}) diff --git a/.dev/value_at.R b/.dev/value_at.R new file mode 100644 index 000000000..cdadc9dc6 --- /dev/null +++ b/.dev/value_at.R @@ -0,0 +1,52 @@ +#' @title Find the value(s) at a specific position in a variable +#' @name value_at +#' +#' @description This function can be used to extract one or more values at a +#' specific position in a variable. +#' +#' @param x A vector or factor. +#' @param position An integer or a vector of integers, indicating the position(s) +#' of the value(s) to be returned. Negative values are counted from the end of +#' the vector. If `NA`, an error is thrown. +#' @param remove_na Logical, if `TRUE`, missing values are removed before +#' computing the position. If `FALSE`, missing values are included in the +#' computation. +#' @param default The value to be returned if the position is out of range. +#' +#' @seealso `data_summary()` to use `value_at()` inside a `data_summary()` call. +#' +#' @return A vector with the value(s) at the specified position(s). +#' +#' @examples +#' data(mtcars) +#' # 5th value +#' value_at(mtcars$mpg, 5) +#' # last value +#' value_at(mtcars$mpg, -1) +#' # out of range, return default +#' value_at(mtcars$mpg, 150) +#' # return 2nd and fifth value +#' value_at(mtcars$mpg, c(2, 5)) +#' @export +value_at <- function(x, position = 1, default = NULL, remove_na = FALSE) { + if (remove_na) { + x <- x[!is.na(x)] + } + n <- length(x) + unlist(lapply(position, .values_at, x = x, n = n, default = default), use.names = FALSE) +} + +# helper ---- + +.values_at <- function(x, position, n, default) { + if (is.na(position)) { + insight::format_error("`position` can't be `NA`.") + } + if (position < 0L) { + position <- position + n + 1 + } + if (position <= 0 || position > n) { + return(default) + } + x[position] +} diff --git a/DESCRIPTION b/DESCRIPTION index 048d15065..eeee80702 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.9.1.4 +Version: 0.9.1.5 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NAMESPACE b/NAMESPACE index e89863807..d10d1884b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,6 +52,10 @@ S3method(data_modify,data.frame) S3method(data_modify,default) S3method(data_modify,grouped_df) S3method(data_peek,data.frame) +S3method(data_summary,data.frame) +S3method(data_summary,default) +S3method(data_summary,grouped_df) +S3method(data_summary,matrix) S3method(data_tabulate,data.frame) S3method(data_tabulate,default) S3method(data_tabulate,grouped_df) @@ -90,6 +94,7 @@ S3method(plot,visualisation_recipe) S3method(print,data_codebook) S3method(print,data_seek) S3method(print,dw_data_peek) +S3method(print,dw_data_summary) S3method(print,dw_data_tabulate) S3method(print,dw_data_tabulates) S3method(print,dw_data_xtabulate) @@ -249,6 +254,7 @@ export(data_rotate) export(data_seek) export(data_select) export(data_separate) +export(data_summary) export(data_tabulate) export(data_to_long) export(data_to_wide) diff --git a/NEWS.md b/NEWS.md index 968c9d417..8b6ba88da 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # datawizard 0.9.2 +NEW FUNCTIONS + +* `data_summary()`, to compute summary statistics of (grouped) data frames. + CHANGES * `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify diff --git a/R/adjust.R b/R/adjust.R index 01b1b5a2c..821821864 100644 --- a/R/adjust.R +++ b/R/adjust.R @@ -124,11 +124,11 @@ adjust <- function(data, predictors[predictors == predictors_num] <- paste0("s(", predictors_num, ")") } formula_predictors <- paste(c("1", predictors), collapse = " + ") - formula <- paste(var, "~", formula_predictors) + model_formula <- paste(var, "~", formula_predictors) x <- .model_adjust_for( data = data[unique(c(var, effect, facs))], - formula, + model_formula = model_formula, multilevel = multilevel, additive = additive, bayesian = bayesian, @@ -148,7 +148,7 @@ data_adjust <- adjust #' @keywords internal .model_adjust_for <- function(data, - formula, + model_formula, multilevel = FALSE, additive = FALSE, bayesian = FALSE, @@ -159,32 +159,28 @@ data_adjust <- adjust # Bayesian if (bayesian) { insight::check_if_installed("rstanarm") - model <- rstanarm::stan_gamm4(stats::as.formula(formula), random = formula_random, data = data, refresh = 0) + model <- rstanarm::stan_gamm4(stats::as.formula(model_formula), random = formula_random, data = data, refresh = 0) # Frequentist } else { insight::check_if_installed("gamm4") - model <- gamm4::gamm4(stats::as.formula(formula), random = formula_random, data = data) + model <- gamm4::gamm4(stats::as.formula(model_formula), random = formula_random, data = data) } # Linear ------------------------- - } else { + } else if (bayesian) { # Bayesian - if (bayesian) { - insight::check_if_installed("rstanarm") - if (multilevel) { - model <- rstanarm::stan_lmer(paste(formula, formula_random), data = data, refresh = 0) - } else { - model <- rstanarm::stan_glm(formula, data = data, refresh = 0) - } - # Frequentist + insight::check_if_installed("rstanarm") + if (multilevel) { + model <- rstanarm::stan_lmer(paste(model_formula, formula_random), data = data, refresh = 0) } else { - if (multilevel) { - insight::check_if_installed("lme4") - model <- lme4::lmer(paste(formula, formula_random), data = data) - } else { - model <- stats::lm(formula, data = data) - } + model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0) } + } else if (multilevel) { + # Frequentist + insight::check_if_installed("lme4") + model <- lme4::lmer(paste(model_formula, formula_random), data = data) + } else { + model <- stats::lm(model_formula, data = data) } adjusted <- insight::get_residuals(model) diff --git a/R/assign_labels.R b/R/assign_labels.R index 9390101ad..bd35513bf 100644 --- a/R/assign_labels.R +++ b/R/assign_labels.R @@ -79,7 +79,7 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) { attr(x, "label") <- variable } else { insight::format_error( - "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." + "Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." # nolint ) } } @@ -88,13 +88,13 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) { if (!is.null(values)) { # extract unique values unique_values <- as.vector(sort(stats::na.omit(unique(x)))) - labels <- NULL + value_labels <- NULL # do we have a names vector for "values"? # else check if number of labels and values match if (is.null(names(values))) { if (length(values) == length(unique_values)) { - labels <- stats::setNames(unique_values, values) + value_labels <- stats::setNames(unique_values, values) } else { insight::format_error( "Cannot add labels. Number of unique values and number of value labels are not equal.", @@ -114,11 +114,11 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) { if (length(values)) { # we need to switch names and values - labels <- stats::setNames(coerce_to_numeric(names(values)), values) + value_labels <- stats::setNames(coerce_to_numeric(names(values)), values) } } - attr(x, "labels") <- labels + attr(x, "labels") <- value_labels } x diff --git a/R/categorize.R b/R/categorize.R index d440069d8..341d0c0c9 100644 --- a/R/categorize.R +++ b/R/categorize.R @@ -145,28 +145,8 @@ categorize.numeric <- function(x, labels = NULL, verbose = TRUE, ...) { - # check arguments - if (is.character(split)) { - split <- match.arg( - split, - choices = c( - "median", "mean", "quantile", "equal_length", "equal_range", - "equal", "equal_distance", "range", "distance" - ) - ) - } - - if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) { - insight::format_error( - "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified." - ) - } - - if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) { - insight::format_error( - "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified." - ) - } + # sanity check + split <- .sanitize_split_arg(split, n_groups, range) # handle aliases if (identical(split, "equal_length")) split <- "length" @@ -221,28 +201,7 @@ categorize.numeric <- function(x, original_x[!is.na(original_x)] <- out # turn into factor? - if (!is.null(labels)) { - if (length(labels) == length(unique(out))) { - original_x <- as.factor(original_x) - levels(original_x) <- labels - } else if (length(labels) == 1 && labels %in% c("mean", "median")) { - original_x <- as.factor(original_x) - no_na_x <- original_x[!is.na(original_x)] - if (labels == "mean") { - labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x - } else { - labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x - } - levels(original_x) <- insight::format_value(labels, ...) - } else if (isTRUE(verbose)) { - insight::format_warning( - "Argument `labels` and levels of the recoded variable are not of the same length.", - "Variable will not be converted to factor." - ) - } - } - - original_x + .original_x_to_factor(original_x, x, labels, out, verbose, ...) } @@ -283,15 +242,15 @@ categorize.data.frame <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, append_suffix = "_r" ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x[select] <- lapply( @@ -342,15 +301,15 @@ categorize.grouped_df <- function(x, # create the new variables and updates "select", so new variables are processed if (!isFALSE(append)) { # process arguments - args <- .process_append( + my_args <- .process_append( x, select, append, append_suffix = "_r" ) # update processed arguments - x <- args$x - select <- args$select + x <- my_args$x + select <- my_args$select } x <- as.data.frame(x) @@ -387,3 +346,56 @@ categorize.grouped_df <- function(x, } seq(lowest, max(x), by = range) } + + +.sanitize_split_arg <- function(split, n_groups, range) { + # check arguments + if (is.character(split)) { + split <- match.arg( + split, + choices = c( + "median", "mean", "quantile", "equal_length", "equal_range", + "equal", "equal_distance", "range", "distance" + ) + ) + } + + if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) { + insight::format_error( + "Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified." + ) + } + + if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) { + insight::format_error( + "Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified." + ) + } + + split +} + + +.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) { + if (!is.null(labels)) { + if (length(labels) == length(unique(out))) { + original_x <- as.factor(original_x) + levels(original_x) <- labels + } else if (length(labels) == 1 && labels %in% c("mean", "median")) { + original_x <- as.factor(original_x) + no_na_x <- original_x[!is.na(original_x)] + if (labels == "mean") { + labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x + } else { + labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x + } + levels(original_x) <- insight::format_value(labels, ...) + } else if (isTRUE(verbose)) { + insight::format_warning( + "Argument `labels` and levels of the recoded variable are not of the same length.", + "Variable will not be converted to factor." + ) + } + } + original_x +} diff --git a/R/data_modify.R b/R/data_modify.R index 40a186736..c9b9d035a 100644 --- a/R/data_modify.R +++ b/R/data_modify.R @@ -177,69 +177,14 @@ data_modify.data.frame <- function(data, ..., .if = NULL, .at = NULL, .modify = } for (i in seq_along(dots)) { - # iterate expressions for new variables - symbol <- dots[[i]] - - # expression is given as character string in a variable, but named, e.g. - # a <- "2 * Sepal.Width" - # data_modify(iris, double_SepWidth = a) - # we reconstruct the symbol as if it were provided as literal expression. - # However, we need to check that we don't have a character vector, - # like: data_modify(iris, new_var = "a") - # this one should be recycled instead. - if (!is.character(symbol)) { - eval_symbol <- .dynEval(symbol, ifnotfound = NULL) - if (is.character(eval_symbol)) { - symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE) - # we may have the edge-case of having a function that returns a character - # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol" - # is of type character, but no symbol, thus str2lang() above creates a - # wrong pattern. We then take "eval_symbol" as character input. - if (inherits(symbol, "try-error")) { - symbol <- str2lang(paste0( - names(dots)[i], - " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")" - )) - } - } - } - - # finally, we can evaluate expression and get values for new variables - new_variable <- try(with(data, eval(symbol)), silent = TRUE) - - # successful, or any errors, like misspelled variable name? - if (inherits(new_variable, "try-error")) { - # in which step did error happen? - step_number <- switch(as.character(i), - "1" = "the first expression", - "2" = "the second expression", - "3" = "the third expression", - paste("expression", i) - ) - step_msg <- paste0("There was an error in ", step_number, ".") - # try to find out which variable was the cause for the error - error_msg <- attributes(new_variable)$condition$message - if (grepl("object '(.*)' not found", error_msg)) { - error_var <- gsub("object '(.*)' not found", "\\1", error_msg) - insight::format_error( - paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."), - .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?") - ) - } else { - insight::format_error(paste0( - step_msg, " ", insight::format_capitalize(error_msg), - ". Possibly misspelled or not yet defined?" - )) - } - } - + # create new variable + new_variable <- .get_new_dots_variable(dots, i, data) # give informative error when new variable doesn't match number of rows if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) { insight::format_error( "New variable has not the same length as the other variables in the data frame and cannot be recycled." ) } - data[[names(dots)[i]]] <- new_variable } } @@ -375,3 +320,70 @@ data_modify.grouped_df <- function(data, ..., .if = NULL, .at = NULL, .modify = data } + +.get_new_dots_variable <- function(dots, i, data) { + # iterate expressions for new variables + symbol <- dots[[i]] + + # expression is given as character string in a variable, but named, e.g. + # a <- "2 * Sepal.Width" + # data_modify(iris, double_SepWidth = a) + # we reconstruct the symbol as if it were provided as literal expression. + # However, we need to check that we don't have a character vector, + # like: data_modify(iris, new_var = "a") + # this one should be recycled instead. + if (!is.character(symbol)) { + eval_symbol <- .dynEval(symbol, ifnotfound = NULL) + if (is.character(eval_symbol)) { + symbol <- try(str2lang(paste0(names(dots)[i], " = ", eval_symbol)), silent = TRUE) + # we may have the edge-case of having a function that returns a character + # vector, like "new_var = sample(letters[1:3])". In this case, "eval_symbol" + # is of type character, but no symbol, thus str2lang() above creates a + # wrong pattern. We then take "eval_symbol" as character input. + if (inherits(symbol, "try-error")) { + symbol <- str2lang(paste0( + names(dots)[i], + " = c(", paste0("\"", eval_symbol, "\"", collapse = ","), ")" + )) + } + } + } + + # finally, we can evaluate expression and get values for new variables + symbol_string <- insight::safe_deparse(symbol) + if (!is.null(symbol_string) && all(symbol_string == "n()")) { + # "special" functions + new_variable <- nrow(data) + } else { + # default evaluation of expression + new_variable <- try(with(data, eval(symbol)), silent = TRUE) + } + + # successful, or any errors, like misspelled variable name? + if (inherits(new_variable, "try-error")) { + # in which step did error happen? + step_number <- switch(as.character(i), + "1" = "the first expression", + "2" = "the second expression", + "3" = "the third expression", + paste("expression", i) + ) + step_msg <- paste0("There was an error in ", step_number, ".") + # try to find out which variable was the cause for the error + error_msg <- attributes(new_variable)$condition$message + if (grepl("object '(.*)' not found", error_msg)) { + error_var <- gsub("object '(.*)' not found", "\\1", error_msg) + insight::format_error( + paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."), + .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?") + ) + } else { + insight::format_error(paste0( + step_msg, " ", insight::format_capitalize(error_msg), + ". Possibly misspelled or not yet defined?" + )) + } + } + + new_variable +} diff --git a/R/data_summary.R b/R/data_summary.R new file mode 100644 index 000000000..8d15f8483 --- /dev/null +++ b/R/data_summary.R @@ -0,0 +1,219 @@ +#' @title Summarize data +#' @name data_summary +#' +#' @description This function can be used to compute summary statistics for a +#' data frame or a matrix. +#' +#' @param x A (grouped) data frame. +#' @param by Optional character string, indicating the name of a variable in `x`. +#' If supplied, the data will be split by this variable and summary statistics +#' will be computed for each group. +#' @param include_na Logical. If `TRUE`, missing values are included as a level +#' in the grouping variable. If `FALSE`, missing values are omitted from the +#' grouping variable. +#' @param ... One or more named expressions that define the new variable name +#' and the function to compute the summary statistic. Example: +#' `mean_sepal_width = mean(Sepal.Width)`. The expression can also be provided +#' as a character string, e.g. `"mean_sepal_width = mean(Sepal.Width)"`. The +#' summary function `n()` can be used to count the number of observations. +#' +#' @return A data frame with the requested summary statistics. +#' +#' @examples +#' data(iris) +#' data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) +#' data_summary( +#' iris, +#' MW = mean(Sepal.Width), +#' SD = sd(Sepal.Width), +#' by = "Species" +#' ) +#' +#' # same as +#' d <- data_group(iris, "Species") +#' data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) +#' +#' # multiple groups +#' data(mtcars) +#' data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear")) +#' +#' # expressions can also be supplied as character strings +#' data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear")) +#' +#' # count observations within groups +#' data_summary(mtcars, observations = n(), by = c("am", "gear")) +#' +#' # first and last observations of "mpg" within groups +#' data_summary( +#' mtcars, +#' first = mpg[1], +#' last = mpg[length(mpg)], +#' by = c("am", "gear") +#' ) +#' @export +data_summary <- function(x, ...) { + UseMethod("data_summary") +} + + +#' @export +data_summary.matrix <- function(x, ..., by = NULL, include_na = TRUE) { + data_summary(as.data.frame(x), ..., by = by, include_na = include_na) +} + + +#' @export +data_summary.default <- function(x, ...) { + insight::format_error("`data_summary()` only works for (grouped) data frames and matrices.") +} + + +#' @rdname data_summary +#' @export +data_summary.data.frame <- function(x, ..., by = NULL, include_na = TRUE) { + dots <- eval(substitute(alist(...))) + + # do we have any expression at all? + if (length(dots) == 0) { + insight::format_error("No expressions for calculating summary statistics provided.") + } + + if (is.null(by)) { + # when we have no grouping, just compute a one-row summary + summarise <- .process_datasummary_dots(dots, x) + out <- data.frame(summarise) + colnames(out) <- vapply(summarise, names, character(1)) + } else { + # sanity check - is "by" a character string? + if (!is.character(by)) { + insight::format_error("Argument `by` must be a character string indicating the name of variables in the data.") + } + # is "by" in the data? + if (!all(by %in% colnames(x))) { + by_not_found <- by[!by %in% colnames(x)] + insight::format_error( + paste0( + "Variable", + ifelse(length(by_not_found) > 1, "s ", " "), + text_concatenate(by_not_found, enclose = "\""), + " not found in the data." + ), + .misspelled_string(colnames(x), by_not_found, "Possibly misspelled?") + ) + } + # split data, add NA levels, if requested + l <- lapply(x[by], function(i) { + if (include_na && anyNA(i)) { + addNA(i) + } else { + i + } + }) + split_data <- split(x, l, drop = TRUE) + out <- lapply(split_data, function(s) { + # no data for combination? Return NULL + if (nrow(s) == 0) { + return(NULL) + } + # summarize data + summarise <- .process_datasummary_dots(dots, s) + # coerce to data frame + summarised_data <- data.frame(summarise) + # bind grouping-variables and values + summarised_data <- cbind(s[1, by], summarised_data) + # make sure we have proper column names + colnames(summarised_data) <- c(by, vapply(summarise, names, character(1))) + summarised_data + }) + out <- do.call(rbind, out) + } + # sort data + out <- data_arrange(out, select = by) + # data attributes + class(out) <- c("dw_data_summary", "data.frame") + rownames(out) <- NULL + out +} + + +#' @export +data_summary.grouped_df <- function(x, ..., by = NULL, include_na = TRUE) { + # extract group variables + grps <- attr(x, "groups", exact = TRUE) + group_variables <- data_remove(grps, ".rows") + # if "by" is not supplied, use group variables + if (is.null(by)) { + by <- colnames(group_variables) + } + # remove information specific to grouped df's + attr(x, "groups") <- NULL + class(x) <- "data.frame" + data_summary(x, ..., by = by, include_na = include_na) +} + + +# helper ----------------------------------------------------------------------- + +.process_datasummary_dots <- function(dots, data) { + out <- NULL + if (length(dots)) { + # we check for character vector of expressions, in which case + # "dots" should be unnamed + if (is.null(names(dots))) { + # if we have multiple strings, concatenate them to a character vector + # and put it into a list... + if (length(dots) > 1) { + if (all(vapply(dots, is.character, logical(1)))) { + dots <- list(unlist(dots)) + } else { + insight::format_error("You cannot mix string and literal representation of expressions.") + } + } + # expression is given as character string, e.g. + # a <- "mean_sepwid = mean(Sepal.Width)" + # data_summary(iris, a, by = "Species") + # or as character vector, e.g. + # data_summary(iris, c("var_a = mean(Sepal.Width)", "var_b = sd(Sepal.Width)")) + character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL) + # do we have a character vector? Then we can proceed + if (is.character(character_symbol)) { + dots <- lapply(character_symbol, function(s) { + # turn value from character vector into expression + str2lang(.dynEval(s)) + }) + names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1)) + } + } + + out <- lapply(seq_along(dots), function(i) { + new_variable <- .get_new_dots_variable(dots, i, data) + stats::setNames(new_variable, names(dots)[i]) + }) + } + + # check for correct length of output - must be a single value! + if (any(lengths(out) != 1)) { + insight::format_error( + paste0( + "Each expression must return a single value. Following expression", + ifelse(sum(lengths(out) != 1) > 1, "s", " "), + " returned more than one value: ", + text_concatenate(vapply(dots[lengths(out) != 1], insight::safe_deparse, character(1)), enclose = "\"") + ) + ) + } + + out +} + + +# methods ---------------------------------------------------------------------- + +#' @export +print.dw_data_summary <- function(x, ...) { + if (nrow(x) == 0) { + cat("No matches found.\n") + } else { + cat(insight::export_table(x, missing = "", ...)) + } +} diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 2adc0768c..0062b8a5a 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -61,6 +61,7 @@ reference: Functions to compute statistical summaries of data properties and distributions contents: - data_codebook + - data_summary - data_tabulate - data_peek - data_seek diff --git a/man/data_summary.Rd b/man/data_summary.Rd new file mode 100644 index 000000000..ccbf4c524 --- /dev/null +++ b/man/data_summary.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_summary.R +\name{data_summary} +\alias{data_summary} +\alias{data_summary.data.frame} +\title{Summarize data} +\usage{ +data_summary(x, ...) + +\method{data_summary}{data.frame}(x, ..., by = NULL, include_na = TRUE) +} +\arguments{ +\item{x}{A (grouped) data frame.} + +\item{...}{One or more named expressions that define the new variable name +and the function to compute the summary statistic. Example: +\code{mean_sepal_width = mean(Sepal.Width)}. The expression can also be provided +as a character string, e.g. \code{"mean_sepal_width = mean(Sepal.Width)"}. The +summary function \code{n()} can be used to count the number of observations.} + +\item{by}{Optional character string, indicating the name of a variable in \code{x}. +If supplied, the data will be split by this variable and summary statistics +will be computed for each group.} + +\item{include_na}{Logical. If \code{TRUE}, missing values are included as a level +in the grouping variable. If \code{FALSE}, missing values are omitted from the +grouping variable.} +} +\value{ +A data frame with the requested summary statistics. +} +\description{ +This function can be used to compute summary statistics for a +data frame or a matrix. +} +\examples{ +data(iris) +data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) +data_summary( + iris, + MW = mean(Sepal.Width), + SD = sd(Sepal.Width), + by = "Species" +) + +# same as +d <- data_group(iris, "Species") +data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) + +# multiple groups +data(mtcars) +data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear")) + +# expressions can also be supplied as character strings +data_summary(mtcars, "MW = mean(mpg)", "SD = sd(mpg)", by = c("am", "gear")) + +# count observations within groups +data_summary(mtcars, observations = n(), by = c("am", "gear")) + +# first and last observations of "mpg" within groups +data_summary( + mtcars, + first = mpg[1], + last = mpg[length(mpg)], + by = c("am", "gear") +) +} diff --git a/tests/testthat/_snaps/data_summary.md b/tests/testthat/_snaps/data_summary.md new file mode 100644 index 000000000..44b7f3f4c --- /dev/null +++ b/tests/testthat/_snaps/data_summary.md @@ -0,0 +1,55 @@ +# data_summary, print + + Code + print(out) + Output + am | gear | MW | SD + ------------------------ + 0 | 3 | 16.11 | 3.37 + 0 | 4 | 21.05 | 3.07 + 1 | 4 | 26.27 | 5.41 + 1 | 5 | 21.38 | 6.66 + +# data_summary, with NA + + Code + print(out) + Output + c172code | MW + ---------------- + 1 | 87.12 + 2 | 94.05 + 3 | 75.00 + | 47.80 + +--- + + Code + print(out) + Output + c172code | MW + ---------------- + 1 | 87.12 + 2 | 94.05 + 3 | 75.00 + +--- + + Code + print(out) + Output + e42dep | c172code | MW + -------------------------- + 1 | 2 | 17.00 + 2 | 2 | 34.25 + 3 | 1 | 39.50 + 3 | 2 | 52.44 + 3 | 3 | 52.00 + 3 | | 84.00 + 4 | 1 | 134.75 + 4 | 2 | 119.26 + 4 | 3 | 88.80 + 4 | | 43.29 + | 2 | + | | 7.00 + diff --git a/tests/testthat/test-data_reorder.R b/tests/testthat/test-data_reorder.R index 34a8f5420..463f36040 100644 --- a/tests/testthat/test-data_reorder.R +++ b/tests/testthat/test-data_reorder.R @@ -1,11 +1,11 @@ test_that("data_reorder works as expected", { - expect_equal( - names(data_reorder(iris, c("Species", "Sepal.Length"))), + expect_named( + data_reorder(iris, c("Species", "Sepal.Length")), c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") ) - expect_warning(expect_equal( - names(data_reorder(iris, c("Species", "dupa"))), + expect_warning(expect_named( + data_reorder(iris, c("Species", "dupa")), c("Species", "Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") )) }) @@ -24,5 +24,6 @@ test_that("data_reorder preserves attributes", { a2 <- attributes(out2) # attributes may not be in the same order - expect_true(all(names(a1) %in% names(a2)) && length(a1) == length(a2)) + expect_true(all(names(a1) %in% names(a2))) + expect_length(a1, length(a2)) }) diff --git a/tests/testthat/test-data_summary.R b/tests/testthat/test-data_summary.R new file mode 100644 index 000000000..746d4c51a --- /dev/null +++ b/tests/testthat/test-data_summary.R @@ -0,0 +1,230 @@ +test_that("data_summary, single row summary", { + data(iris) + out <- data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) + expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4) + expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4) +}) + + +test_that("data_summary, single row summary, string expression", { + data(iris) + out <- data_summary(iris, "MW = mean(Sepal.Width)", "SD = sd(Sepal.Width)") + expect_equal(out$MW, mean(iris$Sepal.Width), tolerance = 1e-4) + expect_equal(out$SD, sd(iris$Sepal.Width), tolerance = 1e-4) +}) + + +test_that("data_summary, summary for groups", { + data(iris) + out <- data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = "Species") + expect_equal( + out$MW, + aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width, + tolerance = 1e-4 + ) + expect_equal( + out$SD, + aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width, + tolerance = 1e-4 + ) +}) + + +test_that("data_summary, summary for groups, string expression", { + data(iris) + out <- data_summary( + iris, + "MW = mean(Sepal.Width)", + "SD = sd(Sepal.Width)", + by = "Species" + ) + expect_equal( + out$MW, + aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width, + tolerance = 1e-4 + ) + expect_equal( + out$SD, + aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width, + tolerance = 1e-4 + ) +}) + + +test_that("data_summary, grouped data frames", { + data(iris) + d <- data_group(iris, "Species") + out <- data_summary(d, MW = mean(Sepal.Width), SD = sd(Sepal.Width)) + expect_equal( + out$MW, + aggregate(iris["Sepal.Width"], list(iris$Species), mean)$Sepal.Width, + tolerance = 1e-4 + ) + expect_equal( + out$SD, + aggregate(iris["Sepal.Width"], list(iris$Species), sd)$Sepal.Width, + tolerance = 1e-4 + ) + # "by" overrides groups + data(mtcars) + d <- data_group(mtcars, "gear") + out <- data_summary(d, MW = mean(mpg), SD = sd(mpg), by = "am") + expect_identical( + out$MW, + aggregate(mtcars["mpg"], list(mtcars$am), mean)$mpg + ) +}) + + +test_that("data_summary, summary for multiple groups", { + data(mtcars) + out <- data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear")) + expect_equal( + out$MW, + aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg, + tolerance = 1e-4 + ) + expect_equal( + out$SD, + aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, + tolerance = 1e-4 + ) + x <- data_group(mtcars, c("am", "gear")) + out <- data_summary(x, MW = mean(mpg), SD = sd(mpg)) + expect_equal( + out$MW, + aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg, + tolerance = 1e-4 + ) + expect_equal( + out$SD, + aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, + tolerance = 1e-4 + ) +}) + + +test_that("data_summary, errors", { + data(iris) + data(mtcars) + # "by" must be character + expect_error( + data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = 5), + regex = "Argument `by` must be a character string" + ) + # "by" must be in data + expect_error( + data_summary(iris, MW = mean(Sepal.Width), SD = sd(Sepal.Width), by = "Speceis"), + regex = "Variable \"Speceis\" not" + ) + # by for multiple variables + expect_error( + data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "gear")), + regex = "Variable \"bam\" not" + ) + expect_error( + data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("bam", "geas")), + regex = "Did you mean one of \"am\" or \"gear\"?" + ) + # not a data frame + expect_error( + data_summary(iris$Sepal.Width, MW = mean(Sepal.Width), SD = sd(Sepal.Width)), + regex = "only works for" + ) + # no expressions + expect_error( + data_summary(iris, by = "Species"), + regex = "No expressions for calculating" + ) + # wrong expression + expect_error( + data_summary(mtcars, mw = mesn(mpg), by = "am"), + regex = "There was an error" + ) + # wrong variable name + expect_error( + data_summary(mtcars, n = max(mpeg)), + regex = "There was an error" + ) + # expression returns more than one value + expect_error( + data_summary(mtcars, n = unique(mpg), j = c(min(am), max(am)), by = c("am", "gear")), + regex = "Each expression must return" + ) +}) + + +test_that("data_summary, values_at", { + data(mtcars) + out <- data_summary(mtcars, pos1 = mpg[1], pos_end = mpg[length(mpg)], by = c("am", "gear")) + # same as: + # dplyr::summarise(mtcars, pos1 = dplyr::first(mpg), pos_end = dplyr::last(mpg), .by = c("am", "gear")) + expect_equal(out$pos1, c(21.4, 24.4, 21, 26), tolerance = 1e-3) + expect_equal(out$pos_end, c(19.2, 17.8, 21.4, 15), tolerance = 1e-3) +}) + + +test_that("data_summary, print", { + data(mtcars) + out <- data_summary(mtcars, MW = mean(mpg), SD = sd(mpg), by = c("am", "gear")) + expect_snapshot(print(out)) +}) + + +test_that("data_summary, with NA", { + data(efc, package = "datawizard") + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code") + expect_snapshot(print(out)) + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = "c172code", include_na = FALSE) + expect_snapshot(print(out)) + # sorting for multiple groups + out <- data_summary(efc, MW = mean(c12hour, na.rm = TRUE), by = c("e42dep", "c172code")) + expect_snapshot(print(out)) +}) + + +test_that("data_summary, inside functions", { + foo1 <- function(x, ...) { + datawizard::data_summary(x, ..., by = "Species") + } + + foo2 <- function(x, by, ...) { + datawizard::data_summary(x, ..., by = by) + } + + foo3 <- function(x, by) { + datawizard::data_summary(x, MW = mean(Sepal.Width), by = by) + } + + data(iris) + out1 <- foo1(iris, MW = mean(Sepal.Width)) + out2 <- foo2(iris, by = "Species", MW = mean(Sepal.Width)) + out3 <- foo3(iris, "Species") + expect_equal(out1$MW, out2$MW, tolerance = 1e-4) + expect_equal(out1$MW, out3$MW, tolerance = 1e-4) +}) + + +test_that("data_summary, expression as variable", { + data(mtcars) + a <- "MW = mean(mpg)" + b <- "SD = sd(mpg)" + out <- data_summary(mtcars, a, by = c("am", "gear")) + expect_named(out, c("am", "gear", "MW")) + expect_equal(out$MW, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), mean)$mpg, tolerance = 1e-4) + expect_error( + data_summary(mtcars, a, b, by = c("am", "gear")), + regex = "You cannot mix" + ) + out <- data_summary(mtcars, c(a, b), by = c("am", "gear")) + expect_named(out, c("am", "gear", "MW", "SD")) + expect_equal(out$SD, aggregate(mtcars["mpg"], list(mtcars$am, mtcars$gear), sd)$mpg, tolerance = 1e-4) +}) + + +test_that("data_summary, extra functions", { + data(mtcars) + # n() + out <- data_summary(mtcars, n = n(), by = c("am", "gear")) + expect_identical(out$n, c(15L, 4L, 8L, 5L)) +})