Skip to content

Commit

Permalink
Draft new data_summary() function (#482)
Browse files Browse the repository at this point in the history
* Draft new `data_summary()` function

* check if we can avoid duplicated code

* pkgdown

* fix

* fixes

* lintr

* add tests

* code style

* desc, news

* fix

* add print method and snapshot test

* add test

* correct english form

* test

* include NA, sort output

* add test

* meaningful code comments

* add test

* allow n()

* docs

* only n()

* informative error for incorrect length, add "value_at()" function

* pkgdown

* docs, test, examples, xref

* add test

* fix tests

* fix

* lintr

* lintr

* lintr

* lintr (reduce complexity)

* fix

* fix

* address comments

* use text_concatenate

* remove value_at, move code into dev-folder

* update docs

* update test
  • Loading branch information
strengejacke authored Mar 4, 2024
1 parent 83f9703 commit 7a1f372
Show file tree
Hide file tree
Showing 15 changed files with 805 additions and 138 deletions.
12 changes: 12 additions & 0 deletions .dev/test-value_at.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
test_that("value_at", {
data(efc, package = "datawizard")
expect_equal(value_at(efc$e42dep, 5), 4, ignore_attr = TRUE)
expect_equal(value_at(efc$c12hour, 4), NA_real_, ignore_attr = TRUE)
expect_equal(value_at(efc$c12hour, 4, remove_na = TRUE), 168, ignore_attr = TRUE)
expect_equal(value_at(efc$c12hour, 5:7), efc$c12hour[5:7], ignore_attr = TRUE)
expect_equal(value_at(efc$e42dep, 123456, default = 55), 55, ignore_attr = TRUE)
expect_null(value_at(efc$e42dep, 123456))
expect_null(value_at(efc$e42dep, NULL))
expect_error(value_at(efc$e42dep, NA), regex = "`position` can't")
expect_error(value_at(efc$e42dep, c(3, NA)), regex = "`position` can't")
})
52 changes: 52 additions & 0 deletions .dev/value_at.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#' @title Find the value(s) at a specific position in a variable
#' @name value_at
#'
#' @description This function can be used to extract one or more values at a
#' specific position in a variable.
#'
#' @param x A vector or factor.
#' @param position An integer or a vector of integers, indicating the position(s)
#' of the value(s) to be returned. Negative values are counted from the end of
#' the vector. If `NA`, an error is thrown.
#' @param remove_na Logical, if `TRUE`, missing values are removed before
#' computing the position. If `FALSE`, missing values are included in the
#' computation.
#' @param default The value to be returned if the position is out of range.
#'
#' @seealso `data_summary()` to use `value_at()` inside a `data_summary()` call.
#'
#' @return A vector with the value(s) at the specified position(s).
#'
#' @examples
#' data(mtcars)
#' # 5th value
#' value_at(mtcars$mpg, 5)
#' # last value
#' value_at(mtcars$mpg, -1)
#' # out of range, return default
#' value_at(mtcars$mpg, 150)
#' # return 2nd and fifth value
#' value_at(mtcars$mpg, c(2, 5))
#' @export
value_at <- function(x, position = 1, default = NULL, remove_na = FALSE) {
if (remove_na) {
x <- x[!is.na(x)]
}
n <- length(x)
unlist(lapply(position, .values_at, x = x, n = n, default = default), use.names = FALSE)
}

# helper ----

.values_at <- function(x, position, n, default) {
if (is.na(position)) {
insight::format_error("`position` can't be `NA`.")
}
if (position < 0L) {
position <- position + n + 1
}
if (position <= 0 || position > n) {
return(default)
}
x[position]
}
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.9.1.4
Version: 0.9.1.5
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ S3method(data_modify,data.frame)
S3method(data_modify,default)
S3method(data_modify,grouped_df)
S3method(data_peek,data.frame)
S3method(data_summary,data.frame)
S3method(data_summary,default)
S3method(data_summary,grouped_df)
S3method(data_summary,matrix)
S3method(data_tabulate,data.frame)
S3method(data_tabulate,default)
S3method(data_tabulate,grouped_df)
Expand Down Expand Up @@ -90,6 +94,7 @@ S3method(plot,visualisation_recipe)
S3method(print,data_codebook)
S3method(print,data_seek)
S3method(print,dw_data_peek)
S3method(print,dw_data_summary)
S3method(print,dw_data_tabulate)
S3method(print,dw_data_tabulates)
S3method(print,dw_data_xtabulate)
Expand Down Expand Up @@ -249,6 +254,7 @@ export(data_rotate)
export(data_seek)
export(data_select)
export(data_separate)
export(data_summary)
export(data_tabulate)
export(data_to_long)
export(data_to_wide)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# datawizard 0.9.2

NEW FUNCTIONS

* `data_summary()`, to compute summary statistics of (grouped) data frames.

CHANGES

* `data_modify()` gets three new arguments, `.at`, `.if` and `.modify`, to modify
Expand Down
36 changes: 16 additions & 20 deletions R/adjust.R
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,11 @@ adjust <- function(data,
predictors[predictors == predictors_num] <- paste0("s(", predictors_num, ")")
}
formula_predictors <- paste(c("1", predictors), collapse = " + ")
formula <- paste(var, "~", formula_predictors)
model_formula <- paste(var, "~", formula_predictors)

x <- .model_adjust_for(
data = data[unique(c(var, effect, facs))],
formula,
model_formula = model_formula,
multilevel = multilevel,
additive = additive,
bayesian = bayesian,
Expand All @@ -148,7 +148,7 @@ data_adjust <- adjust

#' @keywords internal
.model_adjust_for <- function(data,
formula,
model_formula,
multilevel = FALSE,
additive = FALSE,
bayesian = FALSE,
Expand All @@ -159,32 +159,28 @@ data_adjust <- adjust
# Bayesian
if (bayesian) {
insight::check_if_installed("rstanarm")
model <- rstanarm::stan_gamm4(stats::as.formula(formula), random = formula_random, data = data, refresh = 0)
model <- rstanarm::stan_gamm4(stats::as.formula(model_formula), random = formula_random, data = data, refresh = 0)
# Frequentist
} else {
insight::check_if_installed("gamm4")
model <- gamm4::gamm4(stats::as.formula(formula), random = formula_random, data = data)
model <- gamm4::gamm4(stats::as.formula(model_formula), random = formula_random, data = data)
}

# Linear -------------------------
} else {
} else if (bayesian) {
# Bayesian
if (bayesian) {
insight::check_if_installed("rstanarm")
if (multilevel) {
model <- rstanarm::stan_lmer(paste(formula, formula_random), data = data, refresh = 0)
} else {
model <- rstanarm::stan_glm(formula, data = data, refresh = 0)
}
# Frequentist
insight::check_if_installed("rstanarm")
if (multilevel) {
model <- rstanarm::stan_lmer(paste(model_formula, formula_random), data = data, refresh = 0)
} else {
if (multilevel) {
insight::check_if_installed("lme4")
model <- lme4::lmer(paste(formula, formula_random), data = data)
} else {
model <- stats::lm(formula, data = data)
}
model <- rstanarm::stan_glm(model_formula, data = data, refresh = 0)
}
} else if (multilevel) {
# Frequentist
insight::check_if_installed("lme4")
model <- lme4::lmer(paste(model_formula, formula_random), data = data)
} else {
model <- stats::lm(model_formula, data = data)
}

adjusted <- insight::get_residuals(model)
Expand Down
10 changes: 5 additions & 5 deletions R/assign_labels.R
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {
attr(x, "label") <- variable
} else {
insight::format_error(
"Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`."
"Variable labels (argument `variable`) must be provided as a single character string, e.g. `variable = \"mylabel\"`." # nolint
)
}
}
Expand All @@ -88,13 +88,13 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {
if (!is.null(values)) {
# extract unique values
unique_values <- as.vector(sort(stats::na.omit(unique(x))))
labels <- NULL
value_labels <- NULL

# do we have a names vector for "values"?
# else check if number of labels and values match
if (is.null(names(values))) {
if (length(values) == length(unique_values)) {
labels <- stats::setNames(unique_values, values)
value_labels <- stats::setNames(unique_values, values)
} else {
insight::format_error(
"Cannot add labels. Number of unique values and number of value labels are not equal.",
Expand All @@ -114,11 +114,11 @@ assign_labels.numeric <- function(x, variable = NULL, values = NULL, ...) {

if (length(values)) {
# we need to switch names and values
labels <- stats::setNames(coerce_to_numeric(names(values)), values)
value_labels <- stats::setNames(coerce_to_numeric(names(values)), values)
}
}

attr(x, "labels") <- labels
attr(x, "labels") <- value_labels
}

x
Expand Down
112 changes: 62 additions & 50 deletions R/categorize.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,28 +145,8 @@ categorize.numeric <- function(x,
labels = NULL,
verbose = TRUE,
...) {
# check arguments
if (is.character(split)) {
split <- match.arg(
split,
choices = c(
"median", "mean", "quantile", "equal_length", "equal_range",
"equal", "equal_distance", "range", "distance"
)
)
}

if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) {
insight::format_error(
"Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified."
)
}

if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) {
insight::format_error(
"Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified."
)
}
# sanity check
split <- .sanitize_split_arg(split, n_groups, range)

# handle aliases
if (identical(split, "equal_length")) split <- "length"
Expand Down Expand Up @@ -221,28 +201,7 @@ categorize.numeric <- function(x,
original_x[!is.na(original_x)] <- out

# turn into factor?
if (!is.null(labels)) {
if (length(labels) == length(unique(out))) {
original_x <- as.factor(original_x)
levels(original_x) <- labels
} else if (length(labels) == 1 && labels %in% c("mean", "median")) {
original_x <- as.factor(original_x)
no_na_x <- original_x[!is.na(original_x)]
if (labels == "mean") {
labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
} else {
labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
}
levels(original_x) <- insight::format_value(labels, ...)
} else if (isTRUE(verbose)) {
insight::format_warning(
"Argument `labels` and levels of the recoded variable are not of the same length.",
"Variable will not be converted to factor."
)
}
}

original_x
.original_x_to_factor(original_x, x, labels, out, verbose, ...)
}


Expand Down Expand Up @@ -283,15 +242,15 @@ categorize.data.frame <- function(x,
# create the new variables and updates "select", so new variables are processed
if (!isFALSE(append)) {
# process arguments
args <- .process_append(
my_args <- .process_append(
x,
select,
append,
append_suffix = "_r"
)
# update processed arguments
x <- args$x
select <- args$select
x <- my_args$x
select <- my_args$select
}

x[select] <- lapply(
Expand Down Expand Up @@ -342,15 +301,15 @@ categorize.grouped_df <- function(x,
# create the new variables and updates "select", so new variables are processed
if (!isFALSE(append)) {
# process arguments
args <- .process_append(
my_args <- .process_append(
x,
select,
append,
append_suffix = "_r"
)
# update processed arguments
x <- args$x
select <- args$select
x <- my_args$x
select <- my_args$select
}

x <- as.data.frame(x)
Expand Down Expand Up @@ -387,3 +346,56 @@ categorize.grouped_df <- function(x,
}
seq(lowest, max(x), by = range)
}


.sanitize_split_arg <- function(split, n_groups, range) {
# check arguments
if (is.character(split)) {
split <- match.arg(
split,
choices = c(
"median", "mean", "quantile", "equal_length", "equal_range",
"equal", "equal_distance", "range", "distance"
)
)
}

if (is.character(split) && split %in% c("quantile", "equal_length") && is.null(n_groups)) {
insight::format_error(
"Recoding based on quantiles or equal-sized groups requires the `n_groups` argument to be specified."
)
}

if (is.character(split) && split == "equal_range" && is.null(n_groups) && is.null(range)) {
insight::format_error(
"Recoding into groups with equal range requires either the `range` or `n_groups` argument to be specified."
)
}

split
}


.original_x_to_factor <- function(original_x, x, labels, out, verbose, ...) {
if (!is.null(labels)) {
if (length(labels) == length(unique(out))) {
original_x <- as.factor(original_x)
levels(original_x) <- labels
} else if (length(labels) == 1 && labels %in% c("mean", "median")) {
original_x <- as.factor(original_x)
no_na_x <- original_x[!is.na(original_x)]
if (labels == "mean") {
labels <- stats::aggregate(x, list(no_na_x), FUN = mean, na.rm = TRUE)$x
} else {
labels <- stats::aggregate(x, list(no_na_x), FUN = stats::median, na.rm = TRUE)$x
}
levels(original_x) <- insight::format_value(labels, ...)
} else if (isTRUE(verbose)) {
insight::format_warning(
"Argument `labels` and levels of the recoded variable are not of the same length.",
"Variable will not be converted to factor."
)
}
}
original_x
}
Loading

0 comments on commit 7a1f372

Please sign in to comment.