Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into unnormalize-grouped-data
Browse files Browse the repository at this point in the history
etiennebacher authored Sep 10, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents 957dd35 + ae7df24 commit 04d08b3
Showing 17 changed files with 608 additions and 212 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.8.0.8
Version: 0.8.0.12
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -279,9 +279,9 @@ export(reshape_longer)
export(reshape_wider)
export(reverse)
export(reverse_scale)
export(row_means)
export(row_to_colnames)
export(rowid_as_column)
export(rowmean_n)
export(rownames_as_column)
export(skewness)
export(slide)
15 changes: 12 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -2,10 +2,10 @@

NEW FUNCTIONS

* `contr.deviation()` for sum-deviation contrast coding of factors.
* `row_means()`, to compute row means, optionally only for the rows with at
least `min_valid` non-missing values.

* `rowmean_n()`, to compute row means if row contains at least `n` non-missing
values.
* `contr.deviation()` for sum-deviation contrast coding of factors.

* `means_by_group()`, to compute mean values of variables, grouped by levels
of specified factors.
@@ -15,6 +15,9 @@ CHANGES
* `recode_into()` gains an `overwrite` argument to skip overwriting already
recoded cases when multiple recode patterns apply to the same case.

* `recode_into()` gains an `preserve_na` argument to preserve `NA` values
when recoding.

* `data_read()` now passes the `encoding` argument to `data.table::fread()`.
This allows to read files with non-ASCII characters.

@@ -24,12 +27,18 @@ CHANGES

BUG FIXES

* Fixed issue in `labels_to_levels()` when values of labels were not in sorted
order and values were not sequentially numbered.

* Fixed issues in `data_write()` when writing labelled data into SPSS format
and vectors were of different type as value labels.

* Fixed issue in `recode_into()` with probably wrong case number printed in the
warning when several recode patterns match to one case.

* Fixed issue in `recode_into()` when original data contained `NA` values and
`NA` was not included in the recode pattern.

* Fixed issue in `data_filter()` where functions containing a `=` (e.g. when
naming arguments, like `grepl(pattern, x = a)`) were mistakenly seen as
faulty syntax.
File renamed without changes.
File renamed without changes.
65 changes: 63 additions & 2 deletions R/recode_into.r → R/recode_into.R
Original file line number Diff line number Diff line change
@@ -20,6 +20,13 @@
#' recode patterns. If `FALSE`, former recoded cases will not be altered by later
#' recode patterns that would apply to those cases again. A warning message is
#' printed to alert such situations and to avoid unintentional recodings.
#' @param preserve_na Logical, if `TRUE` and `default` is not `NA`, missing
#' values in the original variable will be set back to `NA` in the recoded
#' variable (unless overwritten by other recode patterns). If `FALSE`, missing
#' values in the original variable will be recoded to `default`. Setting
#' `preserve_na = TRUE` prevents unintentional overwriting of missing values
#' with `default`, which means that you won't find valid values where the
#' original data only had missing values. See 'Examples'.
#' @param verbose Toggle warnings.
#'
#' @return A vector with recoded values.
@@ -72,8 +79,38 @@
#' data = d,
#' default = 0
#' )
#'
#' # handling of missing values
#' d <- data.frame(
#' x = c(1, NA, 2, NA, 3, 4),
#' y = c(1, 11, 3, NA, 5, 6)
#' )
#' # first NA in x is overwritten by valid value from y
#' # we have no known value for second NA in x and y,
#' # thus we get one NA in the result
#' recode_into(
#' x <= 3 ~ 1,
#' y > 5 ~ 2,
#' data = d,
#' default = 0,
#' preserve_na = TRUE
#' )
#' # first NA in x is overwritten by valid value from y
#' # default value is used for second NA
#' recode_into(
#' x <= 3 ~ 1,
#' y > 5 ~ 2,
#' data = d,
#' default = 0,
#' preserve_na = FALSE
#' )
#' @export
recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbose = TRUE) {
recode_into <- function(...,
data = NULL,
default = NA,
overwrite = TRUE,
preserve_na = FALSE,
verbose = TRUE) {
dots <- list(...)

# get length of vector, so we know the length of the output vector
@@ -124,6 +161,9 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
)
}

# indicator to show message when replacing NA by default
# needed to show message only once
overwrite_NA_msg <- TRUE

# iterate all expressions
for (i in seq_len(n_params)) {
@@ -135,6 +175,12 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
index <- with(data, eval(dots[[i]][[2]]))
value <- with(data, eval(dots[[i]][[3]]))
}
# remember missing values, so we can add back later
missing_index <- is.na(index)
# make sure index has no missing values. when we have missing values in
# original expression, these are considered as "no match" and set to FALSE
# we handle NA value later and thus want to remove them from "index" now
index[is.na(index)] <- FALSE
# overwriting values? do more recode-patterns match the same case?
if (is.na(default)) {
already_exists <- !is.na(out[index])
@@ -144,7 +190,7 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
# save indices of overwritten cases
overwritten_cases <- which(index)[already_exists]
# tell user...
if (any(already_exists) && verbose) {
if (any(already_exists, na.rm = TRUE) && verbose) {
if (overwrite) {
msg <- paste(
"Several recode patterns apply to the same cases.",
@@ -164,7 +210,22 @@ recode_into <- function(..., data = NULL, default = NA, overwrite = TRUE, verbos
if (!overwrite) {
index[overwritten_cases] <- FALSE
}
# write new values into output vector
out[index] <- value
# set back missing values
if (any(missing_index) && !is.na(default)) {
if (preserve_na) {
# but only where we still have default values
# we don't want to overwrite already recoded values with NA
out[missing_index & out == default] <- NA
} else if (overwrite_NA_msg && verbose) {
# don't show msg again
overwrite_NA_msg <- FALSE
insight::format_alert(
"Missing values in original variable are overwritten by default value. If you want to preserve missing values, set `preserve_na = TRUE`."
)
}
}
}

out
139 changes: 139 additions & 0 deletions R/row_means.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#' @title Row means (optionally with minimum amount of valid values)
#' @name row_means
#' @description This function is similar to the SPSS `MEAN.n` function and computes
#' row means from a data frame or matrix if at least `min_valid` values of a row are
#' valid (and not `NA`).
#'
#' @param data A data frame with at least two columns, where row means are applied.
#' @param min_valid Optional, a numeric value of length 1. May either be
#' - a numeric value that indicates the amount of valid values per row to
#' calculate the row mean;
#' - or a value between 0 and 1, indicating a proportion of valid values per
#' row to calculate the row mean (see 'Details').
#' - `NULL` (default), in which all cases are considered.
#'
#' If a row's sum of valid values is less than `min_valid`, `NA` will be returned.
#' @param digits Numeric value indicating the number of decimal places to be
#' used for rounding mean values. Negative values are allowed (see 'Details').
#' By default, `digits = NULL` and no rounding is used.
#' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values
#' before calculating row means. Only applies if `min_valuid` is not specified.
#' @param verbose Toggle warnings.
#' @inheritParams find_columns
#'
#' @return A vector with row means for those rows with at least `n` valid values.
#'
#' @details Rounding to a negative number of `digits` means rounding to a power of
#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred.
#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0`
#' to `ncol(data)`. If a row in the data frame has at least `min_valid`
#' non-missing values, the row mean is returned. If `min_valid` is a non-integer
#' value from 0 to 1, `min_valid` is considered to indicate the proportion of
#' required non-missing values per row. E.g., if `min_valid = 0.75`, a row must
#' have at least `ncol(data) * min_valid` non-missing values for the row mean
#' to be calculated. See 'Examples'.
#'
#' @examples
#' dat <- data.frame(
#' c1 = c(1, 2, NA, 4),
#' c2 = c(NA, 2, NA, 5),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, 8)
#' )
#'
#' # default, all means are shown, if no NA values are present
#' row_means(dat)
#'
#' # remove all NA before computing row means
#' row_means(dat, remove_na = TRUE)
#'
#' # needs at least 4 non-missing values per row
#' row_means(dat, min_valid = 4) # 1 valid return value
#'
#' # needs at least 3 non-missing values per row
#' row_means(dat, min_valid = 3) # 2 valid return values
#'
#' # needs at least 2 non-missing values per row
#' row_means(dat, min_valid = 2)
#'
#' # needs at least 1 non-missing value per row, for two selected variables
#' row_means(dat, select = c("c1", "c3"), min_valid = 1)
#'
#' # needs at least 50% of non-missing values per row
#' row_means(dat, min_valid = 0.5) # 3 valid return values
#'
#' # needs at least 75% of non-missing values per row
#' row_means(dat, min_valid = 0.75) # 2 valid return values
#'
#' @export
row_means <- function(data,
select = NULL,
exclude = NULL,
min_valid = NULL,
digits = NULL,
ignore_case = FALSE,
regex = FALSE,
remove_na = FALSE,
verbose = TRUE) {
# evaluate arguments
select <- .select_nse(select,
data,
exclude,
ignore_case = ignore_case,
regex = regex,
verbose = verbose
)

if (is.null(select) || length(select) == 0) {
insight::format_error("No columns selected.")
}

data <- .coerce_to_dataframe(data[select])

# n must be a numeric, non-missing value
if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) {
insight::format_error("`min_valid` must be a numeric value of length 1.")
}

# make sure we only have numeric values
numeric_columns <- vapply(data, is.numeric, TRUE)
if (!all(numeric_columns)) {
if (verbose) {
insight::format_alert("Only numeric columns are considered for calculation.")
}
data <- data[numeric_columns]
}

# check if we have a data framme with at least two columns
if (ncol(data) < 2) {
insight::format_error("`data` must be a data frame with at least two numeric columns.")
}

# proceed here if min_valid is not NULL
if (!is.null(min_valid)) {
# is 'min_valid' indicating a proportion?
decimals <- min_valid %% 1
if (decimals != 0) {
min_valid <- round(ncol(data) * decimals)
}

# min_valid may not be larger as df's amount of columns
if (ncol(data) < min_valid) {
insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.")
}

# row means
to_na <- rowSums(is.na(data)) > ncol(data) - min_valid
out <- rowMeans(data, na.rm = TRUE)
out[to_na] <- NA
} else {
out <- rowMeans(data, na.rm = remove_na)
}

# round, if requested
if (!is.null(digits) && !all(is.na(digits))) {
out <- round(out, digits = digits)
}

out
}
Loading

0 comments on commit 04d08b3

Please sign in to comment.