From 830361841f2f628d5381e8d61a1183ead011fc66 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 16:40:30 +0200 Subject: [PATCH] Draft `row_sums()` as complement to `row_means()` --- NAMESPACE | 1 + NEWS.md | 3 + R/row_means.R | 142 ++++++++++++++++++++++---------- man/row_means.Rd | 55 +++++++++---- tests/testthat/test-row_means.R | 11 ++- 5 files changed, 150 insertions(+), 62 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c435c0cc5..1c2edb93a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -297,6 +297,7 @@ export(reshape_wider) export(reverse) export(reverse_scale) export(row_means) +export(row_sums) export(row_to_colnames) export(rowid_as_column) export(rownames_as_column) diff --git a/NEWS.md b/NEWS.md index 388c5a822..d6976d12f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,9 @@ CHANGES variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). +* New function `row_sums()`, to calculate row sums (optionally with minimum + amount of valid values), as complement to `row_means()`. + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/row_means.R b/R/row_means.R index 4d2876c6a..729c800be 100644 --- a/R/row_means.R +++ b/R/row_means.R @@ -1,15 +1,16 @@ -#' @title Row means (optionally with minimum amount of valid values) +#' @title Row means or sums (optionally with minimum amount of valid values) #' @name row_means -#' @description This function is similar to the SPSS `MEAN.n` function and computes -#' row means from a data frame or matrix if at least `min_valid` values of a row are -#' valid (and not `NA`). +#' @description This function is similar to the SPSS `MEAN.n` or `SUM.n` +#' function and computes row means or row sums from a data frame or matrix if at +#' least `min_valid` values of a row are valid (and not `NA`). 
#' -#' @param data A data frame with at least two columns, where row means are applied. +#' @param data A data frame with at least two columns, where row means or row +#' sums are applied. #' @param min_valid Optional, a numeric value of length 1. May either be #' - a numeric value that indicates the amount of valid values per row to -#' calculate the row mean; +#' calculate the row mean or row sum; #' - or a value between `0` and `1`, indicating a proportion of valid values per -#' row to calculate the row mean (see 'Details'). +#' row to calculate the row mean or row sum (see 'Details'). #' - `NULL` (default), in which all cases are considered. #' #' If a row's sum of valid values is less than `min_valid`, `NA` will be returned. @@ -17,21 +18,24 @@ #' used for rounding mean values. Negative values are allowed (see 'Details'). #' By default, `digits = NULL` and no rounding is used. #' @param remove_na Logical, if `TRUE` (default), removes missing (`NA`) values -#' before calculating row means. Only applies if `min_valuid` is not specified. +#' before calculating row means or row sums. Only applies if `min_valid` is not +#' specified. #' @param verbose Toggle warnings. #' @inheritParams extract_column_names #' -#' @return A vector with row means for those rows with at least `n` valid values. +#' @return A vector with row means (for `row_means()`) or row sums (for +#' `row_sums()`) for those rows with at least `n` valid values. #' -#' @details Rounding to a negative number of `digits` means rounding to a power of -#' ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest hundred. -#' For `min_valid`, if not `NULL`, `min_valid` must be a numeric value from `0` -#' to `ncol(data)`. If a row in the data frame has at least `min_valid` -#' non-missing values, the row mean is returned. If `min_valid` is a non-integer -#' value from 0 to 1, `min_valid` is considered to indicate the proportion of -#' required non-missing values per row. 
E.g., if `min_valid = 0.75`, a row must -#' have at least `ncol(data) * min_valid` non-missing values for the row mean -#' to be calculated. See 'Examples'. +#' @details Rounding to a negative number of `digits` means rounding to a power +#' of ten, for example `row_means(df, 3, digits = -2)` rounds to the nearest +#' hundred. For `min_valid`, if not `NULL`, `min_valid` must be a numeric value +#' from `0` to `ncol(data)`. If a row in the data frame has at least `min_valid` +#' non-missing values, the row mean or row sum is returned. If `min_valid` is a +#' non-integer value from 0 to 1, `min_valid` is considered to indicate the +#' proportion of required non-missing values per row. E.g., if +#' `min_valid = 0.75`, a row must have at least `ncol(data) * min_valid` +#' non-missing values for the row mean or row sum to be calculated. See +#' 'Examples'. #' #' @examples #' dat <- data.frame( @@ -49,6 +53,7 @@ #' #' # needs at least 4 non-missing values per row #' row_means(dat, min_valid = 4) # 1 valid return value +#' row_sums(dat, min_valid = 4) # 1 valid return value #' #' # needs at least 3 non-missing values per row #' row_means(dat, min_valid = 3) # 2 valid return values @@ -61,6 +66,7 @@ #' #' # needs at least 50% of non-missing values per row #' row_means(dat, min_valid = 0.5) # 3 valid return values +#' row_sums(dat, min_valid = 0.5) #' #' # needs at least 75% of non-missing values per row #' row_means(dat, min_valid = 0.75) # 2 valid return values @@ -84,34 +90,52 @@ row_means <- function(data, verbose = verbose ) - if (is.null(select) || length(select) == 0) { - insight::format_error("No columns selected.") - } + # prepare data, sanity checks + data <- .prepare_row_data(data, select, min_valid, verbose) - data <- .coerce_to_dataframe(data[select]) + # calculate row means + .row_sums_or_means(data, min_valid, digits, remove_na, fun = "mean") +} - # n must be a numeric, non-missing value - if (!is.null(min_valid) && (all(is.na(min_valid)) || 
!is.numeric(min_valid) || length(min_valid) > 1)) { - insight::format_error("`min_valid` must be a numeric value of length 1.") - } - # make sure we only have numeric values - numeric_columns <- vapply(data, is.numeric, TRUE) - if (!all(numeric_columns)) { - if (verbose) { - insight::format_alert("Only numeric columns are considered for calculation.") - } - data <- data[numeric_columns] - } +#' @rdname row_means +#' @export +row_sums <- function(data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits = NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + # prepare data, sanity checks + data <- .prepare_row_data(data, select, min_valid, verbose) + + # calculate row sums + .row_sums_or_means(data, min_valid, digits, remove_na, fun = "sum") +} - # check if we have a data framme with at least two columns - if (ncol(data) < 2) { - insight::format_error("`data` must be a data frame with at least two numeric columns.") - } - # proceed here if min_valid is not NULL +# helper ------------------------ + +# calculate row means or sums +.row_sums_or_means <- function(data, min_valid, digits, remove_na, fun) { if (is.null(min_valid)) { - out <- rowMeans(data, na.rm = remove_na) + # calculate row means or sums for complete data + out <- switch(fun, + mean = rowMeans(data, na.rm = remove_na), + rowSums(data, na.rm = remove_na) + ) } else { # is 'min_valid' indicating a proportion? 
decimals <- min_valid %% 1 @@ -124,9 +148,12 @@ row_means <- function(data, insight::format_error("`min_valid` must be smaller or equal to number of columns in data frame.") } - # row means + # row means or sums to_na <- rowSums(is.na(data)) > ncol(data) - min_valid - out <- rowMeans(data, na.rm = TRUE) + out <- switch(fun, + mean = rowMeans(data, na.rm = TRUE), + rowSums(data, na.rm = TRUE) + ) out[to_na] <- NA } @@ -137,3 +164,34 @@ row_means <- function(data, out } + + +# check that data is in shape for row means or row sums +.prepare_row_data <- function(data, select, min_valid, verbose) { + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # n must be a numeric, non-missing value + if (!is.null(min_valid) && (all(is.na(min_valid)) || !is.numeric(min_valid) || length(min_valid) > 1)) { + insight::format_error("`min_valid` must be a numeric value of length 1.") + } + + # make sure we only have numeric values + numeric_columns <- vapply(data, is.numeric, TRUE) + if (!all(numeric_columns)) { + if (verbose) { + insight::format_alert("Only numeric columns are considered for calculation.") + } + data <- data[numeric_columns] + } + + # check if we have a data frame with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + + data +} diff --git a/man/row_means.Rd b/man/row_means.Rd index 4d61e8cf2..43d85b5b0 100644 --- a/man/row_means.Rd +++ b/man/row_means.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/row_means.R \name{row_means} \alias{row_means} -\title{Row means (optionally with minimum amount of valid values)} +\alias{row_sums} +\title{Row means or sums (optionally with minimum amount of valid values)} \usage{ row_means( data, @@ -15,9 +16,22 @@ row_means( remove_na = FALSE, verbose = TRUE ) + +row_sums( + data, + select = NULL, + exclude = NULL, + min_valid = NULL, + digits
= NULL, + ignore_case = FALSE, + regex = FALSE, + remove_na = FALSE, + verbose = TRUE +) } \arguments{ -\item{data}{A data frame with at least two columns, where row means are applied.} +\item{data}{A data frame with at least two columns, where row means or row +sums are applied.} \item{select}{Variables that will be included when performing the required tasks. Can be either @@ -60,9 +74,9 @@ excludes no columns.} \item{min_valid}{Optional, a numeric value of length 1. May either be \itemize{ \item a numeric value that indicates the amount of valid values per row to -calculate the row mean; +calculate the row mean or row sum; \item or a value between \code{0} and \code{1}, indicating a proportion of valid values per -row to calculate the row mean (see 'Details'). +row to calculate the row mean or row sum (see 'Details'). \item \code{NULL} (default), in which all cases are considered. } @@ -86,28 +100,31 @@ since the select-helpers may not work when called from inside other functions (see 'Details'), this argument may be used as workaround.} \item{remove_na}{Logical, if \code{TRUE} (default), removes missing (\code{NA}) values -before calculating row means. Only applies if \code{min_valuid} is not specified.} +before calculating row means or row sums. Only applies if \code{min_valid} is not +specified.} \item{verbose}{Toggle warnings.} } \value{ -A vector with row means for those rows with at least \code{n} valid values. +A vector with row means (for \code{row_means()}) or row sums (for +\code{row_sums()}) for those rows with at least \code{n} valid values. } \description{ -This function is similar to the SPSS \code{MEAN.n} function and computes -row means from a data frame or matrix if at least \code{min_valid} values of a row are -valid (and not \code{NA}). 
+This function is similar to the SPSS \code{MEAN.n} or \code{SUM.n} +function and computes row means or row sums from a data frame or matrix if at +least \code{min_valid} values of a row are valid (and not \code{NA}). } \details{ -Rounding to a negative number of \code{digits} means rounding to a power of -ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest hundred. -For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value from \code{0} -to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} -non-missing values, the row mean is returned. If \code{min_valid} is a non-integer -value from 0 to 1, \code{min_valid} is considered to indicate the proportion of -required non-missing values per row. E.g., if \code{min_valid = 0.75}, a row must -have at least \code{ncol(data) * min_valid} non-missing values for the row mean -to be calculated. See 'Examples'. +Rounding to a negative number of \code{digits} means rounding to a power +of ten, for example \code{row_means(df, 3, digits = -2)} rounds to the nearest +hundred. For \code{min_valid}, if not \code{NULL}, \code{min_valid} must be a numeric value +from \code{0} to \code{ncol(data)}. If a row in the data frame has at least \code{min_valid} +non-missing values, the row mean or row sum is returned. If \code{min_valid} is a +non-integer value from 0 to 1, \code{min_valid} is considered to indicate the +proportion of required non-missing values per row. E.g., if +\code{min_valid = 0.75}, a row must have at least \code{ncol(data) * min_valid} +non-missing values for the row mean or row sum to be calculated. See +'Examples'. 
} \examples{ dat <- data.frame( @@ -125,6 +142,7 @@ row_means(dat, remove_na = TRUE) # needs at least 4 non-missing values per row row_means(dat, min_valid = 4) # 1 valid return value +row_sums(dat, min_valid = 4) # 1 valid return value # needs at least 3 non-missing values per row row_means(dat, min_valid = 3) # 2 valid return values @@ -137,6 +155,7 @@ row_means(dat, select = c("c1", "c3"), min_valid = 1) # needs at least 50\% of non-missing values per row row_means(dat, min_valid = 0.5) # 3 valid return values +row_sums(dat, min_valid = 0.5) # needs at least 75\% of non-missing values per row row_means(dat, min_valid = 0.75) # 2 valid return values diff --git a/tests/testthat/test-row_means.R b/tests/testthat/test-row_means.R index 8d0504c69..4db0d7039 100644 --- a/tests/testthat/test-row_means.R +++ b/tests/testthat/test-row_means.R @@ -1,4 +1,4 @@ -test_that("row_means", { +test_that("row_means/sums", { d_mn <- data.frame( c1 = c(1, 2, NA, 4), c2 = c(NA, 2, NA, 5), @@ -14,14 +14,21 @@ test_that("row_means", { expect_equal(row_means(d_mn, min_valid = 2, digits = 1), c(1.5, 2.8, NA, 5.7), tolerance = 1e-1) expect_message(row_means(iris), regex = "Only numeric") expect_equal(row_means(iris, verbose = FALSE), rowMeans(iris[, 1:4]), tolerance = 1e-3, ignore_attr = TRUE) + expect_equal(row_sums(d_mn, min_valid = 4), c(NA, 11, NA, NA), tolerance = 1e-3) + expect_equal(row_sums(d_mn, min_valid = 3), c(NA, 11, NA, 17), tolerance = 1e-3) + expect_message(row_sums(iris), regex = "Only numeric") }) -test_that("row_means, errors or messages", { +test_that("row_means/sums, errors or messages", { data(iris) expect_error(expect_warning(row_means(iris, select = "abc")), regex = "No columns") + expect_error(expect_warning(row_sums(iris, select = "abc")), regex = "No columns") expect_error(row_means(iris[1], min_valid = 1), regex = "two numeric") expect_error(row_means(iris, min_valid = 1:4), regex = "numeric value") expect_error(row_means(iris, min_valid = "a"), regex = 
"numeric value") expect_message(row_means(iris[1:3, ], min_valid = 3), regex = "Only numeric") expect_silent(row_means(iris[1:3, ], min_valid = 3, verbose = FALSE)) + expect_error(row_sums(iris[1], min_valid = 1), regex = "two numeric") + expect_message(row_sums(iris[1:3, ], min_valid = 3), regex = "Only numeric") + expect_silent(row_sums(iris[1:3, ], min_valid = 3, verbose = FALSE)) })