From 1dcec608f908e134e5adb15581b19361af17cd59 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 17:05:14 +0200 Subject: [PATCH] Draft `row_count()` --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 2 + R/row_count.R | 68 ++++++++++++++++++++++ man/row_count.Rd | 99 +++++++++++++++++++++++++++++++++ pkgdown/_pkgdown.yaml | 1 + tests/testthat/test-row_count.R | 25 +++++++++ 7 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 R/row_count.R create mode 100644 man/row_count.Rd create mode 100644 tests/testthat/test-row_count.R diff --git a/DESCRIPTION b/DESCRIPTION index 4758f601c..00574ecb1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.2 +Version: 0.13.0.4 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NAMESPACE b/NAMESPACE index c435c0cc5..1775af562 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -296,6 +296,7 @@ export(reshape_longer) export(reshape_wider) export(reverse) export(reverse_scale) +export(row_count) export(row_means) export(row_to_colnames) export(rowid_as_column) diff --git a/NEWS.md b/NEWS.md index 388c5a822..b4154449d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ CHANGES variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). +* New function `row_count()`, to calculate row-wise sums of specific values. + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/row_count.R b/R/row_count.R new file mode 100644 index 000000000..7ece22514 --- /dev/null +++ b/R/row_count.R @@ -0,0 +1,68 @@ +#' @title Row means or sums (optionally with minimum amount of valid values) +#' @name row_count +#' @description `row_count()` mimics base R's `rowSums()`, with sums for a +#' specific value indicated by `count`. Hence, it is equivalent to +#' `rowSums(x == count, na.rm = TRUE)`. +#' +#' @param data A data frame with at least two columns, where number of specific +#' values are counted row-wise. +#' @param count The value for which the row sum should be computed. May be a +#' numeric value, a character string (for factors or character vectors), `NA` or +#' `Inf`. +#' @inheritParams extract_column_names +#' @inheritParams row_means +#' +#' @return A vector with row-wise counts of values specified in `count`. +#' +#' @examples +#' dat <- data.frame( +#' c1 = c(1, 2, NA, 4), +#' c2 = c(NA, 2, NA, 5), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, 8) +#' ) +#' +#' # count all 2s per row +#' row_count(dat, count = 2) +#' # count all missing values per row +#' row_count(dat, count = NA) +#' +#' @export +row_count <- function(data, + select = NULL, + exclude = NULL, + count = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + if (is.null(count)) { + insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.") + } + + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # check if we have a data framme with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + + # special case: count missing + if (is.na(count)) { + rowSums(is.na(data)) + } else { + rowSums(data == count, na.rm = TRUE) + } +} diff --git a/man/row_count.Rd b/man/row_count.Rd new file mode 100644 index 000000000..820baad8c --- /dev/null +++ b/man/row_count.Rd @@ -0,0 +1,99 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/row_count.R +\name{row_count} +\alias{row_count} +\title{Row means or sums (optionally with minimum amount of valid values)} +\usage{ +row_count( + data, + select = NULL, + exclude = NULL, + count = NULL, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{data}{A data frame with at least two columns, where number of specific +values are counted row-wise.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. +\item or a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{count}{The value for which the row sum should be computed. May be a +numeric value, a character string (for factors or character vectors), \code{NA} or +\code{Inf}.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains("")} or \code{select = regex("")}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{verbose}{Toggle warnings.} +} +\value{ +A vector with row-wise counts of values specified in \code{count}. +} +\description{ +\code{row_count()} mimics base R's \code{rowSums()}, with sums for a +specific value indicated by \code{count}. Hence, it is equivalent to +\code{rowSums(x == count, na.rm = TRUE)}. +} +\examples{ +dat <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) +) + +# count all 2s per row +row_count(dat, count = 2) +# count all missing values per row +row_count(dat, count = NA) + +} diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index 6e6feb5b2..31ec901d0 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -71,6 +71,7 @@ reference: - kurtosis - smoothness - skewness + - row_count - row_means - weighted_mean - mean_sd diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R new file mode 100644 index 000000000..f40c7f69b --- /dev/null +++ b/tests/testthat/test-row_count.R @@ -0,0 +1,25 @@ +test_that("row_count", { + d_mn <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) + ) + expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1)) + d_mn <- data.frame( + c1 = c("a", "b", NA, "c"), + c2 = c(NA, "b", NA, "d"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) + ) + expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) +}) + +test_that("row_means, errors or messages", { + data(iris) + expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") + expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "no columns") + expect_error(row_count(iris[1], count = 3), regex = "with at least") +})