From 8ab24547120f07c4267b716dc9aedaccff8eb782 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 10 Oct 2024 21:57:27 +0200 Subject: [PATCH 1/2] Rename `drop_na` into `remove_na` in `data_match()` (#556) * Rename drop_na into remove_na? Fixes #554 * news, desc * typo --- DESCRIPTION | 2 +- NEWS.md | 5 +++++ R/data_group.R | 2 +- R/data_match.R | 19 ++++++++++++++++--- man/data_match.Rd | 14 ++++++++++++-- tests/testthat/test-data_match.R | 4 ++-- 6 files changed, 37 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4758f601c..20764a09e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.2 +Version: 0.13.0.5 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index 388c5a822..08e3527b3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ # datawizard (development) +BREAKING CHANGES + +* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na` + instead. + CHANGES * The `select` argument, which is available in different functions to select diff --git a/R/data_group.R b/R/data_group.R index 00a7adf84..538c875c2 100644 --- a/R/data_group.R +++ b/R/data_group.R @@ -51,7 +51,7 @@ data_group <- function(data, to = my_grid[i, , drop = FALSE], match = "and", return_indices = TRUE, - drop_na = FALSE + remove_na = FALSE )) }) my_grid[[".rows"]] <- .rows diff --git a/R/data_match.R b/R/data_match.R index c03b3f222..6b522a0b8 100644 --- a/R/data_match.R +++ b/R/data_match.R @@ -15,7 +15,7 @@ #' @param return_indices Logical, if `FALSE`, return the vector of rows that #' can be used to filter the original data frame. If `FALSE` (default), #' returns directly the filtered data frame instead of the row indices. -#' @param drop_na Logical, if `TRUE`, missing values (`NA`s) are removed before +#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before #' filtering the data. This is the default behaviour, however, sometimes when #' row indices are requested (i.e. `return_indices=TRUE`), it might be useful #' to preserve `NA` values, so returned row indices match the row indices of @@ -26,6 +26,7 @@ #' character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains #' the string representation of a logical expression. These might be useful #' when used in packages to avoid defining undefined global variables. +#' @param drop_na Deprecated, please use `remove_na` instead. #' #' @return A filtered data frame, or the row indices that match the specified #' configuration. @@ -100,12 +101,24 @@ #' data_filter(mtcars, fl) #' @inherit data_rename seealso #' @export -data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) { +data_match <- function(x, + to, + match = "and", + return_indices = FALSE, + remove_na = TRUE, + drop_na, + ...) { if (!is.data.frame(to)) { to <- as.data.frame(to) } original_x <- x + ## TODO: remove deprecated argument later + if (!missing(drop_na)) { + insight::format_warning("Argument `drop_na` is deprecated. Please use `remove_na` instead.") + remove_na <- drop_na + } + # evaluate match <- match.arg(tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not")) match <- switch(match, @@ -133,7 +146,7 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T idx <- vector("numeric", length = 0L) } else { # remove missings before matching - if (isTRUE(drop_na)) { + if (isTRUE(remove_na)) { x <- x[stats::complete.cases(x), , drop = FALSE] } idx <- seq_len(nrow(x)) diff --git a/man/data_match.Rd b/man/data_match.Rd index a57c34768..a209170ab 100644 --- a/man/data_match.Rd +++ b/man/data_match.Rd @@ -5,7 +5,15 @@ \alias{data_filter} \title{Return filtered or sliced data frame, or row indices} \usage{ -data_match(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) +data_match( + x, + to, + match = "and", + return_indices = FALSE, + remove_na = TRUE, + drop_na, + ... +) data_filter(x, ...) } @@ -24,12 +32,14 @@ or \code{"not"} (or \code{"!"}).} can be used to filter the original data frame. If \code{FALSE} (default), returns directly the filtered data frame instead of the row indices.} -\item{drop_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before +\item{remove_na}{Logical, if \code{TRUE}, missing values (\code{NA}s) are removed before filtering the data. This is the default behaviour, however, sometimes when row indices are requested (i.e. \code{return_indices=TRUE}), it might be useful to preserve \code{NA} values, so returned row indices match the row indices of the original data frame.} +\item{drop_na}{Deprecated, please use \code{remove_na} instead.} + \item{...}{A sequence of logical expressions indicating which rows to keep, or a numeric vector indicating the row indices of rows to keep. Can also be a string representation of a logical expression (e.g. \code{"x > 4"}), a diff --git a/tests/testthat/test-data_match.R b/tests/testthat/test-data_match.R index 75991b4b2..1a40f39fd 100644 --- a/tests/testthat/test-data_match.R +++ b/tests/testthat/test-data_match.R @@ -52,7 +52,7 @@ test_that("data_match works with missing data", { data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, - drop_na = FALSE + remove_na = FALSE )) expect_identical(x1, 41L) x1 <- length(data_match( @@ -60,7 +60,7 @@ test_that("data_match works with missing data", { data.frame(c172code = 1, e16sex = 2), match = "not", return_indices = TRUE, - drop_na = TRUE + remove_na = TRUE )) expect_identical(x1, 36L) }) From 5ce207b7169a20fdb4168fd83224d836592e6cca Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 11 Oct 2024 11:43:00 +0200 Subject: [PATCH 2/2] Add `row_count()` to count specific values row-wise (#553) * Draft `row_count()` * docs, type safe comparisons * lintr * apply suggestions * add test * fix test * rename arg * switch TRUE and FALSE * update docs * resolve comment * comments * typo --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 2 + R/row_count.R | 124 ++++++++++++++++++++++++++++++ man/row_count.Rd | 132 ++++++++++++++++++++++++++++++++ pkgdown/_pkgdown.yaml | 1 + tests/testthat/test-row_count.R | 57 ++++++++++++++ 7 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 R/row_count.R create mode 100644 man/row_count.Rd create mode 100644 tests/testthat/test-row_count.R diff --git a/DESCRIPTION b/DESCRIPTION index 20764a09e..8fa9eee94 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.5 +Version: 0.13.0.6 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NAMESPACE b/NAMESPACE index c435c0cc5..1775af562 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -296,6 +296,7 @@ export(reshape_longer) export(reshape_wider) export(reverse) export(reverse_scale) +export(row_count) export(row_means) export(row_to_colnames) export(rowid_as_column) diff --git a/NEWS.md b/NEWS.md index 08e3527b3..da3296536 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,6 +11,8 @@ CHANGES variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). +* New function `row_count()`, to count specific values row-wise. + BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute diff --git a/R/row_count.R b/R/row_count.R new file mode 100644 index 000000000..02b1c16dc --- /dev/null +++ b/R/row_count.R @@ -0,0 +1,124 @@ +#' @title Count specific values row-wise +#' @name row_count +#' @description `row_count()` mimics base R's `rowSums()`, with sums for a +#' specific value indicated by `count`. Hence, it is similar to +#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including +#' strict comparisons. Comparisons using `==` coerce values to atomic vectors, +#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also +#' possible to make "type safe" comparisons using the `allow_coercion` argument, +#' where `"2" == 2` is not true. +#' +#' @param data A data frame with at least two columns, where number of specific +#' values are counted row-wise. +#' @param count The value for which the row sum should be computed. May be a +#' numeric value, a character string (for factors or character vectors), `NA` or +#' `Inf`. +#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same +#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). +#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In +#' order to count factor levels in the data, use `count = factor("level")`. See +#' 'Examples'. +#' +#' @inheritParams extract_column_names +#' @inheritParams row_means +#' +#' @return A vector with row-wise counts of values specified in `count`. +#' +#' @examples +#' dat <- data.frame( +#' c1 = c(1, 2, NA, 4), +#' c2 = c(NA, 2, NA, 5), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, 8) +#' ) +#' +#' # count all 4s per row +#' row_count(dat, count = 4) +#' # count all missing values per row +#' row_count(dat, count = NA) +#' +#' dat <- data.frame( +#' c1 = c("1", "2", NA, "3"), +#' c2 = c(NA, "2", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # count all 2s and "2"s per row +#' row_count(dat, count = 2) +#' # only count 2s, but not "2"s +#' row_count(dat, count = 2, allow_coercion = FALSE) +#' +#' dat <- data.frame( +#' c1 = factor(c("1", "2", NA, "3")), +#' c2 = c("2", "1", NA, "3"), +#' c3 = c(NA, 4, NA, NA), +#' c4 = c(2, 3, 7, Inf) +#' ) +#' # find only character "2"s +#' row_count(dat, count = "2", allow_coercion = FALSE) +#' # find only factor level "2"s +#' row_count(dat, count = factor("2"), allow_coercion = FALSE) +#' +#' @export +row_count <- function(data, + select = NULL, + exclude = NULL, + count = NULL, + allow_coercion = TRUE, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE) { + # evaluate arguments + select <- .select_nse(select, + data, + exclude, + ignore_case = ignore_case, + regex = regex, + verbose = verbose + ) + + if (is.null(count)) { + insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.") + } + + if (is.null(select) || length(select) == 0) { + insight::format_error("No columns selected.") + } + + data <- .coerce_to_dataframe(data[select]) + + # check if we have a data framme with at least two columns + if (nrow(data) < 1) { + insight::format_error("`data` must be a data frame with at least one row.") + } + + # check if we have a data framme with at least two columns + if (ncol(data) < 2) { + insight::format_error("`data` must be a data frame with at least two numeric columns.") + } + # special case: count missing + if (is.na(count)) { + rowSums(is.na(data)) + } else { + # comparisons in R using == coerce values into a atomic vector, i.e. + # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or + # "2" == "2" (i.e. we want exact types to be compared only) + if (isFALSE(allow_coercion)) { + # we need the "type" of the count-value - we use class() instead of typeof(), + # because the latter sometimes returns unsuitable classes/types. compare + # typeof(as.Date("2020-01-01")), which returns "double". + count_type <- class(count)[1] + valid_columns <- vapply(data, inherits, TRUE, what = count_type) + # check if any columns left? + if (!any(valid_columns)) { + insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint + } + data <- data[valid_columns] + } + # coerce - we have only valid columns anyway, and we need to coerce factors + # to vectors, else comparison with `==` errors. + count <- as.vector(count) + # finally, count + rowSums(data == count, na.rm = TRUE) + } +} diff --git a/man/row_count.Rd b/man/row_count.Rd new file mode 100644 index 000000000..7bf54fe5f --- /dev/null +++ b/man/row_count.Rd @@ -0,0 +1,132 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/row_count.R +\name{row_count} +\alias{row_count} +\title{Count specific values row-wise} +\usage{ +row_count( + data, + select = NULL, + exclude = NULL, + count = NULL, + allow_coercion = TRUE, + ignore_case = FALSE, + regex = FALSE, + verbose = TRUE +) +} +\arguments{ +\item{data}{A data frame with at least two columns, where number of specific +values are counted row-wise.} + +\item{select}{Variables that will be included when performing the required +tasks. Can be either +\itemize{ +\item a variable specified as a literal variable name (e.g., \code{column_name}), +\item a string with the variable name (e.g., \code{"column_name"}), a character +vector of variable names (e.g., \code{c("col1", "col2", "col3")}), or a +character vector of variable names including ranges specified via \code{:} +(e.g., \code{c("col1:col3", "col5")}), +\item a formula with variable names (e.g., \code{~column_1 + column_2}), +\item a vector of positive integers, giving the positions counting from the left +(e.g. \code{1} or \code{c(1, 3, 5)}), +\item a vector of negative integers, giving the positions counting from the +right (e.g., \code{-1} or \code{-1:-3}), +\item one of the following select-helpers: \code{starts_with()}, \code{ends_with()}, +\code{contains()}, a range using \code{:} or \code{regex("")}. \code{starts_with()}, +\code{ends_with()}, and \code{contains()} accept several patterns, e.g +\code{starts_with("Sep", "Petal")}. +\item or a function testing for logical conditions, e.g. \code{is.numeric()} (or +\code{is.numeric}), or any user-defined function that selects the variables +for which the function returns \code{TRUE} (like: \code{foo <- function(x) mean(x) > 3}), +\item ranges specified via literal variable names, select-helpers (except +\code{regex()}) and (user-defined) functions can be negated, i.e. return +non-matching elements, when prefixed with a \code{-}, e.g. \code{-ends_with("")}, +\code{-is.numeric} or \code{-(Sepal.Width:Petal.Length)}. \strong{Note:} Negation means +that matches are \emph{excluded}, and thus, the \code{exclude} argument can be +used alternatively. For instance, \code{select=-ends_with("Length")} (with +\code{-}) is equivalent to \code{exclude=ends_with("Length")} (no \code{-}). In case +negation should not work as expected, use the \code{exclude} argument instead. +} + +If \code{NULL}, selects all columns. Patterns that found no matches are silently +ignored, e.g. \code{extract_column_names(iris, select = c("Species", "Test"))} +will just return \code{"Species"}.} + +\item{exclude}{See \code{select}, however, column names matched by the pattern +from \code{exclude} will be excluded instead of selected. If \code{NULL} (the default), +excludes no columns.} + +\item{count}{The value for which the row sum should be computed. May be a +numeric value, a character string (for factors or character vectors), \code{NA} or +\code{Inf}.} + +\item{allow_coercion}{Logical. If \code{FALSE}, \code{count} matches only values of same +class (i.e. when \code{count = 2}, the value \code{"2"} is not counted and vice versa). +By default, when \code{allow_coercion = TRUE}, \code{count = 2} also matches \code{"2"}. In +order to count factor levels in the data, use \code{count = factor("level")}. See +'Examples'.} + +\item{ignore_case}{Logical, if \code{TRUE} and when one of the select-helpers or +a regular expression is used in \code{select}, ignores lower/upper case in the +search pattern when matching against variable names.} + +\item{regex}{Logical, if \code{TRUE}, the search pattern from \code{select} will be +treated as regular expression. When \code{regex = TRUE}, select \emph{must} be a +character string (or a variable containing a character string) and is not +allowed to be one of the supported select-helpers or a character vector +of length > 1. \code{regex = TRUE} is comparable to using one of the two +select-helpers, \code{select = contains("")} or \code{select = regex("")}, however, +since the select-helpers may not work when called from inside other +functions (see 'Details'), this argument may be used as workaround.} + +\item{verbose}{Toggle warnings.} +} +\value{ +A vector with row-wise counts of values specified in \code{count}. +} +\description{ +\code{row_count()} mimics base R's \code{rowSums()}, with sums for a +specific value indicated by \code{count}. Hence, it is similar to +\code{rowSums(x == count, na.rm = TRUE)}, but offers some more options, including +strict comparisons. Comparisons using \code{==} coerce values to atomic vectors, +thus both \code{2 == 2} and \code{"2" == 2} are \code{TRUE}. In \code{row_count()}, it is also +possible to make "type safe" comparisons using the \code{allow_coercion} argument, +where \code{"2" == 2} is not true. +} +\examples{ +dat <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) +) + +# count all 4s per row +row_count(dat, count = 4) +# count all missing values per row +row_count(dat, count = NA) + +dat <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# count all 2s and "2"s per row +row_count(dat, count = 2) +# only count 2s, but not "2"s +row_count(dat, count = 2, allow_coercion = FALSE) + +dat <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf) +) +# find only character "2"s +row_count(dat, count = "2", allow_coercion = FALSE) +# find only factor level "2"s +row_count(dat, count = factor("2"), allow_coercion = FALSE) + +} diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index 6e6feb5b2..31ec901d0 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -71,6 +71,7 @@ reference: - kurtosis - smoothness - skewness + - row_count - row_means - weighted_mean - mean_sd diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R new file mode 100644 index 000000000..0c7d67691 --- /dev/null +++ b/tests/testthat/test-row_count.R @@ -0,0 +1,57 @@ +test_that("row_count", { + d_mn <- data.frame( + c1 = c(1, 2, NA, 4), + c2 = c(NA, 2, NA, 5), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, 8) + ) + expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1)) + d_mn <- data.frame( + c1 = c("a", "b", NA, "c"), + c2 = c(NA, "b", NA, "d"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) +}) + +test_that("row_count, errors or messages", { + data(iris) + expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") + expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns") + expect_error(row_count(iris[1], count = 3), regex = "with at least") + expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row") +}) + +test_that("row_count, allow_coercion match", { + d_mn <- data.frame( + c1 = c("1", "2", NA, "3"), + c2 = c(NA, "2", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0)) + expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has") + + # mix character / factor + d_mn <- data.frame( + c1 = factor(c("1", "2", NA, "3")), + c2 = c("2", "1", NA, "3"), + c3 = c(NA, 4, NA, NA), + c4 = c(2, 3, 7, Inf), + stringsAsFactors = FALSE + ) + expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0)) + expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0)) + expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0)) +})