Add row_count() to count specific values row-wise (#553)

* Draft `row_count()` * docs, type safe comparisons * lintr * apply suggestions * add test * fix test * rename arg * switch TRUE and FALSE * update docs * resolve comment * comments * typo
easystats · Oct 11, 2024 · 5ce207b · 5ce207b
1 parent 8ab2454
commit 5ce207b
Show file tree

Hide file tree

Showing 7 changed files with 318 additions and 1 deletion.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.5
+Version: 0.13.0.6
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -296,6 +296,7 @@ export(reshape_longer)
 export(reshape_wider)
 export(reverse)
 export(reverse_scale)
+export(row_count)
 export(row_means)
 export(row_to_colnames)
 export(rowid_as_column)

diff --git a/NEWS.md b/NEWS.md
@@ -11,6 +11,8 @@ CHANGES
   variables, can now also be a character vector with quoted variable names,
   including a colon to indicate a range of several variables (e.g. `"cyl:gear"`).
 
+* New function `row_count()`, to count specific values row-wise.
+
 BUG FIXES
 
 * `describe_distribution()` no longer errors if the sample was too sparse to compute

diff --git a/R/row_count.R b/R/row_count.R
@@ -0,0 +1,124 @@
+#' @title Count specific values row-wise
+#' @name row_count
+#' @description `row_count()` mimics base R's `rowSums()`, with sums for a
+#' specific value indicated by `count`. Hence, it is similar to
+#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including
+#' strict comparisons. Comparisons using `==` coerce values to atomic vectors,
+#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also
+#' possible to make "type safe" comparisons using the `allow_coercion` argument,
+#' where `"2" == 2` is not true.
+#'
+#' @param data A data frame with at least two columns, where number of specific
+#' values are counted row-wise.
+#' @param count The value for which the row sum should be computed. May be a
+#' numeric value, a character string (for factors or character vectors), `NA` or
+#' `Inf`.
+#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same
+#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa).
+#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In
+#' order to count factor levels in the data, use `count = factor("level")`. See
+#' 'Examples'.
+#'
+#' @inheritParams extract_column_names
+#' @inheritParams row_means
+#'
+#' @return A vector with row-wise counts of values specified in `count`.
+#'
+#' @examples
+#' dat <- data.frame(
+#'   c1 = c(1, 2, NA, 4),
+#'   c2 = c(NA, 2, NA, 5),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, 8)
+#' )
+#'
+#' # count all 4s per row
+#' row_count(dat, count = 4)
+#' # count all missing values per row
+#' row_count(dat, count = NA)
+#'
+#' dat <- data.frame(
+#'   c1 = c("1", "2", NA, "3"),
+#'   c2 = c(NA, "2", NA, "3"),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, Inf)
+#' )
+#' # count all 2s and "2"s per row
+#' row_count(dat, count = 2)
+#' # only count 2s, but not "2"s
+#' row_count(dat, count = 2, allow_coercion = FALSE)
+#'
+#' dat <- data.frame(
+#'   c1 = factor(c("1", "2", NA, "3")),
+#'   c2 = c("2", "1", NA, "3"),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, Inf)
+#' )
+#' # find only character "2"s
+#' row_count(dat, count = "2", allow_coercion = FALSE)
+#' # find only factor level "2"s
+#' row_count(dat, count = factor("2"), allow_coercion = FALSE)
+#'
+#' @export
+row_count <- function(data,
+                      select = NULL,
+                      exclude = NULL,
+                      count = NULL,
+                      allow_coercion = TRUE,
+                      ignore_case = FALSE,
+                      regex = FALSE,
+                      verbose = TRUE) {
+  # evaluate arguments
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case = ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  if (is.null(count)) {
+    insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.")
+  }
+
+  if (is.null(select) || length(select) == 0) {
+    insight::format_error("No columns selected.")
+  }
+
+  data <- .coerce_to_dataframe(data[select])
+
+  # check if we have a data framme with at least two columns
+  if (nrow(data) < 1) {
+    insight::format_error("`data` must be a data frame with at least one row.")
+  }
+
+  # check if we have a data framme with at least two columns
+  if (ncol(data) < 2) {
+    insight::format_error("`data` must be a data frame with at least two numeric columns.")
+  }
+  # special case: count missing
+  if (is.na(count)) {
+    rowSums(is.na(data))
+  } else {
+    # comparisons in R using == coerce values into a atomic vector, i.e.
+    # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or
+    # "2" == "2" (i.e. we want exact types to be compared only)
+    if (isFALSE(allow_coercion)) {
+      # we need the "type" of the count-value - we use class() instead of typeof(),
+      # because the latter sometimes returns unsuitable classes/types. compare
+      # typeof(as.Date("2020-01-01")), which returns "double".
+      count_type <- class(count)[1]
+      valid_columns <- vapply(data, inherits, TRUE, what = count_type)
+      # check if any columns left?
+      if (!any(valid_columns)) {
+        insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint
+      }
+      data <- data[valid_columns]
+    }
+    # coerce - we have only valid columns anyway, and we need to coerce factors
+    # to vectors, else comparison with `==` errors.
+    count <- as.vector(count)
+    # finally, count
+    rowSums(data == count, na.rm = TRUE)
+  }
+}
diff --git a/man/row_count.Rd b/man/row_count.Rd
diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml
@@ -71,6 +71,7 @@ reference:
       - kurtosis
       - smoothness
       - skewness
+      - row_count
       - row_means
       - weighted_mean
       - mean_sd

diff --git a/tests/testthat/test-row_count.R b/tests/testthat/test-row_count.R
@@ -0,0 +1,57 @@
+test_that("row_count", {
+  d_mn <- data.frame(
+    c1 = c(1, 2, NA, 4),
+    c2 = c(NA, 2, NA, 5),
+    c3 = c(NA, 4, NA, NA),
+    c4 = c(2, 3, 7, 8)
+  )
+  expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0))
+  expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1))
+  d_mn <- data.frame(
+    c1 = c("a", "b", NA, "c"),
+    c2 = c(NA, "b", NA, "d"),
+    c3 = c(NA, 4, NA, NA),
+    c4 = c(2, 3, 7, Inf),
+    stringsAsFactors = FALSE
+  )
+  expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0))
+  expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1))
+})
+
+test_that("row_count, errors or messages", {
+  data(iris)
+  expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid")
+  expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns")
+  expect_error(row_count(iris[1], count = 3), regex = "with at least")
+  expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row")
+})
+
+test_that("row_count, allow_coercion match", {
+  d_mn <- data.frame(
+    c1 = c("1", "2", NA, "3"),
+    c2 = c(NA, "2", NA, "3"),
+    c3 = c(NA, 4, NA, NA),
+    c4 = c(2, 3, 7, Inf),
+    stringsAsFactors = FALSE
+  )
+  expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0))
+  expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0))
+  expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0))
+  expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0))
+  expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has")
+
+  # mix character / factor
+  d_mn <- data.frame(
+    c1 = factor(c("1", "2", NA, "3")),
+    c2 = c("2", "1", NA, "3"),
+    c3 = c(NA, 4, NA, NA),
+    c4 = c(2, 3, 7, Inf),
+    stringsAsFactors = FALSE
+  )
+  expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0))
+  expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0))
+  expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0))
+  expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0))
+  expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0))
+  expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0))
+})