Merge branch 'main' into row_sums

easystats · Oct 11, 2024 · d885b4a · d885b4a
2 parents 7d7b92d + 5ce207b
commit d885b4a
Show file tree

Hide file tree

Showing 11 changed files with 354 additions and 9 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.3
+Version: 0.13.0.7
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -296,6 +296,7 @@ export(reshape_longer)
 export(reshape_wider)
 export(reverse)
 export(reverse_scale)
+export(row_count)
 export(row_means)
 export(row_sums)
 export(row_to_colnames)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # datawizard (development)
 
+BREAKING CHANGES
+
+* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na`
+  instead.
+
 CHANGES
 
 * The `select` argument, which is available in different functions to select
@@ -9,6 +14,8 @@ CHANGES
 * New function `row_sums()`, to calculate row sums (optionally with minimum
   amount of valid values), as complement to `row_means()`.
 
+* New function `row_count()`, to count specific values row-wise.
+
 BUG FIXES
 
 * `describe_distribution()` no longer errors if the sample was too sparse to compute

diff --git a/R/data_group.R b/R/data_group.R
@@ -51,7 +51,7 @@ data_group <- function(data,
       to = my_grid[i, , drop = FALSE],
       match = "and",
       return_indices = TRUE,
-      drop_na = FALSE
+      remove_na = FALSE
     ))
   })
   my_grid[[".rows"]] <- .rows

diff --git a/R/data_match.R b/R/data_match.R
@@ -15,7 +15,7 @@
 #' @param return_indices Logical, if `FALSE`, return the vector of rows that
 #'   can be used to filter the original data frame. If `FALSE` (default),
 #'   returns directly the filtered data frame instead of the row indices.
-#' @param drop_na Logical, if `TRUE`, missing values (`NA`s) are removed before
+#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before
 #'   filtering the data. This is the default behaviour, however, sometimes when
 #'   row indices are requested (i.e. `return_indices=TRUE`), it might be useful
 #'   to preserve `NA` values, so returned row indices match the row indices of
@@ -26,6 +26,7 @@
 #'   character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains
 #'   the string representation of a logical expression. These might be useful
 #'   when used in packages to avoid defining undefined global variables.
+#' @param drop_na Deprecated, please use `remove_na` instead.
 #'
 #' @return A filtered data frame, or the row indices that match the specified
 #' configuration.
@@ -100,12 +101,24 @@
 #' data_filter(mtcars, fl)
 #' @inherit data_rename seealso
 #' @export
-data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) {
+data_match <- function(x,
+                       to,
+                       match = "and",
+                       return_indices = FALSE,
+                       remove_na = TRUE,
+                       drop_na,
+                       ...) {
   if (!is.data.frame(to)) {
     to <- as.data.frame(to)
   }
   original_x <- x
 
+  ## TODO: remove deprecated argument later
+  if (!missing(drop_na)) {
+    insight::format_warning("Argument `drop_na` is deprecated. Please use `remove_na` instead.")
+    remove_na <- drop_na
+  }
+
   # evaluate
   match <- match.arg(tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not"))
   match <- switch(match,
@@ -133,7 +146,7 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T
     idx <- vector("numeric", length = 0L)
   } else {
     # remove missings before matching
-    if (isTRUE(drop_na)) {
+    if (isTRUE(remove_na)) {
       x <- x[stats::complete.cases(x), , drop = FALSE]
     }
     idx <- seq_len(nrow(x))

diff --git a/R/row_count.R b/R/row_count.R
@@ -0,0 +1,124 @@
+#' @title Count specific values row-wise
+#' @name row_count
+#' @description `row_count()` mimics base R's `rowSums()`, with sums for a
+#' specific value indicated by `count`. Hence, it is similar to
+#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including
+#' strict comparisons. Comparisons using `==` coerce values to atomic vectors,
+#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also
+#' possible to make "type safe" comparisons using the `allow_coercion` argument,
+#' where `"2" == 2` is not true.
+#'
+#' @param data A data frame with at least two columns, where number of specific
+#' values are counted row-wise.
+#' @param count The value for which the row sum should be computed. May be a
+#' numeric value, a character string (for factors or character vectors), `NA` or
+#' `Inf`.
+#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same
+#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa).
+#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In
+#' order to count factor levels in the data, use `count = factor("level")`. See
+#' 'Examples'.
+#'
+#' @inheritParams extract_column_names
+#' @inheritParams row_means
+#'
+#' @return A vector with row-wise counts of values specified in `count`.
+#'
+#' @examples
+#' dat <- data.frame(
+#'   c1 = c(1, 2, NA, 4),
+#'   c2 = c(NA, 2, NA, 5),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, 8)
+#' )
+#'
+#' # count all 4s per row
+#' row_count(dat, count = 4)
+#' # count all missing values per row
+#' row_count(dat, count = NA)
+#'
+#' dat <- data.frame(
+#'   c1 = c("1", "2", NA, "3"),
+#'   c2 = c(NA, "2", NA, "3"),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, Inf)
+#' )
+#' # count all 2s and "2"s per row
+#' row_count(dat, count = 2)
+#' # only count 2s, but not "2"s
+#' row_count(dat, count = 2, allow_coercion = FALSE)
+#'
+#' dat <- data.frame(
+#'   c1 = factor(c("1", "2", NA, "3")),
+#'   c2 = c("2", "1", NA, "3"),
+#'   c3 = c(NA, 4, NA, NA),
+#'   c4 = c(2, 3, 7, Inf)
+#' )
+#' # find only character "2"s
+#' row_count(dat, count = "2", allow_coercion = FALSE)
+#' # find only factor level "2"s
+#' row_count(dat, count = factor("2"), allow_coercion = FALSE)
+#'
+#' @export
+row_count <- function(data,
+                      select = NULL,
+                      exclude = NULL,
+                      count = NULL,
+                      allow_coercion = TRUE,
+                      ignore_case = FALSE,
+                      regex = FALSE,
+                      verbose = TRUE) {
+  # evaluate arguments
+  select <- .select_nse(select,
+    data,
+    exclude,
+    ignore_case = ignore_case,
+    regex = regex,
+    verbose = verbose
+  )
+
+  if (is.null(count)) {
+    insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.")
+  }
+
+  if (is.null(select) || length(select) == 0) {
+    insight::format_error("No columns selected.")
+  }
+
+  data <- .coerce_to_dataframe(data[select])
+
+  # check if we have a data framme with at least two columns
+  if (nrow(data) < 1) {
+    insight::format_error("`data` must be a data frame with at least one row.")
+  }
+
+  # check if we have a data framme with at least two columns
+  if (ncol(data) < 2) {
+    insight::format_error("`data` must be a data frame with at least two numeric columns.")
+  }
+  # special case: count missing
+  if (is.na(count)) {
+    rowSums(is.na(data))
+  } else {
+    # comparisons in R using == coerce values into a atomic vector, i.e.
+    # 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or
+    # "2" == "2" (i.e. we want exact types to be compared only)
+    if (isFALSE(allow_coercion)) {
+      # we need the "type" of the count-value - we use class() instead of typeof(),
+      # because the latter sometimes returns unsuitable classes/types. compare
+      # typeof(as.Date("2020-01-01")), which returns "double".
+      count_type <- class(count)[1]
+      valid_columns <- vapply(data, inherits, TRUE, what = count_type)
+      # check if any columns left?
+      if (!any(valid_columns)) {
+        insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint
+      }
+      data <- data[valid_columns]
+    }
+    # coerce - we have only valid columns anyway, and we need to coerce factors
+    # to vectors, else comparison with `==` errors.
+    count <- as.vector(count)
+    # finally, count
+    rowSums(data == count, na.rm = TRUE)
+  }
+}
diff --git a/man/data_match.Rd b/man/data_match.Rd
diff --git a/man/row_count.Rd b/man/row_count.Rd
diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml
@@ -71,6 +71,7 @@ reference:
       - kurtosis
       - smoothness
       - skewness
+      - row_count
       - row_means
       - weighted_mean
       - mean_sd