Skip to content

Commit

Permalink
Add row_count() to count specific values row-wise (#553)
Browse files Browse the repository at this point in the history
* Draft `row_count()`

* docs, type safe comparisons

* lintr

* apply suggestions

* add test

* fix test

* rename arg

* switch TRUE and FALSE

* update docs

* resolve comment

* comments

* typo
  • Loading branch information
strengejacke authored Oct 11, 2024
1 parent 8ab2454 commit 5ce207b
Show file tree
Hide file tree
Showing 7 changed files with 318 additions and 1 deletion.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.13.0.5
Version: 0.13.0.6
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ export(reshape_longer)
export(reshape_wider)
export(reverse)
export(reverse_scale)
export(row_count)
export(row_means)
export(row_to_colnames)
export(rowid_as_column)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ CHANGES
variables, can now also be a character vector with quoted variable names,
including a colon to indicate a range of several variables (e.g. `"cyl:gear"`).

* New function `row_count()`, to count specific values row-wise.

BUG FIXES

* `describe_distribution()` no longer errors if the sample was too sparse to compute
Expand Down
124 changes: 124 additions & 0 deletions R/row_count.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#' @title Count specific values row-wise
#' @name row_count
#' @description `row_count()` mimics base R's `rowSums()`, with sums for a
#' specific value indicated by `count`. Hence, it is similar to
#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including
#' strict comparisons. Comparisons using `==` coerce values to atomic vectors,
#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also
#' possible to make "type safe" comparisons using the `allow_coercion` argument,
#' where `"2" == 2` is not true.
#'
#' @param data A data frame with at least two columns, where number of specific
#' values are counted row-wise.
#' @param count The value for which the row sum should be computed. May be a
#' numeric value, a character string (for factors or character vectors), `NA` or
#' `Inf`.
#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same
#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa).
#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In
#' order to count factor levels in the data, use `count = factor("level")`. See
#' 'Examples'.
#'
#' @inheritParams extract_column_names
#' @inheritParams row_means
#'
#' @return A vector with row-wise counts of values specified in `count`.
#'
#' @examples
#' dat <- data.frame(
#' c1 = c(1, 2, NA, 4),
#' c2 = c(NA, 2, NA, 5),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, 8)
#' )
#'
#' # count all 4s per row
#' row_count(dat, count = 4)
#' # count all missing values per row
#' row_count(dat, count = NA)
#'
#' dat <- data.frame(
#' c1 = c("1", "2", NA, "3"),
#' c2 = c(NA, "2", NA, "3"),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, Inf)
#' )
#' # count all 2s and "2"s per row
#' row_count(dat, count = 2)
#' # only count 2s, but not "2"s
#' row_count(dat, count = 2, allow_coercion = FALSE)
#'
#' dat <- data.frame(
#' c1 = factor(c("1", "2", NA, "3")),
#' c2 = c("2", "1", NA, "3"),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, Inf)
#' )
#' # find only character "2"s
#' row_count(dat, count = "2", allow_coercion = FALSE)
#' # find only factor level "2"s
#' row_count(dat, count = factor("2"), allow_coercion = FALSE)
#'
#' @export
row_count <- function(data,
select = NULL,
exclude = NULL,
count = NULL,
allow_coercion = TRUE,
ignore_case = FALSE,
regex = FALSE,
verbose = TRUE) {
# evaluate arguments
select <- .select_nse(select,
data,
exclude,
ignore_case = ignore_case,
regex = regex,
verbose = verbose
)

if (is.null(count)) {
insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.")
}

if (is.null(select) || length(select) == 0) {
insight::format_error("No columns selected.")
}

data <- .coerce_to_dataframe(data[select])

# check if we have a data framme with at least two columns
if (nrow(data) < 1) {
insight::format_error("`data` must be a data frame with at least one row.")
}

# check if we have a data framme with at least two columns
if (ncol(data) < 2) {
insight::format_error("`data` must be a data frame with at least two numeric columns.")
}
# special case: count missing
if (is.na(count)) {
rowSums(is.na(data))
} else {
# comparisons in R using == coerce values into a atomic vector, i.e.
# 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or
# "2" == "2" (i.e. we want exact types to be compared only)
if (isFALSE(allow_coercion)) {
# we need the "type" of the count-value - we use class() instead of typeof(),
# because the latter sometimes returns unsuitable classes/types. compare
# typeof(as.Date("2020-01-01")), which returns "double".
count_type <- class(count)[1]
valid_columns <- vapply(data, inherits, TRUE, what = count_type)
# check if any columns left?
if (!any(valid_columns)) {
insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint
}
data <- data[valid_columns]
}
# coerce - we have only valid columns anyway, and we need to coerce factors
# to vectors, else comparison with `==` errors.
count <- as.vector(count)
# finally, count
rowSums(data == count, na.rm = TRUE)
}
}
132 changes: 132 additions & 0 deletions man/row_count.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkgdown/_pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ reference:
- kurtosis
- smoothness
- skewness
- row_count
- row_means
- weighted_mean
- mean_sd
Expand Down
57 changes: 57 additions & 0 deletions tests/testthat/test-row_count.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
test_that("row_count", {
d_mn <- data.frame(
c1 = c(1, 2, NA, 4),
c2 = c(NA, 2, NA, 5),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, 8)
)
expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0))
expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1))
d_mn <- data.frame(
c1 = c("a", "b", NA, "c"),
c2 = c(NA, "b", NA, "d"),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, Inf),
stringsAsFactors = FALSE
)
expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0))
expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1))
})

test_that("row_count, errors or messages", {
data(iris)
expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid")
expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns")
expect_error(row_count(iris[1], count = 3), regex = "with at least")
expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row")
})

test_that("row_count, allow_coercion match", {
d_mn <- data.frame(
c1 = c("1", "2", NA, "3"),
c2 = c(NA, "2", NA, "3"),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, Inf),
stringsAsFactors = FALSE
)
expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0))
expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0))
expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0))
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0))
expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has")

# mix character / factor
d_mn <- data.frame(
c1 = factor(c("1", "2", NA, "3")),
c2 = c("2", "1", NA, "3"),
c3 = c(NA, 4, NA, NA),
c4 = c(2, 3, 7, Inf),
stringsAsFactors = FALSE
)
expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0))
expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0))
expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0))
expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0))
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0))
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0))
})

0 comments on commit 5ce207b

Please sign in to comment.