-
-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add
row_count()
to count specific values row-wise (#553)
* Draft `row_count()` * docs, type safe comparisons * lintr * apply suggestions * add test * fix test * rename arg * switch TRUE and FALSE * update docs * resolve comment * comments * typo
- Loading branch information
1 parent
8ab2454
commit 5ce207b
Showing
7 changed files
with
318 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
Type: Package | ||
Package: datawizard | ||
Title: Easy Data Wrangling and Statistical Transformations | ||
Version: 0.13.0.5 | ||
Version: 0.13.0.6 | ||
Authors@R: c( | ||
person("Indrajeet", "Patil", , "[email protected]", role = "aut", | ||
comment = c(ORCID = "0000-0003-1995-6531")), | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#' @title Count specific values row-wise | ||
#' @name row_count | ||
#' @description `row_count()` mimics base R's `rowSums()`, with sums for a | ||
#' specific value indicated by `count`. Hence, it is similar to | ||
#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including | ||
#' strict comparisons. Comparisons using `==` coerce values to atomic vectors, | ||
#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also | ||
#' possible to make "type safe" comparisons using the `allow_coercion` argument, | ||
#' where `"2" == 2` is not true. | ||
#' | ||
#' @param data A data frame with at least two columns, where number of specific | ||
#' values are counted row-wise. | ||
#' @param count The value for which the row sum should be computed. May be a | ||
#' numeric value, a character string (for factors or character vectors), `NA` or | ||
#' `Inf`. | ||
#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same | ||
#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa). | ||
#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In | ||
#' order to count factor levels in the data, use `count = factor("level")`. See | ||
#' 'Examples'. | ||
#' | ||
#' @inheritParams extract_column_names | ||
#' @inheritParams row_means | ||
#' | ||
#' @return A vector with row-wise counts of values specified in `count`. | ||
#' | ||
#' @examples | ||
#' dat <- data.frame( | ||
#' c1 = c(1, 2, NA, 4), | ||
#' c2 = c(NA, 2, NA, 5), | ||
#' c3 = c(NA, 4, NA, NA), | ||
#' c4 = c(2, 3, 7, 8) | ||
#' ) | ||
#' | ||
#' # count all 4s per row | ||
#' row_count(dat, count = 4) | ||
#' # count all missing values per row | ||
#' row_count(dat, count = NA) | ||
#' | ||
#' dat <- data.frame( | ||
#' c1 = c("1", "2", NA, "3"), | ||
#' c2 = c(NA, "2", NA, "3"), | ||
#' c3 = c(NA, 4, NA, NA), | ||
#' c4 = c(2, 3, 7, Inf) | ||
#' ) | ||
#' # count all 2s and "2"s per row | ||
#' row_count(dat, count = 2) | ||
#' # only count 2s, but not "2"s | ||
#' row_count(dat, count = 2, allow_coercion = FALSE) | ||
#' | ||
#' dat <- data.frame( | ||
#' c1 = factor(c("1", "2", NA, "3")), | ||
#' c2 = c("2", "1", NA, "3"), | ||
#' c3 = c(NA, 4, NA, NA), | ||
#' c4 = c(2, 3, 7, Inf) | ||
#' ) | ||
#' # find only character "2"s | ||
#' row_count(dat, count = "2", allow_coercion = FALSE) | ||
#' # find only factor level "2"s | ||
#' row_count(dat, count = factor("2"), allow_coercion = FALSE) | ||
#' | ||
#' @export | ||
row_count <- function(data, | ||
select = NULL, | ||
exclude = NULL, | ||
count = NULL, | ||
allow_coercion = TRUE, | ||
ignore_case = FALSE, | ||
regex = FALSE, | ||
verbose = TRUE) { | ||
# evaluate arguments | ||
select <- .select_nse(select, | ||
data, | ||
exclude, | ||
ignore_case = ignore_case, | ||
regex = regex, | ||
verbose = verbose | ||
) | ||
|
||
if (is.null(count)) { | ||
insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.") | ||
} | ||
|
||
if (is.null(select) || length(select) == 0) { | ||
insight::format_error("No columns selected.") | ||
} | ||
|
||
data <- .coerce_to_dataframe(data[select]) | ||
|
||
# check if we have a data framme with at least two columns | ||
if (nrow(data) < 1) { | ||
insight::format_error("`data` must be a data frame with at least one row.") | ||
} | ||
|
||
# check if we have a data framme with at least two columns | ||
if (ncol(data) < 2) { | ||
insight::format_error("`data` must be a data frame with at least two numeric columns.") | ||
} | ||
# special case: count missing | ||
if (is.na(count)) { | ||
rowSums(is.na(data)) | ||
} else { | ||
# comparisons in R using == coerce values into a atomic vector, i.e. | ||
# 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or | ||
# "2" == "2" (i.e. we want exact types to be compared only) | ||
if (isFALSE(allow_coercion)) { | ||
# we need the "type" of the count-value - we use class() instead of typeof(), | ||
# because the latter sometimes returns unsuitable classes/types. compare | ||
# typeof(as.Date("2020-01-01")), which returns "double". | ||
count_type <- class(count)[1] | ||
valid_columns <- vapply(data, inherits, TRUE, what = count_type) | ||
# check if any columns left? | ||
if (!any(valid_columns)) { | ||
insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint | ||
} | ||
data <- data[valid_columns] | ||
} | ||
# coerce - we have only valid columns anyway, and we need to coerce factors | ||
# to vectors, else comparison with `==` errors. | ||
count <- as.vector(count) | ||
# finally, count | ||
rowSums(data == count, na.rm = TRUE) | ||
} | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,7 @@ reference: | |
- kurtosis | ||
- smoothness | ||
- skewness | ||
- row_count | ||
- row_means | ||
- weighted_mean | ||
- mean_sd | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
test_that("row_count", { | ||
d_mn <- data.frame( | ||
c1 = c(1, 2, NA, 4), | ||
c2 = c(NA, 2, NA, 5), | ||
c3 = c(NA, 4, NA, NA), | ||
c4 = c(2, 3, 7, 8) | ||
) | ||
expect_identical(row_count(d_mn, count = 2), c(1, 2, 0, 0)) | ||
expect_identical(row_count(d_mn, count = NA), c(2, 0, 3, 1)) | ||
d_mn <- data.frame( | ||
c1 = c("a", "b", NA, "c"), | ||
c2 = c(NA, "b", NA, "d"), | ||
c3 = c(NA, 4, NA, NA), | ||
c4 = c(2, 3, 7, Inf), | ||
stringsAsFactors = FALSE | ||
) | ||
expect_identical(row_count(d_mn, count = "b"), c(0, 2, 0, 0)) | ||
expect_identical(row_count(d_mn, count = Inf), c(0, 0, 0, 1)) | ||
}) | ||
|
||
test_that("row_count, errors or messages", { | ||
data(iris) | ||
expect_error(expect_warning(row_count(iris, select = "abc")), regex = "must be a valid") | ||
expect_error(expect_warning(row_count(iris, select = "abc", count = 3)), regex = "No columns") | ||
expect_error(row_count(iris[1], count = 3), regex = "with at least") | ||
expect_error(row_count(iris[-seq_len(nrow(iris)), , drop = FALSE], count = 2), regex = "one row") | ||
}) | ||
|
||
test_that("row_count, allow_coercion match", { | ||
d_mn <- data.frame( | ||
c1 = c("1", "2", NA, "3"), | ||
c2 = c(NA, "2", NA, "3"), | ||
c3 = c(NA, 4, NA, NA), | ||
c4 = c(2, 3, 7, Inf), | ||
stringsAsFactors = FALSE | ||
) | ||
expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(1, 2, 0, 0)) | ||
expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) | ||
expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(0, 2, 0, 0)) | ||
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(1, 2, 0, 0)) | ||
expect_error(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), regex = "No column has") | ||
|
||
# mix character / factor | ||
d_mn <- data.frame( | ||
c1 = factor(c("1", "2", NA, "3")), | ||
c2 = c("2", "1", NA, "3"), | ||
c3 = c(NA, 4, NA, NA), | ||
c4 = c(2, 3, 7, Inf), | ||
stringsAsFactors = FALSE | ||
) | ||
expect_identical(row_count(d_mn, count = 2, allow_coercion = TRUE), c(2, 1, 0, 0)) | ||
expect_identical(row_count(d_mn, count = 2, allow_coercion = FALSE), c(1, 0, 0, 0)) | ||
expect_identical(row_count(d_mn, count = "2", allow_coercion = FALSE), c(1, 0, 0, 0)) | ||
expect_identical(row_count(d_mn, count = "2", allow_coercion = TRUE), c(2, 1, 0, 0)) | ||
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = FALSE), c(0, 1, 0, 0)) | ||
expect_identical(row_count(d_mn, count = factor("2"), allow_coercion = TRUE), c(2, 1, 0, 0)) | ||
}) |