Skip to content

Commit

Permalink
Merge branch 'main' into row_sums
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke authored Oct 11, 2024
2 parents 7d7b92d + 5ce207b commit d885b4a
Show file tree
Hide file tree
Showing 11 changed files with 354 additions and 9 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.13.0.3
Version: 0.13.0.7
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ export(reshape_longer)
export(reshape_wider)
export(reverse)
export(reverse_scale)
export(row_count)
export(row_means)
export(row_sums)
export(row_to_colnames)
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# datawizard (development)

BREAKING CHANGES

* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na`
instead.

CHANGES

* The `select` argument, which is available in different functions to select
Expand All @@ -9,6 +14,8 @@ CHANGES
* New function `row_sums()`, to calculate row sums (optionally with minimum
amount of valid values), as complement to `row_means()`.

* New function `row_count()`, to count specific values row-wise.

BUG FIXES

* `describe_distribution()` no longer errors if the sample was too sparse to compute
Expand Down
2 changes: 1 addition & 1 deletion R/data_group.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ data_group <- function(data,
to = my_grid[i, , drop = FALSE],
match = "and",
return_indices = TRUE,
drop_na = FALSE
remove_na = FALSE
))
})
my_grid[[".rows"]] <- .rows
Expand Down
19 changes: 16 additions & 3 deletions R/data_match.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#' @param return_indices Logical, if `FALSE`, return the vector of rows that
#' can be used to filter the original data frame. If `FALSE` (default),
#' returns directly the filtered data frame instead of the row indices.
#' @param drop_na Logical, if `TRUE`, missing values (`NA`s) are removed before
#' @param remove_na Logical, if `TRUE`, missing values (`NA`s) are removed before
#' filtering the data. This is the default behaviour, however, sometimes when
#' row indices are requested (i.e. `return_indices=TRUE`), it might be useful
#' to preserve `NA` values, so returned row indices match the row indices of
Expand All @@ -26,6 +26,7 @@
#' character vector (e.g. `c("x > 4", "y == 2")`) or a variable that contains
#' the string representation of a logical expression. These might be useful
#' when used in packages to avoid defining undefined global variables.
#' @param drop_na Deprecated, please use `remove_na` instead.
#'
#' @return A filtered data frame, or the row indices that match the specified
#' configuration.
Expand Down Expand Up @@ -100,12 +101,24 @@
#' data_filter(mtcars, fl)
#' @inherit data_rename seealso
#' @export
data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = TRUE, ...) {
data_match <- function(x,
to,
match = "and",
return_indices = FALSE,
remove_na = TRUE,
drop_na,

Check warning on line 109 in R/data_match.R

View workflow job for this annotation

GitHub Actions / lint / lint

file=R/data_match.R,line=109,col=24,[function_argument_linter] Arguments without defaults should come before arguments with defaults. Consider setting the default to NULL and using is.null() instead of using missing()
...) {
if (!is.data.frame(to)) {
to <- as.data.frame(to)
}
original_x <- x

## TODO: remove deprecated argument later
if (!missing(drop_na)) {
insight::format_warning("Argument `drop_na` is deprecated. Please use `remove_na` instead.")
remove_na <- drop_na
}

# evaluate
match <- match.arg(tolower(match), c("and", "&", "&&", "or", "|", "||", "!", "not"))
match <- switch(match,
Expand Down Expand Up @@ -133,7 +146,7 @@ data_match <- function(x, to, match = "and", return_indices = FALSE, drop_na = T
idx <- vector("numeric", length = 0L)
} else {
# remove missings before matching
if (isTRUE(drop_na)) {
if (isTRUE(remove_na)) {
x <- x[stats::complete.cases(x), , drop = FALSE]
}
idx <- seq_len(nrow(x))
Expand Down
124 changes: 124 additions & 0 deletions R/row_count.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#' @title Count specific values row-wise
#' @name row_count
#' @description `row_count()` mimics base R's `rowSums()`, with sums for a
#' specific value indicated by `count`. Hence, it is similar to
#' `rowSums(x == count, na.rm = TRUE)`, but offers some more options, including
#' strict comparisons. Comparisons using `==` coerce values to atomic vectors,
#' thus both `2 == 2` and `"2" == 2` are `TRUE`. In `row_count()`, it is also
#' possible to make "type safe" comparisons using the `allow_coercion` argument,
#' where `"2" == 2` is not true.
#'
#' @param data A data frame with at least two columns, where number of specific
#' values are counted row-wise.
#' @param count The value for which the row sum should be computed. May be a
#' numeric value, a character string (for factors or character vectors), `NA` or
#' `Inf`.
#' @param allow_coercion Logical. If `FALSE`, `count` matches only values of same
#' class (i.e. when `count = 2`, the value `"2"` is not counted and vice versa).
#' By default, when `allow_coercion = TRUE`, `count = 2` also matches `"2"`. In
#' order to count factor levels in the data, use `count = factor("level")`. See
#' 'Examples'.
#'
#' @inheritParams extract_column_names
#' @inheritParams row_means
#'
#' @return A vector with row-wise counts of values specified in `count`.
#'
#' @examples
#' dat <- data.frame(
#' c1 = c(1, 2, NA, 4),
#' c2 = c(NA, 2, NA, 5),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, 8)
#' )
#'
#' # count all 4s per row
#' row_count(dat, count = 4)
#' # count all missing values per row
#' row_count(dat, count = NA)
#'
#' dat <- data.frame(
#' c1 = c("1", "2", NA, "3"),
#' c2 = c(NA, "2", NA, "3"),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, Inf)
#' )
#' # count all 2s and "2"s per row
#' row_count(dat, count = 2)
#' # only count 2s, but not "2"s
#' row_count(dat, count = 2, allow_coercion = FALSE)
#'
#' dat <- data.frame(
#' c1 = factor(c("1", "2", NA, "3")),
#' c2 = c("2", "1", NA, "3"),
#' c3 = c(NA, 4, NA, NA),
#' c4 = c(2, 3, 7, Inf)
#' )
#' # find only character "2"s
#' row_count(dat, count = "2", allow_coercion = FALSE)
#' # find only factor level "2"s
#' row_count(dat, count = factor("2"), allow_coercion = FALSE)
#'
#' @export
row_count <- function(data,
select = NULL,
exclude = NULL,
count = NULL,
allow_coercion = TRUE,
ignore_case = FALSE,
regex = FALSE,
verbose = TRUE) {
# evaluate arguments
select <- .select_nse(select,
data,
exclude,
ignore_case = ignore_case,
regex = regex,
verbose = verbose
)

if (is.null(count)) {
insight::format_error("`count` must be a valid value (including `NA` or `Inf`), but not `NULL`.")
}

if (is.null(select) || length(select) == 0) {
insight::format_error("No columns selected.")
}

data <- .coerce_to_dataframe(data[select])

# check if we have a data framme with at least two columns
if (nrow(data) < 1) {
insight::format_error("`data` must be a data frame with at least one row.")
}

# check if we have a data framme with at least two columns
if (ncol(data) < 2) {
insight::format_error("`data` must be a data frame with at least two numeric columns.")
}
# special case: count missing
if (is.na(count)) {
rowSums(is.na(data))
} else {
# comparisons in R using == coerce values into a atomic vector, i.e.
# 2 == "2" is TRUE. If `allow_coercion = FALSE`, we only want 2 == 2 or
# "2" == "2" (i.e. we want exact types to be compared only)
if (isFALSE(allow_coercion)) {
# we need the "type" of the count-value - we use class() instead of typeof(),
# because the latter sometimes returns unsuitable classes/types. compare
# typeof(as.Date("2020-01-01")), which returns "double".
count_type <- class(count)[1]
valid_columns <- vapply(data, inherits, TRUE, what = count_type)
# check if any columns left?
if (!any(valid_columns)) {
insight::format_error("No column has same type as the value provided in `count`. Set `allow_coercion = TRUE` or specify a valid value for `count`.") # nolint
}
data <- data[valid_columns]
}
# coerce - we have only valid columns anyway, and we need to coerce factors
# to vectors, else comparison with `==` errors.
count <- as.vector(count)
# finally, count
rowSums(data == count, na.rm = TRUE)
}
}
14 changes: 12 additions & 2 deletions man/data_match.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

132 changes: 132 additions & 0 deletions man/row_count.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkgdown/_pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ reference:
- kurtosis
- smoothness
- skewness
- row_count
- row_means
- weighted_mean
- mean_sd
Expand Down
Loading

0 comments on commit d885b4a

Please sign in to comment.