Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

data_tabulate() gains a weights argument #479

Merged
merged 9 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.9.1
Version: 0.9.1.2
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down Expand Up @@ -72,7 +72,7 @@ VignetteBuilder:
Encoding: UTF-8
Language: en-US
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3.9000
RoxygenNote: 7.3.1
Config/testthat/edition: 3
Config/testthat/parallel: true
Config/Needs/website:
Expand Down
83 changes: 74 additions & 9 deletions R/data_tabulate.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#' for printing.
#' @param collapse Logical, if `TRUE` collapses multiple tables into one larger
#' table for printing. This affects only printing, not the returned object.
#' @param weights Optional numeric vector of weights. Must be of the same length
#' as `x`. If `weights` is supplied, weighted frequencies are calculated.
#' @param ... not used.
#' @inheritParams find_columns
#'
Expand Down Expand Up @@ -46,6 +48,12 @@
#'
#' # to remove the big mark, use "print(..., big_mark = "")"
#' print(data_tabulate(x), big_mark = "")
#'
#' # weighted frequencies
#' set.seed(123)
#' efc$weights <- abs(rnorm(n = nrow(efc), mean = 1, sd = 0.5))
#' data_tabulate(efc$e42dep, weights = efc$weights)
#'
#' @export
data_tabulate <- function(x, ...) {
UseMethod("data_tabulate")
Expand All @@ -54,7 +62,7 @@ data_tabulate <- function(x, ...) {

#' @rdname data_tabulate
#' @export
data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose = TRUE, ...) {
data_tabulate.default <- function(x, drop_levels = FALSE, weights = NULL, name = NULL, verbose = TRUE, ...) {
# save label attribute, before it gets lost...
var_label <- attr(x, "label", exact = TRUE)

Expand All @@ -70,8 +78,26 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
x <- droplevels(x)
}

# check for correct length of weights - must be equal to "x"
if (!is.null(weights) && length(weights) != length(x)) {
insight::format_error("Length of weights must be equal to length of `x`.")
strengejacke marked this conversation as resolved.
Show resolved Hide resolved
}

# frequency table
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
if (is.null(weights)) {
freq_table <- tryCatch(table(addNA(x)), error = function(e) NULL)
} else {
# weighted frequency table
freq_table <- tryCatch(
stats::xtabs(
weights ~ x,
data = data.frame(weights = weights, x = x),
na.action = stats::na.pass,
addNA = TRUE
),
error = function(e) NULL
)
}

if (is.null(freq_table)) {
insight::format_warning(paste0("Can't compute frequency tables for objects of class `", class(x)[1], "`."))
Expand All @@ -83,6 +109,11 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
replacement = c("Value", "N")
)

# we want to round N for weighted frequencies
if (!is.null(weights)) {
out$N <- round(out$N)
}

out$`Raw %` <- 100 * out$N / sum(out$N)
out$`Valid %` <- c(100 * out$N[-nrow(out)] / sum(out$N[-nrow(out)]), NA)
out$`Cumulative %` <- cumsum(out$`Valid %`)
Expand Down Expand Up @@ -110,6 +141,7 @@ data_tabulate.default <- function(x, drop_levels = FALSE, name = NULL, verbose =
attr(out, "object") <- obj_name
attr(out, "group_variable") <- group_variable
attr(out, "duplicate_varnames") <- duplicated(out$Variable)
attr(out, "weights") <- weights

attr(out, "total_n") <- sum(out$N, na.rm = TRUE)
attr(out, "valid_n") <- sum(out$N[-length(out$N)], na.rm = TRUE)
Expand All @@ -129,6 +161,7 @@ data_tabulate.data.frame <- function(x,
regex = FALSE,
collapse = FALSE,
drop_levels = FALSE,
weights = NULL,
verbose = TRUE,
...) {
# evaluate arguments
Expand All @@ -140,11 +173,12 @@ data_tabulate.data.frame <- function(x,
verbose = verbose
)
out <- lapply(select, function(i) {
data_tabulate(x[[i]], drop_levels = drop_levels, name = i, verbose = verbose, ...)
data_tabulate(x[[i]], drop_levels = drop_levels, weights = weights, name = i, verbose = verbose, ...)
})

class(out) <- c("dw_data_tabulates", "list")
attr(out, "collapse") <- isTRUE(collapse)
attr(out, "is_weighted") <- !is.null(weights)

out
}
Expand All @@ -159,6 +193,7 @@ data_tabulate.grouped_df <- function(x,
verbose = TRUE,
collapse = FALSE,
drop_levels = FALSE,
weights = NULL,
...) {
# works only for dplyr >= 0.8.0
grps <- attr(x, "groups", exact = TRUE)
Expand Down Expand Up @@ -191,12 +226,14 @@ data_tabulate.grouped_df <- function(x,
ignore_case = ignore_case,
verbose = verbose,
drop_levels = drop_levels,
weights = weights,
group_variable = group_variable,
...
))
}
class(out) <- c("dw_data_tabulates", "list")
attr(out, "collapse") <- isTRUE(collapse)
attr(out, "is_weighted") <- !is.null(weights)

out
}
Expand Down Expand Up @@ -270,7 +307,12 @@ print.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
a$valid_n <- .add_commas_in_numbers(a$valid_n, big_mark)

# summary of total and valid N (we may add mean/sd as well?)
summary_line <- sprintf("# total N=%s valid N=%s\n\n", a$total_n, a$valid_n)
summary_line <- sprintf(
"# total N=%s valid N=%s%s\n\n",
a$total_n,
a$valid_n,
ifelse(is.null(a$weights), "", " (weighted)")
)
cat(insight::print_color(summary_line, "blue"))

# remove information that goes into the header/footer
Expand All @@ -295,7 +337,12 @@ print_html.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
caption <- .table_header(x, "html")

# summary of total and valid N (we may add mean/sd as well?)
footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n)
footer <- sprintf(
"total N=%i valid N=%i%s",
a$total_n,
a$valid_n,
ifelse(is.null(a$weights), "", " (weighted)")
)

# remove information that goes into the header/footer
x$Variable <- NULL
Expand All @@ -320,7 +367,12 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) {
caption <- .table_header(x, "markdown")

# summary of total and valid N (we may add mean/sd as well?)
footer <- sprintf("total N=%i valid N=%i\n\n", a$total_n, a$valid_n)
footer <- sprintf(
"total N=%i valid N=%i%s\n\n",
a$total_n,
a$valid_n,
ifelse(is.null(a$weights), "", " (weighted)")
)

# remove information that goes into the header/footer
x$Variable <- NULL
Expand All @@ -339,6 +391,9 @@ print_md.dw_data_tabulate <- function(x, big_mark = NULL, ...) {

#' @export
print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
# check if we have weights
is_weighted <- isTRUE(attributes(x)$is_weighted)

a <- attributes(x)
if (!isTRUE(a$collapse) || length(x) == 1) {
for (i in seq_along(x)) {
Expand All @@ -356,7 +411,11 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
})

out <- do.call(rbind, x)
cat(insight::print_color("# Frequency Table\n\n", "blue"))
if (is_weighted) {
cat(insight::print_color("# Frequency Table (weighted)\n\n", "blue"))
} else {
cat(insight::print_color("# Frequency Table\n\n", "blue"))
}

# print table
cat(insight::export_table(
Expand All @@ -371,6 +430,9 @@ print.dw_data_tabulates <- function(x, big_mark = NULL, ...) {

#' @export
print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
# check if we have weights
is_weighted <- isTRUE(attributes(x)$is_weighted)

if (length(x) == 1) {
print_html(x[[1]], big_mark = big_mark, ...)
} else {
Expand All @@ -387,7 +449,7 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
insight::export_table(
out,
missing = "<NA>",
caption = "Frequency Table",
caption = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table"),
format = "html",
group_by = "Group"
)
Expand All @@ -397,6 +459,9 @@ print_html.dw_data_tabulates <- function(x, big_mark = NULL, ...) {

#' @export
print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
# check if we have weights
is_weighted <- isTRUE(attributes(x)$is_weighted)

if (length(x) == 1) {
print_md(x[[1]], big_mark = big_mark, ...)
} else {
Expand All @@ -417,7 +482,7 @@ print_md.dw_data_tabulates <- function(x, big_mark = NULL, ...) {
missing = "(NA)",
empty_line = "-",
format = "markdown",
title = "Frequency Table"
title = ifelse(is_weighted, "Frequency Table (weighted)", "Frequency Table")
)
}
}
Expand Down
18 changes: 17 additions & 1 deletion man/data_tabulate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion man/datawizard-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

79 changes: 79 additions & 0 deletions tests/testthat/_snaps/data_tabulate.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,82 @@
# data_tabulate, weights

Code
print(data_tabulate(efc$e42dep, weights = efc$weights))
Output
elder's dependency (efc$e42dep) <categorical>
# total N=105 valid N=100 (weighted)

Value | N | Raw % | Valid % | Cumulative %
------+----+-------+---------+-------------
1 | 3 | 2.86 | 3.00 | 3.00
2 | 4 | 3.81 | 4.00 | 7.00
3 | 26 | 24.76 | 26.00 | 33.00
4 | 67 | 63.81 | 67.00 | 100.00
<NA> | 5 | 4.76 | <NA> | <NA>

---

Code
print_md(data_tabulate(efc$e42dep, weights = efc$weights))
Output
[1] "Table: elder's dependency (efc$e42dep) (categorical)"
[2] ""
[3] "|Value | N| Raw %| Valid %| Cumulative %|"
[4] "|:-----|--:|-----:|-------:|------------:|"
[5] "|1 | 3| 2.86| 3.00| 3.00|"
[6] "|2 | 4| 3.81| 4.00| 7.00|"
[7] "|3 | 26| 24.76| 26.00| 33.00|"
[8] "|4 | 67| 63.81| 67.00| 100.00|"
[9] "|(NA) | 5| 4.76| (NA)| (NA)|"
[10] "total N=105 valid N=100 (weighted)\n\n"
attr(,"format")
[1] "pipe"
attr(,"class")
[1] "knitr_kable" "character"

---

Code
print(data_tabulate(efc, c("e42dep", "e16sex"), collapse = TRUE, weights = efc$
weights))
Output
# Frequency Table (weighted)

Variable | Value | N | Raw % | Valid % | Cumulative %
---------+-------+----+-------+---------+-------------
e42dep | 1 | 3 | 2.86 | 3.00 | 3.00
| 2 | 4 | 3.81 | 4.00 | 7.00
| 3 | 26 | 24.76 | 26.00 | 33.00
| 4 | 67 | 63.81 | 67.00 | 100.00
| <NA> | 5 | 4.76 | <NA> | <NA>
---------+-------+----+-------+---------+-------------
e16sex | 1 | 50 | 47.62 | 100.00 | 100.00
| 2 | 55 | 52.38 | <NA> | <NA>
------------------------------------------------------

---

Code
print_md(data_tabulate(efc, c("e42dep", "e16sex"), weights = efc$weights))
Output
[1] "Table: Frequency Table (weighted)"
[2] ""
[3] "|Variable | Value| N| Raw %| Valid %| Cumulative %|"
[4] "|:--------|-----:|--:|-----:|-------:|------------:|"
[5] "|e42dep | 1| 3| 2.86| 3.00| 3.00|"
[6] "| | 2| 4| 3.81| 4.00| 7.00|"
[7] "| | 3| 26| 24.76| 26.00| 33.00|"
[8] "| | 4| 67| 63.81| 67.00| 100.00|"
[9] "| | (NA)| 5| 4.76| (NA)| (NA)|"
[10] "| | | | | | |"
[11] "|e16sex | 1| 50| 47.62| 100.00| 100.00|"
[12] "| | 2| 55| 52.38| (NA)| (NA)|"
[13] "| | | | | | |"
attr(,"format")
[1] "pipe"
attr(,"class")
[1] "knitr_kable" "character"

# data_tabulate print

Code
Expand Down
Loading
Loading