Skip to content

Commit

Permalink
Add age_lbls, stdev_brks functions
Browse files Browse the repository at this point in the history
  • Loading branch information
camille-s committed Oct 20, 2023
1 parent 1892851 commit 6f7e1b7
Show file tree
Hide file tree
Showing 12 changed files with 242 additions and 31 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ Imports:
stringr,
sysfonts,
tidyr,
xfun
xfun,
tidyselect
Suggests:
roxygen2,
forcats,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ S3method(plot,palx)
export("%>%")
export(":=")
export(.data)
export(age_lbls)
export(as_label)
export(as_name)
export(dodge_lbls)
Expand All @@ -24,6 +25,7 @@ export(round_sum100)
export(scaffold_project)
export(scale_y_barcontinuous)
export(stack_lbls)
export(stdev_brks)
export(sym)
export(syms)
export(title_case)
Expand Down
18 changes: 18 additions & 0 deletions R/age_lbls.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' @title Clean up age group labels
#' @description This function does some tedious regex replacement to make common labels we use for age groups (i.e. syntactically correct variable names) to labels meant for display in charts and tables. The main features are replacing underscores between numbers with dashes, adding space between text and numbers, replacing symbols, and parsing numbers. So "ages00_17" becomes "Ages 0-17", and "ages65plus" becomes "Ages 65+". Of course, you could also use this for other numeric ranges, like years.
#' @param x A string vector of labels to clean up
#' @return A string vector of display-worthy labels
#' @examples
#' a <- c("ages00_17", "ages18_64", "ages65plus", "under18", "ages18up")
#' age_lbls(a)
#' @export
age_lbls <- function(x) {
x <- stringr::str_replace(x, "(?<=\\d)_(?=\\d)", "-")
x <- stringr::str_replace_all(x, "_", " ")
x <- stringr::str_replace(x, "(plus|up)$", "+")
x <- stringr::str_replace_all(x, "(?<=[A-Za-z])\\B(?=\\d)", " ")
x <- stringr::str_replace_all(x, "(?<=\\d)\\B(?=[A-Za-z])", " ")
x <- stringr::str_remove(x, "(?<=\\b)0(?=[0-9])")
x <- stringr::str_to_sentence(x)
x
}
1 change: 0 additions & 1 deletion R/data.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#' @concept dataset
#' Life expectancy
#'
#' Dataset of life expectancy in years by census tract within New Haven County for 2010 to 2015.
Expand Down
97 changes: 97 additions & 0 deletions R/stdev_brks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#' @title Create labeled intervals based on standard deviations
#' @description Occasionally we make charts using standard deviations away from an average value to fill bars or geographies---not a hard task, but tedious. This function takes a data frame, then gets a midpoint value, either by calculating the mean or by filtering for an observation already in the data frame (such as a statewide value). It then calculates z-scores based on this midpoint and standard deviation, then cuts z-scores based on `brks`. Pay close attention to the argument `by`, which allows you to do these calculations grouped by some column; this is useful if you have a data frame of several different indicators. Alternatively, passing a grouped data frame will also do the calculations by group.
#' @param x A data frame or tibble
#' @param value Bare name of the numeric value column, Default: value
#' @param filters An optional named list of values to use for filtering. If given, the observation matching these values will be used as the midpoint. If NULL (the default), the midpoint will be calculated as the mean of values, grouped by `x`'s grouping columns (if any) and the arguments to `by` (also if any).
#' @param by Optional character vector. If given, this will be used as the group within which intervals are calculated. Default: NULL
#' @param brks Numeric vector of break points for cutting z-scores. This vector, plus `-Inf` and `Inf`, will be passed to `base::cut`'s `breaks` argument. Default: c(-2, -1/2, 1/2, 2)
#' @param labels Character vector of labels for the resulting factor. If NULL, levels will be in `base::cut`'s interval notation. The length of this vector should be one more than the length of `brks`. Default: NULL
#' @param na.rm Boolean passed on to `mean` if midpoints are being calculated. Default: TRUE
#' @param keep_calcs Boolean, whether to keep columns from calculations. Default: TRUE
#' @param ... Additional arguments passed to `base::cut`
#' @return A data frame or tibble with the same number of rows as `x`. If `keep_calcs` is true, the returned data frame will have numeric columns added for midpoint (`midpt`), standard deviation (`sd`), and z-score (`z`), and a factor column for the resulting intervals (`brk`). If false, the only column added will be the intervals.
#' @examples
#' # Calculate intervals along the full dataset, based on calculated mean
#' stdev_brks(life_exp,
#' labels = c("Lower", "Somewhat lower", "Average", "Somewhat higher", "Higher"))
#'
#' # Calculate intervals for each of the three indicators in the `question` column.
#' # Both examples have the same result:
#' fin_insecurity |>
#' stdev_brks(filters = list(category = "Connecticut"), by = "question")
#'
#' fin_insecurity |>
#' dplyr::group_by(question) |>
#' stdev_brks(filters = list(category = "Connecticut"))
#' @export
#' @seealso [base::cut()]
stdev_brks <- function(x,
value = value,
filters = NULL,
by = NULL,
brks = c(-2, -1/2, 1/2, 2),
labels = NULL,
na.rm = TRUE,
keep_calcs = TRUE,
...) {
# type checks
if (!is.null(labels)) {
if (!(length(labels) == (length(brks) + 1))) {
cli::cli_abort("If supplying labels, {.arg labels} must have a length of 1 longer than that of {.arg brks}.")
}
}

grp_vars <- dplyr::group_vars(x)
by <- c(grp_vars, by)
x_out <- dplyr::ungroup(x)

if (!is.null(filters)) {
if (!inherits(filters, "list") || is.null(names(filters))) {
cli::cli_abort("{.arg filters} must be a named list, or {.val NULL} in order to skip filtering.")
}
filter_df <- as.data.frame(filters)
# warning--only use first row
if (nrow(filter_df) > 1) {
filter_df <- dplyr::slice(filter_df, 1)
filter_lbl <- purrr::imap_chr(filter_df, function(df, id) paste(id, df, sep = " = "))
cli::cli_warn(c("Your filters have too many values. Only the first observation will be used as the midpoint.",
"i" = "Filtering for {filter_lbl}"))
}

midpt_df <- dplyr::semi_join(x_out, filter_df, by = names(filters))
midpt_df <- dplyr::select(midpt_df, tidyselect::all_of(by), midpt = {{ value }})
# remove midpoint for getting sd
sd_df <- dplyr::anti_join(x_out, filter_df, by = names(filters))
} else {
midpt_df <- dplyr::group_by(x_out, dplyr::across(tidyselect::all_of(by)))
midpt_df <- dplyr::summarise(midpt_df, midpt = mean({{ value }}, na.rm = na.rm))
# no middle value to remove
sd_df <- x
}

# check that there are no dupes in midpoint groups
# e.g. for cws data shouldn't use category = "Gender" because there's more than one observation
if (length(by) > 0 && any(duplicated(midpt_df[[by]]))) {
cli::cli_abort("Your filters match too many observations. Only one observation should be used as the midpoint.")
}
sd_df <- dplyr::group_by(sd_df, dplyr::across(tidyselect::all_of(by)))
sd_df <- dplyr::summarise(sd_df, sd = sd({{ value }}, na.rm = na.rm))

# left join if there's a join column, cross join if null
if (length(by) > 0) {
x_out <- dplyr::left_join(x_out, midpt_df, by = by)
x_out <- dplyr::left_join(x_out, sd_df, by = by)
} else {
x_out <- dplyr::cross_join(x_out, midpt_df)
x_out <- dplyr::cross_join(x_out, sd_df)
}

x_out <- dplyr::mutate(x_out, z = ({{ value }} - midpt) / sd)
x_out <- dplyr::mutate(x_out, brk = cut(z, breaks = c(-Inf, brks, Inf), labels = labels, ...))

if (!keep_calcs) {
x_out <- dplyr::select(x_out, -midpt, -sd, -z)
}
x_out <- dplyr::group_by(x_out, dplyr::across(tidyselect::any_of(grp_vars)))
x_out
}
21 changes: 21 additions & 0 deletions man/age_lbls.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions man/endpoint_lbls.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion man/pipe.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 60 additions & 0 deletions man/stdev_brks.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 0 additions & 24 deletions man/stylehaven-package.Rd

This file was deleted.

1 change: 0 additions & 1 deletion man/tidyeval.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 37 additions & 0 deletions tests/testthat/test-stdev_brks.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
test_that("stdev_brks returns correct dimensions", {
df_calc <- stdev_brks(life_exp, keep_calcs = TRUE)
df_drop <- stdev_brks(life_exp, keep_calcs = FALSE)
expect_equal(nrow(df_calc), nrow(life_exp))
expect_equal(nrow(df_drop), nrow(life_exp))
expect_equal(ncol(df_calc), ncol(life_exp) + 4)
expect_equal(ncol(df_drop), ncol(life_exp) + 1)
})

test_that("stdev_brks retains groups", {
df_grp <- fin_insecurity |>
dplyr::group_by(question) |>
stdev_brks()
df_not <- fin_insecurity |>
stdev_brks(by = "question")
expect_true(dplyr::is_grouped_df(df_grp))
expect_false(dplyr::is_grouped_df(df_not))
expect_identical(dplyr::ungroup(df_grp), df_not)
})

test_that("stdev_brks handles filters", {
df <- dplyr::group_by(fin_insecurity, question)
f_right <- list(category = "Connecticut", group = "Connecticut")
# too many values
f_warn <- list(group = c("Connecticut", "Greater New Haven"))
# no names
f_err1 <- list("Connecticut")
# vector instead of list
f_err2 <- c(group = "Connecticut")
# dupes
f_err3 <- list(category = "Gender")
expect_s3_class(stdev_brks(df, filters = f_right), "data.frame")
expect_warning(stdev_brks(df, filters = f_warn))
expect_error(stdev_brks(df, filters = f_err1))
expect_error(stdev_brks(df, filters = f_err2))
expect_error(stdev_brks(df, filters = f_err3))
})

0 comments on commit 6f7e1b7

Please sign in to comment.