diff --git a/DESCRIPTION b/DESCRIPTION index 1cc55f2..d95f3ce 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,8 @@ Imports: stringr, sysfonts, tidyr, - xfun + xfun, + tidyselect Suggests: roxygen2, forcats, diff --git a/NAMESPACE b/NAMESPACE index d9b8606..83793cf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(plot,palx) export("%>%") export(":=") export(.data) +export(age_lbls) export(as_label) export(as_name) export(dodge_lbls) @@ -24,6 +25,7 @@ export(round_sum100) export(scaffold_project) export(scale_y_barcontinuous) export(stack_lbls) +export(stdev_brks) export(sym) export(syms) export(title_case) diff --git a/R/age_lbls.R b/R/age_lbls.R new file mode 100644 index 0000000..d1d3493 --- /dev/null +++ b/R/age_lbls.R @@ -0,0 +1,18 @@ +#' @title Clean up age group labels +#' @description This function does some tedious regex replacement to make common labels we use for age groups (i.e. syntactically correct variable names) to labels meant for display in charts and tables. The main features are replacing underscores between numbers with dashes, adding space between text and numbers, replacing symbols, and parsing numbers. So "ages00_17" becomes "Ages 0-17", and "ages65plus" becomes "Ages 65+". Of course, you could also use this for other numeric ranges, like years. +#' @param x A string vector of labels to clean up +#' @return A string vector of display-worthy labels +#' @examples +#' a <- c("ages00_17", "ages18_64", "ages65plus", "under18", "ages18up") +#' age_lbls(a) +#' @export +age_lbls <- function(x) { + x <- stringr::str_replace(x, "(?<=\\d)_(?=\\d)", "-") + x <- stringr::str_replace_all(x, "_", " ") + x <- stringr::str_replace(x, "(plus|up)$", "+") + x <- stringr::str_replace_all(x, "(?<=[A-Za-z])\\B(?=\\d)", " ") + x <- stringr::str_replace_all(x, "(?<=\\d)\\B(?=[A-Za-z])", " ") + x <- stringr::str_remove(x, "(?<=\\b)0(?=[0-9])") + x <- stringr::str_to_sentence(x) + x +} diff --git a/R/data.R b/R/data.R index b3e966d..8097e92 100644 --- a/R/data.R +++ b/R/data.R @@ -1,4 +1,3 @@ -#' @concept dataset #' Life expectancy #' #' Dataset of life expectancy in years by census tract within New Haven County for 2010 to 2015. diff --git a/R/stdev_brks.R b/R/stdev_brks.R new file mode 100644 index 0000000..f3995b8 --- /dev/null +++ b/R/stdev_brks.R @@ -0,0 +1,97 @@ +#' @title Create labeled intervals based on standard deviations +#' @description Occasionally we make charts using standard deviations away from an average value to fill bars or geographies---not a hard task, but tedious. This function takes a data frame, then gets a midpoint value, either by calculating the mean or by filtering for an observation already in the data frame (such as a statewide value). It then calculates z-scores based on this midpoint and standard deviation, then cuts z-scores based on `brks`. Pay close attention to the argument `by`, which allows you to do these calculations grouped by some column; this is useful if you have a data frame of several different indicators. Alternatively, passing a grouped data frame will also do the calculations by group. +#' @param x A data frame or tibble +#' @param value Bare name of the numeric value column, Default: value +#' @param filters An optional named list of values to use for filtering. If given, the observation matching these values will be used as the midpoint. If NULL (the default), the midpoint will be calculated as the mean of values, grouped by `x`'s grouping columns (if any) and the arguments to `by` (also if any). +#' @param by Optional character vector. If given, this will be used as the group within which intervals are calculated. Default: NULL +#' @param brks Numeric vector of break points for cutting z-scores. This vector, plus `-Inf` and `Inf`, will be passed to `base::cut`'s `breaks` argument. Default: c(-2, -1/2, 1/2, 2) +#' @param labels Character vector of labels for the resulting factor. If NULL, levels will be in `base::cut`'s interval notation. The length of this vector should be one more than the length of `brks`. Default: NULL +#' @param na.rm Boolean passed on to `mean` if midpoints are being calculated. Default: TRUE +#' @param keep_calcs Boolean, whether to keep columns from calculations. Default: TRUE +#' @param ... Additional arguments passed to `base::cut` +#' @return A data frame or tibble with the same number of rows as `x`. If `keep_calcs` is true, the returned data frame will have numeric columns added for midpoint (`midpt`), standard deviation (`sd`), and z-score (`z`), and a factor column for the resulting intervals (`brk`). If false, the only column added will be the intervals. +#' @examples +#' # Calculate intervals along the full dataset, based on calculated mean +#' stdev_brks(life_exp, +#' labels = c("Lower", "Somewhat lower", "Average", "Somewhat higher", "Higher")) +#' +#' # Calculate intervals for each of the three indicators in the `question` column. +#' # Both examples have the same result: +#' fin_insecurity |> +#' stdev_brks(filters = list(category = "Connecticut"), by = "question") +#' +#' fin_insecurity |> +#' dplyr::group_by(question) |> +#' stdev_brks(filters = list(category = "Connecticut")) +#' @export +#' @seealso [base::cut()] +stdev_brks <- function(x, + value = value, + filters = NULL, + by = NULL, + brks = c(-2, -1/2, 1/2, 2), + labels = NULL, + na.rm = TRUE, + keep_calcs = TRUE, + ...) { + # type checks + if (!is.null(labels)) { + if (!(length(labels) == (length(brks) + 1))) { + cli::cli_abort("If supplying labels, {.arg labels} must have a length of 1 longer than that of {.arg brks}.") + } + } + + grp_vars <- dplyr::group_vars(x) + by <- c(grp_vars, by) + x_out <- dplyr::ungroup(x) + + if (!is.null(filters)) { + if (!inherits(filters, "list") || is.null(names(filters))) { + cli::cli_abort("{.arg filters} must be a named list, or {.val NULL} in order to skip filtering.") + } + filter_df <- as.data.frame(filters) + # warning--only use first row + if (nrow(filter_df) > 1) { + filter_df <- dplyr::slice(filter_df, 1) + filter_lbl <- purrr::imap_chr(filter_df, function(df, id) paste(id, df, sep = " = ")) + cli::cli_warn(c("Your filters have too many values. Only the first observation will be used as the midpoint.", + "i" = "Filtering for {filter_lbl}")) + } + + midpt_df <- dplyr::semi_join(x_out, filter_df, by = names(filters)) + midpt_df <- dplyr::select(midpt_df, tidyselect::all_of(by), midpt = {{ value }}) + # remove midpoint for getting sd + sd_df <- dplyr::anti_join(x_out, filter_df, by = names(filters)) + } else { + midpt_df <- dplyr::group_by(x_out, dplyr::across(tidyselect::all_of(by))) + midpt_df <- dplyr::summarise(midpt_df, midpt = mean({{ value }}, na.rm = na.rm)) + # no middle value to remove + sd_df <- x + } + + # check that there are no dupes in midpoint groups + # e.g. for cws data shouldn't use category = "Gender" because there's more than one observation + if (length(by) > 0 && any(duplicated(midpt_df[[by]]))) { + cli::cli_abort("Your filters match too many observations. Only one observation should be used as the midpoint.") + } + sd_df <- dplyr::group_by(sd_df, dplyr::across(tidyselect::all_of(by))) + sd_df <- dplyr::summarise(sd_df, sd = sd({{ value }}, na.rm = na.rm)) + + # left join if there's a join column, cross join if null + if (length(by) > 0) { + x_out <- dplyr::left_join(x_out, midpt_df, by = by) + x_out <- dplyr::left_join(x_out, sd_df, by = by) + } else { + x_out <- dplyr::cross_join(x_out, midpt_df) + x_out <- dplyr::cross_join(x_out, sd_df) + } + + x_out <- dplyr::mutate(x_out, z = ({{ value }} - midpt) / sd) + x_out <- dplyr::mutate(x_out, brk = cut(z, breaks = c(-Inf, brks, Inf), labels = labels, ...)) + + if (!keep_calcs) { + x_out <- dplyr::select(x_out, -midpt, -sd, -z) + } + x_out <- dplyr::group_by(x_out, dplyr::across(tidyselect::any_of(grp_vars))) + x_out +} diff --git a/man/age_lbls.Rd b/man/age_lbls.Rd new file mode 100644 index 0000000..0f67043 --- /dev/null +++ b/man/age_lbls.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/age_lbls.R +\name{age_lbls} +\alias{age_lbls} +\title{Clean up age group labels} +\usage{ +age_lbls(x) +} +\arguments{ +\item{x}{A string vector of labels to clean up} +} +\value{ +A string vector of display-worthy labels +} +\description{ +This function does some tedious regex replacement to make common labels we use for age groups (i.e. syntactically correct variable names) to labels meant for display in charts and tables. The main features are replacing underscores between numbers with dashes, adding space between text and numbers, replacing symbols, and parsing numbers. So "ages00_17" becomes "Ages 0-17", and "ages65plus" becomes "Ages 65+". Of course, you could also use this for other numeric ranges, like years. +} +\examples{ +a <- c("ages00_17", "ages18_64", "ages65plus", "under18", "ages18up") +age_lbls(a) +} diff --git a/man/endpoint_lbls.Rd b/man/endpoint_lbls.Rd index eb2443f..dae5f96 100644 --- a/man/endpoint_lbls.Rd +++ b/man/endpoint_lbls.Rd @@ -55,15 +55,17 @@ Easily put together labels for the endpoints of a chart, such as a line chart wi geom_line() + geom_point(size = 3) + geom_text(aes(label = lbl, hjust = just, x = x)) + - scale_x_continuous(expand = expansion(add = c(1, 2))) + scale_x_continuous(expand = expansion(add = c(1, 3)), + breaks = c(2015, 2021)) cws_trend \%>\% dplyr::filter(indicator == "local_govt_responsive", category == "Age") \%>\% endpoint_lbls(value = value, x = year, group = group, long_side = "both", - fun = percent100, add = 0.1, mult = NULL) \%>\% + fun = percent100, add = 0.4, mult = NULL) \%>\% ggplot(aes(x = year, y = value, color = group)) + geom_line() + geom_point(size = 3) + geom_text(aes(label = lbl, hjust = just, x = x)) + - scale_x_continuous(expand = expansion(add = 2)) + scale_x_continuous(expand = expansion(add = 3), + breaks = c(2015, 2021)) } diff --git a/man/pipe.Rd b/man/pipe.Rd index a648c29..8ef7807 100644 --- a/man/pipe.Rd +++ b/man/pipe.Rd @@ -17,4 +17,3 @@ The result of calling \code{rhs(lhs)}. \description{ See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details. } -\keyword{internal} diff --git a/man/stdev_brks.Rd b/man/stdev_brks.Rd new file mode 100644 index 0000000..e77d1a1 --- /dev/null +++ b/man/stdev_brks.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/stdev_brks.R +\name{stdev_brks} +\alias{stdev_brks} +\title{Create labeled intervals based on standard deviations} +\usage{ +stdev_brks( + x, + value = value, + filters = NULL, + by = NULL, + brks = c(-2, -1/2, 1/2, 2), + labels = NULL, + na.rm = TRUE, + keep_calcs = TRUE, + ... +) +} +\arguments{ +\item{x}{A data frame or tibble} + +\item{value}{Bare name of the numeric value column, Default: value} + +\item{filters}{An optional named list of values to use for filtering. If given, the observation matching these values will be used as the midpoint. If NULL (the default), the midpoint will be calculated as the mean of values, grouped by \code{x}'s grouping columns (if any) and the arguments to \code{by} (also if any).} + +\item{by}{Optional character vector. If given, this will be used as the group within which intervals are calculated. Default: NULL} + +\item{brks}{Numeric vector of break points for cutting z-scores. This vector, plus \code{-Inf} and \code{Inf}, will be passed to \code{base::cut}'s \code{breaks} argument. Default: c(-2, -1/2, 1/2, 2)} + +\item{labels}{Character vector of labels for the resulting factor. If NULL, levels will be in \code{base::cut}'s interval notation. The length of this vector should be one more than the length of \code{brks}. Default: NULL} + +\item{na.rm}{Boolean passed on to \code{mean} if midpoints are being calculated. Default: TRUE} + +\item{keep_calcs}{Boolean, whether to keep columns from calculations. Default: TRUE} + +\item{...}{Additional arguments passed to \code{base::cut}} +} +\value{ +A data frame or tibble with the same number of rows as \code{x}. If \code{keep_calcs} is true, the returned data frame will have numeric columns added for midpoint (\code{midpt}), standard deviation (\code{sd}), and z-score (\code{z}), and a factor column for the resulting intervals (\code{brk}). If false, the only column added will be the intervals. +} +\description{ +Occasionally we make charts using standard deviations away from an average value to fill bars or geographies---not a hard task, but tedious. This function takes a data frame, then gets a midpoint value, either by calculating the mean or by filtering for an observation already in the data frame (such as a statewide value). It then calculates z-scores based on this midpoint and standard deviation, then cuts z-scores based on \code{brks}. Pay close attention to the argument \code{by}, which allows you to do these calculations grouped by some column; this is useful if you have a data frame of several different indicators. Alternatively, passing a grouped data frame will also do the calculations by group. +} +\examples{ +# Calculate intervals along the full dataset, based on calculated mean +stdev_brks(life_exp, + labels = c("Lower", "Somewhat lower", "Average", "Somewhat higher", "Higher")) + +# Calculate intervals for each of the three indicators in the `question` column. +# Both examples have the same result: +fin_insecurity |> + stdev_brks(filters = list(category = "Connecticut"), by = "question") + +fin_insecurity |> + dplyr::group_by(question) |> + stdev_brks(filters = list(category = "Connecticut")) +} +\seealso{ +\code{\link[base:cut]{base::cut()}} +} diff --git a/man/stylehaven-package.Rd b/man/stylehaven-package.Rd deleted file mode 100644 index 3b20011..0000000 --- a/man/stylehaven-package.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/stylehaven-package.R -\docType{package} -\name{stylehaven-package} -\alias{stylehaven} -\alias{stylehaven-package} -\title{stylehaven: Quick style utilities and guides} -\description{ -This is a collection of style utilities mostly revolving around making clean labels for legible plots and (eventually) examples of building common plots. There's not a whole lot to it, probably never will be, just a collection of utilities, examples, and some best(?) practices. -} -\seealso{ -Useful links: -\itemize{ - \item \url{https://github.com/CT-Data-Haven/stylehaven} - \item \url{https://ct-data-haven.github.io/stylehaven} - \item Report bugs at \url{https://github.com/CT-Data-Haven/stylehaven/issues} -} - -} -\author{ -\strong{Maintainer}: Camille Seaberry \email{camille@ctdatahaven.org} - -} -\keyword{internal} diff --git a/man/tidyeval.Rd b/man/tidyeval.Rd index ebcd89a..35afa30 100644 --- a/man/tidyeval.Rd +++ b/man/tidyeval.Rd @@ -48,4 +48,3 @@ To learn more about tidy eval and how to use these tools, visit \href{https://adv-r.hadley.nz/metaprogramming.html}{Metaprogramming section} of \href{https://adv-r.hadley.nz}{Advanced R}. } -\keyword{internal} diff --git a/tests/testthat/test-stdev_brks.R b/tests/testthat/test-stdev_brks.R new file mode 100644 index 0000000..d924904 --- /dev/null +++ b/tests/testthat/test-stdev_brks.R @@ -0,0 +1,37 @@ +test_that("stdev_brks returns correct dimensions", { + df_calc <- stdev_brks(life_exp, keep_calcs = TRUE) + df_drop <- stdev_brks(life_exp, keep_calcs = FALSE) + expect_equal(nrow(df_calc), nrow(life_exp)) + expect_equal(nrow(df_drop), nrow(life_exp)) + expect_equal(ncol(df_calc), ncol(life_exp) + 4) + expect_equal(ncol(df_drop), ncol(life_exp) + 1) +}) + +test_that("stdev_brks retains groups", { + df_grp <- fin_insecurity |> + dplyr::group_by(question) |> + stdev_brks() + df_not <- fin_insecurity |> + stdev_brks(by = "question") + expect_true(dplyr::is_grouped_df(df_grp)) + expect_false(dplyr::is_grouped_df(df_not)) + expect_identical(dplyr::ungroup(df_grp), df_not) +}) + +test_that("stdev_brks handles filters", { + df <- dplyr::group_by(fin_insecurity, question) + f_right <- list(category = "Connecticut", group = "Connecticut") + # too many values + f_warn <- list(group = c("Connecticut", "Greater New Haven")) + # no names + f_err1 <- list("Connecticut") + # vector instead of list + f_err2 <- c(group = "Connecticut") + # dupes + f_err3 <- list(category = "Gender") + expect_s3_class(stdev_brks(df, filters = f_right), "data.frame") + expect_warning(stdev_brks(df, filters = f_warn)) + expect_error(stdev_brks(df, filters = f_err1)) + expect_error(stdev_brks(df, filters = f_err2)) + expect_error(stdev_brks(df, filters = f_err3)) +})