Add age_lbls, stdev_brks functions

CT-Data-Haven · Oct 20, 2023 · 6f7e1b7 · 6f7e1b7
1 parent 1892851
commit 6f7e1b7
Show file tree

Hide file tree

Showing 12 changed files with 242 additions and 31 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -29,7 +29,8 @@ Imports:
     stringr,
     sysfonts,
     tidyr,
-    xfun
+    xfun,
+    tidyselect
 Suggests: 
     roxygen2,
     forcats,

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,7 @@ S3method(plot,palx)
 export("%>%")
 export(":=")
 export(.data)
+export(age_lbls)
 export(as_label)
 export(as_name)
 export(dodge_lbls)
@@ -24,6 +25,7 @@ export(round_sum100)
 export(scaffold_project)
 export(scale_y_barcontinuous)
 export(stack_lbls)
+export(stdev_brks)
 export(sym)
 export(syms)
 export(title_case)

diff --git a/R/age_lbls.R b/R/age_lbls.R
@@ -0,0 +1,18 @@
+#' @title Clean up age group labels
+#' @description This function does some tedious regex replacement to make common labels we use for age groups (i.e. syntactically correct variable names) to labels meant for display in charts and tables. The main features are replacing underscores between numbers with dashes, adding space between text and numbers, replacing symbols, and parsing numbers. So "ages00_17" becomes "Ages 0-17", and "ages65plus" becomes "Ages 65+". Of course, you could also use this for other numeric ranges, like years.
+#' @param x A string vector of labels to clean up
+#' @return A string vector of display-worthy labels
+#' @examples 
+#' a <- c("ages00_17", "ages18_64", "ages65plus", "under18", "ages18up")
+#' age_lbls(a)
+#' @export 
+age_lbls <- function(x) {
+  x <- stringr::str_replace(x, "(?<=\\d)_(?=\\d)", "-")
+  x <- stringr::str_replace_all(x, "_", " ")
+  x <- stringr::str_replace(x, "(plus|up)$", "+")
+  x <- stringr::str_replace_all(x, "(?<=[A-Za-z])\\B(?=\\d)", " ")
+  x <- stringr::str_replace_all(x, "(?<=\\d)\\B(?=[A-Za-z])", " ")
+  x <- stringr::str_remove(x, "(?<=\\b)0(?=[0-9])")
+  x <- stringr::str_to_sentence(x)
+  x
+}
diff --git a/R/data.R b/R/data.R
@@ -1,4 +1,3 @@
-#' @concept dataset
 #' Life expectancy
 #'
 #' Dataset of life expectancy in years by census tract within New Haven County for 2010 to 2015.

diff --git a/R/stdev_brks.R b/R/stdev_brks.R
@@ -0,0 +1,97 @@
+#' @title Create labeled intervals based on standard deviations
+#' @description Occasionally we make charts using standard deviations away from an average value to fill bars or geographies---not a hard task, but tedious. This function takes a data frame, then gets a midpoint value, either by calculating the mean or by filtering for an observation already in the data frame (such as a statewide value). It then calculates z-scores based on this midpoint and standard deviation, then cuts z-scores based on `brks`. Pay close attention to the argument `by`, which allows you to do these calculations grouped by some column; this is useful if you have a data frame of several different indicators. Alternatively, passing a grouped data frame will also do the calculations by group.
+#' @param x A data frame or tibble
+#' @param value Bare name of the numeric value column, Default: value
+#' @param filters An optional named list of values to use for filtering. If given, the observation matching these values will be used as the midpoint. If NULL (the default), the midpoint will be calculated as the mean of values, grouped by `x`'s grouping columns (if any) and the arguments to `by` (also if any).
+#' @param by Optional character vector. If given, this will be used as the group within which intervals are calculated. Default: NULL
+#' @param brks Numeric vector of break points for cutting z-scores. This vector, plus `-Inf` and `Inf`, will be passed to `base::cut`'s `breaks` argument. Default: c(-2, -1/2, 1/2, 2)
+#' @param labels Character vector of labels for the resulting factor. If NULL, levels will be in `base::cut`'s interval notation. The length of this vector should be one more than the length of `brks`. Default: NULL
+#' @param na.rm Boolean passed on to `mean` if midpoints are being calculated. Default: TRUE
+#' @param keep_calcs Boolean, whether to keep columns from calculations. Default: TRUE
+#' @param ... Additional arguments passed to `base::cut`
+#' @return A data frame or tibble with the same number of rows as `x`. If `keep_calcs` is true, the returned data frame will have numeric columns added for midpoint (`midpt`), standard deviation (`sd`), and z-score (`z`), and a factor column for the resulting intervals (`brk`). If false, the only column added will be the intervals.
+#' @examples 
+#' # Calculate intervals along the full dataset, based on calculated mean
+#' stdev_brks(life_exp, 
+#'            labels = c("Lower", "Somewhat lower", "Average", "Somewhat higher", "Higher"))
+#'            
+#' # Calculate intervals for each of the three indicators in the `question` column. 
+#' # Both examples have the same result:
+#' fin_insecurity |>
+#'   stdev_brks(filters = list(category = "Connecticut"), by = "question")
+#' 
+#' fin_insecurity |>
+#'   dplyr::group_by(question) |>
+#'   stdev_brks(filters = list(category = "Connecticut"))
+#' @export 
+#' @seealso [base::cut()]
+stdev_brks <- function(x,
+                       value = value,
+                       filters = NULL,
+                       by = NULL,
+                       brks = c(-2, -1/2, 1/2, 2),
+                       labels = NULL,
+                       na.rm = TRUE,
+                       keep_calcs = TRUE,
+                       ...) {
+  # type checks
+  if (!is.null(labels)) {
+    if (!(length(labels) == (length(brks) + 1))) {
+      cli::cli_abort("If supplying labels, {.arg labels} must have a length of 1 longer than that of {.arg brks}.")
+    }
+  }
+
+  grp_vars <- dplyr::group_vars(x)
+  by <- c(grp_vars, by)
+  x_out <- dplyr::ungroup(x)
+
+  if (!is.null(filters)) {
+    if (!inherits(filters, "list") || is.null(names(filters))) {
+      cli::cli_abort("{.arg filters} must be a named list, or {.val NULL} in order to skip filtering.")
+    }
+    filter_df <- as.data.frame(filters)
+    # warning--only use first row
+    if (nrow(filter_df) > 1) {
+      filter_df <- dplyr::slice(filter_df, 1)
+      filter_lbl <- purrr::imap_chr(filter_df, function(df, id) paste(id, df, sep = " = "))
+      cli::cli_warn(c("Your filters have too many values. Only the first observation will be used as the midpoint.",
+                      "i" = "Filtering for {filter_lbl}"))
+    }
+
+    midpt_df <- dplyr::semi_join(x_out, filter_df, by = names(filters))
+    midpt_df <- dplyr::select(midpt_df, tidyselect::all_of(by), midpt = {{ value }})
+    # remove midpoint for getting sd
+    sd_df <- dplyr::anti_join(x_out, filter_df, by = names(filters))
+  } else {
+    midpt_df <- dplyr::group_by(x_out, dplyr::across(tidyselect::all_of(by)))
+    midpt_df <- dplyr::summarise(midpt_df, midpt = mean({{ value }}, na.rm = na.rm))
+    # no middle value to remove
+    sd_df <- x
+  }
+
+  # check that there are no dupes in midpoint groups
+  # e.g. for cws data shouldn't use category = "Gender" because there's more than one observation
+  if (length(by) > 0 && any(duplicated(midpt_df[[by]]))) {
+    cli::cli_abort("Your filters match too many observations. Only one observation should be used as the midpoint.")
+  }
+  sd_df <- dplyr::group_by(sd_df, dplyr::across(tidyselect::all_of(by)))
+  sd_df <- dplyr::summarise(sd_df, sd = sd({{ value }}, na.rm = na.rm))
+
+  # left join if there's a join column, cross join if null
+  if (length(by) > 0) {
+    x_out <- dplyr::left_join(x_out, midpt_df, by = by)
+    x_out <- dplyr::left_join(x_out, sd_df, by = by)
+  } else {
+    x_out <- dplyr::cross_join(x_out, midpt_df)
+    x_out <- dplyr::cross_join(x_out, sd_df)
+  }
+
+  x_out <- dplyr::mutate(x_out, z = ({{ value }} - midpt) / sd)
+  x_out <- dplyr::mutate(x_out, brk = cut(z, breaks = c(-Inf, brks, Inf), labels = labels, ...))
+
+  if (!keep_calcs) {
+    x_out <- dplyr::select(x_out, -midpt, -sd, -z)
+  }
+  x_out <- dplyr::group_by(x_out, dplyr::across(tidyselect::any_of(grp_vars)))
+  x_out
+}
diff --git a/man/age_lbls.Rd b/man/age_lbls.Rd
diff --git a/man/endpoint_lbls.Rd b/man/endpoint_lbls.Rd
diff --git a/man/pipe.Rd b/man/pipe.Rd
diff --git a/man/stdev_brks.Rd b/man/stdev_brks.Rd
diff --git a/man/stylehaven-package.Rd b/man/stylehaven-package.Rd
diff --git a/man/tidyeval.Rd b/man/tidyeval.Rd
diff --git a/tests/testthat/test-stdev_brks.R b/tests/testthat/test-stdev_brks.R
@@ -0,0 +1,37 @@
+test_that("stdev_brks returns correct dimensions", {
+  df_calc <- stdev_brks(life_exp, keep_calcs = TRUE)
+  df_drop <- stdev_brks(life_exp, keep_calcs = FALSE)
+  expect_equal(nrow(df_calc), nrow(life_exp))
+  expect_equal(nrow(df_drop), nrow(life_exp))
+  expect_equal(ncol(df_calc), ncol(life_exp) + 4)
+  expect_equal(ncol(df_drop), ncol(life_exp) + 1)
+})
+
+test_that("stdev_brks retains groups", {
+  df_grp <- fin_insecurity |>
+    dplyr::group_by(question) |>
+    stdev_brks()
+  df_not <- fin_insecurity |>
+    stdev_brks(by = "question")
+  expect_true(dplyr::is_grouped_df(df_grp))
+  expect_false(dplyr::is_grouped_df(df_not))
+  expect_identical(dplyr::ungroup(df_grp), df_not)
+})
+
+test_that("stdev_brks handles filters", {
+  df <- dplyr::group_by(fin_insecurity, question)
+  f_right <- list(category = "Connecticut", group = "Connecticut")
+  # too many values
+  f_warn <- list(group = c("Connecticut", "Greater New Haven"))
+  # no names
+  f_err1 <- list("Connecticut")
+  # vector instead of list
+  f_err2 <- c(group = "Connecticut")
+  # dupes
+  f_err3 <- list(category = "Gender")
+  expect_s3_class(stdev_brks(df, filters = f_right), "data.frame")
+  expect_warning(stdev_brks(df, filters = f_warn))
+  expect_error(stdev_brks(df, filters = f_err1))
+  expect_error(stdev_brks(df, filters = f_err2))
+  expect_error(stdev_brks(df, filters = f_err3))
+})