easystats · strengejacke · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -16,13 +16,21 @@ BREAKING CHANGES AND DEPRECATIONS
   - if `select` (previously `pattern`) is a named vector, then all elements
     must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors.
 
+
+* The name of the rescaled weights variables in `rescale_weights()` have been
+  renamed. `pweights_a` and `pweights_b` are now named `rescaled_weights_a`
+  and `rescaled_weights_b`.
+
 * `print()` methods for `data_tabulate()` with multiple sub-tables (i.e. when
   length of `by` was > 1) were revised. Now, an integrated table instead of
   multiple tables is returned. Furthermore, `print_html()` did not work, which
   was also fixed now.
 
 CHANGES
 
+* `rescale_weights()` gets a `method` argument, to choose method to rescale
+  weights. Options are `"carle"` (the default) and `"kish"`.
+
 * The `select` argument, which is available in different functions to select
   variables, can now also be a character vector with quoted variable names,
   including a colon to indicate a range of several variables (e.g. `"cyl:gear"`).

diff --git a/R/data_arrange.R b/R/data_arrange.R
@@ -42,7 +42,7 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
   select <- gsub("^-", "", select)
 
   # check for variables that are not in data
-  dont_exist <- select[which(!select %in% names(data))]
+  dont_exist <- setdiff(select, colnames(data))
 
   if (length(dont_exist) > 0) {
     if (safe) {

diff --git a/R/rescale_weights.R b/R/rescale_weights.R
@@ -2,96 +2,161 @@
 #' @name rescale_weights
 #'
 #' @description Most functions to fit multilevel and mixed effects models only
-#'   allow to specify frequency weights, but not design (i.e. sampling or
-#'   probability) weights, which should be used when analyzing complex samples
-#'   and survey data. `rescale_weights()` implements an algorithm proposed
-#'   by \cite{Asparouhov (2006)} and \cite{Carle (2009)} to rescale design
-#'   weights in survey data to account for the grouping structure of multilevel
-#'   models, which then can be used for multilevel modelling.
+#' allow to specify frequency weights, but not design (i.e. sampling or
+#' probability) weights, which should be used when analyzing complex samples
+#' and survey data. `rescale_weights()` implements two algorithms, one proposed
+#' by \cite{Asparouhov (2006)} and \cite{Carle (2009)} and one proposed by
+#' \cite{Kish (1965)}, to rescale design weights in survey data to account for the
+#' grouping structure of multilevel models, which then can be used for
+#' multilevel modelling.
 #'
 #' @param data A data frame.
 #' @param by Variable names (as character vector, or as formula), indicating
-#'   the grouping structure (strata) of the survey data (level-2-cluster
-#'   variable). It is also possible to create weights for multiple group
-#'   variables; in such cases, each created weighting variable will be suffixed
-#'   by the name of the group variable.
+#' the grouping structure (strata) of the survey data (level-2-cluster
+#' variable). It is also possible to create weights for multiple group
+#' variables; in such cases, each created weighting variable will be suffixed
+#' by the name of the group variable. Argument `by` only applies to the default
+#' rescaling-method (`method = "carle"`), not to `method = "kish"`.
 #' @param probability_weights Variable indicating the probability (design or
-#'   sampling) weights of the survey data (level-1-weight).
+#' sampling) weights of the survey data (level-1-weight).
 #' @param nest Logical, if `TRUE` and `by` indicates at least two
-#'   group variables, then groups are "nested", i.e. groups are now a
-#'   combination from each group level of the variables in `by`.
+#' group variables, then groups are "nested", i.e. groups are now a
+#' combination from each group level of the variables in `by`.
+#' @param method String, indicating which rescale-method is used for rescaling
+#' weights. Can be either `"carle"` (default) or `"kish"`. See 'Details'.
 #'
-#' @return `data`, including the new weighting variables: `pweights_a`
-#'   and `pweights_b`, which represent the rescaled design weights to use
-#'   in multilevel models (use these variables for the `weights` argument).
+#' @return `data`, including the new weighting variable(s). For
+#' `method = "carle"`, new columns `rescaled_weights_a` and `rescaled_weights_b`
+#' are returned, and for `method = "klish"`, the returned data contains a column
+#' `rescaled_weights`. These represent the rescaled design weights to use in
+#' multilevel models (use these variables for the `weights` argument).
 #'
 #' @details
+#' - `method = "carle"`
 #'
-#' Rescaling is based on two methods: For `pweights_a`, the sample weights
-#' `probability_weights` are adjusted by a factor that represents the proportion
-#' of group size divided by the sum of sampling weights within each group. The
-#' adjustment factor for `pweights_b` is the sum of sample weights within each
-#' group divided by the sum of squared sample weights within each group (see
-#' Carle (2009), Appendix B). In other words, `pweights_a` "scales the weights
-#' so that the new weights sum to the cluster sample size" while `pweights_b`
-#' "scales the weights so that the new weights sum to the effective cluster
-#' size".
-#'
-#' Regarding the choice between scaling methods A and B, Carle suggests that
-#' "analysts who wish to discuss point estimates should report results based on
-#' weighting method A. For analysts more interested in residual between-group
-#' variance, method B may generally provide the least biased estimates". In
-#' general, it is recommended to fit a non-weighted model and weighted models
-#' with both scaling methods and when comparing the models, see whether the
-#' "inferential decisions converge", to gain confidence in the results.
-#'
-#' Though the bias of scaled weights decreases with increasing group size,
-#' method A is preferred when insufficient or low group size is a concern.
-#'
-#' The group ID and probably PSU may be used as random effects (e.g. nested
-#' design, or group and PSU as varying intercepts), depending on the survey
-#' design that should be mimicked.
+#'   Rescaling is based on two methods: For `rescaled_weights_a`, the sample
+#'   weights `probability_weights` are adjusted by a factor that represents the
+#'   proportion of group size divided by the sum of sampling weights within each
+#'   group. The adjustment factor for `rescaled_weights_b` is the sum of sample
+#'   weights within each group divided by the sum of squared sample weights
+#'   within each group (see Carle (2009), Appendix B). In other words,
+#'   `rescaled_weights_a` "scales the weights so that the new weights sum to the
+#'   cluster sample size" while `rescaled_weights_b` "scales the weights so that
+#'   the new weights sum to the effective cluster size".
+#'
+#'   Regarding the choice between scaling methods A and B, Carle suggests that
+#'   "analysts who wish to discuss point estimates should report results based
+#'   on weighting method A. For analysts more interested in residual
+#'   between-group variance, method B may generally provide the least biased
+#'   estimates". In general, it is recommended to fit a non-weighted model and
+#'   weighted models with both scaling methods and when comparing the models,
+#'   see whether the "inferential decisions converge", to gain confidence in the
+#'   results.
+#'
+#'   Though the bias of scaled weights decreases with increasing group size,
+#'   method A is preferred when insufficient or low group size is a concern.
+#'
+#'   The group ID and probably PSU may be used as random effects (e.g. nested
+#'   design, or group and PSU as varying intercepts), depending on the survey
+#'   design that should be mimicked.
+#'
+#' - `method = "kish"`
+#'
+#'   Rescaling is based on scaling the sample weights so the mean value is 1,
+#'   which means the sum of all weights equals the sample size. Next, the design
+#'   effect (_Kish 1965_) is calculated, which is the mean of the squared weights
+#'   divided by the squared mean of the weights. The scales sample weights are
+#'   then divided by the design effect.
+#'
+#'   Some tests on real-world survey-data suggest that, in comparison to the
+#'   Carle-method, the Kish-method comes closer to estimates from a regular
+#'   survey-design using the **survey** package. Note that these tests are not
+#'   representative and it is recommended to check your results against a
+#'   standard survey-design.
 #'
 #' @references
+#'   - Asparouhov T. (2006). General Multi-Level Modeling with Sampling
+#'   Weights. Communications in Statistics - Theory and Methods 35: 439-460
+#'
 #'   - Carle A.C. (2009). Fitting multilevel models in complex survey data
 #'   with design weights: Recommendations. BMC Medical Research Methodology
 #'   9(49): 1-13
 #'
-#'   - Asparouhov T. (2006). General Multi-Level Modeling with Sampling
-#'   Weights. Communications in Statistics - Theory and Methods 35: 439-460
+#'   - Kish, L. (1965) Survey Sampling. London: Wiley.
 #'
-#' @examples
-#' if (require("lme4")) {
-#'   data(nhanes_sample)
-#'   head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR"))
-#'
-#'   # also works with multiple group-variables
-#'   head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR"))
-#'
-#'   # or nested structures.
-#'   x <- rescale_weights(
-#'     data = nhanes_sample,
-#'     by = c("SDMVSTRA", "SDMVPSU"),
-#'     probability_weights = "WTINT2YR",
-#'     nest = TRUE
-#'   )
-#'   head(x)
-#'
-#'   nhanes_sample <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")
-#'
-#'   glmer(
-#'     total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | SDMVPSU),
-#'     family = poisson(),
-#'     data = nhanes_sample,
-#'     weights = pweights_a
-#'   )
+#' @examplesIf all(insight::check_if_installed(c("lme4", "parameters"), quietly = TRUE))
+#' data(nhanes_sample)
+#' head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR"))
+#'
+#' # also works with multiple group-variables
+#' head(rescale_weights(nhanes_sample, c("SDMVSTRA", "SDMVPSU"), "WTINT2YR"))
+#'
+#' # or nested structures.
+#' x <- rescale_weights(
+#'   data = nhanes_sample,
+#'   by = c("SDMVSTRA", "SDMVPSU"),
+#'   probability_weights = "WTINT2YR",
+#'   nest = TRUE
+#' )
+#' head(x)
+#'
+#' \donttest{
+#' # compare different methods, using multilevel-Poisson regression
+#'
+#' d <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")
+#' result1 <- lme4::glmer(
+#'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
+#'   family = poisson(),
+#'   data = d,
+#'   weights = rescaled_weights_a
+#' )
+#' result2 <- lme4::glmer(
+#'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
+#'   family = poisson(),
+#'   data = d,
+#'   weights = rescaled_weights_b
+#' )
+#'
+#' d <- rescale_weights(
+#'   nhanes_sample,
+#'   probability_weights = "WTINT2YR",
+#'   method = "kish"
+#' )
+#' result3 <- lme4::glmer(
+#'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
+#'   family = poisson(),
+#'   data = d,
+#'   weights = rescaled_weights
+#' )
+#' parameters::compare_parameters(
+#'   list(result1, result2, result3),
+#'   exponentiate = TRUE,
+#'   column_names = c("Carle (A)", "Carle (B)", "Kish")
+#' )
 #' }
 #' @export
-rescale_weights <- function(data, by, probability_weights, nest = FALSE) {
+rescale_weights <- function(data,
+                            by = NULL,
+                            probability_weights = NULL,
+                            nest = FALSE,
+                            method = "carle") {
+  method <- insight::validate_argument(method, c("carle", "kish"))
+
   if (inherits(by, "formula")) {
     by <- all.vars(by)
   }
 
+  # check for existing variable names
+  if ((method == "carle" && any(c("rescaled_weights_a", "rescaled_weights_b") %in% colnames(data))) ||
+    (method == "kish" && "rescaled_weights" %in% colnames(data))) {
+    insight::format_warning("The variable name for the rescaled weights already exists in the data. Returned columns will be renamed into unique names.") # nolint
+  }
+
+  # need probability_weights
+  if (is.null(probability_weights)) {
+    insight::format_error("The argument `probability_weights` is missing, but required to rescale weights.")
+  }
+
   # check if weight has missings. we need to remove them first,
   # and add back weights to correct cases later
 
@@ -104,9 +169,63 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) {
     data_tmp <- data
   }
 
+  fun_args <- list(
+    nest = nest,
+    probability_weights = probability_weights,
+    data_tmp = data_tmp,
+    data = data,
+    by = by,
+    weight_non_na = weight_non_na
+  )
+
+  switch(method,
+    carle = do.call(.rescale_weights_carle, fun_args),
+    do.call(.rescale_weights_kish, fun_args)
+  )
+}
+
+
+# rescale weights, method Kish ----------------------------
+
+.rescale_weights_kish <- function(nest, probability_weights, data_tmp, data, by, weight_non_na) {
+  # check argument
+  if (!is.null(by)) {
+    insight::format_warning("The `by` argument is not used for `method = \"kish\" and will be ignored.")
+  }
+  p_weights <- data_tmp[[probability_weights]]
+  # design effect according to Kish
+  deff <- mean(p_weights^2) / (mean(p_weights)^2)
+  # rescale weights, so their mean is 1
+  z_weights <- p_weights * (1 / mean(p_weights))
+  # divide weights by design effect
+  data$rescaled_weights <- NA_real_
+  data$rescaled_weights[weight_non_na] <- z_weights / deff
+  # return result
+  data
+}
+
+
+# rescale weights, method Carle ----------------------------
+
+.rescale_weights_carle <- function(nest, probability_weights, data_tmp, data, by, weight_non_na) {
   # sort id
   data_tmp$.bamboozled <- seq_len(nrow(data_tmp))
 
+  if (is.null(by)) {
+    insight::format_error("Argument `by` must be specified. Please provide one or more variable names in `by` that indicate the grouping structure (strata) of the survey data (level-2-cluster variable).") # nolint
+  }
+
+  if (!all(by %in% colnames(data_tmp))) {
+    dont_exist <- setdiff(by, colnames(data_tmp))
+    insight::format_error(
+      paste0(
+        "The following variable(s) specified in `by` don't exist in the dataset: ",
+        text_concatenate(dont_exist), "."
+      ),
+      .misspelled_string(colnames(data_tmp), dont_exist, "Possibly misspelled?")
+    )
+  }
+
   if (nest && length(by) < 2) {
     insight::format_warning(
       sprintf(
@@ -129,7 +248,15 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) {
     })
   }
 
-  do.call(cbind, list(data, out))
+  make_unique_names <- any(vapply(out, function(i) any(colnames(i) %in% colnames(data)), logical(1)))
+  # add weights to data frame
+  out <- do.call(cbind, list(data, out))
+  # check if we have to rename columns
+  if (make_unique_names) {
+    colnames(out) <- make.unique(colnames(out), sep = "_")
+  }
+
+  out
 }
 
 
@@ -158,12 +285,12 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) {
   w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group
 
   out <- data.frame(
-    pweights_a = rep(NA_real_, times = n),
-    pweights_b = rep(NA_real_, times = n)
+    rescaled_weights_a = rep(NA_real_, times = n),
+    rescaled_weights_b = rep(NA_real_, times = n)
   )
 
-  out$pweights_a[weight_non_na] <- w_a
-  out$pweights_b[weight_non_na] <- w_b
+  out$rescaled_weights_a[weight_non_na] <- w_a
+  out$rescaled_weights_b[weight_non_na] <- w_b
 
   out
 }
@@ -206,12 +333,12 @@ rescale_weights <- function(data, by, probability_weights, nest = FALSE) {
   w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group
 
   out <- data.frame(
-    pweights_a = rep(NA_real_, times = n),
-    pweights_b = rep(NA_real_, times = n)
+    rescaled_weights_a = rep(NA_real_, times = n),
+    rescaled_weights_b = rep(NA_real_, times = n)
   )
 
-  out$pweights_a[weight_non_na] <- w_a
-  out$pweights_b[weight_non_na] <- w_b
+  out$rescaled_weights_a[weight_non_na] <- w_a
+  out$rescaled_weights_b[weight_non_na] <- w_b
 
   out
 }
diff --git a/R/select_nse.R b/R/select_nse.R
@@ -198,11 +198,13 @@
 }
 
 # small helper, to avoid duplicated code
+
 .action_if_not_found <- function(x,
                                  columns,
                                  matches,
                                  verbose,
                                  ifnotfound) {
+
   msg <- paste0(
     "Following variable(s) were not found: ",
     toString(x[is.na(matches)])

diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -26,6 +26,7 @@ Heisig
 Herrington
 Hoffmann
 Joanes
+Kish
 Llabre
 Lumley
 MADs
-Original file line number
+Diff line change
@@ Expand Up / @@ -26,6 +26,7 @@ Heisig @@
     Herrington
     Hoffmann
     Joanes
+    Kish
     Llabre
     Lumley
     MADs
@@ Expand Down @@