From 5ad1bcbd3ed3d78df5b3f86f0c3a594b15dc8578 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 18 Dec 2024 11:23:53 +0100 Subject: [PATCH] docs, tests, rename into rescaled_weights --- DESCRIPTION | 2 +- NEWS.md | 12 +- R/rescale_weights.R | 54 ++++---- man/rescale_weights.Rd | 30 ++--- tests/testthat/_snaps/rescale_weights.md | 159 ++++++++++++----------- tests/testthat/test-rescale_weights.R | 10 +- 6 files changed, 141 insertions(+), 126 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 68cfb6741..034c823ed 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.19 +Version: 0.13.0.20 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index 35e549ffa..82513340f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,20 +4,28 @@ BREAKING CHANGES AND DEPRECATIONS * *datawizard* now requires R >= 4.0 (#515). -* Argument `drop_na` in `data_match()` is deprecated now. Please use +* Argument `drop_na` in `data_match()` is deprecated now. Please use `remove_na` instead. * In `data_rename()` (#567): - argument `pattern` is deprecated. Use `select` instead. - - argument `safe` is deprecated. The function now errors when `select` + - argument `safe` is deprecated. The function now errors when `select` contains unknown column names. - when `replacement` is `NULL`, an error is now thrown (previously, column indices were used as new names). - if `select` (previously `pattern`) is a named vector, then all elements must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors. +* The names of the rescaled weights variables in `rescale_weights()` have been + changed. `pweights_a` and `pweights_b` are now named `rescaled_weights_a` + and `rescaled_weights_b`. 
+ CHANGES +* `rescale_weights()` gets a `method` argument, to choose method to rescale + weights. Options are `"carle"` (the default) and `"kish"`, a newly added + method to rescale weights. + * The `select` argument, which is available in different functions to select variables, can now also be a character vector with quoted variable names, including a colon to indicate a range of several variables (e.g. `"cyl:gear"`). diff --git a/R/rescale_weights.R b/R/rescale_weights.R index 5113a281e..de5d1874e 100644 --- a/R/rescale_weights.R +++ b/R/rescale_weights.R @@ -26,23 +26,23 @@ #' weights. Can be either `"carle"` (default) or `"kish"`. See 'Details'. #' #' @return `data`, including the new weighting variable(s). For -#' `method = "carle"`, new columns `pweights_a` and `pweights_b` are returned, -#' and for `method = "klish"`, the returned data contains a column `pweights`. -#' These represent the rescaled design weights to use in multilevel models (use -#' these variables for the `weights` argument). +#' `method = "carle"`, new columns `rescaled_weights_a` and `rescaled_weights_b` +#' are returned, and for `method = "kish"`, the returned data contains a column +#' `rescaled_weights`. These represent the rescaled design weights to use in +#' multilevel models (use these variables for the `weights` argument). #' #' @details #' - `method = "carle"` #' -#' Rescaling is based on two methods: For `pweights_a`, the sample weights -#' `probability_weights` are adjusted by a factor that represents the +#' Rescaling is based on two methods: For `rescaled_weights_a`, the sample +#' weights `probability_weights` are adjusted by a factor that represents the #' proportion of group size divided by the sum of sampling weights within each -#' group. The adjustment factor for `pweights_b` is the sum of sample weights -#' within each group divided by the sum of squared sample weights within each -#' group (see Carle (2009), Appendix B). 
In other words, `pweights_a` "scales -#' the weights so that the new weights sum to the cluster sample size" while -#' `pweights_b` "scales the weights so that the new weights sum to the -#' effective cluster size". +#' group. The adjustment factor for `rescaled_weights_b` is the sum of sample +#' weights within each group divided by the sum of squared sample weights +#' within each group (see Carle (2009), Appendix B). In other words, +#' `rescaled_weights_a` "scales the weights so that the new weights sum to the +#' cluster sample size" while `rescaled_weights_b` "scales the weights so that +#' the new weights sum to the effective cluster size". #' #' Regarding the choice between scaling methods A and B, Carle suggests that #' "analysts who wish to discuss point estimates should report results based @@ -102,13 +102,13 @@ #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), #' data = d, -#' weights = pweights_a +#' weights = rescaled_weights_a #' ) #' result2 <- lme4::glmer( #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), #' data = d, -#' weights = pweights_b +#' weights = rescaled_weights_b #' ) #' #' d <- rescale_weights( @@ -120,7 +120,7 @@ #' total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), #' family = poisson(), #' data = d, -#' weights = pweights +#' weights = rescaled_weights #' ) #' parameters::compare_parameters( #' list(result1, result2, result3), @@ -139,8 +139,8 @@ rescale_weights <- function(data, } # check for existing variable names - if ((method == "carle" && any(c("pweights_a", "pweights_b") %in% colnames(data))) || - (method == "kish" && "pweights" %in% colnames(data))) { + if ((method == "carle" && any(c("rescaled_weights_a", "rescaled_weights_b") %in% colnames(data))) || + (method == "kish" && "rescaled_weights" %in% colnames(data))) { insight::format_warning("The variable name for the rescaled weights already exists in the 
data. Returned columns will be renamed into unique names.") # nolint } @@ -190,8 +190,8 @@ rescale_weights <- function(data, # rescale weights, so their mean is 1 z_weights <- p_weights * (1 / mean(p_weights)) # divide weights by design effect - data$pweights <- NA_real_ - data$pweights[weight_non_na] <- z_weights / deff + data$rescaled_weights <- NA_real_ + data$rescaled_weights[weight_non_na] <- z_weights / deff # return result data } @@ -277,12 +277,12 @@ rescale_weights <- function(data, w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group out <- data.frame( - pweights_a = rep(NA_real_, times = n), - pweights_b = rep(NA_real_, times = n) + rescaled_weights_a = rep(NA_real_, times = n), + rescaled_weights_b = rep(NA_real_, times = n) ) - out$pweights_a[weight_non_na] <- w_a - out$pweights_b[weight_non_na] <- w_b + out$rescaled_weights_a[weight_non_na] <- w_a + out$rescaled_weights_b[weight_non_na] <- w_b out } @@ -325,12 +325,12 @@ rescale_weights <- function(data, w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group out <- data.frame( - pweights_a = rep(NA_real_, times = n), - pweights_b = rep(NA_real_, times = n) + rescaled_weights_a = rep(NA_real_, times = n), + rescaled_weights_b = rep(NA_real_, times = n) ) - out$pweights_a[weight_non_na] <- w_a - out$pweights_b[weight_non_na] <- w_b + out$rescaled_weights_a[weight_non_na] <- w_a + out$rescaled_weights_b[weight_non_na] <- w_b out } diff --git a/man/rescale_weights.Rd b/man/rescale_weights.Rd index 41e932990..65b4ec6cd 100644 --- a/man/rescale_weights.Rd +++ b/man/rescale_weights.Rd @@ -34,10 +34,10 @@ weights. Can be either \code{"carle"} (default) or \code{"kish"}. See 'Details'. } \value{ \code{data}, including the new weighting variable(s). For -\code{method = "carle"}, new columns \code{pweights_a} and \code{pweights_b} are returned, -and for \code{method = "klish"}, the returned data contains a column \code{pweights}. 
-These represent the rescaled design weights to use in multilevel models (use -these variables for the \code{weights} argument). +\code{method = "carle"}, new columns \code{rescaled_weights_a} and \code{rescaled_weights_b} +are returned, and for \code{method = "kish"}, the returned data contains a column +\code{rescaled_weights}. These represent the rescaled design weights to use in +multilevel models (use these variables for the \code{weights} argument). } \description{ Most functions to fit multilevel and mixed effects models only @@ -53,15 +53,15 @@ multilevel modelling. \itemize{ \item \code{method = "carle"} -Rescaling is based on two methods: For \code{pweights_a}, the sample weights -\code{probability_weights} are adjusted by a factor that represents the +Rescaling is based on two methods: For \code{rescaled_weights_a}, the sample +weights \code{probability_weights} are adjusted by a factor that represents the proportion of group size divided by the sum of sampling weights within each -group. The adjustment factor for \code{pweights_b} is the sum of sample weights -within each group divided by the sum of squared sample weights within each -group (see Carle (2009), Appendix B). In other words, \code{pweights_a} "scales -the weights so that the new weights sum to the cluster sample size" while -\code{pweights_b} "scales the weights so that the new weights sum to the -effective cluster size". +group. The adjustment factor for \code{rescaled_weights_b} is the sum of sample +weights within each group divided by the sum of squared sample weights +within each group (see Carle (2009), Appendix B). In other words, +\code{rescaled_weights_a} "scales the weights so that the new weights sum to the +cluster sample size" while \code{rescaled_weights_b} "scales the weights so that +the new weights sum to the effective cluster size". 
Regarding the choice between scaling methods A and B, Carle suggests that "analysts who wish to discuss point estimates should report results based @@ -112,13 +112,13 @@ result1 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), data = d, - weights = pweights_a + weights = rescaled_weights_a ) result2 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), data = d, - weights = pweights_b + weights = rescaled_weights_b ) d <- rescale_weights( @@ -130,7 +130,7 @@ result3 <- lme4::glmer( total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU), family = poisson(), data = d, - weights = pweights + weights = rescaled_weights ) parameters::compare_parameters( list(result1, result2, result3), diff --git a/tests/testthat/_snaps/rescale_weights.md b/tests/testthat/_snaps/rescale_weights.md index ecdefdd06..4ad736b27 100644 --- a/tests/testthat/_snaps/rescale_weights.md +++ b/tests/testthat/_snaps/rescale_weights.md @@ -3,13 +3,20 @@ Code head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")) Output - total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweights_a pweights_b - 1 1 2.20 1 3 2 31 97593.68 1.5733612 1.2005159 - 2 7 2.08 2 3 1 29 39599.36 0.6231745 0.5246593 - 3 3 1.48 2 1 2 42 26619.83 0.8976966 0.5439111 - 4 4 1.32 2 4 2 33 34998.53 0.7083628 0.5498944 - 5 1 2.00 2 1 1 41 14746.45 0.4217782 0.3119698 - 6 6 2.20 2 4 1 38 28232.10 0.6877550 0.5155503 + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a + 1 1 2.20 1 3 2 31 97593.68 1.5733612 + 2 7 2.08 2 3 1 29 39599.36 0.6231745 + 3 3 1.48 2 1 2 42 26619.83 0.8976966 + 4 4 1.32 2 4 2 33 34998.53 0.7083628 + 5 1 2.00 2 1 1 41 14746.45 0.4217782 + 6 6 2.20 2 4 1 38 28232.10 0.6877550 + rescaled_weights_b + 1 1.2005159 + 2 0.5246593 + 3 0.5439111 + 4 0.5498944 + 5 0.3119698 + 6 0.5155503 --- @@ -36,13 +43,13 @@ Code head(rescale_weights(nhanes_sample, 
probability_weights = "WTINT2YR", method = "kish")) Output - total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweights - 1 1 2.20 1 3 2 31 97593.68 1.3952529 - 2 7 2.08 2 3 1 29 39599.36 0.5661343 - 3 3 1.48 2 1 2 42 26619.83 0.3805718 - 4 4 1.32 2 4 2 33 34998.53 0.5003582 - 5 1 2.00 2 1 1 41 14746.45 0.2108234 - 6 6 2.20 2 4 1 38 28232.10 0.4036216 + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights + 1 1 2.20 1 3 2 31 97593.68 1.3952529 + 2 7 2.08 2 3 1 29 39599.36 0.5661343 + 3 3 1.48 2 1 2 42 26619.83 0.3805718 + 4 4 1.32 2 4 2 33 34998.53 0.5003582 + 5 1 2.00 2 1 1 41 14746.45 0.2108234 + 6 6 2.20 2 4 1 38 28232.10 0.4036216 # rescale_weights nested works as expected @@ -50,66 +57,66 @@ rescale_weights(data = head(nhanes_sample, n = 30), by = c("SDMVSTRA", "SDMVPSU"), probability_weights = "WTINT2YR", nest = TRUE) Output - total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweights_a - 1 1 2.20 1 3 2 31 97593.679 1.0000000 - 2 7 2.08 2 3 1 29 39599.363 0.5502486 - 3 3 1.48 2 1 2 42 26619.834 0.9512543 - 4 4 1.32 2 4 2 33 34998.530 0.6766764 - 5 1 2.00 2 1 1 41 14746.454 0.7147710 - 6 6 2.20 2 4 1 38 28232.100 1.0000000 - 7 350 1.60 1 3 2 33 93162.431 1.8012419 - 8 NA 1.48 2 3 1 29 82275.986 1.1432570 - 9 3 2.28 2 4 1 41 24726.391 1.1985056 - 10 30 0.84 1 3 2 35 39895.048 1.0000000 - 11 70 1.24 1 4 2 33 27002.703 0.5220817 - 12 5 1.68 2 1 2 39 18792.034 0.3866720 - 13 60 2.20 1 3 2 30 76894.563 1.0000000 - 14 2 1.48 2 3 1 29 82275.986 1.1432570 - 15 8 2.36 2 3 2 39 78406.811 1.6133280 - 16 3 2.04 2 3 2 36 98200.912 1.0000000 - 17 1 2.08 1 3 1 40 87786.091 1.0000000 - 18 7 1.00 1 3 2 32 90803.158 1.2693642 - 19 9 2.28 2 3 2 34 45002.917 1.0000000 - 20 2 1.24 2 3 1 29 82275.986 1.1432570 - 21 4 2.28 2 3 1 34 91437.145 1.4088525 - 22 3 1.04 1 1 2 42 29348.027 1.0487457 - 23 4 1.12 1 1 1 34 38366.567 0.5911475 - 24 1 1.52 2 1 1 42 6622.334 1.0000000 - 25 22 2.24 1 4 1 41 22420.209 1.0867233 - 26 7 1.00 2 3 2 41 65529.204 1.0000000 - 27 
5 0.92 2 4 1 30 27089.745 1.0000000 - 28 15 1.04 1 3 2 32 52265.570 0.7306358 - 29 3 0.80 1 3 1 33 64789.307 1.0000000 - 30 1 1.00 1 3 1 29 73404.222 1.0199804 - pweights_b - 1 1.0000000 - 2 0.5226284 - 3 0.9489993 - 4 0.5107078 - 5 0.6854605 - 6 1.0000000 - 7 1.3594509 - 8 1.0858702 - 9 1.1493587 - 10 1.0000000 - 11 0.3940306 - 12 0.2809766 - 13 1.0000000 - 14 1.0858702 - 15 1.1723308 - 16 1.0000000 - 17 1.0000000 - 18 1.1834934 - 19 1.0000000 - 20 1.0858702 - 21 1.2070771 - 22 1.0462596 - 23 0.5064835 - 24 1.0000000 - 25 1.0421602 - 26 1.0000000 - 27 1.0000000 - 28 0.6812093 - 29 1.0000000 - 30 0.9687816 + total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a + 1 1 2.20 1 3 2 31 97593.679 1.0000000 + 2 7 2.08 2 3 1 29 39599.363 0.5502486 + 3 3 1.48 2 1 2 42 26619.834 0.9512543 + 4 4 1.32 2 4 2 33 34998.530 0.6766764 + 5 1 2.00 2 1 1 41 14746.454 0.7147710 + 6 6 2.20 2 4 1 38 28232.100 1.0000000 + 7 350 1.60 1 3 2 33 93162.431 1.8012419 + 8 NA 1.48 2 3 1 29 82275.986 1.1432570 + 9 3 2.28 2 4 1 41 24726.391 1.1985056 + 10 30 0.84 1 3 2 35 39895.048 1.0000000 + 11 70 1.24 1 4 2 33 27002.703 0.5220817 + 12 5 1.68 2 1 2 39 18792.034 0.3866720 + 13 60 2.20 1 3 2 30 76894.563 1.0000000 + 14 2 1.48 2 3 1 29 82275.986 1.1432570 + 15 8 2.36 2 3 2 39 78406.811 1.6133280 + 16 3 2.04 2 3 2 36 98200.912 1.0000000 + 17 1 2.08 1 3 1 40 87786.091 1.0000000 + 18 7 1.00 1 3 2 32 90803.158 1.2693642 + 19 9 2.28 2 3 2 34 45002.917 1.0000000 + 20 2 1.24 2 3 1 29 82275.986 1.1432570 + 21 4 2.28 2 3 1 34 91437.145 1.4088525 + 22 3 1.04 1 1 2 42 29348.027 1.0487457 + 23 4 1.12 1 1 1 34 38366.567 0.5911475 + 24 1 1.52 2 1 1 42 6622.334 1.0000000 + 25 22 2.24 1 4 1 41 22420.209 1.0867233 + 26 7 1.00 2 3 2 41 65529.204 1.0000000 + 27 5 0.92 2 4 1 30 27089.745 1.0000000 + 28 15 1.04 1 3 2 32 52265.570 0.7306358 + 29 3 0.80 1 3 1 33 64789.307 1.0000000 + 30 1 1.00 1 3 1 29 73404.222 1.0199804 + rescaled_weights_b + 1 1.0000000 + 2 0.5226284 + 3 0.9489993 + 4 0.5107078 + 5 
0.6854605 + 6 1.0000000 + 7 1.3594509 + 8 1.0858702 + 9 1.1493587 + 10 1.0000000 + 11 0.3940306 + 12 0.2809766 + 13 1.0000000 + 14 1.0858702 + 15 1.1723308 + 16 1.0000000 + 17 1.0000000 + 18 1.1834934 + 19 1.0000000 + 20 1.0858702 + 21 1.2070771 + 22 1.0462596 + 23 0.5064835 + 24 1.0000000 + 25 1.0421602 + 26 1.0000000 + 27 1.0000000 + 28 0.6812093 + 29 1.0000000 + 30 0.9687816 diff --git a/tests/testthat/test-rescale_weights.R b/tests/testthat/test-rescale_weights.R index 9c2415626..1a1c1f296 100644 --- a/tests/testthat/test-rescale_weights.R +++ b/tests/testthat/test-rescale_weights.R @@ -10,10 +10,10 @@ test_that("rescale_weights works as expected", { expect_snapshot(head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish"))) out <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR") - expect_equal(sum(out$pweights_a), 2992, tolerance = 1e-3) - expect_equal(sum(out$pweights_b), 2244.71451, tolerance = 1e-3) + expect_equal(sum(out$rescaled_weights_a), 2992, tolerance = 1e-3) + expect_equal(sum(out$rescaled_weights_b), 2244.71451, tolerance = 1e-3) out <- rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish") - expect_equal(sum(out$pweights), 2162.53961, tolerance = 1e-3) + expect_equal(sum(out$rescaled_weights), 2162.53961, tolerance = 1e-3) }) @@ -90,7 +90,7 @@ test_that("rescale_weights errors and warnings", { regex = "is not used" ) - nhanes_sample$pweights_a <- 1 + nhanes_sample$rescaled_weights_a <- 1 expect_warning( { out <- rescale_weights( @@ -105,7 +105,7 @@ test_that("rescale_weights errors and warnings", { out, c( "total", "age", "RIAGENDR", "RIDRETH1", "SDMVPSU", "SDMVSTRA", - "WTINT2YR", "pweights_a", "pweights_a_1", "pweights_b" + "WTINT2YR", "rescaled_weights_a", "rescaled_weights_a_1", "rescaled_weights_b" ) ) })