From 5ad1bcbd3ed3d78df5b3f86f0c3a594b15dc8578 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Wed, 18 Dec 2024 11:23:53 +0100
Subject: [PATCH] docs, tests, rename into rescaled_weights

---
 DESCRIPTION                              |   2 +-
 NEWS.md                                  |  12 +-
 R/rescale_weights.R                      |  54 ++++----
 man/rescale_weights.Rd                   |  30 ++---
 tests/testthat/_snaps/rescale_weights.md | 159 ++++++++++++-----------
 tests/testthat/test-rescale_weights.R    |  10 +-
 6 files changed, 141 insertions(+), 126 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 68cfb6741..034c823ed 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.19
+Version: 0.13.0.20
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),
diff --git a/NEWS.md b/NEWS.md
index 35e549ffa..82513340f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,20 +4,28 @@ BREAKING CHANGES AND DEPRECATIONS
 
 * *datawizard* now requires R >= 4.0 (#515).
 
-* Argument `drop_na` in `data_match()` is deprecated now. Please use 
+* Argument `drop_na` in `data_match()` is deprecated now. Please use
   `remove_na` instead.
 
 * In `data_rename()` (#567):
   - argument `pattern` is deprecated. Use `select` instead.
-  - argument `safe` is deprecated. The function now errors when `select` 
+  - argument `safe` is deprecated. The function now errors when `select`
     contains unknown column names.
   - when `replacement` is `NULL`, an error is now thrown (previously, column
     indices were used as new names).
   - if `select` (previously `pattern`) is a named vector, then all elements
     must be named, e.g. `c(length = "Sepal.Length", "Sepal.Width")` errors.
 
+* The names of the rescaled weights variables in `rescale_weights()` have been
+  changed. `pweights_a` and `pweights_b` are now named `rescaled_weights_a`
+  and `rescaled_weights_b`.
+
 CHANGES
 
+* `rescale_weights()` gets a `method` argument to choose the method used to
+  rescale weights. Options are `"carle"` (the default) and `"kish"`, a newly
+  added rescaling method.
+
 * The `select` argument, which is available in different functions to select
   variables, can now also be a character vector with quoted variable names,
   including a colon to indicate a range of several variables (e.g. `"cyl:gear"`).
diff --git a/R/rescale_weights.R b/R/rescale_weights.R
index 5113a281e..de5d1874e 100644
--- a/R/rescale_weights.R
+++ b/R/rescale_weights.R
@@ -26,23 +26,23 @@
 #' weights. Can be either `"carle"` (default) or `"kish"`. See 'Details'.
 #'
 #' @return `data`, including the new weighting variable(s). For
-#' `method = "carle"`, new columns `pweights_a` and `pweights_b` are returned,
-#' and for `method = "klish"`, the returned data contains a column `pweights`.
-#' These represent the rescaled design weights to use in multilevel models (use
-#' these variables for the `weights` argument).
+#' `method = "carle"`, new columns `rescaled_weights_a` and `rescaled_weights_b`
+#' are returned, and for `method = "kish"`, the returned data contains a column
+#' `rescaled_weights`. These represent the rescaled design weights to use in
+#' multilevel models (use these variables for the `weights` argument).
 #'
 #' @details
 #' - `method = "carle"`
 #'
-#'   Rescaling is based on two methods: For `pweights_a`, the sample weights
-#'   `probability_weights` are adjusted by a factor that represents the
+#'   Rescaling is based on two methods: For `rescaled_weights_a`, the sample
+#'   weights `probability_weights` are adjusted by a factor that represents the
 #'   proportion of group size divided by the sum of sampling weights within each
-#'   group. The adjustment factor for `pweights_b` is the sum of sample weights
-#'   within each group divided by the sum of squared sample weights within each
-#'   group (see Carle (2009), Appendix B). In other words, `pweights_a` "scales
-#'   the weights so that the new weights sum to the cluster sample size" while
-#'   `pweights_b` "scales the weights so that the new weights sum to the
-#'   effective cluster size".
+#'   group. The adjustment factor for `rescaled_weights_b` is the sum of sample
+#'   weights within each group divided by the sum of squared sample weights
+#'   within each group (see Carle (2009), Appendix B). In other words,
+#'   `rescaled_weights_a` "scales the weights so that the new weights sum to the
+#'   cluster sample size" while `rescaled_weights_b` "scales the weights so that
+#'   the new weights sum to the effective cluster size".
 #'
 #'   Regarding the choice between scaling methods A and B, Carle suggests that
 #'   "analysts who wish to discuss point estimates should report results based
@@ -102,13 +102,13 @@
 #'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
 #'   family = poisson(),
 #'   data = d,
-#'   weights = pweights_a
+#'   weights = rescaled_weights_a
 #' )
 #' result2 <- lme4::glmer(
 #'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
 #'   family = poisson(),
 #'   data = d,
-#'   weights = pweights_b
+#'   weights = rescaled_weights_b
 #' )
 #'
 #' d <- rescale_weights(
@@ -120,7 +120,7 @@
 #'   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
 #'   family = poisson(),
 #'   data = d,
-#'   weights = pweights
+#'   weights = rescaled_weights
 #' )
 #' parameters::compare_parameters(
 #'   list(result1, result2, result3),
@@ -139,8 +139,8 @@ rescale_weights <- function(data,
   }
 
   # check for existing variable names
-  if ((method == "carle" && any(c("pweights_a", "pweights_b") %in% colnames(data))) ||
-    (method == "kish" && "pweights" %in% colnames(data))) {
+  if ((method == "carle" && any(c("rescaled_weights_a", "rescaled_weights_b") %in% colnames(data))) ||
+    (method == "kish" && "rescaled_weights" %in% colnames(data))) {
     insight::format_warning("The variable name for the rescaled weights already exists in the data. Returned columns will be renamed into unique names.") # nolint
   }
 
@@ -190,8 +190,8 @@ rescale_weights <- function(data,
   # rescale weights, so their mean is 1
   z_weights <- p_weights * (1 / mean(p_weights))
   # divide weights by design effect
-  data$pweights <- NA_real_
-  data$pweights[weight_non_na] <- z_weights / deff
+  data$rescaled_weights <- NA_real_
+  data$rescaled_weights[weight_non_na] <- z_weights / deff
   # return result
   data
 }
@@ -277,12 +277,12 @@ rescale_weights <- function(data,
   w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group
 
   out <- data.frame(
-    pweights_a = rep(NA_real_, times = n),
-    pweights_b = rep(NA_real_, times = n)
+    rescaled_weights_a = rep(NA_real_, times = n),
+    rescaled_weights_b = rep(NA_real_, times = n)
   )
 
-  out$pweights_a[weight_non_na] <- w_a
-  out$pweights_b[weight_non_na] <- w_b
+  out$rescaled_weights_a[weight_non_na] <- w_a
+  out$rescaled_weights_b[weight_non_na] <- w_b
 
   out
 }
@@ -325,12 +325,12 @@ rescale_weights <- function(data,
   w_b <- x[[probability_weights]] * x$sum_weights_by_group / x$sum_squared_weights_by_group
 
   out <- data.frame(
-    pweights_a = rep(NA_real_, times = n),
-    pweights_b = rep(NA_real_, times = n)
+    rescaled_weights_a = rep(NA_real_, times = n),
+    rescaled_weights_b = rep(NA_real_, times = n)
   )
 
-  out$pweights_a[weight_non_na] <- w_a
-  out$pweights_b[weight_non_na] <- w_b
+  out$rescaled_weights_a[weight_non_na] <- w_a
+  out$rescaled_weights_b[weight_non_na] <- w_b
 
   out
 }
diff --git a/man/rescale_weights.Rd b/man/rescale_weights.Rd
index 41e932990..65b4ec6cd 100644
--- a/man/rescale_weights.Rd
+++ b/man/rescale_weights.Rd
@@ -34,10 +34,10 @@ weights. Can be either \code{"carle"} (default) or \code{"kish"}. See 'Details'.
 }
 \value{
 \code{data}, including the new weighting variable(s). For
-\code{method = "carle"}, new columns \code{pweights_a} and \code{pweights_b} are returned,
-and for \code{method = "klish"}, the returned data contains a column \code{pweights}.
-These represent the rescaled design weights to use in multilevel models (use
-these variables for the \code{weights} argument).
+\code{method = "carle"}, new columns \code{rescaled_weights_a} and \code{rescaled_weights_b}
+are returned, and for \code{method = "kish"}, the returned data contains a column
+\code{rescaled_weights}. These represent the rescaled design weights to use in
+multilevel models (use these variables for the \code{weights} argument).
 }
 \description{
 Most functions to fit multilevel and mixed effects models only
@@ -53,15 +53,15 @@ multilevel modelling.
 \itemize{
 \item \code{method = "carle"}
 
-Rescaling is based on two methods: For \code{pweights_a}, the sample weights
-\code{probability_weights} are adjusted by a factor that represents the
+Rescaling is based on two methods: For \code{rescaled_weights_a}, the sample
+weights \code{probability_weights} are adjusted by a factor that represents the
 proportion of group size divided by the sum of sampling weights within each
-group. The adjustment factor for \code{pweights_b} is the sum of sample weights
-within each group divided by the sum of squared sample weights within each
-group (see Carle (2009), Appendix B). In other words, \code{pweights_a} "scales
-the weights so that the new weights sum to the cluster sample size" while
-\code{pweights_b} "scales the weights so that the new weights sum to the
-effective cluster size".
+group. The adjustment factor for \code{rescaled_weights_b} is the sum of sample
+weights within each group divided by the sum of squared sample weights
+within each group (see Carle (2009), Appendix B). In other words,
+\code{rescaled_weights_a} "scales the weights so that the new weights sum to the
+cluster sample size" while \code{rescaled_weights_b} "scales the weights so that
+the new weights sum to the effective cluster size".
 
 Regarding the choice between scaling methods A and B, Carle suggests that
 "analysts who wish to discuss point estimates should report results based
@@ -112,13 +112,13 @@ result1 <- lme4::glmer(
   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
   family = poisson(),
   data = d,
-  weights = pweights_a
+  weights = rescaled_weights_a
 )
 result2 <- lme4::glmer(
   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
   family = poisson(),
   data = d,
-  weights = pweights_b
+  weights = rescaled_weights_b
 )
 
 d <- rescale_weights(
@@ -130,7 +130,7 @@ result3 <- lme4::glmer(
   total ~ factor(RIAGENDR) + log(age) + factor(RIDRETH1) + (1 | SDMVPSU),
   family = poisson(),
   data = d,
-  weights = pweights
+  weights = rescaled_weights
 )
 parameters::compare_parameters(
   list(result1, result2, result3),
diff --git a/tests/testthat/_snaps/rescale_weights.md b/tests/testthat/_snaps/rescale_weights.md
index ecdefdd06..4ad736b27 100644
--- a/tests/testthat/_snaps/rescale_weights.md
+++ b/tests/testthat/_snaps/rescale_weights.md
@@ -3,13 +3,20 @@
     Code
       head(rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR"))
     Output
-        total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR pweights_a pweights_b
-      1     1 2.20        1        3       2       31 97593.68  1.5733612  1.2005159
-      2     7 2.08        2        3       1       29 39599.36  0.6231745  0.5246593
-      3     3 1.48        2        1       2       42 26619.83  0.8976966  0.5439111
-      4     4 1.32        2        4       2       33 34998.53  0.7083628  0.5498944
-      5     1 2.00        2        1       1       41 14746.45  0.4217782  0.3119698
-      6     6 2.20        2        4       1       38 28232.10  0.6877550  0.5155503
+        total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights_a
+      1     1 2.20        1        3       2       31 97593.68          1.5733612
+      2     7 2.08        2        3       1       29 39599.36          0.6231745
+      3     3 1.48        2        1       2       42 26619.83          0.8976966
+      4     4 1.32        2        4       2       33 34998.53          0.7083628
+      5     1 2.00        2        1       1       41 14746.45          0.4217782
+      6     6 2.20        2        4       1       38 28232.10          0.6877550
+        rescaled_weights_b
+      1          1.2005159
+      2          0.5246593
+      3          0.5439111
+      4          0.5498944
+      5          0.3119698
+      6          0.5155503
 
 ---
 
@@ -36,13 +43,13 @@
     Code
       head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish"))
     Output
-        total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR  pweights
-      1     1 2.20        1        3       2       31 97593.68 1.3952529
-      2     7 2.08        2        3       1       29 39599.36 0.5661343
-      3     3 1.48        2        1       2       42 26619.83 0.3805718
-      4     4 1.32        2        4       2       33 34998.53 0.5003582
-      5     1 2.00        2        1       1       41 14746.45 0.2108234
-      6     6 2.20        2        4       1       38 28232.10 0.4036216
+        total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR rescaled_weights
+      1     1 2.20        1        3       2       31 97593.68        1.3952529
+      2     7 2.08        2        3       1       29 39599.36        0.5661343
+      3     3 1.48        2        1       2       42 26619.83        0.3805718
+      4     4 1.32        2        4       2       33 34998.53        0.5003582
+      5     1 2.00        2        1       1       41 14746.45        0.2108234
+      6     6 2.20        2        4       1       38 28232.10        0.4036216
 
 # rescale_weights nested works as expected
 
@@ -50,66 +57,66 @@
       rescale_weights(data = head(nhanes_sample, n = 30), by = c("SDMVSTRA",
         "SDMVPSU"), probability_weights = "WTINT2YR", nest = TRUE)
     Output
-         total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA  WTINT2YR pweights_a
-      1      1 2.20        1        3       2       31 97593.679  1.0000000
-      2      7 2.08        2        3       1       29 39599.363  0.5502486
-      3      3 1.48        2        1       2       42 26619.834  0.9512543
-      4      4 1.32        2        4       2       33 34998.530  0.6766764
-      5      1 2.00        2        1       1       41 14746.454  0.7147710
-      6      6 2.20        2        4       1       38 28232.100  1.0000000
-      7    350 1.60        1        3       2       33 93162.431  1.8012419
-      8     NA 1.48        2        3       1       29 82275.986  1.1432570
-      9      3 2.28        2        4       1       41 24726.391  1.1985056
-      10    30 0.84        1        3       2       35 39895.048  1.0000000
-      11    70 1.24        1        4       2       33 27002.703  0.5220817
-      12     5 1.68        2        1       2       39 18792.034  0.3866720
-      13    60 2.20        1        3       2       30 76894.563  1.0000000
-      14     2 1.48        2        3       1       29 82275.986  1.1432570
-      15     8 2.36        2        3       2       39 78406.811  1.6133280
-      16     3 2.04        2        3       2       36 98200.912  1.0000000
-      17     1 2.08        1        3       1       40 87786.091  1.0000000
-      18     7 1.00        1        3       2       32 90803.158  1.2693642
-      19     9 2.28        2        3       2       34 45002.917  1.0000000
-      20     2 1.24        2        3       1       29 82275.986  1.1432570
-      21     4 2.28        2        3       1       34 91437.145  1.4088525
-      22     3 1.04        1        1       2       42 29348.027  1.0487457
-      23     4 1.12        1        1       1       34 38366.567  0.5911475
-      24     1 1.52        2        1       1       42  6622.334  1.0000000
-      25    22 2.24        1        4       1       41 22420.209  1.0867233
-      26     7 1.00        2        3       2       41 65529.204  1.0000000
-      27     5 0.92        2        4       1       30 27089.745  1.0000000
-      28    15 1.04        1        3       2       32 52265.570  0.7306358
-      29     3 0.80        1        3       1       33 64789.307  1.0000000
-      30     1 1.00        1        3       1       29 73404.222  1.0199804
-         pweights_b
-      1   1.0000000
-      2   0.5226284
-      3   0.9489993
-      4   0.5107078
-      5   0.6854605
-      6   1.0000000
-      7   1.3594509
-      8   1.0858702
-      9   1.1493587
-      10  1.0000000
-      11  0.3940306
-      12  0.2809766
-      13  1.0000000
-      14  1.0858702
-      15  1.1723308
-      16  1.0000000
-      17  1.0000000
-      18  1.1834934
-      19  1.0000000
-      20  1.0858702
-      21  1.2070771
-      22  1.0462596
-      23  0.5064835
-      24  1.0000000
-      25  1.0421602
-      26  1.0000000
-      27  1.0000000
-      28  0.6812093
-      29  1.0000000
-      30  0.9687816
+         total  age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA  WTINT2YR rescaled_weights_a
+      1      1 2.20        1        3       2       31 97593.679          1.0000000
+      2      7 2.08        2        3       1       29 39599.363          0.5502486
+      3      3 1.48        2        1       2       42 26619.834          0.9512543
+      4      4 1.32        2        4       2       33 34998.530          0.6766764
+      5      1 2.00        2        1       1       41 14746.454          0.7147710
+      6      6 2.20        2        4       1       38 28232.100          1.0000000
+      7    350 1.60        1        3       2       33 93162.431          1.8012419
+      8     NA 1.48        2        3       1       29 82275.986          1.1432570
+      9      3 2.28        2        4       1       41 24726.391          1.1985056
+      10    30 0.84        1        3       2       35 39895.048          1.0000000
+      11    70 1.24        1        4       2       33 27002.703          0.5220817
+      12     5 1.68        2        1       2       39 18792.034          0.3866720
+      13    60 2.20        1        3       2       30 76894.563          1.0000000
+      14     2 1.48        2        3       1       29 82275.986          1.1432570
+      15     8 2.36        2        3       2       39 78406.811          1.6133280
+      16     3 2.04        2        3       2       36 98200.912          1.0000000
+      17     1 2.08        1        3       1       40 87786.091          1.0000000
+      18     7 1.00        1        3       2       32 90803.158          1.2693642
+      19     9 2.28        2        3       2       34 45002.917          1.0000000
+      20     2 1.24        2        3       1       29 82275.986          1.1432570
+      21     4 2.28        2        3       1       34 91437.145          1.4088525
+      22     3 1.04        1        1       2       42 29348.027          1.0487457
+      23     4 1.12        1        1       1       34 38366.567          0.5911475
+      24     1 1.52        2        1       1       42  6622.334          1.0000000
+      25    22 2.24        1        4       1       41 22420.209          1.0867233
+      26     7 1.00        2        3       2       41 65529.204          1.0000000
+      27     5 0.92        2        4       1       30 27089.745          1.0000000
+      28    15 1.04        1        3       2       32 52265.570          0.7306358
+      29     3 0.80        1        3       1       33 64789.307          1.0000000
+      30     1 1.00        1        3       1       29 73404.222          1.0199804
+         rescaled_weights_b
+      1           1.0000000
+      2           0.5226284
+      3           0.9489993
+      4           0.5107078
+      5           0.6854605
+      6           1.0000000
+      7           1.3594509
+      8           1.0858702
+      9           1.1493587
+      10          1.0000000
+      11          0.3940306
+      12          0.2809766
+      13          1.0000000
+      14          1.0858702
+      15          1.1723308
+      16          1.0000000
+      17          1.0000000
+      18          1.1834934
+      19          1.0000000
+      20          1.0858702
+      21          1.2070771
+      22          1.0462596
+      23          0.5064835
+      24          1.0000000
+      25          1.0421602
+      26          1.0000000
+      27          1.0000000
+      28          0.6812093
+      29          1.0000000
+      30          0.9687816
 
diff --git a/tests/testthat/test-rescale_weights.R b/tests/testthat/test-rescale_weights.R
index 9c2415626..1a1c1f296 100644
--- a/tests/testthat/test-rescale_weights.R
+++ b/tests/testthat/test-rescale_weights.R
@@ -10,10 +10,10 @@ test_that("rescale_weights works as expected", {
   expect_snapshot(head(rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish")))
 
   out <- rescale_weights(nhanes_sample, "SDMVSTRA", "WTINT2YR")
-  expect_equal(sum(out$pweights_a), 2992, tolerance = 1e-3)
-  expect_equal(sum(out$pweights_b), 2244.71451, tolerance = 1e-3)
+  expect_equal(sum(out$rescaled_weights_a), 2992, tolerance = 1e-3)
+  expect_equal(sum(out$rescaled_weights_b), 2244.71451, tolerance = 1e-3)
   out <- rescale_weights(nhanes_sample, probability_weights = "WTINT2YR", method = "kish")
-  expect_equal(sum(out$pweights), 2162.53961, tolerance = 1e-3)
+  expect_equal(sum(out$rescaled_weights), 2162.53961, tolerance = 1e-3)
 })
 
 
@@ -90,7 +90,7 @@ test_that("rescale_weights errors and warnings", {
     regex = "is not used"
   )
 
-  nhanes_sample$pweights_a <- 1
+  nhanes_sample$rescaled_weights_a <- 1
   expect_warning(
     {
       out <- rescale_weights(
@@ -105,7 +105,7 @@ test_that("rescale_weights errors and warnings", {
     out,
     c(
       "total", "age", "RIAGENDR", "RIDRETH1", "SDMVPSU", "SDMVSTRA",
-      "WTINT2YR", "pweights_a", "pweights_a_1", "pweights_b"
+      "WTINT2YR", "rescaled_weights_a", "rescaled_weights_a_1", "rescaled_weights_b"
     )
   )
 })