From fbd4430d7143e186b14952f6e3da7d2c8e55e4df Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Fri, 8 Sep 2023 13:06:10 +0200
Subject: [PATCH] fix labels_to_levels (#456)

* fix labels_to_levels

* fix

* lintr

* lintr

* add comments
---
 DESCRIPTION                            |  2 +-
 NEWS.md                                |  3 +++
 R/utils_labels.R                       | 16 +++++++++++++++-
 tests/testthat/test-labels_to_levels.R | 24 ++++++++++++++++++++++--
 4 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 76d6967bc..9fed6f068 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.10
+Version: 0.8.0.11
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
diff --git a/NEWS.md b/NEWS.md
index 295570ec4..160c0fae2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -25,6 +25,9 @@ CHANGES
 
 BUG FIXES
 
+* Fixed issue in `labels_to_levels()` when values of labels were not in sorted
+  order and values were not sequentially numbered.
+
 * Fixed issues in `data_write()` when writing labelled data into SPSS format
   and vectors were of different type as value labels.
 
diff --git a/R/utils_labels.R b/R/utils_labels.R
index 54b1c46fd..67b3ecc6a 100644
--- a/R/utils_labels.R
+++ b/R/utils_labels.R
@@ -46,7 +46,21 @@
       "Not all factor levels had a matching value label. Non-matching levels were preserved."
     )
   }
-  levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels])
+  if (length(value_labels) == length(levels_in_labs)) {
+    # when length of value_labels and levels_in_labs is identical, we can simply
+    # replace the levels with the value labels. This makes sure than levels or
+    # value labels, which are not sorted or not sequentially numbered, match.
+    # Example:
+    # x <- c(5, 5, 1, 3, 1, 7)
+    # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+    # to_factor(x, labels_to_levels = TRUE)
+    levels(x)[levels_in_labs] <- names(value_labels)
+  } else {
+    # else, we need to select only those value labels that have a matching level
+    # (in labs_in_levels). This is required when not all values that have labels
+    # appear in the data.
+    levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels])
+  }
   attr(x, "labels") <- NULL
 
   x
diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R
index 518dac70e..55105acfe 100644
--- a/tests/testthat/test-labels_to_levels.R
+++ b/tests/testthat/test-labels_to_levels.R
@@ -8,13 +8,13 @@ test_that("labels_to_levels, numeric", {
 test_that("labels_to_levels, factor", {
   data(efc)
   x <- as.factor(efc$c172code)
-  attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3)
+  attr(x, "labels") <- c(low = 1, mid = 2, high = 3)
   x <- labels_to_levels(x)
   expect_identical(levels(x), c("low", "mid", "high"))
   expect_equal(table(x), table(efc$c172code), ignore_attr = TRUE)
 
   x <- as.ordered(efc$c172code)
-  attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3)
+  attr(x, "labels") <- c(low = 1, mid = 2, high = 3)
   x <- labels_to_levels(x)
   expect_identical(levels(x), c("low", "mid", "high"))
   expect_s3_class(x, "ordered")
@@ -40,3 +40,23 @@ test_that("labels_to_levels, factor, data frame", {
   )
   expect_identical(sum(vapply(efc, is.factor, TRUE)), 1L)
 })
+
+test_that("labels_to_levels, factor, with random value numbers (no sequential order)", {
+  x <- c(5, 5, 1, 3, 1, 7)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(as.character(out), c("don't know", "don't know", "yes", "maybe", "yes", "no"))
+  expect_identical(levels(out), c("yes", "maybe", "don't know", "no"))
+
+  x <- c(4, 4, 1, 2, 1, 3)
+  attr(x, "labels") <- c(a = 1, b = 2, c = 3, d = 4)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(as.character(out), c("d", "d", "a", "b", "a", "c"))
+  expect_identical(levels(out), c("a", "b", "c", "d"))
+
+  x <- c(4, 4, 1, 2, 1, 3)
+  attr(x, "labels") <- c(d = 1, c = 2, b = 3, a = 4)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b"))
+  expect_identical(levels(out), c("d", "c", "b", "a"))
+})