easystats · strengejacke · Sep 8, 2023 · Sep 8, 2023 · Sep 8, 2023 · Sep 8, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.8.0.11
+Version: 0.8.0.12
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/R/utils_labels.R b/R/utils_labels.R
@@ -46,21 +46,17 @@
       "Not all factor levels had a matching value label. Non-matching levels were preserved."
     )
   }
-  if (length(value_labels) == length(levels_in_labs)) {
-    # when length of value_labels and levels_in_labs is identical, we can simply
-    # replace the levels with the value labels. This makes sure than levels or
-    # value labels, which are not sorted or not sequentially numbered, match.
-    # Example:
-    # x <- c(5, 5, 1, 3, 1, 7)
-    # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
-    # to_factor(x, labels_to_levels = TRUE)
-    levels(x)[levels_in_labs] <- names(value_labels)
-  } else {
-    # else, we need to select only those value labels that have a matching level
-    # (in labs_in_levels). This is required when not all values that have labels
-    # appear in the data.
-    levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels])
-  }
+  # we need to find out which levels have no labelled value
+  missing_levels <- levels(x)[!levels(x) %in% value_labels]
+
+  # and we need to remove those value labels that don't have a matching level
+  value_labels <- value_labels[value_labels %in% levels(x)]
+
+  # for levels that have no label, we just keep the original factor level
+  value_labels <- c(value_labels, stats::setNames(missing_levels, missing_levels))
+
+  # now we can add back levels
+  levels(x) <- names(value_labels)[order(as.numeric(value_labels))]
   attr(x, "labels") <- NULL
 
   x

diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R
@@ -59,4 +59,77 @@ test_that("labels_to_levels, factor, with random value numbers (no sequential or
   out <- to_factor(x, labels_to_levels = TRUE)
   expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b"))
   expect_identical(levels(out), c("d", "c", "b", "a"))
+
+  x <- c(5, 5, 1, 3, 1, 7)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+  out <- to_factor(x, labels_to_levels = TRUE)
+  expect_identical(
+    out,
+    structure(
+      c(3L, 3L, 1L, 2L, 1L, 4L),
+      levels = c("yes", "maybe", "don't know", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "maybe", "yes", "no")
+  )
+
+  x <- c(5, 5, 1, 3, 1, 7, 4)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5)
+  expect_message(
+    {
+      out <- to_factor(x, labels_to_levels = TRUE)
+    },
+    regex = "Not all factor levels"
+  )
+  expect_identical(
+    out,
+    structure(
+      c(4L, 4L, 1L, 2L, 1L, 5L, 3L),
+      levels = c("yes", "maybe", "4", "don't know", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "maybe", "yes", "no", "4")
+  )
+
+  x <- c(5, 5, 1, 3, 1, 7)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5)
+  expect_message({
+    out <- to_factor(x, labels_to_levels = TRUE)
+  })
+  expect_identical(
+    out,
+    structure(
+      c(3L, 3L, 1L, 2L, 1L, 4L),
+      levels = c("yes", "3", "don't know", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "3", "yes", "no")
+  )
+
+  x <- c(5, 5, 1, 3, 1, 7, 6)
+  attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5)
+  expect_message({
+    out <- to_factor(x, labels_to_levels = TRUE)
+  })
+  expect_identical(
+    out,
+    structure(
+      c(3L, 3L, 1L, 2L, 1L, 5L, 4L),
+      levels = c("yes", "3", "don't know", "6", "no"),
+      class = "factor"
+    )
+  )
+  expect_identical(
+    as.character(out),
+    c("don't know", "don't know", "yes", "3", "yes", "no", "6")
+  )
 })