From fbd4430d7143e186b14952f6e3da7d2c8e55e4df Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 8 Sep 2023 13:06:10 +0200 Subject: [PATCH] fix labels_to_levels (#456) * fix labels_to_levels * fix * lintr * lintr * add comments --- DESCRIPTION | 2 +- NEWS.md | 3 +++ R/utils_labels.R | 16 +++++++++++++++- tests/testthat/test-labels_to_levels.R | 24 ++++++++++++++++++++++-- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 76d6967bc..9fed6f068 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.10 +Version: 0.8.0.11 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/NEWS.md b/NEWS.md index 295570ec4..160c0fae2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,6 +25,9 @@ CHANGES BUG FIXES +* Fixed issue in `labels_to_levels()` when values of labels were not in sorted + order and values were not sequentially numbered. + * Fixed issues in `data_write()` when writing labelled data into SPSS format and vectors were of different type as value labels. diff --git a/R/utils_labels.R b/R/utils_labels.R index 54b1c46fd..67b3ecc6a 100644 --- a/R/utils_labels.R +++ b/R/utils_labels.R @@ -46,7 +46,21 @@ "Not all factor levels had a matching value label. Non-matching levels were preserved." ) } - levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels]) + if (length(value_labels) == length(levels_in_labs)) { + # when length of value_labels and levels_in_labs is identical, we can simply + # replace the levels with the value labels. This makes sure than levels or + # value labels, which are not sorted or not sequentially numbered, match. + # Example: + # x <- c(5, 5, 1, 3, 1, 7) + # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + # to_factor(x, labels_to_levels = TRUE) + levels(x)[levels_in_labs] <- names(value_labels) + } else { + # else, we need to select only those value labels that have a matching level + # (in labs_in_levels). This is required when not all values that have labels + # appear in the data. + levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels]) + } attr(x, "labels") <- NULL x diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R index 518dac70e..55105acfe 100644 --- a/tests/testthat/test-labels_to_levels.R +++ b/tests/testthat/test-labels_to_levels.R @@ -8,13 +8,13 @@ test_that("labels_to_levels, numeric", { test_that("labels_to_levels, factor", { data(efc) x <- as.factor(efc$c172code) - attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3) + attr(x, "labels") <- c(low = 1, mid = 2, high = 3) x <- labels_to_levels(x) expect_identical(levels(x), c("low", "mid", "high")) expect_equal(table(x), table(efc$c172code), ignore_attr = TRUE) x <- as.ordered(efc$c172code) - attr(x, "labels") <- c("low" = 1, "mid" = 2, "high" = 3) + attr(x, "labels") <- c(low = 1, mid = 2, high = 3) x <- labels_to_levels(x) expect_identical(levels(x), c("low", "mid", "high")) expect_s3_class(x, "ordered") @@ -40,3 +40,23 @@ test_that("labels_to_levels, factor, data frame", { ) expect_identical(sum(vapply(efc, is.factor, TRUE)), 1L) }) + +test_that("labels_to_levels, factor, with random value numbers (no sequential order)", { + x <- c(5, 5, 1, 3, 1, 7) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical(as.character(out), c("don't know", "don't know", "yes", "maybe", "yes", "no")) + expect_identical(levels(out), c("yes", "maybe", "don't know", "no")) + + x <- c(4, 4, 1, 2, 1, 3) + attr(x, "labels") <- c(a = 1, b = 2, c = 3, d = 4) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical(as.character(out), c("d", "d", "a", "b", "a", "c")) + expect_identical(levels(out), c("a", "b", "c", "d")) + + x <- c(4, 4, 1, 2, 1, 3) + attr(x, "labels") <- c(d = 1, c = 2, b = 3, a = 4) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b")) + expect_identical(levels(out), c("d", "c", "b", "a")) +})