diff --git a/DESCRIPTION b/DESCRIPTION index 9fed6f068..c062db49e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.8.0.11 +Version: 0.8.0.12 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")), diff --git a/R/utils_labels.R b/R/utils_labels.R index 67b3ecc6a..a7f4fa2c3 100644 --- a/R/utils_labels.R +++ b/R/utils_labels.R @@ -46,21 +46,17 @@ "Not all factor levels had a matching value label. Non-matching levels were preserved." ) } - if (length(value_labels) == length(levels_in_labs)) { - # when length of value_labels and levels_in_labs is identical, we can simply - # replace the levels with the value labels. This makes sure than levels or - # value labels, which are not sorted or not sequentially numbered, match. - # Example: - # x <- c(5, 5, 1, 3, 1, 7) - # attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) - # to_factor(x, labels_to_levels = TRUE) - levels(x)[levels_in_labs] <- names(value_labels) - } else { - # else, we need to select only those value labels that have a matching level - # (in labs_in_levels). This is required when not all values that have labels - # appear in the data. - levels(x)[levels_in_labs] <- names(value_labels[labs_in_levels]) - } + # we need to find out which levels have no labelled value + missing_levels <- levels(x)[!levels(x) %in% value_labels] + + # and we need to remove those value labels that don't have a matching level + value_labels <- value_labels[value_labels %in% levels(x)] + + # for levels that have no label, we just keep the original factor level + value_labels <- c(value_labels, stats::setNames(missing_levels, missing_levels)) + + # now we can add back levels + levels(x) <- names(value_labels)[order(as.numeric(value_labels))] attr(x, "labels") <- NULL x diff --git a/tests/testthat/test-labels_to_levels.R b/tests/testthat/test-labels_to_levels.R index 55105acfe..866154c8f 100644 --- a/tests/testthat/test-labels_to_levels.R +++ b/tests/testthat/test-labels_to_levels.R @@ -59,4 +59,77 @@ test_that("labels_to_levels, factor, with random value numbers (no sequential or out <- to_factor(x, labels_to_levels = TRUE) expect_identical(as.character(out), c("a", "a", "d", "c", "d", "b")) expect_identical(levels(out), c("d", "c", "b", "a")) + + x <- c(5, 5, 1, 3, 1, 7) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + out <- to_factor(x, labels_to_levels = TRUE) + expect_identical( + out, + structure( + c(3L, 3L, 1L, 2L, 1L, 4L), + levels = c("yes", "maybe", "don't know", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "maybe", "yes", "no") + ) + + x <- c(5, 5, 1, 3, 1, 7, 4) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 3, `don't know` = 5) + expect_message( + { + out <- to_factor(x, labels_to_levels = TRUE) + }, + regex = "Not all factor levels" + ) + expect_identical( + out, + structure( + c(4L, 4L, 1L, 2L, 1L, 5L, 3L), + levels = c("yes", "maybe", "4", "don't know", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "maybe", "yes", "no", "4") + ) + + x <- c(5, 5, 1, 3, 1, 7) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5) + expect_message({ + out <- to_factor(x, labels_to_levels = TRUE) + }) + expect_identical( + out, + structure( + c(3L, 3L, 1L, 2L, 1L, 4L), + levels = c("yes", "3", "don't know", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "3", "yes", "no") + ) + + x <- c(5, 5, 1, 3, 1, 7, 6) + attr(x, "labels") <- c(no = 7, yes = 1, maybe = 4, `don't know` = 5) + expect_message({ + out <- to_factor(x, labels_to_levels = TRUE) + }) + expect_identical( + out, + structure( + c(3L, 3L, 1L, 2L, 1L, 5L, 4L), + levels = c("yes", "3", "don't know", "6", "no"), + class = "factor" + ) + ) + expect_identical( + as.character(out), + c("don't know", "don't know", "yes", "3", "yes", "no", "6") + ) })