include_reference = TRUE doesn't work in combination with `as.facto…

…r()` (#957) * `include_reference = TRUE` doesn't work in combination with `as.factor()` Fixes #956 * desc, news * add tests * lintr * update test * comment non working test * update snapshots * update snapshot * skip test * like that?
easystats · Mar 15, 2024 · 00b5ec8 · 00b5ec8
1 parent 043b3ff
commit 00b5ec8
Show file tree

Hide file tree

Showing 8 changed files with 164 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: parameters
 Title: Processing of Model Parameters
-Version: 0.21.5.7
+Version: 0.21.5.8
 Authors@R:
     c(person(given = "Daniel",
              family = "Lüdecke",

diff --git a/NEWS.md b/NEWS.md
@@ -25,6 +25,9 @@
   of class `clmm` (package *ordinal*), when model had no `component` column
   (e.g., no scale or location parameters were returned).
 
+* `include_reference` now also works when factor were created "on-the-fly" inside
+  the model formula (i.e. `y ~ as.factor(x)`).
+
 # parameters 0.21.5
 
 ## Bug fixes

diff --git a/R/display.R b/R/display.R
@@ -115,7 +115,8 @@ display.parameters_model <- function(object,
       footer_digits = footer_digits, align = align, ci_brackets = ci_brackets,
       show_sigma = show_sigma, show_formula = show_formula, zap_small = zap_small,
       font_size = font_size, line_padding = line_padding,
-      column_labels = column_labels, verbose = verbose, ...
+      column_labels = column_labels, include_reference = include_reference,
+      verbose = verbose, ...
     )
   } else {
     print_md(
@@ -124,7 +125,7 @@ display.parameters_model <- function(object,
       footer = footer, ci_digits = ci_digits, p_digits = p_digits,
       footer_digits = footer_digits, ci_brackets = ci_brackets,
       show_sigma = show_sigma, show_formula = show_formula, zap_small = zap_small,
-      verbose = verbose, ...
+      include_reference = include_reference, verbose = verbose, ...
     )
   }
 }

diff --git a/R/print_md.R b/R/print_md.R
@@ -443,8 +443,10 @@ print_md.parameters_distribution <- function(x, digits = 2, ci_brackets = c("(",
   formatted_table[is.na(formatted_table)] <- ""
   # create base table
   out <- tinytable::tt(formatted_table, notes = footer, caption = caption)
-  # insert sub header rows and column spans
-  out <- tinytable::group_tt(out, i = row_groups, j = col_groups)
+  # insert sub header rows and column spans, if we have them
+  if (!(is.null(row_groups) && is.null(col_groups))) {
+    out <- tinytable::group_tt(out, i = row_groups, j = col_groups)
+  }
   out@output <- outformat
   out
 }
diff --git a/R/utils.R b/R/utils.R
@@ -140,16 +140,39 @@
 }
 
 
-.find_factor_levels <- function(data) {
-  out <- lapply(colnames(data), function(i) {
-    v <- data[[i]]
+.find_factor_levels <- function(model_data, model = NULL, model_call = NULL) {
+  # check whether we have on-the-fly conversion of factors
+  if (!is.null(model)) {
+    model_terms <- insight::find_terms(model)
+  } else if (!is.null(model_call)) { # nolint
+    model_terms <- insight::find_terms(model_call)
+  } else {
+    model_terms <- NULL
+  }
+  # extract all model terms, we now have "as.factor(term)" etc., if any
+  if (!is.null(model_terms$conditional)) {
+    # extract variable names from "as.factor(term)" etc.
+    factor_terms <- grep("(as\\.factor|factor|as\\.character)", model_terms$conditional, value = TRUE)
+    cleaned <- gsub("(as\\.factor|factor|as\\.character)\\((.*)\\)", "\\2", factor_terms)
+    # convert on-the-fly factors into real factors
+    if (length(cleaned)) {
+      for (i in seq_along(cleaned)) {
+        model_data[[factor_terms[i]]] <- as.factor(model_data[[cleaned[i]]])
+      }
+    }
+  }
+  # extract levels from factors, so we know the reference level
+  out <- lapply(colnames(model_data), function(i) {
+    v <- model_data[[i]]
     if (is.factor(v)) {
       paste0(i, levels(v))
+    } else if (is.character(v)) {
+      paste0(i, unique(v))
     } else {
       NULL
     }
   })
-  names(out) <- names(data)
+  names(out) <- names(model_data)
   insight::compact_list(out)
 }
 

diff --git a/R/utils_format.R b/R/utils_format.R
@@ -53,6 +53,7 @@
     # here we either have "<br>" or " " as line breaks, followed by empty "()"
     i <- gsub("<br>()", "", i, fixed = TRUE)
     i <- gsub(" ()", "", i, fixed = TRUE)
+    i <- gsub("<br>(, )", "", i, fixed = TRUE)
     i <- gsub(" (, )", "", i, fixed = TRUE)
     i[i == "()"] <- ""
     i[i == "(, )"] <- ""
@@ -374,7 +375,7 @@
   }
 
   # find factors and factor levels and check if we have any factors in the data
-  factors <- .find_factor_levels(model_data)
+  factors <- .find_factor_levels(model_data, model, model_call = attributes(params)$model_call)
   if (!length(factors)) {
     params
   }
@@ -419,8 +420,16 @@
     if (length(found)) {
       # the reference level is *not* in the pretty names yet
       reference_level <- f[!f %in% names(pretty_names)]
+
+      # for on-the-fly conversion of factors, the names of the factors can
+      # can also contain "factor()" or "as.factor()" - we need to remove these
+      if (any(grepl("(as\\.factor|factor|as\\.character)", fn))) {
+        fn_clean <- gsub("(as\\.factor|factor|as\\.character)\\((.*)\\)", "\\2", fn)
+      } else {
+        fn_clean <- fn
+      }
       # create a pretty level for the reference category
-      pretty_level <- paste0(fn, " [", sub(fn, "", reference_level, fixed = TRUE), "]")
+      pretty_level <- paste0(fn_clean, " [", sub(fn, "", reference_level, fixed = TRUE), "]")
       # insert new pretty level at the correct position in "pretty_names"
       pretty_names <- .insert_element_at(
         pretty_names,
@@ -520,7 +529,7 @@
                                            is_zero_inflated,
                                            is_ordinal_model,
                                            is_multivariate = FALSE,
-                                           ran_pars,
+                                           ran_pars, # nolint
                                            formatted_table = NULL) {
   # prepare component names
   .conditional_fixed_text <- if (is_zero_inflated) {
@@ -944,7 +953,7 @@
 
   # fix column output
   if (inherits(attributes(x)$model, c("lavaan", "blavaan")) && "Label" %in% colnames(x)) {
-    x$From <- ifelse(!nzchar(as.character(x$Label), keepNA = TRUE) | x$Label == x$To, x$From, paste0(x$From, " (", x$Label, ")"))
+    x$From <- ifelse(!nzchar(as.character(x$Label), keepNA = TRUE) | x$Label == x$To, x$From, paste0(x$From, " (", x$Label, ")")) # nolint
     x$Label <- NULL
   }
 
@@ -1062,7 +1071,7 @@
     # rename columns for zero-inflation part
     if (startsWith(type, "zero") && !is.null(zi_coef_name) && !is.null(coef_column)) {
       colnames(tables[[type]])[which(colnames(tables[[type]]) == coef_column)] <- zi_coef_name
-      colnames(tables[[type]])[which(colnames(tables[[type]]) == paste0("Std_", coef_column))] <- paste0("Std_", zi_coef_name)
+      colnames(tables[[type]])[which(colnames(tables[[type]]) == paste0("Std_", coef_column))] <- paste0("Std_", zi_coef_name) # nolint
     }
 
     # rename columns for correlation, location or scale part
@@ -1142,8 +1151,8 @@
       }
       # replace brackets by parenthesis
       if (!is.null(parameter_column) && parameter_column %in% colnames(formatted_table)) {
-        formatted_table[[parameter_column]] <- gsub("[", ci_brackets[1], formatted_table[[parameter_column]], fixed = TRUE)
-        formatted_table[[parameter_column]] <- gsub("]", ci_brackets[2], formatted_table[[parameter_column]], fixed = TRUE)
+        formatted_table[[parameter_column]] <- gsub("[", ci_brackets[1], formatted_table[[parameter_column]], fixed = TRUE) # nolint
+        formatted_table[[parameter_column]] <- gsub("]", ci_brackets[2], formatted_table[[parameter_column]], fixed = TRUE) # nolint
       }
     }
 

diff --git a/tests/testthat/_snaps/include_reference.md b/tests/testthat/_snaps/include_reference.md
@@ -0,0 +1,66 @@
+# include_reference, on-the-fly factors
+
+    Code
+      print(out1)
+    Output
+      Parameter   | Coefficient |   SE |         95% CI | t(27) |      p
+      ------------------------------------------------------------------
+      (Intercept) |       27.48 | 1.97 | [23.43, 31.53] | 13.92 | < .001
+      gear [3]    |        0.00 |      |                |       |       
+      gear [4]    |        0.08 | 1.83 | [-3.68,  3.83] |  0.04 | 0.967 
+      gear [5]    |        2.39 | 2.38 | [-2.50,  7.29] |  1.00 | 0.324 
+      am [0]      |        0.00 |      |                |       |       
+      am [1]      |        4.14 | 1.81 | [ 0.42,  7.85] |  2.29 | 0.030 
+      hp          |       -0.06 | 0.01 | [-0.09, -0.04] | -6.24 | < .001
+    Message
+      
+      Uncertainty intervals (equal-tailed) and p-values (two-tailed) computed
+        using a Wald t-distribution approximation.
+
+---
+
+    Code
+      print(out2)
+    Output
+      Parameter   | Coefficient |   SE |         95% CI | t(27) |      p
+      ------------------------------------------------------------------
+      (Intercept) |       27.48 | 1.97 | [23.43, 31.53] | 13.92 | < .001
+      gear [3]    |        0.00 |      |                |       |       
+      gear [4]    |        0.08 | 1.83 | [-3.68,  3.83] |  0.04 | 0.967 
+      gear [5]    |        2.39 | 2.38 | [-2.50,  7.29] |  1.00 | 0.324 
+      am [0]      |        0.00 |      |                |       |       
+      am [1]      |        4.14 | 1.81 | [ 0.42,  7.85] |  2.29 | 0.030 
+      hp          |       -0.06 | 0.01 | [-0.09, -0.04] | -6.24 | < .001
+    Message
+      
+      Uncertainty intervals (equal-tailed) and p-values (two-tailed) computed
+        using a Wald t-distribution approximation.
+
+---
+
+    Code
+      print_md(out, engine = "tt")
+    Output
+      
+      +--------------+----------------------+----------------------+
+      | Parameter    | m1                   | m2                   |
+      +==============+======================+======================+
+      | (Intercept)  | 27.48 (23.43, 31.53) | 27.48 (23.43, 31.53) |
+      +--------------+----------------------+----------------------+
+      | gear (3)     | 0.00                 | 0.00                 |
+      +--------------+----------------------+----------------------+
+      | gear (4)     | 0.08 (-3.68, 3.83)   | 0.08 (-3.68, 3.83)   |
+      +--------------+----------------------+----------------------+
+      | gear (5)     | 2.39 (-2.50, 7.29)   | 2.39 (-2.50, 7.29)   |
+      +--------------+----------------------+----------------------+
+      | am (0)       | 0.00                 | 0.00                 |
+      +--------------+----------------------+----------------------+
+      | am (1)       | 4.14 (0.42, 7.85)    | 4.14 (0.42, 7.85)    |
+      +--------------+----------------------+----------------------+
+      | hp           | -0.06 (-0.09, -0.04) | -0.06 (-0.09, -0.04) |
+      +--------------+----------------------+----------------------+
+      |              |                      |                      |
+      +--------------+----------------------+----------------------+
+      | Observations | 32                   | 32                   |
+      +--------------+----------------------+----------------------+ 
+
diff --git a/tests/testthat/test-include_reference.R b/tests/testthat/test-include_reference.R
@@ -0,0 +1,44 @@
+skip_if_not_installed("tinytable")
+
+test_that("include_reference, on-the-fly factors", {
+  data(mtcars)
+  d <- as.data.frame(mtcars)
+  d$gear <- as.factor(d$gear)
+  d$am <- as.factor(d$am)
+
+  m1 <- lm(mpg ~ as.factor(gear) + factor(am) + hp, data = mtcars)
+  m2 <- lm(mpg ~ gear + am + hp, data = d)
+
+  out1 <- model_parameters(m1, include_reference = TRUE)
+  out2 <- model_parameters(m2, include_reference = TRUE)
+
+  expect_snapshot(print(out1))
+  expect_snapshot(print(out2))
+
+  expect_equal(attributes(out1)$pretty_names, attributes(out2)$pretty_names, ignore_attr = TRUE)
+  expect_equal(out1$Coefficient, out2$Coefficient, tolerance = 1e-4)
+
+  out <- compare_parameters(m1, m2, include_reference = TRUE)
+  expect_snapshot(print_md(out, engine = "tt"))
+})
+
+skip_if(getRversion() < "4.3.3")
+skip_if_not_installed("datawizard")
+
+test_that("include_reference, on-the-fly factors", {
+  data(mtcars)
+  d <- as.data.frame(mtcars)
+  d$gear <- as.factor(d$gear)
+  d$am <- as.factor(d$am)
+
+  m1 <- lm(mpg ~ as.factor(gear) + factor(am) + hp, data = mtcars)
+  m2 <- lm(mpg ~ gear + am + hp, data = d)
+
+  out1 <- model_parameters(m1, include_reference = TRUE)
+  out3 <- mtcars |>
+    datawizard::data_modify(gear = factor(gear), am = as.factor(am)) |>
+    lm(mpg ~ gear + am + hp, data = _) |>
+    model_parameters(include_reference = TRUE)
+
+  expect_equal(attributes(out1)$pretty_names, attributes(out3)$pretty_names, ignore_attr = TRUE)
+})