Added native support for Amelia; adjusted multiple imputation article

vincentarelbundock · Sep 12, 2023 · 638e537 · 638e537
1 parent bb07daa
commit 638e537
Show file tree

Hide file tree

Showing 9 changed files with 145 additions and 52 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,7 @@
 ## dev
 
 * `hypotheses()`: The `FUN` argument handles `group` columns gracefully.
+* Native support for `Amelia` for multiple imputation.
 
 ## 0.15.0
 

diff --git a/R/comparisons.R b/R/comparisons.R
@@ -4,7 +4,7 @@
 #' Predict the outcome variable at different regressor values (e.g., college
 #' graduates vs. others), and compare those predictions by computing a difference,
 #' ratio, or some other function. `comparisons()` can return many quantities of
-#' interest, such as contrasts, differences, risk ratios, changes in log odds, lift, 
+#' interest, such as contrasts, differences, risk ratios, changes in log odds, lift,
 #' slopes, elasticities, etc.
 #'
 #' * `comparisons()`: unit-level (conditional) estimates.
@@ -198,7 +198,7 @@
 #' # Second, we use that SD as the treatment size in the `variables` argument
 #' library(dplyr)
 #' mod <- lm(mpg ~ hp + factor(cyl), mtcars)
-#' tmp <- mtcars %>% 
+#' tmp <- mtcars %>%
 #'     group_by(cyl) %>%
 #'     mutate(hp_sd = sd(hp))
 #' avg_comparisons(mod, variables = list(hp = tmp$hp_sd), by = "cyl")
@@ -318,9 +318,9 @@ comparisons <- function(model,
     tmp <- sanitize_hypothesis(hypothesis, ...)
     hypothesis <- tmp$hypothesis
     hypothesis_null <- tmp$hypothesis_null
-    
+
     # multiple imputation
-    if (inherits(model, "mira")) {
+    if (inherits(model, c("mira", "amest"))) {
         out <- process_imputation(model, call_attr)
         return(out)
     }

diff --git a/R/get_modeldata.R b/R/get_modeldata.R
@@ -1,6 +1,6 @@
 get_modeldata <- function(model, additional_variables = FALSE, modeldata = NULL, wts = NULL, ...) {
 
-    if (inherits(model, "mira")) {
+    if (inherits(model, c("mira", "amest"))) {
         return(modeldata)
     }
 

diff --git a/R/imputation.R b/R/imputation.R
@@ -1,13 +1,20 @@
 process_imputation <- function(x, call_attr, marginal_means = FALSE) {
     insight::check_if_installed("mice")
-    mfx_list <- list()
-    for (i in seq_along(x$analyses)) {
+
+    if (inherits(x, "mira")) {
+      x <- x$analyses
+    } else if (inherits(x, "amest")) {
+      x <- x
+    }
+
+    mfx_list <- vector("list", length(x))
+    for (i in seq_along(x)) {
         calltmp <- call_attr
-        calltmp[["model"]] <- x$analyses[[i]]
+        calltmp[["model"]] <- x[[i]]
 
         # not sure why but this breaks marginal_means on "modeldata specified twice"
         if (isFALSE(marginal_means)) {
-            calltmp[["modeldata"]] <- get_modeldata( x$analyses[[i]], additional_variables = FALSE)
+            calltmp[["modeldata"]] <- get_modeldata( x[[i]], additional_variables = FALSE)
         }
 
         mfx_list[[i]] <- evalup(calltmp)
@@ -41,7 +48,7 @@ process_imputation <- function(x, call_attr, marginal_means = FALSE) {
 
 
 #' tidy helper
-#' 
+#'
 #' @noRd
 #' @export
 tidy.marginaleffects_mids <- function(x, ...) {
@@ -55,7 +62,7 @@ tidy.marginaleffects_mids <- function(x, ...) {
 
 
 #' glance helper
-#' 
+#'
 #' @noRd
 #' @export
 glance.marginaleffects_mids <- function(x, ...) {

diff --git a/R/marginal_means.R b/R/marginal_means.R
@@ -4,7 +4,7 @@
 #' Marginal means are adjusted predictions, averaged across a grid of categorical predictors,
 #' holding other numeric predictors at their means. To learn more, read the marginal means vignette, visit the
 #' package website, or scroll down this page for a full list of vignettes:
-#' 
+#'
 #' * <https://marginaleffects.com/articles/marginalmeans.html>
 #' * <https://marginaleffects.com/>
 #'
@@ -21,7 +21,7 @@
 #' type, but will typically be a string such as: "response", "link", "probs",
 #' or "zero". When an unsupported string is entered, the model-specific list of
 #' acceptable values is returned in an error message. When `type` is `NULL`, the
-#' first entry in the error message is used by default. 
+#' first entry in the error message is used by default.
 #' @param wts character value. Weights to use in the averaging.
 #' + "equal": each combination of variables in `newdata` gets equal weight.
 #' + "cells": each combination of values for the variables in the `newdata` gets a weight proportional to its frequency in the original data.
@@ -71,30 +71,30 @@
 #' dat$cyl <- factor(dat$cyl)
 #' dat$am <- as.logical(dat$am)
 #' mod <- lm(mpg ~ carb + cyl + am, dat)
-#' 
+#'
 #' marginal_means(
 #'   mod,
 #'   variables = "cyl")
-#' 
+#'
 #' # collapse levels of cyl by averaging
 #' by <- data.frame(
 #'   cyl = c(4, 6, 8),
 #'   by = c("4 & 6", "4 & 6", "8"))
 #' marginal_means(mod,
 #'   variables = "cyl",
 #'   by = by)
-#' 
+#'
 #' # pairwise differences between collapsed levels
 #' marginal_means(mod,
 #'   variables = "cyl",
 #'   by = by,
 #'   hypothesis = "pairwise")
-#' 
+#'
 #' # cross
 #' marginal_means(mod,
 #'   variables = c("cyl", "carb"),
 #'   cross = TRUE)
-#' 
+#'
 #' # collapsed cross
 #' by <- expand.grid(
 #'   cyl = unique(mtcars$cyl),
@@ -103,8 +103,8 @@
 #'   by$cyl == 4,
 #'   paste("Control:", by$carb),
 #'   paste("Treatment:", by$carb))
-#' 
-#' 
+#'
+#'
 #' # Convert numeric variables to categorical before fitting the model
 #' dat <- mtcars
 #' dat$am <- as.logical(dat$am)
@@ -129,7 +129,7 @@
 #'   ncol = 2,
 #'   dimnames = list(NULL, c("A", "B")))
 #' marginal_means(mod, variables = "carb", hypothesis = lc)
-#' 
+#'
 marginal_means <- function(model,
                            variables = NULL,
                            newdata = NULL,
@@ -183,9 +183,9 @@ marginal_means <- function(model,
         df = df),
         list(...))
     call_attr <- do.call("call", call_attr)
-    
+
     # multiple imputation
-    if (inherits(model, "mira")) {
+    if (inherits(model, c("mira", "amest"))) {
         out <- process_imputation(model, call_attr, marginal_means = TRUE)
         return(out)
     }
@@ -528,7 +528,7 @@ get_marginalmeans <- function(model,
 
 
 #' `marginal_means()` is an alias to `marginal_means()`
-#' 
+#'
 #' This alias is kept for backward compatibility and because some users may prefer that name.
 #'
 #' @inherit marginal_means

diff --git a/R/predictions.R b/R/predictions.R
@@ -267,7 +267,7 @@ predictions <- function(model,
     hypothesis_null <- tmp$hypothesis_null
 
     # multiple imputation
-    if (inherits(model, "mira")) {
+    if (inherits(model, c("mira", "amest"))) {
         out <- process_imputation(model, call_attr)
         return(out)
     }
@@ -385,7 +385,7 @@ predictions <- function(model,
     if (!is.null(out)) {
         return(out)
     }
-    
+
 
     # pre-building the model matrix can speed up repeated predictions
     newdata <- get_model_matrix_attribute(model, newdata)

diff --git a/R/sanity_model.R b/R/sanity_model.R
@@ -26,6 +26,7 @@ sanity_model_supported_class <- function(model) {
     custom_classes <- as.list(custom_classes)
     supported <- append(custom_classes, list(
         "afex_aov",
+        "amest", #package: Amelia
         "betareg",
         "bglmerMod",
         "blmerMod",

diff --git a/book/articles/multiple_imputation.qmd b/book/articles/multiple_imputation.qmd
@@ -15,30 +15,33 @@ knitr::opts_chunk$set(
 )
 ```
 
-The `marginaleffects` package offers convenience functions to compute and display predictions, contrasts, and marginal effects from models with multiple imputation from the `mice` package. The workflow follows Rubin's rules (Rubin, 1987, p. 76), via the following steps:
+The `marginaleffects` package offers convenience functions to compute and display predictions, contrasts, and marginal effects from models with multiple imputation from the `mice` and `Amelia` packages. The workflow follows Rubin's rules (Rubin, 1987, p. 76), via the following steps:
 
 1. Impute $M$ data sets.
-2. Fit in each of the $M$ imputed data sets.
+2. Fit a model in each of the $M$ imputed data sets.
 3. Compute marginal effects in each of the $M$ data sets.
 4. Pool results.
 
-To highlight the workflow, we consider a simple linear regression model, although the same workflow should work with any model type which is fit using a formula interface and a `data` argument.
+To highlight the workflow, we consider a simple linear regression model, although the same workflow should work with any model type that is fit using a formula interface and a `data` argument.
 
-`marginaleffects` support the `mice` imputation package, and any other package which can return a list of imputed data frames. 
-
-## `mice`
-
-First, we insert missing observations randomly in a dataset and we impute the dataset using the `mice` package:
+`marginaleffects` directly supports the `mice` and `Amelia` imputation packages, as well as any other package that can return a list of imputed data frames. This is demonstrated below using the `iris` dataset, in which we insert missing observations randomly and then impute missing values using several packages.
 
 ```{r}
-library(mice)
 library(marginaleffects)
 set.seed(1024)
 
 dat <- iris
 dat$Sepal.Length[sample(seq_len(nrow(iris)), 40)] <- NA
 dat$Sepal.Width[sample(seq_len(nrow(iris)), 40)] <- NA
 dat$Species[sample(seq_len(nrow(iris)), 40)] <- NA
+```
+
+## `mice`
+
+First, we impute the dataset using the `mice` package:
+
+```{r}
+library(mice)
 
 dat_mice <- mice(dat, m = 20, printFlag = FALSE, .Random.seed = 1024)
 ```
@@ -56,45 +59,61 @@ mfx_mice <- avg_slopes(mod_mice, by = "Species")
 mfx_mice
 ```
 
-## Other imputation packages: `Amelia`, `missRanger`, or lists of imputed data frames.
+## `Amelia`
+
+With `Amelia`, the workflow is essentially the same. First, we impute using `Amelia`:
+
+```{r, include = FALSE}
+## no startup messages
+library(Amelia)
+```
+```{r, message = FALSE, warning = FALSE}
+library(Amelia)
+
+dat_amelia <- amelia(dat, m = 20, noms = "Species", p2s = 0)
+```
+
+Then, we use `Amelia` syntax to produce an object of class `amest` with all the models:
+
+```{r}
+mod_amelia <- with(dat_amelia, lm(Petal.Width ~ Sepal.Length * Sepal.Width + Species))
+```
 
-Several `R` packages can impute missing data. Indeed, [the `Missing Data CRAN View`](https://cran.r-project.org/web/views/MissingData.html) lists at least a dozen alternatives. Since user interface changes a lot from package to package, `marginaleffects` supports a single workflow which can be used, with some adaptation, to with all imputation packages:
+Finally, we feed the `amest` object to a `marginaleffects` function:
+
+```{r}
+mfx_amelia <- avg_slopes(mod_amelia, by = "Species")
+mfx_amelia
+```
+
+## Other imputation packages: `missRanger`, or lists of imputed data frames.
+
+Several `R` packages can impute missing data. Indeed, [the `Missing Data CRAN View`](https://cran.r-project.org/web/views/MissingData.html) lists at least a dozen alternatives. Since user interfaces change a lot from package to package, `marginaleffects` supports a single workflow that can be used, with some adaptation, with all imputation packages:
 
 1. Use an external package to create a list of imputed data frames.
 2. Apply the `datalist2mids()` function from the `miceadds` package to convert the list of imputed data frames to a `mids` object.
-3. Use the `with()` function to fit models, as illustrated in the `mice` section above.
-4. Pass the `mids` object to a `marginaleffects` function.
+3. Use the `with()` function to fit models to create `mira` object, as illustrated in the `mice` and `Amelia` sections above.
+4. Pass the `mira` object to a `marginaleffects` function.
 
-Consider two imputation packages, which can both generate lists of imputed datasets: `Amelia` and `missRanger`.
+Consider the imputation package `missRanger`, which generates a list of imputed datasets:
 
 ```{r, include = FALSE}
 ## no startup messages
-library(Amelia)
 library(miceadds)
 library(missRanger)
 ```
 
 ```{r, message = FALSE, warning = FALSE}
-library(Amelia)
 library(miceadds)
 library(missRanger)
 
-## impute data
-dat_amelia <- amelia(dat, noms = "Species", p2s = 0)$imputations
-mids_amelia <- datlist2mids(dat_amelia)
-
 ## convert lists of imputed datasets to `mids` objects
 dat_missRanger <- replicate(20, missRanger(dat, verbose = 0), simplify = FALSE)
 mids_missRanger <- datlist2mids(dat_missRanger)
 
 ## fit models
-mod_amelia <- with(mids_amelia, lm(Petal.Width ~ Sepal.Length * Sepal.Width + Species))
 mod_missRanger <- with(mids_missRanger, lm(Petal.Width ~ Sepal.Length * Sepal.Width + Species))
 
-## `Amelia` slopes
-mfx_amelia <- avg_slopes(mod_amelia, by = "Species")
-mfx_amelia
-
 ## `missRanger` slopes
 mfx_missRanger <- avg_slopes(mod_missRanger, by = "Species")
 mfx_missRanger
@@ -123,7 +142,7 @@ modelsummary(models, shape = term : contrast + Species ~ model)
 
 ## Passing new data arguments: `newdata`, `wts`, `by`, etc.
 
-Sometimes we want to pass arguments changing or specifying the data on which we will do our analysis using `marginaleffects`. This can be for reasons such as wanting to specify the values or weights at which we evaluate e.g. `avg_slopes`, or due to the underlying models not robustly preserving all the original data columns (such as `fixest` objects not saving their data in the fit object making it potentially challenging to retrieve, and even if retrievable it will not include the weights used during fitting as a column as `wts` expects when given a string).
+Sometimes we want to pass arguments changing or specifying the data on which we will do our analysis using `marginaleffects`. This can be for reasons such as wanting to specify the values or weights at which we evaluate e.g. `avg_slopes()`, or due to the underlying models not robustly preserving all the original data columns (such as `fixest` objects not saving their data in the fit object making it potentially challenging to retrieve, and even if retrievable it will not include the weights used during fitting as a column as `wts` expects when given a string).
 
 If we are not using multiple imputation, or if we want to just pass a single dataset to the several fitted models after multiple imputation, we can pass a single dataset to the `newdata` argument. However, if we wish to supply each model in our list resulting after multiple imputation with a /different/ dataset on which to calculate results, we cannot use `newdata`. Instead, in this case it can be useful to revert to a more manual (but still very easy) approach. Here is an example calculating `avg_slopes` using a different set of weights for each of the `fixest` models which we fit after multiple imputation.
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,7 @@ @@
     ## dev
     * `hypotheses()`: The `FUN` argument handles `group` columns gracefully.
+    * Native support for `Amelia` for multiple imputation.
     ## 0.15.0
@@ Expand Down @@