Fix problem with percentage_central argument in check_outliers() with MCD method #673

Merged · 18 commits · Feb 4, 2024
3 changes: 3 additions & 0 deletions NEWS.md
@@ -23,6 +23,9 @@
* `performance_score()` should no longer fail for models where scoring rules
can't be calculated. Instead, an informative message is returned.

* `check_outliers()` now properly accepts the `percentage_central` argument when
  using the `"mcd"` method.
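A minimal usage sketch of the fixed behaviour (illustrative only, not part of the diff; it assumes the built-in `mtcars` data, and the flagged counts match the new tests further down):

library(performance)

# default central subset (percentage_central = 0.75); also prints an alert,
# because mtcars has a small observation-to-variable ratio (see the new check below)
out_default <- check_outliers(mtcars, method = "mcd")
sum(out_default) # 8 observations flagged

# a smaller central subset flags more observations
out_50 <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5)
sum(out_50) # 15 observations flagged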

# performance 0.10.8

## Changes
14 changes: 6 additions & 8 deletions R/check_model.R
@@ -49,7 +49,7 @@
#'
#' @details For Bayesian models from packages **rstanarm** or **brms**,
#' models will be "converted" to their frequentist counterpart, using
#' [`bayestestR::bayesian_as_frequentist`](https://easystats.github.io/bayestestR/reference/convert_bayesian_as_frequentist.html).

#' A more advanced model-check for Bayesian models will be implemented at a
#' later stage.
#'
@@ -77,7 +77,7 @@
#' plots are helpful to check model assumptions, they do not necessarily indicate
#' so-called "lack of fit", e.g. missed non-linear relationships or interactions.
#' Thus, it is always recommended to also look at
#' [effect plots, including partial residuals](https://strengejacke.github.io/ggeffects/articles/introduction_partial_residuals.html).

#'
#' @section Homogeneity of Variance:
#' This plot checks the assumption of equal variance (homoscedasticity). The
@@ -184,14 +184,12 @@
minfo <- insight::model_info(x, verbose = FALSE)

ca <- tryCatch(
{
if (minfo$is_bayesian) {
suppressWarnings(.check_assumptions_stan(x, ...))
} else if (minfo$is_linear) {
suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...))
} else {
suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...))
}
if (minfo$is_bayesian) {
suppressWarnings(.check_assumptions_stan(x, ...))
} else if (minfo$is_linear) {
suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...))
} else {
suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...))
},
error = function(e) {
NULL
117 changes: 69 additions & 48 deletions R/check_outliers.R
@@ -25,7 +25,8 @@
#' @param ID Optional, to report an ID column along with the row number.
#' @param ... When `method = "ics"`, further arguments in `...` are passed
#' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`,
#' they are passed down to [stats::mahalanobis()].
#' they are passed down to [stats::mahalanobis()]. `percentage_central` can
#' be specified when `method = "mcd"`.
#'
#' @return A logical vector of the detected outliers with a nice printing
#' method: a check (message) on whether outliers were detected or not. The
@@ -160,6 +161,9 @@
#' the data (by default, 75\%), before computing the Mahalanobis Distance. This
#' is deemed to be a more robust method of identifying and removing outliers
#' than regular Mahalanobis distance.
#' This method has a `percentage_central` argument that allows specifying
#' the breakdown point (0.75, the default, is recommended by Leys et al. 2018,
#' but a commonly used alternative is 0.50).
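# Illustrative sketch only (not part of this diff, and not necessarily the
# package's exact internals): conceptually, the "mcd" method estimates a robust
# center and covariance from the most central subset of the data, then flags
# observations whose robust Mahalanobis distance exceeds a chi-squared cutoff.
mcd_fit <- MASS::cov.mcd(iris[1:4], quantile.used = round(0.75 * nrow(iris)))
d2 <- stats::mahalanobis(iris[1:4], center = mcd_fit$center, cov = mcd_fit$cov)
flagged <- which(d2 > stats::qchisq(p = 1 - 0.001, df = 4))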
#'
#' - **Invariant Coordinate Selection (ICS)**:
#' The outliers are detected using ICS, which by default uses an alpha threshold
@@ -305,7 +309,7 @@
#' group_iris <- datawizard::data_group(iris, "Species")
#' check_outliers(group_iris)
#'
#' @examplesIf require("see") && require("bigutilsr") && require("loo") && require("MASS") && require("ICSOutlier") && require("ICS") && require("dbscan")

#' \donttest{
#' # You can also run all the methods
#' check_outliers(data, method = "all")
@@ -385,16 +389,16 @@
)

# Get data
data <- insight::get_data(x, verbose = FALSE)
my_data <- insight::get_data(x, verbose = FALSE)

# Remove non-numerics
data <- datawizard::data_select(data, select = is.numeric)
my_data <- datawizard::data_select(my_data, select = is.numeric)

# Thresholds
if (is.null(threshold)) {
thresholds <- .check_outliers_thresholds(data)
thresholds <- .check_outliers_thresholds(my_data)
} else if (is.list(threshold)) {
thresholds <- .check_outliers_thresholds(data)
thresholds <- .check_outliers_thresholds(my_data)
thresholds[names(threshold)] <- threshold[names(threshold)]
} else {
insight::format_error(
@@ -411,15 +415,15 @@

# Others
if (all(method %in% c("cook", "pareto"))) {
df <- data.frame(Row = seq_len(nrow(as.data.frame(data))))
my_df <- data.frame(Row = seq_len(nrow(as.data.frame(my_data))))
outlier_count <- list()
outlier_var <- list()
} else {
out <- check_outliers(data, method, threshold)
out <- check_outliers(my_data, method, threshold)
outlier_var <- attributes(out)$outlier_var
outlier_count <- attributes(out)$outlier_count
df <- attributes(out)$data
df <- df[!names(df) == "Outlier"]
my_df <- attributes(out)$data
my_df <- my_df[names(my_df) != "Outlier"]
}

# Cook
@@ -429,7 +433,7 @@
threshold = thresholds$cook
)$data_cook

df <- datawizard::data_merge(list(df, data_cook),
my_df <- datawizard::data_merge(list(my_df, data_cook),
join = "full",
by = "Row"
)
@@ -459,7 +463,7 @@
)
}
} else {
method <- method[!(method == "cook")]
method <- method[method != "cook"]
}

# Pareto
@@ -469,7 +473,7 @@
threshold = thresholds$pareto
)$data_pareto

df <- datawizard::data_merge(list(df, data_pareto),
my_df <- datawizard::data_merge(list(my_df, data_pareto),
join = "full",
by = "Row"
)
@@ -499,7 +503,7 @@
)
}
} else {
method <- method[!(method == "pareto")]
method <- method[method != "pareto"]
}

outlier_count$all <- datawizard::convert_na_to(outlier_count$all,
@@ -531,21 +535,21 @@
thresholds <- thresholds[names(thresholds) %in% method]

# Composite outlier score
df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)])
df <- df[c(names(df)[names(df) != "Outlier"], "Outlier")]
my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)])
my_df <- my_df[c(names(my_df)[names(my_df) != "Outlier"], "Outlier")]

# Out
outlier <- df$Outlier > 0.5
outlier <- my_df$Outlier > 0.5

# Attributes
class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier))
attr(outlier, "data") <- df
attr(outlier, "data") <- my_df
attr(outlier, "threshold") <- thresholds
attr(outlier, "method") <- method
attr(outlier, "text_size") <- 3
attr(outlier, "influential_obs") <- .influential_obs(x)
attr(outlier, "variables") <- "(Whole model)"
attr(outlier, "raw_data") <- data
attr(outlier, "raw_data") <- my_data
attr(outlier, "outlier_var") <- outlier_var
attr(outlier, "outlier_count") <- outlier_count

@@ -798,7 +802,7 @@
)

# Remove non-numerics
data <- x
my_data <- x
x <- x[, vapply(x, is.numeric, logical(1)), drop = FALSE]

# Check args
@@ -841,20 +845,20 @@
outlier_var <- out.meta$outlier_var

# Combine outlier data
df <- out[vapply(out, is.data.frame, logical(1))]
if (length(df) > 1 && !is.null(ID)) {
df <- datawizard::data_merge(df, by = c("Row", ID))
} else if (length(df) > 1) {
df <- datawizard::data_merge(df, by = "Row")
my_df <- out[vapply(out, is.data.frame, logical(1))]
if (length(my_df) > 1 && !is.null(ID)) {
my_df <- datawizard::data_merge(my_df, by = c("Row", ID))
} else if (length(my_df) > 1) {
my_df <- datawizard::data_merge(my_df, by = "Row")
} else {
df <- df[[1]]
my_df <- my_df[[1]]
}

# Composite outlier score
df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)])
my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)])

# Out
outlier <- df$Outlier > 0.5
outlier <- my_df$Outlier > 0.5

# Combine outlier frequency table
if (length(outlier_count) > 1 && !is.null(ID)) {
@@ -896,12 +900,12 @@

# Attributes
class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier))
attr(outlier, "data") <- df
attr(outlier, "data") <- my_df
attr(outlier, "threshold") <- thresholds
attr(outlier, "method") <- method
attr(outlier, "text_size") <- 3
attr(outlier, "variables") <- names(x)
attr(outlier, "raw_data") <- data
attr(outlier, "raw_data") <- my_data
attr(outlier, "outlier_var") <- outlier_var
attr(outlier, "outlier_count") <- outlier_count
outlier
@@ -914,7 +918,7 @@
outlier.list <- lapply(outlier.list, function(x) {
x[x[[Outlier_method]] >= 0.5, ]
})
outlier.list <- outlier.list[lapply(outlier.list, nrow) > 0]
outlier.list <- outlier.list[vapply(outlier.list, nrow, numeric(1)) > 0]
outlier.list <- lapply(outlier.list, datawizard::data_remove,
Outlier_method,
as_data_frame = TRUE
@@ -1098,8 +1102,8 @@
out <- c(out, .check_outliers_mcd(
x,
threshold = thresholds$mcd,
percentage_central = 0.66,
ID.names = ID.names
ID.names = ID.names,
...
))

count.table <- datawizard::data_filter(
@@ -1171,7 +1175,7 @@

# Isolation Forest
# if ("iforest" %in% method) {
# out <- c(out, .check_outliers_iforest(x, threshold = thresholds$iforest))

# }

# Local Outlier Factor
@@ -1217,7 +1221,7 @@
}

# Initialize elements
data <- data.frame()
my_data <- data.frame()
out <- NULL
thresholds <- list()
outlier_var <- list()
@@ -1226,24 +1230,24 @@
# Loop through groups
for (i in seq_along(grps)) {
rows <- grps[[i]]
subset <- check_outliers(
outliers_subset <- check_outliers(
as.data.frame(x[rows, ]),
method = method,
threshold = threshold,
ID = ID,
...
)
data <- rbind(data, as.data.frame(subset))
out <- c(out, subset)
thresholds[[paste0("group_", i)]] <- attributes(subset)$threshold
my_data <- rbind(my_data, as.data.frame(outliers_subset))
out <- c(out, outliers_subset)
thresholds[[paste0("group_", i)]] <- attributes(outliers_subset)$threshold
outlier_var[[i]] <- lapply(
attributes(subset)$outlier_var, lapply, function(y) {
attributes(outliers_subset)$outlier_var, lapply, function(y) {
y$Row <- rows[which(seq_along(rows) %in% y$Row)]
y
}
)
outlier_count[[i]] <- lapply(
attributes(subset)$outlier_count, function(y) {
attributes(outliers_subset)$outlier_count, function(y) {
y$Row <- rows[which(seq_along(rows) %in% y$Row)]
y
}
@@ -1264,16 +1268,16 @@
info$groups$.rows[[x]] <- as.data.frame(info$groups$.rows[[x]])
})

data[names(info$groups)[1]] <- do.call(rbind, groups)
data <- datawizard::data_relocate(
data,
my_data[names(info$groups)[1]] <- do.call(rbind, groups)
my_data <- datawizard::data_relocate(
my_data,
select = names(info$groups)[1],
after = "Row"
)
data$Row <- seq_len(nrow(data))
my_data$Row <- seq_len(nrow(my_data))

class(out) <- c("check_outliers", "see_check_outliers", class(out))
attr(out, "data") <- data
attr(out, "data") <- my_data
attr(out, "method") <- method
attr(out, "threshold") <- thresholds[[1]]
attr(out, "text_size") <- 3
@@ -1434,7 +1438,7 @@
bci <- 1 - 0.001
cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x))
pareto <- 0.7
mahalanobis <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
ics <- 0.001
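# Worked example (illustrative, not part of this diff): with 4 numeric columns,
# the default mahalanobis/mcd cutoff is the 99.9th percentile of a chi-squared
# distribution with 4 degrees of freedom:
stats::qchisq(p = 1 - 0.001, df = 4) # ~ 18.47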
@@ -1451,7 +1455,7 @@
bci = bci,
cook = cook,
pareto = pareto,
mahalanobis = mahalanobis,
mahalanobis = mahalanobis_value,
mahalanobis_robust = mahalanobis_robust,
mcd = mcd,
ics = ics,
@@ -1555,7 +1559,7 @@
out <- cbind(out, ID.names)
}

# out$Distance_IQR <- Distance_IQR


out$Distance_IQR <- vapply(as.data.frame(t(Distance_IQR)), function(x) {
ifelse(all(is.na(x)), NA_real_, max(x, na.rm = TRUE))
@@ -1726,14 +1730,31 @@

.check_outliers_mcd <- function(x,
threshold = stats::qchisq(p = 1 - 0.001, df = ncol(x)),
percentage_central = 0.50,
ID.names = NULL) {
percentage_central = 0.75,
ID.names = NULL,
verbose = TRUE,
...) {
out <- data.frame(Row = seq_len(nrow(x)))

if (!is.null(ID.names)) {
out <- cbind(out, ID.names)
}

# check whether the N to p ratio is too small, because then MCD flags too many outliers
# See #672: This does seem to be a function of the N/p (N = sample size; p =

# number of parameters) ratio. When it is larger than 10, the % of outliers
# flagged is okay (in well behaved data). This makes sense: the MCD looks at
# the cov matrix of subsamples of the data - with high dimensional data, small
# sample sizes will give highly variable cov matrices, and so the "smallest"
# one will probably misrepresent the data.

if ((nrow(x) / ncol(x)) <= 10 && isTRUE(verbose)) {
insight::format_alert(
"Sample size is too small resp. number of variables is too high in your data for MCD to be reliable.",
"You may try to increase the `percentage_central` argument (must be between 0 and 1), or choose another method."
)
}
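# Worked example of the heuristic above (illustrative, not part of this diff):
# mtcars has 32 rows and 11 numeric columns, so nrow(x) / ncol(x) = 32 / 11 ~ 2.9,
# which is <= 10 and triggers the alert; a 100 x 5 data set (ratio 20) would not.
nrow(mtcars) / ncol(mtcars) # ~ 2.9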

insight::check_if_installed("MASS")

# Compute
@@ -1861,15 +1882,15 @@


# .check_outliers_iforest <- function(x, threshold = 0.025) {
# out <- data.frame(Row = seq_len(nrow(x)))

#
# # Install packages
# insight::check_if_installed("solitude")

#
# # Compute
# if (utils::packageVersion("solitude") < "0.2.0") {
# iforest <- solitude::isolationForest(x)

# out$Distance_iforest <- stats::predict(iforest, x, type = "anomaly_score")

# } else if (utils::packageVersion("solitude") == "0.2.0") {
# stop(paste("Must update package `solitude` (above version 0.2.0).",
# "Please run `install.packages('solitude')`."), call. = FALSE)
3 changes: 3 additions & 0 deletions man/check_outliers.Rd

Some generated files are not rendered by default.

20 changes: 12 additions & 8 deletions tests/testthat/test-check_outliers.R
@@ -90,6 +90,10 @@ test_that("mcd which", {
tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45))),
31L
)
out <- check_outliers(mtcars, method = "mcd")
expect_identical(sum(out), 8L)
out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5)
expect_identical(sum(out), 15L)
})

## FIXME: Fails on CRAN/windows
@@ -197,10 +201,10 @@ test_that("all methods which", {
"mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof"
),
threshold = list(
"zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2,
"ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95,
"mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25,
"optics" = 14, "lof" = 0.005
zscore = 2.2, zscore_robust = 2.2, iqr = 1.2,
ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95,
mahalanobis = 20, mahalanobis_robust = 25, mcd = 25,
optics = 14, lof = 0.005
)
)),
as.integer(c(9, 15, 16, 19, 20, 28, 29, 31))
Expand All @@ -219,10 +223,10 @@ test_that("multiple methods with ID", {
"mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof"
),
threshold = list(
"zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2,
"ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95,
"mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25,
"optics" = 14, "lof" = 0.005
zscore = 2.2, zscore_robust = 2.2, iqr = 1.2,
ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95,
mahalanobis = 20, mahalanobis_robust = 25, mcd = 25,
optics = 14, lof = 0.005
),
ID = "car"
))