diff --git a/NEWS.md b/NEWS.md index a4fd3df0f..0cc49b785 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,7 +23,8 @@ * `performance_score()` should no longer fail for models where scoring rules can't be calculated. Instead, an informative message is returned. -* `check_outliers()` now properly accept the `percentage_central` argument when using the `"mcd"` method. +* `check_outliers()` now properly accept the `percentage_central` argument when + using the `"mcd"` method. # performance 0.10.8 diff --git a/R/check_outliers.R b/R/check_outliers.R index e01b20e12..ba8b6418d 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -25,7 +25,8 @@ #' @param ID Optional, to report an ID column along with the row number. #' @param ... When `method = "ics"`, further arguments in `...` are passed #' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`, -#' they are passed down to [stats::mahalanobis()]. +#' they are passed down to [stats::mahalanobis()]. `percentage_central` can +#' be specified when `method = "mcd"`. #' #' @return A logical vector of the detected outliers with a nice printing #' method: a check (message) on whether outliers were detected or not. The @@ -388,16 +389,16 @@ check_outliers.default <- function(x, ) # Get data - data <- insight::get_data(x, verbose = FALSE) + my_data <- insight::get_data(x, verbose = FALSE) # Remove non-numerics - data <- datawizard::data_select(data, select = is.numeric) + my_data <- datawizard::data_select(my_data, select = is.numeric) # Thresholds if (is.null(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) } else if (is.list(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) thresholds[names(threshold)] <- threshold[names(threshold)] } else { insight::format_error( @@ -414,15 +415,15 @@ check_outliers.default <- function(x, # Others if (all(method %in% c("cook", "pareto"))) { - df <- data.frame(Row = seq_len(nrow(as.data.frame(data)))) + my_df <- data.frame(Row = seq_len(nrow(as.data.frame(my_data)))) outlier_count <- list() outlier_var <- list() } else { - out <- check_outliers(data, method, threshold) + out <- check_outliers(my_data, method, threshold) outlier_var <- attributes(out)$outlier_var outlier_count <- attributes(out)$outlier_count - df <- attributes(out)$data - df <- df[!names(df) == "Outlier"] + my_df <- attributes(out)$data + my_df <- my_df[names(my_df) != "Outlier"] } # Cook @@ -432,7 +433,7 @@ check_outliers.default <- function(x, threshold = thresholds$cook )$data_cook - df <- datawizard::data_merge(list(df, data_cook), + my_df <- datawizard::data_merge(list(my_df, data_cook), join = "full", by = "Row" ) @@ -462,7 +463,7 @@ check_outliers.default <- function(x, ) } } else { - method <- method[!(method == "cook")] + method <- method[method != "cook"] } # Pareto @@ -472,7 +473,7 @@ check_outliers.default <- function(x, threshold = thresholds$pareto )$data_pareto - df <- datawizard::data_merge(list(df, data_pareto), + my_df <- datawizard::data_merge(list(my_df, data_pareto), join = "full", by = "Row" ) @@ -502,7 +503,7 @@ check_outliers.default <- function(x, ) } } else { - method <- method[!(method == "pareto")] + method <- method[method != "pareto"] } outlier_count$all <- datawizard::convert_na_to(outlier_count$all, @@ -534,21 +535,21 @@ check_outliers.default <- function(x, thresholds <- thresholds[names(thresholds) %in% method] # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) - df <- df[c(names(df)[names(df) != "Outlier"], "Outlier")] + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) + my_df <- my_df[c(names(my_df)[names(my_df) != "Outlier"], "Outlier")] # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Attributes class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "influential_obs") <- .influential_obs(x) attr(outlier, "variables") <- "(Whole model)" - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count @@ -801,7 +802,7 @@ check_outliers.data.frame <- function(x, ) # Remove non-numerics - data <- x + my_data <- x x <- x[, vapply(x, is.numeric, logical(1)), drop = FALSE] # Check args @@ -844,20 +845,20 @@ check_outliers.data.frame <- function(x, outlier_var <- out.meta$outlier_var # Combine outlier data - df <- out[vapply(out, is.data.frame, logical(1))] - if (length(df) > 1 && !is.null(ID)) { - df <- datawizard::data_merge(df, by = c("Row", ID)) - } else if (length(df) > 1) { - df <- datawizard::data_merge(df, by = "Row") + my_df <- out[vapply(out, is.data.frame, logical(1))] + if (length(my_df) > 1 && !is.null(ID)) { + my_df <- datawizard::data_merge(my_df, by = c("Row", ID)) + } else if (length(my_df) > 1) { + my_df <- datawizard::data_merge(my_df, by = "Row") } else { - df <- df[[1]] + my_df <- my_df[[1]] } # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Combine outlier frequency table if (length(outlier_count) > 1 && !is.null(ID)) { @@ -899,12 +900,12 @@ check_outliers.data.frame <- function(x, # Attributes class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "variables") <- names(x) - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count outlier @@ -917,7 +918,7 @@ check_outliers.data.frame <- function(x, outlier.list <- lapply(outlier.list, function(x) { x[x[[Outlier_method]] >= 0.5, ] }) - outlier.list <- outlier.list[lapply(outlier.list, nrow) > 0] + outlier.list <- outlier.list[vapply(outlier.list, nrow, numeric(1)) > 0] outlier.list <- lapply(outlier.list, datawizard::data_remove, Outlier_method, as_data_frame = TRUE @@ -1220,7 +1221,7 @@ check_outliers.grouped_df <- function(x, } # Initialize elements - data <- data.frame() + my_data <- data.frame() out <- NULL thresholds <- list() outlier_var <- list() @@ -1229,24 +1230,24 @@ check_outliers.grouped_df <- function(x, # Loop through groups for (i in seq_along(grps)) { rows <- grps[[i]] - subset <- check_outliers( + outliers_subset <- check_outliers( as.data.frame(x[rows, ]), method = method, threshold = threshold, ID = ID, ... ) - data <- rbind(data, as.data.frame(subset)) - out <- c(out, subset) - thresholds[[paste0("group_", i)]] <- attributes(subset)$threshold + my_data <- rbind(my_data, as.data.frame(outliers_subset)) + out <- c(out, outliers_subset) + thresholds[[paste0("group_", i)]] <- attributes(outliers_subset)$threshold outlier_var[[i]] <- lapply( - attributes(subset)$outlier_var, lapply, function(y) { + attributes(outliers_subset)$outlier_var, lapply, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } ) outlier_count[[i]] <- lapply( - attributes(subset)$outlier_count, function(y) { + attributes(outliers_subset)$outlier_count, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } @@ -1267,16 +1268,16 @@ check_outliers.grouped_df <- function(x, info$groups$.rows[[x]] <- as.data.frame(info$groups$.rows[[x]]) }) - data[names(info$groups)[1]] <- do.call(rbind, groups) - data <- datawizard::data_relocate( - data, + my_data[names(info$groups)[1]] <- do.call(rbind, groups) + my_data <- datawizard::data_relocate( + my_data, select = names(info$groups)[1], after = "Row" ) - data$Row <- seq_len(nrow(data)) + my_data$Row <- seq_len(nrow(my_data)) class(out) <- c("check_outliers", "see_check_outliers", class(out)) - attr(out, "data") <- data + attr(out, "data") <- my_data attr(out, "method") <- method attr(out, "threshold") <- thresholds[[1]] attr(out, "text_size") <- 3 @@ -1437,7 +1438,7 @@ check_outliers.metabin <- check_outliers.metagen bci <- 1 - 0.001 cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x)) pareto <- 0.7 - mahalanobis <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) + mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) ics <- 0.001 @@ -1454,7 +1455,7 @@ check_outliers.metabin <- check_outliers.metagen bci = bci, cook = cook, pareto = pareto, - mahalanobis = mahalanobis, + mahalanobis = mahalanobis_value, mahalanobis_robust = mahalanobis_robust, mcd = mcd, ics = ics, @@ -1730,7 +1731,8 @@ check_outliers.metabin <- check_outliers.metagen .check_outliers_mcd <- function(x, threshold = stats::qchisq(p = 1 - 0.001, df = ncol(x)), percentage_central = 0.75, - ID.names = NULL) { + ID.names = NULL, + ...) { out <- data.frame(Row = seq_len(nrow(x))) if (!is.null(ID.names)) { diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index e464028d0..7d5585b27 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -90,6 +90,10 @@ test_that("mcd which", { tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45))), 31L ) + out <- check_outliers(mtcars, method = "mcd") + expect_identical(sum(out), 8) + out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5) + expect_identical(sum(out), 15) }) ## FIXME: Fails on CRAN/windows @@ -197,10 +201,10 @@ test_that("all methods which", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 ) )), as.integer(c(9, 15, 16, 19, 20, 28, 29, 31)) @@ -219,10 +223,10 @@ test_that("multiple methods with ID", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 ), ID = "car" ))