From 605f9e369dae9a379609ae37f60489f320786cd6 Mon Sep 17 00:00:00 2001 From: rempsyc Date: Wed, 24 Jan 2024 18:03:05 +0100 Subject: [PATCH 01/17] fix problem with percentage_central argument in check_outliers with mcd method --- NEWS.md | 2 ++ R/check_outliers.R | 9 ++++++--- man/check_outliers.Rd | 3 +++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5475776cc..a4fd3df0f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,6 +23,8 @@ * `performance_score()` should no longer fail for models where scoring rules can't be calculated. Instead, an informative message is returned. +* `check_outliers()` now properly accept the `percentage_central` argument when using the `"mcd"` method. + # performance 0.10.8 ## Changes diff --git a/R/check_outliers.R b/R/check_outliers.R index 2a1d185d3..e01b20e12 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -160,6 +160,9 @@ #' the data (by default, 66\%), before computing the Mahalanobis Distance. This #' is deemed to be a more robust method of identifying and removing outliers #' than regular Mahalanobis distance. +#' This method has a `percentage_central` argument that allows specifying +#' the breakdown point (0.75, the default, is recommended by Leys et al. 2018, +#' but a commonly used alternative is 0.50). #' #' - **Invariant Coordinate Selection (ICS)**: #' The outlier are detected using ICS, which by default uses an alpha threshold @@ -1098,8 +1101,8 @@ check_outliers.data.frame <- function(x, out <- c(out, .check_outliers_mcd( x, threshold = thresholds$mcd, - percentage_central = 0.66, - ID.names = ID.names + ID.names = ID.names, + ... )) count.table <- datawizard::data_filter( @@ -1726,7 +1729,7 @@ check_outliers.metabin <- check_outliers.metagen .check_outliers_mcd <- function(x, threshold = stats::qchisq(p = 1 - 0.001, df = ncol(x)), - percentage_central = 0.50, + percentage_central = 0.75, ID.names = NULL) { out <- data.frame(Row = seq_len(nrow(x))) diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index 74c992b6f..9c2a046b4 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -189,6 +189,9 @@ calculates the mean and covariance matrix based on the most central subset of the data (by default, 66\\%), before computing the Mahalanobis Distance. This is deemed to be a more robust method of identifying and removing outliers than regular Mahalanobis distance. +This method has a \code{percentage_central} argument that allows specifying +the breakdown point (0.75, the default, is recommended by Leys et al. 2018, +but a commonly used alternative is 0.50). \item \strong{Invariant Coordinate Selection (ICS)}: The outlier are detected using ICS, which by default uses an alpha threshold of 0.025 (corresponding to the 2.5\\% most extreme observations) as a cut-off From a13c9858d71d0c811e5bbe69a737e133ccd1222c Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 3 Feb 2024 17:09:21 +0100 Subject: [PATCH 02/17] lintr, add test --- NEWS.md | 3 +- R/check_outliers.R | 92 ++++++++++++++-------------- tests/testthat/test-check_outliers.R | 20 +++--- 3 files changed, 61 insertions(+), 54 deletions(-) diff --git a/NEWS.md b/NEWS.md index a4fd3df0f..0cc49b785 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,7 +23,8 @@ * `performance_score()` should no longer fail for models where scoring rules can't be calculated. Instead, an informative message is returned. -* `check_outliers()` now properly accept the `percentage_central` argument when using the `"mcd"` method. +* `check_outliers()` now properly accept the `percentage_central` argument when + using the `"mcd"` method. # performance 0.10.8 diff --git a/R/check_outliers.R b/R/check_outliers.R index e01b20e12..ba8b6418d 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -25,7 +25,8 @@ #' @param ID Optional, to report an ID column along with the row number. #' @param ... When `method = "ics"`, further arguments in `...` are passed #' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`, -#' they are passed down to [stats::mahalanobis()]. +#' they are passed down to [stats::mahalanobis()]. `percentage_central` can +#' be specified when `method = "mcd"`. #' #' @return A logical vector of the detected outliers with a nice printing #' method: a check (message) on whether outliers were detected or not. The @@ -388,16 +389,16 @@ check_outliers.default <- function(x, ) # Get data - data <- insight::get_data(x, verbose = FALSE) + my_data <- insight::get_data(x, verbose = FALSE) # Remove non-numerics - data <- datawizard::data_select(data, select = is.numeric) + my_data <- datawizard::data_select(my_data, select = is.numeric) # Thresholds if (is.null(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) } else if (is.list(threshold)) { - thresholds <- .check_outliers_thresholds(data) + thresholds <- .check_outliers_thresholds(my_data) thresholds[names(threshold)] <- threshold[names(threshold)] } else { insight::format_error( @@ -414,15 +415,15 @@ check_outliers.default <- function(x, # Others if (all(method %in% c("cook", "pareto"))) { - df <- data.frame(Row = seq_len(nrow(as.data.frame(data)))) + my_df <- data.frame(Row = seq_len(nrow(as.data.frame(my_data)))) outlier_count <- list() outlier_var <- list() } else { - out <- check_outliers(data, method, threshold) + out <- check_outliers(my_data, method, threshold) outlier_var <- attributes(out)$outlier_var outlier_count <- attributes(out)$outlier_count - df <- attributes(out)$data - df <- df[!names(df) == "Outlier"] + my_df <- attributes(out)$data + my_df <- my_df[names(my_df) != "Outlier"] } # Cook @@ -432,7 +433,7 @@ check_outliers.default <- function(x, threshold = thresholds$cook )$data_cook - df <- datawizard::data_merge(list(df, data_cook), + my_df <- datawizard::data_merge(list(my_df, data_cook), join = "full", by = "Row" ) @@ -462,7 +463,7 @@ check_outliers.default <- function(x, ) } } else { - method <- method[!(method == "cook")] + method <- method[method != "cook"] } # Pareto @@ -472,7 +473,7 @@ check_outliers.default <- function(x, threshold = thresholds$pareto )$data_pareto - df <- datawizard::data_merge(list(df, data_pareto), + my_df <- datawizard::data_merge(list(my_df, data_pareto), join = "full", by = "Row" ) @@ -502,7 +503,7 @@ check_outliers.default <- function(x, ) } } else { - method <- method[!(method == "pareto")] + method <- method[method != "pareto"] } outlier_count$all <- datawizard::convert_na_to(outlier_count$all, @@ -534,21 +535,21 @@ check_outliers.default <- function(x, thresholds <- thresholds[names(thresholds) %in% method] # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) - df <- df[c(names(df)[names(df) != "Outlier"], "Outlier")] + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) + my_df <- my_df[c(names(my_df)[names(my_df) != "Outlier"], "Outlier")] # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Attributes class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "influential_obs") <- .influential_obs(x) attr(outlier, "variables") <- "(Whole model)" - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count @@ -801,7 +802,7 @@ check_outliers.data.frame <- function(x, ) # Remove non-numerics - data <- x + my_data <- x x <- x[, vapply(x, is.numeric, logical(1)), drop = FALSE] # Check args @@ -844,20 +845,20 @@ check_outliers.data.frame <- function(x, outlier_var <- out.meta$outlier_var # Combine outlier data - df <- out[vapply(out, is.data.frame, logical(1))] - if (length(df) > 1 && !is.null(ID)) { - df <- datawizard::data_merge(df, by = c("Row", ID)) - } else if (length(df) > 1) { - df <- datawizard::data_merge(df, by = "Row") + my_df <- out[vapply(out, is.data.frame, logical(1))] + if (length(my_df) > 1 && !is.null(ID)) { + my_df <- datawizard::data_merge(my_df, by = c("Row", ID)) + } else if (length(my_df) > 1) { + my_df <- datawizard::data_merge(my_df, by = "Row") } else { - df <- df[[1]] + my_df <- my_df[[1]] } # Composite outlier score - df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)]) + my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)]) # Out - outlier <- df$Outlier > 0.5 + outlier <- my_df$Outlier > 0.5 # Combine outlier frequency table if (length(outlier_count) > 1 && !is.null(ID)) { @@ -899,12 +900,12 @@ check_outliers.data.frame <- function(x, # Attributes class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier)) - attr(outlier, "data") <- df + attr(outlier, "data") <- my_df attr(outlier, "threshold") <- thresholds attr(outlier, "method") <- method attr(outlier, "text_size") <- 3 attr(outlier, "variables") <- names(x) - attr(outlier, "raw_data") <- data + attr(outlier, "raw_data") <- my_data attr(outlier, "outlier_var") <- outlier_var attr(outlier, "outlier_count") <- outlier_count outlier @@ -917,7 +918,7 @@ check_outliers.data.frame <- function(x, outlier.list <- lapply(outlier.list, function(x) { x[x[[Outlier_method]] >= 0.5, ] }) - outlier.list <- outlier.list[lapply(outlier.list, nrow) > 0] + outlier.list <- outlier.list[vapply(outlier.list, nrow, numeric(1)) > 0] outlier.list <- lapply(outlier.list, datawizard::data_remove, Outlier_method, as_data_frame = TRUE @@ -1220,7 +1221,7 @@ check_outliers.grouped_df <- function(x, } # Initialize elements - data <- data.frame() + my_data <- data.frame() out <- NULL thresholds <- list() outlier_var <- list() @@ -1229,24 +1230,24 @@ check_outliers.grouped_df <- function(x, # Loop through groups for (i in seq_along(grps)) { rows <- grps[[i]] - subset <- check_outliers( + outliers_subset <- check_outliers( as.data.frame(x[rows, ]), method = method, threshold = threshold, ID = ID, ... ) - data <- rbind(data, as.data.frame(subset)) - out <- c(out, subset) - thresholds[[paste0("group_", i)]] <- attributes(subset)$threshold + my_data <- rbind(my_data, as.data.frame(outliers_subset)) + out <- c(out, outliers_subset) + thresholds[[paste0("group_", i)]] <- attributes(outliers_subset)$threshold outlier_var[[i]] <- lapply( - attributes(subset)$outlier_var, lapply, function(y) { + attributes(outliers_subset)$outlier_var, lapply, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } ) outlier_count[[i]] <- lapply( - attributes(subset)$outlier_count, function(y) { + attributes(outliers_subset)$outlier_count, function(y) { y$Row <- rows[which(seq_along(rows) %in% y$Row)] y } @@ -1267,16 +1268,16 @@ check_outliers.grouped_df <- function(x, info$groups$.rows[[x]] <- as.data.frame(info$groups$.rows[[x]]) }) - data[names(info$groups)[1]] <- do.call(rbind, groups) - data <- datawizard::data_relocate( - data, + my_data[names(info$groups)[1]] <- do.call(rbind, groups) + my_data <- datawizard::data_relocate( + my_data, select = names(info$groups)[1], after = "Row" ) - data$Row <- seq_len(nrow(data)) + my_data$Row <- seq_len(nrow(my_data)) class(out) <- c("check_outliers", "see_check_outliers", class(out)) - attr(out, "data") <- data + attr(out, "data") <- my_data attr(out, "method") <- method attr(out, "threshold") <- thresholds[[1]] attr(out, "text_size") <- 3 @@ -1437,7 +1438,7 @@ check_outliers.metabin <- check_outliers.metagen bci <- 1 - 0.001 cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x)) pareto <- 0.7 - mahalanobis <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) + mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) ics <- 0.001 @@ -1454,7 +1455,7 @@ check_outliers.metabin <- check_outliers.metagen bci = bci, cook = cook, pareto = pareto, - mahalanobis = mahalanobis, + mahalanobis = mahalanobis_value, mahalanobis_robust = mahalanobis_robust, mcd = mcd, ics = ics, @@ -1730,7 +1731,8 @@ check_outliers.metabin <- check_outliers.metagen .check_outliers_mcd <- function(x, threshold = stats::qchisq(p = 1 - 0.001, df = ncol(x)), percentage_central = 0.75, - ID.names = NULL) { + ID.names = NULL, + ...) { out <- data.frame(Row = seq_len(nrow(x))) if (!is.null(ID.names)) { diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index e464028d0..7d5585b27 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -90,6 +90,10 @@ test_that("mcd which", { tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45))), 31L ) + out <- check_outliers(mtcars, method = "mcd") + expect_identical(sum(out), 8) + out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5) + expect_identical(sum(out), 15) }) ## FIXME: Fails on CRAN/windows @@ -197,10 +201,10 @@ test_that("all methods which", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 ) )), as.integer(c(9, 15, 16, 19, 20, 28, 29, 31)) @@ -219,10 +223,10 @@ test_that("multiple methods with ID", { "mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof" ), threshold = list( - "zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2, - "ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95, - "mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25, - "optics" = 14, "lof" = 0.005 + zscore = 2.2, zscore_robust = 2.2, iqr = 1.2, + ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, + mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, + optics = 14, lof = 0.005 ), ID = "car" )) From 13882ae56fece1f5da06b7ea1fe735d1140cd9c0 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 3 Feb 2024 22:18:43 +0100 Subject: [PATCH 03/17] lintr --- R/check_model.R | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/R/check_model.R b/R/check_model.R index e261704a5..2a8672823 100644 --- a/R/check_model.R +++ b/R/check_model.R @@ -184,14 +184,12 @@ check_model.default <- function(x, minfo <- insight::model_info(x, verbose = FALSE) ca <- tryCatch( - { - if (minfo$is_bayesian) { - suppressWarnings(.check_assumptions_stan(x, ...)) - } else if (minfo$is_linear) { - suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) - } else { - suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) - } + if (minfo$is_bayesian) { + suppressWarnings(.check_assumptions_stan(x, ...)) + } else if (minfo$is_linear) { + suppressWarnings(.check_assumptions_linear(x, minfo, verbose, ...)) + } else { + suppressWarnings(.check_assumptions_glm(x, minfo, verbose, ...)) }, error = function(e) { NULL From 73e34283ef3c47e41da1a679139411cbf72a9141 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 11:41:44 +0100 Subject: [PATCH 04/17] flag message --- R/check_outliers.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/R/check_outliers.R b/R/check_outliers.R index ba8b6418d..a4dfd3aa8 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -1732,6 +1732,7 @@ check_outliers.metabin <- check_outliers.metagen threshold = stats::qchisq(p = 1 - 0.001, df = ncol(x)), percentage_central = 0.75, ID.names = NULL, + verbose = TRUE, ...) { out <- data.frame(Row = seq_len(nrow(x))) @@ -1739,6 +1740,18 @@ check_outliers.metabin <- check_outliers.metagen out <- cbind(out, ID.names) } + # check whether N to p ratio is not too large, else MCD flags too many outliers + # See #672: This does seem to be a function of the N/p (N = sample size; p = + # number of parameters) ratio. When it is larger than 10, the % of outliers + # flagged is okay (in well behaved data). This makes sense: the MCD looks at + # the cov matrix of subsamples of the data - with high dimensional data, small + # samples sizes will give highly variable cov matrices, as so the "smallest" + # one will probably miss-represent the data. + + if ((prod(dim(x)) / nrow(x)) > 10 && isTRUE(verbose)) { + insight::format_alert("Sample size is too small resp. number of variables is too high in your data for MCD to be reliable.") # nolint + } + insight::check_if_installed("MASS") # Compute From d40a4524087e30a7d5717f0956b385fac9c828fb Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 11:44:31 +0100 Subject: [PATCH 05/17] fix --- R/check_outliers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index a4dfd3aa8..6bce0d4dd 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -1748,7 +1748,7 @@ check_outliers.metabin <- check_outliers.metagen # samples sizes will give highly variable cov matrices, as so the "smallest" # one will probably miss-represent the data. - if ((prod(dim(x)) / nrow(x)) > 10 && isTRUE(verbose)) { + if ((nrow(x) / ncol(x)) > 10 && isTRUE(verbose)) { insight::format_alert("Sample size is too small resp. number of variables is too high in your data for MCD to be reliable.") # nolint } From 10a0c4ebcab66975c5611a01ab145c7a61a3e8ab Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 11:44:53 +0100 Subject: [PATCH 06/17] fix --- R/check_outliers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index 6bce0d4dd..af45cecbe 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -1748,7 +1748,7 @@ check_outliers.metabin <- check_outliers.metagen # samples sizes will give highly variable cov matrices, as so the "smallest" # one will probably miss-represent the data. - if ((nrow(x) / ncol(x)) > 10 && isTRUE(verbose)) { + if ((nrow(x) / ncol(x)) <= 10 && isTRUE(verbose)) { insight::format_alert("Sample size is too small resp. number of variables is too high in your data for MCD to be reliable.") # nolint } From 9504f802e2bf9f12f3f2925972b32bf6bb58ff2b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 12:02:19 +0100 Subject: [PATCH 07/17] message --- R/check_outliers.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index af45cecbe..2226d95ee 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -1749,7 +1749,10 @@ check_outliers.metabin <- check_outliers.metagen # one will probably miss-represent the data. if ((nrow(x) / ncol(x)) <= 10 && isTRUE(verbose)) { - insight::format_alert("Sample size is too small resp. number of variables is too high in your data for MCD to be reliable.") # nolint + insight::format_alert( + "Sample size is too small resp. number of variables is too high in your data for MCD to be reliable.", + "You may try to increase the `percentage_central` argument (must be between 0 and 1), or choose another method." + ) } insight::check_if_installed("MASS") From 941da8855d6f529fd07ccf2d94a85da4575f3636 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 16:40:06 +0100 Subject: [PATCH 08/17] warning instead msg --- R/check_outliers.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index 2226d95ee..85bf73e63 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -1749,8 +1749,8 @@ check_outliers.metabin <- check_outliers.metagen # one will probably miss-represent the data. if ((nrow(x) / ncol(x)) <= 10 && isTRUE(verbose)) { - insight::format_alert( - "Sample size is too small resp. number of variables is too high in your data for MCD to be reliable.", + insight::format_warning( + "Sample size is too small respectively number of variables is too high in your data for MCD to be reliable.", "You may try to increase the `percentage_central` argument (must be between 0 and 1), or choose another method." ) } From 56eaf72d001eadf69e50cfdcd4fff332df649194 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 18:04:30 +0100 Subject: [PATCH 09/17] fix test --- tests/testthat/test-check_outliers.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index 7d5585b27..ec9bb7ffc 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -91,9 +91,9 @@ test_that("mcd which", { 31L ) out <- check_outliers(mtcars, method = "mcd") - expect_identical(sum(out), 8) + expect_identical(sum(out), 8L) out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5) - expect_identical(sum(out), 15) + expect_identical(sum(out), 15L) }) ## FIXME: Fails on CRAN/windows From 4ea505d21ac91dfbf355dec495b9dc870e9f4b0d Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 18:10:59 +0100 Subject: [PATCH 10/17] fix example --- DESCRIPTION | 2 +- R/check_outliers.R | 2 +- man/check_outliers.Rd | 5 +++-- man/performance-package.Rd | 1 - 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 30285f897..2b35751ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -144,7 +144,7 @@ Suggests: withr (>= 3.0.0) Encoding: UTF-8 Language: en-US -RoxygenNote: 7.2.3.9000 +RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) Config/testthat/edition: 3 Config/testthat/parallel: true diff --git a/R/check_outliers.R b/R/check_outliers.R index 85bf73e63..52a257e2a 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -312,7 +312,7 @@ #' @examplesIf require("see") && require("bigutilsr") && require("loo") && require("MASS") && require("ICSOutlier") && require("ICS") && require("dbscan") #' \donttest{ #' # You can also run all the methods -#' check_outliers(data, method = "all") +#' check_outliers(data, method = "all", verbose = FALSE) #' #' # For statistical models --------------------------------------------- #' # select only mpg and disp (continuous) diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index 9c2a046b4..8dc0326bb 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -26,7 +26,8 @@ check_outliers(x, ...) \item{...}{When \code{method = "ics"}, further arguments in \code{...} are passed down to \code{\link[ICSOutlier:ics.outlier]{ICSOutlier::ics.outlier()}}. When \code{method = "mahalanobis"}, -they are passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}.} +they are passed down to \code{\link[stats:mahalanobis]{stats::mahalanobis()}}. \code{percentage_central} can +be specified when \code{method = "mcd"}.} \item{method}{The outlier detection method(s). Can be \code{"all"} or some of \code{"cook"}, \code{"pareto"}, \code{"zscore"}, \code{"zscore_robust"}, \code{"iqr"}, \code{"ci"}, \code{"eti"}, @@ -298,7 +299,7 @@ check_outliers(group_iris) \dontshow{if (require("see") && require("bigutilsr") && require("loo") && require("MASS") && require("ICSOutlier") && require("ICS") && require("dbscan")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} \donttest{ # You can also run all the methods -check_outliers(data, method = "all") +check_outliers(data, method = "all", verbose = FALSE) # For statistical models --------------------------------------------- # select only mpg and disp (continuous) diff --git a/man/performance-package.Rd b/man/performance-package.Rd index abbdcf21f..b3f1d5a3f 100644 --- a/man/performance-package.Rd +++ b/man/performance-package.Rd @@ -3,7 +3,6 @@ \docType{package} \name{performance-package} \alias{performance-package} -\alias{_PACKAGE} \title{performance: An R Package for Assessment, Comparison and Testing of Statistical Models} \description{ From 7250bb753b103c9ecfe8e91b35913a804aebf78b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 18:24:35 +0100 Subject: [PATCH 11/17] suppress warnings --- vignettes/check_outliers.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/check_outliers.Rmd b/vignettes/check_outliers.Rmd index d9f906d6c..0c567489a 100644 --- a/vignettes/check_outliers.Rmd +++ b/vignettes/check_outliers.Rmd @@ -132,7 +132,7 @@ One common approach for this is to compute multivariate distance metrics such as In *{performance}*'s `check_outliers()`, one can use this approach with `method = "mcd"`.^[Our default threshold for the MCD method is defined by `stats::qchisq(p = 1 - 0.001, df = ncol(x))`, which again is an approximation of the critical value for _p_ < .001 consistent with the thresholds of our other methods.] ```{r multivariate} -outliers <- check_outliers(data, method = "mcd") +outliers <- check_outliers(data, method = "mcd", verbose = FALSE) outliers ``` @@ -227,7 +227,7 @@ rempsyc::nice_scatter(data, "height", "weight") Using either the *z*-score or MCD methods, our model-consistent observation will be incorrectly flagged as an outlier or influential observation. ```{r} -outliers <- check_outliers(model, method = c("zscore_robust", "mcd")) +outliers <- check_outliers(model, method = c("zscore_robust", "mcd"), verbose = FALSE) which(outliers) ``` @@ -248,7 +248,7 @@ The *{performance}* package also offers an alternative, consensus-based approach In practice, this approach computes a composite outlier score, formed of the average of the binary (0 or 1) classification results of each method. It represents the probability that each observation is classified as an outlier by at least one method. The default decision rule classifies rows with composite outlier scores superior or equal to 0.5 as outlier observations (i.e., that were classified as outliers by at least half of the methods). In *{performance}*'s `check_outliers()`, one can use this approach by including all desired methods in the corresponding argument. ```{r multimethod, fig.cap = "Visual depiction of outliers using several different statistical outlier detection methods."} -outliers <- check_outliers(model, method = c("zscore_robust", "mcd", "cook")) +outliers <- check_outliers(model, method = c("zscore_robust", "mcd", "cook"), verbose = FALSE) which(outliers) ``` From bb17417bb968dcbc85f96a822b50d3b883831e3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Th=C3=A9riault?= <13123390+rempsyc@users.noreply.github.com> Date: Sun, 4 Feb 2024 19:30:30 +0100 Subject: [PATCH 12/17] update warning message --- R/check_outliers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/check_outliers.R b/R/check_outliers.R index 52a257e2a..f3bf7f5a9 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -1750,7 +1750,7 @@ check_outliers.metabin <- check_outliers.metagen if ((nrow(x) / ncol(x)) <= 10 && isTRUE(verbose)) { insight::format_warning( - "Sample size is too small respectively number of variables is too high in your data for MCD to be reliable.", + "The sample size is too small in your data, relative to the number of variables, for MCD to be reliable.", "You may try to increase the `percentage_central` argument (must be between 0 and 1), or choose another method." ) } From 49b885ff38f911461ded298f25c146a83aff684c Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 19:44:56 +0100 Subject: [PATCH 13/17] test, lintr --- R/check_heterogeneity_bias.R | 10 +++++----- tests/testthat/test-check_outliers.R | 14 ++++++++++---- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/R/check_heterogeneity_bias.R b/R/check_heterogeneity_bias.R index b87aa9962..d9bb337f9 100644 --- a/R/check_heterogeneity_bias.R +++ b/R/check_heterogeneity_bias.R @@ -31,9 +31,9 @@ check_heterogeneity_bias <- function(x, select = NULL, group = NULL) { if (insight::is_model(x)) { group <- insight::find_random(x, split_nested = TRUE, flatten = TRUE) if (is.null(group)) { - insight::format_error("Model is no mixed model. Please provide a mixed model, or a data frame and arguments `select` and `group`.") + insight::format_error("Model is no mixed model. Please provide a mixed model, or a data frame and arguments `select` and `group`.") # nolint } - data <- insight::get_data(x, source = "mf", verbose = FALSE) + my_data <- insight::get_data(x, source = "mf", verbose = FALSE) select <- insight::find_predictors(x, effects = "fixed", component = "conditional", flatten = TRUE) } else { if (inherits(select, "formula")) { @@ -42,15 +42,15 @@ check_heterogeneity_bias <- function(x, select = NULL, group = NULL) { if (inherits(group, "formula")) { group <- all.vars(group) } - data <- x + my_data <- x } - unique_groups <- .n_unique(data[[group]]) + unique_groups <- .n_unique(my_data[[group]]) combinations <- expand.grid(select, group) result <- Map(function(predictor, id) { # demean predictor - d <- datawizard::demean(data, select = predictor, group = id, verbose = FALSE) + d <- datawizard::demean(my_data, select = predictor, group = id, verbose = FALSE) # get new names within_name <- paste0(predictor, "_within") diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index ec9bb7ffc..02c6804a5 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -87,12 +87,17 @@ test_that("mcd which", { # (not clear why method mcd needs a seed) set.seed(42) expect_identical( - tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45))), + tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45, verbose = FALSE))), 31L ) - out <- check_outliers(mtcars, method = "mcd") + expect_warning( + { + out <- check_outliers(mtcars, method = "mcd") + }, + regex = "The sample size is too small" + ) expect_identical(sum(out), 8L) - out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5) + out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5, verbose = FALSE) expect_identical(sum(out), 15L) }) @@ -228,7 +233,8 @@ test_that("multiple methods with ID", { mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, optics = 14, lof = 0.005 ), - ID = "car" + ID = "car", + verbose = FALSE )) expect_identical( x$outlier_var$zscore$mpg$car, From 435f975d9f10b60be798f4d9144161488f31c712 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 20:01:58 +0100 Subject: [PATCH 14/17] lintr --- R/r2.R | 8 +++----- R/r2_bayes.R | 42 +++++++++++++++++++--------------------- R/r2_coxsnell.R | 36 +++++++++++++++++----------------- R/test_bf.R | 20 +++++++++---------- R/test_likelihoodratio.R | 22 ++++++++++----------- R/test_performance.R | 16 +++++++-------- R/test_vuong.R | 10 +++++----- R/test_wald.R | 24 +++++++++++------------ 8 files changed, 87 insertions(+), 91 deletions(-) diff --git a/R/r2.R b/R/r2.R index dacdef614..2789b5efe 100644 --- a/R/r2.R +++ b/R/r2.R @@ -896,12 +896,10 @@ r2.DirichletRegModel <- function(model, ...) { m <- sum(w * f / sum(w)) mss <- sum(w * (f - m)^2) } + } else if (is.null(w)) { + mss <- sum(f^2) } else { - if (is.null(w)) { - mss <- sum(f^2) - } else { - mss <- sum(w * f^2) - } + mss <- sum(w * f^2) } if (is.null(w)) { rss <- sum(r^2) diff --git a/R/r2_bayes.R b/R/r2_bayes.R index 24e8269ba..71a44ef43 100644 --- a/R/r2_bayes.R +++ b/R/r2_bayes.R @@ -201,28 +201,26 @@ r2_posterior.brmsfit <- function(model, verbose = TRUE, ...) { }) names(br2) <- res } + } else if (mi$is_mixed) { + br2 <- list( + R2_Bayes = as.vector(rstantools::bayes_R2( + model, + re.form = NULL, + re_formula = NULL, + summary = FALSE + )), + R2_Bayes_marginal = as.vector(rstantools::bayes_R2( + model, + re.form = NA, + re_formula = NA, + summary = FALSE + )) + ) + names(br2$R2_Bayes) <- rep("Conditional R2", length(br2$R2_Bayes)) + names(br2$R2_Bayes_marginal) <- rep("Marginal R2", length(br2$R2_Bayes)) } else { - if (mi$is_mixed) { - br2 <- list( - R2_Bayes = as.vector(rstantools::bayes_R2( - model, - re.form = NULL, - re_formula = NULL, - summary = FALSE - )), - R2_Bayes_marginal = as.vector(rstantools::bayes_R2( - model, - re.form = NA, - re_formula = NA, - summary = FALSE - )) - ) - names(br2$R2_Bayes) <- rep("Conditional R2", length(br2$R2_Bayes)) - names(br2$R2_Bayes_marginal) <- rep("Marginal R2", length(br2$R2_Bayes)) - } else { - br2 <- list(R2_Bayes = as.vector(rstantools::bayes_R2(model, summary = FALSE))) - names(br2$R2_Bayes) <- rep("R2", length(br2$R2_Bayes)) - } + br2 <- list(R2_Bayes = as.vector(rstantools::bayes_R2(model, summary = FALSE))) + names(br2$R2_Bayes) <- rep("R2", length(br2$R2_Bayes)) } br2 @@ -400,7 +398,7 @@ as.data.frame.r2_bayes <- function(x, ...) { # remove sig and g cols params_theta <- params[, !grepl(pattern = "^sig2$|^g_|^g$", colnames(params))] - params_sigma <- sqrt(params[, grepl(pattern = "^sig2$", colnames(params))]) + params_sigma <- sqrt(params[, colnames(params) == "sig2"]) # Model Matrix mm <- insight::get_modelmatrix(model[1]) diff --git a/R/r2_coxsnell.R b/R/r2_coxsnell.R index d56803f5c..042e5461a 100644 --- a/R/r2_coxsnell.R +++ b/R/r2_coxsnell.R @@ -69,20 +69,20 @@ r2_coxsnell.glm <- function(model, verbose = TRUE, ...) { if (is.null(info)) { info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) } + # Cox & Snell's R2 is not defined for binomial models that are not Bernoulli models if (info$is_binomial && !info$is_bernoulli && class(model)[1] == "glm") { if (verbose) { insight::format_alert("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") } return(NULL) - } else { - # if no deviance, return NA - if (is.null(model$deviance)) { - return(NULL) - } - r2_coxsnell <- (1 - exp((model$deviance - model$null.deviance) / insight::n_obs(model, disaggregate = TRUE))) - names(r2_coxsnell) <- "Cox & Snell's R2" - r2_coxsnell } + # if no deviance, return NULL + if (is.null(model$deviance)) { + return(NULL) + } + r2_coxsnell <- (1 - exp((model$deviance - model$null.deviance) / insight::n_obs(model, disaggregate = TRUE))) + names(r2_coxsnell) <- "Cox & Snell's R2" + r2_coxsnell } #' @export @@ -95,22 +95,22 @@ r2_coxsnell.glmmTMB <- function(model, verbose = TRUE, ...) { if (is.null(info)) { info <- suppressWarnings(insight::model_info(model, verbose = FALSE)) } + # Cox & Snell's R2 is not defined for binomial models that are not Bernoulli models if (info$is_binomial && !info$is_bernoulli) { if (verbose) { insight::format_alert("Can't calculate accurate R2 for binomial models that are not Bernoulli models.") } return(NULL) - } else { - dev <- stats::deviance(model) - # if no deviance, return NA - if (is.null(dev)) { - return(NULL) - } - null_dev <- stats::deviance(insight::null_model(model)) - r2_coxsnell <- (1 - exp((dev - null_dev) / insight::n_obs(model, disaggregate = TRUE))) - names(r2_coxsnell) <- "Cox & Snell's R2" - r2_coxsnell } + dev <- stats::deviance(model) + # if no deviance, return NULL + if (is.null(dev)) { + return(NULL) + } + null_dev <- stats::deviance(insight::null_model(model)) + r2_coxsnell <- (1 - exp((dev - null_dev) / insight::n_obs(model, disaggregate = TRUE))) + names(r2_coxsnell) <- "Cox & Snell's R2" + r2_coxsnell } diff --git a/R/test_bf.R b/R/test_bf.R index 60cb3a129..38b0e421d 100644 --- a/R/test_bf.R +++ b/R/test_bf.R @@ -9,21 +9,21 @@ test_bf <- function(...) { #' @export test_bf.default <- function(..., reference = 1, text_length = NULL) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) - names(objects) <- match.call(expand.dots = FALSE)$`...` + my_objects <- insight::ellipsis_info(..., only_models = TRUE) + names(my_objects) <- match.call(expand.dots = FALSE)[["..."]] # validation checks (will throw error if non-valid objects) - .test_performance_checks(objects, multiple = FALSE) + .test_performance_checks(objects = my_objects, multiple = FALSE) - if (length(objects) == 1 && isTRUE(insight::is_model(objects))) { + if (length(my_objects) == 1 && isTRUE(insight::is_model(my_objects))) { insight::format_error( - "`test_bf()` is designed to compare multiple models together. For a single model, you might want to run `bayestestR::bf_parameters()` instead." + "`test_bf()` is designed to compare multiple models together. For a single model, you might want to run `bayestestR::bf_parameters()` instead." # nolint ) } # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_bf(objects, reference = reference, text_length = text_length) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_bf(my_objects, reference = reference, text_length = text_length) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -87,9 +87,9 @@ test_bf.ListModels <- function(objects, reference = 1, text_length = NULL, ...) if (all(bayesian_models)) { "yes" - } else if (!all(bayesian_models)) { - "no" - } else { + } else if (any(bayesian_models)) { "mixed" + } else { + "no" } } diff --git a/R/test_likelihoodratio.R b/R/test_likelihoodratio.R index 9784b13a9..529516487 100644 --- a/R/test_likelihoodratio.R +++ b/R/test_likelihoodratio.R @@ -22,29 +22,29 @@ test_lrt <- test_likelihoodratio #' @export test_likelihoodratio.default <- function(..., estimator = "OLS", verbose = TRUE) { # Attribute class to list - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # different default when mixed model or glm is included if (missing(estimator)) { - mixed_models <- sapply(objects, insight::is_mixed_model) - if (all(mixed_models) && all(sapply(objects, .is_lmer_reml)) && isTRUE(attributes(objects)$same_fixef)) { + mixed_models <- sapply(my_objects, insight::is_mixed_model) + if (all(mixed_models) && all(sapply(my_objects, .is_lmer_reml)) && isTRUE(attributes(my_objects)$same_fixef)) { estimator <- "REML" - } else if (any(mixed_models) || !all(attributes(objects)$is_linear)) { + } else if (any(mixed_models) || !all(attributes(my_objects)$is_linear)) { estimator <- "ML" } } # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, "ListNestedRegressions")) { - test_likelihoodratio(objects, estimator = estimator) - } else if (inherits(objects, "ListLavaan")) { - test_likelihoodratio_ListLavaan(..., objects = objects) # Because lavaanLRT requires the ellipsis + if (inherits(my_objects, "ListNestedRegressions")) { + test_likelihoodratio(my_objects, estimator = estimator) + } else if (inherits(my_objects, "ListLavaan")) { + test_likelihoodratio_ListLavaan(..., objects = my_objects) # Because lavaanLRT requires the ellipsis } else { insight::format_error( "The models are not nested, which is a prerequisite for `test_likelihoodratio()`.", @@ -106,7 +106,7 @@ test_likelihoodratio.ListNestedRegressions <- function(objects, estimator = "ML" same_fixef <- attributes(objects)$same_fixef # sort by df - if (!all(sort(dfs) == dfs) && !all(sort(dfs) == rev(dfs))) { + if (is.unsorted(dfs) && is.unsorted(rev(dfs))) { objects <- objects[order(dfs)] dfs <- sort(dfs, na.last = TRUE) } diff --git a/R/test_performance.R b/R/test_performance.R index b643143f7..f818b02dd 100644 --- a/R/test_performance.R +++ b/R/test_performance.R @@ -236,17 +236,17 @@ test_performance <- function(..., reference = 1, verbose = TRUE) { #' @export test_performance.default <- function(..., reference = 1, include_formula = FALSE, verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_performance(objects, reference = reference, include_formula = include_formula) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_performance(my_objects, reference = reference, include_formula = include_formula) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -421,10 +421,10 @@ test_performance.ListNonNestedRegressions <- function(objects, .test_performance_init <- function(objects, include_formula = FALSE) { - names <- insight::model_name(objects, include_formula = include_formula) + model_names <- insight::model_name(objects, include_formula = include_formula) out <- data.frame( Name = names(objects), - Model = names, + Model = model_names, stringsAsFactors = FALSE ) row.names(out) <- NULL @@ -453,7 +453,7 @@ test_performance.ListNonNestedRegressions <- function(objects, if (same_response && !inherits(objects, "ListLavaan") && isFALSE(attributes(objects)$same_response)) { insight::format_error( - "The models' dependent variables don't have the same data, which is a prerequisite to compare them. Probably the proportion of missing data differs between models." + "The models' dependent variables don't have the same data, which is a prerequisite to compare them. Probably the proportion of missing data differs between models." # nolint ) } diff --git a/R/test_vuong.R b/R/test_vuong.R index 449f55ddb..f0cce6d16 100644 --- a/R/test_vuong.R +++ b/R/test_vuong.R @@ -8,17 +8,17 @@ test_vuong <- function(..., verbose = TRUE) { #' @export test_vuong.default <- function(..., reference = 1, verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_vuong(objects, reference = reference) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_vuong(my_objects, reference = reference) } else { insight::format_error("The models cannot be compared for some reason :/") } diff --git a/R/test_wald.R b/R/test_wald.R index 0e043502a..ddfe21829 100644 --- a/R/test_wald.R +++ b/R/test_wald.R @@ -8,17 +8,17 @@ test_wald <- function(..., verbose = TRUE) { #' @export test_wald.default <- function(..., verbose = TRUE) { # Attribute class to list and get names from the global environment - objects <- insight::ellipsis_info(..., only_models = TRUE) + my_objects <- insight::ellipsis_info(..., only_models = TRUE) # validation checks (will throw error if non-valid objects) - objects <- .test_performance_checks(objects, verbose = verbose) + my_objects <- .test_performance_checks(my_objects, verbose = verbose) # ensure proper object names - objects <- .check_objectnames(objects, sapply(match.call(expand.dots = FALSE)$`...`, as.character)) + my_objects <- .check_objectnames(my_objects, sapply(match.call(expand.dots = FALSE)[["..."]], as.character)) # If a suitable class is found, run the more specific method on it - if (inherits(objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { - test_wald(objects) + if (inherits(my_objects, c("ListNestedRegressions", "ListNonNestedRegressions", "ListLavaan"))) { + test_wald(my_objects) } else { insight::format_error("The models cannot be compared for some reason :/") } @@ -37,10 +37,10 @@ test_wald.ListNestedRegressions <- function(objects, verbose = TRUE, ...) { ) } return(test_likelihoodratio(objects)) - } else { - out <- .test_wald(objects, test = "F") } + out <- .test_wald(objects, test = "F") + attr(out, "is_nested") <- TRUE class(out) <- c("test_performance", class(out)) out @@ -60,7 +60,7 @@ test_wald.ListNonNestedRegressions <- function(objects, verbose = TRUE, ...) { dfs <- sapply(objects, insight::get_df, type = "residual") # sort by df - if (!all(sort(dfs) == dfs) && !all(sort(dfs) == rev(dfs))) { + if (is.unsorted(dfs) && is.unsorted(rev(dfs))) { objects <- objects[order(dfs)] dfs <- sort(dfs, na.last = TRUE) } @@ -78,18 +78,18 @@ test_wald.ListNonNestedRegressions <- function(objects, verbose = TRUE, ...) { # Find reference-model related stuff refmodel <- order(dfs)[1] - scale <- dev[refmodel] / dfs[refmodel] + my_scale <- dev[refmodel] / dfs[refmodel] # test = "F" if (test == "F") { - f_value <- (dev_diff / dfs_diff) / scale + f_value <- (dev_diff / dfs_diff) / my_scale f_value[!is.na(f_value) & f_value < 0] <- NA # rather than p = 0 - out$`F` <- f_value + out[["F"]] <- f_value p <- stats::pf(f_value, abs(dfs_diff), dfs[refmodel], lower.tail = FALSE) # test = "LRT" } else { - chi2 <- dev_diff / scale * sign(dfs_diff) + chi2 <- dev_diff / my_scale * sign(dfs_diff) chi2[!is.na(chi2) & chi2 < 0] <- NA # rather than p = 0 out$Chi2 <- chi2 p <- stats::pchisq(chi2, abs(dfs_diff), lower.tail = FALSE) From 388609a0daeee159b68e3804185f0473cbd0df76 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 20:15:23 +0100 Subject: [PATCH 15/17] lintr --- R/icc.R | 42 ++++++++++++++++++--------------------- R/r2_loo.R | 58 ++++++++++++++++++++++++++---------------------------- 2 files changed, 47 insertions(+), 53 deletions(-) diff --git a/R/icc.R b/R/icc.R index 46c1a331a..47fbb9a45 100644 --- a/R/icc.R +++ b/R/icc.R @@ -532,14 +532,12 @@ print.icc_decomposed <- function(x, digits = 2, ...) { name_full = "ICC", verbose = TRUE) { vars <- tryCatch( - { - insight::get_variance(model, - name_fun = name_fun, - name_full = name_full, - tolerance = tolerance, - verbose = verbose - ) - }, + insight::get_variance(model, + name_fun = name_fun, + name_full = name_full, + tolerance = tolerance, + verbose = verbose + ), error = function(e) { if (inherits(e, c("simpleError", "error")) && verbose) { insight::print_color(e$message, "red") @@ -597,7 +595,7 @@ print.icc_decomposed <- function(x, digits = 2, ...) { # prepare arguments for "lme4::bootMer" .do_lme4_bootmer <- function(model, .boot_fun, iterations, dots) { insight::check_if_installed(c("lme4", "boot")) - args <- list( + my_args <- list( model, .boot_fun, nsim = iterations, @@ -608,25 +606,25 @@ print.icc_decomposed <- function(x, digits = 2, ...) { ) # add/overwrite dot-args if (!is.null(dots[["use.u"]])) { - args$use.u <- dots[["use.u"]] + my_args$use.u <- dots[["use.u"]] } if (!is.null(dots[["re.form"]])) { - args$re.form <- dots[["re.form"]] + my_args$re.form <- dots[["re.form"]] } if (!is.null(dots[["type"]])) { - args$type <- dots[["type"]] - if (args$type == "semiparametric") { - args$use.u <- TRUE + my_args$type <- dots[["type"]] + if (my_args$type == "semiparametric") { + my_args$use.u <- TRUE } } if (!is.null(dots[["parallel"]])) { - args$parallel <- dots[["parallel"]] + my_args$parallel <- dots[["parallel"]] } if (!is.null(dots[["ncpus"]])) { - args$ncpus <- dots[["ncpus"]] + my_args$ncpus <- dots[["ncpus"]] } # bootsrap - do.call(lme4::bootMer, args) + do.call(lme4::bootMer, args = my_args) } @@ -664,12 +662,10 @@ print.icc_decomposed <- function(x, digits = 2, ...) { } model_rank <- tryCatch( - { - if (!is.null(model$rank)) { - model$rank - df_int - } else { - insight::n_parameters(model) - df_int - } + if (!is.null(model$rank)) { + model$rank - df_int + } else { + insight::n_parameters(model) - df_int }, error = function(e) insight::n_parameters(model) - df_int ) diff --git a/R/r2_loo.R b/R/r2_loo.R index 040d9b572..dbcaf38b3 100644 --- a/R/r2_loo.R +++ b/R/r2_loo.R @@ -50,10 +50,10 @@ r2_loo <- function(model, robust = TRUE, ci = 0.95, verbose = TRUE, ...) { loo_r2 <- structure( class = "r2_loo", lapply(loo_r2, ifelse(robust, stats::median, mean)), - "SE" = lapply(loo_r2, ifelse(robust, stats::mad, stats::sd)), + SE = lapply(loo_r2, ifelse(robust, stats::mad, stats::sd)), # "Estimates" = lapply(r2_bayesian, bayestestR::point_estimate, centrality = "all", dispersion = TRUE), - "CI" = lapply(loo_r2, bayestestR::hdi, ci = ci), - "robust" = robust + CI = lapply(loo_r2, bayestestR::hdi, ci = ci), + robust = robust ) return(loo_r2) } @@ -84,13 +84,13 @@ r2_loo_posterior.brmsfit <- function(model, verbose = TRUE, ...) { res <- insight::find_response(model) if (mi[[1]]$is_mixed) { br2_mv <- list( - "R2_loo" = rstantools::loo_R2( + R2_loo = rstantools::loo_R2( model, re.form = NULL, re_formula = NULL, summary = FALSE ), - "R2_loo_marginal" = rstantools::loo_R2( + R2_loo_marginal = rstantools::loo_R2( model, re.form = NA, re_formula = NA, @@ -99,40 +99,38 @@ r2_loo_posterior.brmsfit <- function(model, verbose = TRUE, ...) { ) br2 <- lapply(seq_along(res), function(x) { list( - "R2_loo" = unname(as.vector(br2_mv$R2_loo[, x])), - "R2_loo_marginal" = unname(as.vector(br2_mv$R2_loo_marginal[, x])) + R2_loo = unname(as.vector(br2_mv$R2_loo[, x])), + R2_loo_marginal = unname(as.vector(br2_mv$R2_loo_marginal[, x])) ) }) names(br2) <- res } else { - br2_mv <- list("R2_loo" = rstantools::loo_R2(model, summary = FALSE)) + br2_mv <- list(R2_loo = rstantools::loo_R2(model, summary = FALSE)) br2 <- lapply(seq_along(res), function(x) { - list("R2_loo" = unname(as.vector(br2_mv$R2_loo[, x]))) + list(R2_loo = unname(as.vector(br2_mv$R2_loo[, x]))) }) names(br2) <- res } + } else if (mi$is_mixed) { + br2 <- list( + R2_loo = as.vector(rstantools::loo_R2( + model, + re.form = NULL, + re_formula = NULL, + summary = FALSE + )), + R2_loo_marginal = as.vector(rstantools::loo_R2( + model, + re.form = NA, + re_formula = NA, + summary = FALSE + )) + ) + names(br2$R2_loo) <- rep("Conditional R2_adjusted", length(br2$R2_loo)) + names(br2$R2_loo_marginal) <- rep("Marginal R2_adjusted", length(br2$R2_loo)) } else { - if (mi$is_mixed) { - br2 <- list( - "R2_loo" = as.vector(rstantools::loo_R2( - model, - re.form = NULL, - re_formula = NULL, - summary = FALSE - )), - "R2_loo_marginal" = as.vector(rstantools::loo_R2( - model, - re.form = NA, - re_formula = NA, - summary = FALSE - )) - ) - names(br2$R2_loo) <- rep("Conditional R2_adjusted", length(br2$R2_loo)) - names(br2$R2_loo_marginal) <- rep("Marginal R2_adjusted", length(br2$R2_loo)) - } else { - br2 <- list("R2_loo" = as.vector(rstantools::loo_R2(model, summary = FALSE))) - names(br2$R2_loo) <- rep("R2_adjusted", length(br2$R2_loo)) - } + br2 <- list(R2_loo = as.vector(rstantools::loo_R2(model, summary = FALSE))) + names(br2$R2_loo) <- rep("R2_adjusted", length(br2$R2_loo)) } br2 From f23bda836613e62f511ac5515163ba21f26c438e Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 21:24:43 +0100 Subject: [PATCH 16/17] minor fixes, lintr --- R/check_model_diagnostics.R | 74 +++++++++++++--------------- tests/testthat/test-check_outliers.R | 3 +- 2 files changed, 35 insertions(+), 42 deletions(-) diff --git a/R/check_model_diagnostics.R b/R/check_model_diagnostics.R index 6e398efa8..754616ee6 100644 --- a/R/check_model_diagnostics.R +++ b/R/check_model_diagnostics.R @@ -94,29 +94,25 @@ insight::check_if_installed("lme4") tryCatch( - { - if (inherits(model, "glmmTMB")) { - var_attr <- "condVar" - re <- .collapse_cond(lme4::ranef(model, condVar = TRUE)) - } else { - var_attr <- "postVar" - re <- lme4::ranef(model, condVar = TRUE) - } + if (inherits(model, "glmmTMB")) { + var_attr <- "condVar" + re <- .collapse_cond(lme4::ranef(model, condVar = TRUE)) + } else { + var_attr <- "postVar" + re <- lme4::ranef(model, condVar = TRUE) }, error = function(e) { - return(NULL) + NULL } ) se <- tryCatch( - { - suppressWarnings(lapply(re, function(.x) { - pv <- attr(.x, var_attr, exact = TRUE) - cols <- seq_len(dim(pv)[1]) - unlist(lapply(cols, function(.y) sqrt(pv[.y, .y, ]))) - })) - }, + suppressWarnings(lapply(re, function(.x) { + pv <- attr(.x, var_attr, exact = TRUE) + cols <- seq_len(dim(pv)[1]) + unlist(lapply(cols, function(.y) sqrt(pv[.y, .y, ]))) + })), error = function(e) { NULL } @@ -186,15 +182,15 @@ n_params <- tryCatch(model$rank, error = function(e) insight::n_parameters(model)) infl <- stats::influence(model, do.coef = FALSE) - resid <- as.numeric(insight::get_residuals(model)) + model_resid <- as.numeric(insight::get_residuals(model)) - std_resid <- tryCatch(stats::rstandard(model, infl), error = function(e) resid) + std_resid <- tryCatch(stats::rstandard(model, infl), error = function(e) model_resid) plot_data <- data.frame( Hat = infl$hat, Cooks_Distance = stats::cooks.distance(model, infl), Fitted = insight::get_predicted(model, ci = NULL), - Residuals = resid, + Residuals = model_resid, Std_Residuals = std_resid, stringsAsFactors = FALSE ) @@ -213,12 +209,10 @@ .diag_ncv <- function(model, verbose = TRUE) { ncv <- tryCatch( - { - data.frame( - x = as.numeric(stats::fitted(model)), - y = as.numeric(stats::residuals(model)) - ) - }, + data.frame( + x = as.numeric(stats::fitted(model)), + y = as.numeric(stats::residuals(model)) + ), error = function(e) { NULL } @@ -244,24 +238,22 @@ .diag_homogeneity <- function(model, verbose = TRUE) { faminfo <- insight::model_info(model) r <- tryCatch( - { - if (inherits(model, "merMod")) { - stats::residuals(model, scaled = TRUE) - } else if (inherits(model, "gam")) { - stats::residuals(model, type = "scaled.pearson") - } else if (inherits(model, c("glmmTMB", "MixMod"))) { - sigma <- if (faminfo$is_mixed) { - sqrt(insight::get_variance_residual(model)) - } else { - .sigma_glmmTMB_nonmixed(model, faminfo) - } - stats::residuals(model) / sigma - } else if (inherits(model, "glm")) { - ## TODO: check if we can / should use deviance residuals (as for QQ plots) here as well? - stats::rstandard(model, type = "pearson") + if (inherits(model, "merMod")) { + stats::residuals(model, scaled = TRUE) + } else if (inherits(model, "gam")) { + stats::residuals(model, type = "scaled.pearson") + } else if (inherits(model, c("glmmTMB", "MixMod"))) { + residual_sigma <- if (faminfo$is_mixed) { + sqrt(insight::get_variance_residual(model)) } else { - stats::rstandard(model) + .sigma_glmmTMB_nonmixed(model, faminfo) } + stats::residuals(model) / residual_sigma + } else if (inherits(model, "glm")) { + ## TODO: check if we can / should use deviance residuals (as for QQ plots) here as well? + stats::rstandard(model, type = "pearson") + } else { + stats::rstandard(model) }, error = function(e) { NULL diff --git a/tests/testthat/test-check_outliers.R b/tests/testthat/test-check_outliers.R index 02c6804a5..259f3eac9 100644 --- a/tests/testthat/test-check_outliers.R +++ b/tests/testthat/test-check_outliers.R @@ -210,7 +210,8 @@ test_that("all methods which", { ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95, mahalanobis = 20, mahalanobis_robust = 25, mcd = 25, optics = 14, lof = 0.005 - ) + ), + verbose = FALSE )), as.integer(c(9, 15, 16, 19, 20, 28, 29, 31)) ) From 092a6a3f238a3da2f990a058a78d9e2a5cc6c9a2 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 4 Feb 2024 21:25:10 +0100 Subject: [PATCH 17/17] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2b35751ec..9391ecf34 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.10.8.10 +Version: 0.10.8.11 Authors@R: c(person(given = "Daniel", family = "Lüdecke",