diff --git a/DESCRIPTION b/DESCRIPTION index 23190a9e4..114e31a10 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: performance Title: Assessment of Regression Models Performance -Version: 0.12.4.12 +Version: 0.12.4.13 Authors@R: c(person(given = "Daniel", family = "Lüdecke", diff --git a/NEWS.md b/NEWS.md index a181937d8..321412b00 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,9 @@ ## Breaking changes +* `check_outliers()` with `method = "optics"` now returns a further refined + cluster selection, by passing the `optics_xi` argument to `dbscan::extractXi()`. + * Deprecated arguments and alias-function-names have been removed. * Argument names in `check_model()` that refer to plot-aesthetics (like diff --git a/R/check_outliers.R b/R/check_outliers.R index ffb2d633f..0e68d1b18 100644 --- a/R/check_outliers.R +++ b/R/check_outliers.R @@ -198,7 +198,8 @@ #' extreme values), this algorithm functions in a different manner and won't #' always detect outliers. Note that `method = "optics"` requires the #' **dbscan** package to be installed, and that it takes some time to compute -#' the results. +#' the results. Additionally, the `optics_xi` (default to 0.05) is passed to +#' the [dbscan::extractXi()] function to further refine the cluster selection. #' #' - **Local Outlier Factor**: #' Based on a K nearest neighbors algorithm, LOF compares the local density of @@ -242,6 +243,7 @@ #' mcd = stats::qchisq(p = 1 - 0.001, df = ncol(x)), #' ics = 0.001, #' optics = 2 * ncol(x), +#' optics_xi = 0.05, #' lof = 0.001 #' ) #' ``` @@ -881,6 +883,13 @@ check_outliers.data.frame <- function(x, } else if (is.numeric(threshold)) { thresholds <- .check_outliers_thresholds(x) thresholds <- lapply(thresholds, function(x) threshold) + # need to fix this manually - "optics" automatically includes method + # "optics_xi", which is allowed to range between 0 and 1 - since values + # for "optics" can be > 1, it might overwrite "optics_xi" with an invalid + # value... + if (thresholds$optics_xi > 1) { + thresholds$optics_xi <- 0.05 + } } else { insight::format_error( paste( @@ -890,7 +899,13 @@ check_outliers.data.frame <- function(x, ) } - thresholds <- thresholds[names(thresholds) %in% method] + # Keep only relevant threshold + valid <- method + if("optics" %in% valid) { + valid <- c(valid, "optics_xi") + method <- c(method, "optics_xi") + } + thresholds <- thresholds[names(thresholds) %in% valid] out.meta <- .check_outliers.data.frame_method(x, method, thresholds, ID, ID.names, ...) out <- out.meta$out @@ -1207,7 +1222,8 @@ check_outliers.data.frame <- function(x, out <- c(out, .check_outliers_optics( x, threshold = thresholds$optics, - ID.names = ID.names + ID.names = ID.names, + xi = thresholds$optics_xi )) count.table <- datawizard::data_filter( @@ -1506,38 +1522,23 @@ check_outliers.DHARMa <- check_outliers.performance_simres } .check_outliers_thresholds_nowarn <- function(x) { - zscore <- stats::qnorm(p = 1 - 0.001 / 2) - zscore_robust <- stats::qnorm(p = 1 - 0.001 / 2) - iqr <- 1.7 - ci <- 1 - 0.001 - eti <- 1 - 0.001 - hdi <- 1 - 0.001 - bci <- 1 - 0.001 - cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x)) - pareto <- 0.7 - mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) - mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) - mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x)) - ics <- 0.001 - optics <- 2 * ncol(x) - lof <- 0.001 - list( - zscore = zscore, - zscore_robust = zscore_robust, - iqr = iqr, - ci = ci, - hdi = hdi, - eti = eti, - bci = bci, - cook = cook, - pareto = pareto, - mahalanobis = mahalanobis_value, - mahalanobis_robust = mahalanobis_robust, - mcd = mcd, - ics = ics, - optics = optics, - lof = lof + zscore = stats::qnorm(p = 1 - 0.001 / 2), + zscore_robust = stats::qnorm(p = 1 - 0.001 / 2), + iqr = 1.7, + ci = 1 - 0.001, + hdi = 1 - 0.001, + eti = 1 - 0.001, + bci = 1 - 0.001, + cook = stats::qf(0.5, ncol(x), nrow(x) - ncol(x)), + pareto = 0.7, + mahalanobis = stats::qchisq(p = 1 - 0.001, df = ncol(x)), + mahalanobis_robust = stats::qchisq(p = 1 - 0.001, df = ncol(x)), + mcd = stats::qchisq(p = 1 - 0.001, df = ncol(x)), + ics = 0.001, + optics = 2 * ncol(x), + optics_xi = 0.05, + lof = 0.001 ) } @@ -1929,7 +1930,8 @@ check_outliers.DHARMa <- check_outliers.performance_simres .check_outliers_optics <- function(x, threshold = NULL, - ID.names = NULL) { + ID.names = NULL, + xi = 0.05) { out <- data.frame(Row = seq_len(nrow(x))) if (!is.null(ID.names)) { @@ -1940,7 +1942,7 @@ check_outliers.DHARMa <- check_outliers.performance_simres # Compute rez <- dbscan::optics(x, minPts = threshold) - rez <- dbscan::extractXi(rez, xi = 0.05) # TODO: find automatic way of setting xi + rez <- dbscan::extractXi(rez, xi = xi) # TODO: find automatic way of setting xi out$Distance_OPTICS <- rez$coredist diff --git a/man/check_outliers.Rd b/man/check_outliers.Rd index 623eae4b2..489dbafc3 100644 --- a/man/check_outliers.Rd +++ b/man/check_outliers.Rd @@ -236,7 +236,8 @@ detect several outliers (as these are usually defined as a percentage of extreme values), this algorithm functions in a different manner and won't always detect outliers. Note that \code{method = "optics"} requires the \strong{dbscan} package to be installed, and that it takes some time to compute -the results. +the results. Additionally, the \code{optics_xi} (default to 0.05) is passed to +the \code{\link[dbscan:optics]{dbscan::extractXi()}} function to further refine the cluster selection. \item \strong{Local Outlier Factor}: Based on a K nearest neighbors algorithm, LOF compares the local density of a point to the local densities of its neighbors instead of computing a @@ -283,6 +284,7 @@ Default thresholds are currently specified as follows: mcd = stats::qchisq(p = 1 - 0.001, df = ncol(x)), ics = 0.001, optics = 2 * ncol(x), + optics_xi = 0.05, lof = 0.001 ) }\if{html}{\out{}}