Skip to content

Commit

Permalink
lintr, add test
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke committed Feb 3, 2024
1 parent e451836 commit a13c985
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 54 deletions.
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
* `performance_score()` should no longer fail for models where scoring rules
can't be calculated. Instead, an informative message is returned.

* `check_outliers()` now properly accept the `percentage_central` argument when using the `"mcd"` method.
* `check_outliers()` now properly accept the `percentage_central` argument when
using the `"mcd"` method.

# performance 0.10.8

Expand Down
92 changes: 47 additions & 45 deletions R/check_outliers.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
#' @param ID Optional, to report an ID column along with the row number.
#' @param ... When `method = "ics"`, further arguments in `...` are passed
#' down to [ICSOutlier::ics.outlier()]. When `method = "mahalanobis"`,
#' they are passed down to [stats::mahalanobis()].
#' they are passed down to [stats::mahalanobis()]. `percentage_central` can
#' be specified when `method = "mcd"`.
#'
#' @return A logical vector of the detected outliers with a nice printing
#' method: a check (message) on whether outliers were detected or not. The
Expand Down Expand Up @@ -388,16 +389,16 @@ check_outliers.default <- function(x,
)

# Get data
data <- insight::get_data(x, verbose = FALSE)
my_data <- insight::get_data(x, verbose = FALSE)

# Remove non-numerics
data <- datawizard::data_select(data, select = is.numeric)
my_data <- datawizard::data_select(my_data, select = is.numeric)

# Thresholds
if (is.null(threshold)) {
thresholds <- .check_outliers_thresholds(data)
thresholds <- .check_outliers_thresholds(my_data)
} else if (is.list(threshold)) {
thresholds <- .check_outliers_thresholds(data)
thresholds <- .check_outliers_thresholds(my_data)
thresholds[names(threshold)] <- threshold[names(threshold)]
} else {
insight::format_error(
Expand All @@ -414,15 +415,15 @@ check_outliers.default <- function(x,

# Others
if (all(method %in% c("cook", "pareto"))) {
df <- data.frame(Row = seq_len(nrow(as.data.frame(data))))
my_df <- data.frame(Row = seq_len(nrow(as.data.frame(my_data))))
outlier_count <- list()
outlier_var <- list()
} else {
out <- check_outliers(data, method, threshold)
out <- check_outliers(my_data, method, threshold)
outlier_var <- attributes(out)$outlier_var
outlier_count <- attributes(out)$outlier_count
df <- attributes(out)$data
df <- df[!names(df) == "Outlier"]
my_df <- attributes(out)$data
my_df <- my_df[names(my_df) != "Outlier"]
}

# Cook
Expand All @@ -432,7 +433,7 @@ check_outliers.default <- function(x,
threshold = thresholds$cook
)$data_cook

df <- datawizard::data_merge(list(df, data_cook),
my_df <- datawizard::data_merge(list(my_df, data_cook),
join = "full",
by = "Row"
)
Expand Down Expand Up @@ -462,7 +463,7 @@ check_outliers.default <- function(x,
)
}
} else {
method <- method[!(method == "cook")]
method <- method[method != "cook"]
}

# Pareto
Expand All @@ -472,7 +473,7 @@ check_outliers.default <- function(x,
threshold = thresholds$pareto
)$data_pareto

df <- datawizard::data_merge(list(df, data_pareto),
my_df <- datawizard::data_merge(list(my_df, data_pareto),
join = "full",
by = "Row"
)
Expand Down Expand Up @@ -502,7 +503,7 @@ check_outliers.default <- function(x,
)
}
} else {
method <- method[!(method == "pareto")]
method <- method[method != "pareto"]
}

outlier_count$all <- datawizard::convert_na_to(outlier_count$all,
Expand Down Expand Up @@ -534,21 +535,21 @@ check_outliers.default <- function(x,
thresholds <- thresholds[names(thresholds) %in% method]

# Composite outlier score
df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)])
df <- df[c(names(df)[names(df) != "Outlier"], "Outlier")]
my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)])
my_df <- my_df[c(names(my_df)[names(my_df) != "Outlier"], "Outlier")]

# Out
outlier <- df$Outlier > 0.5
outlier <- my_df$Outlier > 0.5

# Attributes
class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier))
attr(outlier, "data") <- df
attr(outlier, "data") <- my_df
attr(outlier, "threshold") <- thresholds
attr(outlier, "method") <- method
attr(outlier, "text_size") <- 3
attr(outlier, "influential_obs") <- .influential_obs(x)
attr(outlier, "variables") <- "(Whole model)"
attr(outlier, "raw_data") <- data
attr(outlier, "raw_data") <- my_data
attr(outlier, "outlier_var") <- outlier_var
attr(outlier, "outlier_count") <- outlier_count

Expand Down Expand Up @@ -801,7 +802,7 @@ check_outliers.data.frame <- function(x,
)

# Remove non-numerics
data <- x
my_data <- x
x <- x[, vapply(x, is.numeric, logical(1)), drop = FALSE]

# Check args
Expand Down Expand Up @@ -844,20 +845,20 @@ check_outliers.data.frame <- function(x,
outlier_var <- out.meta$outlier_var

# Combine outlier data
df <- out[vapply(out, is.data.frame, logical(1))]
if (length(df) > 1 && !is.null(ID)) {
df <- datawizard::data_merge(df, by = c("Row", ID))
} else if (length(df) > 1) {
df <- datawizard::data_merge(df, by = "Row")
my_df <- out[vapply(out, is.data.frame, logical(1))]
if (length(my_df) > 1 && !is.null(ID)) {
my_df <- datawizard::data_merge(my_df, by = c("Row", ID))
} else if (length(my_df) > 1) {
my_df <- datawizard::data_merge(my_df, by = "Row")
} else {
df <- df[[1]]
my_df <- my_df[[1]]
}

# Composite outlier score
df$Outlier <- rowMeans(df[grepl("Outlier_", names(df), fixed = TRUE)])
my_df$Outlier <- rowMeans(my_df[grepl("Outlier_", names(my_df), fixed = TRUE)])

# Out
outlier <- df$Outlier > 0.5
outlier <- my_df$Outlier > 0.5

# Combine outlier frequency table
if (length(outlier_count) > 1 && !is.null(ID)) {
Expand Down Expand Up @@ -899,12 +900,12 @@ check_outliers.data.frame <- function(x,

# Attributes
class(outlier) <- c("check_outliers", "see_check_outliers", class(outlier))
attr(outlier, "data") <- df
attr(outlier, "data") <- my_df
attr(outlier, "threshold") <- thresholds
attr(outlier, "method") <- method
attr(outlier, "text_size") <- 3
attr(outlier, "variables") <- names(x)
attr(outlier, "raw_data") <- data
attr(outlier, "raw_data") <- my_data
attr(outlier, "outlier_var") <- outlier_var
attr(outlier, "outlier_count") <- outlier_count
outlier
Expand All @@ -917,7 +918,7 @@ check_outliers.data.frame <- function(x,
outlier.list <- lapply(outlier.list, function(x) {
x[x[[Outlier_method]] >= 0.5, ]
})
outlier.list <- outlier.list[lapply(outlier.list, nrow) > 0]
outlier.list <- outlier.list[vapply(outlier.list, nrow, numeric(1)) > 0]
outlier.list <- lapply(outlier.list, datawizard::data_remove,
Outlier_method,
as_data_frame = TRUE
Expand Down Expand Up @@ -1220,7 +1221,7 @@ check_outliers.grouped_df <- function(x,
}

# Initialize elements
data <- data.frame()
my_data <- data.frame()
out <- NULL
thresholds <- list()
outlier_var <- list()
Expand All @@ -1229,24 +1230,24 @@ check_outliers.grouped_df <- function(x,
# Loop through groups
for (i in seq_along(grps)) {
rows <- grps[[i]]
subset <- check_outliers(
outliers_subset <- check_outliers(
as.data.frame(x[rows, ]),
method = method,
threshold = threshold,
ID = ID,
...
)
data <- rbind(data, as.data.frame(subset))
out <- c(out, subset)
thresholds[[paste0("group_", i)]] <- attributes(subset)$threshold
my_data <- rbind(my_data, as.data.frame(outliers_subset))
out <- c(out, outliers_subset)
thresholds[[paste0("group_", i)]] <- attributes(outliers_subset)$threshold
outlier_var[[i]] <- lapply(
attributes(subset)$outlier_var, lapply, function(y) {
attributes(outliers_subset)$outlier_var, lapply, function(y) {
y$Row <- rows[which(seq_along(rows) %in% y$Row)]
y
}
)
outlier_count[[i]] <- lapply(
attributes(subset)$outlier_count, function(y) {
attributes(outliers_subset)$outlier_count, function(y) {
y$Row <- rows[which(seq_along(rows) %in% y$Row)]
y
}
Expand All @@ -1267,16 +1268,16 @@ check_outliers.grouped_df <- function(x,
info$groups$.rows[[x]] <- as.data.frame(info$groups$.rows[[x]])
})

data[names(info$groups)[1]] <- do.call(rbind, groups)
data <- datawizard::data_relocate(
data,
my_data[names(info$groups)[1]] <- do.call(rbind, groups)
my_data <- datawizard::data_relocate(
my_data,
select = names(info$groups)[1],
after = "Row"
)
data$Row <- seq_len(nrow(data))
my_data$Row <- seq_len(nrow(my_data))

class(out) <- c("check_outliers", "see_check_outliers", class(out))
attr(out, "data") <- data
attr(out, "data") <- my_data
attr(out, "method") <- method
attr(out, "threshold") <- thresholds[[1]]
attr(out, "text_size") <- 3
Expand Down Expand Up @@ -1437,7 +1438,7 @@ check_outliers.metabin <- check_outliers.metagen
bci <- 1 - 0.001
cook <- stats::qf(0.5, ncol(x), nrow(x) - ncol(x))
pareto <- 0.7
mahalanobis <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mahalanobis_value <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mahalanobis_robust <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
mcd <- stats::qchisq(p = 1 - 0.001, df = ncol(x))
ics <- 0.001
Expand All @@ -1454,7 +1455,7 @@ check_outliers.metabin <- check_outliers.metagen
bci = bci,
cook = cook,
pareto = pareto,
mahalanobis = mahalanobis,
mahalanobis = mahalanobis_value,
mahalanobis_robust = mahalanobis_robust,
mcd = mcd,
ics = ics,
Expand Down Expand Up @@ -1730,7 +1731,8 @@ check_outliers.metabin <- check_outliers.metagen
.check_outliers_mcd <- function(x,
threshold = stats::qchisq(p = 1 - 0.001, df = ncol(x)),
percentage_central = 0.75,
ID.names = NULL) {
ID.names = NULL,
...) {
out <- data.frame(Row = seq_len(nrow(x)))

if (!is.null(ID.names)) {
Expand Down
20 changes: 12 additions & 8 deletions tests/testthat/test-check_outliers.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ test_that("mcd which", {
tail(which(check_outliers(mtcars[1:4], method = "mcd", threshold = 45))),
31L
)
out <- check_outliers(mtcars, method = "mcd")
expect_identical(sum(out), 8)
out <- check_outliers(mtcars, method = "mcd", percentage_central = 0.5)
expect_identical(sum(out), 15)
})

## FIXME: Fails on CRAN/windows
Expand Down Expand Up @@ -197,10 +201,10 @@ test_that("all methods which", {
"mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof"
),
threshold = list(
"zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2,
"ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95,
"mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25,
"optics" = 14, "lof" = 0.005
zscore = 2.2, zscore_robust = 2.2, iqr = 1.2,
ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95,
mahalanobis = 20, mahalanobis_robust = 25, mcd = 25,
optics = 14, lof = 0.005
)
)),
as.integer(c(9, 15, 16, 19, 20, 28, 29, 31))
Expand All @@ -219,10 +223,10 @@ test_that("multiple methods with ID", {
"mahalanobis", "mahalanobis_robust", "mcd", "optics", "lof"
),
threshold = list(
"zscore" = 2.2, "zscore_robust" = 2.2, "iqr" = 1.2,
"ci" = 0.95, "eti" = 0.95, "hdi" = 0.90, "bci" = 0.95,
"mahalanobis" = 20, "mahalanobis_robust" = 25, "mcd" = 25,
"optics" = 14, "lof" = 0.005
zscore = 2.2, zscore_robust = 2.2, iqr = 1.2,
ci = 0.95, eti = 0.95, hdi = 0.90, bci = 0.95,
mahalanobis = 20, mahalanobis_robust = 25, mcd = 25,
optics = 14, lof = 0.005
),
ID = "car"
))
Expand Down

0 comments on commit a13c985

Please sign in to comment.