Skip to content

Commit

Permalink
Draft seek_variables
Browse files Browse the repository at this point in the history
  • Loading branch information
strengejacke committed Sep 12, 2023
1 parent ae7df24 commit cec92d9
Show file tree
Hide file tree
Showing 8 changed files with 216 additions and 4 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: datawizard
Title: Easy Data Wrangling and Statistical Transformations
Version: 0.8.0.12
Version: 0.8.0.13
Authors@R: c(
person("Indrajeet", "Patil", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ export(row_means)
export(row_to_colnames)
export(rowid_as_column)
export(rownames_as_column)
export(seek_variables)
export(skewness)
export(slide)
export(smoothness)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ NEW FUNCTIONS
* `means_by_group()`, to compute mean values of variables, grouped by levels
of specified factors.

* `seek_variables()`, to seek for variables in a data frame, based on their
column names, variables labels, value labels or factor levels. Searching for
labels only works for "labelled" data, i.e. when variables have a `label` or
`labels` attribute.

CHANGES

* `recode_into()` gains an `overwrite` argument to skip overwriting already
Expand Down
145 changes: 145 additions & 0 deletions R/seek_variables.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#' @title Find variables by its name, variable or value labels
#' @name seek_variables
#'
#' @description This functions seeks variables in a data frame, based on patterns
#' that either match the variable name (column name), variable labels, value labels
#' or factor levels. Matching variable and value labels only works for "labelled"
#' data, i.e. when the variables either have a `label` attribute or `labels`
#' attribute.
#'
#' @param data A data frame.
#' @param pattern Character string (regular expression) to be matched in `data`.
#' May also be a character vector of length > 1. `pattern` is searched for in
#' column names, variable label and value labels attributes, or factor levels of
#' variables in `data`.
#' @param source Character vector, indicating where `pattern` is sought. Use one
#' of more of the following options:
#'
#' - `"name"`: searches in column names.
#' - `"labels"`: searches in variable labels. Only applies when a `label` attribute
#' is set for a variable.
#' - `"values"`: searches in value labels or factor levels. Only applies when a
#' `labels` attribute is set for a variable, or if a variable is a factor.
#' - `"all"`: searches in all of the above.
#' @param fuzzy Logical, if `TRUE`, "fuzzy matching" (partial and close distance
#' matching) will be used to find `pattern`.
#'
#' @return A data frame with three columns: the column index, the column name
#' and - if available - the variable label of all matched variables in `data`.
#'
#' @examples
#' # seek variables with "Length" in variable name or labels
#' seek_variables(iris, "Length")
#'
#' # seek variables with "dependency" in names or labels
#' # column "e42dep" has a label-attribute "elder's dependency"
#' data(efc)
#' seek_variables(efc, "dependency")
#'
#' # "female" only appears as value label attribute - default search is in
#' # variable names and labels only, so no match
#' seek_variables(efc, "female")
#' # when we seek in all sources, we find the variable "e16sex"
#' seek_variables(efc, "female", source = "all")
#'
#' # typo, no match
#' seek_variables(iris, "Lenght")
#' # typo, fuzzy match
#' seek_variables(iris, "Lenght", fuzzy = TRUE)
#' @export
seek_variables <- function(data, pattern, source = c("names", "labels"), fuzzy = FALSE) {
# check valid args
if (!is.data.frame(data)) {
insight::format_error("`data` must be a data frame.")
}

# check valid args
source <- intersect(source, c("names", "labels", "values", "levels", "column_names", "columns", "all"))
if (is.null(source) || !length(source)) {
insight::format_error("`source` must be one of \"names\", \"labels\", \"values\", a combination of these options, or \"all\".")
}

pos1 <- pos2 <- pos3 <- NULL

pos <- unlist(lapply(pattern, function(search_pattern) {
# search in variable names?
if (any(source %in% c("names", "columns", "column_names", "all"))) {
pos1 <- which(grepl(search_pattern, colnames(data)))
# find in near distance?
if (fuzzy) {
pos1 <- c(pos1, .fuzzy_grep(x = colnames(data), pattern = search_pattern))
}
}

# search in variable labels?
if (any(source %in% c("labels", "all"))) {
labels <- insight::compact_character(unlist(lapply(data, attr, which = "label", exact = TRUE)))
if (!is.null(labels) && length(labels)) {
found <- grepl(search_pattern, labels)
pos2 <- match(names(labels)[found], colnames(data))
# find in near distanc?
if (fuzzy) {
pos2 <- c(pos2, .fuzzy_grep(x = labels, pattern = search_pattern))
}
}
}

# search for pattern in value labels or levels?
if (any(source %in% c("values", "levels", "all"))) {
values <- insight::compact_list(lapply(data, function(i) {
l <- attr(i, "labels", exact = TRUE)
if (is.null(l) && is.factor(i)) {
levels(i)
} else {
names(l)
}
}))
if (!is.null(values) && length(values)) {
found <- vapply(values, function(i) any(grepl(search_pattern, i)), logical(1))
pos3 <- match(names(found)[found], colnames(data))
# find in near distance
if (fuzzy) {
pos3 <- which(vapply(
values,
function(i) {
p <- .fuzzy_grep(
x = i,
pattern = search_pattern
)
!insight::is_empty_object(p[1])
},
logical(1)
))
}
}
}
# get unique variable indices
c(pos1, pos2, pos3)
}))

# clean up
pos <- unique(pos)
# remove -1
pos <- pos[which(pos != -1)]

# variable labels of matching variables
labels <- vapply(
colnames(data[pos]),
function(i) {
l <- attr(data[[i]], "label", exact = TRUE)
if (is.null(l)) {
i
} else {
l
}
},
character(1)
)

data.frame(
index = pos,
column = colnames(data)[pos],
labels = labels,
stringsAsFactors = FALSE
)
}
2 changes: 1 addition & 1 deletion man/data_peek.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/data_read.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/recode_into.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

61 changes: 61 additions & 0 deletions man/seek_variables.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit cec92d9

Please sign in to comment.