Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[188037693]: fix bugs with as.data.frame(ds, force = TRUE) #646

Merged
merged 12 commits into from
Aug 21, 2024
Merged
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ Suggests:
VignetteBuilder: knitr
Language: en-US
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE)
LazyData: true
Collate:
Expand Down
180 changes: 138 additions & 42 deletions R/as-data-frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
#' levels matching the Crunch Variable's categories (the default), or, if
#' `categorical.mode` is specified as "id" or "numeric", a numeric vector of
#' category ids or numeric values, respectively
#' * Array variables (Categorical Array, Multiple Response) are decomposed into
#' their constituent categorical subvariables. An array with three subvariables,
#' for example, will result in three columns in the `data.frame`.
#' * Array variables (Categorical Array, Multiple Response) can be decomposed into
#' their constituent categorical subvariables or put in 'packed' data.frame columns,
#' see the `array_strategy` argument.
#'
#' Column names in the `data.frame` are the variable/subvariable aliases.
#'
Expand All @@ -46,6 +46,14 @@
#' @param categorical.mode what mode should categoricals be pulled as? One of
#' factor, numeric, id (default: factor)
#' @param include.hidden logical: should hidden variables be included? (default: `TRUE`)
#' @param array_strategy Strategy to import array variables: "alias" (the default)
#' reads them as flat variables with the subvariable aliases, unless there are duplicate
#' aliases in which case they are qualified in brackets after the array alias,
#' like "array_alias\[subvar_alias\]". "qualified_alias" always uses the bracket notation.
#' "packed" reads them in what the tidyverse calls "packed" data.frame columns, with the
#' alias from the array variable, and subvariables as the columns of the data.frame.
#' @param verbose Whether to output a message to the console when subvariable aliases
#' are qualified when array_strategy="alias" (defaults to TRUE)
#' @param ... additional arguments passed to `as.data.frame` (default method).
#' @return When called on a `CrunchDataset`, the method returns an object of
#' class `CrunchDataFrame` unless `force = TRUE`, in which case the return is a
Expand All @@ -70,7 +78,7 @@
include.hidden = include.hidden
)
if (force) {
out <- as.data.frame(out)
out <- as.data.frame(out, ...)
}
return(out)
}
Expand All @@ -82,63 +90,151 @@
row.names = NULL,
optional = FALSE,
include.hidden = attr(x, "include.hidden"),
array_strategy = c("alias", "qualified_alias", "packed"),
verbose = TRUE,
...) {
array_strategy <- match.arg(array_strategy)
ds <- attr(x, "crunchDataset")
tmp <- tempfile()
on.exit(unlink(tmp))
write.csv(ds, tmp, categorical = "id", include.hidden = include.hidden)
# TODO: use variableMetadata to provide all `colClasses`?
# meta <- variableMetadata(ds)
ds_out <- read.csv(tmp, stringsAsFactors = FALSE, check.names = FALSE)
return(csvToDataFrame(ds_out, x))
write.csv(
ds,
tmp,
categorical = "id",
header_field = "qualified_alias",
missing_values = "",
include.hidden = include.hidden
)

parsing_info <- csvColInfo(ds, verbose = verbose && array_strategy == "alias")

# guessing has been good enough (and distinguishes between Date and POSIXct class for us)
# except for text variables, so continue to guess the parsing info for all columns besides text
col_classes <- setNames(
ifelse(parsing_info$var_type == "text", "character", NA_character_),
parsing_info$qualified_alias
)

ds_out <- read.csv(
tmp,
stringsAsFactors = FALSE,
check.names = FALSE,
colClasses = col_classes,
na.strings = ""
)
dup_csv_names <- duplicated(names(ds_out))
if (any(dup_csv_names)) {
stop(
"csv has duplicate column headers, cannot parse: ",
paste0(unique(names(ds_out)[dup_csv_names]), collapse = ", ")
)
}
return(csvToDataFrame(ds_out, x, parsing_info, array_strategy, categorical.mode = attr(x, "mode")))

Check warning on line 132 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=132,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 103 characters.
}

# Build per-column parsing info for a csv export of `ds`.
#
# @param ds a CrunchDataset whose variables will appear in the csv export
# @param verbose logical: message about aliases that had to be qualified
#   because of duplicates? (default: TRUE)
# @return a data.frame with one row per exported (non-array) column:
#   `orig_alias`, `parent_alias` (NA for non-subvariables), `qualified_alias`
#   (always in "parent[sub]" notation for subvariables), `cond_qualified_alias`
#   (qualified only when the plain alias is duplicated in the dataset), and
#   `var_type`. The variable metadata used is attached as `attr(out, "meta")`.
csvColInfo <- function(ds, verbose = TRUE) {
    # Get variable metadata for variables included in the export
    meta <- variableMetadata(ds)[urls(allVariables(ds))]
    flattened_meta <- flattenVariableMetadata(meta)

    orig_aliases <- aliases(flattened_meta)
    parent_aliases <- vapply(
        flattened_meta,
        function(x) x$parent_alias %||% NA_character_,
        character(1)
    )
    qualified_aliases <- ifelse(
        is.na(parent_aliases),
        orig_aliases,
        paste0(parent_aliases, "[", orig_aliases, "]")
    )
    # cond_qualified_aliases are only qualified if there are duplicates
    dup_aliases <- orig_aliases[duplicated(orig_aliases)]
    cond_qualified_aliases <- ifelse(
        orig_aliases %in% dup_aliases,
        qualified_aliases,
        orig_aliases
    )
    out <- data.frame(
        orig_alias = orig_aliases,
        parent_alias = parent_aliases,
        qualified_alias = qualified_aliases,
        cond_qualified_alias = cond_qualified_aliases,
        var_type = types(flattened_meta)
    )
    # Array parents themselves are not csv columns; only their subvariables are
    out <- out[!out$var_type %in% ARRAY_TYPES, ]

    if (verbose) {
        msg_rows <- out$cond_qualified_alias != out$orig_alias
        if (any(msg_rows)) {
            alias_info <- paste0(
                out$orig_alias[msg_rows], " -> ", out$cond_qualified_alias[msg_rows]
            )
            message(
                "Some column names are qualified because there were duplicate aliases ",
                "in dataset:\n", paste0(alias_info, collapse = ", ")
            )
        }
    }

    attr(out, "meta") <- meta
    out
}

# Convert a csv export (already read into a data.frame) into the final
# data.frame a user gets from `as.data.frame(ds, force = TRUE)`.
#
# @param csv_df data.frame read from the csv export (headers are
#   qualified aliases, categoricals are ids, missings are NA)
# @param cr_data a CrunchDataFrame (may include local columns) or a
#   CrunchDataset; determines the output column order
# @param parsing_info data.frame from `csvColInfo()`, with variable
#   metadata attached as `attr(parsing_info, "meta")`
# @param array_strategy how to name/structure array subvariable columns
#   (see `as.data.frame` docs): "alias", "qualified_alias", or "packed"
# @param categorical.mode passed on to the categorical column parser
#   ("factor", "id", or "numeric")
# @return a base data.frame with one row per row of `csv_df`
csvToDataFrame <- function(csv_df,
                           cr_data,
                           parsing_info,
                           array_strategy = c("alias", "qualified_alias", "packed"),
                           categorical.mode = "factor") {
    array_strategy <- match.arg(array_strategy)
    meta <- attr(parsing_info, "meta")
    ## CrunchDataFrames contain both server variables and local variables.
    var_order <- if (inherits(cr_data, "CrunchDataFrame")) {
        names(cr_data)
    } else {
        aliases(allVariables(cr_data))
    }

    ## Iterate over the names of cr_data to preserve the desired order.
    ## Nest everything in an extra layer of lists because one layer is removed
    ## by `unlist()` and array variables can return multiple columns.
    out <- unlist(lapply(var_order, function(a) {
        meta_idx <- match(a, aliases(meta))
        v <- if (!is.na(meta_idx)) meta[[meta_idx[1]]] else NULL
        if (is.null(v)) {
            ## Not in the dataset, so it exists only in the CRDF. Get it there.
            return(structure(list(cr_data[[a]]), .Names = a))
        } else if (type(v) %in% ARRAY_TYPES) {
            ## Find the subvar columns in the csv_df and parse them
            if (type(v) == "numeric_array") {
                cp <- numericCsvParser
            } else {
                cp <- columnParser("categorical")
            }
            is_subvar <- !is.na(parsing_info$parent_alias) &
                parsing_info$parent_alias == alias(v)
            subvar_info <- parsing_info[is_subvar, ]
            cols <- csv_df[, subvar_info$qualified_alias]
            if (array_strategy == "alias") {
                return(structure(
                    lapply(cols, cp, v, categorical.mode),
                    .Names = subvar_info$cond_qualified_alias
                ))
            } else if (array_strategy == "qualified_alias") {
                return(structure(
                    lapply(cols, cp, v, categorical.mode),
                    .Names = subvar_info$qualified_alias
                ))
            } else { # array_strategy == "packed"
                # Extra list layer to hold the array variable's alias
                return(structure(
                    list(
                        structure(
                            lapply(cols, cp, v, categorical.mode),
                            class = "data.frame",
                            .Names = subvar_info$orig_alias,
                            row.names = c(NA, -nrow(csv_df))
                        )
                    ),
                    .Names = alias(v)
                ))
            }
        } else {
            # `v_type` (not `type`) so we don't shadow the generic
            v_type <- type(v)
            cp <- switch(
                v_type,
                "numeric" = numericCsvParser,
                "text" = textCsvParser,
                columnParser(v_type)
            )
            return(structure(list(cp(csv_df[[a]], v, categorical.mode)), .Names = a))
        }
    }), recursive = FALSE)

    ## Wrap that list of columns in a data.frame structure
    return(structure(out, class = "data.frame", row.names = c(NA, -nrow(csv_df))))
}

# Identity parser for numeric columns: because `missing_values = ""` is passed
# to the export, numeric csv columns arrive fully parsed (no stray text to
# clean up), so the column is returned unchanged. Extra args (variable,
# mode) are accepted for parser-interface compatibility and ignored.
numericCsvParser <- function(col, ...) {
    col
}

# Identity parser for text columns: data read from the csv is already
# character (unlike the JSON column parser, which can return a list with
# missing reasons), so no conversion is needed. Extra args (variable, mode)
# are accepted for parser-interface compatibility and ignored.
textCsvParser <- function(col, ...) {
    col
}


#' as.data.frame method for catalog objects
#'
#' This method gives you a view of a catalog, such as a `VariableCatalog`, as a
Expand Down
1 change: 1 addition & 0 deletions R/variable-metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ flattenVariableMetadata <- function(vm) {
## Add the parent ref
x$parent <- u
x$parent_alias <- this$alias
x$type <- if (this$type == "numeric_array") "numeric" else "categorical"
return(x)
})
return(out)
Expand Down
2 changes: 2 additions & 0 deletions R/variable-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ is.subvariable <- function(x) {

CASTABLE_TYPES <- c("numeric", "text", "categorical") ## Add datetime when server supports

## Variable types that are containers of subvariables; used (e.g. in
## as-data-frame.R) to decide when a variable decomposes into subvariable
## columns rather than a single flat column.
ARRAY_TYPES <- c("categorical_array", "multiple_response", "numeric_array")

#' Change Crunch variable types
#'
#' Numeric, text, and categorical variables can be cast to one another by
Expand Down
1 change: 1 addition & 0 deletions R/variable.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
setMethod("tuple", "VariableTuple", function(x) x)
setMethod("tuple", "CrunchVariable", function(x) x@tuple)
setMethod("tuple<-", "CrunchVariable", function(x, value) {
x@tuple <- value
Expand Down
111 changes: 111 additions & 0 deletions dev-misc/fixture-creation/dup-dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Fixture-creation script: builds a Crunch dataset whose subvariable aliases
# duplicate flat-variable aliases, captures httptest mocks for it, and writes
# a csv fixture. Run manually by maintainers; requires live Crunch API access.
library(crunch)
library(here)
library(fs)
library(httptest)
library(purrr)

setupCrunchAuth("team")

# Provides response_redactor()/request_redactor()/stabilize_json_files() etc.
source(here("dev-misc/fixture-creation/redactors.R"))

# Make a dataset with duplicate aliases (in subvariables)
ds <- newDataset(data.frame(
    x1 = 1:3,
    x2 = 2:4,
    y1 = factor(letters[1:3], levels = letters[1:5]),
    y2 = factor(letters[2:4], levels = letters[1:5]),
    z = factor(letters[11:13], levels = letters[11:15])
), "dup test")

# Numeric array whose "x1" subvariable alias duplicates the flat x1 variable
ds$x <- deriveArray(
    list(
        VarDef(ds$x1, name = "x1", alias = "x1"),
        VarDef(ds$x2, name = "x2_derived", alias = "x2_derived")
    ),
    name = "x",
    numeric = TRUE
)

# Categorical array whose "z" subvariable alias duplicates the flat z variable
ds$y <- deriveArray(
    list(
        VarDef(ds$y1, name = "y1", alias = "y1"),
        VarDef(ds$y2, name = "z", alias = "z")
    ),
    name = "y",
    numeric = FALSE
)

mv(projects()[["Vegetables fixture"]], ds, projects()[["Vegetables fixture"]])
ds <- refresh(ds)
ds_url <- self(ds)

# Capture fixtures ----
set_redactor(response_redactor(ds, "dup"))
set_requester(request_redactor(ds, "dup"))
## Capture dataset fixtures ----
### General dataset capture ----
temp_dir <- tempfile()
httpcache::clearCache()
dir_create(temp_dir)
start_capturing(temp_dir)

ds <- loadDataset(ds_url)
aliases(allVariables(ds))
# Don't actually export because httptest doesn't get it right
# but we do need the export views and metadata
exporters <- crGET(shojiURL(ds, "views", "export"))
var_meta <- variableMetadata(ds)

stop_capturing()

# Overwrite environment-dependent fields with fixed values so fixtures don't
# churn when the capturing account or timestamps change
stabilize_json_files(
    temp_dir,
    list(
        "app.crunch.io/api/datasets/dup.json",
        list(list("body", "current_editor_name"), "User"),
        list(list("body", "owner_name"), "User"),
        list(list("body", "creation_time"), "2024-01-01T21:25:59.791000"),
        list(list("body", "modification_time"), "2024-01-01T21:26:43.038000"),
        list(list("body", "access_time"), "2024-01-01T21:26:43.038000"),
        list(
            # --- Only keep the palettes from the project folder so changes to crunch org
            # --- don't affect fixtures. Maybe it'd be better to ask for a rcrunch test
            # --- account, but this is okay for now
            list("body", "palette", "analysis"),
            function(x) {
                purrr::keep(x, ~.$name %in% c("Default green palette for fixture", "purple palette for fixture"))
            }
        ),
        list(list("urls", "owner_url"), "https://app.crunch.io/api/projects/pid/")
    )
)

# Copy the captured mocks into the package's mocks/ tree
dir_delete(here("mocks/app.crunch.io/api/datasets/dup/"))
file_copy(
    path(temp_dir, "app.crunch.io/api/datasets/dup.json"),
    here("mocks/app.crunch.io/api/datasets/dup.json"),
    overwrite = TRUE
)
dir_copy(
    path(temp_dir, "app.crunch.io/api/datasets/dup/"),
    here("mocks/app.crunch.io/api/datasets/dup/"),
    overwrite = TRUE
)


# csv fixture used by as.data.frame tests
write.csv(
    ds,
    here("mocks", "dataset-fixtures", "dup.csv"),
    categorical = "id",
    include.hidden = TRUE,
    missing_values = ""#,
    # header_field = "qualified_alias" # This will only work after #188045851 ships
)

# Mock what header_field="qualified_alias" will look like after #188045851 ships
lines <- readLines(here("mocks", "dataset-fixtures", "dup.csv"))
lines[1] <- "x1,x2,y1,y2,z,x[x1],x[x2_derived],y[y1],y[z]"
writeLines(lines, here("mocks", "dataset-fixtures", "dup.csv"))


with_consent(delete(ds))
5 changes: 4 additions & 1 deletion dev-misc/fixture-creation/redactors.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ ids_from_ds <- function(ds, desired_ds_id) {


stable_var_alias_order <- function(ds) {
if (name(ds) != "Vegetables example") return(aliases(allVariables(ds)))

saved_order_path <- here::here("dev-misc/fixture-creation/var_order.csv")
saved_var_order <- suppressWarnings(try(read.csv(saved_order_path, stringsAsFactors = FALSE)[[1]], silent = TRUE))
if (inherits(saved_var_order, "try-error")) {
Expand All @@ -203,6 +205,7 @@ ids_from_folders <- function(ds) {
)

out <- unlist(out)
if (length(out) == 0) return()
setNames(out, sprintf("vdir_%02d", seq_along(out)))
}

Expand All @@ -214,7 +217,7 @@ ids_below <- function(folder) {
}

ids_from_decks <- function(ds) {
if (length(decks(ds)) == 0) return
if (length(decks(ds)) == 0) return()
deck_ids <- lapply(seq_along(decks(ds)), function(deck_num) {
deck <- refresh(decks(ds)[[deck_num]])
slide_ids <- lapply(seq_along(refresh(slides(deck))), function(slide_num) {
Expand Down
Loading
Loading