Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[188037693]: fix bugs with as.data.frame(ds, force = TRUE) #646

Merged
merged 12 commits into from
Aug 21, 2024
Merged
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ Suggests:
VignetteBuilder: knitr
Language: en-US
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE)
LazyData: true
Collate:
Expand Down
180 changes: 138 additions & 42 deletions R/as-data-frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
#' levels matching the Crunch Variable's categories (the default), or, if
#' `categorical.mode` is specified as "id" or "numeric", a numeric vector of
#' category ids or numeric values, respectively
#' * Array variables (Categorical Array, Multiple Response) are decomposed into
#' their constituent categorical subvariables. An array with three subvariables,
#' for example, will result in three columns in the `data.frame`.
#' * Array variables (Categorical Array, Multiple Response) can be decomposed into
#' their constituent categorical subvariables or put in 'packed' data.frame columns,
#' see the `array_strategy` argument.
#'
#' Column names in the `data.frame` are the variable/subvariable aliases.
#'
Expand All @@ -46,6 +46,14 @@
#' @param categorical.mode what mode should categoricals be pulled as? One of
#' factor, numeric, id (default: factor)
#' @param include.hidden logical: should hidden variables be included? (default: `TRUE`)
#' @param array_strategy Strategy to import array variables: "alias" (the default)
#' reads them as flat variables with the subvariable aliases, unless there are duplicate
#' aliases in which case they are qualified in brackets after the array alias,
#' like "array_alias\[subvar_alias\]". "qualified_alias" always uses the bracket notation.
#' "packed" reads them in what the tidyverse calls "packed" data.frame columns, with the
#' alias from the array variable, and subvariables as the columns of the data.frame.
#' @param verbose Whether to output a message to the console when subvariable aliases
#' are qualified when array_strategy="alias" (defaults to TRUE)
#' @param ... additional arguments passed to `as.data.frame` (default method).
#' @return When called on a `CrunchDataset`, the method returns an object of
#' class `CrunchDataFrame` unless `force = TRUE`, in which case the return is a
Expand All @@ -70,7 +78,7 @@
include.hidden = include.hidden
)
if (force) {
out <- as.data.frame(out)
out <- as.data.frame(out, ...)
}
return(out)
}
Expand All @@ -82,63 +90,151 @@
row.names = NULL,
optional = FALSE,
include.hidden = attr(x, "include.hidden"),
array_strategy = c("alias", "qualified_alias", "packed"),
verbose = TRUE,
...) {
array_strategy <- match.arg(array_strategy)
ds <- attr(x, "crunchDataset")
tmp <- tempfile()
on.exit(unlink(tmp))
write.csv(ds, tmp, categorical = "id", include.hidden = include.hidden)
# TODO: use variableMetadata to provide all `colClasses`?
# meta <- variableMetadata(ds)
ds_out <- read.csv(tmp, stringsAsFactors = FALSE, check.names = FALSE)
return(csvToDataFrame(ds_out, x))
write.csv(
ds,
tmp,
categorical = "id",
header_field = "qualified_alias",
missing_values = "",
include.hidden = include.hidden
)

parsing_info <- csvColInfo(ds, verbose = verbose && array_strategy == "alias")

# guessing has been good enough (and distinguishes between Date and POSIXct class for us)
# except for text variables, so continue to guess the parsing info for all columns besides text
col_classes <- setNames(
ifelse(parsing_info$var_type == "text", "character", NA_character_),
parsing_info$qualified_alias
)

ds_out <- read.csv(
tmp,
stringsAsFactors = FALSE,
check.names = FALSE,
colClasses = col_classes,
na.strings = ""
)
dup_csv_names <- duplicated(names(ds_out))
if (any(dup_csv_names)) {
stop(
"csv has duplicate column headers, cannot parse: ",
paste0(unique(names(ds_out)[dup_csv_names]), collapse = ", ")
)
}
return(csvToDataFrame(ds_out, x, parsing_info, array_strategy, categorical.mode = attr(x, "mode")))

Check warning on line 132 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=132,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 103 characters.
}

# Build per-column parsing info for a csv export of `ds`.
#
# @param ds a CrunchDataset whose variables will appear in the csv export
# @param verbose logical: message about aliases that had to be qualified
#   because of duplicates? (default: TRUE)
# @return a data.frame with one row per exported (non-array) column:
#   `orig_alias`, `parent_alias` (NA for non-subvariables), `qualified_alias`
#   (always in "parent[sub]" notation for subvariables), `cond_qualified_alias`
#   (qualified only when the plain alias is duplicated in the dataset), and
#   `var_type`. The variable metadata used is attached as `attr(out, "meta")`.
csvColInfo <- function(ds, verbose = TRUE) {
    # Get variable metadata for variables included in the export
    meta <- variableMetadata(ds)[urls(allVariables(ds))]
    flattened_meta <- flattenVariableMetadata(meta)

    orig_aliases <- aliases(flattened_meta)
    parent_aliases <- vapply(
        flattened_meta,
        function(x) x$parent_alias %||% NA_character_,
        character(1)
    )
    qualified_aliases <- ifelse(
        is.na(parent_aliases),
        orig_aliases,
        paste0(parent_aliases, "[", orig_aliases, "]")
    )
    # cond_qualified_aliases are only qualified if there are duplicates
    dup_aliases <- orig_aliases[duplicated(orig_aliases)]
    cond_qualified_aliases <- ifelse(
        orig_aliases %in% dup_aliases,
        qualified_aliases,
        orig_aliases
    )
    out <- data.frame(
        orig_alias = orig_aliases,
        parent_alias = parent_aliases,
        qualified_alias = qualified_aliases,
        cond_qualified_alias = cond_qualified_aliases,
        var_type = types(flattened_meta)
    )
    # Array parents themselves are not csv columns; only their subvariables are
    out <- out[!out$var_type %in% ARRAY_TYPES, ]

    if (verbose) {
        msg_rows <- out$cond_qualified_alias != out$orig_alias
        if (any(msg_rows)) {
            alias_info <- paste0(
                out$orig_alias[msg_rows], " -> ", out$cond_qualified_alias[msg_rows]
            )
            message(
                "Some column names are qualified because there were duplicate aliases ",
                "in dataset:\n", paste0(alias_info, collapse = ", ")
            )
        }
    }

    attr(out, "meta") <- meta
    out
}

# Convert a csv export (already read into a data.frame) into the final
# data.frame a user gets from `as.data.frame(ds, force = TRUE)`.
#
# @param csv_df data.frame read from the csv export (headers are
#   qualified aliases, categoricals are ids, missings are NA)
# @param cr_data a CrunchDataFrame (may include local columns) or a
#   CrunchDataset; determines the output column order
# @param parsing_info data.frame from `csvColInfo()`, with variable
#   metadata attached as `attr(parsing_info, "meta")`
# @param array_strategy how to name/structure array subvariable columns
#   (see `as.data.frame` docs): "alias", "qualified_alias", or "packed"
# @param categorical.mode passed on to the categorical column parser
#   ("factor", "id", or "numeric")
# @return a base data.frame with one row per row of `csv_df`
csvToDataFrame <- function(csv_df,
                           cr_data,
                           parsing_info,
                           array_strategy = c("alias", "qualified_alias", "packed"),
                           categorical.mode = "factor") {
    array_strategy <- match.arg(array_strategy)
    meta <- attr(parsing_info, "meta")
    ## CrunchDataFrames contain both server variables and local variables.
    var_order <- if (inherits(cr_data, "CrunchDataFrame")) {
        names(cr_data)
    } else {
        aliases(allVariables(cr_data))
    }

    ## Iterate over the names of cr_data to preserve the desired order.
    ## Nest everything in an extra layer of lists because one layer is removed
    ## by `unlist()` and array variables can return multiple columns.
    out <- unlist(lapply(var_order, function(a) {
        meta_idx <- match(a, aliases(meta))
        v <- if (!is.na(meta_idx)) meta[[meta_idx[1]]] else NULL
        if (is.null(v)) {
            ## Not in the dataset, so it exists only in the CRDF. Get it there.
            return(structure(list(cr_data[[a]]), .Names = a))
        } else if (type(v) %in% ARRAY_TYPES) {
            ## Find the subvar columns in the csv_df and parse them
            if (type(v) == "numeric_array") {
                cp <- numericCsvParser
            } else {
                cp <- columnParser("categorical")
            }
            is_subvar <- !is.na(parsing_info$parent_alias) &
                parsing_info$parent_alias == alias(v)
            subvar_info <- parsing_info[is_subvar, ]
            cols <- csv_df[, subvar_info$qualified_alias]
            if (array_strategy == "alias") {
                return(structure(
                    lapply(cols, cp, v, categorical.mode),
                    .Names = subvar_info$cond_qualified_alias
                ))
            } else if (array_strategy == "qualified_alias") {
                return(structure(
                    lapply(cols, cp, v, categorical.mode),
                    .Names = subvar_info$qualified_alias
                ))
            } else { # array_strategy == "packed"
                # Extra list layer to hold the array variable's alias
                return(structure(
                    list(
                        structure(
                            lapply(cols, cp, v, categorical.mode),
                            class = "data.frame",
                            .Names = subvar_info$orig_alias,
                            row.names = c(NA, -nrow(csv_df))
                        )
                    ),
                    .Names = alias(v)
                ))
            }
        } else {
            # `v_type` (not `type`) so we don't shadow the generic
            v_type <- type(v)
            cp <- switch(
                v_type,
                "numeric" = numericCsvParser,
                "text" = textCsvParser,
                columnParser(v_type)
            )
            return(structure(list(cp(csv_df[[a]], v, categorical.mode)), .Names = a))
        }
    }), recursive = FALSE)

    ## Wrap that list of columns in a data.frame structure
    return(structure(out, class = "data.frame", row.names = c(NA, -nrow(csv_df))))
}

# Identity parser for numeric columns: because `missing_values = ""` is passed
# to the export, numeric csv columns arrive fully parsed (no stray text to
# clean up), so the column is returned unchanged. Extra args (variable,
# mode) are accepted for parser-interface compatibility and ignored.
numericCsvParser <- function(col, ...) {
    col
}

# Identity parser for text columns: data read from the csv is already
# character (unlike the JSON column parser, which can return a list with
# missing reasons), so no conversion is needed. Extra args (variable, mode)
# are accepted for parser-interface compatibility and ignored.
textCsvParser <- function(col, ...) {
    col
}


#' as.data.frame method for catalog objects
#'
#' This method gives you a view of a catalog, such as a `VariableCatalog`, as a
Expand Down
1 change: 1 addition & 0 deletions R/variable-metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ flattenVariableMetadata <- function(vm) {
## Add the parent ref
x$parent <- u
x$parent_alias <- this$alias
x$type <- if (this$type == "numeric_array") "numeric" else "categorical"
return(x)
})
return(out)
Expand Down
2 changes: 2 additions & 0 deletions R/variable-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ is.subvariable <- function(x) {

CASTABLE_TYPES <- c("numeric", "text", "categorical") ## Add datetime when server supports

## Variable types that are containers of subvariables; used (e.g. in
## as-data-frame.R) to decide when a variable decomposes into subvariable
## columns rather than a single flat column.
ARRAY_TYPES <- c("categorical_array", "multiple_response", "numeric_array")

#' Change Crunch variable types
#'
#' Numeric, text, and categorical variables can be cast to one another by
Expand Down
1 change: 1 addition & 0 deletions R/variable.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
setMethod("tuple", "VariableTuple", function(x) x)
setMethod("tuple", "CrunchVariable", function(x) x@tuple)
setMethod("tuple<-", "CrunchVariable", function(x, value) {
x@tuple <- value
Expand Down
111 changes: 111 additions & 0 deletions dev-misc/fixture-creation/dup-dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Fixture-creation script: builds a Crunch dataset whose subvariable aliases
# duplicate flat-variable aliases, captures httptest mocks for it, and writes
# a csv fixture. Run manually by maintainers; requires live Crunch API access.
library(crunch)
library(here)
library(fs)
library(httptest)
library(purrr)

setupCrunchAuth("team")

# Provides response_redactor()/request_redactor()/stabilize_json_files() etc.
source(here("dev-misc/fixture-creation/redactors.R"))

# Make a dataset with duplicate aliases (in subvariables)
ds <- newDataset(data.frame(
    x1 = 1:3,
    x2 = 2:4,
    y1 = factor(letters[1:3], levels = letters[1:5]),
    y2 = factor(letters[2:4], levels = letters[1:5]),
    z = factor(letters[11:13], levels = letters[11:15])
), "dup test")

# Numeric array whose "x1" subvariable alias duplicates the flat x1 variable
ds$x <- deriveArray(
    list(
        VarDef(ds$x1, name = "x1", alias = "x1"),
        VarDef(ds$x2, name = "x2_derived", alias = "x2_derived")
    ),
    name = "x",
    numeric = TRUE
)

# Categorical array whose "z" subvariable alias duplicates the flat z variable
ds$y <- deriveArray(
    list(
        VarDef(ds$y1, name = "y1", alias = "y1"),
        VarDef(ds$y2, name = "z", alias = "z")
    ),
    name = "y",
    numeric = FALSE
)

mv(projects()[["Vegetables fixture"]], ds, projects()[["Vegetables fixture"]])
ds <- refresh(ds)
ds_url <- self(ds)

# Capture fixtures ----
set_redactor(response_redactor(ds, "dup"))
set_requester(request_redactor(ds, "dup"))
## Capture dataset fixtures ----
### General dataset capture ----
temp_dir <- tempfile()
httpcache::clearCache()
dir_create(temp_dir)
start_capturing(temp_dir)

ds <- loadDataset(ds_url)
aliases(allVariables(ds))
# Don't actually export because httptest doesn't get it right
# but we do need the export views and metadata
exporters <- crGET(shojiURL(ds, "views", "export"))
var_meta <- variableMetadata(ds)

stop_capturing()

# Overwrite environment-dependent fields with fixed values so fixtures don't
# churn when the capturing account or timestamps change
stabilize_json_files(
    temp_dir,
    list(
        "app.crunch.io/api/datasets/dup.json",
        list(list("body", "current_editor_name"), "User"),
        list(list("body", "owner_name"), "User"),
        list(list("body", "creation_time"), "2024-01-01T21:25:59.791000"),
        list(list("body", "modification_time"), "2024-01-01T21:26:43.038000"),
        list(list("body", "access_time"), "2024-01-01T21:26:43.038000"),
        list(
            # --- Only keep the palettes from the project folder so changes to crunch org
            # --- don't affect fixtures. Maybe it'd be better to ask for a rcrunch test
            # --- account, but this is okay for now
            list("body", "palette", "analysis"),
            function(x) {
                purrr::keep(x, ~.$name %in% c("Default green palette for fixture", "purple palette for fixture"))
            }
        ),
        list(list("urls", "owner_url"), "https://app.crunch.io/api/projects/pid/")
    )
)

# Copy the captured mocks into the package's mocks/ tree
dir_delete(here("mocks/app.crunch.io/api/datasets/dup/"))
file_copy(
    path(temp_dir, "app.crunch.io/api/datasets/dup.json"),
    here("mocks/app.crunch.io/api/datasets/dup.json"),
    overwrite = TRUE
)
dir_copy(
    path(temp_dir, "app.crunch.io/api/datasets/dup/"),
    here("mocks/app.crunch.io/api/datasets/dup/"),
    overwrite = TRUE
)


# csv fixture used by as.data.frame tests
write.csv(
    ds,
    here("mocks", "dataset-fixtures", "dup.csv"),
    categorical = "id",
    include.hidden = TRUE,
    missing_values = ""#,
    # header_field = "qualified_alias" # This will only work after #188045851 ships
)

# Mock what header_field="qualified_alias" will look like after #188045851 ships
lines <- readLines(here("mocks", "dataset-fixtures", "dup.csv"))
lines[1] <- "x1,x2,y1,y2,z,x[x1],x[x2_derived],y[y1],y[z]"
writeLines(lines, here("mocks", "dataset-fixtures", "dup.csv"))


with_consent(delete(ds))
5 changes: 4 additions & 1 deletion dev-misc/fixture-creation/redactors.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ ids_from_ds <- function(ds, desired_ds_id) {


stable_var_alias_order <- function(ds) {
if (name(ds) != "Vegetables example") return(aliases(allVariables(ds)))

saved_order_path <- here::here("dev-misc/fixture-creation/var_order.csv")
saved_var_order <- suppressWarnings(try(read.csv(saved_order_path, stringsAsFactors = FALSE)[[1]], silent = TRUE))
if (inherits(saved_var_order, "try-error")) {
Expand All @@ -203,6 +205,7 @@ ids_from_folders <- function(ds) {
)

out <- unlist(out)
if (length(out) == 0) return()
setNames(out, sprintf("vdir_%02d", seq_along(out)))
}

Expand All @@ -214,7 +217,7 @@ ids_below <- function(folder) {
}

ids_from_decks <- function(ds) {
if (length(decks(ds)) == 0) return
if (length(decks(ds)) == 0) return()
deck_ids <- lapply(seq_along(decks(ds)), function(deck_num) {
deck <- refresh(decks(ds)[[deck_num]])
slide_ids <- lapply(seq_along(refresh(slides(deck))), function(slide_num) {
Expand Down
Loading
Loading