diff --git a/DESCRIPTION b/DESCRIPTION index b130c0e..68d439e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: dataverse -Version: 0.3.14 +Version: 0.3.15 Title: Client for Dataverse 4+ Repositories Authors@R: c(person(given = "Shiro", @@ -37,14 +37,22 @@ Authors@R: family = "Gruber", role = c("ctb"), email = "JohannesB.Gruber@gmail.com", - comment = c(ORCID = "0000-0001-9177-1772"))) + comment = c(ORCID = "0000-0001-9177-1772")), + person(given = "Martin", + family = "Morgan", + role = "ctb", + email = "mtmorgan.xyz@gmail.com", + comment = c(ORCID = "0000-0002-5874-8148"))) Imports: checkmate, httr, + memoise, + cachem, jsonlite, readr, stats, utils, + tools, xml2 Suggests: covr, diff --git a/NAMESPACE b/NAMESPACE index 32d841a..13af94f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -24,6 +24,10 @@ S3method(print,get_file) S3method(print,sword_service_document) export(add_dataset_file) export(add_file) +export(cache_dataset) +export(cache_info) +export(cache_path) +export(cache_reset) export(create_dataset) export(create_dataverse) export(dataset_atom) @@ -63,3 +67,8 @@ export(service_document) export(set_dataverse_metadata) export(update_dataset) export(update_dataset_file) +importFrom(cachem,cache_disk) +importFrom(checkmate,assert_string) +importFrom(checkmate,test_string) +importFrom(memoise,memoise) +importFrom(tools,R_user_dir) diff --git a/NEWS.md b/NEWS.md index 50b86d9..8473970 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # dataverse +# CHANGES in dataverse 0.3.15 + +* Implement a cache for API calls (including file download) when dataset version is specified. The functions will reload from the cache automatically the second time. (#112, #135, by @mtmorgan) + # CHANGES in dataverse 0.3.14 * Improve recommendation for rdata loading (#107, #127) diff --git a/R/SWORD.R b/R/SWORD.R index c8f4a2e..acdb793 100644 --- a/R/SWORD.R +++ b/R/SWORD.R @@ -17,9 +17,8 @@ #' @export service_document <- function(key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { u <- paste0(api_url(server, prefix = "dvn/api/"), "data-deposit/v1.1/swordv2/service-document") - r <- httr::GET(u, httr::authenticate(key, ""), ...) - httr::stop_for_status(r) - x <- xml2::as_list(xml2::read_xml(httr::content(r, "text"))) + r <- api_get(u, httr::authenticate(key, ""), ...) + x <- xml2::as_list(xml2::read_xml(r)) w <- x$workspace out <- list() if ("title" %in% names(w)) { @@ -74,11 +73,10 @@ list_datasets <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = dataverse <- get_dataverse(dataverse, key = key, server = server, ...)$alias } u <- paste0(api_url(server, prefix = "dvn/api/"), "data-deposit/v1.1/swordv2/collection/dataverse/", dataverse) - r <- httr::GET(u, httr::authenticate(key, ""), ...) - httr::stop_for_status(r) + r <- api_get(u, httr::authenticate(key, ""), ..., as = "raw") # clean up response structure - x <- xml2::as_list(xml2::read_xml(r$content)) + x <- xml2::as_list(xml2::read_xml(r)) feed <- x[["feed"]] out <- list(title = feed[["title"]][[1L]], generator = feed[["generator"]], diff --git a/R/SWORD_dataset.R b/R/SWORD_dataset.R index c5209fc..dcd859f 100644 --- a/R/SWORD_dataset.R +++ b/R/SWORD_dataset.R @@ -205,9 +205,8 @@ dataset_atom <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sy u <- paste0(api_url(server, prefix="dvn/api/"), "data-deposit/v1.1/swordv2/edit/study/", dataset) } - r <- httr::GET(u, httr::authenticate(key, ""), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- parse_atom(rawToChar(r$content)) + r <- api_get(u, httr::authenticate(key, ""), ..., as = "raw") + out <- parse_atom(rawToChar(r)) out } @@ -229,7 +228,6 @@ dataset_statement <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server dataset <- prepend_doi(dataset) u <- paste0(api_url(server, prefix="dvn/api/"), "data-deposit/v1.1/swordv2/statement/study/", dataset) } - r <- httr::GET(u, httr::authenticate(key, ""), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - parse_dataset_statement(rawToChar(r$content)) + r <- api_get(u, httr::authenticate(key, ""), ..., as = "raw") + parse_dataset_statement(rawToChar(r)) } diff --git a/R/cache.R b/R/cache.R new file mode 100644 index 0000000..c26949d --- /dev/null +++ b/R/cache.R @@ -0,0 +1,93 @@ +#' @rdname cache +#' @aliases use_cache +#' @title Utilities for cache management +#' @description The dataverse package uses disk and session caches to improve network performance. Use of the cache is described on this page. +#' @details +#' Use of the cache is determined by the value of the `use_cache =` argument to dataset and other API calls, or by the environment variable `DATAVERSE_USE_CACHE`. Possible values are +#' +#' - `"none"`: do not use the cache. This is the default for datasets that are versioned with `":draft"`, `":latest"`, and `":latest-published"`. +#' - `"session"`: cache API requests for the duration of the *R* session. This is the default for API calls that do not involve file or dataset retrieval. +#' - `"disk": use a permanent disk cache. This is the default for files and explicitly versioned datasets. +#' +#' @template version +#' @details +#' `cache_dataset()` determines whether a dataset or file should be cached based on the version specification. +#' @return +#' `cache_dataset()` returns `"disk"` if the dataset version is to be cached to disk, `"none"` otherwise. +#' @importFrom checkmate assert_string +#' @examples +#' cache_dataset(":latest") # "none" +#' cache_dataset("1.2") # "disk" +#' @export +cache_dataset <- function(version) { + assert_string(version) + if (version %in% c(":draft", ":latest", ":latest-published")) { + "none" + } else { + "disk" + } +} + +#' @rdname cache +#' @details +#' `cache_path()` finds or creates the location (directory) on the file system containing the cache. +#' +#' @return +#' `cache_path()` returns the file path to the directory containing the cache. +#' +#' @examples +#' cache_path() +#' +#' @importFrom tools R_user_dir +#' @export +cache_path <- function() { + cache_path <- file.path(R_user_dir("dataverse", "cache"), "api_cache") + if (!dir.exists(cache_path)) { + status <- dir.create(cache_path, recursive = TRUE) + if (!status) + warning("'dataverse' failed to create a 'disk' cache") + } + + cache_path +} + +#' @rdname cache +#' @details +#' `cache_info()` queries the cache for information about the name, size, and other attributes of files in the cache. The file name is a 'hash' of the function used to retrieve the file; it is not useful for identifying specific files. +#' @return +#' `cache_info()` returns a data.frame containing names and sizes of files in the cache. +#' @examples +#' cache_info() +#' @export +cache_info <- function() { + cache_path <- cache_path() + if (dir.exists(cache_path)) { + files <- dir(cache_path(), full.names = TRUE) + info <- file.info(files) + rownames(info) <- basename(files) + info + } +} + +#' @rdname cache +#' @details +#' `cache_reset()` clears all downloaded files from the disk cache. +#' @returns +#' `cache_reset()` returns the path to the (now empty) cache, invisibly) +#' @export +cache_reset <- function() { + cache_path <- cache_path() + if (dir.exists(cache_path)) + cache_disk(cache_path)$reset() + invisible(cache_path) +} + +## utility to check valid values of `use_cache =` +#' @importFrom checkmate test_string +assert_use_cache <- function(use_cache) { + test <- + test_string(use_cache) && + use_cache %in% c("disk", "session", "none") + if (!test) + stop("argument 'use_cache' is not correct, see ?use_cache") +} diff --git a/R/dataset_versions.R b/R/dataset_versions.R index 17b59c8..31b82c3 100644 --- a/R/dataset_versions.R +++ b/R/dataset_versions.R @@ -20,9 +20,8 @@ dataset_versions <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { dataset <- dataset_id(dataset, key = key, server = server, ...) u <- paste0(api_url(server), "datasets/", dataset, "/versions") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- httr::content(r, encoding = "UTF-8")$data + r <- api_get(u, ..., key = key, as = NULL) + out <- r$data lapply(out, function(x) { x <- `class<-`(x, "dataverse_dataset_version") x$files <- lapply(x$files, `class<-`, "dataverse_file") diff --git a/R/dataverse_metadata.R b/R/dataverse_metadata.R index bbf54be..6cd63a6 100644 --- a/R/dataverse_metadata.R +++ b/R/dataverse_metadata.R @@ -18,9 +18,8 @@ dataverse_metadata <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { dataverse <- dataverse_id(dataverse, key = key, server = server, ...) u <- paste0(api_url(server), "dataverses/", dataverse, "/metadatablocks") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE)$data + r <- api_get(u, ..., key = key) + jsonlite::fromJSON(r, simplifyDataFrame = FALSE)$data } #' @title Set Dataverse metadata diff --git a/R/dataverse_search.R b/R/dataverse_search.R index 8f72d27..4d70439 100644 --- a/R/dataverse_search.R +++ b/R/dataverse_search.R @@ -103,9 +103,8 @@ function(..., u <- paste0(api_url(server), "search") # execute request - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8")) + r <- api_get(u, query = query, key = key) + out <- jsonlite::fromJSON(r) if (isTRUE(verbose)) { n_total <- ngettext(out$data$total_count, "result", "results") message(sprintf(paste0("%s of %s ", n_total, " retrieved"), out$data$count_in_response, out$data$total_count)) diff --git a/R/get_dataset.R b/R/get_dataset.R index 61e5351..d7a80fb 100644 --- a/R/get_dataset.R +++ b/R/get_dataset.R @@ -20,6 +20,7 @@ #' @template version #' @template envvars #' @template dots +#' @param use_cache one of `"disk"`, `"session"`, or `"none"`, describing how datasets are cached to reduce network traffic. See \code{\link{cache_dataset}} for details. #' @return A list of class \dQuote{dataverse_dataset} or a list of a form dependent #' on the specific metadata block retrieved. \code{dataset_files} returns a list of #' objects of class \dQuote{dataverse_file}. @@ -45,17 +46,17 @@ get_dataset <- function( version = ":latest", key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), - ... + ..., + use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version)) ) { - dataset <- dataset_id(dataset, key = key, server = server, ...) + dataset <- dataset_id(dataset, key = key, server = server, ..., use_cache = use_cache) if (!is.null(version)) { u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version) } else { u <- paste0(api_url(server), "datasets/", dataset) } - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - parse_dataset(httr::content(r, as = "text", encoding = "UTF-8")) + r <- api_get(u, ..., key = key, use_cache = use_cache) + parse_dataset(r) } #' @rdname get_dataset @@ -70,19 +71,18 @@ dataset_metadata <- function( block = "citation", key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), - ... + ..., + use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version)) ) { - dataset <- dataset_id(dataset, key = key, server = server, ...) + dataset <- dataset_id(dataset, key = key, server = server, ..., use_cache = use_cache) if (!is.null(block)) { u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version, "/metadata/", block) } else { u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version, "/metadata") } - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- httr::content(r, as = "text", encoding = "UTF-8") - jsonlite::fromJSON(out)[["data"]] + r <- api_get(u, ..., key = key, use_cache = use_cache) + jsonlite::fromJSON(r)[["data"]] } #' @rdname get_dataset @@ -92,12 +92,12 @@ dataset_files <- function( version = ":latest", key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), - ... + ..., + use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version)) ) { - dataset <- dataset_id(dataset, key = key, server = server, ...) + dataset <- dataset_id(dataset, key = key, server = server, ..., use_cache = use_cache) u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version, "/files") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE)$data + r <- api_get(u, ..., key = key, use_cache = use_cache) + out <- jsonlite::fromJSON(r, simplifyDataFrame = FALSE)$data structure(lapply(out, `class<-`, "dataverse_file")) } diff --git a/R/get_dataverse.R b/R/get_dataverse.R index ff51744..d0e7c4c 100644 --- a/R/get_dataverse.R +++ b/R/get_dataverse.R @@ -31,9 +31,8 @@ get_dataverse <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = dataverse <- dataverse_id(dataverse, key = key, server = server, ...) } u <- paste0(api_url(server), "dataverses/", dataverse) - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8")) + r <- api_get(u, ..., key = key) + out <- jsonlite::fromJSON(r) structure(out$data, class = "dataverse") } @@ -42,9 +41,8 @@ get_dataverse <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = dataverse_contents <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { dataverse <- dataverse_id(dataverse, key = key, server = server, ...) u <- paste0(api_url(server), "dataverses/", dataverse, "/contents") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE) + r <- api_get(u, ..., key = key) + out <- jsonlite::fromJSON(r, simplifyDataFrame = FALSE) structure(lapply(out$data, function(x) { `class<-`(x, if (x$type == "dataset") "dataverse_dataset" else "dataverse") })) diff --git a/R/get_facets.R b/R/get_facets.R index eca29b2..b10d75a 100644 --- a/R/get_facets.R +++ b/R/get_facets.R @@ -20,7 +20,6 @@ get_facets <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { dataverse <- dataverse_id(dataverse, key = key, server = server, ...) u <- paste0(api_url(server), "dataverses/", dataverse, "/facets") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))$data + r <- api_get(u, ..., key = key) + jsonlite::fromJSON(r)$data } diff --git a/R/get_file.R b/R/get_file.R index cf2fd92..db8925f 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -34,6 +34,7 @@ #' @template envvars #' @template dots #' @template ds +#' @template version #' #' @return \code{get_file} returns a raw vector (or list of raw vectors, #' if \code{length(file) > 1}), which can be saved locally with the `writeBin` @@ -89,6 +90,7 @@ get_file <- function( key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), original = TRUE, + version = ":latest", ... ) { diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 3c24c85..cf00c4d 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -98,16 +98,10 @@ get_file_by_id <- function( if (return_url) { return(httr::modify_url(u, query = query)) } - if (isFALSE(progress)) - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) - - if (isTRUE(progress)) - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, httr::progress(type = "down"), ...) - - - - httr::stop_for_status(r, task = httr::content(r)$message) - httr::content(r, as = "raw") + # add a progress bar; 'NULL' if progress is not TRUE. 'NULL' arguments + # are not seen by httr::GET() + progress_bar <- if (isTRUE(progress)) httr::progress(type = "down") + api_get(u, query = query, progress_bar, ..., key = key, as = "raw") } #' @rdname files diff --git a/R/get_file_metadata.R b/R/get_file_metadata.R index 013d430..72bd6c1 100644 --- a/R/get_file_metadata.R +++ b/R/get_file_metadata.R @@ -48,8 +48,5 @@ get_file_metadata <- u <- paste0(api_url(server), "access/datafile/", file, "/metadata/", format) } - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- httr::content(r, as = "text", encoding = "UTF-8") - return(out) + api_get(u, ..., key = key) } diff --git a/R/native_role_groups.R b/R/native_role_groups.R index c1b0169..465cf20 100644 --- a/R/native_role_groups.R +++ b/R/native_role_groups.R @@ -79,9 +79,8 @@ update_group <- function(group, name, description, dataverse, key = Sys.getenv(" list_groups <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { dataverse <- dataverse_id(dataverse, key = key, server = server, ...) u <- paste0(api_url(server), "dataverses/", dataverse, "/groups") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - j <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE)$data + r <- api_get(u, ..., key = key) + j <- jsonlite::fromJSON(r, simplifyDataFrame = FALSE)$data lapply(j, function(x) { x$dataverse <- dataverse class(x) <- "dataverse_group" @@ -99,9 +98,8 @@ get_group <- function(group, dataverse, key = Sys.getenv("DATAVERSE_KEY"), serve dataverse <- dataverse_id(dataverse, key = key, server = server, ...) u <- paste0(api_url(server), "dataverses/", dataverse, "/groups/", group) } - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - j <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))$data + r <- api_get(u, ..., key = key) + j <- jsonlite::fromJSON(r)$data j$dataverse <- dataverse structure(j, class = "dataverse_group") } diff --git a/R/native_roles.R b/R/native_roles.R index a4a895c..b1d5a11 100644 --- a/R/native_roles.R +++ b/R/native_roles.R @@ -21,9 +21,8 @@ # @export get_role <- function(role, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { u <- paste0(api_url(server), "roles/", role) - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - j <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))$data + r <- api_get(u, ..., key) + j <- jsonlite::fromJSON(r)$data j } @@ -43,17 +42,12 @@ list_roles <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sy if (!missing(dataverse)) { dataverse <- dataverse_id(dataverse, key = key, server = server, ...) u <- paste0(api_url(server), "dataverses/", dataverse, "/roles") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- jsonlite::fromJSON(httr::content(r, "text", encoding = "UTF-8"))$data - structure(lapply(out, `class<-`, "dataverse_role")) } else { u <- paste0(api_url(server), "admin/roles") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))$data - structure(lapply(out, `class<-`, "dataverse_role")) } + r <- api_get(u, ..., key = key) + out <- jsonlite::fromJSON(r)$data + structure(lapply(out, `class<-`, "dataverse_role")) } # @rdname roles diff --git a/R/native_roles_assignments.R b/R/native_roles_assignments.R index cfc6d87..99e042e 100644 --- a/R/native_roles_assignments.R +++ b/R/native_roles_assignments.R @@ -19,9 +19,8 @@ get_assignments <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { dataverse <- dataverse_id(dataverse, key = key, server = server, ...) u <- paste0(api_url(server), "dataverses/", dataverse, "/assignments") - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r, task = httr::content(r)$message) - out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE)$data + r <- api_get(u, ..., key = key) + out <- jsonlite::fromJSON(r, simplifyDataFrame = FALSE)$data lapply(out, function(x) { x$dataverse <- dataverse class(x) <- "dataverse_role_assignment" diff --git a/R/native_user.R b/R/native_user.R index 59f160e..bbb5399 100644 --- a/R/native_user.R +++ b/R/native_user.R @@ -39,8 +39,7 @@ create_user <- function(password, key = Sys.getenv("DATAVERSE_KEY"), server = Sy #' @export get_user_key <- function(user, password, server = Sys.getenv("DATAVERSE_SERVER"), ...) { u <- paste0(api_url(server), "builtin-users/", user, "/api-token?password=", password) - r <- httr::GET(u, ...) - httr::stop_for_status(r, task = httr::content(r)$message) - j <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8")) + r <- api_get(u, ...) + j <- jsonlite::fromJSON(r) j$data$message } diff --git a/R/onload.R b/R/onload.R index 48de84a..a129f9e 100644 --- a/R/onload.R +++ b/R/onload.R @@ -1,6 +1,20 @@ +#' @importFrom cachem cache_disk +#' +#' @importFrom memoise memoise .onLoad <- function(libname, pkgname) { - # a <- Sys.getenv("DATAVERSE_SERVER") - # if(a == "") { - # Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") - # } + ## + ## 'memoise' httr::GET calls + ## + + ## API session cache + api_get_session_cache <<- memoise(api_get_impl) + + ## API disk cache + cache_path <- cache_path() + if (dir.exists(cache_path)) { + # disk cache, no age or size limits + cache <- cache_disk(cache_path) + get_disk <- memoise(api_get_impl, cache = cache) + } + api_get_disk_cache <<- get_disk } diff --git a/R/utils.R b/R/utils.R index 27e613a..4b05544 100644 --- a/R/utils.R +++ b/R/utils.R @@ -26,11 +26,15 @@ dataset_id.default <- function(x, ...) { dataset_id.character <- function(x, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { x <- prepend_doi(x) u <- paste0(api_url(server), "datasets/:persistentId?persistentId=", x) - r <- tryCatch(httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...), - error = function(e) { - stop("Could not retrieve Dataset ID from persistent identifier!") - }) - jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))[["data"]][["id"]] + r <- tryCatch({ + api_get(u, ..., key = key) + }, error = function(e) { + stop( + "Could not retrieve Dataset ID from persistent identifier! ", + conditionMessage(e) + ) + }) + jsonlite::fromJSON(r)[["data"]][["id"]] } #' @export dataset_id.dataverse_dataset <- function(x, ...) { @@ -204,6 +208,37 @@ api_url <- function(server = Sys.getenv("DATAVERSE_SERVER"), prefix = "api/") { return(paste0("https://", domain, "/", prefix)) } +## common httr::GET() uses +#' @importFrom checkmate assert_string +api_get <- function(url, ..., key = NULL, as = "text", use_cache = Sys.getenv("DATAVERSE_USE_CACHE", "session")) { + assert_string(url) + assert_string(key, null.ok = TRUE) + assert_string(as, null.ok = TRUE) + assert_use_cache(use_cache) + get <- switch( + use_cache, + "none" = api_get_impl, + "session" = api_get_session_cache, + "disk" = api_get_disk_cache, + stop("unknown value for 'use_cache'") + ) + get(url, ..., key = key, as = as) +} + +## cache implemented via memoization; memoized functions defined in +## .onLoad() +api_get_impl <- function(url, ..., key = NULL, as = "text") { + if (!is.null(key)) + key <- httr::add_headers("X-Dataverse-key", key) + r <- httr::GET(url, ..., key) + httr::stop_for_status(r, task = httr::content(r)$message) + httr::content(r, as = as, encoding = "UTF-8") +} + +api_get_session_cache <- NULL # per-session memoisatoin + +api_get_disk_cache <- NULL # 'permanent' memoisation + # parse dataset response into list/dataframe parse_dataset <- function(out) { out <- jsonlite::fromJSON(out)$data diff --git a/man-roxygen/dots.R b/man-roxygen/dots.R index cf74d0d..89f575e 100644 --- a/man-roxygen/dots.R +++ b/man-roxygen/dots.R @@ -1,3 +1,5 @@ -#' @param ... Additional arguments passed to an HTTP request function, such as -#' \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -#' \code{\link[httr]{DELETE}}. +#' @param ... Additional arguments passed to an HTTP request function, +#' such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +#' \code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +#' on how the *R* dataverse package uses disk and session caches to +#' improve network performance. diff --git a/man-roxygen/version.R b/man-roxygen/version.R index 70cfc78..cb109dc 100644 --- a/man-roxygen/version.R +++ b/man-roxygen/version.R @@ -1 +1,11 @@ -#' @param version A character string specifying a version of the dataset. This can be one of \dQuote{:draft} (the current draft), \dQuote{:latest} (the latest draft, if it exists, or the latest published version), \dQuote{:latest-published} (the latest published version, ignoring any draft), or \dQuote{x.y} (where \samp{x} is a major version and \samp{y} is a minor version; the \samp{.y} can be omitted to obtain a major version). In lieu of this, a dataset's version-specific identification number can be used for the \code{dataset} argument. +#' @param version A character specifying a version of the dataset. +#' This can be of the form `"1.1"` or `"1"` (where in `"x.y"`, x is a major +#' version and y is an optional minor version), or +#' `":latest"` (the default, the latest published version). +#' We recommend using the number format so that +#' the function stores a cache of the data (See \code{\link{cache_dataset}}). +#' If the user specifies a `key` or `DATAVERSE_KEY` argument, they can access the +#' draft version by `":draft"` (the current draft) or `":latest"` (which will +#' prioritize the draft over the latest published version. +#' Finally, set `use_cache = "none"` to not read from the cache and re-download +#' afresh even when `version` is provided. diff --git a/man/URLs.Rd b/man/URLs.Rd index aaa67d1..7462ba1 100644 --- a/man/URLs.Rd +++ b/man/URLs.Rd @@ -85,9 +85,11 @@ available, download the original version instead of the ingested? If there was no ingested version, is set to NA. Note in \verb{get_dataframe_*}, \code{original} is set to FALSE by default. Either can be changed.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} \item{filename}{Filename of the dataset, with file extension as shown in Dataverse (for example, if nlsw88.dta was the original but is displayed as the ingested diff --git a/man/add_dataset_file.Rd b/man/add_dataset_file.Rd index 8e9fa05..f1fbbd2 100644 --- a/man/add_dataset_file.Rd +++ b/man/add_dataset_file.Rd @@ -47,9 +47,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} \item{id}{An integer specifying a file identifier; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an diff --git a/man/add_file.Rd b/man/add_file.Rd index 64e5670..a2f30a9 100644 --- a/man/add_file.Rd +++ b/man/add_file.Rd @@ -30,9 +30,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ An object of class \dQuote{dataset_atom}. diff --git a/man/cache.Rd b/man/cache.Rd new file mode 100644 index 0000000..de5a18b --- /dev/null +++ b/man/cache.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cache.R +\name{cache_dataset} +\alias{cache_dataset} +\alias{use_cache} +\alias{cache_path} +\alias{cache_info} +\alias{cache_reset} +\title{Utilities for cache management} +\usage{ +cache_dataset(version) + +cache_path() + +cache_info() + +cache_reset() +} +\arguments{ +\item{version}{A character specifying a version of the dataset. +This can be of the form \code{"1.1"} or \code{"1"} (where in \code{"x.y"}, x is a major +version and y is an optional minor version), or +\code{":latest"} (the default, the latest published version). +We recommend using the number format so that +the function stores a cache of the data (See \code{\link{cache_dataset}}). +If the user specifies a \code{key} or \code{DATAVERSE_KEY} argument, they can access the +draft version by \code{":draft"} (the current draft) or \code{":latest"} (which will +prioritize the draft over the latest published version. +Finally, set \code{use_cache = "none"} to not read from the cache and re-download +afresh even when \code{version} is provided.} +} +\value{ +\code{cache_dataset()} returns \code{"disk"} if the dataset version is to be cached to disk, \code{"none"} otherwise. + +\code{cache_path()} returns the file path to the directory containing the cache. + +\code{cache_info()} returns a data.frame containing names and sizes of files in the cache. + +\code{cache_reset()} returns the path to the (now empty) cache, invisibly) +} +\description{ +The dataverse package uses disk and session caches to improve network performance. Use of the cache is described on this page. +} +\details{ +Use of the cache is determined by the value of the \verb{use_cache =} argument to dataset and other API calls, or by the environment variable \code{DATAVERSE_USE_CACHE}. Possible values are +\itemize{ +\item \code{"none"}: do not use the cache. This is the default for datasets that are versioned with \code{":draft"}, \code{":latest"}, and \code{":latest-published"}. +\item \code{"session"}: cache API requests for the duration of the \emph{R} session. This is the default for API calls that do not involve file or dataset retrieval. +\item `"disk": use a permanent disk cache. This is the default for files and explicitly versioned datasets. +} + +\code{cache_dataset()} determines whether a dataset or file should be cached based on the version specification. + +\code{cache_path()} finds or creates the location (directory) on the file system containing the cache. + +\code{cache_info()} queries the cache for information about the name, size, and other attributes of files in the cache. The file name is a 'hash' of the function used to retrieve the file; it is not useful for identifying specific files. + +\code{cache_reset()} clears all downloaded files from the disk cache. +} +\examples{ +cache_dataset(":latest") # "none" +cache_dataset("1.2") # "disk" +cache_path() + +cache_info() +} diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd index 51510ab..3e5f0d3 100644 --- a/man/create_dataset.Rd +++ b/man/create_dataset.Rd @@ -39,9 +39,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} \item{dataset}{A character specifying a persistent identification ID for a dataset, for example \code{"10.70122/FK2/HXJVJU"}. Alternatively, an object of class diff --git a/man/create_dataverse.Rd b/man/create_dataverse.Rd index 7b59cd9..da7ff59 100644 --- a/man/create_dataverse.Rd +++ b/man/create_dataverse.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. diff --git a/man/dataset_atom.Rd b/man/dataset_atom.Rd index 605994a..4584a06 100644 --- a/man/dataset_atom.Rd +++ b/man/dataset_atom.Rd @@ -35,9 +35,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. For \code{dataset_atom}, an object of class \dQuote{dataset_atom}. diff --git a/man/dataset_versions.Rd b/man/dataset_versions.Rd index 5cea74d..e51cb3e 100644 --- a/man/dataset_versions.Rd +++ b/man/dataset_versions.Rd @@ -29,9 +29,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list of class \dQuote{dataverse_dataset_version}. diff --git a/man/dataverse.Rd b/man/dataverse.Rd index a35185c..915e33a 100644 --- a/man/dataverse.Rd +++ b/man/dataverse.Rd @@ -60,6 +60,7 @@ Other contributors: \item Jan Kanis [contributor] \item Edward Jee [contributor] \item Johannes Gruber \email{JohannesB.Gruber@gmail.com} (\href{https://orcid.org/0000-0001-9177-1772}{ORCID}) [contributor] + \item Martin Morgan \email{mtmorgan.xyz@gmail.com} (\href{https://orcid.org/0000-0002-5874-8148}{ORCID}) [contributor] } } diff --git a/man/dataverse_metadata.Rd b/man/dataverse_metadata.Rd index c6ae364..b39c0c1 100644 --- a/man/dataverse_metadata.Rd +++ b/man/dataverse_metadata.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list diff --git a/man/delete_dataset.Rd b/man/delete_dataset.Rd index c5ae98f..62e0e78 100644 --- a/man/delete_dataset.Rd +++ b/man/delete_dataset.Rd @@ -29,9 +29,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A logical. diff --git a/man/delete_dataverse.Rd b/man/delete_dataverse.Rd index 132c693..2e0803e 100644 --- a/man/delete_dataverse.Rd +++ b/man/delete_dataverse.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A logical. diff --git a/man/delete_file.Rd b/man/delete_file.Rd index 839bfcd..ab5c5bf 100644 --- a/man/delete_file.Rd +++ b/man/delete_file.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ If successful, a logical \code{TRUE}, else possibly some information. diff --git a/man/delete_sword_dataset.Rd b/man/delete_sword_dataset.Rd index da5eee2..f77326c 100644 --- a/man/delete_sword_dataset.Rd +++ b/man/delete_sword_dataset.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ If successful, a logical \code{TRUE}, else possibly some information. diff --git a/man/files.Rd b/man/files.Rd index 13741aa..ef0f187 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -16,6 +16,7 @@ get_file( key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), original = TRUE, + version = ":latest", ... ) @@ -99,9 +100,23 @@ available, download the original version instead of the ingested? If there was no ingested version, is set to NA. Note in \verb{get_dataframe_*}, \code{original} is set to FALSE by default. Either can be changed.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{version}{A character specifying a version of the dataset. +This can be of the form \code{"1.1"} or \code{"1"} (where in \code{"x.y"}, x is a major +version and y is an optional minor version), or +\code{":latest"} (the default, the latest published version). +We recommend using the number format so that +the function stores a cache of the data (See \code{\link{cache_dataset}}). +If the user specifies a \code{key} or \code{DATAVERSE_KEY} argument, they can access the +draft version by \code{":draft"} (the current draft) or \code{":latest"} (which will +prioritize the draft over the latest published version. +Finally, set \code{use_cache = "none"} to not read from the cache and re-download +afresh even when \code{version} is provided.} + +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} \item{filename}{Filename of the dataset, with file extension as shown in Dataverse (for example, if nlsw88.dta was the original but is displayed as the ingested diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 354c566..0266fd7 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -64,6 +64,17 @@ be set as a default via an environment variable. To set a default, run \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")} or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} + \item{\code{version}}{A character specifying a version of the dataset. +This can be of the form \code{"1.1"} or \code{"1"} (where in \code{"x.y"}, x is a major +version and y is an optional minor version), or +\code{":latest"} (the default, the latest published version). +We recommend using the number format so that +the function stores a cache of the data (See \code{\link{cache_dataset}}). +If the user specifies a \code{key} or \code{DATAVERSE_KEY} argument, they can access the +draft version by \code{":draft"} (the current draft) or \code{":latest"} (which will +prioritize the draft over the latest published version. +Finally, set \code{use_cache = "none"} to not read from the cache and re-download +afresh even when \code{version} is provided.} \item{\code{return_url}}{Instead of downloading the file, return the URL for download. Defaults to \code{FALSE}.} }} diff --git a/man/get_dataset.Rd b/man/get_dataset.Rd index cab1c56..d5c35b6 100644 --- a/man/get_dataset.Rd +++ b/man/get_dataset.Rd @@ -11,7 +11,8 @@ get_dataset( version = ":latest", key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), - ... + ..., + use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version)) ) dataset_metadata( @@ -20,7 +21,8 @@ dataset_metadata( block = "citation", key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), - ... + ..., + use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version)) ) dataset_files( @@ -28,7 +30,8 @@ dataset_files( version = ":latest", key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), - ... + ..., + use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version)) ) } \arguments{ @@ -36,7 +39,17 @@ dataset_files( for example \code{"10.70122/FK2/HXJVJU"}. Alternatively, an object of class \dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} -\item{version}{A character string specifying a version of the dataset. This can be one of \dQuote{:draft} (the current draft), \dQuote{:latest} (the latest draft, if it exists, or the latest published version), \dQuote{:latest-published} (the latest published version, ignoring any draft), or \dQuote{x.y} (where \samp{x} is a major version and \samp{y} is a minor version; the \samp{.y} can be omitted to obtain a major version). In lieu of this, a dataset's version-specific identification number can be used for the \code{dataset} argument.} +\item{version}{A character specifying a version of the dataset. +This can be of the form \code{"1.1"} or \code{"1"} (where in \code{"x.y"}, x is a major +version and y is an optional minor version), or +\code{":latest"} (the default, the latest published version). +We recommend using the number format so that +the function stores a cache of the data (See \code{\link{cache_dataset}}). +If the user specifies a \code{key} or \code{DATAVERSE_KEY} argument, they can access the +draft version by \code{":draft"} (the current draft) or \code{":latest"} (which will +prioritize the draft over the latest published version. +Finally, set \code{use_cache = "none"} to not read from the cache and re-download +afresh even when \code{version} is provided.} \item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. @@ -51,9 +64,13 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} + +\item{use_cache}{one of \code{"disk"}, \code{"session"}, or \code{"none"}, describing how datasets are cached to reduce network traffic. See \code{\link{cache_dataset}} for details.} \item{block}{A character string specifying a metadata block to retrieve. By default this is \dQuote{citation}. Other values may be available, depending diff --git a/man/get_dataverse.Rd b/man/get_dataverse.Rd index d984700..7925955 100644 --- a/man/get_dataverse.Rd +++ b/man/get_dataverse.Rd @@ -38,9 +38,11 @@ file (\code{usethis::edit_r_environ()}), with the appropriate domain as its valu \item{check}{A logical indicating whether to check that the value of \code{dataverse} is actually a numeric} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list of class \dQuote{dataverse}. diff --git a/man/get_facets.Rd b/man/get_facets.Rd index 54aadcb..6f86052 100644 --- a/man/get_facets.Rd +++ b/man/get_facets.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. diff --git a/man/get_file_metadata.Rd b/man/get_file_metadata.Rd index 4ec313d..7be3b2c 100644 --- a/man/get_file_metadata.Rd +++ b/man/get_file_metadata.Rd @@ -40,9 +40,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A character vector containing a DDI diff --git a/man/get_user_key.Rd b/man/get_user_key.Rd index aac73cc..be2d2b8 100644 --- a/man/get_user_key.Rd +++ b/man/get_user_key.Rd @@ -13,9 +13,11 @@ get_user_key(user, password, server = Sys.getenv("DATAVERSE_SERVER"), ...) \item{server}{The Dataverse instance. See \code{get_file}.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. diff --git a/man/initiate_sword_dataset.Rd b/man/initiate_sword_dataset.Rd index 13893f0..170fea8 100644 --- a/man/initiate_sword_dataset.Rd +++ b/man/initiate_sword_dataset.Rd @@ -30,9 +30,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ An object of class \dQuote{dataset_atom}. diff --git a/man/list_datasets.Rd b/man/list_datasets.Rd index f8f6c08..bc7cb77 100644 --- a/man/list_datasets.Rd +++ b/man/list_datasets.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. diff --git a/man/publish_dataset.Rd b/man/publish_dataset.Rd index 2b7f27b..0ebd463 100644 --- a/man/publish_dataset.Rd +++ b/man/publish_dataset.Rd @@ -32,9 +32,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. diff --git a/man/publish_dataverse.Rd b/man/publish_dataverse.Rd index fb229ae..2c611c1 100644 --- a/man/publish_dataverse.Rd +++ b/man/publish_dataverse.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. diff --git a/man/publish_sword_dataset.Rd b/man/publish_sword_dataset.Rd index e53e03b..0899f68 100644 --- a/man/publish_sword_dataset.Rd +++ b/man/publish_sword_dataset.Rd @@ -27,9 +27,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list. diff --git a/man/service_document.Rd b/man/service_document.Rd index 3cfac1e..1c48bf8 100644 --- a/man/service_document.Rd +++ b/man/service_document.Rd @@ -24,9 +24,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list of class \dQuote{sword_service_document}, possibly with one or more \dQuote{sword_collection} entries. The latter are SWORD representations of a Dataverse. These can be passed to other SWORD API functions, e.g., for creating a new dataset. diff --git a/man/set_dataverse_metadata.Rd b/man/set_dataverse_metadata.Rd index 6a25f3c..4f85106 100644 --- a/man/set_dataverse_metadata.Rd +++ b/man/set_dataverse_metadata.Rd @@ -33,9 +33,11 @@ be set as a default via an environment variable. To set a default, run or add \code{DATAVERSE_SERVER = "dataverse.harvard.edu"} in one's \code{.Renviron} file (\code{usethis::edit_r_environ()}), with the appropriate domain as its value.} -\item{...}{Additional arguments passed to an HTTP request function, such as -\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or -\code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, +such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}. See \code{\link{use_cache}} for details +on how the \emph{R} dataverse package uses disk and session caches to +improve network performance.} } \value{ A list diff --git a/tests/testthat.R b/tests/testthat.R index a5ae096..ef235f0 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -14,8 +14,11 @@ if (!requireNamespace("yaml", quietly = TRUE)) { config <- yaml::read_yaml(system.file("constants.yml", package = "dataverse")) # config <- yaml::read_yaml("inst/constants.yml") - Sys.setenv("DATAVERSE_SERVER" = config$server) - Sys.setenv("DATAVERSE_KEY" = config$api_token) + Sys.setenv( + DATAVERSE_SERVER = config$server, + DATAVERSE_KEY = config$api_token, + DATAVERSE_USE_CACHE = "none" + ) # To better identify the source of problems, check if the token is expired. # This check *should* be unnecessary on CRAN, since not CRAN tests should diff --git a/tests/testthat/tests-cache.R b/tests/testthat/tests-cache.R new file mode 100644 index 0000000..e66303f --- /dev/null +++ b/tests/testthat/tests-cache.R @@ -0,0 +1,24 @@ +test_that("cache management works", { + expect_true(dir.exists(cache_path())) + + expect_identical(cache_dataset(":latest"), "none") + expect_identical(cache_dataset("1.2"), "disk") + + info <- cache_info() + expect_s3_class(info, "data.frame") + ## subset of column names common across platforms; see ?file.info + ## 'uname' not available in Windows <= R 4.4.1 + colnames <- c("size", "isdir", "mode", "mtime", "ctime", "atime") + expect_true(all(colnames %in% names(info))) +}) + +test_that("'api_get' validates use_cache", { + testthat::skip_on_cran() + + expect_error(get_url_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org", + use_cache = "BAD VALUE" + ), ".*argument 'use_cache' is not correct, see \\?use_cache") +}) diff --git a/vignettes/C-download.Rmd b/vignettes/C-download.Rmd index a5ba31d..cb4beb1 100644 --- a/vignettes/C-download.Rmd +++ b/vignettes/C-download.Rmd @@ -94,6 +94,10 @@ head(energy) The dataverse package can also download datasets that are _drafts_ (i.e. versions not released publicly), as long as the user of the dataset provides their appropriate DATAVERSE_KEY. Users may need to modify the metadata of a datafile, such as adding a descriptive label, for the data downloading to work properly in this case. This is because the the file identifier UNF, which the read function relies on, may only appear after metadata has been added. +## Caching large datasets + +As of v0.3.15, datasets are _cached_ on your computer if the user specifies a version of the dataset. The next time the code is run, the function will read from the cache rather than re-downloading from the Dataverse. Version specification can be done, e.g., by setting `version = "3"` for V3, for instance. This is useful to avoid re-downloading the identical dataset every time, especially if they take some time to download. To turn off or view the settings of caching, see `cache_dataset()`. + ## Retrieving Custom Data Formats (RDS, Stata, SPSS)