Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 112/135 caching memoise api get #137

Merged
merged 13 commits into from
Oct 16, 2024
Merged
12 changes: 10 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: dataverse
Version: 0.3.14
Version: 0.3.15
Title: Client for Dataverse 4+ Repositories
Authors@R:
c(person(given = "Shiro",
Expand Down Expand Up @@ -37,14 +37,22 @@ Authors@R:
family = "Gruber",
role = c("ctb"),
email = "[email protected]",
comment = c(ORCID = "0000-0001-9177-1772")))
comment = c(ORCID = "0000-0001-9177-1772")),
person(given = "Martin",
family = "Morgan",
role = "ctb",
email = "[email protected]",
comment = c(ORCID = "0000-0002-5874-8148")))
Imports:
checkmate,
httr,
memoise,
cachem,
jsonlite,
readr,
stats,
utils,
tools,
xml2
Suggests:
covr,
Expand Down
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ S3method(print,get_file)
S3method(print,sword_service_document)
export(add_dataset_file)
export(add_file)
export(cache_dataset)
export(cache_info)
export(cache_path)
export(cache_reset)
export(create_dataset)
export(create_dataverse)
export(dataset_atom)
Expand Down Expand Up @@ -63,3 +67,8 @@ export(service_document)
export(set_dataverse_metadata)
export(update_dataset)
export(update_dataset_file)
importFrom(cachem,cache_disk)
importFrom(checkmate,assert_string)
importFrom(checkmate,test_string)
importFrom(memoise,memoise)
importFrom(tools,R_user_dir)
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# dataverse

# CHANGES in dataverse 0.3.15

* Implement a cache for API calls (including file download) when dataset version is specified. The functions will reload from the cache automatically the second time. (#112, #135, by @mtmorgan)

# CHANGES in dataverse 0.3.14

* Improve recommendation for rdata loading (#107, #127)
Expand Down
10 changes: 4 additions & 6 deletions R/SWORD.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@
#' @export
service_document <- function(key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) {
u <- paste0(api_url(server, prefix = "dvn/api/"), "data-deposit/v1.1/swordv2/service-document")
r <- httr::GET(u, httr::authenticate(key, ""), ...)
httr::stop_for_status(r)
x <- xml2::as_list(xml2::read_xml(httr::content(r, "text")))
r <- api_get(u, httr::authenticate(key, ""), ...)
x <- xml2::as_list(xml2::read_xml(r))
w <- x$workspace
out <- list()
if ("title" %in% names(w)) {
Expand Down Expand Up @@ -74,11 +73,10 @@ list_datasets <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server =
dataverse <- get_dataverse(dataverse, key = key, server = server, ...)$alias
}
u <- paste0(api_url(server, prefix = "dvn/api/"), "data-deposit/v1.1/swordv2/collection/dataverse/", dataverse)
r <- httr::GET(u, httr::authenticate(key, ""), ...)
httr::stop_for_status(r)
r <- api_get(u, httr::authenticate(key, ""), ..., as = "raw")

# clean up response structure
x <- xml2::as_list(xml2::read_xml(r$content))
x <- xml2::as_list(xml2::read_xml(r))
feed <- x[["feed"]]
out <- list(title = feed[["title"]][[1L]],
generator = feed[["generator"]],
Expand Down
10 changes: 4 additions & 6 deletions R/SWORD_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,9 +205,8 @@ dataset_atom <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sy
u <- paste0(api_url(server, prefix="dvn/api/"), "data-deposit/v1.1/swordv2/edit/study/", dataset)
}

r <- httr::GET(u, httr::authenticate(key, ""), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- parse_atom(rawToChar(r$content))
r <- api_get(u, httr::authenticate(key, ""), ..., as = "raw")
out <- parse_atom(rawToChar(r))
out
}

Expand All @@ -229,7 +228,6 @@ dataset_statement <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server
dataset <- prepend_doi(dataset)
u <- paste0(api_url(server, prefix="dvn/api/"), "data-deposit/v1.1/swordv2/statement/study/", dataset)
}
r <- httr::GET(u, httr::authenticate(key, ""), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
parse_dataset_statement(rawToChar(r$content))
r <- api_get(u, httr::authenticate(key, ""), ..., as = "raw")
parse_dataset_statement(rawToChar(r))
}
93 changes: 93 additions & 0 deletions R/cache.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#' @rdname cache
#' @aliases use_cache
#' @title Utilities for cache management
#' @description The dataverse package uses disk and session caches to improve network performance. Use of the cache is described on this page.
#' @details
#' Use of the cache is determined by the value of the `use_cache =` argument to dataset and other API calls, or by the environment variable `DATAVERSE_USE_CACHE`. Possible values are
#'
#' - `"none"`: do not use the cache. This is the default for datasets that are versioned with `":draft"`, `":latest"`, and `":latest-published"`.
#' - `"session"`: cache API requests for the duration of the *R* session. This is the default for API calls that do not involve file or dataset retrieval.
#' - `"disk": use a permanent disk cache. This is the default for files and explicitly versioned datasets.
#'
#' @template version
#' @details
#' `cache_dataset()` determines whether a dataset or file should be cached based on the version specification.
#' @return
#' `cache_dataset()` returns `"disk"` if the dataset version is to be cached to disk, `"none"` otherwise.
#' @importFrom checkmate assert_string
#' @examples
#' cache_dataset(":latest") # "none"
#' cache_dataset("1.2") # "disk"
#' @export
cache_dataset <- function(version) {
assert_string(version)
if (version %in% c(":draft", ":latest", ":latest-published")) {
"none"
} else {
"disk"
}
}

#' @rdname cache
#' @details
#' `cache_path()` finds or creates the location (directory) on the file system containing the cache.
#'
#' @return
#' `cache_path()` returns the file path to the directory containing the cache.
#'
#' @examples
#' cache_path()
#'
#' @importFrom tools R_user_dir
#' @export
cache_path <- function() {
cache_path <- file.path(R_user_dir("dataverse", "cache"), "api_cache")
if (!dir.exists(cache_path)) {
status <- dir.create(cache_path, recursive = TRUE)
if (!status)
warning("'dataverse' failed to create a 'disk' cache")
}

cache_path
}

#' @rdname cache
#' @details
#' `cache_info()` queries the cache for information about the name, size, and other attributes of files in the cache. The file name is a 'hash' of the function used to retrieve the file; it is not useful for identifying specific files.
#' @return
#' `cache_info()` returns a data.frame containing names and sizes of files in the cache.
#' @examples
#' cache_info()
#' @export
cache_info <- function() {
cache_path <- cache_path()
if (dir.exists(cache_path)) {
files <- dir(cache_path(), full.names = TRUE)
info <- file.info(files)
rownames(info) <- basename(files)
info
}
}

#' @rdname cache
#' @details
#' `cache_reset()` clears all downloaded files from the disk cache.
#' @returns
#' `cache_reset()` returns the path to the (now empty) cache, invisibly)
#' @export
cache_reset <- function() {
cache_path <- cache_path()
if (dir.exists(cache_path))
cache_disk(cache_path)$reset()
invisible(cache_path)
}

## utility to check valid values of `use_cache =`
#' @importFrom checkmate test_string
assert_use_cache <- function(use_cache) {
test <-
test_string(use_cache) &&
use_cache %in% c("disk", "session", "none")
if (!test)
stop("argument 'use_cache' is not correct, see ?use_cache")
}
5 changes: 2 additions & 3 deletions R/dataset_versions.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@
dataset_versions <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) {
dataset <- dataset_id(dataset, key = key, server = server, ...)
u <- paste0(api_url(server), "datasets/", dataset, "/versions")
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- httr::content(r, encoding = "UTF-8")$data
r <- api_get(u, ..., key = key, as = NULL)
out <- r$data
lapply(out, function(x) {
x <- `class<-`(x, "dataverse_dataset_version")
x$files <- lapply(x$files, `class<-`, "dataverse_file")
Expand Down
5 changes: 2 additions & 3 deletions R/dataverse_metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
dataverse_metadata <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) {
dataverse <- dataverse_id(dataverse, key = key, server = server, ...)
u <- paste0(api_url(server), "dataverses/", dataverse, "/metadatablocks")
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE)$data
r <- api_get(u, ..., key = key)
jsonlite::fromJSON(r, simplifyDataFrame = FALSE)$data
}

#' @title Set Dataverse metadata
Expand Down
5 changes: 2 additions & 3 deletions R/dataverse_search.R
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,8 @@ function(...,
u <- paste0(api_url(server), "search")

# execute request
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))
r <- api_get(u, query = query, key = key)
out <- jsonlite::fromJSON(r)
if (isTRUE(verbose)) {
n_total <- ngettext(out$data$total_count, "result", "results")
message(sprintf(paste0("%s of %s ", n_total, " retrieved"), out$data$count_in_response, out$data$total_count))
Expand Down
32 changes: 16 additions & 16 deletions R/get_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' @template version
#' @template envvars
#' @template dots
#' @param use_cache one of `"disk"`, `"session"`, or `"none"`, describing how datasets are cached to reduce network traffic. See \code{\link{cache_dataset}} for details.
#' @return A list of class \dQuote{dataverse_dataset} or a list of a form dependent
#' on the specific metadata block retrieved. \code{dataset_files} returns a list of
#' objects of class \dQuote{dataverse_file}.
Expand All @@ -45,17 +46,17 @@ get_dataset <- function(
version = ":latest",
key = Sys.getenv("DATAVERSE_KEY"),
server = Sys.getenv("DATAVERSE_SERVER"),
...
...,
use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version))
) {
dataset <- dataset_id(dataset, key = key, server = server, ...)
dataset <- dataset_id(dataset, key = key, server = server, ..., use_cache = use_cache)
if (!is.null(version)) {
u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version)
} else {
u <- paste0(api_url(server), "datasets/", dataset)
}
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
parse_dataset(httr::content(r, as = "text", encoding = "UTF-8"))
r <- api_get(u, ..., key = key, use_cache = use_cache)
parse_dataset(r)
}

#' @rdname get_dataset
Expand All @@ -70,19 +71,18 @@ dataset_metadata <- function(
block = "citation",
key = Sys.getenv("DATAVERSE_KEY"),
server = Sys.getenv("DATAVERSE_SERVER"),
...
...,
use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version))
) {
dataset <- dataset_id(dataset, key = key, server = server, ...)
dataset <- dataset_id(dataset, key = key, server = server, ..., use_cache = use_cache)
if (!is.null(block)) {
u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version, "/metadata/", block)
} else {
u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version, "/metadata")
}

r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- httr::content(r, as = "text", encoding = "UTF-8")
jsonlite::fromJSON(out)[["data"]]
r <- api_get(u, ..., key = key, use_cache = use_cache)
jsonlite::fromJSON(r)[["data"]]
}

#' @rdname get_dataset
Expand All @@ -92,12 +92,12 @@ dataset_files <- function(
version = ":latest",
key = Sys.getenv("DATAVERSE_KEY"),
server = Sys.getenv("DATAVERSE_SERVER"),
...
...,
use_cache = Sys.getenv("DATAVERSE_USE_CACHE", cache_dataset(version))
) {
dataset <- dataset_id(dataset, key = key, server = server, ...)
dataset <- dataset_id(dataset, key = key, server = server, ..., use_cache = use_cache)
u <- paste0(api_url(server), "datasets/", dataset, "/versions/", version, "/files")
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE)$data
r <- api_get(u, ..., key = key, use_cache = use_cache)
out <- jsonlite::fromJSON(r, simplifyDataFrame = FALSE)$data
structure(lapply(out, `class<-`, "dataverse_file"))
}
10 changes: 4 additions & 6 deletions R/get_dataverse.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ get_dataverse <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server =
dataverse <- dataverse_id(dataverse, key = key, server = server, ...)
}
u <- paste0(api_url(server), "dataverses/", dataverse)
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))
r <- api_get(u, ..., key = key)
out <- jsonlite::fromJSON(r)
structure(out$data, class = "dataverse")
}

Expand All @@ -42,9 +41,8 @@ get_dataverse <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server =
dataverse_contents <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) {
dataverse <- dataverse_id(dataverse, key = key, server = server, ...)
u <- paste0(api_url(server), "dataverses/", dataverse, "/contents")
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"), simplifyDataFrame = FALSE)
r <- api_get(u, ..., key = key)
out <- jsonlite::fromJSON(r, simplifyDataFrame = FALSE)
structure(lapply(out$data, function(x) {
`class<-`(x, if (x$type == "dataset") "dataverse_dataset" else "dataverse")
}))
Expand Down
5 changes: 2 additions & 3 deletions R/get_facets.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
get_facets <- function(dataverse, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) {
dataverse <- dataverse_id(dataverse, key = key, server = server, ...)
u <- paste0(api_url(server), "dataverses/", dataverse, "/facets")
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
jsonlite::fromJSON(httr::content(r, as = "text", encoding = "UTF-8"))$data
r <- api_get(u, ..., key = key)
jsonlite::fromJSON(r)$data
}
2 changes: 2 additions & 0 deletions R/get_file.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#' @template envvars
#' @template dots
#' @template ds
#' @template version
#'
#' @return \code{get_file} returns a raw vector (or list of raw vectors,
#' if \code{length(file) > 1}), which can be saved locally with the `writeBin`
Expand Down Expand Up @@ -89,6 +90,7 @@ get_file <- function(
key = Sys.getenv("DATAVERSE_KEY"),
server = Sys.getenv("DATAVERSE_SERVER"),
original = TRUE,
version = ":latest",
...
) {

Expand Down
14 changes: 4 additions & 10 deletions R/get_file_by_id.R
Original file line number Diff line number Diff line change
Expand Up @@ -98,16 +98,10 @@ get_file_by_id <- function(
if (return_url) {
return(httr::modify_url(u, query = query))
}
if (isFALSE(progress))
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...)

if (isTRUE(progress))
r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, httr::progress(type = "down"), ...)



httr::stop_for_status(r, task = httr::content(r)$message)
httr::content(r, as = "raw")
# add a progress bar; 'NULL' if progress is not TRUE. 'NULL' arguments
# are not seen by httr::GET()
progress_bar <- if (isTRUE(progress)) httr::progress(type = "down")
api_get(u, query = query, progress_bar, ..., key = key, as = "raw")
}

#' @rdname files
Expand Down
5 changes: 1 addition & 4 deletions R/get_file_metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,5 @@ get_file_metadata <-
u <- paste0(api_url(server), "access/datafile/", file, "/metadata/", format)
}

r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...)
httr::stop_for_status(r, task = httr::content(r)$message)
out <- httr::content(r, as = "text", encoding = "UTF-8")
return(out)
api_get(u, ..., key = key)
}
Loading
Loading