From 62d604ae12c8636a477c0b96418a7a6af173cdea Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 15:34:03 -0500 Subject: [PATCH 01/75] Initial crack at separating get_file_by_id into separate function --- R/get_file.R | 99 ++++++++++++++++++++++------------------------ R/get_file_by_id.R | 49 +++++++++++++++++++++++ 2 files changed, 97 insertions(+), 51 deletions(-) create mode 100644 R/get_file_by_id.R diff --git a/R/get_file.R b/R/get_file.R index 2934a02..a445cb8 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -1,14 +1,29 @@ #' @rdname files +#' +#' #' @title Download File(s) -#' @description Download Dataverse File(s) +#' @description Download Dataverse File(s). `get_file` internally calls +#' `get_file_by_id`. +#' #' @details This function provides access to data files from a Dataverse entry. -#' @param file An integer specifying a file identifier; or a vector of integers specifying file identifiers; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. -#' @template ds -#' @param format A character string specifying a file format. For \code{get_file}: by default, this is \dQuote{original} (the original file format). If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a compressed directory containing a bundle of file formats is returned. For \code{get_file_metadata}, this is \dQuote{ddi}. -#' @param vars A character vector specifying one or more variable names, used to extract a subset of the data. -#' @template envvars -#' @template dots -#' @return \code{get_file_metadata} returns a character vector containing a DDI metadata file. \code{get_file} returns a raw vector (or list of raw vectors, if \code{length(file) > 1}). 
+#' @param file An integer specifying a file identifier; or a vector of integers +#' specifying file identifiers; or, if \code{doi} is specified, a character string +#' specifying a file name within the DOI-identified dataset; or an object of +#' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. +#' @param fileid A numeric ID internally used for `get_file_by_id` +#' @param format A character string specifying a file format. For \code{get_file}: +#' by default, this is \dQuote{original} (the original file format). If \dQuote{RData} +#' or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a +#' compressed directory containing a bundle of file formats is returned. For +#' \code{get_file_metadata}, this is \dQuote{ddi}. +#' @param vars A character vector specifying one or more variable names, used to +#' extract a subset of the data. +#' +#' @return \code{get_file_metadata} returns a character vector containing a DDI +#' metadata file. \code{get_file} returns a raw vector (or list of raw vectors, +#' if \code{length(file) > 1}). +#' +#' #' @examples #' \dontrun{ #' # download file from: @@ -47,16 +62,15 @@ #' } #' @importFrom utils unzip #' @export -get_file <- function( - file, - dataset = NULL, - format = c("original", "RData", "prep", "bundle"), - # thumb = TRUE, - vars = NULL, - key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), - ... -) { +get_file <- + function(file, + dataset = NULL, + format = c("original", "RData", "prep", "bundle"), + # thumb = TRUE, + vars = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + ...) 
{ format <- match.arg(format) # single file ID @@ -100,32 +114,16 @@ get_file <- function( # downloading files sequentially and add the raw vectors to a list out <- vector("list", length(fileid)) for (i in 1:length(fileid)) { - if (format == "bundle") { - u <- paste0(api_url(server), "access/datafile/bundle/", fileid[i]) - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - } - if (format != "bundle") { - u <- paste0(api_url(server), "access/datafile/", fileid[i]) - query <- list() - if (!is.null(vars)) { - query$vars <- paste0(vars, collapse = ",") - } - if (!is.null(format)) { - query$format <- match.arg(format) - } - - # request single file in non-bundle format ---- - # add query if ingesting a tab (detect from original file name) - if (length(query) == 1 & grepl("\\.tab$", file[i])) { - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) - } else { - # do not add query if not an ingestion file - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - } - } - httr::stop_for_status(r) - out[[i]] <- httr::content(r, as = "raw") + out[[i]] <- get_file_by_id( + fileid = fileid[i], + dataset, + format, + vars, + keys, + server + ) } + # return the raw vector if there's a single file if (length(out) == 1) { return (out[[1]]) @@ -143,14 +141,13 @@ get_file_name_from_header <- function(x) { #' @rdname files #' @import xml2 #' @export -get_file_metadata <- function( - file, - dataset = NULL, - format = c("ddi", "preprocessed"), - key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), - ... -) { +get_file_metadata <- + function(file, + dataset = NULL, + format = c("ddi", "preprocessed"), + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + ...) 
{ # get file ID from doi if (!is.numeric(file)) { if (inherits(file, "dataverse_file")) { diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R new file mode 100644 index 0000000..d537a09 --- /dev/null +++ b/R/get_file_by_id.R @@ -0,0 +1,49 @@ +#' @title Download Single File by dataverse ID +#' +#' @rdname files +#' +#' @export +get_file_by_id <- + function(fileid, + dataset = NULL, + format = c("original", "RData", "prep", "bundle"), + # thumb = TRUE, + vars = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + ...) { + format <- match.arg(format) + + # single file ID + stopifnot (is.numeric(fileid)) + stopifnot (length(fileid) == 1) + + + # downloading files sequentially and add the raw vectors to a list + out <- vector("list", length(fileid)) + + # create query ----- + u <- paste0(api_url(server), "access/datafile/", fileid[i]) + query <- list() + if (!is.null(vars)) { + query$vars <- paste0(vars, collapse = ",") + } + if (!is.null(format)) { + query$format <- match.arg(format) + } + + # request single file in non-bundle format ---- + # add query if ingesting a tab (detect from original file name) + if (length(query) == 1 & grepl("\\.tab$", file[i])) { + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) + } else { + # do not add query if not an ingestion file + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) 
+ } + + httr::stop_for_status(r) + out <- httr::content(r, as = "raw") + + return (out) + + } From 59ae459d49486cb79cc299017986e1b4071d1183 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 15:38:04 -0500 Subject: [PATCH 02/75] Separate out metadata from field --- R/get_file.R | 38 ++++---------------------------------- R/get_file_metadata.R | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 34 deletions(-) create mode 100644 R/get_file_metadata.R diff --git a/R/get_file.R b/R/get_file.R index a445cb8..cea0778 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -14,13 +14,11 @@ #' @param format A character string specifying a file format. For \code{get_file}: #' by default, this is \dQuote{original} (the original file format). If \dQuote{RData} #' or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a -#' compressed directory containing a bundle of file formats is returned. For -#' \code{get_file_metadata}, this is \dQuote{ddi}. +#' compressed directory containing a bundle of file formats is returned. #' @param vars A character vector specifying one or more variable names, used to #' extract a subset of the data. #' -#' @return \code{get_file_metadata} returns a character vector containing a DDI -#' metadata file. \code{get_file} returns a raw vector (or list of raw vectors, +#' @return \code{get_file} returns a raw vector (or list of raw vectors, #' if \code{length(file) > 1}). #' #' @@ -134,34 +132,6 @@ get_file <- } } -get_file_name_from_header <- function(x) { - gsub("\"", "", strsplit(httr::headers(x)[["content-type"]], "name=")[[1]][2]) -} -#' @rdname files -#' @import xml2 -#' @export -get_file_metadata <- - function(file, - dataset = NULL, - format = c("ddi", "preprocessed"), - key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), - ...) 
{ - # get file ID from doi - if (!is.numeric(file)) { - if (inherits(file, "dataverse_file")) { - file <- get_fileid(file) - } else if (is.null(dataset)) { - stop("When 'file' is a character string, dataset must be specified. Or, use a global fileid instead.") - } else { - file <- get_fileid(dataset, file, key = key, server = server, ...) - } - } - format <- match.arg(format) - u <- paste0(api_url(server), "access/datafile/", file, "/metadata/", format) - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - httr::stop_for_status(r) - out <- httr::content(r, as = "text", encoding = "UTF-8") - return(out) - } + + diff --git a/R/get_file_metadata.R b/R/get_file_metadata.R new file mode 100644 index 0000000..1cbf477 --- /dev/null +++ b/R/get_file_metadata.R @@ -0,0 +1,40 @@ +#' Retrieve a ddi metadata file +#' +#' +#' @param format Defaults to \dQuote{ddi} for metadata files +#' @inheritParams get_file +#' @return A character vector containing a DDI +#' metadata file. +#' +#' @import xml2 +#' @export +get_file_metadata <- + function(file, + dataset = NULL, + format = c("ddi", "preprocessed"), + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + ...) { + # get file ID from doi + if (!is.numeric(file)) { + if (inherits(file, "dataverse_file")) { + file <- get_fileid(file) + } else if (is.null(dataset)) { + stop("When 'file' is a character string, dataset must be specified. Or, use a global fileid instead.") + } else { + file <- get_fileid(dataset, file, key = key, server = server, ...) + } + } + format <- match.arg(format) + u <- paste0(api_url(server), "access/datafile/", file, "/metadata/", format) + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) 
+ httr::stop_for_status(r) + out <- httr::content(r, as = "text", encoding = "UTF-8") + return(out) + } + + + +get_file_name_from_header <- function(x) { + gsub("\"", "", strsplit(httr::headers(x)[["content-type"]], "name=")[[1]][2]) +} From 0fd2acfc4a01667c0e4d4cd148ef2e6ed9c638d4 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 15:38:38 -0500 Subject: [PATCH 03/75] Remove file_name_from_header which never gets used. --- R/get_file_metadata.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/R/get_file_metadata.R b/R/get_file_metadata.R index 1cbf477..d5f7cec 100644 --- a/R/get_file_metadata.R +++ b/R/get_file_metadata.R @@ -33,8 +33,3 @@ get_file_metadata <- return(out) } - - -get_file_name_from_header <- function(x) { - gsub("\"", "", strsplit(httr::headers(x)[["content-type"]], "name=")[[1]][2]) -} From 96d19b406a1feede7c6897a2785769d2bce82704 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 15:46:21 -0500 Subject: [PATCH 04/75] Reflow and add Havard example --- man-roxygen/dots.R | 4 +++- man-roxygen/envvars.R | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/man-roxygen/dots.R b/man-roxygen/dots.R index bd3f491..cf74d0d 100644 --- a/man-roxygen/dots.R +++ b/man-roxygen/dots.R @@ -1 +1,3 @@ -#' @param ... Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}. +#' @param ... Additional arguments passed to an HTTP request function, such as +#' \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +#' \code{\link[httr]{DELETE}}. diff --git a/man-roxygen/envvars.R b/man-roxygen/envvars.R index 7526c1f..35579b9 100644 --- a/man-roxygen/envvars.R +++ b/man-roxygen/envvars.R @@ -1,2 +1,8 @@ -#' @param key A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. 
Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}. -#' @param server A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}. +#' @param key A character string specifying a Dataverse server API key. If one +#' is not specified, functions calling authenticated API endpoints will fail. +#' Keys can be specified atomically or globally using +#' \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}. +#' @param server A character string specifying a Dataverse server. There are +#' multiple Dataverse installations, but the defaults is to use the Harvard +#' Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +#' or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}. From c069b7ba97fb309170464ab6e15a3a53179c49d4 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 15:46:32 -0500 Subject: [PATCH 05/75] Add back param def --- R/get_file.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/get_file.R b/R/get_file.R index cea0778..71ef41c 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -18,6 +18,9 @@ #' @param vars A character vector specifying one or more variable names, used to #' extract a subset of the data. #' +#' @template envvars +#' @template dots +#' #' @return \code{get_file} returns a raw vector (or list of raw vectors, #' if \code{length(file) > 1}). #' From b46174244df3a4016807fc8988d2809d706e9feb Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 15:59:01 -0500 Subject: [PATCH 06/75] Remove comment and simplify loops. 
Do not use nested loops --- R/get_file.R | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index 71ef41c..e92f6b2 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -72,6 +72,7 @@ get_file <- key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { + format <- match.arg(format) # single file ID @@ -79,49 +80,28 @@ get_file <- fileid <- file # get file ID from 'dataset' + if (!is.numeric(file) & is.null(dataset)) + stop("When 'file' is a character (non-global ID), dataset must be specified.") + if (!is.numeric(file)) { if (inherits(file, "dataverse_file")) { fileid <- get_fileid(file, key = key, server = server) - } else if (is.null(dataset)) { - stop("When 'file' is a character string, dataset must be specified. Or, use a global fileid instead.") } else { fileid <- get_fileid(dataset, file, key = key, server = server, ...) } - } else { - fileid <- file } - # # request multiple files ----- - # if (length(fileid) > 1) { - # fileid <- paste0(fileid, collapse = ",") - # u <- paste0(api_url(server), "access/datafiles/", fileid) - # r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - # httr::stop_for_status(r) - # tempf <- tempfile(fileext = ".zip") - # tempd <- tempfile() - # dir.create(tempd) - # on.exit(unlink(tempf), add = TRUE) - # on.exit(unlink(tempd), add = TRUE) - # writeBin(httr::content(r, as = "raw"), tempf) - # to_extract <- utils::unzip(tempf, list = TRUE) - # out <- lapply(to_extract$Name[to_extract$Name != "MANIFEST.TXT"], function(zipf) { - # utils::unzip(zipfile = tempf, files = zipf, exdir = tempd) - # readBin(file.path(tempd, zipf), "raw", n = 1e8) - # }) - # return(out) - # } - - # downloading files sequentially and add the raw vectors to a list + # Main function. 
CAll get_file_by_id out <- vector("list", length(fileid)) for (i in 1:length(fileid)) { out[[i]] <- get_file_by_id( fileid = fileid[i], - dataset, - format, - vars, - keys, - server + dataset = dataset, + format = format, + vars = vars, + key = key, + server = server ) } From 79ee71d61cedd8785d4cd71c88d99a043a296a8c Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 16:01:32 -0500 Subject: [PATCH 07/75] Undo nested loops, but note that in future this should be cleaned and just rely on the methods of get_fileid in utils.R --- R/get_file.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index e92f6b2..d0cada1 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -79,17 +79,13 @@ get_file <- if (is.numeric(file)) fileid <- file - # get file ID from 'dataset' + # get file ID from 'dataset'. Streamline in feature relying on get_fileid if (!is.numeric(file) & is.null(dataset)) stop("When 'file' is a character (non-global ID), dataset must be specified.") - - if (!is.numeric(file)) { - if (inherits(file, "dataverse_file")) { - fileid <- get_fileid(file, key = key, server = server) - } else { - fileid <- get_fileid(dataset, file, key = key, server = server, ...) - } - } + if (!is.numeric(file) & inherits(file, "dataverse_file")) + fileid <- get_fileid(file, key = key, server = server) + if (!is.numeric(file) & !inherits(file, "dataverse_file")) + fileid <- get_fileid(dataset, file, key = key, server = server, ...) # Main function. 
CAll get_file_by_id From 5ea71bf37d39429cb396319d090ff9b7ff38afe9 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 16:04:40 -0500 Subject: [PATCH 08/75] streamline example --- R/get_file.R | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index d0cada1..4f5d1bd 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -28,19 +28,14 @@ #' @examples #' \dontrun{ #' # download file from: -#' # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ARKOTI -#' monogan <- get_dataverse("monogan") -#' monogan_data <- dataverse_contents(monogan) +#' # https://doi.org/10.7910/DVN/ARKOTI +#' #' d1 <- get_dataset("doi:10.7910/DVN/ARKOTI") #' f <- get_file(d1$files$datafile$id[3]) -#' -#' # check file metadata -#' m1 <- get_file_metadata("constructionData.tab", "doi:10.7910/DVN/ARKOTI") -#' m2 <- get_file_metadata(2437257) +#' f2 <- get_file(2692202) #' #' # retrieve file based on DOI and filename #' f2 <- get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI") -#' f2 <- get_file(2692202) #' #' # retrieve file based on "dataverse_file" object #' flist <- dataset_files(2692151) @@ -49,6 +44,8 @@ #' # retrieve all files in a dataset in their original format (returns a list of raw vectors) #' file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K")[['files']]$id #' f3 <- get_file(file_ids, format = "original") +#' +#' #' # read file as data.frame #' if (require("rio")) { #' tmp <- tempfile(fileext = ".dta") @@ -88,7 +85,7 @@ get_file <- fileid <- get_fileid(dataset, file, key = key, server = server, ...) - # Main function. CAll get_file_by_id + # Main function. Call get_file_by_id out <- vector("list", length(fileid)) for (i in 1:length(fileid)) { out[[i]] <- get_file_by_id( From e4d0143937714dc8cfd65d60388a1b208826b6ba Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 16:21:40 -0500 Subject: [PATCH 09/75] Remove condition but this is a problem. 
get_file_id no longer has file --- R/get_file_by_id.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index d537a09..f96004e 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -23,7 +23,7 @@ get_file_by_id <- out <- vector("list", length(fileid)) # create query ----- - u <- paste0(api_url(server), "access/datafile/", fileid[i]) + u <- paste0(api_url(server), "access/datafile/", fileid) query <- list() if (!is.null(vars)) { query$vars <- paste0(vars, collapse = ",") @@ -34,12 +34,12 @@ get_file_by_id <- # request single file in non-bundle format ---- # add query if ingesting a tab (detect from original file name) - if (length(query) == 1 & grepl("\\.tab$", file[i])) { - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) - } else { + # if (length(query) == 1 & grepl("\\.tab$", file[i])) { + # r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) + # } else { # do not add query if not an ingestion file r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) 
- } + # } httr::stop_for_status(r) out <- httr::content(r, as = "raw") From 812e414b4dc8dc71736ef0e2e274c13c8aa228e4 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 16:22:17 -0500 Subject: [PATCH 10/75] First version of #48, #35 --- R/get_file_as_dataframe.R | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 R/get_file_as_dataframe.R diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R new file mode 100644 index 0000000..5a48815 --- /dev/null +++ b/R/get_file_as_dataframe.R @@ -0,0 +1,37 @@ +#' Get file from dataverse and convert it into a dataframe or tibble +#' +#' +#' @param file to be passed on to get_file +#' @param dataset to be passed on to get_file +#' @param read_function If supplied a function object, this will write the +#' raw file to a tempfile and read it back in with the supplied function. This +#' is useful when you want to start working with the data right away in the R +#' environment +#' @inheritDotParams get_file +#' +#' @examples +#' gap_df <- get_dataframe_by_name( +#' file = "gapminder-FiveYearData.tab", +#' dataset = "doi:10.7910/DVN/GJQNEQ", +#' server = "dataverse.harvard.edu", +#' read_function = readr::read_tsv) +#' +#' @export +get_dataframe_by_name <- function(file, + dataset = NULL, + read_function = NULL, + ...) { + + raw_file <- get_file(file = file, dataset = dataset, ...) 
+ + # default of get_file + if (is.null(read_function)) + return(raw_file) + + # save to temp and then read it in with supplied function + if (!is.null(read_function)) { + tmp <- tempfile(file, fileext = stringr::str_extract(file, "\\.[A-z]+$")) + writeBin(raw_file, tmp) + return(do.call(read_function, list(tmp))) + } +} From bf9dc33ead989067b6e89f1d060d19a59cdc5273 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 16:23:44 -0500 Subject: [PATCH 11/75] Main Rd changes due to additions in data import --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ R/get_file.R | 1 + man/get_dataframe_by_name.Rd | 48 ++++++++++++++++++++++++++++++++++++ man/get_file_metadata.Rd | 44 +++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 man/get_dataframe_by_name.Rd create mode 100644 man/get_file_metadata.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 492a3c2..5ac1516 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -60,4 +60,4 @@ URL: https://github.com/iqss/dataverse-client-r BugReports: https://github.com/iqss/dataverse-client-r/issues VignetteBuilder: knitr Encoding: UTF-8 -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 diff --git a/NAMESPACE b/NAMESPACE index eb14124..4aae134 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,10 +27,12 @@ export(delete_dataset) export(delete_dataverse) export(delete_file) export(delete_sword_dataset) +export(get_dataframe_by_name) export(get_dataset) export(get_dataverse) export(get_facets) export(get_file) +export(get_file_by_id) export(get_file_metadata) export(get_user_key) export(initiate_sword_dataset) diff --git a/R/get_file.R b/R/get_file.R index 4f5d1bd..fa14e24 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -87,6 +87,7 @@ get_file <- # Main function. 
Call get_file_by_id out <- vector("list", length(fileid)) + for (i in 1:length(fileid)) { out[[i]] <- get_file_by_id( fileid = fileid[i], diff --git a/man/get_dataframe_by_name.Rd b/man/get_dataframe_by_name.Rd new file mode 100644 index 0000000..6570c44 --- /dev/null +++ b/man/get_dataframe_by_name.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_file_as_dataframe.R +\name{get_dataframe_by_name} +\alias{get_dataframe_by_name} +\title{Get file from dataverse and convert it into a dataframe or tibble} +\usage{ +get_dataframe_by_name(file, dataset = NULL, read_function = NULL, ...) +} +\arguments{ +\item{file}{to be passed on to get_file} + +\item{dataset}{to be passed on to get_file} + +\item{read_function}{If supplied a function object, this will write the +raw file to a tempfile and read it back in with the supplied function. This +is useful when you want to start working with the data right away in the R +environment} + +\item{...}{ + Arguments passed on to \code{\link[=get_file]{get_file}} + \describe{ + \item{\code{format}}{A character string specifying a file format. For \code{get_file}: +by default, this is \dQuote{original} (the original file format). If \dQuote{RData} +or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a +compressed directory containing a bundle of file formats is returned.} + \item{\code{vars}}{A character vector specifying one or more variable names, used to +extract a subset of the data.} + \item{\code{key}}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} + \item{\code{server}}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} + }} +} +\description{ +Get file from dataverse and convert it into a dataframe or tibble +} +\examples{ +gap_df <- get_dataframe_by_name( + file = "gapminder-FiveYearData.tab", + dataset = "doi:10.7910/DVN/GJQNEQ", + server = "dataverse.harvard.edu", + read_function = readr::read_tsv) + +} diff --git a/man/get_file_metadata.Rd b/man/get_file_metadata.Rd new file mode 100644 index 0000000..044b405 --- /dev/null +++ b/man/get_file_metadata.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_file_metadata.R +\name{get_file_metadata} +\alias{get_file_metadata} +\title{Retrieve a ddi metadata file} +\usage{ +get_file_metadata( + file, + dataset = NULL, + format = c("ddi", "preprocessed"), + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + ... +) +} +\arguments{ +\item{file}{An integer specifying a file identifier; or a vector of integers +specifying file identifiers; or, if \code{doi} is specified, a character string +specifying a file name within the DOI-identified dataset; or an object of + class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} + +\item{format}{Defaults to \dQuote{ddi} for metadata files} + +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} + +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} + +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} +} +\value{ +A character vector containing a DDI + metadata file. +} +\description{ +Retrieve a ddi metadata file +} From 52fc2e0789da071370c341f545746eb2fb3becb1 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 16:24:13 -0500 Subject: [PATCH 12/75] Downstream from changing man-roxygen --- man/add_dataset_file.Rd | 14 +++++++-- man/add_file.Rd | 14 +++++++-- man/create_dataset.Rd | 14 +++++++-- man/create_dataverse.Rd | 14 +++++++-- man/dataset_atom.Rd | 14 +++++++-- man/dataset_versions.Rd | 14 +++++++-- man/dataverse_metadata.Rd | 14 +++++++-- man/dataverse_search.Rd | 12 +++++-- man/delete_dataset.Rd | 14 +++++++-- man/delete_dataverse.Rd | 14 +++++++-- man/delete_file.Rd | 14 +++++++-- man/delete_sword_dataset.Rd | 14 +++++++-- man/files.Rd | 59 ++++++++++++++++++++++------------- man/get_dataset.Rd | 14 +++++++-- man/get_dataverse.Rd | 14 +++++++-- man/get_facets.Rd | 14 +++++++-- man/get_user_key.Rd | 4 ++- man/initiate_sword_dataset.Rd | 14 +++++++-- man/list_datasets.Rd | 14 +++++++-- man/publish_dataset.Rd | 14 +++++++-- man/publish_dataverse.Rd | 14 +++++++-- man/publish_sword_dataset.Rd | 14 +++++++-- man/service_document.Rd | 14 +++++++-- man/set_dataverse_metadata.Rd | 14 +++++++-- 24 files changed, 280 insertions(+), 89 deletions(-) diff --git a/man/add_dataset_file.Rd b/man/add_dataset_file.Rd index faa1db7..b4fff97 100644 --- a/man/add_dataset_file.Rd +++ b/man/add_dataset_file.Rd @@ -32,11 +32,19 @@ update_dataset_file( \item{description}{Optionally, a character string providing a description of the file.} -\item{key}{A character string specifying a Dataverse server API key. 
If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} \item{id}{An integer specifying a file identifier; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} diff --git a/man/add_file.Rd b/man/add_file.Rd index e1db2ff..f65268e 100644 --- a/man/add_file.Rd +++ b/man/add_file.Rd @@ -17,11 +17,19 @@ add_file( \item{file}{A character vector of file names, a data.frame, or a list of R objects.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. 
Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ An object of class \dQuote{dataset_atom}. diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd index c27eb4f..bc83cf5 100644 --- a/man/create_dataset.Rd +++ b/man/create_dataset.Rd @@ -26,11 +26,19 @@ update_dataset( \item{body}{A list describing the dataset.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. 
+Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} \item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} } diff --git a/man/create_dataverse.Rd b/man/create_dataverse.Rd index 1c71ae9..36d5a68 100644 --- a/man/create_dataverse.Rd +++ b/man/create_dataverse.Rd @@ -14,11 +14,19 @@ create_dataverse( \arguments{ \item{dataverse}{A character string specifying a Dataverse name or an object of class \dQuote{dataverse}. If missing, a top-level Dataverse is created.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. 
Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. diff --git a/man/dataset_atom.Rd b/man/dataset_atom.Rd index b13305a..d4cf4c7 100644 --- a/man/dataset_atom.Rd +++ b/man/dataset_atom.Rd @@ -22,11 +22,19 @@ dataset_statement( \arguments{ \item{dataset}{A dataset DOI (or other persistent identifier), an object of class \dQuote{dataset_atom} or \dQuote{dataset_statement}, or an appropriate and complete SWORD URL.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. 
If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. For \code{dataset_atom}, an object of class \dQuote{dataset_atom}. diff --git a/man/dataset_versions.Rd b/man/dataset_versions.Rd index e1ea475..f573ec1 100644 --- a/man/dataset_versions.Rd +++ b/man/dataset_versions.Rd @@ -14,11 +14,19 @@ dataset_versions( \arguments{ \item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. 
Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list of class \dQuote{dataverse_dataset_version}. diff --git a/man/dataverse_metadata.Rd b/man/dataverse_metadata.Rd index 115dc0d..11fa71e 100644 --- a/man/dataverse_metadata.Rd +++ b/man/dataverse_metadata.Rd @@ -14,11 +14,19 @@ dataverse_metadata( \arguments{ \item{dataverse}{A character string specifying a Dataverse name or an object of class \dQuote{dataverse}.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. 
If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list diff --git a/man/dataverse_search.Rd b/man/dataverse_search.Rd index 9030076..622cb2a 100644 --- a/man/dataverse_search.Rd +++ b/man/dataverse_search.Rd @@ -42,9 +42,15 @@ dataverse_search( \item{fq}{See API documentation.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} - -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{key}{A character string specifying a Dataverse server API key. 
If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} + +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{verbose}{A logical indicating whether to display information about the search query (default is \code{TRUE}).} diff --git a/man/delete_dataset.Rd b/man/delete_dataset.Rd index 20d5721..4357c75 100644 --- a/man/delete_dataset.Rd +++ b/man/delete_dataset.Rd @@ -14,11 +14,19 @@ delete_dataset( \arguments{ \item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. 
There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A logical. diff --git a/man/delete_dataverse.Rd b/man/delete_dataverse.Rd index 8421dbd..95c1087 100644 --- a/man/delete_dataverse.Rd +++ b/man/delete_dataverse.Rd @@ -14,11 +14,19 @@ delete_dataverse( \arguments{ \item{dataverse}{A character string specifying a Dataverse name or an object of class \dQuote{dataverse}.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A logical. diff --git a/man/delete_file.Rd b/man/delete_file.Rd index 7ca4631..6cae555 100644 --- a/man/delete_file.Rd +++ b/man/delete_file.Rd @@ -14,11 +14,19 @@ delete_file( \arguments{ \item{id}{A file ID, possibly returned by \code{\link{add_file}}, or a complete \dQuote{edit-media/file} URL.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ If successful, a logical \code{TRUE}, else possibly some information. diff --git a/man/delete_sword_dataset.Rd b/man/delete_sword_dataset.Rd index 0e6facc..a59582a 100644 --- a/man/delete_sword_dataset.Rd +++ b/man/delete_sword_dataset.Rd @@ -14,11 +14,19 @@ delete_sword_dataset( \arguments{ \item{dataset}{A dataset DOI (or other persistent identifier).} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ If successful, a logical \code{TRUE}, else possibly some information. diff --git a/man/files.Rd b/man/files.Rd index 941bd21..26c9b04 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_file.R +% Please edit documentation in R/get_file.R, R/get_file_by_id.R \name{get_file} \alias{get_file} -\alias{get_file_metadata} +\alias{get_file_by_id} \title{Download File(s)} \usage{ get_file( @@ -15,35 +15,53 @@ get_file( ... ) -get_file_metadata( - file, +get_file_by_id( + fileid, dataset = NULL, - format = c("ddi", "preprocessed"), + format = c("original", "RData", "prep", "bundle"), + vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ... ) } \arguments{ -\item{file}{An integer specifying a file identifier; or a vector of integers specifying file identifiers; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} +\item{file}{An integer specifying a file identifier; or a vector of integers +specifying file identifiers; or, if \code{doi} is specified, a character string +specifying a file name within the DOI-identified dataset; or an object of + class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} -\item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. 
The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} +\item{format}{A character string specifying a file format. For \code{get_file}: +by default, this is \dQuote{original} (the original file format). If \dQuote{RData} +or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a +compressed directory containing a bundle of file formats is returned.} -\item{format}{A character string specifying a file format. For \code{get_file}: by default, this is \dQuote{original} (the original file format). If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a compressed directory containing a bundle of file formats is returned. For \code{get_file_metadata}, this is \dQuote{ddi}.} +\item{vars}{A character vector specifying one or more variable names, used to +extract a subset of the data.} -\item{vars}{A character vector specifying one or more variable names, used to extract a subset of the data.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{server}{A character string specifying a Dataverse server. 
There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{fileid}{A numeric ID internally used for `get_file_by_id`} } \value{ -\code{get_file_metadata} returns a character vector containing a DDI metadata file. \code{get_file} returns a raw vector (or list of raw vectors, if \code{length(file) > 1}). +\code{get_file} returns a raw vector (or list of raw vectors, + if \code{length(file) > 1}). } \description{ -Download Dataverse File(s) +Download Dataverse File(s). `get_file` internally calls + `get_file_by_id`. } \details{ This function provides access to data files from a Dataverse entry. @@ -51,19 +69,14 @@ This function provides access to data files from a Dataverse entry. 
\examples{ \dontrun{ # download file from: -# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ARKOTI -monogan <- get_dataverse("monogan") -monogan_data <- dataverse_contents(monogan) +# https://doi.org/10.7910/DVN/ARKOTI + d1 <- get_dataset("doi:10.7910/DVN/ARKOTI") f <- get_file(d1$files$datafile$id[3]) - -# check file metadata -m1 <- get_file_metadata("constructionData.tab", "doi:10.7910/DVN/ARKOTI") -m2 <- get_file_metadata(2437257) +f2 <- get_file(2692202) # retrieve file based on DOI and filename f2 <- get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI") -f2 <- get_file(2692202) # retrieve file based on "dataverse_file" object flist <- dataset_files(2692151) @@ -72,6 +85,8 @@ get_file(flist[[2]]) # retrieve all files in a dataset in their original format (returns a list of raw vectors) file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K")[['files']]$id f3 <- get_file(file_ids, format = "original") + + # read file as data.frame if (require("rio")) { tmp <- tempfile(fileext = ".dta") diff --git a/man/get_dataset.Rd b/man/get_dataset.Rd index 70175c5..c1bd0ca 100644 --- a/man/get_dataset.Rd +++ b/man/get_dataset.Rd @@ -36,11 +36,19 @@ dataset_files( \item{version}{A character string specifying a version of the dataset. This can be one of \dQuote{:draft} (the current draft), \dQuote{:latest} (the latest draft, if it exists, or the latest published version), \dQuote{:latest-published} (the latest published version, ignoring any draft), or \dQuote{x.y} (where \samp{x} is a major version and \samp{y} is a minor version; the \samp{.y} can be omitted to obtain a major version). In lieu of this, a dataset's version-specific identification number can be used for the \code{dataset} argument.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. 
Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} \item{block}{A character string specifying a metadata block to retrieve. By default this is \dQuote{citation}. Other values may be available, depending on the dataset, such as \dQuote{geospatial} or \dQuote{socialscience}.} } diff --git a/man/get_dataverse.Rd b/man/get_dataverse.Rd index 58a10db..fb10545 100644 --- a/man/get_dataverse.Rd +++ b/man/get_dataverse.Rd @@ -23,13 +23,21 @@ dataverse_contents( \arguments{ \item{dataverse}{A character string specifying a Dataverse name or an object of class \dQuote{dataverse}.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. 
Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{check}{A logical indicating whether to check that the value of \code{dataverse} is actually a numeric} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list of class \dQuote{dataverse}. diff --git a/man/get_facets.Rd b/man/get_facets.Rd index 4361c9a..ae0afdd 100644 --- a/man/get_facets.Rd +++ b/man/get_facets.Rd @@ -14,11 +14,19 @@ get_facets( \arguments{ \item{dataverse}{A character string specifying a Dataverse name or an object of class \dQuote{dataverse}.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. 
Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. diff --git a/man/get_user_key.Rd b/man/get_user_key.Rd index 7692cc2..5e6cd43 100644 --- a/man/get_user_key.Rd +++ b/man/get_user_key.Rd @@ -13,7 +13,9 @@ get_user_key(user, password, server = Sys.getenv("DATAVERSE_SERVER"), ...) \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. 
This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. diff --git a/man/initiate_sword_dataset.Rd b/man/initiate_sword_dataset.Rd index 86b927d..444b040 100644 --- a/man/initiate_sword_dataset.Rd +++ b/man/initiate_sword_dataset.Rd @@ -17,11 +17,19 @@ initiate_sword_dataset( \item{body}{A list containing one or more metadata fields. Field names must be valid Dublin Core Terms labels (see details, below). The \samp{title}, \samp{description}, and \samp{creator} fields are required.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ An object of class \dQuote{dataset_atom}. diff --git a/man/list_datasets.Rd b/man/list_datasets.Rd index 8bbe410..c88b576 100644 --- a/man/list_datasets.Rd +++ b/man/list_datasets.Rd @@ -14,11 +14,19 @@ list_datasets( \arguments{ \item{dataverse}{A Dataverse alias or ID number, or an object of class \dQuote{dataverse}, perhaps as returned by \code{\link{service_document}}.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. diff --git a/man/publish_dataset.Rd b/man/publish_dataset.Rd index 1a82044..52a10d8 100644 --- a/man/publish_dataset.Rd +++ b/man/publish_dataset.Rd @@ -17,11 +17,19 @@ publish_dataset( \item{minor}{A logical specifying whether the new release of the dataset is a \dQuote{minor} release (\code{TRUE}, by default), resulting in a minor version increase (e.g., from 1.1 to 1.2). If \code{FALSE}, the dataset is given a \dQuote{major} release (e.g., from 1.1 to 2.0).} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. diff --git a/man/publish_dataverse.Rd b/man/publish_dataverse.Rd index 45a0b01..fbac84d 100644 --- a/man/publish_dataverse.Rd +++ b/man/publish_dataverse.Rd @@ -14,11 +14,19 @@ publish_dataverse( \arguments{ \item{dataverse}{An object of class \dQuote{sword_collection}, as returned by \code{\link{service_document}}.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. diff --git a/man/publish_sword_dataset.Rd b/man/publish_sword_dataset.Rd index c65a026..246275d 100644 --- a/man/publish_sword_dataset.Rd +++ b/man/publish_sword_dataset.Rd @@ -14,11 +14,19 @@ publish_sword_dataset( \arguments{ \item{dataset}{A dataset DOI (or other persistent identifier), an object of class \dQuote{dataset_atom} or \dQuote{dataset_statement}, or an appropriate and complete SWORD URL.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list. diff --git a/man/service_document.Rd b/man/service_document.Rd index 45f4c29..2d91e59 100644 --- a/man/service_document.Rd +++ b/man/service_document.Rd @@ -11,11 +11,19 @@ service_document( ) } \arguments{ -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list of class \dQuote{sword_service_document}, possibly with one or more \dQuote{sword_collection} entries. The latter are SWORD representations of a Dataverse. These can be passed to other SWORD API functions, e.g., for creating a new dataset. diff --git a/man/set_dataverse_metadata.Rd b/man/set_dataverse_metadata.Rd index d08a693..2119aa7 100644 --- a/man/set_dataverse_metadata.Rd +++ b/man/set_dataverse_metadata.Rd @@ -20,11 +20,19 @@ set_dataverse_metadata( \item{root}{A logical.} -\item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard Dataverse. This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} -\item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} +\item{...}{Additional arguments passed to an HTTP request function, such as +\code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or +\code{\link[httr]{DELETE}}.} } \value{ A list From 905cbc72ed036f66d18663cce28c0e6de8f977e2 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 16:45:12 -0500 Subject: [PATCH 13/75] Cleanup get_fileid --- NAMESPACE | 1 - R/get_file.R | 17 ++--------------- man/files.Rd | 15 +++------------ 3 files changed, 5 insertions(+), 28 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4aae134..80b53cc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,4 +48,3 @@ import(httr) import(xml2) importFrom(stats,setNames) importFrom(utils,str) -importFrom(utils,unzip) diff --git a/R/get_file.R b/R/get_file.R index fa14e24..1cfb1c7 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -24,7 +24,7 @@ #' @return \code{get_file} returns a raw vector (or list of raw vectors, #' if \code{length(file) > 1}). 
#' -#' +#' @seealso \link{get_dataframe_by_name} #' @examples #' \dontrun{ #' # download file from: @@ -44,21 +44,8 @@ #' # retrieve all files in a dataset in their original format (returns a list of raw vectors) #' file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K")[['files']]$id #' f3 <- get_file(file_ids, format = "original") -#' -#' -#' # read file as data.frame -#' if (require("rio")) { -#' tmp <- tempfile(fileext = ".dta") -#' writeBin(f, tmp) -#' dat <- haven::read_dta(tmp) -#' -#' # check UNF match -#' # if (require("UNF")) { -#' # unf(dat) %unf% d1$files$datafile$UNF[3] -#' # } -#' } #' } -#' @importFrom utils unzip +#' #' @export get_file <- function(file, diff --git a/man/files.Rd b/man/files.Rd index 26c9b04..324124f 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -85,18 +85,9 @@ get_file(flist[[2]]) # retrieve all files in a dataset in their original format (returns a list of raw vectors) file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K")[['files']]$id f3 <- get_file(file_ids, format = "original") - - -# read file as data.frame -if (require("rio")) { - tmp <- tempfile(fileext = ".dta") - writeBin(f, tmp) - dat <- haven::read_dta(tmp) - - # check UNF match - # if (require("UNF")) { - # unf(dat) \%unf\% d1$files$datafile$UNF[3] - # } } + } +\seealso{ +\link{get_dataframe_by_name} } From 053c07287dadee3f5d14642cd3e1d1cb10785b8d Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 20:01:29 -0500 Subject: [PATCH 14/75] Make as many examples work --- NAMESPACE | 3 + R/get_file.R | 57 ++++++++++------- R/get_file_as_dataframe.R | 64 +++++++++++++++++-- R/get_file_by_id.R | 36 +++++++---- man/files.Rd | 53 +++++++++------ ..._dataframe_by_name.Rd => get_dataframe.Rd} | 40 ++++++++++-- man/get_dataframe_internal.Rd | 12 ++++ 7 files changed, 200 insertions(+), 65 deletions(-) rename man/{get_dataframe_by_name.Rd => get_dataframe.Rd} (60%) create mode 100644 man/get_dataframe_internal.Rd diff --git a/NAMESPACE b/NAMESPACE index 80b53cc..668dc34 
100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,4 +47,7 @@ export(update_dataset_file) import(httr) import(xml2) importFrom(stats,setNames) +importFrom(stringr,str_extract) importFrom(utils,str) +importFrom(xml2,as_list) +importFrom(xml2,read_xml) diff --git a/R/get_file.R b/R/get_file.R index 1cfb1c7..e3c0e93 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -11,10 +11,12 @@ #' specifying a file name within the DOI-identified dataset; or an object of #' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. #' @param fileid A numeric ID internally used for `get_file_by_id` -#' @param format A character string specifying a file format. For \code{get_file}: -#' by default, this is \dQuote{original} (the original file format). If \dQuote{RData} -#' or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a -#' compressed directory containing a bundle of file formats is returned. +#' @param format A character string specifying a file format for download. +#' by default, this is \dQuote{original} (the original file format). If `NULL`, +#' no query is added, so ingested files are returned in their ingested TSV form. +#' If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. +#' If \dQuote{bundle}, a compressed directory containing a bundle of file formats +#' is returned. #' @param vars A character vector specifying one or more variable names, used to #' extract a subset of the data. #' @@ -22,28 +24,36 @@ #' @template dots #' #' @return \code{get_file} returns a raw vector (or list of raw vectors, -#' if \code{length(file) > 1}). +#' if \code{length(file) > 1}). To load as a dataframe, see \link{get_dataframe_by_name}. 
#' -#' @seealso \link{get_dataframe_by_name} +#' @seealso To load the objects as datasets \link{get_dataframe_by_name} #' @examples #' \dontrun{ #' # download file from: #' # https://doi.org/10.7910/DVN/ARKOTI #' -#' d1 <- get_dataset("doi:10.7910/DVN/ARKOTI") -#' f <- get_file(d1$files$datafile$id[3]) -#' f2 <- get_file(2692202) +#' # 1. Two-steps: Find ID from get_dataset +#' d1 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") +#' f1 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") #' -#' # retrieve file based on DOI and filename -#' f2 <- get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI") +#' # 2. Using filename and dataverse +#' f2 <- get_file("constructionData.tab", +#' "doi:10.7910/DVN/ARKOTI", +#' server = "dataverse.harvard.edu") #' -#' # retrieve file based on "dataverse_file" object -#' flist <- dataset_files(2692151) -#' get_file(flist[[2]]) +#' # 3. Based on "dataverse_file" object +#' flist <- dataset_files(2692151, server = "dataverse.harvard.edu") +#' f3 <- get_file(flist[[2]], server = "dataverse.harvard.edu") +#' +#' # 4. Retrieve bundle of raw data in list +#' file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K", +#' server = "dataverse.harvard.edu")$files$id +#' ## doesn't work yet +#' f4 <- get_file(file_ids, +#' format = "original", +#' server = "dataverse.harvard.edu") +#' length(f4) #' -#' # retrieve all files in a dataset in their original format (returns a list of raw vectors) -#' file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K")[['files']]$id -#' f3 <- get_file(file_ids, format = "original") #' } #' #' @export @@ -51,7 +61,6 @@ get_file <- function(file, dataset = NULL, format = c("original", "RData", "prep", "bundle"), - # thumb = TRUE, vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), @@ -64,12 +73,13 @@ get_file <- fileid <- file # get file ID from 'dataset'. 
Streamline in feature relying on get_fileid - if (!is.numeric(file) & is.null(dataset)) - stop("When 'file' is a character (non-global ID), dataset must be specified.") if (!is.numeric(file) & inherits(file, "dataverse_file")) - fileid <- get_fileid(file, key = key, server = server) + fileid <- get_fileid.dataverse_file(file, key = key, server = server) + + if (!is.numeric(file) & !inherits(file, "dataverse_file") & is.null(dataset)) + stop("When 'file' is a character (non-global ID), dataset must be specified.") if (!is.numeric(file) & !inherits(file, "dataverse_file")) - fileid <- get_fileid(dataset, file, key = key, server = server, ...) + fileid <- get_fileid.character(dataset, file, key = key, server = server, ...) # Main function. Call get_file_by_id @@ -82,7 +92,8 @@ get_file <- format = format, vars = vars, key = key, - server = server + server = server, + ... ) } diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 5a48815..29b9e90 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -1,5 +1,9 @@ #' Get file from dataverse and convert it into a dataframe or tibble #' +#' `get_dataframe_by_id`, if you know the numeric ID of the dataset, or instead +#' `get_dataframe_by_name` if you know the filename and doi. 
The dataset +#' +#' @rdname get_dataframe #' #' @param file to be passed on to get_file #' @param dataset to be passed on to get_file @@ -14,7 +18,29 @@ #' file = "gapminder-FiveYearData.tab", #' dataset = "doi:10.7910/DVN/GJQNEQ", #' server = "dataverse.harvard.edu", -#' read_function = readr::read_tsv) +#' read_function = readr::read_csv) +#' +#' # equivalently, if you know the ID +#' gap_df <- get_dataframe_by_id( +#' 3037713, +#' server = "dataverse.harvard.edu", +#' read_function = readr::read_csv) +#' +#' # equivalently, using a dataverse object +#' gap_ds <- dataset_files("doi:10.7910/DVN/GJQNEQ", +#' server = "dataverse.harvard.edu") +#' gap_df <- get_dataframe_by_id( +#' gap_ds[[2]], +#' server = "dataverse.harvard.edu", +#' read_function = readr::read_csv +#' ) +#' +#' # to use the ingested version (and read as TSV) +#' gap_df <- get_dataframe_by_id( +#' 3037713, +#' server = "dataverse.harvard.edu", +#' use_ingested = TRUE, +#' read_function = readr::read_tsv) #' #' @export get_dataframe_by_name <- function(file, @@ -22,16 +48,42 @@ get_dataframe_by_name <- function(file, read_function = NULL, ...) { - raw_file <- get_file(file = file, dataset = dataset, ...) + # retrieve ID + fileid <- get_fileid.character(x = dataset, + file = file, + ...) + + get_dataframe_by_id(fileid, read_function, ...) + +} + + +#' @rdname get_dataframe +get_dataframe_by_id <- function(file, + read_function = NULL, + ...) { + + raw <- get_file(file = file, ...) 
# default of get_file if (is.null(read_function)) - return(raw_file) + return(raw) # save to temp and then read it in with supplied function if (!is.null(read_function)) { - tmp <- tempfile(file, fileext = stringr::str_extract(file, "\\.[A-z]+$")) - writeBin(raw_file, tmp) - return(do.call(read_function, list(tmp))) + get_dataframe_internal(raw, filename = "foo", .f = read_function) } } + + +#' Write to temp and apply function +#' +#' @importFrom stringr str_extract +#' +#' @keywords internal +get_dataframe_internal <- function(raw, filename, .f) { + tmp <- tempfile(filename) + writeBin(raw, tmp) + + do.call(.f, list(tmp)) +} diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index f96004e..59bd653 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -2,28 +2,41 @@ #' #' @rdname files #' +#' @param use_ingested If a ingested (.tab) version is available, download +#' the ingested version or not? If `format = "original"`, this is forced +#' to `FALSE` +#' +#' @importFrom xml2 read_xml as_list +#' #' @export get_file_by_id <- function(fileid, dataset = NULL, + server, format = c("original", "RData", "prep", "bundle"), - # thumb = TRUE, - vars = NULL, + vars, + use_ingested = NULL, key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), ...) 
{ format <- match.arg(format) # single file ID - stopifnot (is.numeric(fileid)) - stopifnot (length(fileid) == 1) + stopifnot(is.numeric(fileid)) + stopifnot(length(fileid) == 1) + + # detect file type to determine if something is ingested + xml <- read_xml(get_file_metadata(fileid, server = server)) + filename <- as_list(xml)$codeBook$fileDscr$fileTxt$fileName[[1]] + is_ingested <- grepl(x = filename, pattern = "\\.tab$") + # update use_ingested if not specified + if (is_ingested & is.null(use_ingested)) + use_ingested <- FALSE # downloading files sequentially and add the raw vectors to a list out <- vector("list", length(fileid)) # create query ----- - u <- paste0(api_url(server), "access/datafile/", fileid) query <- list() if (!is.null(vars)) { query$vars <- paste0(vars, collapse = ",") @@ -33,13 +46,14 @@ get_file_by_id <- } # request single file in non-bundle format ---- - # add query if ingesting a tab (detect from original file name) - # if (length(query) == 1 & grepl("\\.tab$", file[i])) { - # r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) - # } else { + u <- paste0(api_url(server), "access/datafile/", fileid) + # add query if you want to want the original version even though ingested + if (is_ingested & !use_ingested) { + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) + } else { # do not add query if not an ingestion file r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - # } + } httr::stop_for_status(r) out <- httr::content(r, as = "raw") diff --git a/man/files.Rd b/man/files.Rd index 324124f..4c59579 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -18,10 +18,11 @@ get_file( get_file_by_id( fileid, dataset = NULL, + server, format = c("original", "RData", "prep", "bundle"), - vars = NULL, + vars, + use_ingested = NULL, key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), ... 
) } @@ -31,10 +32,12 @@ specifying file identifiers; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} -\item{format}{A character string specifying a file format. For \code{get_file}: -by default, this is \dQuote{original} (the original file format). If \dQuote{RData} -or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a -compressed directory containing a bundle of file formats is returned.} +\item{format}{A character string specifying a file format for download. +by default, this is \dQuote{original} (the original file format). If `NULL`, +no query is added, so ingested files are returned in their ingested TSV form. +If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. +If \dQuote{bundle}, a compressed directory containing a bundle of file formats +is returned.} \item{vars}{A character vector specifying one or more variable names, used to extract a subset of the data.} @@ -54,10 +57,14 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \code{\link[httr]{DELETE}}.} \item{fileid}{A numeric ID internally used for `get_file_by_id`} + +\item{use_ingested}{If a ingested (.tab) version is available, download +the ingested version or not? If `format = "original"`, this is forced +to `FALSE`} } \value{ \code{get_file} returns a raw vector (or list of raw vectors, - if \code{length(file) > 1}). + if \code{length(file) > 1}). To load as a dataframe, see \link{get_dataframe_by_name}. } \description{ Download Dataverse File(s). `get_file` internally calls @@ -71,23 +78,31 @@ This function provides access to data files from a Dataverse entry. # download file from: # https://doi.org/10.7910/DVN/ARKOTI -d1 <- get_dataset("doi:10.7910/DVN/ARKOTI") -f <- get_file(d1$files$datafile$id[3]) -f2 <- get_file(2692202) +# 1. 
Two-steps: Find ID from get_dataset +d1 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") +f1 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") + +# 2. Using filename and dataverse +f2 <- get_file("constructionData.tab", + "doi:10.7910/DVN/ARKOTI", + server = "dataverse.harvard.edu") -# retrieve file based on DOI and filename -f2 <- get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI") +# 3. Based on "dataverse_file" object +flist <- dataset_files(2692151, server = "dataverse.harvard.edu") +f3 <- get_file(flist[[2]], server = "dataverse.harvard.edu") -# retrieve file based on "dataverse_file" object -flist <- dataset_files(2692151) -get_file(flist[[2]]) +# 4. Retrieve bundle of raw data in list +file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K", + server = "dataverse.harvard.edu")$files$id +## doesn't work yet +f4 <- get_file(file_ids, + format = "original", + server = "dataverse.harvard.edu") +length(f4) -# retrieve all files in a dataset in their original format (returns a list of raw vectors) -file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K")[['files']]$id -f3 <- get_file(file_ids, format = "original") } } \seealso{ -\link{get_dataframe_by_name} +To load the objects as datasets \link{get_dataframe_by_name} } diff --git a/man/get_dataframe_by_name.Rd b/man/get_dataframe.Rd similarity index 60% rename from man/get_dataframe_by_name.Rd rename to man/get_dataframe.Rd index 6570c44..df312a0 100644 --- a/man/get_dataframe_by_name.Rd +++ b/man/get_dataframe.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/get_file_as_dataframe.R \name{get_dataframe_by_name} \alias{get_dataframe_by_name} +\alias{get_dataframe_by_id} \title{Get file from dataverse and convert it into a dataframe or tibble} \usage{ get_dataframe_by_name(file, dataset = NULL, read_function = NULL, ...) + +get_dataframe_by_id(file, read_function = NULL, ...) 
} \arguments{ \item{file}{to be passed on to get_file} @@ -19,10 +22,12 @@ environment} \item{...}{ Arguments passed on to \code{\link[=get_file]{get_file}} \describe{ - \item{\code{format}}{A character string specifying a file format. For \code{get_file}: -by default, this is \dQuote{original} (the original file format). If \dQuote{RData} -or \dQuote{prep} is used, an alternative is returned. If \dQuote{bundle}, a -compressed directory containing a bundle of file formats is returned.} + \item{\code{format}}{A character string specifying a file format for download. +by default, this is \dQuote{original} (the original file format). If `NULL`, +no query is added, so ingested files are returned in their ingested TSV form. +If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. +If \dQuote{bundle}, a compressed directory containing a bundle of file formats +is returned.} \item{\code{vars}}{A character vector specifying one or more variable names, used to extract a subset of the data.} \item{\code{key}}{A character string specifying a Dataverse server API key. If one @@ -36,13 +41,36 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") }} } \description{ -Get file from dataverse and convert it into a dataframe or tibble +`get_dataframe_by_id`, if you know the numeric ID of the dataset, or instead +`get_dataframe_by_name` if you know the filename and doi. 
The dataset } \examples{ gap_df <- get_dataframe_by_name( file = "gapminder-FiveYearData.tab", dataset = "doi:10.7910/DVN/GJQNEQ", server = "dataverse.harvard.edu", - read_function = readr::read_tsv) + read_function = readr::read_csv) + +# equivalently, if you know the ID +gap_df <- get_dataframe_by_id( + 3037713, + server = "dataverse.harvard.edu", + read_function = readr::read_csv) + +# equivalently, using a dataverse object +gap_ds <- dataset_files("doi:10.7910/DVN/GJQNEQ", + server = "dataverse.harvard.edu") +gap_df <- get_dataframe_by_id( + gap_ds[[2]], + server = "dataverse.harvard.edu", + read_function = readr::read_csv +) + +# to use the ingested version (and read as TSV) +gap_df <- get_dataframe_by_id( + 3037713, + server = "dataverse.harvard.edu", + use_ingested = TRUE, + read_function = readr::read_tsv) } diff --git a/man/get_dataframe_internal.Rd b/man/get_dataframe_internal.Rd new file mode 100644 index 0000000..1f46382 --- /dev/null +++ b/man/get_dataframe_internal.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_file_as_dataframe.R +\name{get_dataframe_internal} +\alias{get_dataframe_internal} +\title{Write to temp and apply function} +\usage{ +get_dataframe_internal(raw, filename, .f) +} +\description{ +Write to temp and apply function +} +\keyword{internal} From 1139753cbba2546aa99d24563e8e9e06cddbab18 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sat, 26 Dec 2020 20:12:27 -0500 Subject: [PATCH 15/75] export _by_id --- NAMESPACE | 1 + R/get_file_as_dataframe.R | 2 ++ man/get_dataframe.Rd | 1 + 3 files changed, 4 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 668dc34..f7301f9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ export(delete_dataset) export(delete_dataverse) export(delete_file) export(delete_sword_dataset) +export(get_dataframe_by_id) export(get_dataframe_by_name) export(get_dataset) export(get_dataverse) diff --git a/R/get_file_as_dataframe.R 
b/R/get_file_as_dataframe.R index 29b9e90..37540fe 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -14,6 +14,7 @@ #' @inheritDotParams get_file #' #' @examples +#' # load dataset from file name and dataverse DOI #' gap_df <- get_dataframe_by_name( #' file = "gapminder-FiveYearData.tab", #' dataset = "doi:10.7910/DVN/GJQNEQ", @@ -59,6 +60,7 @@ get_dataframe_by_name <- function(file, #' @rdname get_dataframe +#' @export get_dataframe_by_id <- function(file, read_function = NULL, ...) { diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index df312a0..52a8d56 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -45,6 +45,7 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") `get_dataframe_by_name` if you know the filename and doi. The dataset } \examples{ +# load dataset from file name and dataverse DOI gap_df <- get_dataframe_by_name( file = "gapminder-FiveYearData.tab", dataset = "doi:10.7910/DVN/GJQNEQ", From eee24efaa2d17b44f3e77af2270dca80ff46a1a6 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sun, 27 Dec 2020 18:23:05 -0500 Subject: [PATCH 16/75] Change argument use_ingested to `archival`, which seems more intuitive. Set archival = FALSE in get_dataframe_*. --- R/get_file.R | 12 +++++------- R/get_file_as_dataframe.R | 13 ++++++++++--- R/get_file_by_id.R | 30 ++++++++++++++++++------------ man/files.Rd | 18 +++++++----------- man/get_dataframe.Rd | 22 ++++++++++++++++------ 5 files changed, 56 insertions(+), 39 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index e3c0e93..de51149 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -14,9 +14,7 @@ #' @param format A character string specifying a file format for download. #' by default, this is \dQuote{original} (the original file format). If `NULL`, #' no query is added, so ingested files are returned in their ingested TSV form. -#' If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. 
-#' If \dQuote{bundle}, a compressed directory containing a bundle of file formats -#' is returned. +#' For other formats, see <https://guides.dataverse.org/en/latest/api/dataaccess.html>. #' @param vars A character vector specifying one or more variable names, used to #' extract a subset of the data. #' @@ -24,9 +22,11 @@ #' @template dots #' #' @return \code{get_file} returns a raw vector (or list of raw vectors, -#' if \code{length(file) > 1}). To load as a dataframe, see \link{get_dataframe_by_name}. +#' if \code{length(file) > 1}). To load as a dataframe, see +#' \link{get_dataframe_by_name}. +#' +#' @seealso To load the objects as datasets \link{get_dataframe_by_name}. #' -#' @seealso To load the objects as datasets \link{get_dataframe_by_name} #' @examples #' \dontrun{ #' # download file from: @@ -48,9 +48,7 @@ #' # 4. Retrieve bundle of raw data in list #' file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K", #' server = "dataverse.harvard.edu")$files$id -#' ## doesn't work yet #' f4 <- get_file(file_ids, -#' format = "original", #' server = "dataverse.harvard.edu") #' length(f4) #' diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 37540fe..36881ab 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -11,6 +11,11 @@ #' raw file to a tempfile and read it back in with the supplied function. This #' is useful when you want to start working with the data right away in the R #' environment +#' @param archival Whether to read from the ingested, archival version of the +#' dataset, or whether to read the original. The archival versions are tab-delimited +#' `.tab` files. If functions to read the original version are available without +#' loss of information, then `archival = FALSE` is better. If such functions +#' are not available or the original format is unknown, use `archival = TRUE`. 
#' @inheritDotParams get_file #' #' @examples @@ -40,13 +45,14 @@ #' gap_df <- get_dataframe_by_id( #' 3037713, #' server = "dataverse.harvard.edu", -#' use_ingested = TRUE, +#' archival = TRUE, #' read_function = readr::read_tsv) #' #' @export get_dataframe_by_name <- function(file, dataset = NULL, read_function = NULL, + archival = FALSE, ...) { # retrieve ID @@ -54,7 +60,7 @@ get_dataframe_by_name <- function(file, file = file, ...) - get_dataframe_by_id(fileid, read_function, ...) + get_dataframe_by_id(fileid, read_function, archival = archival, ...) } @@ -63,9 +69,10 @@ get_dataframe_by_name <- function(file, #' @export get_dataframe_by_id <- function(file, read_function = NULL, + archival = FALSE, ...) { - raw <- get_file(file = file, ...) + raw <- get_file(file = file, archival = archival, ...) # default of get_file if (is.null(read_function)) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 59bd653..b446c3d 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -2,9 +2,8 @@ #' #' @rdname files #' -#' @param use_ingested If a ingested (.tab) version is available, download -#' the ingested version or not? If `format = "original"`, this is forced -#' to `FALSE` +#' @param archival If a ingested (.tab) version is available, download +#' the ingested archival version or not? #' #' @importFrom xml2 read_xml as_list #' @@ -15,7 +14,7 @@ get_file_by_id <- server, format = c("original", "RData", "prep", "bundle"), vars, - use_ingested = NULL, + archival = NULL, key = Sys.getenv("DATAVERSE_KEY"), ...) 
{ format <- match.arg(format) @@ -25,13 +24,20 @@ get_file_by_id <- stopifnot(length(fileid) == 1) # detect file type to determine if something is ingested - xml <- read_xml(get_file_metadata(fileid, server = server)) - filename <- as_list(xml)$codeBook$fileDscr$fileTxt$fileName[[1]] - is_ingested <- grepl(x = filename, pattern = "\\.tab$") + if (!is.null(archival)) { + xml <- read_xml(get_file_metadata(fileid, server = server)) + filename <- as_list(xml)$codeBook$fileDscr$fileTxt$fileName[[1]] + is_ingested <- grepl(x = filename, pattern = "\\.tab$") - # update use_ingested if not specified - if (is_ingested & is.null(use_ingested)) - use_ingested <- FALSE + if (archival & !is_ingested) + stop("The file does not have a .tab suffix so does not appear ingested.") + } else { + is_ingested <- FALSE + } + + # update archival if not specified + if (is.null(archival)) + archival <- FALSE # downloading files sequentially and add the raw vectors to a list out <- vector("list", length(fileid)) @@ -48,7 +54,7 @@ get_file_by_id <- # request single file in non-bundle format ---- u <- paste0(api_url(server), "access/datafile/", fileid) # add query if you want to want the original version even though ingested - if (is_ingested & !use_ingested) { + if (is_ingested & !archival) { r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) } else { # do not add query if not an ingestion file @@ -58,6 +64,6 @@ get_file_by_id <- httr::stop_for_status(r) out <- httr::content(r, as = "raw") - return (out) + return(out) } diff --git a/man/files.Rd b/man/files.Rd index 4c59579..83b5c2b 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -21,7 +21,7 @@ get_file_by_id( server, format = c("original", "RData", "prep", "bundle"), vars, - use_ingested = NULL, + archival = NULL, key = Sys.getenv("DATAVERSE_KEY"), ... 
) @@ -35,9 +35,7 @@ specifying a file name within the DOI-identified dataset; or an object of \item{format}{A character string specifying a file format for download. by default, this is \dQuote{original} (the original file format). If `NULL`, no query is added, so ingested files are returned in their ingested TSV form. -If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. -If \dQuote{bundle}, a compressed directory containing a bundle of file formats -is returned.} +For other formats, see .} \item{vars}{A character vector specifying one or more variable names, used to extract a subset of the data.} @@ -58,13 +56,13 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \item{fileid}{A numeric ID internally used for `get_file_by_id`} -\item{use_ingested}{If a ingested (.tab) version is available, download -the ingested version or not? If `format = "original"`, this is forced -to `FALSE`} +\item{archival}{If a ingested (.tab) version is available, download +the ingested archival version or not?} } \value{ \code{get_file} returns a raw vector (or list of raw vectors, - if \code{length(file) > 1}). To load as a dataframe, see \link{get_dataframe_by_name}. + if \code{length(file) > 1}). To load as a dataframe, see + \link{get_dataframe_by_name}. } \description{ Download Dataverse File(s). `get_file` internally calls @@ -94,9 +92,7 @@ f3 <- get_file(flist[[2]], server = "dataverse.harvard.edu") # 4. Retrieve bundle of raw data in list file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K", server = "dataverse.harvard.edu")$files$id -## doesn't work yet f4 <- get_file(file_ids, - format = "original", server = "dataverse.harvard.edu") length(f4) @@ -104,5 +100,5 @@ length(f4) } \seealso{ -To load the objects as datasets \link{get_dataframe_by_name} +To load the objects as datasets \link{get_dataframe_by_name}. 
} diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 52a8d56..de8fc97 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -5,9 +5,15 @@ \alias{get_dataframe_by_id} \title{Get file from dataverse and convert it into a dataframe or tibble} \usage{ -get_dataframe_by_name(file, dataset = NULL, read_function = NULL, ...) +get_dataframe_by_name( + file, + dataset = NULL, + read_function = NULL, + archival = FALSE, + ... +) -get_dataframe_by_id(file, read_function = NULL, ...) +get_dataframe_by_id(file, read_function = NULL, archival = FALSE, ...) } \arguments{ \item{file}{to be passed on to get_file} @@ -19,15 +25,19 @@ raw file to a tempfile and read it back in with the supplied function. This is useful when you want to start working with the data right away in the R environment} +\item{archival}{Whether to read from the ingested, archival version of the +dataset, or whether to read the original. The archival versions are tab-delimited +`.tab` files. If functions to read the original version is available without +loss of information, then `archival = FALSE` is better. If such functions +are not available or the original format is unknown, use `archival = TRUE`.} + \item{...}{ Arguments passed on to \code{\link[=get_file]{get_file}} \describe{ \item{\code{format}}{A character string specifying a file format for download. by default, this is \dQuote{original} (the original file format). If `NULL`, no query is added, so ingested files are returned in their ingested TSV form. -If \dQuote{RData} or \dQuote{prep} is used, an alternative is returned. -If \dQuote{bundle}, a compressed directory containing a bundle of file formats -is returned.} +For other formats, see .} \item{\code{vars}}{A character vector specifying one or more variable names, used to extract a subset of the data.} \item{\code{key}}{A character string specifying a Dataverse server API key. 
If one @@ -71,7 +81,7 @@ gap_df <- get_dataframe_by_id( gap_df <- get_dataframe_by_id( 3037713, server = "dataverse.harvard.edu", - use_ingested = TRUE, + archival = TRUE, read_function = readr::read_tsv) } From 1710dab13482c5b12b0908b0065864c2254ec0b9 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sun, 27 Dec 2020 21:20:12 -0500 Subject: [PATCH 17/75] Better names --- R/get_file.R | 8 ++++---- man/files.Rd | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index de51149..f823954 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -42,13 +42,13 @@ #' server = "dataverse.harvard.edu") #' #' # 3. Based on "dataverse_file" object -#' flist <- dataset_files(2692151, server = "dataverse.harvard.edu") -#' f3 <- get_file(flist[[2]], server = "dataverse.harvard.edu") +#' f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") +#' f3 <- get_file(f3_dvf[[2]], server = "dataverse.harvard.edu") #' #' # 4. Retrieve bundle of raw data in list -#' file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K", +#' f4_meta <- get_dataset("doi:10.7910/DVN/CXOB4K", #' server = "dataverse.harvard.edu")$files$id -#' f4 <- get_file(file_ids, +#' f4 <- get_file(f4_meta, #' server = "dataverse.harvard.edu") #' length(f4) #' diff --git a/man/files.Rd b/man/files.Rd index 83b5c2b..f8db37c 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -86,13 +86,13 @@ f2 <- get_file("constructionData.tab", server = "dataverse.harvard.edu") # 3. Based on "dataverse_file" object -flist <- dataset_files(2692151, server = "dataverse.harvard.edu") -f3 <- get_file(flist[[2]], server = "dataverse.harvard.edu") +f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") +f3 <- get_file(f3_dvf[[2]], server = "dataverse.harvard.edu") # 4. 
Retrieve bundle of raw data in list -file_ids <- get_dataset("doi:10.7910/DVN/CXOB4K", +f4_meta <- get_dataset("doi:10.7910/DVN/CXOB4K", server = "dataverse.harvard.edu")$files$id -f4 <- get_file(file_ids, +f4 <- get_file(f4_meta, server = "dataverse.harvard.edu") length(f4) From c0158292f961b38c7538e880794eb95641ad7a90 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sun, 27 Dec 2020 21:53:16 -0500 Subject: [PATCH 18/75] Make `format = "bundle"` work --- R/get_file_by_id.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index b446c3d..7b5a1ee 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -51,7 +51,15 @@ get_file_by_id <- query$format <- match.arg(format) } - # request single file in non-bundle format ---- + # if bundle, use custom url ---- + if (format == "bundle") { + u <- paste0(api_url(server), "access/datafile/bundle/", fileid) + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) + out <- httr::content(r, as = "raw") + return(out) + } + + # If not bundle, request single file in non-bundle format ---- u <- paste0(api_url(server), "access/datafile/", fileid) # add query if you want to want the original version even though ingested if (is_ingested & !archival) { @@ -62,8 +70,6 @@ get_file_by_id <- } httr::stop_for_status(r) - out <- httr::content(r, as = "raw") - + out <- httr::content(r, as = "raw") return(out) - } From f3b0da9704849117802b3dba4294a7f69da5ad47 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sun, 27 Dec 2020 21:57:21 -0500 Subject: [PATCH 19/75] Add documentation for writing binary files. 
Also, remove the RData and prep for now --- R/get_file.R | 30 ++++++++++++++++++------------ man/files.Rd | 23 +++++++++++++++-------- man/get_dataframe.Rd | 4 +++- 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index f823954..1aa0bf1 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -14,7 +14,9 @@ #' @param format A character string specifying a file format for download. #' by default, this is \dQuote{original} (the original file format). If `NULL`, #' no query is added, so ingested files are returned in their ingested TSV form. -#' For other formats, see . +#' For tabular datasets, the option \dQuote{bundle} downloads the bundle +#' of the original and archival versions, as well as the documentation. +#' See for details. #' @param vars A character vector specifying one or more variable names, used to #' extract a subset of the data. #' @@ -22,15 +24,14 @@ #' @template dots #' #' @return \code{get_file} returns a raw vector (or list of raw vectors, -#' if \code{length(file) > 1}). To load as a dataframe, see +#' if \code{length(file) > 1}), which can be saved locally with the `writeBin` +#' function. To load datasets into the R environment dataframe, see #' \link{get_dataframe_by_name}. #' #' @seealso To load the objects as datasets \link{get_dataframe_by_name}. #' #' @examples #' \dontrun{ -#' # download file from: -#' # https://doi.org/10.7910/DVN/ARKOTI #' #' # 1. Two-steps: Find ID from get_dataset #' d1 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") @@ -45,20 +46,26 @@ #' f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") #' f3 <- get_file(f3_dvf[[2]], server = "dataverse.harvard.edu") #' -#' # 4. Retrieve bundle of raw data in list -#' f4_meta <- get_dataset("doi:10.7910/DVN/CXOB4K", +#' # 4. 
Retrieve multiple raw data in list +#' f4_vec <- get_dataset("doi:10.7910/DVN/CXOB4K", #' server = "dataverse.harvard.edu")$files$id -#' f4 <- get_file(f4_meta, +#' f4 <- get_file(f4_vec, #' server = "dataverse.harvard.edu") #' length(f4) #' +#' # Write binary files. +#' # The appropriate file extension needs to be assigned by the user. +#' writeBin(f2, "constructionData.tab") +#' writeBin(f4, "dataverse_download.zip") +#' writeBin(f4[[1]], "Appendices.docx") +#' #' } #' #' @export get_file <- function(file, dataset = NULL, - format = c("original", "RData", "prep", "bundle"), + format = c("original", "bundle"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), @@ -97,11 +104,10 @@ get_file <- # return the raw vector if there's a single file if (length(out) == 1) { - return (out[[1]]) - } - else { + return(out[[1]]) + } else { # return a list of raw vectors otherwise - return (out) + return(out) } } diff --git a/man/files.Rd b/man/files.Rd index f8db37c..c995780 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -8,7 +8,7 @@ get_file( file, dataset = NULL, - format = c("original", "RData", "prep", "bundle"), + format = c("original", "bundle"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), @@ -35,7 +35,9 @@ specifying a file name within the DOI-identified dataset; or an object of \item{format}{A character string specifying a file format for download. by default, this is \dQuote{original} (the original file format). If `NULL`, no query is added, so ingested files are returned in their ingested TSV form. -For other formats, see .} +For tabular datasets, the option \dQuote{bundle} downloads the bundle +of the original and archival versions, as well as the documentation. 
+See for details.} \item{vars}{A character vector specifying one or more variable names, used to extract a subset of the data.} @@ -61,7 +63,8 @@ the ingested archival version or not?} } \value{ \code{get_file} returns a raw vector (or list of raw vectors, - if \code{length(file) > 1}). To load as a dataframe, see + if \code{length(file) > 1}), which can be saved locally with the `writeBin` + function. To load datasets into the R environment dataframe, see \link{get_dataframe_by_name}. } \description{ @@ -73,8 +76,6 @@ This function provides access to data files from a Dataverse entry. } \examples{ \dontrun{ -# download file from: -# https://doi.org/10.7910/DVN/ARKOTI # 1. Two-steps: Find ID from get_dataset d1 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") @@ -89,13 +90,19 @@ f2 <- get_file("constructionData.tab", f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") f3 <- get_file(f3_dvf[[2]], server = "dataverse.harvard.edu") -# 4. Retrieve bundle of raw data in list -f4_meta <- get_dataset("doi:10.7910/DVN/CXOB4K", +# 4. Retrieve multiple raw data in list +f4_vec <- get_dataset("doi:10.7910/DVN/CXOB4K", server = "dataverse.harvard.edu")$files$id -f4 <- get_file(f4_meta, +f4 <- get_file(f4_vec, server = "dataverse.harvard.edu") length(f4) +# Write binary files. +# The appropriate file extension needs to be assigned by the user. +writeBin(f2, "constructionData.tab") +writeBin(f4, "dataverse_download.zip") +writeBin(f4[[1]], "Appendices.docx") + } } diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index de8fc97..21ea96f 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -37,7 +37,9 @@ are not available or the original format is unknown, use `archival = TRUE`.} \item{\code{format}}{A character string specifying a file format for download. by default, this is \dQuote{original} (the original file format). If `NULL`, no query is added, so ingested files are returned in their ingested TSV form. 
-For other formats, see .} +For tabular datasets, the option \dQuote{bundle} downloads the bundle +of the original and archival versions, as well as the documentation. +See for details.} \item{\code{vars}}{A character vector specifying one or more variable names, used to extract a subset of the data.} \item{\code{key}}{A character string specifying a Dataverse server API key. If one From a6c08cefd6d4d8bc20711ad2c3ee4f0e58e4e6a5 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sun, 27 Dec 2020 22:41:30 -0500 Subject: [PATCH 20/75] First add for get_file_by_name --- NAMESPACE | 1 + R/get_file.R | 70 +++++++++++++++++++++++++++++++++++++--------- R/get_file_by_id.R | 3 +- man/files.Rd | 55 +++++++++++++++++++++++++----------- 4 files changed, 97 insertions(+), 32 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f7301f9..778fef3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,7 @@ export(get_dataverse) export(get_facets) export(get_file) export(get_file_by_id) +export(get_file_by_name) export(get_file_metadata) export(get_user_key) export(initiate_sword_dataset) diff --git a/R/get_file.R b/R/get_file.R index 1aa0bf1..1d18641 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -2,15 +2,22 @@ #' #' #' @title Download File(s) -#' @description Download Dataverse File(s). `get_file` internally calls -#' `get_file_by_id`. +#' +#' @description Download Dataverse File(s). `get_file` is a general wrapper, +#' and can take either dataverse objects, file IDs, or a filename and dataverse. +#' `get_file_by_name` is a shorthand for running `get_file` by +#' specifying a file name (`filename`) and dataverse DOI (`dataset`). +#' Internally, all functions download each file by `get_file_by_id`. `get_file_*` +#' functions return a raw binary file, which cannot be readily analyzed in R. +#' To download dataframes, see the `get_dataset_*` functions at \link{get_dataset} +#' +#' #' #' @details This function provides access to data files from a Dataverse entry. 
#' @param file An integer specifying a file identifier; or a vector of integers #' specifying file identifiers; or, if \code{doi} is specified, a character string #' specifying a file name within the DOI-identified dataset; or an object of #' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. -#' @param fileid A numeric ID internally used for `get_file_by_id` #' @param format A character string specifying a file format for download. #' by default, this is \dQuote{original} (the original file format). If `NULL`, #' no query is added, so ingested files are returned in their ingested TSV form. @@ -18,7 +25,7 @@ #' of the original and archival versions, as well as the documentation. #' See for details. #' @param vars A character vector specifying one or more variable names, used to -#' extract a subset of the data. +#' extract a subset of the data. #' #' @template envvars #' @template dots @@ -33,14 +40,15 @@ #' @examples #' \dontrun{ #' -#' # 1. Two-steps: Find ID from get_dataset -#' d1 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") -#' f1 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") +#' # 1. Using filename and dataverse +#' f1 <- get_file_by_name("constructionData.tab", +#' dataset = "doi:10.7910/DVN/ARKOTI", +#' server = "dataverse.harvard.edu") +#' +#' # 2. Two-steps: Find ID from get_dataset +#' d2 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") +#' f2 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") #' -#' # 2. Using filename and dataverse -#' f2 <- get_file("constructionData.tab", -#' "doi:10.7910/DVN/ARKOTI", -#' server = "dataverse.harvard.edu") #' #' # 3. Based on "dataverse_file" object #' f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") @@ -55,7 +63,7 @@ #' #' # Write binary files. #' # The appropriate file extension needs to be assigned by the user. 
-#' writeBin(f2, "constructionData.tab") +#' writeBin(f1, "constructionData.tab") #' writeBin(f4, "dataverse_download.zip") #' writeBin(f4[[1]], "Appendices.docx") #' @@ -66,9 +74,10 @@ get_file <- function(file, dataset = NULL, format = c("original", "bundle"), + server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), + archival = NULL, ...) { format <- match.arg(format) @@ -98,6 +107,7 @@ get_file <- vars = vars, key = key, server = server, + archival = archival, ... ) } @@ -113,4 +123,38 @@ get_file <- +#' @rdname files +#' +#' +#' @param filename Filename of the dataset, with file extension +#' +#' @inheritParams get_file +#' +#' @export +get_file_by_name <- function(filename, + dataset, + format = c("original", "bundle"), + server = Sys.getenv("DATAVERSE_SERVER"), + vars = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + archival = NULL, + ... + ) { + format <- match.arg(format) + + + # retrieve ID + fileid <- get_fileid.character(x = dataset, + file = filename, + server = server, + ...) + + get_file_by_id(fileid, + format = format, + vars = vars, + key = key, + server = server, + archival = archival, + ...) +} diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 7b5a1ee..a59f61d 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -1,9 +1,8 @@ -#' @title Download Single File by dataverse ID -#' #' @rdname files #' #' @param archival If a ingested (.tab) version is available, download #' the ingested archival version or not? 
+#' @param fileid A numeric ID internally used for `get_file_by_id` #' #' @importFrom xml2 read_xml as_list #' diff --git a/man/files.Rd b/man/files.Rd index c995780..e54b920 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/get_file.R, R/get_file_by_id.R \name{get_file} \alias{get_file} +\alias{get_file_by_name} \alias{get_file_by_id} \title{Download File(s)} \usage{ @@ -9,9 +10,21 @@ get_file( file, dataset = NULL, format = c("original", "bundle"), + server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), + archival = NULL, + ... +) + +get_file_by_name( + filename, + dataset, + format = c("original", "bundle"), server = Sys.getenv("DATAVERSE_SERVER"), + vars = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + archival = NULL, ... ) @@ -39,6 +52,11 @@ For tabular datasets, the option \dQuote{bundle} downloads the bundle of the original and archival versions, as well as the documentation. See for details.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} + \item{vars}{A character vector specifying one or more variable names, used to extract a subset of the data.} @@ -47,19 +65,16 @@ is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{server}{A character string specifying a Dataverse server. There are -multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically -or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} +\item{archival}{If a ingested (.tab) version is available, download +the ingested archival version or not?} \item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} -\item{fileid}{A numeric ID internally used for `get_file_by_id`} +\item{filename}{Filename of the dataset, with file extension} -\item{archival}{If a ingested (.tab) version is available, download -the ingested archival version or not?} +\item{fileid}{A numeric ID internally used for `get_file_by_id`} } \value{ \code{get_file} returns a raw vector (or list of raw vectors, @@ -68,8 +83,13 @@ the ingested archival version or not?} \link{get_dataframe_by_name}. } \description{ -Download Dataverse File(s). `get_file` internally calls - `get_file_by_id`. +Download Dataverse File(s). `get_file` is a general wrapper, + and can take either dataverse objects, file IDs, or a filename and dataverse. + `get_file_by_name` is a shorthand for running `get_file` by + specifying a file name (`filename`) and dataverse DOI (`dataset`). + Internally, all functions download each file by `get_file_by_id`. `get_file_*` + functions return a raw binary file, which cannot be readily analyzed in R. + To download dataframes, see the `get_dataset_*` functions at \link{get_dataset} } \details{ This function provides access to data files from a Dataverse entry. @@ -77,14 +97,15 @@ This function provides access to data files from a Dataverse entry. \examples{ \dontrun{ -# 1. Two-steps: Find ID from get_dataset -d1 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") -f1 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") +# 1. Using filename and dataverse +f1 <- get_file_by_name("constructionData.tab", + dataset = "doi:10.7910/DVN/ARKOTI", + server = "dataverse.harvard.edu") + +# 2. 
Two-steps: Find ID from get_dataset +d2 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") +f2 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") -# 2. Using filename and dataverse -f2 <- get_file("constructionData.tab", - "doi:10.7910/DVN/ARKOTI", - server = "dataverse.harvard.edu") # 3. Based on "dataverse_file" object f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") @@ -99,7 +120,7 @@ length(f4) # Write binary files. # The appropriate file extension needs to be assigned by the user. -writeBin(f2, "constructionData.tab") +writeBin(f1, "constructionData.tab") writeBin(f4, "dataverse_download.zip") writeBin(f4[[1]], "Appendices.docx") From f2f60abc08a920be73206b8c2f5808042fcfb22d Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Sun, 27 Dec 2020 23:29:52 -0500 Subject: [PATCH 21/75] Set markdown option to true in DESCRIPTION so markdown code in roxygen will be rendered properly --- DESCRIPTION | 1 + man/add_dataset_file.Rd | 2 +- man/add_file.Rd | 2 +- man/create_dataset.Rd | 2 +- man/create_dataverse.Rd | 2 +- man/dataset_atom.Rd | 2 +- man/dataset_versions.Rd | 2 +- man/dataverse.Rd | 14 +++++++------- man/dataverse_metadata.Rd | 2 +- man/dataverse_search.Rd | 2 +- man/delete_dataset.Rd | 2 +- man/delete_dataverse.Rd | 2 +- man/delete_file.Rd | 2 +- man/delete_sword_dataset.Rd | 2 +- man/files.Rd | 30 +++++++++++++++--------------- man/get_dataframe.Rd | 16 ++++++++-------- man/get_dataset.Rd | 2 +- man/get_dataverse.Rd | 2 +- man/get_facets.Rd | 2 +- man/get_file_metadata.Rd | 6 +++--- man/initiate_sword_dataset.Rd | 2 +- man/list_datasets.Rd | 2 +- man/publish_dataset.Rd | 2 +- man/publish_dataverse.Rd | 2 +- man/publish_sword_dataset.Rd | 2 +- man/service_document.Rd | 2 +- man/set_dataverse_metadata.Rd | 2 +- 27 files changed, 56 insertions(+), 55 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5ac1516..7b90f99 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -61,3 +61,4 @@ BugReports: 
https://github.com/iqss/dataverse-client-r/issues VignetteBuilder: knitr Encoding: UTF-8 RoxygenNote: 7.1.1 +Roxygen: list(markdown = TRUE) diff --git a/man/add_dataset_file.Rd b/man/add_dataset_file.Rd index b4fff97..c2b2b15 100644 --- a/man/add_dataset_file.Rd +++ b/man/add_dataset_file.Rd @@ -39,7 +39,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/add_file.Rd b/man/add_file.Rd index f65268e..848de4b 100644 --- a/man/add_file.Rd +++ b/man/add_file.Rd @@ -24,7 +24,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd index bc83cf5..4ca5a8b 100644 --- a/man/create_dataset.Rd +++ b/man/create_dataset.Rd @@ -33,7 +33,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). 
This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/create_dataverse.Rd b/man/create_dataverse.Rd index 36d5a68..9df42c8 100644 --- a/man/create_dataverse.Rd +++ b/man/create_dataverse.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/dataset_atom.Rd b/man/dataset_atom.Rd index d4cf4c7..3c3d423 100644 --- a/man/dataset_atom.Rd +++ b/man/dataset_atom.Rd @@ -29,7 +29,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/dataset_versions.Rd b/man/dataset_versions.Rd index f573ec1..31af3cc 100644 --- a/man/dataset_versions.Rd +++ b/man/dataset_versions.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). 
This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/dataverse.Rd b/man/dataverse.Rd index 2c7313b..570a4c6 100644 --- a/man/dataverse.Rd +++ b/man/dataverse.Rd @@ -15,17 +15,17 @@ A Dataverse is structured as a nested set of \dQuote{dataverse} repositories, su This package provides five main sets of functions to interact with Dataverse: \itemize{ - \item Search: \code{\link{dataverse_search}} - \item Data retrieval: \code{\link{get_dataverse}}, \code{\link{dataverse_contents}}, \code{\link{get_dataset}}, \code{\link{dataset_metadata}}, \code{\link{get_file}} - \item Data archiving (SWORD API): \code{\link{service_document}}, \code{\link{list_datasets}}, \code{\link{initiate_sword_dataset}}, \code{\link{delete_sword_dataset}}, \code{\link{publish_sword_dataset}}, \code{\link{add_file}}, \code{\link{delete_file}} - \item Dataverse management \dQuote{native} API: \code{\link{create_dataverse}}, \code{\link{publish_dataverse}}, \code{\link{delete_dataverse}} - \item Dataset management \dQuote{native} API: \code{\link{create_dataset}}, \code{\link{update_dataset}}, \code{\link{publish_dataset}}, \code{\link{delete_dataset}}, \code{\link{dataset_files}}, \code{\link{dataset_versions}} +\item Search: \code{\link{dataverse_search}} +\item Data retrieval: \code{\link{get_dataverse}}, \code{\link{dataverse_contents}}, \code{\link{get_dataset}}, \code{\link{dataset_metadata}}, \code{\link{get_file}} +\item Data archiving (SWORD API): \code{\link{service_document}}, \code{\link{list_datasets}}, \code{\link{initiate_sword_dataset}}, \code{\link{delete_sword_dataset}}, \code{\link{publish_sword_dataset}}, \code{\link{add_file}}, \code{\link{delete_file}} +\item Dataverse management \dQuote{native} API: \code{\link{create_dataverse}}, 
\code{\link{publish_dataverse}}, \code{\link{delete_dataverse}} +\item Dataset management \dQuote{native} API: \code{\link{create_dataset}}, \code{\link{update_dataset}}, \code{\link{publish_dataset}}, \code{\link{delete_dataset}}, \code{\link{dataset_files}}, \code{\link{dataset_versions}} } } \references{ \href{http://guides.dataverse.org/en/latest/api/index.html}{Dataverse API Documentation} - \href{http://dataverse.org/}{Dataverse Homepage} +\href{http://dataverse.org/}{Dataverse Homepage} - \href{https://dataverse.harvard.edu/}{Harvard IQSS Dataverse} +\href{https://dataverse.harvard.edu/}{Harvard IQSS Dataverse} } diff --git a/man/dataverse_metadata.Rd b/man/dataverse_metadata.Rd index 11fa71e..0735899 100644 --- a/man/dataverse_metadata.Rd +++ b/man/dataverse_metadata.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/dataverse_search.Rd b/man/dataverse_search.Rd index 622cb2a..0ed2e61 100644 --- a/man/dataverse_search.Rd +++ b/man/dataverse_search.Rd @@ -49,7 +49,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). 
This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{verbose}{A logical indicating whether to display information about the search query (default is \code{TRUE}).} diff --git a/man/delete_dataset.Rd b/man/delete_dataset.Rd index 4357c75..56a2744 100644 --- a/man/delete_dataset.Rd +++ b/man/delete_dataset.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/delete_dataverse.Rd b/man/delete_dataverse.Rd index 95c1087..a34ce4c 100644 --- a/man/delete_dataverse.Rd +++ b/man/delete_dataverse.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/delete_file.Rd b/man/delete_file.Rd index 6cae555..38908d3 100644 --- a/man/delete_file.Rd +++ b/man/delete_file.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. 
There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/delete_sword_dataset.Rd b/man/delete_sword_dataset.Rd index a59582a..7cb8462 100644 --- a/man/delete_sword_dataset.Rd +++ b/man/delete_sword_dataset.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/files.Rd b/man/files.Rd index e54b920..a8866da 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -43,18 +43,18 @@ get_file_by_id( \item{file}{An integer specifying a file identifier; or a vector of integers specifying file identifiers; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of - class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} +class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} \item{format}{A character string specifying a file format for download. -by default, this is \dQuote{original} (the original file format). If `NULL`, +by default, this is \dQuote{original} (the original file format). If \code{NULL}, no query is added, so ingested files are returned in their ingested TSV form. 
For tabular datasets, the option \dQuote{bundle} downloads the bundle of the original and archival versions, as well as the documentation. -See for details.} +See \url{https://guides.dataverse.org/en/latest/api/dataaccess.html} for details.} \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{vars}{A character vector specifying one or more variable names, used to @@ -74,22 +74,22 @@ the ingested archival version or not?} \item{filename}{Filename of the dataset, with file extension} -\item{fileid}{A numeric ID internally used for `get_file_by_id`} +\item{fileid}{A numeric ID internally used for \code{get_file_by_id}} } \value{ \code{get_file} returns a raw vector (or list of raw vectors, - if \code{length(file) > 1}), which can be saved locally with the `writeBin` - function. To load datasets into the R environment dataframe, see - \link{get_dataframe_by_name}. +if \code{length(file) > 1}), which can be saved locally with the \code{writeBin} +function. To load datasets into the R environment dataframe, see +\link{get_dataframe_by_name}. } \description{ -Download Dataverse File(s). `get_file` is a general wrapper, - and can take either dataverse objects, file IDs, or a filename and dataverse. - `get_file_by_name` is a shorthand for running `get_file` by - specifying a file name (`filename`) and dataverse DOI (`dataset`). - Internally, all functions download each file by `get_file_by_id`. `get_file_*` - functions return a raw binary file, which cannot be readily analyzed in R. - To download dataframes, see the `get_dataset_*` functions at \link{get_dataset} +Download Dataverse File(s). 
\code{get_file} is a general wrapper, +and can take either dataverse objects, file IDs, or a filename and dataverse. +\code{get_file_by_name} is a shorthand for running \code{get_file} by +specifying a file name (\code{filename}) and dataverse DOI (\code{dataset}). +Internally, all functions download each file by \code{get_file_by_id}. \verb{get_file_*} +functions return a raw binary file, which cannot be readily analyzed in R. +To download dataframes, see the \verb{get_dataset_*} functions at \link{get_dataset} } \details{ This function provides access to data files from a Dataverse entry. diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 21ea96f..197a7dc 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -27,19 +27,19 @@ environment} \item{archival}{Whether to read from the ingested, archival version of the dataset, or whether to read the original. The archival versions are tab-delimited -`.tab` files. If functions to read the original version is available without -loss of information, then `archival = FALSE` is better. If such functions -are not available or the original format is unknown, use `archival = TRUE`.} +\code{.tab} files. If functions to read the original version is available without +loss of information, then \code{archival = FALSE} is better. If such functions +are not available or the original format is unknown, use \code{archival = TRUE}.} \item{...}{ Arguments passed on to \code{\link[=get_file]{get_file}} \describe{ \item{\code{format}}{A character string specifying a file format for download. -by default, this is \dQuote{original} (the original file format). If `NULL`, +by default, this is \dQuote{original} (the original file format). If \code{NULL}, no query is added, so ingested files are returned in their ingested TSV form. For tabular datasets, the option \dQuote{bundle} downloads the bundle of the original and archival versions, as well as the documentation. 
-See for details.} +See \url{https://guides.dataverse.org/en/latest/api/dataaccess.html} for details.} \item{\code{vars}}{A character vector specifying one or more variable names, used to extract a subset of the data.} \item{\code{key}}{A character string specifying a Dataverse server API key. If one @@ -48,13 +48,13 @@ Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} \item{\code{server}}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} }} } \description{ -`get_dataframe_by_id`, if you know the numeric ID of the dataset, or instead -`get_dataframe_by_name` if you know the filename and doi. The dataset +\code{get_dataframe_by_id}, if you know the numeric ID of the dataset, or instead +\code{get_dataframe_by_name} if you know the filename and doi. The dataset } \examples{ # load dataset from file name and dataverse DOI diff --git a/man/get_dataset.Rd b/man/get_dataset.Rd index c1bd0ca..c0c55e0 100644 --- a/man/get_dataset.Rd +++ b/man/get_dataset.Rd @@ -43,7 +43,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). 
This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/get_dataverse.Rd b/man/get_dataverse.Rd index fb10545..832df18 100644 --- a/man/get_dataverse.Rd +++ b/man/get_dataverse.Rd @@ -30,7 +30,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{check}{A logical indicating whether to check that the value of \code{dataverse} is actually a numeric} diff --git a/man/get_facets.Rd b/man/get_facets.Rd index ae0afdd..3819735 100644 --- a/man/get_facets.Rd +++ b/man/get_facets.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). 
This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/get_file_metadata.Rd b/man/get_file_metadata.Rd index 044b405..3c3e343 100644 --- a/man/get_file_metadata.Rd +++ b/man/get_file_metadata.Rd @@ -17,7 +17,7 @@ get_file_metadata( \item{file}{An integer specifying a file identifier; or a vector of integers specifying file identifiers; or, if \code{doi} is specified, a character string specifying a file name within the DOI-identified dataset; or an object of - class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} +class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} \item{format}{Defaults to \dQuote{ddi} for metadata files} @@ -28,7 +28,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as @@ -37,7 +37,7 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") } \value{ A character vector containing a DDI - metadata file. +metadata file. } \description{ Retrieve a ddi metadata file diff --git a/man/initiate_sword_dataset.Rd b/man/initiate_sword_dataset.Rd index 444b040..9565759 100644 --- a/man/initiate_sword_dataset.Rd +++ b/man/initiate_sword_dataset.Rd @@ -24,7 +24,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. 
There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/list_datasets.Rd b/man/list_datasets.Rd index c88b576..3d0257f 100644 --- a/man/list_datasets.Rd +++ b/man/list_datasets.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/publish_dataset.Rd b/man/publish_dataset.Rd index 52a10d8..2414f77 100644 --- a/man/publish_dataset.Rd +++ b/man/publish_dataset.Rd @@ -24,7 +24,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). 
This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/publish_dataverse.Rd b/man/publish_dataverse.Rd index fbac84d..de2bbb2 100644 --- a/man/publish_dataverse.Rd +++ b/man/publish_dataverse.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/publish_sword_dataset.Rd b/man/publish_sword_dataset.Rd index 246275d..34c2e90 100644 --- a/man/publish_sword_dataset.Rd +++ b/man/publish_sword_dataset.Rd @@ -21,7 +21,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/service_document.Rd b/man/service_document.Rd index 2d91e59..a11a7a6 100644 --- a/man/service_document.Rd +++ b/man/service_document.Rd @@ -18,7 +18,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. 
There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as diff --git a/man/set_dataverse_metadata.Rd b/man/set_dataverse_metadata.Rd index 2119aa7..77110c3 100644 --- a/man/set_dataverse_metadata.Rd +++ b/man/set_dataverse_metadata.Rd @@ -27,7 +27,7 @@ Keys can be specified atomically or globally using \item{server}{A character string specifying a Dataverse server. There are multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (`server = "dataverse.harvard.edu"`). This can be modified atomically +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} \item{...}{Additional arguments passed to an HTTP request function, such as From 4375b9a2a750db95e80ca7a9b606b6d1e7d9977d Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Mon, 28 Dec 2020 00:41:16 -0500 Subject: [PATCH 22/75] Change example to dataset on example, not the Monogan text --- R/get_file.R | 31 ++++++++++++++------------- R/get_file_as_dataframe.R | 45 +++++++++++++++++++++++---------------- man/files.Rd | 30 ++++++++++++++------------ man/get_dataframe.Rd | 44 ++++++++++++++++++++++---------------- 4 files changed, 85 insertions(+), 65 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index 1d18641..ea23616 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -41,31 +41,33 @@ #' \dontrun{ #' #' # 1. 
Using filename and dataverse -#' f1 <- get_file_by_name("constructionData.tab", -#' dataset = "doi:10.7910/DVN/ARKOTI", -#' server = "dataverse.harvard.edu") +#' f1 <- get_file_by_name("gapminder-FiveYearData.tab", +#' dataset = "doi:10.70122/FK2/PPKHI1", +#' server = "demo.dataverse.org") #' #' # 2. Two-steps: Find ID from get_dataset -#' d2 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") -#' f2 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") +#' d2 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +#' f2 <- get_file(d1$files$id[1], server = "demo.dataverse.org") #' #' -#' # 3. Based on "dataverse_file" object -#' f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") -#' f3 <- get_file(f3_dvf[[2]], server = "dataverse.harvard.edu") +#' # 3. Alternatively, based on "dataverse_file" object +#' f3_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +#' f3 <- get_file(f3_dvf[[1]], server = "demo.dataverse.org") #' #' # 4. Retrieve multiple raw data in list -#' f4_vec <- get_dataset("doi:10.7910/DVN/CXOB4K", -#' server = "dataverse.harvard.edu")$files$id +#' f4_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", +#' server = "demo.dataverse.org")$files$id #' f4 <- get_file(f4_vec, -#' server = "dataverse.harvard.edu") +#' server = "demo.dataverse.org") #' length(f4) #' #' # Write binary files. #' # The appropriate file extension needs to be assigned by the user. 
-#' writeBin(f1, "constructionData.tab") -#' writeBin(f4, "dataverse_download.zip") -#' writeBin(f4[[1]], "Appendices.docx") +#' writeBin(f1, "gapminder-FiveYearData.tab") +#' writeBin(f4[[1]], "gapminder-FiveYearData.tab") +#' +#' # NOTE: fix so that get_file (with multiple) files +#' # (f4) in example can return a tabulated dataset in original #' #' } #' @@ -122,7 +124,6 @@ get_file <- } - #' @rdname files #' #' diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 36881ab..847dade 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -19,34 +19,43 @@ #' @inheritDotParams get_file #' #' @examples -#' # load dataset from file name and dataverse DOI +# load dataset from file name and dataverse DOI #' gap_df <- get_dataframe_by_name( #' file = "gapminder-FiveYearData.tab", -#' dataset = "doi:10.7910/DVN/GJQNEQ", -#' server = "dataverse.harvard.edu", -#' read_function = readr::read_csv) +#' dataset = "doi:10.70122/FK2/PPKHI1", +#' server = "demo.dataverse.org", +#' read_function = read_csv) +#' +#' # or a Stata dta +#' stata_df <- get_dataframe_by_id( +#' file = "nlsw88.tab", +#' dataset = "doi:10.70122/FK2/PPKHI1", +#' server = "demo.dataverse.org", +#' read_function = haven::read_dta) #' #' # equivalently, if you know the ID +#' # you can also customize the read_function (in this case to supress parse msg) #' gap_df <- get_dataframe_by_id( -#' 3037713, -#' server = "dataverse.harvard.edu", -#' read_function = readr::read_csv) +#' 1733998, +#' server = "demo.dataverse.org", +#' read_function = function(x) read_csv(x, col_types = cols())) #' #' # equivalently, using a dataverse object -#' gap_ds <- dataset_files("doi:10.7910/DVN/GJQNEQ", -#' server = "dataverse.harvard.edu") +#' gap_ds <- dataset_files("doi:10.70122/FK2/PPKHI1", +#' server = "demo.dataverse.org") +#' #' gap_df <- get_dataframe_by_id( -#' gap_ds[[2]], -#' server = "dataverse.harvard.edu", -#' read_function = readr::read_csv -#' ) +#' gap_ds[[1]], +#' server = 
"demo.dataverse.org", +#' read_function = function(x) read_csv(x, col_types = cols())) #' -#' # to use the ingested version (and read as TSV) +#' # to use the archival version (and read as TSV) #' gap_df <- get_dataframe_by_id( -#' 3037713, -#' server = "dataverse.harvard.edu", -#' archival = TRUE, -#' read_function = readr::read_tsv) +#' 1733998, +#' server = "demo.dataverse.org", +#' archival = TRUE, +#' read_function = function(x) read_tsv(x, col_types = cols())) +#' #' #' @export get_dataframe_by_name <- function(file, diff --git a/man/files.Rd b/man/files.Rd index a8866da..6d0036f 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -98,31 +98,33 @@ This function provides access to data files from a Dataverse entry. \dontrun{ # 1. Using filename and dataverse -f1 <- get_file_by_name("constructionData.tab", - dataset = "doi:10.7910/DVN/ARKOTI", - server = "dataverse.harvard.edu") +f1 <- get_file_by_name("gapminder-FiveYearData.tab", + dataset = "doi:10.70122/FK2/PPKHI1", + server = "demo.dataverse.org") # 2. Two-steps: Find ID from get_dataset -d2 <- get_dataset("doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu") -f2 <- get_file(d1$files$id[1], server = "dataverse.harvard.edu") +d2 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +f2 <- get_file(d1$files$id[1], server = "demo.dataverse.org") -# 3. Based on "dataverse_file" object -f3_dvf <- dataset_files(2692151, server = "dataverse.harvard.edu") -f3 <- get_file(f3_dvf[[2]], server = "dataverse.harvard.edu") +# 3. Alternatively, based on "dataverse_file" object +f3_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +f3 <- get_file(f3_dvf[[1]], server = "demo.dataverse.org") # 4. 
Retrieve multiple raw data in list -f4_vec <- get_dataset("doi:10.7910/DVN/CXOB4K", - server = "dataverse.harvard.edu")$files$id +f4_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", + server = "demo.dataverse.org")$files$id f4 <- get_file(f4_vec, - server = "dataverse.harvard.edu") + server = "demo.dataverse.org") length(f4) # Write binary files. # The appropriate file extension needs to be assigned by the user. -writeBin(f1, "constructionData.tab") -writeBin(f4, "dataverse_download.zip") -writeBin(f4[[1]], "Appendices.docx") +writeBin(f1, "gapminder-FiveYearData.tab") +writeBin(f4[[1]], "gapminder-FiveYearData.tab") + +# NOTE: fix so that get_file (with multiple) files +# (f4) in example can return a tabulated dataset in original } diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 197a7dc..8d50c15 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -57,33 +57,41 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \code{get_dataframe_by_name} if you know the filename and doi. 
The dataset } \examples{ -# load dataset from file name and dataverse DOI gap_df <- get_dataframe_by_name( file = "gapminder-FiveYearData.tab", - dataset = "doi:10.7910/DVN/GJQNEQ", - server = "dataverse.harvard.edu", - read_function = readr::read_csv) + dataset = "doi:10.70122/FK2/PPKHI1", + server = "demo.dataverse.org", + read_function = read_csv) + +# or a Stata dta +stata_df <- get_dataframe_by_id( + file = "nlsw88.tab", + dataset = "doi:10.70122/FK2/PPKHI1", + server = "demo.dataverse.org", + read_function = haven::read_dta) # equivalently, if you know the ID +# you can also customize the read_function (in this case to supress parse msg) gap_df <- get_dataframe_by_id( - 3037713, - server = "dataverse.harvard.edu", - read_function = readr::read_csv) + 1733998, + server = "demo.dataverse.org", + read_function = function(x) read_csv(x, col_types = cols())) # equivalently, using a dataverse object -gap_ds <- dataset_files("doi:10.7910/DVN/GJQNEQ", - server = "dataverse.harvard.edu") +gap_ds <- dataset_files("doi:10.70122/FK2/PPKHI1", + server = "demo.dataverse.org") + gap_df <- get_dataframe_by_id( - gap_ds[[2]], - server = "dataverse.harvard.edu", - read_function = readr::read_csv -) + gap_ds[[1]], + server = "demo.dataverse.org", + read_function = function(x) read_csv(x, col_types = cols())) -# to use the ingested version (and read as TSV) +# to use the archival version (and read as TSV) gap_df <- get_dataframe_by_id( - 3037713, - server = "dataverse.harvard.edu", - archival = TRUE, - read_function = readr::read_tsv) + 1733998, + server = "demo.dataverse.org", + archival = TRUE, + read_function = function(x) read_tsv(x, col_types = cols())) + } From 6919efe3518faa035ecfb83b0a994a1867ab8cdd Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Mon, 28 Dec 2020 14:47:16 -0500 Subject: [PATCH 23/75] Detects if file is ingested by tryCatch (a slightly less clunky way) Now we can read non-ingested files too --- R/get_file_as_dataframe.R | 9 ++++++++- 
R/get_file_by_id.R | 20 +++++++++----------- man/get_dataframe.Rd | 9 ++++++++- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 847dade..a863c3d 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -27,12 +27,19 @@ #' read_function = read_csv) #' #' # or a Stata dta -#' stata_df <- get_dataframe_by_id( +#' stata_df <- get_dataframe_by_name( #' file = "nlsw88.tab", #' dataset = "doi:10.70122/FK2/PPKHI1", #' server = "demo.dataverse.org", #' read_function = haven::read_dta) #' +#' # or a Rds file +#' rds_df <- get_dataframe_by_name( +#' file = "nlsw88_rds-export.rds", +#' dataset = "doi:10.70122/FK2/PPKHI1", +#' server = "demo.dataverse.org", +#' read_function = read_rds) +#' #' # equivalently, if you know the ID #' # you can also customize the read_function (in this case to supress parse msg) #' gap_df <- get_dataframe_by_id( diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index a59f61d..a144425 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -22,22 +22,20 @@ get_file_by_id <- stopifnot(is.numeric(fileid)) stopifnot(length(fileid) == 1) - # detect file type to determine if something is ingested - if (!is.null(archival)) { - xml <- read_xml(get_file_metadata(fileid, server = server)) - filename <- as_list(xml)$codeBook$fileDscr$fileTxt$fileName[[1]] - is_ingested <- grepl(x = filename, pattern = "\\.tab$") - - if (archival & !is_ingested) - stop("The file does not have a .tab suffix so does not appear ingested.") - } else { - is_ingested <- FALSE - } + # ping get_file_metadata to see if file is ingested + ping_metadata <- tryCatch(get_file_metadata(fileid, server = server), + error = function(e) e) + is_ingested <- !inherits(ping_metadata, "error") # if error, not ingested # update archival if not specified if (is.null(archival)) archival <- FALSE + # check + if (archival & !is_ingested) + stop("You requested an archival version, but the file has no 
metadata so does not appear ingested.") + + # downloading files sequentially and add the raw vectors to a list out <- vector("list", length(fileid)) diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 8d50c15..89ad84d 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -64,12 +64,19 @@ gap_df <- get_dataframe_by_name( read_function = read_csv) # or a Stata dta -stata_df <- get_dataframe_by_id( +stata_df <- get_dataframe_by_name( file = "nlsw88.tab", dataset = "doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org", read_function = haven::read_dta) +# or a Rds file +rds_df <- get_dataframe_by_name( + file = "nlsw88_rds-export.rds", + dataset = "doi:10.70122/FK2/PPKHI1", + server = "demo.dataverse.org", + read_function = read_rds) + # equivalently, if you know the ID # you can also customize the read_function (in this case to supress parse msg) gap_df <- get_dataframe_by_id( From b5e2838d5be16db0013df3c1ee4a6eb846318b9b Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Mon, 28 Dec 2020 14:53:44 -0500 Subject: [PATCH 24/75] Add defaults to get_file_by_id in case they get run alone --- R/get_file_by_id.R | 4 ++-- man/files.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index a144425..ded84ee 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -10,9 +10,9 @@ get_file_by_id <- function(fileid, dataset = NULL, - server, + server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "RData", "prep", "bundle"), - vars, + vars = NULL, archival = NULL, key = Sys.getenv("DATAVERSE_KEY"), ...) { diff --git a/man/files.Rd b/man/files.Rd index 6d0036f..5b012e4 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -31,9 +31,9 @@ get_file_by_name( get_file_by_id( fileid, dataset = NULL, - server, + server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "RData", "prep", "bundle"), - vars, + vars = NULL, archival = NULL, key = Sys.getenv("DATAVERSE_KEY"), ... 
From 22427e0bf9af03ab1039b48df5b79a1a63fff0ad Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Mon, 28 Dec 2020 15:23:45 -0500 Subject: [PATCH 25/75] Simpler condition for #33 - only use `format` for getting ingested tabular data in their original form. Otherwise, do not specify `format` --- R/get_file_by_id.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index ded84ee..aa7d12c 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -41,12 +41,18 @@ get_file_by_id <- # create query ----- query <- list() - if (!is.null(vars)) { + if (!is.null(vars)) query$vars <- paste0(vars, collapse = ",") - } - if (!is.null(format)) { + + # format only matters in ingested datasets, + # For non-ingested files (rds/docx), we need to NOT specify a format + if (is_ingested) query$format <- match.arg(format) - } + + # if the archival version is desired, we need to NOT specify a format + if (is_ingested & archival) + query$format <- NULL + # if bundle, use custom url ---- if (format == "bundle") { @@ -58,13 +64,7 @@ get_file_by_id <- # If not bundle, request single file in non-bundle format ---- u <- paste0(api_url(server), "access/datafile/", fileid) - # add query if you want to want the original version even though ingested - if (is_ingested & !archival) { - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) - } else { - # do not add query if not an ingestion file - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - } + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) 
httr::stop_for_status(r) out <- httr::content(r, as = "raw") From 266759e766d0db185106ae0c24a4807f6feb848b Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Mon, 28 Dec 2020 15:35:03 -0500 Subject: [PATCH 26/75] Add warning if read_function is not specified --- R/get_file_as_dataframe.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index a863c3d..5c1c0d1 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -91,8 +91,10 @@ get_dataframe_by_id <- function(file, raw <- get_file(file = file, archival = archival, ...) # default of get_file - if (is.null(read_function)) + if (is.null(read_function)) { + warning("function was not supplied so returning the raw binary file.") return(raw) + } # save to temp and then read it in with supplied function if (!is.null(read_function)) { From 89109daf527410232b059b0833e17ee30bc3ba63 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 00:51:47 -0500 Subject: [PATCH 27/75] Simpler --- R/get_file_by_id.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index aa7d12c..2231c3f 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -4,14 +4,13 @@ #' the ingested archival version or not? #' @param fileid A numeric ID internally used for `get_file_by_id` #' -#' @importFrom xml2 read_xml as_list #' #' @export get_file_by_id <- function(fileid, dataset = NULL, server = Sys.getenv("DATAVERSE_SERVER"), - format = c("original", "RData", "prep", "bundle"), + format = c("original", "bundle"), vars = NULL, archival = NULL, key = Sys.getenv("DATAVERSE_KEY"), @@ -70,3 +69,8 @@ get_file_by_id <- out <- httr::content(r, as = "raw") return(out) } + + +get_file_by_doi <- function() { + +} From 27050583fbbc1d55da05ac9fa8acf5bc2eff8e40 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 01:34:41 -0500 Subject: [PATCH 28/75] Implement _by_doi in files. 
Also simplify the httr::GET by only having it appear once --- NAMESPACE | 3 +- R/get_file.R | 36 +++++++++++++---------- R/get_file_as_dataframe.R | 8 +++++ R/get_file_by_id.R | 62 ++++++++++++++++++++++++++++++--------- man/files.Rd | 53 +++++++++++++++++++++++---------- 5 files changed, 115 insertions(+), 47 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 778fef3..f574143 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ export(get_dataset) export(get_dataverse) export(get_facets) export(get_file) +export(get_file_by_doi) export(get_file_by_id) export(get_file_by_name) export(get_file_metadata) @@ -51,5 +52,3 @@ import(xml2) importFrom(stats,setNames) importFrom(stringr,str_extract) importFrom(utils,str) -importFrom(xml2,as_list) -importFrom(xml2,read_xml) diff --git a/R/get_file.R b/R/get_file.R index ea23616..db6a776 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -1,15 +1,17 @@ #' @rdname files #' #' -#' @title Download File(s) +#' @title Download File #' #' @description Download Dataverse File(s). `get_file` is a general wrapper, #' and can take either dataverse objects, file IDs, or a filename and dataverse. #' `get_file_by_name` is a shorthand for running `get_file` by -#' specifying a file name (`filename`) and dataverse DOI (`dataset`). +#' specifying a file name (`filename`) and dataset (`dataset`). +#' `get_file_by_doi` obtains a file by its file DOI, bypassing the +#' `dataset` argument. #' Internally, all functions download each file by `get_file_by_id`. `get_file_*` #' functions return a raw binary file, which cannot be readily analyzed in R. -#' To download dataframes, see the `get_dataset_*` functions at \link{get_dataset} +#' To use the objects as dataframes, see the `get_dataset_*` functions at \link{get_dataset} #' #' #' @@ -45,29 +47,33 @@ #' dataset = "doi:10.70122/FK2/PPKHI1", #' server = "demo.dataverse.org") #' -#' # 2. 
Two-steps: Find ID from get_dataset -#' d2 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") -#' f2 <- get_file(d1$files$id[1], server = "demo.dataverse.org") +#' # 2. Using DOI +#' f2 <- get_file_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", +#' server = "demo.dataverse.org") #' +#' # 3. Two-steps: Find ID from get_dataset +#' d3 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +#' f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") #' -#' # 3. Alternatively, based on "dataverse_file" object -#' f3_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") -#' f3 <- get_file(f3_dvf[[1]], server = "demo.dataverse.org") #' -#' # 4. Retrieve multiple raw data in list -#' f4_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", +#' # 4. Alternatively, based on "dataverse_file" object +#' f4_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +#' f4 <- get_file(f4_dvf[[1]], server = "demo.dataverse.org") +#' +#' # 5. Retrieve multiple raw data in list +#' f5_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", #' server = "demo.dataverse.org")$files$id -#' f4 <- get_file(f4_vec, +#' f5 <- get_file(f5_vec, #' server = "demo.dataverse.org") -#' length(f4) +#' length(f5) #' #' # Write binary files. #' # The appropriate file extension needs to be assigned by the user. 
#' writeBin(f1, "gapminder-FiveYearData.tab") -#' writeBin(f4[[1]], "gapminder-FiveYearData.tab") +#' writeBin(f5[[1]], "gapminder-FiveYearData.tab") #' #' # NOTE: fix so that get_file (with multiple) files -#' # (f4) in example can return a tabulated dataset in original +#' # (f5) in example can return a tabulated dataset in original #' #' } #' diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 5c1c0d1..4c6cde3 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -102,6 +102,13 @@ get_dataframe_by_id <- function(file, } } +get_dataframe_by_doi <- function(doi, + read_function = NULL, + archival = FALSE, + ...) { + doi <- prepend_doi(doi) + +} #' Write to temp and apply function #' @@ -114,3 +121,4 @@ get_dataframe_internal <- function(raw, filename, .f) { do.call(.f, list(tmp)) } + diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 2231c3f..cc656cc 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -18,9 +18,15 @@ get_file_by_id <- format <- match.arg(format) # single file ID - stopifnot(is.numeric(fileid)) stopifnot(length(fileid) == 1) + # must be a number OR doi string in the form of "doi:" + if (is.numeric(fileid)) + use_persistentID <- FALSE + if (grepl(x = fileid, pattern = "^doi:")) + use_persistentID <- TRUE + + # ping get_file_metadata to see if file is ingested ping_metadata <- tryCatch(get_file_metadata(fileid, server = server), error = function(e) e) @@ -45,7 +51,8 @@ get_file_by_id <- # format only matters in ingested datasets, # For non-ingested files (rds/docx), we need to NOT specify a format - if (is_ingested) + # also for bundle, only change url + if (is_ingested & format != "bundle") query$format <- match.arg(format) # if the archival version is desired, we need to NOT specify a format @@ -53,24 +60,51 @@ get_file_by_id <- query$format <- NULL - # if bundle, use custom url ---- - if (format == "bundle") { - u <- paste0(api_url(server), "access/datafile/bundle/", fileid) - r <- 
httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) - out <- httr::content(r, as = "raw") - return(out) - } + # part of URL depending on DOI, bundle, or file + if (format == "bundle") + u_part <- "access/datafile/bundle/" + + if (format == "original") + u_part <- "access/datafile/" + + if (use_persistentID) + u_part <- "access/datafile/:persistentId/?persistentId=" # If not bundle, request single file in non-bundle format ---- - u <- paste0(api_url(server), "access/datafile/", fileid) - r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) + u <- paste0(api_url(server), u_part, fileid) + r <- httr::GET(u, + httr::add_headers("X-Dataverse-key" = key), + query = query, + ...) httr::stop_for_status(r) - out <- httr::content(r, as = "raw") - return(out) + httr::content(r, as = "raw") } -get_file_by_doi <- function() { +#' @rdname files +#' @param filedoi A DOI for a single file (not the entire dataset), of the form +#' `"10.70122/FK2/PPKHI1/ZYATZZ"` or `"doi:10.70122/FK2/PPKHI1/ZYATZZ"` +#' +#' @export +get_file_by_doi <- function(filedoi, + dataset = NULL, + server = Sys.getenv("DATAVERSE_SERVER"), + format = c("original", "bundle"), + vars = NULL, + archival = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + ...) { + + get_file_by_id( + fileid = prepend_doi(filedoi), + dataset = dataset, + format = format, + vars = vars, + key = key, + server = server, + archival = archival, + ... + ) } diff --git a/man/files.Rd b/man/files.Rd index 5b012e4..7d13148 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -4,7 +4,8 @@ \alias{get_file} \alias{get_file_by_name} \alias{get_file_by_id} -\title{Download File(s)} +\alias{get_file_by_doi} +\title{Download File} \usage{ get_file( file, @@ -32,7 +33,18 @@ get_file_by_id( fileid, dataset = NULL, server = Sys.getenv("DATAVERSE_SERVER"), - format = c("original", "RData", "prep", "bundle"), + format = c("original", "bundle"), + vars = NULL, + archival = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + ... 
+) + +get_file_by_doi( + filedoi, + dataset = NULL, + server = Sys.getenv("DATAVERSE_SERVER"), + format = c("original", "bundle"), vars = NULL, archival = NULL, key = Sys.getenv("DATAVERSE_KEY"), @@ -75,6 +87,9 @@ the ingested archival version or not?} \item{filename}{Filename of the dataset, with file extension} \item{fileid}{A numeric ID internally used for \code{get_file_by_id}} + +\item{filedoi}{A DOI for a single file (not the entire dataset), of the form +\code{"10.70122/FK2/PPKHI1/ZYATZZ"} or \code{"doi:10.70122/FK2/PPKHI1/ZYATZZ"}} } \value{ \code{get_file} returns a raw vector (or list of raw vectors, @@ -86,10 +101,12 @@ function. To load datasets into the R environment dataframe, see Download Dataverse File(s). \code{get_file} is a general wrapper, and can take either dataverse objects, file IDs, or a filename and dataverse. \code{get_file_by_name} is a shorthand for running \code{get_file} by -specifying a file name (\code{filename}) and dataverse DOI (\code{dataset}). +specifying a file name (\code{filename}) and dataset (\code{dataset}). +\code{get_file_by_doi} obtains a file by its file DOI, bypassing the +\code{dataset} argument. Internally, all functions download each file by \code{get_file_by_id}. \verb{get_file_*} functions return a raw binary file, which cannot be readily analyzed in R. -To download dataframes, see the \verb{get_dataset_*} functions at \link{get_dataset} +To use the objects as dataframes, see the \verb{get_dataset_*} functions at \link{get_dataset} } \details{ This function provides access to data files from a Dataverse entry. @@ -102,29 +119,33 @@ f1 <- get_file_by_name("gapminder-FiveYearData.tab", dataset = "doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") -# 2. Two-steps: Find ID from get_dataset -d2 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") -f2 <- get_file(d1$files$id[1], server = "demo.dataverse.org") +# 2. 
Using DOI +f2 <- get_file_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", + server = "demo.dataverse.org") + +# 3. Two-steps: Find ID from get_dataset +d3 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") -# 3. Alternatively, based on "dataverse_file" object -f3_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") -f3 <- get_file(f3_dvf[[1]], server = "demo.dataverse.org") +# 4. Alternatively, based on "dataverse_file" object +f4_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +f4 <- get_file(f4_dvf[[1]], server = "demo.dataverse.org") -# 4. Retrieve multiple raw data in list -f4_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", +# 5. Retrieve multiple raw data in list +f5_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org")$files$id -f4 <- get_file(f4_vec, +f5 <- get_file(f5_vec, server = "demo.dataverse.org") -length(f4) +length(f5) # Write binary files. # The appropriate file extension needs to be assigned by the user. 
writeBin(f1, "gapminder-FiveYearData.tab") -writeBin(f4[[1]], "gapminder-FiveYearData.tab") +writeBin(f5[[1]], "gapminder-FiveYearData.tab") # NOTE: fix so that get_file (with multiple) files -# (f4) in example can return a tabulated dataset in original +# (f5) in example can return a tabulated dataset in original } From d4bd2dc0552e14837ec5de64cde1f1593e334de8 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 02:16:29 -0500 Subject: [PATCH 29/75] Add get_dataframe_by_doi -- can just use _by_id now that get_file_by_id also takes a DOI --- NAMESPACE | 1 + R/get_file.R | 7 ++++--- R/get_file_as_dataframe.R | 19 ++++++++++++++++--- man/get_dataframe.Rd | 15 ++++++++++++++- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f574143..c79c987 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ export(delete_dataset) export(delete_dataverse) export(delete_file) export(delete_sword_dataset) +export(get_dataframe_by_doi) export(get_dataframe_by_id) export(get_dataframe_by_name) export(get_dataset) diff --git a/R/get_file.R b/R/get_file.R index db6a776..2bb3c16 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -98,11 +98,12 @@ get_file <- if (!is.numeric(file) & inherits(file, "dataverse_file")) fileid <- get_fileid.dataverse_file(file, key = key, server = server) - if (!is.numeric(file) & !inherits(file, "dataverse_file") & is.null(dataset)) - stop("When 'file' is a character (non-global ID), dataset must be specified.") - if (!is.numeric(file) & !inherits(file, "dataverse_file")) + if (!is.numeric(file) & !inherits(file, "dataverse_file") & !is.null(dataset)) fileid <- get_fileid.character(dataset, file, key = key, server = server, ...) + if (!is.numeric(file) & !inherits(file, "dataverse_file") & is.null(dataset) & !grepl(x = file, pattern = "^doi")) + stop("When 'file' is a character (non-global ID), dataset must be specified.") + # Main function. 
Call get_file_by_id out <- vector("list", length(fileid)) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 4c6cde3..341ec1c 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -40,7 +40,14 @@ #' server = "demo.dataverse.org", #' read_function = read_rds) #' -#' # equivalently, if you know the ID +#' # equivalently, if you know the DOI +#' gap_df <- get_dataframe_by_doi( +#' filedoi = "10.70122/FK2/PPKHI1/ZYATZZ", +#' server = "demo.dataverse.rog", +#' read_function = read_csv +#' ) +#' +#' # or the id #' # you can also customize the read_function (in this case to supress parse msg) #' gap_df <- get_dataframe_by_id( #' 1733998, @@ -102,12 +109,18 @@ get_dataframe_by_id <- function(file, } } -get_dataframe_by_doi <- function(doi, + +#' @rdname get_dataframe +#' @inheritParams get_file_by_doi +#' @export +get_dataframe_by_doi <- function(filedoi, read_function = NULL, archival = FALSE, ...) { - doi <- prepend_doi(doi) + filedoi <- prepend_doi(filedoi) + # get_file can also take doi now + get_dataframe_by_id(file = filedoi, read_function = read_function, archival = archival, ...) } #' Write to temp and apply function diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 89ad84d..5ff45f0 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -3,6 +3,7 @@ \name{get_dataframe_by_name} \alias{get_dataframe_by_name} \alias{get_dataframe_by_id} +\alias{get_dataframe_by_doi} \title{Get file from dataverse and convert it into a dataframe or tibble} \usage{ get_dataframe_by_name( @@ -14,6 +15,8 @@ get_dataframe_by_name( ) get_dataframe_by_id(file, read_function = NULL, archival = FALSE, ...) + +get_dataframe_by_doi(filedoi, read_function = NULL, archival = FALSE, ...) } \arguments{ \item{file}{to be passed on to get_file} @@ -51,6 +54,9 @@ multiple Dataverse installations, but the defaults is to use the Harvard Dataverse (\code{server = "dataverse.harvard.edu"}). 
This can be modified atomically or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} }} + +\item{filedoi}{A DOI for a single file (not the entire dataset), of the form +\code{"10.70122/FK2/PPKHI1/ZYATZZ"} or \code{"doi:10.70122/FK2/PPKHI1/ZYATZZ"}} } \description{ \code{get_dataframe_by_id}, if you know the numeric ID of the dataset, or instead @@ -77,7 +83,14 @@ rds_df <- get_dataframe_by_name( server = "demo.dataverse.org", read_function = read_rds) -# equivalently, if you know the ID +# equivalently, if you know the DOI +gap_df <- get_dataframe_by_doi( + filedoi = "10.70122/FK2/PPKHI1/ZYATZZ", + server = "demo.dataverse.rog", + read_function = read_csv +) + +# or the id # you can also customize the read_function (in this case to supress parse msg) gap_df <- get_dataframe_by_id( 1733998, From 603c0a672d95bf01d3ad10f89fd3f5610b1c1b6e Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 16:17:01 -0500 Subject: [PATCH 30/75] Do some tricks to get metadaga of DOIs (but only file specific) too --- R/get_file_metadata.R | 20 +++++++++++++++++--- man/get_file_metadata.Rd | 5 +++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/R/get_file_metadata.R b/R/get_file_metadata.R index d5f7cec..f0335c3 100644 --- a/R/get_file_metadata.R +++ b/R/get_file_metadata.R @@ -15,18 +15,32 @@ get_file_metadata <- key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { + # get file ID from doi + persistentID <- FALSE if (!is.numeric(file)) { if (inherits(file, "dataverse_file")) { - file <- get_fileid(file) + fileid <- get_fileid(file) + } else if (grepl(x = file, pattern = "^doi:")) { + # if file-specific DOI, then use DOI + fileid <- file + persistentID <- TRUE } else if (is.null(dataset)) { stop("When 'file' is a character string, dataset must be specified. Or, use a global fileid instead.") } else { - file <- get_fileid(dataset, file, key = key, server = server, ...) 
+ fileid <- get_fileid(dataset, file, key = key, server = server, ...) } } + format <- match.arg(format) - u <- paste0(api_url(server), "access/datafile/", file, "/metadata/", format) + + # different URL depending on if you have persistentId + if (persistentID) { + u <- paste0(api_url(server), "access/datafile/:persistentId/metadata/", format, "/?persistentId=", fileid) + } else { + u <- paste0(api_url(server), "access/datafile/", fileid, "/metadata/", format) + } + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) httr::stop_for_status(r) out <- httr::content(r, as = "text", encoding = "UTF-8") diff --git a/man/get_file_metadata.Rd b/man/get_file_metadata.Rd index 3c3e343..9291066 100644 --- a/man/get_file_metadata.Rd +++ b/man/get_file_metadata.Rd @@ -15,8 +15,9 @@ get_file_metadata( } \arguments{ \item{file}{An integer specifying a file identifier; or a vector of integers -specifying file identifiers; or, if \code{doi} is specified, a character string -specifying a file name within the DOI-identified dataset; or an object of +specifying file identifiers; or, if used with the prefix \code{"doi:"}, a +character with the file-specific DOI; or, if used without the prefix, a +filename accompanied by a dataset DOI in the \code{dataset} argument, or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} \item{format}{Defaults to \dQuote{ddi} for metadata files} From d247a404d077b29bd940da4e57cd3fbdcb6cd493 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 16:17:15 -0500 Subject: [PATCH 31/75] Change argument to FUN (matching base apply) --- R/get_file_as_dataframe.R | 47 ++++++++++++++++++++------------------- man/get_dataframe.Rd | 41 ++++++++++++++++------------------ 2 files changed, 43 insertions(+), 45 deletions(-) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 341ec1c..7858f13 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -7,10 +7,9 @@ #' #' 
@param file to be passed on to get_file #' @param dataset to be passed on to get_file -#' @param read_function If supplied a function object, this will write the -#' raw file to a tempfile and read it back in with the supplied function. This -#' is useful when you want to start working with the data right away in the R -#' environment +#' @param FUN The function to be used for reading in the raw dataset. The user +#' must choose the appropriate function: for example if the target is a .rds +#' file, then `FUN` should be `readRDS` or `readr::read_rds`. #' @param archival Whether to read from the ingested, archival version of the #' dataset, or whether to read the original. The archival versions are tab-delimited #' `.tab` files. If functions to read the original version is available without @@ -19,40 +18,42 @@ #' @inheritDotParams get_file #' #' @examples +#' library(readr) +#' # load dataset from file name and dataverse DOI #' gap_df <- get_dataframe_by_name( #' file = "gapminder-FiveYearData.tab", #' dataset = "doi:10.70122/FK2/PPKHI1", #' server = "demo.dataverse.org", -#' read_function = read_csv) +#' FUN = read_csv) #' #' # or a Stata dta #' stata_df <- get_dataframe_by_name( #' file = "nlsw88.tab", #' dataset = "doi:10.70122/FK2/PPKHI1", #' server = "demo.dataverse.org", -#' read_function = haven::read_dta) +#' FUN = haven::read_dta) #' #' # or a Rds file #' rds_df <- get_dataframe_by_name( #' file = "nlsw88_rds-export.rds", #' dataset = "doi:10.70122/FK2/PPKHI1", #' server = "demo.dataverse.org", -#' read_function = read_rds) +#' FUN = read_rds) #' #' # equivalently, if you know the DOI #' gap_df <- get_dataframe_by_doi( #' filedoi = "10.70122/FK2/PPKHI1/ZYATZZ", -#' server = "demo.dataverse.rog", -#' read_function = read_csv +#' server = "demo.dataverse.org", +#' FUN = read_csv #' ) #' #' # or the id -#' # you can also customize the read_function (in this case to supress parse msg) +#' # you can also customize the FUN (in this case to suppress parse msg) #' 
gap_df <- get_dataframe_by_id( #' 1733998, #' server = "demo.dataverse.org", -#' read_function = function(x) read_csv(x, col_types = cols())) +#' FUN = function(x) read_csv(x, col_types = cols())) #' #' # equivalently, using a dataverse object #' gap_ds <- dataset_files("doi:10.70122/FK2/PPKHI1", @@ -61,20 +62,20 @@ #' gap_df <- get_dataframe_by_id( #' gap_ds[[1]], #' server = "demo.dataverse.org", -#' read_function = function(x) read_csv(x, col_types = cols())) +#' FUN = function(x) read_csv(x, col_types = cols())) #' #' # to use the archival version (and read as TSV) #' gap_df <- get_dataframe_by_id( #' 1733998, #' server = "demo.dataverse.org", #' archival = TRUE, -#' read_function = function(x) read_tsv(x, col_types = cols())) +#' FUN = function(x) read_tsv(x, col_types = cols())) #' #' #' @export get_dataframe_by_name <- function(file, dataset = NULL, - read_function = NULL, + FUN = NULL, archival = FALSE, ...) { @@ -83,29 +84,29 @@ get_dataframe_by_name <- function(file, file = file, ...) - get_dataframe_by_id(fileid, read_function, archival = archival, ...) + get_dataframe_by_id(fileid, FUN, archival = archival, ...) } #' @rdname get_dataframe #' @export -get_dataframe_by_id <- function(file, - read_function = NULL, +get_dataframe_by_id <- function(fileid, + FUN = NULL, archival = FALSE, ...) { - raw <- get_file(file = file, archival = archival, ...) + raw <- get_file(file = fileid, archival = archival, ...) 
# default of get_file - if (is.null(read_function)) { + if (is.null(FUN)) { warning("function was not supplied so returning the raw binary file.") return(raw) } # save to temp and then read it in with supplied function - if (!is.null(read_function)) { - get_dataframe_internal(raw, filename = "foo", .f = read_function) + if (!is.null(FUN)) { + get_dataframe_internal(raw, filename = "foo", .f = FUN) } } @@ -114,13 +115,13 @@ get_dataframe_by_id <- function(file, #' @inheritParams get_file_by_doi #' @export get_dataframe_by_doi <- function(filedoi, - read_function = NULL, + FUN = NULL, archival = FALSE, ...) { filedoi <- prepend_doi(filedoi) # get_file can also take doi now - get_dataframe_by_id(file = filedoi, read_function = read_function, archival = archival, ...) + get_dataframe_by_id(file = filedoi, FUN = FUN, archival = archival, ...) } #' Write to temp and apply function diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 5ff45f0..a414573 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -6,27 +6,20 @@ \alias{get_dataframe_by_doi} \title{Get file from dataverse and convert it into a dataframe or tibble} \usage{ -get_dataframe_by_name( - file, - dataset = NULL, - read_function = NULL, - archival = FALSE, - ... -) +get_dataframe_by_name(file, dataset = NULL, FUN = NULL, archival = FALSE, ...) -get_dataframe_by_id(file, read_function = NULL, archival = FALSE, ...) +get_dataframe_by_id(fileid, FUN = NULL, archival = FALSE, ...) -get_dataframe_by_doi(filedoi, read_function = NULL, archival = FALSE, ...) +get_dataframe_by_doi(filedoi, FUN = NULL, archival = FALSE, ...) } \arguments{ \item{file}{to be passed on to get_file} \item{dataset}{to be passed on to get_file} -\item{read_function}{If supplied a function object, this will write the -raw file to a tempfile and read it back in with the supplied function. 
This -is useful when you want to start working with the data right away in the R -environment} +\item{FUN}{The function to be used for reading in the raw dataset. The user +must choose the appropriate function: for example if the target is a .rds +file, then \code{FUN} should be \code{readRDS} or \code{readr::read_rds}.} \item{archival}{Whether to read from the ingested, archival version of the dataset, or whether to read the original. The archival versions are tab-delimited @@ -55,6 +48,8 @@ Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomic or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} }} +\item{fileid}{A numeric ID internally used for \code{get_file_by_id}} + \item{filedoi}{A DOI for a single file (not the entire dataset), of the form \code{"10.70122/FK2/PPKHI1/ZYATZZ"} or \code{"doi:10.70122/FK2/PPKHI1/ZYATZZ"}} } @@ -63,39 +58,41 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \code{get_dataframe_by_name} if you know the filename and doi. 
The dataset } \examples{ +library(readr) + gap_df <- get_dataframe_by_name( file = "gapminder-FiveYearData.tab", dataset = "doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org", - read_function = read_csv) + FUN = read_csv) # or a Stata dta stata_df <- get_dataframe_by_name( file = "nlsw88.tab", dataset = "doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org", - read_function = haven::read_dta) + FUN = haven::read_dta) # or a Rds file rds_df <- get_dataframe_by_name( file = "nlsw88_rds-export.rds", dataset = "doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org", - read_function = read_rds) + FUN = read_rds) # equivalently, if you know the DOI gap_df <- get_dataframe_by_doi( filedoi = "10.70122/FK2/PPKHI1/ZYATZZ", - server = "demo.dataverse.rog", - read_function = read_csv + server = "demo.dataverse.org", + FUN = read_csv ) # or the id -# you can also customize the read_function (in this case to supress parse msg) +# you can also customize the FUN (in this case to supress parse msg) gap_df <- get_dataframe_by_id( 1733998, server = "demo.dataverse.org", - read_function = function(x) read_csv(x, col_types = cols())) + FUN = function(x) read_csv(x, col_types = cols())) # equivalently, using a dataverse object gap_ds <- dataset_files("doi:10.70122/FK2/PPKHI1", @@ -104,14 +101,14 @@ gap_ds <- dataset_files("doi:10.70122/FK2/PPKHI1", gap_df <- get_dataframe_by_id( gap_ds[[1]], server = "demo.dataverse.org", - read_function = function(x) read_csv(x, col_types = cols())) + FUN = function(x) read_csv(x, col_types = cols())) # to use the archival version (and read as TSV) gap_df <- get_dataframe_by_id( 1733998, server = "demo.dataverse.org", archival = TRUE, - read_function = function(x) read_tsv(x, col_types = cols())) + FUN = function(x) read_tsv(x, col_types = cols())) } From d7d24804472c708e672c4afbe56913a01579852e Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 16:17:20 -0500 Subject: [PATCH 32/75] Allow for DOI --- R/get_file.R | 17 
+++++++++++------ man/files.Rd | 5 +++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index 2bb3c16..2680c2e 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -17,9 +17,10 @@ #' #' @details This function provides access to data files from a Dataverse entry. #' @param file An integer specifying a file identifier; or a vector of integers -#' specifying file identifiers; or, if \code{doi} is specified, a character string -#' specifying a file name within the DOI-identified dataset; or an object of -#' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. +#' specifying file identifiers; or, if used with the prefix \code{"doi:"}, a +#' character with the file-specific DOI; or, if used without the prefix, a +#' filename accompanied by a dataset DOI in the `dataset` argument, or an object of +#' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. #' @param format A character string specifying a file format for download. #' by default, this is \dQuote{original} (the original file format). If `NULL`, #' no query is added, so ingested files are returned in their ingested TSV form. @@ -101,9 +102,13 @@ get_file <- if (!is.numeric(file) & !inherits(file, "dataverse_file") & !is.null(dataset)) fileid <- get_fileid.character(dataset, file, key = key, server = server, ...) - if (!is.numeric(file) & !inherits(file, "dataverse_file") & is.null(dataset) & !grepl(x = file, pattern = "^doi")) - stop("When 'file' is a character (non-global ID), dataset must be specified.") - + if (!is.numeric(file) & !inherits(file, "dataverse_file") & is.null(dataset)) { + if (grepl(x = file, pattern = "^doi")) { + fileid <- file # doi is allowed + } else { + stop("When 'file' is a character (non-global ID), dataset must be specified.") + } + } # Main function. 
Call get_file_by_id out <- vector("list", length(fileid)) diff --git a/man/files.Rd b/man/files.Rd index 7d13148..ca0d50a 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -53,8 +53,9 @@ get_file_by_doi( } \arguments{ \item{file}{An integer specifying a file identifier; or a vector of integers -specifying file identifiers; or, if \code{doi} is specified, a character string -specifying a file name within the DOI-identified dataset; or an object of +specifying file identifiers; or, if used with the prefix \code{"doi:"}, a +character with the file-specific DOI; or, if used without the prefix, a +filename accompanied by a dataset DOI in the \code{dataset} argument, or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} \item{format}{A character string specifying a file format for download. From 72b8c0face03ed7b6ac51e82b10cc83987ffbab5 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 16:40:42 -0500 Subject: [PATCH 33/75] Add get_file examples --- README.Rmd | 99 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/README.Rmd b/README.Rmd index ef9d212..8bda0ff 100644 --- a/README.Rmd +++ b/README.Rmd @@ -3,21 +3,22 @@ ```{r knitr_options, echo=FALSE, results="hide"} options(width = 120) knitr::opts_chunk$set(results = "hold") +Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") ``` [![Dataverse Project logo](http://dataverse.org/files/dataverseorg/files/dataverse_project_logo-hp.png "Dataverse Project")](http://dataverse.org) The **dataverse** package provides access to [Dataverse 4](http://dataverse.org/) APIs, enabling data search, retrieval, and deposit, thus allowing R users to integrate public data sharing into the reproducible research workflow. **dataverse** is the next-generation iteration of [the **dvn** package](https://cran.r-project.org/package=dvn), which works with Dataverse 3 ("Dataverse Network") applications. 
**dataverse** includes numerous improvements for data search, retrieval, and deposit, including use of the (currently in development) **sword** package for data deposit and the **UNF** package for data fingerprinting. -Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that a specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](http://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: +Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that a specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](http://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: -```R +``` r Sys.setenv("DATAVERSE_KEY" = "examplekey12345") ``` -Because [there are many Dataverse installations](http://dataverse.org/), all functions in the R client require specifying what server installation you are interacting with. 
This can be set by default with an environment variable, `DATAVERSE_SERVER`. This should be the Dataverse server, without the "https" prefix or the "/api" URL path, etc. For example, the Harvard Dataverse can be used by setting: +Because [there are many Dataverse installations](http://dataverse.org/), all functions in the R client require specifying what server installation you are interacting with. This can be set by default with an environment variable, `DATAVERSE_SERVER`. This should be the Dataverse server, without the "https" prefix or the "/api" URL path, etc. For example, the Harvard Dataverse can be used by setting: -```R +``` r Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") ``` @@ -25,6 +26,59 @@ Note: The package attempts to compensate for any malformed values, though. Currently, the package wraps the data management features of the Dataverse API. Functions for other API features - related to user management and permissions - are not currently exported in the package (but are drafted in the [source code](https://github.com/IQSS/dataverse-client-r)). +### Data and Metadata Retrieval + +Datasets on Dataverse are directly downloadable by their API, and this is straightforward especially if the data is not restricted. The dataverse package provides multiple interfaces. Users can supply a file DOI, a dataset DOI combined with a filename, or a dataverse object. They can read in the file as a raw binary or a dataset read in with the appropriate R function. + +#### Reading data as R objects + +Use the `get_dataframe_*` functions, depending on the input you have. For example, we will read a survey dataset on dataverse, [nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPKHI1/ZYATZZ) (`doi:10.70122/FK2/PPKHI1/ZYATZZ`), originally in Stata dta form. 
+ +With a file DOI + +```{r get_dataframe_by_doi} +nlsw <- get_dataframe_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", + haven::read_dta, + server = "demo.dataverse.org") +nlsw +``` + +With a name and dataset DOI + +```{r get_dataframe_by_name} +nlsw <- get_dataframe_by_name(file = "nlsw88.tab", + dataset = "10.70122/FK2/PPKHI1", + haven::read_dta, + server = "demo.dataverse.org") +``` + +Note that even though the file extension is ".tab", we use `read_dta`. This is because this file was originally a dta file that was ingested into an archival format with a ".tab" file extension. The `get_dataframe_` functions do not attempt to download the archival versions by default, but it is possible to turn this option on with `archival = TRUE`. + +Sometimes you may know the underlying file ID. In this case, the fileid is + +```{r get_dataframe_by_id} +nlsw <- get_dataframe_by_id(1733999, + haven::read_dta, + server = "demo.dataverse.org") +``` + +#### Reading a dataset as a binary file. + +In some cases, you may not need to render the raw binary file, or you do not have the functions to do so in R, so you want to write these into your local disk. To take only the raw files, use the `get_file` commands. The arguments are equivalent, except we do not need a `FUN` argument. + +```{r get_file_by_name} +nlsw_raw <- get_file_by_name(file = "nlsw88.tab", + dataset = "10.70122/FK2/PPKHI1", + server = "demo.dataverse.org") +class(nlsw_raw) +``` + +The function `get_file_metadata` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. The function `get_dataset` will retrieve the list of files in a dataset. + +```{r, get_dataset} +get_dataset("doi:10.7910/DVN/ARKOTI", server = "demo.dataverse.org") +``` + ### Data Discovery Dataverse supplies a pretty robust search API to discover Dataverses, datasets, and files. 
The simplest searches simply consist of a query string: @@ -48,34 +102,11 @@ str(dataverse_search(author = "Gary King", type = "dataset"), 1) The results are paginated using `per_page` argument. To retrieve subsequent pages, specify `start`. - -### Data and Metadata Retrieval - -The easiest way to access data from Dataverse is to use a persistent identifier (typically a DOI). You can retrieve the contents of a Dataverse dataset: - -```{r get_dataset} -get_dataset("doi:10.7910/DVN/ARKOTI") -``` - -Knowing a file name, you can also access that file (e.g., a Stata dataset) directly in R: - -```{r get_file} -f <- get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI") - -# load it into memory -tmp <- tempfile(fileext = ".dta") -writeBin(as.vector(f), tmp) -dat <- foreign::read.dta(tmp) -``` - -If you don't know the file name in advance, you can parse the available files returned by `get_dataset()` and retrieve the file using its Dataverse "id" number. - - ### Data Archiving Dataverse provides two - basically unrelated - workflows for managing (adding, documenting, and publishing) datasets. The first is built on [SWORD v2.0](http://swordapp.org/sword-v2/). This means that to create a new dataset listing, you will have first initialize a dataset entry with some metadata, add one or more files to the dataset, and then publish it. 
This looks something like the following: -```R +``` r # retrieve your service document d <- service_document() @@ -101,7 +132,7 @@ list_datasets("mydataverse") The second workflow is called the "native" API and is similar but uses slightly different functions: -```R +``` r # create the dataset ds <- create_dataset("mydataverse") @@ -121,14 +152,11 @@ Through the native API it is possible to update a dataset by modifying its metad ## Installation -[![CRAN Version](https://www.r-pkg.org/badges/version/dataverse)](https://cran.r-project.org/package=dataverse) -![Downloads](https://cranlogs.r-pkg.org/badges/dataverse) -[![Travis-CI Build Status](https://travis-ci.org/IQSS/dataverse-client-r.png?branch=master)](https://travis-ci.org/IQSS/dataverse-client-r) -[![codecov.io](https://codecov.io/github/IQSS/dataverse-client-r/coverage.svg?branch=master)](https://codecov.io/github/IQSS/dataverse-client-r?branch=master) +[![CRAN Version](https://www.r-pkg.org/badges/version/dataverse)](https://cran.r-project.org/package=dataverse) ![Downloads](https://cranlogs.r-pkg.org/badges/dataverse) [![Travis-CI Build Status](https://travis-ci.org/IQSS/dataverse-client-r.png?branch=master)](https://travis-ci.org/IQSS/dataverse-client-r) [![codecov.io](https://codecov.io/github/IQSS/dataverse-client-r/coverage.svg?branch=master)](https://codecov.io/github/IQSS/dataverse-client-r?branch=master) You can (eventually) find a stable release on [CRAN](https://cran.r-project.org/package=dataverse), or install the latest development version from GitHub: -```R +``` r if (!require("remotes")) { install.packages("remotes") } @@ -136,5 +164,4 @@ remotes::install_github("iqss/dataverse-client-r") library("dataverse") ``` -Users interested in downloading metadata from archives other than Dataverse may be interested in Kurt Hornik's [OAIHarvester](https://cran.r-project.org/package=OAIHarvester) and Scott Chamberlain's [oai](https://cran.r-project.org/package=oai), which offer metadata download from any 
web repository that is compliant with the [Open Archives Initiative](http://www.openarchives.org/) standards. Additionally, [rdryad](https://cran.r-project.org/package=rdryad) uses OAIHarvester to interface with [Dryad](http://datadryad.org/). The [rfigshare](https://cran.r-project.org/package=rfigshare) package works in a similar spirit to **dataverse** with [https://figshare.com/](https://figshare.com/). - +Users interested in downloading metadata from archives other than Dataverse may be interested in Kurt Hornik's [OAIHarvester](https://cran.r-project.org/package=OAIHarvester) and Scott Chamberlain's [oai](https://cran.r-project.org/package=oai), which offer metadata download from any web repository that is compliant with the [Open Archives Initiative](http://www.openarchives.org/) standards. Additionally, [rdryad](https://cran.r-project.org/package=rdryad) uses OAIHarvester to interface with [Dryad](http://datadryad.org/). The [rfigshare](https://cran.r-project.org/package=rfigshare) package works in a similar spirit to **dataverse** with . 
From e192e6ce90bd1f0fe708bad8a298d5430bca65ce Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 16:53:52 -0500 Subject: [PATCH 34/75] Update README with #48 --- README.Rmd | 55 +++++--- README.md | 405 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 277 insertions(+), 183 deletions(-) diff --git a/README.Rmd b/README.Rmd index 8bda0ff..95c8f59 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,4 +1,8 @@ -# R Client for Dataverse 4 Repositories +--- +title: "R Client for Dataverse 4 Repositories" +output: github_document +--- + ```{r knitr_options, echo=FALSE, results="hide"} options(width = 120) @@ -6,16 +10,40 @@ knitr::opts_chunk$set(results = "hold") Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") ``` +[![CRAN Version](https://www.r-pkg.org/badges/version/dataverse)](https://cran.r-project.org/package=dataverse) ![Downloads](https://cranlogs.r-pkg.org/badges/dataverse) [![Travis-CI Build Status](https://travis-ci.org/IQSS/dataverse-client-r.png?branch=master)](https://travis-ci.org/IQSS/dataverse-client-r) [![codecov.io](https://codecov.io/github/IQSS/dataverse-client-r/coverage.svg?branch=master)](https://codecov.io/github/IQSS/dataverse-client-r?branch=master) + + + [![Dataverse Project logo](http://dataverse.org/files/dataverseorg/files/dataverse_project_logo-hp.png "Dataverse Project")](http://dataverse.org) The **dataverse** package provides access to [Dataverse 4](http://dataverse.org/) APIs, enabling data search, retrieval, and deposit, thus allowing R users to integrate public data sharing into the reproducible research workflow. **dataverse** is the next-generation iteration of [the **dvn** package](https://cran.r-project.org/package=dvn), which works with Dataverse 3 ("Dataverse Network") applications. 
**dataverse** includes numerous improvements for data search, retrieval, and deposit, including use of the (currently in development) **sword** package for data deposit and the **UNF** package for data fingerprinting. +### Getting Started + +You can find a stable 2017 release on [CRAN](https://cran.r-project.org/package=dataverse), or install the latest development version from GitHub: + +```{r, echo = FALSE, eval = FALSE} +if (!require("remotes")) { + install.packages("remotes") +} +remotes::install_github("iqss/dataverse-client-r") +``` + + +```{r} +library("dataverse") +``` + +### Keys + Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that a specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](http://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: ``` r Sys.setenv("DATAVERSE_KEY" = "examplekey12345") ``` +### Server + Because [there are many Dataverse installations](http://dataverse.org/), all functions in the R client require specifying what server installation you are interacting with. This can be set by default with an environment variable, `DATAVERSE_SERVER`. This should be the Dataverse server, without the "https" prefix or the "/api" URL path, etc. For example, the Harvard Dataverse can be used by setting: ``` r @@ -76,22 +104,21 @@ class(nlsw_raw) The function `get_file_metadata` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. 
The function `get_dataset` will retrieve the list of files in a dataset. ```{r, get_dataset} -get_dataset("doi:10.7910/DVN/ARKOTI", server = "demo.dataverse.org") +get_dataset("doi:10.7910/DVN/ARKOTI") ``` ### Data Discovery Dataverse supplies a pretty robust search API to discover Dataverses, datasets, and files. The simplest searches simply consist of a query string: -```{r search1} -library("dataverse") -str(dataverse_search("Gary King"), 1) +```{r search1, eval = FALSE} +dataverse_search("Gary King") ``` More complicated searches might specify metadata fields: -```{r search2} -str(dataverse_search(author = "Gary King", title = "Ecological Inference"), 1) +```{r search2, eval = FALSE} +dataverse_search(author = "Gary King", title = "Ecological Inference") ``` And searches can be restricted to specific types of objects (Dataverse, dataset, or file): @@ -150,18 +177,6 @@ get_dataverse("mydataverse") Through the native API it is possible to update a dataset by modifying its metadata with `update_dataset()` or file contents using `update_dataset_file()` and then republish a new version using `publish_dataset()`. 
-## Installation - -[![CRAN Version](https://www.r-pkg.org/badges/version/dataverse)](https://cran.r-project.org/package=dataverse) ![Downloads](https://cranlogs.r-pkg.org/badges/dataverse) [![Travis-CI Build Status](https://travis-ci.org/IQSS/dataverse-client-r.png?branch=master)](https://travis-ci.org/IQSS/dataverse-client-r) [![codecov.io](https://codecov.io/github/IQSS/dataverse-client-r/coverage.svg?branch=master)](https://codecov.io/github/IQSS/dataverse-client-r?branch=master) - -You can (eventually) find a stable release on [CRAN](https://cran.r-project.org/package=dataverse), or install the latest development version from GitHub: - -``` r -if (!require("remotes")) { - install.packages("remotes") -} -remotes::install_github("iqss/dataverse-client-r") -library("dataverse") -``` +### Other Installations Users interested in downloading metadata from archives other than Dataverse may be interested in Kurt Hornik's [OAIHarvester](https://cran.r-project.org/package=OAIHarvester) and Scott Chamberlain's [oai](https://cran.r-project.org/package=oai), which offer metadata download from any web repository that is compliant with the [Open Archives Initiative](http://www.openarchives.org/) standards. Additionally, [rdryad](https://cran.r-project.org/package=rdryad) uses OAIHarvester to interface with [Dryad](http://datadryad.org/). The [rfigshare](https://cran.r-project.org/package=rfigshare) package works in a similar spirit to **dataverse** with . 
diff --git a/README.md b/README.md index f709041..1f10301 100644 --- a/README.md +++ b/README.md @@ -1,191 +1,273 @@ -# R Client for Dataverse 4 Repositories +R Client for Dataverse 4 Repositories +================ + +[![CRAN +Version](https://www.r-pkg.org/badges/version/dataverse)](https://cran.r-project.org/package=dataverse) +![Downloads](https://cranlogs.r-pkg.org/badges/dataverse) [![Travis-CI +Build +Status](https://travis-ci.org/IQSS/dataverse-client-r.png?branch=master)](https://travis-ci.org/IQSS/dataverse-client-r) +[![codecov.io](https://codecov.io/github/IQSS/dataverse-client-r/coverage.svg?branch=master)](https://codecov.io/github/IQSS/dataverse-client-r?branch=master) +[![Dataverse Project +logo](http://dataverse.org/files/dataverseorg/files/dataverse_project_logo-hp.png "Dataverse Project")](http://dataverse.org) +The **dataverse** package provides access to [Dataverse +4](http://dataverse.org/) APIs, enabling data search, retrieval, and +deposit, thus allowing R users to integrate public data sharing into the +reproducible research workflow. **dataverse** is the next-generation +iteration of [the **dvn** +package](https://cran.r-project.org/package=dvn), which works with +Dataverse 3 (“Dataverse Network”) applications. **dataverse** includes +numerous improvements for data search, retrieval, and deposit, including +use of the (currently in development) **sword** package for data deposit +and the **UNF** package for data fingerprinting. -[![Dataverse Project logo](http://dataverse.org/files/dataverseorg/files/dataverse_project_logo-hp.png "Dataverse Project")](http://dataverse.org) +### Getting Started -The **dataverse** package provides access to [Dataverse 4](http://dataverse.org/) APIs, enabling data search, retrieval, and deposit, thus allowing R users to integrate public data sharing into the reproducible research workflow. 
**dataverse** is the next-generation iteration of [the **dvn** package](https://cran.r-project.org/package=dvn), which works with Dataverse 3 ("Dataverse Network") applications. **dataverse** includes numerous improvements for data search, retrieval, and deposit, including use of the (currently in development) **sword** package for data deposit and the **UNF** package for data fingerprinting. +You can find a stable 2017 release on +[CRAN](https://cran.r-project.org/package=dataverse), or install the +latest development version from GitHub: -Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that a specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](http://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: +``` r +library("dataverse") +``` -```R +### Keys + +Some features of the Dataverse 4 API are public and require no +authentication. This means in many cases you can search for and retrieve +data without a Dataverse account for that a specific Dataverse +installation. But, other features require a Dataverse account for the +specific server installation of the Dataverse software, and an API key +linked to that account. Instructions for obtaining an account and +setting up an API key are available in the [Dataverse User +Guide](http://guides.dataverse.org/en/latest/user/account.html). (Note: +if your key is compromised, it can be regenerated to preserve security.) 
+Once you have an API key, this should be stored as an environment +variable called `DATAVERSE_KEY`. It can be set within R using: + +``` r Sys.setenv("DATAVERSE_KEY" = "examplekey12345") ``` -Because [there are many Dataverse installations](http://dataverse.org/), all functions in the R client require specifying what server installation you are interacting with. This can be set by default with an environment variable, `DATAVERSE_SERVER`. This should be the Dataverse server, without the "https" prefix or the "/api" URL path, etc. For example, the Harvard Dataverse can be used by setting: +### Server -```R +Because [there are many Dataverse installations](http://dataverse.org/), +all functions in the R client require specifying what server +installation you are interacting with. This can be set by default with +an environment variable, `DATAVERSE_SERVER`. This should be the +Dataverse server, without the “https” prefix or the “/api” URL path, +etc. For example, the Harvard Dataverse can be used by setting: + +``` r Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") ``` -Note: The package attempts to compensate for any malformed values, though. - -Currently, the package wraps the data management features of the Dataverse API. Functions for other API features - related to user management and permissions - are not currently exported in the package (but are drafted in the [source code](https://github.com/IQSS/dataverse-client-r)). - -### Data Discovery - -Dataverse supplies a pretty robust search API to discover Dataverses, datasets, and files. The simplest searches simply consist of a query string: - +Note: The package attempts to compensate for any malformed values, +though. -```r -library("dataverse") -str(dataverse_search("Gary King"), 1) -``` +Currently, the package wraps the data management features of the +Dataverse API. 
Functions for other API features - related to user +management and permissions - are not currently exported in the package +(but are drafted in the [source +code](https://github.com/IQSS/dataverse-client-r)). -``` -## 10 of 1043 results retrieved -``` +### Data and Metadata Retrieval -``` -## 'data.frame': 10 obs. of 17 variables: -## $ name : chr "00698McArthur-King-BoxCoverSheets.pdf" "00698McArthur-King-MemoOfAgreement.pdf" "00698McArthur-King-StudyDescription.pdf" "077_mod1_s2m.tab" ... -## $ type : chr "file" "file" "file" "file" ... -## $ url : chr "https://dataverse.harvard.edu/api/access/datafile/101348" "https://dataverse.harvard.edu/api/access/datafile/101349" "https://dataverse.harvard.edu/api/access/datafile/101350" "https://dataverse.harvard.edu/api/access/datafile/2910738" ... -## $ file_id : chr "101348" "101349" "101350" "2910738" ... -## $ description : chr "Describe contents of each box of a paper data set" "Legal agreement between data depositor and Murray Archive" "Overview: abstract, research methodology, publications, and other info." NA ... -## $ published_at : chr "2009-03-05T00:00:00Z" "2009-03-05T00:00:00Z" "2009-03-05T00:00:00Z" "2016-11-09T22:06:10Z" ... -## $ file_type : chr "Adobe PDF" "Adobe PDF" "Adobe PDF" "Tab-Delimited" ... -## $ file_content_type: chr "application/pdf" "application/pdf" "application/pdf" "text/tab-separated-values" ... -## $ size_in_bytes : int 503714 360107 16506 318276 NA NA NA NA NA NA -## $ md5 : chr "" "" "" "af9a6fa00bf29009e9eb5d366ad64660" ... -## $ checksum :'data.frame': 10 obs. of 2 variables: -## $ dataset_citation : chr "Charles C. McArthur; Stanley H. King, 2009, \"Harvard Student Study, 1960-1964\", hdl:1902.1/00698, Harvard Dataverse, V2" "Charles C. McArthur; Stanley H. King, 2009, \"Harvard Student Study, 1960-1964\", hdl:1902.1/00698, Harvard Dataverse, V2" "Charles C. McArthur; Stanley H. 
King, 2009, \"Harvard Student Study, 1960-1964\", hdl:1902.1/00698, Harvard Dataverse, V2" "International Food Policy Research Institute (IFPRI); Savannah Agricultural Research Institute, 2016, \"Medium "| __truncated__ ... -## $ unf : chr NA NA NA "UNF:6:4mZh78EEGxqFLF71f/Nh/A==" ... -## $ global_id : chr NA NA NA NA ... -## $ citationHtml : chr NA NA NA NA ... -## $ citation : chr NA NA NA NA ... -## $ authors :List of 10 -``` +Datasets on Dataverse are directly downloadable by their API, and this +is straightforward especially if the data is not restricted. The +dataverse package provides multiple interfaces. Users can supply a file +DOI, a dataset DOI combined with a filename, or a dataverse object. They +can read in the file as a raw binary or a dataset read in with the +appropriate R function. -More complicated searches might specify metadata fields: +#### Reading data as R objects +Use the `get_dataframe_*` functions, depending on the input you have. +For example, we will read a survey dataset on dataverse, +[nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPKHI1/ZYATZZ) +(`doi:10.70122/FK2/PPKHI1/ZYATZZ`), originally in Stata dta form. -```r -str(dataverse_search(author = "Gary King", title = "Ecological Inference"), 1) -``` +With a file DOI -``` -## 10 of 1349 results retrieved +``` r +nlsw <- get_dataframe_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", + haven::read_dta, + server = "demo.dataverse.org") +nlsw ``` -``` -## 'data.frame': 10 obs. of 17 variables: -## $ name : chr "00531Winter-LiberalArts-Clare-Data.tab" "00698McArthur-King-BoxCoverSheets.pdf" "00698McArthur-King-MemoOfAgreement.pdf" "00698McArthur-King-StudyDescription.pdf" ... -## $ type : chr "file" "file" "file" "file" ... 
-## $ url : chr "https://dataverse.harvard.edu/api/access/datafile/101725" "https://dataverse.harvard.edu/api/access/datafile/101348" "https://dataverse.harvard.edu/api/access/datafile/101349" "https://dataverse.harvard.edu/api/access/datafile/101350" ... -## $ file_id : chr "101725" "101348" "101349" "101350" ... -## $ description : chr "Clare College data in tab delimited format" "Describe contents of each box of a paper data set" "Legal agreement between data depositor and Murray Archive" "Overview: abstract, research methodology, publications, and other info." ... -## $ published_at : chr "2010-05-10T00:00:00Z" "2009-03-05T00:00:00Z" "2009-03-05T00:00:00Z" "2009-03-05T00:00:00Z" ... -## $ file_type : chr "Tab-Delimited" "Adobe PDF" "Adobe PDF" "Adobe PDF" ... -## $ file_content_type: chr "text/tab-separated-values" "application/pdf" "application/pdf" "application/pdf" ... -## $ size_in_bytes : int 167843 503714 360107 16506 318276 NA 3825612 4012 9054 48213 -## $ md5 : chr "" "" "" "" ... -## $ checksum :'data.frame': 10 obs. of 2 variables: -## $ unf : chr "UNF:3:9ZWOqiilVGnLacm4Qg2EYQ==" NA NA NA ... -## $ dataset_citation : chr "David G. Winter; David C. McClelland; Abigail J. Stewart, 2010, \"New Case for the Liberal Arts, 1974-1978\", h"| __truncated__ "Charles C. McArthur; Stanley H. King, 2009, \"Harvard Student Study, 1960-1964\", hdl:1902.1/00698, Harvard Dataverse, V2" "Charles C. McArthur; Stanley H. King, 2009, \"Harvard Student Study, 1960-1964\", hdl:1902.1/00698, Harvard Dataverse, V2" "Charles C. McArthur; Stanley H. King, 2009, \"Harvard Student Study, 1960-1964\", hdl:1902.1/00698, Harvard Dataverse, V2" ... -## $ global_id : chr NA NA NA NA ... -## $ citationHtml : chr NA NA NA NA ... -## $ citation : chr NA NA NA NA ... 
-## $ authors :List of 10 + ## # A tibble: 2,246 x 17 + ## idcode age race married never_married grade collgrad south smsa c_city industry occupation union wage + ## + ## 1 1 37 2 [bla… 0 [sin… 0 12 0 [not … 0 1 [SMS… 0 5 [Tra… 6 [Opera… 1 [uni… 11.7 + ## 2 2 37 2 [bla… 0 [sin… 0 12 0 [not … 0 1 [SMS… 1 4 [Man… 5 [Craft… 1 [uni… 6.40 + ## 3 3 42 2 [bla… 0 [sin… 1 12 0 [not … 0 1 [SMS… 1 4 [Man… 3 [Sales] NA 5.02 + ## 4 4 43 1 [whi… 1 [mar… 0 17 1 [coll… 0 1 [SMS… 0 11 [Pro… 13 [Other] 1 [uni… 9.03 + ## 5 6 42 1 [whi… 1 [mar… 0 12 0 [not … 0 1 [SMS… 0 4 [Man… 6 [Opera… 0 [non… 8.08 + ## 6 7 39 1 [whi… 1 [mar… 0 12 0 [not … 0 1 [SMS… 0 11 [Pro… 3 [Sales] 0 [non… 4.63 + ## 7 9 37 1 [whi… 0 [sin… 0 12 0 [not … 0 1 [SMS… 1 5 [Tra… 2 [Manag… 1 [uni… 10.5 + ## 8 12 40 1 [whi… 1 [mar… 0 18 1 [coll… 0 1 [SMS… 0 11 [Pro… 2 [Manag… 0 [non… 17.2 + ## 9 13 40 1 [whi… 1 [mar… 0 14 0 [not … 0 1 [SMS… 0 11 [Pro… 3 [Sales] 0 [non… 13.1 + ## 10 14 40 1 [whi… 1 [mar… 0 15 0 [not … 0 1 [SMS… 0 11 [Pro… 1 [Profe… 0 [non… 7.75 + ## # … with 2,236 more rows, and 3 more variables: hours , ttl_exp , tenure + +With a name and dataset DOI + +``` r +nlsw <- get_dataframe_by_name(file = "nlsw88.tab", + dataset = "10.70122/FK2/PPKHI1", + haven::read_dta, + server = "demo.dataverse.org") ``` -And searches can be restricted to specific types of objects (Dataverse, dataset, or file): +Note that even though the file prefix is “.tab”, we use `read_dta`. This +is because this file was originally a dta file that was ingested into an +archival format with a “.tab” file extension. The `get_dataframe_` +functions do not attempt to download the archival versions by default, +but it is possible to turn this option off with \`archival = TRUE\`. +Sometimes you may know the underlying file ID. 
In this case, the fileid +is -```r -str(dataverse_search(author = "Gary King", type = "dataset"), 1) +``` r +nlsw <- get_dataframe_by_id(1733999, + haven::read_dta, + server = "demo.dataverse.org") ``` -``` -## 10 of 523 results retrieved -``` +#### Reading a dataset as a binary file. -``` -## 'data.frame': 10 obs. of 9 variables: -## $ name : chr "10 Million International Dyadic Events" "A Comparative Study between Gurukul System and Western System of Education" "A Lexicial Index of Electoral Democracy" "A Unified Model of Cabinet Dissolution in Parliamentary Democracies" ... -## $ type : chr "dataset" "dataset" "dataset" "dataset" ... -## $ url : chr "http://hdl.handle.net/1902.1/FYXLAWZRIA" "http://dx.doi.org/10.7910/DVN/329UAV" "http://dx.doi.org/10.7910/DVN/29106" "http://dx.doi.org/10.3886/ICPSR01115.v1" ... -## $ global_id : chr "hdl:1902.1/FYXLAWZRIA" "doi:10.7910/DVN/329UAV" "doi:10.7910/DVN/29106" "doi:10.3886/ICPSR01115.v1" ... -## $ description : chr "When the Palestinians launch a mortar attack into Israel, the Israeli army does not wait until the end of the c"| __truncated__ "India, in ancient times has witnessed students which used to be like the great king Vikramaditya. He followed t"| __truncated__ "We operationalize electoral democracy as a series of necessary-and-sufficient conditions arrayed in an ordinal "| __truncated__ "The literature on cabinet duration is split between two apparently irreconcilable positions. The ATTRIBUTES THE"| __truncated__ ... -## $ published_at: chr "2014-08-21T00:00:00Z" "2016-06-07T13:09:20Z" "2016-08-05T20:42:31Z" "2015-04-09T04:13:54Z" ... -## $ citationHtml: chr "King, Gary; Lowe, Will, 2008, \"10 Million International Dyadic Events\", . 
From dd1348cf670f153412ee1817c966ca47251fb5d6 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 16:54:05 -0500 Subject: [PATCH 35/75] Revert back to overwriting `file` --- R/get_file_metadata.R | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/R/get_file_metadata.R b/R/get_file_metadata.R index f0335c3..8b114b0 100644 --- a/R/get_file_metadata.R +++ b/R/get_file_metadata.R @@ -20,15 +20,14 @@ get_file_metadata <- persistentID <- FALSE if (!is.numeric(file)) { if (inherits(file, "dataverse_file")) { - fileid <- get_fileid(file) + file <- get_fileid(file) } else if (grepl(x = file, pattern = "^doi:")) { # if file-specific DOI, then use DOI - fileid <- file persistentID <- TRUE } else if (is.null(dataset)) { stop("When 'file' is a character string, dataset must be specified. Or, use a global fileid instead.") } else { - fileid <- get_fileid(dataset, file, key = key, server = server, ...) + file <- get_fileid(dataset, file, key = key, server = server, ...) } } @@ -36,9 +35,9 @@ get_file_metadata <- # different URL depending on if you have persistentId if (persistentID) { - u <- paste0(api_url(server), "access/datafile/:persistentId/metadata/", format, "/?persistentId=", fileid) + u <- paste0(api_url(server), "access/datafile/:persistentId/metadata/", format, "/?persistentId=", file) } else { - u <- paste0(api_url(server), "access/datafile/", fileid, "/metadata/", format) + u <- paste0(api_url(server), "access/datafile/", file, "/metadata/", format) } r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), ...) 
From 51581bad0c0e4f9cb645278bf9e649b6437acee0 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 16:54:12 -0500 Subject: [PATCH 36/75] Need for help page --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 7b90f99..055c292 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,6 +51,7 @@ Suggests: purrr, testthat, UNF, + readr, yaml Description: Provides access to Dataverse version 4 APIs , enabling data search, retrieval, and deposit. For Dataverse versions <= 4.0, From 1ee735329abaae7c68ebfb21f5b801de8cac9659 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 22:33:32 -0500 Subject: [PATCH 37/75] Port is_ingested out --- R/utils.R | 25 +++++++++++++++++++++++++ man/is_ingested.Rd | 25 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 man/is_ingested.Rd diff --git a/R/utils.R b/R/utils.R index 9766290..42f4ed3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -81,6 +81,30 @@ get_fileid.dataverse_file <- function(x, ...) 
{ x[["dataFile"]][["id"]] } + +#' Identify if file is an ingested file +#' +#' @param fileid A numeric fileid or file-specific DOI +#' +#' @examples +#' # https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/X5MUPQ/T0KKUZ +#' # nlsw88.tab +#' is_ingested(fileid = "doi:10.70122/FK2/X5MUPQ/T0KKUZ", +#' server = "demo.dataverse.org") +#' +#' # nlsw88_rds-export.rds +#' is_ingested(fileid = "doi:10.70122/FK2/PPIAXE/SUCFNI", +#' server = "demo.dataverse.org") +#' +is_ingested <- function(fileid, server = Sys.getenv("DATAVERSE_SERVER")) { + ping_metadata <- tryCatch(get_file_metadata(fileid, server = server), + error = function(e) e) + is_ingested <- !inherits(ping_metadata, "error") # if error, not ingested + is_ingested +} + + + # other functions prepend_doi <- function(dataset) { if (grepl("^hdl", dataset)) { @@ -143,3 +167,4 @@ parse_dataset <- function(out) { out$files <- cbind(out$files, file_df) structure(out, class = "dataverse_dataset") } + diff --git a/man/is_ingested.Rd b/man/is_ingested.Rd new file mode 100644 index 0000000..501f935 --- /dev/null +++ b/man/is_ingested.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{is_ingested} +\alias{is_ingested} +\title{Identify if file is an ingested file} +\usage{ +is_ingested(fileid, server = Sys.getenv("DATAVERSE_SERVER")) +} +\arguments{ +\item{fileid}{A numeric fileid or file-specific DOI} +} +\description{ +Identify if file is an ingested file +} +\examples{ +# https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/X5MUPQ/T0KKUZ +# nlsw88.tab +is_ingested(fileid = "doi:10.70122/FK2/X5MUPQ/T0KKUZ", + server = "demo.dataverse.org") + +# nlsw88_rds-export.rds +is_ingested(fileid = "doi:10.70122/FK2/PPIAXE/SUCFNI", + server = "demo.dataverse.org") + +} From be0aad872a8295b773c32f348ff2ce168e0a1783 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 23:33:59 -0500 Subject: [PATCH 38/75] Use the "original" 
argument, and replace all examples with that --- NAMESPACE | 1 + R/get_file.R | 26 +++---- R/get_file_as_dataframe.R | 111 +++++++++++++-------------- R/get_file_by_id.R | 32 +++----- README.Rmd | 54 +++++++++---- README.md | 156 +++++++++++++++++++++----------------- man/files.Rd | 32 ++++---- man/get_dataframe.Rd | 82 ++++++++------------ 8 files changed, 252 insertions(+), 242 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c79c987..f928a79 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ export(update_dataset) export(update_dataset_file) import(httr) import(xml2) +importFrom(readr,read_tsv) importFrom(stats,setNames) importFrom(stringr,str_extract) importFrom(utils,str) diff --git a/R/get_file.R b/R/get_file.R index 2680c2e..c23b77c 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -44,25 +44,22 @@ #' \dontrun{ #' #' # 1. Using filename and dataverse -#' f1 <- get_file_by_name("gapminder-FiveYearData.tab", -#' dataset = "doi:10.70122/FK2/PPKHI1", +#' f1 <- get_file_by_name("nlsw88.tab", +#' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org") #' #' # 2. Using DOI -#' f2 <- get_file_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", +#' f2 <- get_file_by_doi("10.70122/FK2/PPIAXE/MHDB0O", #' server = "demo.dataverse.org") #' #' # 3. Two-steps: Find ID from get_dataset -#' d3 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +#' d3 <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") #' f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") #' #' -#' # 4. Alternatively, based on "dataverse_file" object -#' f4_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") -#' f4 <- get_file(f4_dvf[[1]], server = "demo.dataverse.org") #' #' # 5. 
Retrieve multiple raw data in list -#' f5_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", +#' f5_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org")$files$id #' f5 <- get_file(f5_vec, #' server = "demo.dataverse.org") @@ -70,8 +67,9 @@ #' #' # Write binary files. #' # The appropriate file extension needs to be assigned by the user. -#' writeBin(f1, "gapminder-FiveYearData.tab") -#' writeBin(f5[[1]], "gapminder-FiveYearData.tab") +#' writeBin(f1, "nlsw88.tab") +#' writeBin(f2, "nlsw88.tab") +#' writeBin(f5[[1]], "nlsw88.tab") #' #' # NOTE: fix so that get_file (with multiple) files #' # (f5) in example can return a tabulated dataset in original @@ -86,7 +84,7 @@ get_file <- server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), - archival = NULL, + original = NULL, ...) { format <- match.arg(format) @@ -121,7 +119,7 @@ get_file <- vars = vars, key = key, server = server, - archival = archival, + original = original, ... ) } @@ -150,7 +148,7 @@ get_file_by_name <- function(filename, server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), - archival = NULL, + original = TRUE, ... ) { format <- match.arg(format) @@ -167,7 +165,7 @@ get_file_by_name <- function(filename, vars = vars, key = key, server = server, - archival = archival, + original = original, ...) } diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 7858f13..6859ddd 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -8,75 +8,60 @@ #' @param file to be passed on to get_file #' @param dataset to be passed on to get_file #' @param FUN The function to used for reading in the raw dataset. This user -#' must choose the appropriate funuction: for example if the target is a .rds +#' must choose the appropriate function: for example if the target is a .rds #' file, then `FUN` should be `readRDS` or `readr::read_rds`. 
-#' @param archival Whether to read from the ingested, archival version of the +#' @param original A logical, defaulting to FALSE. Whether to read the original file instead of the ingested, -#' dataset, or whether to read the original. The archival versions are tab-delimited -#' `.tab` files. If functions to read the original version is available without -#' loss of information, then `archival = FALSE` is better. If such functions -#' are not available or the original format is unknown, use `archival = TRUE`. +#' archival version of the dataset if one exists. The archival versions are tab-delimited +#' `.tab` files, so if `original = FALSE` and `FUN` is not supplied, `FUN` is set to `readr::read_tsv`. +#' If a function to read the original version is available, then `original = TRUE` +#' with a specified `FUN` is better. #' @inheritDotParams get_file #' +#' @importFrom readr read_tsv +#' #' @examples #' library(readr) #' -# load dataset from file name and dataverse DOI -#' gap_df <- get_dataframe_by_name( -#' file = "gapminder-FiveYearData.tab", -#' dataset = "doi:10.70122/FK2/PPKHI1", -#' server = "demo.dataverse.org", -#' FUN = read_csv) +#' # load dataset from file name and dataverse DOI +#' csv_tab <- get_dataframe_by_name( +#' file = "roster-bulls-1996.tab", +#' dataset = "doi:10.70122/FK2/HXJVJU", +#' server = "demo.dataverse.org") #' #' # or a Stata dta #' stata_df <- get_dataframe_by_name( #' file = "nlsw88.tab", -#' dataset = "doi:10.70122/FK2/PPKHI1", -#' server = "demo.dataverse.org", -#' FUN = haven::read_dta) +#' dataset = "doi:10.70122/FK2/PPIAXE", +#' server = "demo.dataverse.org") #' -#' # or a Rds file -#' rds_df <- get_dataframe_by_name( -#' file = "nlsw88_rds-export.rds", -#' dataset = "doi:10.70122/FK2/PPKHI1", -#' server = "demo.dataverse.org", -#' FUN = read_rds) +#' # To use the original version, or for non-ingested data, +#' # please specify `original = TRUE` and specify a function in FUN #' -#' # equivalently, if you know the DOI -#' gap_df <- get_dataframe_by_doi( -#' filedoi = 
"10.70122/FK2/PPKHI1/ZYATZZ", -#' server = "demo.dataverse.org", -#' FUN = read_csv -#' ) -#' -#' # or the id -#' # you can also customize the FUN (in this case to supress parse msg) -#' gap_df <- get_dataframe_by_id( -#' 1733998, +#' stata_df <- get_dataframe_by_name( +#' file = "nlsw88.tab", +#' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org", -#' FUN = function(x) read_csv(x, col_types = cols())) -#' -#' # equivalently, using a dataverse object -#' gap_ds <- dataset_files("doi:10.70122/FK2/PPKHI1", -#' server = "demo.dataverse.org") +#' original = TRUE, +#' FUN = haven::read_dta) #' -#' gap_df <- get_dataframe_by_id( -#' gap_ds[[1]], +#' rds_df <- get_dataframe_by_name( +#' file = "nlsw88_rds-export.rds", +#' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org", -#' FUN = function(x) read_csv(x, col_types = cols())) +#' FUN = readr::read_rds) #' -#' # to use the archival version (and read as TSV) -#' gap_df <- get_dataframe_by_id( -#' 1733998, +#' # equivalently, if you know the DOI +#' stata_df <- get_dataframe_by_doi( +#' filedoi = "10.70122/FK2/PPIAXE/MHDB0O", #' server = "demo.dataverse.org", -#' archival = TRUE, -#' FUN = function(x) read_tsv(x, col_types = cols())) -#' -#' +#' original = TRUE, +#' FUN = haven::read_dta +#' ) #' @export get_dataframe_by_name <- function(file, dataset = NULL, FUN = NULL, - archival = FALSE, + original = FALSE, ...) { # retrieve ID @@ -84,26 +69,38 @@ get_dataframe_by_name <- function(file, file = file, ...) - get_dataframe_by_id(fileid, FUN, archival = archival, ...) + get_dataframe_by_id(fileid, FUN, original = original, ...) } #' @rdname get_dataframe +#' @importFrom readr read_tsv #' @export get_dataframe_by_id <- function(fileid, FUN = NULL, - archival = FALSE, + original = FALSE, ...) { - raw <- get_file(file = fileid, archival = archival, ...) + # if not ingested, then whether to take the original is not relevant. + ingested <- is_ingested(fileid, ...) 
- # default of get_file - if (is.null(FUN)) { - warning("function was not supplied so returning the raw binary file.") - return(raw) + if (isFALSE(ingested)) { + original <- NA } + if (is.null(FUN) & isTRUE(ingested) & isFALSE(original)) { + warning("Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE.\n") + FUN <- read_tsv + } + + if (is.null(FUN) & (isFALSE(ingested) | isTRUE(original))) { + stop("read-in function was left NULL, but the target file is not ingested or you asked for the original version. Please supply a FUN argument.\n") + } + + # READ raw data + raw <- get_file(file = fileid, original = original, ...) + # save to temp and then read it in with supplied function if (!is.null(FUN)) { get_dataframe_internal(raw, filename = "foo", .f = FUN) @@ -116,12 +113,12 @@ get_dataframe_by_id <- function(fileid, #' @export get_dataframe_by_doi <- function(filedoi, FUN = NULL, - archival = FALSE, + original = FALSE, ...) { filedoi <- prepend_doi(filedoi) # get_file can also take doi now - get_dataframe_by_id(file = filedoi, FUN = FUN, archival = archival, ...) + get_dataframe_by_id(file = filedoi, FUN = FUN, original = original, ...) } #' Write to temp and apply function diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index cc656cc..2ba1eb3 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -1,7 +1,9 @@ #' @rdname files #' -#' @param archival If a ingested (.tab) version is available, download -#' the ingested archival version or not? +#' @param original A logical, defaulting to TRUE. If an ingested (.tab) version is +#' available, download the original version instead of the ingested one? If there is +#' no ingested version, this is set to NA. Note that in `get_dataframe_*`, +#' `original` is set to FALSE by default but can be changed.
#' @param fileid A numeric ID internally used for `get_file_by_id` #' #' @@ -12,7 +14,7 @@ get_file_by_id <- server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, - archival = NULL, + original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), ...) { format <- match.arg(format) @@ -28,21 +30,11 @@ get_file_by_id <- # ping get_file_metadata to see if file is ingested - ping_metadata <- tryCatch(get_file_metadata(fileid, server = server), - error = function(e) e) - is_ingested <- !inherits(ping_metadata, "error") # if error, not ingested + is_ingested <- is_ingested(fileid, server = server) # update archival if not specified - if (is.null(archival)) - archival <- FALSE - - # check - if (archival & !is_ingested) - stop("You requested an archival version, but the file has no metadata so does not appear ingested.") - - - # downloading files sequentially and add the raw vectors to a list - out <- vector("list", length(fileid)) + if (isFALSE(is_ingested)) + original <- NA # create query ----- query <- list() @@ -55,8 +47,8 @@ get_file_by_id <- if (is_ingested & format != "bundle") query$format <- match.arg(format) - # if the archival version is desired, we need to NOT specify a format - if (is_ingested & archival) + # if the original is not desired, we need to NOT specify a format + if (is_ingested & (isFALSE(original) | is.na(original))) query$format <- NULL @@ -92,7 +84,7 @@ get_file_by_doi <- function(filedoi, server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, - archival = NULL, + original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), ...) { @@ -103,7 +95,7 @@ get_file_by_doi <- function(filedoi, vars = vars, key = key, server = server, - archival = archival, + original = original, ... ) diff --git a/README.Rmd b/README.Rmd index 95c8f59..8acc2cc 100644 --- a/README.Rmd +++ b/README.Rmd @@ -65,28 +65,51 @@ Use the `get_dataframe_*` functions, depending on the input you have. 
For exampl With a file DOI ```{r get_dataframe_by_doi} -nlsw <- get_dataframe_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", - haven::read_dta, +nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") -nlsw ``` -With a name and dataset DOI +Alternatively, we can download the same file by specifying the filename and the DOI of the "dataset" (in Dataverse, a collection of files is called a dataset). -```{r get_dataframe_by_name} -nlsw <- get_dataframe_by_name(file = "nlsw88.tab", - dataset = "10.70122/FK2/PPKHI1", - haven::read_dta, - server = "demo.dataverse.org") +```{r get_dataframe_by_name_tsv, message=FALSE} +nlsw_tsv <- get_dataframe_by_name(file = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org") ``` -Note that even though the file prefix is ".tab", we use `read_dta`. This is because this file was originally a dta file that was ingested into an archival format with a ".tab" file extension. The `get_dataframe_` functions do not attempt to download the archival versions by default, but it is possible to turn this option off with \`archival = TRUE\`. +Many file formats are translated into an ingested, or "archival" version, which is application-neutral and easily-readable. The `get_dataframe_*` functions read this ingested version by default (`original = FALSE`). This is safer because you may not have the proprietary software that was originally used. On the other hand, using the ingested version may lead to loss of information. + +To read the original version of the same file, specify `original = TRUE` and set a `FUN` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function.
+ +```{r get_dataframe_by_name_original} +nlsw_original <- get_dataframe_by_name(file = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + FUN = haven::read_dta, + original = TRUE, + server = "demo.dataverse.org") +``` + +Note that even though the file extension is ".tab", we use `read_dta`. + + +Note the difference between `nlsw_tsv` and `nlsw_original`. `nlsw_original` preserves data attributes like value labels, whereas `nlsw_tsv` has dropped them or left them in the file metadata. + +```{r} +class(nlsw_tsv$race) +class(nlsw_original$race) + +head(nlsw_tsv$race) +head(haven::as_factor(nlsw_original$race)) +``` + + -Sometimes you may know the underlying file ID. In this case, the fileid is +You may know the underlying file ID, which is a single number unique to the dataset. In this case, the fileid is `1734017` ```{r get_dataframe_by_id} -nlsw <- get_dataframe_by_id(1733999, - haven::read_dta, +nlsw <- get_dataframe_by_id(fileid = 1734017, + FUN = haven::read_dta, + original = TRUE, server = "demo.dataverse.org") ``` @@ -96,7 +119,7 @@ In some cases, you may not need to render the raw binary file, or you do not hav ```{r get_file_by_name} nlsw_raw <- get_file_by_name(file = "nlsw88.tab", - dataset = "10.70122/FK2/PPKHI1", + dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") class(nlsw_raw) ``` @@ -104,7 +127,8 @@ class(nlsw_raw) The function `get_file_metadata` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. The function `get_dataset` will retrieve the list of files in a dataset.
```{r, get_dataset} -get_dataset("doi:10.7910/DVN/ARKOTI") +get_dataset(dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org") ``` ### Data Discovery diff --git a/README.md b/README.md index 1f10301..26a3e73 100644 --- a/README.md +++ b/README.md @@ -91,48 +91,92 @@ For example, we will read a survey dataset on dataverse, With a file DOI ``` r -nlsw <- get_dataframe_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", - haven::read_dta, +nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") -nlsw ``` - ## # A tibble: 2,246 x 17 - ## idcode age race married never_married grade collgrad south smsa c_city industry occupation union wage - ## - ## 1 1 37 2 [bla… 0 [sin… 0 12 0 [not … 0 1 [SMS… 0 5 [Tra… 6 [Opera… 1 [uni… 11.7 - ## 2 2 37 2 [bla… 0 [sin… 0 12 0 [not … 0 1 [SMS… 1 4 [Man… 5 [Craft… 1 [uni… 6.40 - ## 3 3 42 2 [bla… 0 [sin… 1 12 0 [not … 0 1 [SMS… 1 4 [Man… 3 [Sales] NA 5.02 - ## 4 4 43 1 [whi… 1 [mar… 0 17 1 [coll… 0 1 [SMS… 0 11 [Pro… 13 [Other] 1 [uni… 9.03 - ## 5 6 42 1 [whi… 1 [mar… 0 12 0 [not … 0 1 [SMS… 0 4 [Man… 6 [Opera… 0 [non… 8.08 - ## 6 7 39 1 [whi… 1 [mar… 0 12 0 [not … 0 1 [SMS… 0 11 [Pro… 3 [Sales] 0 [non… 4.63 - ## 7 9 37 1 [whi… 0 [sin… 0 12 0 [not … 0 1 [SMS… 1 5 [Tra… 2 [Manag… 1 [uni… 10.5 - ## 8 12 40 1 [whi… 1 [mar… 0 18 1 [coll… 0 1 [SMS… 0 11 [Pro… 2 [Manag… 0 [non… 17.2 - ## 9 13 40 1 [whi… 1 [mar… 0 14 0 [not … 0 1 [SMS… 0 11 [Pro… 3 [Sales] 0 [non… 13.1 - ## 10 14 40 1 [whi… 1 [mar… 0 15 0 [not … 0 1 [SMS… 0 11 [Pro… 1 [Profe… 0 [non… 7.75 - ## # … with 2,236 more rows, and 3 more variables: hours , ttl_exp , tenure - -With a name and dataset DOI + ## Warning in get_dataframe_by_id(file = filedoi, FUN = FUN, original = original, : Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE. 
+ + ## Parsed with column specification: + ## cols( + ## idcode = col_double(), + ## age = col_double(), + ## race = col_double(), + ## married = col_double(), + ## never_married = col_double(), + ## grade = col_double(), + ## collgrad = col_double(), + ## south = col_double(), + ## smsa = col_double(), + ## c_city = col_double(), + ## industry = col_double(), + ## occupation = col_double(), + ## union = col_double(), + ## wage = col_double(), + ## hours = col_double(), + ## ttl_exp = col_double(), + ## tenure = col_double() + ## ) + +Alternatively, we can download the same file by specifying the filename +and the DOI of the “dataset” (in Dataverse, a collection of files is +called a dataset). ``` r -nlsw <- get_dataframe_by_name(file = "nlsw88.tab", - dataset = "10.70122/FK2/PPKHI1", - haven::read_dta, - server = "demo.dataverse.org") +nlsw_tsv <- get_dataframe_by_name(file = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org") +``` + + ## Warning in get_dataframe_by_id(fileid, FUN, original = original, ...): Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE. + +Many file formats are translated into an ingested, or “archival” +version, which is application-neutral and easily-readable. +`read_dataframe` takes this ingested version as a default by deafaulting +`original = FALSE`. This is safer because you may not have the +properietary software that was originally used. On the other hand, using +the ingested version may lead to loss of information. + +To read the same file but its original version, specify +`original = TRUE` and set a `FUN` argument. In this case, we know that +`nlsw88.tab` is a Stata `.dta` dataset, so we will use the +`haven::read_dta` function. 
+ +``` r +nlsw_original <- get_dataframe_by_name(file = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + FUN = haven::read_dta, + original = TRUE, + server = "demo.dataverse.org") +``` + +Note that even though the file prefix is “.tab”, we use `read_dta`. + +Note the difference between `nls_tsv` and `nls_original`. `nls_original` +preserves the data attributes like value labels, whereas `nls_tsv` has +dropped this or left this in file metadata. + +``` r +class(nlsw_tsv$race) +class(nlsw_original$race) + +head(nlsw_tsv$race) +head(haven::as_factor(nlsw_original$race)) ``` -Note that even though the file prefix is “.tab”, we use `read_dta`. This -is because this file was originally a dta file that was ingested into an -archival format with a “.tab” file extension. The `get_dataframe_` -functions do not attempt to download the archival versions by default, -but it is possible to turn this option off with \`archival = TRUE\`. + ## [1] "numeric" + ## [1] "haven_labelled" "vctrs_vctr" "double" + ## [1] 2 2 2 1 1 1 + ## [1] black black black white white white + ## Levels: white black other -Sometimes you may know the underlying file ID. In this case, the fileid -is +You may know the underlying file ID, which is a single numeric number +unique to the dataset. In this case, the fileid is `1734017` ``` r -nlsw <- get_dataframe_by_id(1733999, - haven::read_dta, +nlsw <- get_dataframe_by_id(fileid = 1734017, + FUN = haven::read_dta, + original = TRUE, server = "demo.dataverse.org") ``` @@ -146,7 +190,7 @@ argument ``` r nlsw_raw <- get_file_by_name(file = "nlsw88.tab", - dataset = "10.70122/FK2/PPKHI1", + dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") class(nlsw_raw) ``` @@ -158,48 +202,18 @@ return a metadata format for ingested tabular files in the `ddi` format. The function `get_dataset` will retrieve the list of files in a dataset. 
``` r -get_dataset("doi:10.7910/DVN/ARKOTI") +get_dataset(dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org") ``` - ## Dataset (193956): - ## Version: 2.0, RELEASED - ## Release Date: 2020-04-29T01:52:28Z + ## Dataset (182162): + ## Version: 1.1, RELEASED + ## Release Date: 2020-12-30T00:00:24Z ## License: CC0 ## 22 Files: - ## label version id contentType - ## 1 alpl2013.tab 1 2692294 text/tab-separated-values - ## 2 BPchap7.tab 1 2692295 text/tab-separated-values - ## 3 chapter01.R 1 2692202 text/plain; charset=US-ASCII - ## 4 chapter02.R 1 2692206 text/plain; charset=US-ASCII - ## 5 chapter03.R 1 2692210 text/plain; charset=US-ASCII - ## 6 chapter04.R 1 2692204 text/plain; charset=US-ASCII - ## 7 chapter05.R 1 2692205 text/plain; charset=US-ASCII - ## 8 chapter06.R 1 2692212 text/plain; charset=US-ASCII - ## 9 chapter07.R 1 2692209 text/plain; charset=US-ASCII - ## 10 chapter08.R 1 2692208 text/plain; charset=US-ASCII - ## 11 chapter09.R 1 2692211 text/plain; charset=US-ASCII - ## 12 chapter10.R 1 2692203 text/plain; charset=US-ASCII - ## 13 chapter11.R 1 2692207 text/plain; charset=US-ASCII - ## 14 comprehensiveJapanEnergy.tab 1 2692296 text/tab-separated-values - ## 15 constructionData.tab 1 2692293 text/tab-separated-values - ## 16 drugCoverage.csv 1 2692233 text/plain; charset=US-ASCII - ## 17 erratum.pdf 1 3820744 application/pdf - ## 18 hanmerKalkanANES.tab 1 2692290 text/tab-separated-values - ## 19 hmnrghts.tab 1 2692298 text/tab-separated-values - ## 20 hmnrghts.txt 1 2692238 text/plain - ## 21 levant.tab 1 2692289 text/tab-separated-values - ## 22 LL.csv 1 2692228 text/plain; charset=US-ASCII - ## 23 moneyDem.tab 1 2692292 text/tab-separated-values - ## 24 owsiakJOP2013.tab 1 2692297 text/tab-separated-values - ## 25 PESenergy.csv 1 2692230 text/plain; charset=US-ASCII - ## 26 pts1994.csv 1 2692229 text/plain; charset=US-ASCII - ## 27 pts1995.csv 1 2692231 text/plain; charset=US-ASCII - ## 28 sen113kh.ord 1 2692239 text/plain; 
charset=US-ASCII - ## 29 SinghEJPR.tab 1 2692299 text/tab-separated-values - ## 30 SinghJTP.tab 1 2692288 text/tab-separated-values - ## 31 stdSingh.tab 1 2692291 text/tab-separated-values - ## 32 UN.csv 1 2692232 text/plain; charset=US-ASCII - ## 33 war1800.tab 1 2692300 text/tab-separated-values + ## label version id contentType + ## 1 nlsw88_rds-export.rds 1 1734016 application/octet-stream + ## 2 nlsw88.tab 3 1734017 text/tab-separated-values ### Data Discovery diff --git a/man/files.Rd b/man/files.Rd index ca0d50a..780ce26 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -14,7 +14,7 @@ get_file( server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), - archival = NULL, + original = NULL, ... ) @@ -25,7 +25,7 @@ get_file_by_name( server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), - archival = NULL, + original = TRUE, ... ) @@ -35,7 +35,7 @@ get_file_by_id( server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, - archival = NULL, + original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), ... ) @@ -46,7 +46,7 @@ get_file_by_doi( server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, - archival = NULL, + original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), ... ) @@ -78,8 +78,10 @@ is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} -\item{archival}{If a ingested (.tab) version is available, download -the ingested archival version or not?} +\item{original}{A logical, defaulting to TRUE. If a ingested (.tab) version is +available, download the original version instead of the ingested? If there was +no ingested version, is set to NA. 
Note in \verb{get_dataframe_*}, +\code{original} is set to FALSE by default can be changed.} \item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or @@ -116,25 +118,22 @@ This function provides access to data files from a Dataverse entry. \dontrun{ # 1. Using filename and dataverse -f1 <- get_file_by_name("gapminder-FiveYearData.tab", - dataset = "doi:10.70122/FK2/PPKHI1", +f1 <- get_file_by_name("nlsw88.tab", + dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") # 2. Using DOI -f2 <- get_file_by_doi("10.70122/FK2/PPKHI1/ZYATZZ", +f2 <- get_file_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") # 3. Two-steps: Find ID from get_dataset -d3 <- get_dataset("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") +d3 <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") -# 4. Alternatively, based on "dataverse_file" object -f4_dvf <- dataset_files("doi:10.70122/FK2/PPKHI1", server = "demo.dataverse.org") -f4 <- get_file(f4_dvf[[1]], server = "demo.dataverse.org") # 5. Retrieve multiple raw data in list -f5_vec <- get_dataset("doi:10.70122/FK2/PPKHI1", +f5_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org")$files$id f5 <- get_file(f5_vec, server = "demo.dataverse.org") @@ -142,8 +141,9 @@ length(f5) # Write binary files. # The appropriate file extension needs to be assigned by the user. 
-writeBin(f1, "gapminder-FiveYearData.tab") -writeBin(f5[[1]], "gapminder-FiveYearData.tab") +writeBin(f1, "nlsw88.tab") +writeBin(f2, "nlsw88.tab") +writeBin(f5[[1]], "nlsw88.tab") # NOTE: fix so that get_file (with multiple) files # (f5) in example can return a tabulated dataset in original diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index a414573..beb3b58 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -6,11 +6,11 @@ \alias{get_dataframe_by_doi} \title{Get file from dataverse and convert it into a dataframe or tibble} \usage{ -get_dataframe_by_name(file, dataset = NULL, FUN = NULL, archival = FALSE, ...) +get_dataframe_by_name(file, dataset = NULL, FUN = NULL, original = FALSE, ...) -get_dataframe_by_id(fileid, FUN = NULL, archival = FALSE, ...) +get_dataframe_by_id(fileid, FUN = NULL, original = FALSE, ...) -get_dataframe_by_doi(filedoi, FUN = NULL, archival = FALSE, ...) +get_dataframe_by_doi(filedoi, FUN = NULL, original = FALSE, ...) } \arguments{ \item{file}{to be passed on to get_file} @@ -18,14 +18,14 @@ get_dataframe_by_doi(filedoi, FUN = NULL, archival = FALSE, ...) \item{dataset}{to be passed on to get_file} \item{FUN}{The function to used for reading in the raw dataset. This user -must choose the appropriate funuction: for example if the target is a .rds +must choose the appropriate function: for example if the target is a .rds file, then \code{FUN} should be \code{readRDS} or \code{readr::read_rds}.} -\item{archival}{Whether to read from the ingested, archival version of the -dataset, or whether to read the original. The archival versions are tab-delimited -\code{.tab} files. If functions to read the original version is available without -loss of information, then \code{archival = FALSE} is better. If such functions -are not available or the original format is unknown, use \code{archival = TRUE}.} +\item{original}{A logical, defaulting to TRUE. Whether to read the ingested, +archival version of the dataset if one exists. 
The archival versions are tab-delimited +\code{.tab} files so if \code{original = FALSE}, \code{FUN} is set to \code{readr::read_tsv}. +If functions to read the original version is available, then \code{original = TRUE} +with a specified \code{FUN} is better.} \item{...}{ Arguments passed on to \code{\link[=get_file]{get_file}} @@ -60,55 +60,39 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \examples{ library(readr) -gap_df <- get_dataframe_by_name( - file = "gapminder-FiveYearData.tab", - dataset = "doi:10.70122/FK2/PPKHI1", - server = "demo.dataverse.org", - FUN = read_csv) +# load dataset from file name and dataverse DOI +csv_tab <- get_dataframe_by_name( + file = "roster-bulls-1996.tab", + dataset = "doi:10.70122/FK2/HXJVJU", + server = "demo.dataverse.org") # or a Stata dta stata_df <- get_dataframe_by_name( file = "nlsw88.tab", - dataset = "doi:10.70122/FK2/PPKHI1", - server = "demo.dataverse.org", - FUN = haven::read_dta) - -# or a Rds file -rds_df <- get_dataframe_by_name( - file = "nlsw88_rds-export.rds", - dataset = "doi:10.70122/FK2/PPKHI1", - server = "demo.dataverse.org", - FUN = read_rds) + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org") -# equivalently, if you know the DOI -gap_df <- get_dataframe_by_doi( - filedoi = "10.70122/FK2/PPKHI1/ZYATZZ", - server = "demo.dataverse.org", - FUN = read_csv -) +# To use the original version, or for non-ingested data, +# please specify `orginal = TRUE` and specify a function in FUN -# or the id -# you can also customize the FUN (in this case to supress parse msg) -gap_df <- get_dataframe_by_id( - 1733998, +stata_df <- get_dataframe_by_name( + file = "nlsw88.tab", + dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org", - FUN = function(x) read_csv(x, col_types = cols())) - -# equivalently, using a dataverse object -gap_ds <- dataset_files("doi:10.70122/FK2/PPKHI1", - server = "demo.dataverse.org") + original = TRUE, + FUN = haven::read_dta) 
-gap_df <- get_dataframe_by_id( - gap_ds[[1]], +rds_df <- get_dataframe_by_name( + file = "nlsw88_rds-export.rds", + dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org", - FUN = function(x) read_csv(x, col_types = cols())) + FUN = readr::read_rds) -# to use the archival version (and read as TSV) -gap_df <- get_dataframe_by_id( - 1733998, +# equivalently, if you know the DOI +stata_df <- get_dataframe_by_doi( + filedoi = "10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org", - archival = TRUE, - FUN = function(x) read_tsv(x, col_types = cols())) - - + original = TRUE, + FUN = haven::read_dta +) } From f15abe8510d94f152aa48184d2cac159eec695f0 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Tue, 29 Dec 2020 23:58:28 -0500 Subject: [PATCH 39/75] Update examples in get_file --- R/get_file.R | 30 ++++++++++++++++-------------- R/get_file_by_id.R | 6 +++--- man/files.Rd | 34 ++++++++++++++++++---------------- man/get_dataframe.Rd | 2 +- 4 files changed, 38 insertions(+), 34 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index c23b77c..d466026 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -45,10 +45,10 @@ #' #' # 1. Using filename and dataverse #' f1 <- get_file_by_name("nlsw88.tab", -#' dataset = "doi:10.70122/FK2/PPIAXE", +#' dataset = "10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org") #' -#' # 2. Using DOI +#' # 2. Using file DOI #' f2 <- get_file_by_doi("10.70122/FK2/PPIAXE/MHDB0O", #' server = "demo.dataverse.org") #' @@ -58,21 +58,21 @@ #' #' #' -#' # 5. Retrieve multiple raw data in list -#' f5_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", +#' # 4. Retrieve multiple raw data in list +#' f4_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org")$files$id -#' f5 <- get_file(f5_vec, +#' f4 <- get_file(f4_vec, #' server = "demo.dataverse.org") -#' length(f5) +#' length(f4) #' -#' # Write binary files. 
+#' # Write binary files +#' # (see `get_dataframe_by_name` to load in environment) #' # The appropriate file extension needs to be assigned by the user. -#' writeBin(f1, "nlsw88.tab") -#' writeBin(f2, "nlsw88.tab") -#' writeBin(f5[[1]], "nlsw88.tab") +#' writeBin(f1, "nlsw88.dta") +#' writeBin(f2, "nlsw88.dta") #' -#' # NOTE: fix so that get_file (with multiple) files -#' # (f5) in example can return a tabulated dataset in original +#' writeBin(f4[[1]], "nlsw88.rds") # originally a rds file +#' writeBin(f4[[2]], "nlsw88.dta") # originally a dta file #' #' } #' @@ -84,7 +84,7 @@ get_file <- server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), - original = NULL, + original = TRUE, ...) { format <- match.arg(format) @@ -137,7 +137,9 @@ get_file <- #' @rdname files #' #' -#' @param filename Filename of the dataset, with file extension +#' @param filename Filename of the dataset, with file extension as shown in Dataverse +#' (for example, if nlsw88.dta was the original but is displayed as the ingested +#' nlsw88.tab, use the ingested version.) #' #' @inheritParams get_file #' diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 2ba1eb3..b797c68 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -3,7 +3,7 @@ #' @param original A logical, defaulting to TRUE. If a ingested (.tab) version is #' available, download the original version instead of the ingested? If there was #' no ingested version, is set to NA. Note in `get_dataframe_*`, -#' `original` is set to FALSE by default can be changed. +#' `original` is set to FALSE by default. Either can be changed. 
#' @param fileid A numeric ID internally used for `get_file_by_id` #' #' @@ -48,7 +48,7 @@ get_file_by_id <- query$format <- match.arg(format) # if the original is not desired, we need to NOT specify a format - if (is_ingested & (isFALSE(original) | is.na(original))) + if (is_ingested & (isFALSE(original) || is.na(original) || is.null(original))) query$format <- NULL @@ -76,7 +76,7 @@ get_file_by_id <- #' @rdname files #' @param filedoi A DOI for a single file (not the entire dataset), of the form -#' `"10.70122/FK2/PPKHI1/ZYATZZ"` or `"doi:10.70122/FK2/PPKHI1/ZYATZZ"` +#' `"10.70122/FK2/PPIAXE/MHDB0O"` or `"doi:10.70122/FK2/PPIAXE/MHDB0O"` #' #' @export get_file_by_doi <- function(filedoi, diff --git a/man/files.Rd b/man/files.Rd index 780ce26..2cda584 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -14,7 +14,7 @@ get_file( server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), - original = NULL, + original = TRUE, ... ) @@ -81,18 +81,20 @@ Keys can be specified atomically or globally using \item{original}{A logical, defaulting to TRUE. If a ingested (.tab) version is available, download the original version instead of the ingested? If there was no ingested version, is set to NA. Note in \verb{get_dataframe_*}, -\code{original} is set to FALSE by default can be changed.} +\code{original} is set to FALSE by default. 
Either can be changed.} \item{...}{Additional arguments passed to an HTTP request function, such as \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} -\item{filename}{Filename of the dataset, with file extension} +\item{filename}{Filename of the dataset, with file extension as shown in Dataverse +(for example, if nlsw88.dta was the original but is displayed as the ingested +nlsw88.tab, use the ingested version.)} \item{fileid}{A numeric ID internally used for \code{get_file_by_id}} \item{filedoi}{A DOI for a single file (not the entire dataset), of the form -\code{"10.70122/FK2/PPKHI1/ZYATZZ"} or \code{"doi:10.70122/FK2/PPKHI1/ZYATZZ"}} +\code{"10.70122/FK2/PPIAXE/MHDB0O"} or \code{"doi:10.70122/FK2/PPIAXE/MHDB0O"}} } \value{ \code{get_file} returns a raw vector (or list of raw vectors, @@ -119,10 +121,10 @@ This function provides access to data files from a Dataverse entry. # 1. Using filename and dataverse f1 <- get_file_by_name("nlsw88.tab", - dataset = "doi:10.70122/FK2/PPIAXE", + dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") -# 2. Using DOI +# 2. Using file DOI f2 <- get_file_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") @@ -132,21 +134,21 @@ f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") -# 5. Retrieve multiple raw data in list -f5_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", +# 4. Retrieve multiple raw data in list +f4_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org")$files$id -f5 <- get_file(f5_vec, +f4 <- get_file(f4_vec, server = "demo.dataverse.org") -length(f5) +length(f4) -# Write binary files. +# Write binary files +# (see `get_dataframe_by_name` to load in environment) # The appropriate file extension needs to be assigned by the user. 
-writeBin(f1, "nlsw88.tab") -writeBin(f2, "nlsw88.tab") -writeBin(f5[[1]], "nlsw88.tab") +writeBin(f1, "nlsw88.dta") +writeBin(f2, "nlsw88.dta") -# NOTE: fix so that get_file (with multiple) files -# (f5) in example can return a tabulated dataset in original +writeBin(f4[[1]], "nlsw88.rds") # originally a rds file +writeBin(f4[[2]], "nlsw88.dta") # originally a dta file } diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index beb3b58..479b66b 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -51,7 +51,7 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \item{fileid}{A numeric ID internally used for \code{get_file_by_id}} \item{filedoi}{A DOI for a single file (not the entire dataset), of the form -\code{"10.70122/FK2/PPKHI1/ZYATZZ"} or \code{"doi:10.70122/FK2/PPKHI1/ZYATZZ"}} +\code{"10.70122/FK2/PPIAXE/MHDB0O"} or \code{"doi:10.70122/FK2/PPIAXE/MHDB0O"}} } \description{ \code{get_dataframe_by_id}, if you know the numeric ID of the dataset, or instead From 66966b610815413c5ff31c714f2becc436fbeb4e Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Wed, 30 Dec 2020 17:14:51 -0600 Subject: [PATCH 40/75] remove empty reference to 'stringr' package @kuriwaki, I believe this stringr isn't used, so I'm removing it from the dependencies ref #66 --- NAMESPACE | 1 - R/get_file_as_dataframe.R | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f928a79..b3675db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,5 +52,4 @@ import(httr) import(xml2) importFrom(readr,read_tsv) importFrom(stats,setNames) -importFrom(stringr,str_extract) importFrom(utils,str) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 6859ddd..fe8fc14 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -123,7 +123,7 @@ get_dataframe_by_doi <- function(filedoi, #' Write to temp and apply function #' -#' @importFrom stringr str_extract +# @importFrom stringr str_extract #' 
#' @keywords internal get_dataframe_internal <- function(raw, filename, .f) { From 504020a1893fa7f54e31059fdca1953e85e5f034 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Wed, 30 Dec 2020 18:17:27 -0500 Subject: [PATCH 41/75] Fix typos and improve description of get_file in README --- README.Rmd | 45 +++++++++++---------- README.md | 113 +++++++++++++++++++---------------------------------- 2 files changed, 62 insertions(+), 96 deletions(-) diff --git a/README.Rmd b/README.Rmd index 8acc2cc..063bd80 100644 --- a/README.Rmd +++ b/README.Rmd @@ -34,7 +34,7 @@ remotes::install_github("iqss/dataverse-client-r") library("dataverse") ``` -### Keys +#### Keys Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that a specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](http://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: @@ -42,7 +42,7 @@ Some features of the Dataverse 4 API are public and require no authentication. T Sys.setenv("DATAVERSE_KEY" = "examplekey12345") ``` -### Server +#### Server Because [there are many Dataverse installations](http://dataverse.org/), all functions in the R client require specifying what server installation you are interacting with. This can be set by default with an environment variable, `DATAVERSE_SERVER`. This should be the Dataverse server, without the "https" prefix or the "/api" URL path, etc. 
For example, the Harvard Dataverse can be used by setting: @@ -56,18 +56,19 @@ Currently, the package wraps the data management features of the Dataverse API. ### Data and Metadata Retrieval -Datasets on Dataverse are directly downloadable by their API, and this is straightforward especially if the data is not restricted. The dataverse package provides multiple interfaces. Users can supply a file DOI, a dataset DOI combined with a filename, or a dataverse object. They can read in the file as a raw binary or a dataset read in with the appropriate R function. +The dataverse package provides multiple interfaces to obtain data into R. Users can supply a file DOI, a dataset DOI combined with a filename, or a dataverse object. They can read in the file as a raw binary or a dataset read in with the appropriate R function. #### Reading data as R objects -Use the `get_dataframe_*` functions, depending on the input you have. For example, we will read a survey dataset on dataverse, [nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPKHI1/ZYATZZ) (`doi:10.70122/FK2/PPKHI1/ZYATZZ`), originally in Stata dta form. +Use the `get_dataframe_*` functions, depending on the input you have. For example, we will read a survey dataset on Dataverse, [nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPIAXE/MHDB0O) (`doi:10.70122/FK2/PPIAXE/MHDB0O`), originally in Stata dta form. -With a file DOI +With a file DOI, we can use the `get_dataframe_by_doi` function: ```{r get_dataframe_by_doi} nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") ``` +which by default reads in the ingested file (not the original dta) by the [`readr::read_tsv`](https://readr.tidyverse.org/reference/read_delim.html) function. Alternatively, we can download the same file by specifying the filename and the DOI of the "dataset" (in Dataverse, a collection of files is called a dataset). 
@@ -77,9 +78,11 @@ nlsw_tsv <- get_dataframe_by_name(file = "nlsw88.tab", server = "demo.dataverse.org") ``` -Many file formats are translated into an ingested, or "archival" version, which is application-neutral and easily-readable. `read_dataframe` takes this ingested version as a default by deafaulting `original = FALSE`. This is safer because you may not have the properietary software that was originally used. On the other hand, using the ingested version may lead to loss of information. +Now, Dataverse often translates rectangular data into an ingested, or "archival" version, which is application-neutral and easily-readable. `get_dataframe_*` defaults to taking this ingested version rather than using the original, through the argument `original = FALSE`. -To read the same file but its original version, specify `original = TRUE` and set a `FUN` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. +This default is safe because you may not have the proprietary software that was originally used. On the other hand, the data may have lost information during ingestion. + +Instead, to read the same file but its original version, specify `original = TRUE` and set a `FUN` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. ```{r get_dataframe_by_name_original} nlsw_original <- get_dataframe_by_name(file = "nlsw88.tab", @@ -91,31 +94,25 @@ nlsw_original <- get_dataframe_by_name(file = "nlsw88.tab", Note that even though the file prefix is ".tab", we use `read_dta`. +Of course, when the dataset is not ingested (such as a Rds file), users would always need to specify a `FUN` argument for the specific file. + Note the difference between `nls_tsv` and `nls_original`. `nls_original` preserves the data attributes like value labels, whereas `nls_tsv` has dropped this or left this in file metadata.
```{r} -class(nlsw_tsv$race) -class(nlsw_original$race) - -head(nlsw_tsv$race) -head(haven::as_factor(nlsw_original$race)) +class(nlsw_tsv$race) # tab ingested version only has numeric data ``` +```{r} +attr(nlsw_original$race, "labels") # original dta has value labels +``` -You may know the underlying file ID, which is a single numeric number unique to the dataset. In this case, the fileid is `1734017` -```{r get_dataframe_by_id} -nlsw <- get_dataframe_by_id(fileid = 1734017, - FUN = haven::read_dta, - original = TRUE, - server = "demo.dataverse.org") -``` #### Reading a dataset as a binary file. -In some cases, you may not need to render the raw binary file, or you do not have the functions to do so in R, so you want to write these into your local disk. To take only the raw files, use the `get_file` commands. The arguments are equivalent, except we do need a \`FUN\` argument +In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a `.docx` file), and you want to simply write these files your local disk. To do this, use the more primitive `get_file_*` commands. The arguments are equivalent, except we no longer need a `FUN` argument ```{r get_file_by_name} nlsw_raw <- get_file_by_name(file = "nlsw88.tab", @@ -124,6 +121,8 @@ nlsw_raw <- get_file_by_name(file = "nlsw88.tab", class(nlsw_raw) ``` +#### Reading file metadata + The function `get_file_metadata` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. The function `get_dataset` will retrieve the list of files in a dataset. ```{r, get_dataset} @@ -133,7 +132,7 @@ get_dataset(dataset = "10.70122/FK2/PPIAXE", ### Data Discovery -Dataverse supplies a pretty robust search API to discover Dataverses, datasets, and files. The simplest searches simply consist of a query string: +Dataverse supplies a robust search API to discover Dataverses, datasets, and files. 
The simplest searches simply consist of a query string: ```{r search1, eval = FALSE} dataverse_search("Gary King") @@ -147,8 +146,8 @@ dataverse_search(author = "Gary King", title = "Ecological Inference") And searches can be restricted to specific types of objects (Dataverse, dataset, or file): -```{r search3} -str(dataverse_search(author = "Gary King", type = "dataset"), 1) +```{r search3, eval = FALSE} +dataverse_search(author = "Gary King", type = "dataset") ``` The results are paginated using `per_page` argument. To retrieve subsequent pages, specify `start`. diff --git a/README.md b/README.md index 26a3e73..c0de64a 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ latest development version from GitHub: library("dataverse") ``` -### Keys +#### Keys Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve @@ -50,7 +50,7 @@ variable called `DATAVERSE_KEY`. It can be set within R using: Sys.setenv("DATAVERSE_KEY" = "examplekey12345") ``` -### Server +#### Server Because [there are many Dataverse installations](http://dataverse.org/), all functions in the R client require specifying what server @@ -74,21 +74,19 @@ code](https://github.com/IQSS/dataverse-client-r)). ### Data and Metadata Retrieval -Datasets on Dataverse are directly downloadable by their API, and this -is straightforward especially if the data is not restricted. The -dataverse package provides multiple interfaces. Users can supply a file -DOI, a dataset DOI combined with a filename, or a dataverse object. They -can read in the file as a raw binary or a dataset read in with the -appropriate R function. +The dataverse package provides multiple interfaces to obtain data into +R. Users can supply a file DOI, a dataset DOI combined with a filename, +or a dataverse object. They can read in the file as a raw binary or a +dataset read in with the appropriate R function. 
#### Reading data as R objects Use the `get_dataframe_*` functions, depending on the input you have. -For example, we will read a survey dataset on dataverse, +For example, we will read a survey dataset on Dataverse, [nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPKHI1/ZYATZZ) (`doi:10.70122/FK2/PPKHI1/ZYATZZ`), originally in Stata dta form. -With a file DOI +With a file DOI, we can use the `get_dataframe_by_doi` function: ``` r nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", @@ -118,6 +116,11 @@ nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", ## tenure = col_double() ## ) +which by default reads in the ingested file (not the original dta) by +the +[`readr::read_tsv`](https://readr.tidyverse.org/reference/read_delim.html) +function. + Alternatively, we can download the same file by specifying the filename and the DOI of the “dataset” (in Dataverse, a collection of files is called a dataset). @@ -130,14 +133,16 @@ nlsw_tsv <- get_dataframe_by_name(file = "nlsw88.tab", ## Warning in get_dataframe_by_id(fileid, FUN, original = original, ...): Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE. -Many file formats are translated into an ingested, or “archival” -version, which is application-neutral and easily-readable. -`read_dataframe` takes this ingested version as a default by deafaulting -`original = FALSE`. This is safer because you may not have the -properietary software that was originally used. On the other hand, using -the ingested version may lead to loss of information. +Now, Dataverse often translates rectangular data into an ingested, or +“archival” version, which is application-neutral and easily-readable. +`get_dataframe_*` defaults to taking this ingested version rather than +using the original, through the argument `original = FALSE`.
+ +This default is safe because you may not have the proprietary software +that was originally used. On the other hand, the data may have lost +information during ingestion. -To read the same file but its original version, specify +Instead, to read the same file but its original version, specify `original = TRUE` and set a `FUN` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. @@ -152,41 +157,33 @@ nlsw_original <- get_dataframe_by_name(file = "nlsw88.tab", Note that even though the file prefix is “.tab”, we use `read_dta`. +Of course, when the dataset is not ingested (such as a Rds file), users +would always need to specify a `FUN` argument for the specific file. + Note the difference between `nls_tsv` and `nls_original`. `nls_original` preserves the data attributes like value labels, whereas `nls_tsv` has dropped this or left this in file metadata. ``` r -class(nlsw_tsv$race) -class(nlsw_original$race) - -head(nlsw_tsv$race) -head(haven::as_factor(nlsw_original$race)) +class(nlsw_tsv$race) # tab ingested version only has numeric data ``` ## [1] "numeric" - ## [1] "haven_labelled" "vctrs_vctr" "double" - ## [1] 2 2 2 1 1 1 - ## [1] black black black white white white - ## Levels: white black other - -You may know the underlying file ID, which is a single numeric number -unique to the dataset. In this case, the fileid is `1734017` ``` r -nlsw <- get_dataframe_by_id(fileid = 1734017, - FUN = haven::read_dta, - original = TRUE, - server = "demo.dataverse.org") +attr(nlsw_original$race, "labels") # original dta has value labels ``` + ## white black other + ## 1 2 3 + #### Reading a dataset as a binary file. -In some cases, you may not need to render the raw binary file, or you do -not have the functions to do so in R, so you want to write these into -your local disk. To take only the raw files, use the `get_file` -commands.
The arguments are equivalent, except we do need a \`FUN\` -argument +In some cases, you may not want to read in the data in your environment, +perhaps because that is not possible (e.g. for a `.docx` file), and you +want to simply write these files your local disk. To do this, use the +more primitive `get_file_*` commands. The arguments are equivalent, +except we no longer need a `FUN` argument ``` r nlsw_raw <- get_file_by_name(file = "nlsw88.tab", @@ -197,6 +194,8 @@ class(nlsw_raw) ## [1] "raw" +#### Reading file metadata + The function `get_file_metadata` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. The function `get_dataset` will retrieve the list of files in a dataset. @@ -217,9 +216,8 @@ get_dataset(dataset = "10.70122/FK2/PPIAXE", ### Data Discovery -Dataverse supplies a pretty robust search API to discover Dataverses, -datasets, and files. The simplest searches simply consist of a query -string: +Dataverse supplies a robust search API to discover Dataverses, datasets, +and files. The simplest searches simply consist of a query string: ``` r dataverse_search("Gary King") @@ -235,40 +233,9 @@ And searches can be restricted to specific types of objects (Dataverse, dataset, or file): ``` r -str(dataverse_search(author = "Gary King", type = "dataset"), 1) +dataverse_search(author = "Gary King", type = "dataset") ``` - ## 10 of 701 results retrieved - - ## 'data.frame': 10 obs. of 27 variables: - ## $ name : chr "10 Million International Dyadic Events" "1479 data points of covid19 policy response times" "A Comparative Analysis of Brazil's Foreign Policy Drivers Towards the USA: Comment on Amorim Neto (2011)" "A Framework to Quantify the Signs of Abandonment in Online Digital Humanities Projects" ... - ## $ type : chr "dataset" "dataset" "dataset" "dataset" ... 
- ## $ url : chr "https://doi.org/10.7910/DVN/BTMQA0" "https://doi.org/10.7910/DVN/6VMRYG" "https://doi.org/10.7910/DVN/K6H0LV" "https://doi.org/10.34894/YNQOQT" ... - ## $ global_id : chr "doi:10.7910/DVN/BTMQA0" "doi:10.7910/DVN/6VMRYG" "doi:10.7910/DVN/K6H0LV" "doi:10.34894/YNQOQT" ... - ## $ description : chr "When the Palestinians launch a mortar attack into Israel, the Israeli army does not wait until the end of the c"| __truncated__ "a data set of 1479 time data points of policy responses to covid19" "This paper looks at the main finding by Amorim Neto (2011), namely that Brazil's power explains why it distance"| __truncated__ "Abstract of paper 0429 presented at the Digital Humanities Conference 2019 (DH2019), Utrecht , the Netherlands "| __truncated__ ... - ## $ published_at : chr "2014-08-21T00:00:00Z" "2020-09-14T20:25:40Z" "2017-10-30T20:30:58Z" "2020-06-20T00:00:11Z" ... - ## $ publisher : chr "Gary King Dataverse" "Jose Oriol Lopez Berengueres Dataverse" "Francisco Urdinez Dataverse" "DataverseNL Harvested Dataverse" ... - ## $ citationHtml : chr "King, Gary; Lowe, Will, 2008, \"10 Million International Dyadic Events\", Date: Wed, 30 Dec 2020 17:17:55 -0600 Subject: [PATCH 42/75] readr is only in imports, not suggests. @kuriwaki, this additional reference throws a warning in R check, so I'll remove it. I think it's central enough to be in Imports, and not Suggests https://github.com/IQSS/dataverse-client-r/commit/51581bad0c0e4f9cb645278bf9e649b6437acee0# --- DESCRIPTION | 1 - 1 file changed, 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0d63e38..95ae618 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,7 +51,6 @@ Suggests: purrr, testthat, UNF, - readr, yaml Description: Provides access to Dataverse version 4 APIs , enabling data search, retrieval, and deposit. 
For Dataverse versions <= 4.0, From 79ca78982a9a88123fc098b7c2faea8915505a84 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Wed, 30 Dec 2020 17:24:25 -0600 Subject: [PATCH 43/75] add haven to Suggests & check in examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit addresses the warning: > checking for unstated dependencies in examples ... WARNING '::' or ':::' import not declared from: ‘haven’ ref #66, cc: @kuriwaki --- DESCRIPTION | 1 + R/get_file_as_dataframe.R | 14 ++++++++------ man/get_dataframe.Rd | 14 ++++++++------ 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 95ae618..778a441 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,6 +47,7 @@ Suggests: checkmate, covr, foreign, + haven, knitr, purrr, testthat, diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index fe8fc14..2523b23 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -37,12 +37,14 @@ #' # To use the original version, or for non-ingested data, #' # please specify `orginal = TRUE` and specify a function in FUN #' -#' stata_df <- get_dataframe_by_name( -#' file = "nlsw88.tab", -#' dataset = "doi:10.70122/FK2/PPIAXE", -#' server = "demo.dataverse.org", -#' original = TRUE, -#' FUN = haven::read_dta) +#' if (requireNamespace("haven", quietly = T)) { +#' stata_df <- get_dataframe_by_name( +#' file = "nlsw88.tab", +#' dataset = "doi:10.70122/FK2/PPIAXE", +#' server = "demo.dataverse.org", +#' original = TRUE, +#' FUN = haven::read_dta) +#' } #' #' rds_df <- get_dataframe_by_name( #' file = "nlsw88_rds-export.rds", diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 479b66b..2cb9ae1 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -75,12 +75,14 @@ stata_df <- get_dataframe_by_name( # To use the original version, or for non-ingested data, # please specify `orginal = TRUE` and specify a function in FUN -stata_df <- get_dataframe_by_name( - file = 
"nlsw88.tab", - dataset = "doi:10.70122/FK2/PPIAXE", - server = "demo.dataverse.org", - original = TRUE, - FUN = haven::read_dta) +if (requireNamespace("haven", quietly = T)) { + stata_df <- get_dataframe_by_name( + file = "nlsw88.tab", + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org", + original = TRUE, + FUN = haven::read_dta) +} rds_df <- get_dataframe_by_name( file = "nlsw88_rds-export.rds", From 1eab3f1ab6ca9feba1c7f168ede7a6af13d56bab Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Wed, 30 Dec 2020 17:26:17 -0600 Subject: [PATCH 44/75] spell out 'TRUE' --- R/get_file_as_dataframe.R | 2 +- man/get_dataframe.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 2523b23..ae86d36 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -37,7 +37,7 @@ #' # To use the original version, or for non-ingested data, #' # please specify `orginal = TRUE` and specify a function in FUN #' -#' if (requireNamespace("haven", quietly = T)) { +#' if (requireNamespace("haven", quietly = TRUE)) { #' stata_df <- get_dataframe_by_name( #' file = "nlsw88.tab", #' dataset = "doi:10.70122/FK2/PPIAXE", diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 2cb9ae1..d192c8c 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -75,7 +75,7 @@ stata_df <- get_dataframe_by_name( # To use the original version, or for non-ingested data, # please specify `orginal = TRUE` and specify a function in FUN -if (requireNamespace("haven", quietly = T)) { +if (requireNamespace("haven", quietly = TRUE)) { stata_df <- get_dataframe_by_name( file = "nlsw88.tab", dataset = "doi:10.70122/FK2/PPIAXE", From d3b2f5b85b1468184c352f46cac06126787ffae1 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Wed, 30 Dec 2020 17:30:15 -0600 Subject: [PATCH 45/75] spell out 'fileid' parameter in documentation Addresses the error: N checking R code for possible problems (7.9s) 
get_dataframe_by_doi: warning in get_dataframe_by_id(file = filedoi, FUN = FUN, original = original, ...): partial argument match of 'file' to 'fileid' ref #66, cc: @kuriwaki, tell me if you disagree with this change --- R/get_file_as_dataframe.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index ae86d36..0e2f84b 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -120,7 +120,7 @@ get_dataframe_by_doi <- function(filedoi, filedoi <- prepend_doi(filedoi) # get_file can also take doi now - get_dataframe_by_id(file = filedoi, FUN = FUN, original = original, ...) + get_dataframe_by_id(fileid = filedoi, FUN = FUN, original = original, ...) } #' Write to temp and apply function From 21c793fa76670eb8f4a48e7e02da1998e6c1547b Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Wed, 30 Dec 2020 21:02:09 -0600 Subject: [PATCH 46/75] export `is_ingested()` @kuriwaki, I "exported" it so the documentation examples would work. Tell me if you think it doesn't need to be in the examples. ref #66 --- NAMESPACE | 1 + R/utils.R | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b3675db..f31c29d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ export(get_file_by_name) export(get_file_metadata) export(get_user_key) export(initiate_sword_dataset) +export(is_ingested) export(list_datasets) export(publish_dataset) export(publish_dataverse) diff --git a/R/utils.R b/R/utils.R index 42f4ed3..ff40a9a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -96,11 +96,11 @@ get_fileid.dataverse_file <- function(x, ...) 
{ #' is_ingested(fileid = "doi:10.70122/FK2/PPIAXE/SUCFNI", #' server = "demo.dataverse.org") #' +#' @export is_ingested <- function(fileid, server = Sys.getenv("DATAVERSE_SERVER")) { ping_metadata <- tryCatch(get_file_metadata(fileid, server = server), error = function(e) e) - is_ingested <- !inherits(ping_metadata, "error") # if error, not ingested - is_ingested + !inherits(ping_metadata, "error") # if error, not ingested } From d6f0c80a2f409f30ccb68446b3146f3c72b9d303 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Thu, 31 Dec 2020 00:12:19 -0500 Subject: [PATCH 47/75] Change the param description of dataSET. It shouldn't be a integer, but rather a persistentID. --- man-roxygen/ds.R | 4 +++- man/add_dataset_file.Rd | 4 +++- man/create_dataset.Rd | 4 +++- man/dataset_versions.Rd | 4 +++- man/delete_dataset.Rd | 4 +++- man/files.Rd | 5 ++--- man/get_dataframe.Rd | 26 ++++++++++++++++++-------- man/get_dataset.Rd | 4 +++- man/publish_dataset.Rd | 4 +++- 9 files changed, 41 insertions(+), 18 deletions(-) diff --git a/man-roxygen/ds.R b/man-roxygen/ds.R index 5391eaa..efe78d0 100644 --- a/man-roxygen/ds.R +++ b/man-roxygen/ds.R @@ -1 +1,3 @@ -#' @param dataset An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}). +#' @param dataset A character specifying a persistent identification ID for a dataset, +#' for example `"doi:10.70122/FK2/HXJVJU"`. Alternatively, an object of class +#' \dQuote{dataverse_dataset} obtained by `dataverse_contents()`. 
diff --git a/man/add_dataset_file.Rd b/man/add_dataset_file.Rd index c2b2b15..c0a74d7 100644 --- a/man/add_dataset_file.Rd +++ b/man/add_dataset_file.Rd @@ -28,7 +28,9 @@ update_dataset_file( \arguments{ \item{file}{A character string} -\item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} \item{description}{Optionally, a character string providing a description of the file.} diff --git a/man/create_dataset.Rd b/man/create_dataset.Rd index 4ca5a8b..948af53 100644 --- a/man/create_dataset.Rd +++ b/man/create_dataset.Rd @@ -40,7 +40,9 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \code{\link[httr]{GET}}, \code{\link[httr]{POST}}, or \code{\link[httr]{DELETE}}.} -\item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} } \value{ An object of class \dQuote{dataverse_dataset}. 
diff --git a/man/dataset_versions.Rd b/man/dataset_versions.Rd index 31af3cc..851177c 100644 --- a/man/dataset_versions.Rd +++ b/man/dataset_versions.Rd @@ -12,7 +12,9 @@ dataset_versions( ) } \arguments{ -\item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} \item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. diff --git a/man/delete_dataset.Rd b/man/delete_dataset.Rd index 56a2744..ece4252 100644 --- a/man/delete_dataset.Rd +++ b/man/delete_dataset.Rd @@ -12,7 +12,9 @@ delete_dataset( ) } \arguments{ -\item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} \item{key}{A character string specifying a Dataverse server API key. If one is not specified, functions calling authenticated API endpoints will fail. diff --git a/man/files.Rd b/man/files.Rd index 2cda584..03916c3 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -120,12 +120,12 @@ This function provides access to data files from a Dataverse entry. 
\dontrun{ # 1. Using filename and dataverse -f1 <- get_file_by_name("nlsw88.tab", +f1 <- get_file_by_name(filename = "nlsw88.tab", dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") # 2. Using file DOI -f2 <- get_file_by_doi("10.70122/FK2/PPIAXE/MHDB0O", +f2 <- get_file_by_doi(filedoi = "10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") # 3. Two-steps: Find ID from get_dataset @@ -133,7 +133,6 @@ d3 <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") - # 4. Retrieve multiple raw data in list f4_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org")$files$id diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index d192c8c..ed1c080 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -6,16 +6,21 @@ \alias{get_dataframe_by_doi} \title{Get file from dataverse and convert it into a dataframe or tibble} \usage{ -get_dataframe_by_name(file, dataset = NULL, FUN = NULL, original = FALSE, ...) +get_dataframe_by_name( + filename, + dataset = NULL, + FUN = NULL, + original = FALSE, + ... +) get_dataframe_by_id(fileid, FUN = NULL, original = FALSE, ...) get_dataframe_by_doi(filedoi, FUN = NULL, original = FALSE, ...) } \arguments{ -\item{file}{to be passed on to get_file} - -\item{dataset}{to be passed on to get_file} +\item{filename}{The name of the file of interest, with file extension. e.g. +\code{"roster-bulls-1996.tab"}.} \item{FUN}{The function to used for reading in the raw dataset. 
This user must choose the appropriate function: for example if the target is a .rds @@ -30,6 +35,11 @@ with a specified \code{FUN} is better.} \item{...}{ Arguments passed on to \code{\link[=get_file]{get_file}} \describe{ + \item{\code{file}}{An integer specifying a file identifier; or a vector of integers +specifying file identifiers; or, if used with the prefix \code{"doi:"}, a +character with the file-specific DOI; or, if used without the prefix, a +filename accompanied by a dataset DOI in the \code{dataset} argument, or an object of +class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} \item{\code{format}}{A character string specifying a file format for download. by default, this is \dQuote{original} (the original file format). If \code{NULL}, no query is added, so ingested files are returned in their ingested TSV form. @@ -62,13 +72,13 @@ library(readr) # load dataset from file name and dataverse DOI csv_tab <- get_dataframe_by_name( - file = "roster-bulls-1996.tab", + filename = "roster-bulls-1996.tab", dataset = "doi:10.70122/FK2/HXJVJU", server = "demo.dataverse.org") # or a Stata dta stata_df <- get_dataframe_by_name( - file = "nlsw88.tab", + filename = "nlsw88.tab", dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") @@ -77,7 +87,7 @@ stata_df <- get_dataframe_by_name( if (requireNamespace("haven", quietly = TRUE)) { stata_df <- get_dataframe_by_name( - file = "nlsw88.tab", + filename = "nlsw88.tab", dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org", original = TRUE, @@ -85,7 +95,7 @@ if (requireNamespace("haven", quietly = TRUE)) { } rds_df <- get_dataframe_by_name( - file = "nlsw88_rds-export.rds", + filename = "nlsw88_rds-export.rds", dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org", FUN = readr::read_rds) diff --git a/man/get_dataset.Rd b/man/get_dataset.Rd index c0c55e0..2611456 100644 --- a/man/get_dataset.Rd +++ b/man/get_dataset.Rd @@ -32,7 +32,9 @@ dataset_files( ) } 
\arguments{ -\item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} \item{version}{A character string specifying a version of the dataset. This can be one of \dQuote{:draft} (the current draft), \dQuote{:latest} (the latest draft, if it exists, or the latest published version), \dQuote{:latest-published} (the latest published version, ignoring any draft), or \dQuote{x.y} (where \samp{x} is a major version and \samp{y} is a minor version; the \samp{.y} can be omitted to obtain a major version). In lieu of this, a dataset's version-specific identification number can be used for the \code{dataset} argument.} diff --git a/man/publish_dataset.Rd b/man/publish_dataset.Rd index 2414f77..33f0392 100644 --- a/man/publish_dataset.Rd +++ b/man/publish_dataset.Rd @@ -13,7 +13,9 @@ publish_dataset( ) } \arguments{ -\item{dataset}{An integer specifying a dataset identification number or an object of class \dQuote{dataverse_dataset}. The identification number is the dataset's persistent identification number (not the integer specifying a specific version of the dataset, such as returned by \code{\link{dataset_versions}}).} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. 
Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} \item{minor}{A logical specifying whether the new release of the dataset is a \dQuote{minor} release (\code{TRUE}, by default), resulting in a minor version increase (e.g., from 1.1 to 1.2). If \code{FALSE}, the dataset is given a \dQuote{major} release (e.g., from 1.1 to 2.0).} From 993f9c16cabf856ffd9023b360e1db8bb3919c26 Mon Sep 17 00:00:00 2001 From: Shiro Kuriwaki Date: Thu, 31 Dec 2020 00:17:04 -0500 Subject: [PATCH 48/75] Update argument names and descriptions for filename and dataset --- R/get_file.R | 8 +++----- R/get_file_as_dataframe.R | 16 ++++++++-------- README.Rmd | 6 +++--- README.md | 8 ++++---- man/files.Rd | 4 ++++ man/get_dataframe.Rd | 6 +++++- man/get_file_metadata.Rd | 4 ++++ 7 files changed, 31 insertions(+), 21 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index d466026..1538427 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -32,6 +32,7 @@ #' #' @template envvars #' @template dots +#' @template ds #' #' @return \code{get_file} returns a raw vector (or list of raw vectors, #' if \code{length(file) > 1}), which can be saved locally with the `writeBin` @@ -44,12 +45,12 @@ #' \dontrun{ #' #' # 1. Using filename and dataverse -#' f1 <- get_file_by_name("nlsw88.tab", +#' f1 <- get_file_by_name(filename = "nlsw88.tab", #' dataset = "10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org") #' #' # 2. Using file DOI -#' f2 <- get_file_by_doi("10.70122/FK2/PPIAXE/MHDB0O", +#' f2 <- get_file_by_doi(filedoi = "10.70122/FK2/PPIAXE/MHDB0O", #' server = "demo.dataverse.org") #' #' # 3. Two-steps: Find ID from get_dataset @@ -57,7 +58,6 @@ #' f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") #' #' -#' #' # 4. 
Retrieve multiple raw data in list #' f4_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org")$files$id @@ -141,8 +141,6 @@ get_file <- #' (for example, if nlsw88.dta was the original but is displayed as the ingested #' nlsw88.tab, use the ingested version.) #' -#' @inheritParams get_file -#' #' @export get_file_by_name <- function(filename, dataset, diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 0e2f84b..17f0406 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -5,8 +5,8 @@ #' #' @rdname get_dataframe #' -#' @param file to be passed on to get_file -#' @param dataset to be passed on to get_file +#' @param filename The name of the file of interest, with file extension, for example +#' `"roster-bulls-1996.tab"`. #' @param FUN The function to used for reading in the raw dataset. This user #' must choose the appropriate function: for example if the target is a .rds #' file, then `FUN` should be `readRDS` or `readr::read_rds`. 
@@ -24,13 +24,13 @@ #' #' # load dataset from file name and dataverse DOI #' csv_tab <- get_dataframe_by_name( -#' file = "roster-bulls-1996.tab", +#' filename = "roster-bulls-1996.tab", #' dataset = "doi:10.70122/FK2/HXJVJU", #' server = "demo.dataverse.org") #' #' # or a Stata dta #' stata_df <- get_dataframe_by_name( -#' file = "nlsw88.tab", +#' filename = "nlsw88.tab", #' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org") #' @@ -39,7 +39,7 @@ #' #' if (requireNamespace("haven", quietly = TRUE)) { #' stata_df <- get_dataframe_by_name( -#' file = "nlsw88.tab", +#' filename = "nlsw88.tab", #' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org", #' original = TRUE, @@ -47,7 +47,7 @@ #' } #' #' rds_df <- get_dataframe_by_name( -#' file = "nlsw88_rds-export.rds", +#' filename = "nlsw88_rds-export.rds", #' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org", #' FUN = readr::read_rds) @@ -60,7 +60,7 @@ #' FUN = haven::read_dta #' ) #' @export -get_dataframe_by_name <- function(file, +get_dataframe_by_name <- function(filename, dataset = NULL, FUN = NULL, original = FALSE, @@ -68,7 +68,7 @@ get_dataframe_by_name <- function(file, # retrieve ID fileid <- get_fileid.character(x = dataset, - file = file, + file = filename, ...) get_dataframe_by_id(fileid, FUN, original = original, ...) diff --git a/README.Rmd b/README.Rmd index 063bd80..7460f4e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -73,7 +73,7 @@ which by default reads in the ingested file (not the original dta) by the [`read Alternatively, we can download the same file by specifying the filename and the DOI of the "dataset" (in Dataverse, a collection of files is called a dataset). 
```{r get_dataframe_by_name_tsv, message=FALSE} -nlsw_tsv <- get_dataframe_by_name(file = "nlsw88.tab", +nlsw_tsv <- get_dataframe_by_name(filename = "nlsw88.tab", dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") ``` @@ -85,7 +85,7 @@ This default is safe because you may not have the proprietary software that was Instead, to read the same file but its original version, specify `original = TRUE` and set a `FUN` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. ```{r get_dataframe_by_name_original} -nlsw_original <- get_dataframe_by_name(file = "nlsw88.tab", +nlsw_original <- get_dataframe_by_name(filename = "nlsw88.tab", dataset = "10.70122/FK2/PPIAXE", FUN = haven::read_dta, original = TRUE, @@ -115,7 +115,7 @@ attr(nlsw_original$race, "labels") # original dta has value labels In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a `.docx` file), and you want to simply write these files your local disk. To do this, use the more primitive `get_file_*` commands. The arguments are equivalent, except we no longer need a `FUN` argument ```{r get_file_by_name} -nlsw_raw <- get_file_by_name(file = "nlsw88.tab", +nlsw_raw <- get_file_by_name(filename = "nlsw88.tab", dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") class(nlsw_raw) diff --git a/README.md b/README.md index c0de64a..7eb8981 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") ``` - ## Warning in get_dataframe_by_id(file = filedoi, FUN = FUN, original = original, : Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE. + ## Warning in get_dataframe_by_id(fileid = filedoi, FUN = FUN, original = original, : Downloading ingested version of data with read_tsv. 
To download the original version and remove this warning, set original = TRUE. ## Parsed with column specification: ## cols( @@ -126,7 +126,7 @@ and the DOI of the “dataset” (in Dataverse, a collection of files is called a dataset). ``` r -nlsw_tsv <- get_dataframe_by_name(file = "nlsw88.tab", +nlsw_tsv <- get_dataframe_by_name(filename = "nlsw88.tab", dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") ``` @@ -148,7 +148,7 @@ Instead, to read the same file but its original version, specify `haven::read_dta` function. ``` r -nlsw_original <- get_dataframe_by_name(file = "nlsw88.tab", +nlsw_original <- get_dataframe_by_name(filename = "nlsw88.tab", dataset = "10.70122/FK2/PPIAXE", FUN = haven::read_dta, original = TRUE, @@ -186,7 +186,7 @@ more primitive `get_file_*` commands. The arguments are equivalent, except we no longer need a `FUN` argument ``` r -nlsw_raw <- get_file_by_name(file = "nlsw88.tab", +nlsw_raw <- get_file_by_name(filename = "nlsw88.tab", dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") class(nlsw_raw) diff --git a/man/files.Rd b/man/files.Rd index 03916c3..da42bf0 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -58,6 +58,10 @@ character with the file-specific DOI; or, if used without the prefix, a filename accompanied by a dataset DOI in the \code{dataset} argument, or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} + \item{format}{A character string specifying a file format for download. by default, this is \dQuote{original} (the original file format). If \code{NULL}, no query is added, so ingested files are returned in their ingested TSV form. 
diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index ed1c080..a7c0ca8 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -19,9 +19,13 @@ get_dataframe_by_id(fileid, FUN = NULL, original = FALSE, ...) get_dataframe_by_doi(filedoi, FUN = NULL, original = FALSE, ...) } \arguments{ -\item{filename}{The name of the file of interest, with file extension. e.g. +\item{filename}{The name of the file of interest, with file extension, for example \code{"roster-bulls-1996.tab"}.} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} + \item{FUN}{The function to used for reading in the raw dataset. This user must choose the appropriate function: for example if the target is a .rds file, then \code{FUN} should be \code{readRDS} or \code{readr::read_rds}.} diff --git a/man/get_file_metadata.Rd b/man/get_file_metadata.Rd index 9291066..14fb258 100644 --- a/man/get_file_metadata.Rd +++ b/man/get_file_metadata.Rd @@ -20,6 +20,10 @@ character with the file-specific DOI; or, if used without the prefix, a filename accompanied by a dataset DOI in the \code{dataset} argument, or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} +\item{dataset}{A character specifying a persistent identification ID for a dataset, +for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class +\dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} + \item{format}{Defaults to \dQuote{ddi} for metadata files} \item{key}{A character string specifying a Dataverse server API key. If one From f2be27c10f72c258137e167e803874fa2fc66d16 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 10:08:14 -0600 Subject: [PATCH 49/75] complete some documentation checkboxes @kuriwaki, can you please add a description for this parameter? 
ref #66 --- R/get_file.R | 1 + R/utils.R | 16 +++++++++++++--- man/files.Rd | 2 ++ man/get_file_metadata.Rd | 2 ++ man/is_ingested.Rd | 16 +++++++++++++++- 5 files changed, 33 insertions(+), 4 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index d466026..771bc02 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -21,6 +21,7 @@ #' character with the file-specific DOI; or, if used without the prefix, a #' filename accompanied by a dataset DOI in the `dataset` argument, or an object of #' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. +#' @param dataset @kuriwaki, can you please add a description for this parameter? #' @param format A character string specifying a file format for download. #' by default, this is \dQuote{original} (the original file format). If `NULL`, #' no query is added, so ingested files are returned in their ingested TSV form. diff --git a/R/utils.R b/R/utils.R index ff40a9a..a7a9a37 100644 --- a/R/utils.R +++ b/R/utils.R @@ -85,6 +85,7 @@ get_fileid.dataverse_file <- function(x, ...) { #' Identify if file is an ingested file #' #' @param fileid A numeric fileid or file-specific DOI +#' @template envvars #' #' @examples #' # https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/X5MUPQ/T0KKUZ @@ -97,9 +98,18 @@ get_fileid.dataverse_file <- function(x, ...) 
{ #' server = "demo.dataverse.org") #' #' @export -is_ingested <- function(fileid, server = Sys.getenv("DATAVERSE_SERVER")) { - ping_metadata <- tryCatch(get_file_metadata(fileid, server = server), - error = function(e) e) +is_ingested <- + function( + fileid, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER") + ) { + ping_metadata <- tryCatch( + { + get_file_metadata(fileid, key = key, server = server) + }, + error = function(e) e + ) !inherits(ping_metadata, "error") # if error, not ingested } diff --git a/man/files.Rd b/man/files.Rd index 2cda584..7c14800 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -58,6 +58,8 @@ character with the file-specific DOI; or, if used without the prefix, a filename accompanied by a dataset DOI in the \code{dataset} argument, or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} +\item{dataset}{@kuriwaki, can you please add a description for this parameter?} + \item{format}{A character string specifying a file format for download. by default, this is \dQuote{original} (the original file format). If \code{NULL}, no query is added, so ingested files are returned in their ingested TSV form. diff --git a/man/get_file_metadata.Rd b/man/get_file_metadata.Rd index 9291066..b411af9 100644 --- a/man/get_file_metadata.Rd +++ b/man/get_file_metadata.Rd @@ -20,6 +20,8 @@ character with the file-specific DOI; or, if used without the prefix, a filename accompanied by a dataset DOI in the \code{dataset} argument, or an object of class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}.} +\item{dataset}{@kuriwaki, can you please add a description for this parameter?} + \item{format}{Defaults to \dQuote{ddi} for metadata files} \item{key}{A character string specifying a Dataverse server API key. 
If one diff --git a/man/is_ingested.Rd b/man/is_ingested.Rd index 501f935..b92fe02 100644 --- a/man/is_ingested.Rd +++ b/man/is_ingested.Rd @@ -4,10 +4,24 @@ \alias{is_ingested} \title{Identify if file is an ingested file} \usage{ -is_ingested(fileid, server = Sys.getenv("DATAVERSE_SERVER")) +is_ingested( + fileid, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER") +) } \arguments{ \item{fileid}{A numeric fileid or file-specific DOI} + +\item{key}{A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +\code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} + +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} } \description{ Identify if file is an ingested file From 904f9597461160a2442d17b99b0318da27974980 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 14:08:07 -0600 Subject: [PATCH 50/75] hide visibility of `is_ingested()` @kuriwaki, I kinda changed my mind. At least for now, let's keep this private, so it's one less thing that we have to worry about for backward compatibility. It will still be available through `:::`. If it's helpful to others, we'll make it public. 
ref #66 --- NAMESPACE | 1 - R/utils.R | 21 ++++++++++----------- man/is_ingested.Rd | 11 ----------- 3 files changed, 10 insertions(+), 23 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index f31c29d..b3675db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,7 +40,6 @@ export(get_file_by_name) export(get_file_metadata) export(get_user_key) export(initiate_sword_dataset) -export(is_ingested) export(list_datasets) export(publish_dataset) export(publish_dataverse) diff --git a/R/utils.R b/R/utils.R index a7a9a37..df36f49 100644 --- a/R/utils.R +++ b/R/utils.R @@ -87,17 +87,16 @@ get_fileid.dataverse_file <- function(x, ...) { #' @param fileid A numeric fileid or file-specific DOI #' @template envvars #' -#' @examples -#' # https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/X5MUPQ/T0KKUZ -#' # nlsw88.tab -#' is_ingested(fileid = "doi:10.70122/FK2/X5MUPQ/T0KKUZ", -#' server = "demo.dataverse.org") -#' -#' # nlsw88_rds-export.rds -#' is_ingested(fileid = "doi:10.70122/FK2/PPIAXE/SUCFNI", -#' server = "demo.dataverse.org") -#' -#' @export +# @examples +# # https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/X5MUPQ/T0KKUZ +# # nlsw88.tab +# is_ingested(fileid = "doi:10.70122/FK2/X5MUPQ/T0KKUZ", +# server = "demo.dataverse.org") +# +# # nlsw88_rds-export.rds +# is_ingested(fileid = "doi:10.70122/FK2/PPIAXE/SUCFNI", +# server = "demo.dataverse.org") +# is_ingested <- function( fileid, diff --git a/man/is_ingested.Rd b/man/is_ingested.Rd index b92fe02..b46c456 100644 --- a/man/is_ingested.Rd +++ b/man/is_ingested.Rd @@ -26,14 +26,3 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \description{ Identify if file is an ingested file } -\examples{ -# https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/X5MUPQ/T0KKUZ -# nlsw88.tab -is_ingested(fileid = "doi:10.70122/FK2/X5MUPQ/T0KKUZ", - server = "demo.dataverse.org") - -# nlsw88_rds-export.rds -is_ingested(fileid = "doi:10.70122/FK2/PPIAXE/SUCFNI", - 
server = "demo.dataverse.org") - -} From 4f2d4accbbbccf2d2640730a2cb465cb4be0a3ed Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 15:45:33 -0600 Subject: [PATCH 51/75] tweak progression & formatting of `get_dataframe()` example. ref #66 --- R/get_file_as_dataframe.R | 79 +++++++++++++++++++++++---------------- man/get_dataframe.Rd | 77 ++++++++++++++++++++++---------------- 2 files changed, 90 insertions(+), 66 deletions(-) diff --git a/R/get_file_as_dataframe.R b/R/get_file_as_dataframe.R index 17f0406..6158c75 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_file_as_dataframe.R @@ -20,45 +20,56 @@ #' @importFrom readr read_tsv #' #' @examples -#' library(readr) #' -#' # load dataset from file name and dataverse DOI -#' csv_tab <- get_dataframe_by_name( -#' filename = "roster-bulls-1996.tab", -#' dataset = "doi:10.70122/FK2/HXJVJU", -#' server = "demo.dataverse.org") +#' # Retrieve data.frame from dataverse DOI and file name +#' df_from_rds_ingested <- +#' get_dataframe_by_name( +#' filename = "roster-bulls-1996.tab", +#' dataset = "doi:10.70122/FK2/HXJVJU", +#' server = "demo.dataverse.org" +#' ) #' -#' # or a Stata dta -#' stata_df <- get_dataframe_by_name( -#' filename = "nlsw88.tab", -#' dataset = "doi:10.70122/FK2/PPIAXE", -#' server = "demo.dataverse.org") +#' # Retrieve the same data.frame from dataverse + file DOI +#' df_from_rds_ingested_by_doi <- +#' get_dataframe_by_doi( +#' filedoi = "10.70122/FK2/HXJVJU/SA3Z2V", +#' server = "demo.dataverse.org" +#' ) #' -#' # To use the original version, or for non-ingested data, -#' # please specify `orginal = TRUE` and specify a function in FUN +#' # Retrieve ingested file originally a Stata dta +#' df_from_stata_ingested <- +#' get_dataframe_by_name( +#' filename = "nlsw88.tab", +#' dataset = "doi:10.70122/FK2/PPIAXE", +#' server = "demo.dataverse.org" +#' ) #' -#' if (requireNamespace("haven", quietly = TRUE)) { -#' stata_df <- get_dataframe_by_name( -#' filename = "nlsw88.tab", -#' dataset = 
"doi:10.70122/FK2/PPIAXE", -#' server = "demo.dataverse.org", -#' original = TRUE, -#' FUN = haven::read_dta) -#' } #' -#' rds_df <- get_dataframe_by_name( -#' filename = "nlsw88_rds-export.rds", -#' dataset = "doi:10.70122/FK2/PPIAXE", -#' server = "demo.dataverse.org", -#' FUN = readr::read_rds) +#' # To use the original file version, or for non-ingested data, +#' # please specify `orginal = TRUE` and specify a function in FUN. +#' +#' # A data.frame is still returned, but the +#' if (requireNamespace("readr", quietly = TRUE)) { +#' df_from_rds_original <- +#' get_dataframe_by_name( +#' filename = "nlsw88_rds-export.rds", +#' dataset = "doi:10.70122/FK2/PPIAXE", +#' server = "demo.dataverse.org", +#' original = TRUE, +#' FUN = readr::read_rds +#' ) +#' } #' -#' # equivalently, if you know the DOI -#' stata_df <- get_dataframe_by_doi( -#' filedoi = "10.70122/FK2/PPIAXE/MHDB0O", -#' server = "demo.dataverse.org", -#' original = TRUE, -#' FUN = haven::read_dta -#' ) +#' if (requireNamespace("haven", quietly = TRUE)) { +#' df_from_stata_original <- +#' get_dataframe_by_name( +#' filename = "nlsw88.tab", +#' dataset = "doi:10.70122/FK2/PPIAXE", +#' server = "demo.dataverse.org", +#' original = TRUE, +#' FUN = haven::read_dta +#' ) +#' } #' @export get_dataframe_by_name <- function(filename, dataset = NULL, @@ -133,5 +144,7 @@ get_dataframe_internal <- function(raw, filename, .f) { writeBin(raw, tmp) do.call(.f, list(tmp)) + + # TODO: unlink/delete tmp file in a try/catch/finally block. } diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index a7c0ca8..a21b06e 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -72,43 +72,54 @@ or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com") \code{get_dataframe_by_name} if you know the filename and doi. 
The dataset } \examples{ -library(readr) -# load dataset from file name and dataverse DOI -csv_tab <- get_dataframe_by_name( - filename = "roster-bulls-1996.tab", - dataset = "doi:10.70122/FK2/HXJVJU", - server = "demo.dataverse.org") +# Retrieve data.frame from dataverse DOI and file name +df_from_rds_ingested <- + get_dataframe_by_name( + filename = "roster-bulls-1996.tab", + dataset = "doi:10.70122/FK2/HXJVJU", + server = "demo.dataverse.org" + ) -# or a Stata dta -stata_df <- get_dataframe_by_name( - filename = "nlsw88.tab", - dataset = "doi:10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +# Retrieve the same data.frame from dataverse + file DOI +df_from_rds_ingested_by_doi <- + get_dataframe_by_doi( + filedoi = "10.70122/FK2/HXJVJU/SA3Z2V", + server = "demo.dataverse.org" + ) -# To use the original version, or for non-ingested data, -# please specify `orginal = TRUE` and specify a function in FUN +# Retrieve ingested file originally a Stata dta +df_from_stata_ingested <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" + ) -if (requireNamespace("haven", quietly = TRUE)) { - stata_df <- get_dataframe_by_name( - filename = "nlsw88.tab", - dataset = "doi:10.70122/FK2/PPIAXE", - server = "demo.dataverse.org", - original = TRUE, - FUN = haven::read_dta) -} -rds_df <- get_dataframe_by_name( - filename = "nlsw88_rds-export.rds", - dataset = "doi:10.70122/FK2/PPIAXE", - server = "demo.dataverse.org", - FUN = readr::read_rds) +# To use the original file version, or for non-ingested data, +# please specify `orginal = TRUE` and specify a function in FUN. 
-# equivalently, if you know the DOI -stata_df <- get_dataframe_by_doi( - filedoi = "10.70122/FK2/PPIAXE/MHDB0O", - server = "demo.dataverse.org", - original = TRUE, - FUN = haven::read_dta -) +# A data.frame is still returned, but the +if (requireNamespace("readr", quietly = TRUE)) { + df_from_rds_original <- + get_dataframe_by_name( + filename = "nlsw88_rds-export.rds", + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org", + original = TRUE, + FUN = readr::read_rds + ) +} + +if (requireNamespace("haven", quietly = TRUE)) { + df_from_stata_original <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org", + original = TRUE, + FUN = haven::read_dta + ) +} } From 39d4c302a786a29e8232501e9ba0bc2b38e43f19 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 16:27:40 -0600 Subject: [PATCH 52/75] rename `get_file_as_dataframe.R` to `get_dataframe.R` @kuriwaki, so the file name closer reflects your function name and the Rd name ref #66 --- R/{get_file_as_dataframe.R => get_dataframe.R} | 2 +- man/get_dataframe.Rd | 4 ++-- man/get_dataframe_internal.Rd | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename R/{get_file_as_dataframe.R => get_dataframe.R} (98%) diff --git a/R/get_file_as_dataframe.R b/R/get_dataframe.R similarity index 98% rename from R/get_file_as_dataframe.R rename to R/get_dataframe.R index 6158c75..c8cbe82 100644 --- a/R/get_file_as_dataframe.R +++ b/R/get_dataframe.R @@ -46,7 +46,7 @@ #' #' #' # To use the original file version, or for non-ingested data, -#' # please specify `orginal = TRUE` and specify a function in FUN. +#' # please specify `original = TRUE` and specify a function in FUN. 
#' #' # A data.frame is still returned, but the #' if (requireNamespace("readr", quietly = TRUE)) { diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index a21b06e..4e5f5d0 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_file_as_dataframe.R +% Please edit documentation in R/get_dataframe.R \name{get_dataframe_by_name} \alias{get_dataframe_by_name} \alias{get_dataframe_by_id} @@ -98,7 +98,7 @@ df_from_stata_ingested <- # To use the original file version, or for non-ingested data, -# please specify `orginal = TRUE` and specify a function in FUN. +# please specify `original = TRUE` and specify a function in FUN. # A data.frame is still returned, but the if (requireNamespace("readr", quietly = TRUE)) { diff --git a/man/get_dataframe_internal.Rd b/man/get_dataframe_internal.Rd index 1f46382..d53841a 100644 --- a/man/get_dataframe_internal.Rd +++ b/man/get_dataframe_internal.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_file_as_dataframe.R +% Please edit documentation in R/get_dataframe.R \name{get_dataframe_internal} \alias{get_dataframe_internal} \title{Write to temp and apply function} From a41ab8805072a062da2baabe55735c9d14054aad Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 17:36:17 -0600 Subject: [PATCH 53/75] include plain text in yaml expectation file ref #70 --- inst/dataset-basketball/expected-metadata.yml | 76 +++++++++++++++++++ tests/testthat/manual/seed/seed-yaml.R | 38 +++++++++- tests/testthat/tests-get_dataframe.R | 38 ++++++++++ 3 files changed, 150 insertions(+), 2 deletions(-) create mode 100644 tests/testthat/tests-get_dataframe.R diff --git a/inst/dataset-basketball/expected-metadata.yml b/inst/dataset-basketball/expected-metadata.yml index 8957bac..1a8aa94 100644 --- a/inst/dataset-basketball/expected-metadata.yml +++ 
b/inst/dataset-basketball/expected-metadata.yml @@ -24,6 +24,19 @@ roster: type: MD5 value: c6feabffac401627b80761c5a1de55f0 creationDate: '2020-12-29' + raw_value: "number,player,position,height,weight,dob,country_birth,experience_years,college\r\n0,Robert + Parish,C,7-0,230,\"August 30, 1953\",us,20,Centenary College of Louisiana\r\n1,Randy + Brown,PG,6-2,190,\"May 22, 1968\",us,5,\"Houston, New Mexico State\"\r\n6,Matt + Steigenga,SF,6-7,225,\"March 27, 1970\",us,0,Michigan State\r\n7,Toni Kukoč,SF,6-10,192,\"September + 18, 1968\",hr,3,\r\n8,Dickey Simpkins,PF,6-9,248,\"April 6, 1972\",us,2,Providence\r\n9,Ron + Harper,PG,6-6,185,\"January 20, 1964\",us,10,Miami University\r\n13,Luc Longley,C,7-2,265,\"January + 19, 1969\",au,5,New Mexico\r\n18,Bison Dele,C,6-9,235,\"April 6, 1969\",us,5,\"Maryland, Arizona\"\r\n23,Michael + Jordan,SG,6-6,195,\"February 17, 1963\",us,11,UNC\r\n25,Steve Kerr,PG,6-3,175,\"September + 27, 1965\",lb,8,Arizona\r\n30,Jud Buechler,SF,6-6,220,\"June 19, 1968\",us,6,Arizona\r\n33,Scottie + Pippen,SF,6-8,210,\"September 25, 1965\",us,9,University of Central Arkansas\r\n34,Bill + Wennington,C,7-0,245,\"April 26, 1963\",ca,9,St. John's\r\n35,Jason Caffey,PF,6-8,255,\"June + 12, 1973\",us,1,Alabama\r\n91,Dennis Rodman,PF,6-7,210,\"May 13, 1961\",us,10,Southeastern + Oklahoma State University\r\n" image: description: 'ID: 1734006. 
A svg file' label: vector-basketball.svg @@ -46,3 +59,66 @@ image: type: MD5 value: 8038c2efb57dd470e908ae2ad1ff70e0 creationDate: '2020-12-29' + raw_value: |+ + + + + + Created by potrace 1.15, written by Peter Selinger 2001-2017 + + + + + + diff --git a/tests/testthat/manual/seed/seed-yaml.R b/tests/testthat/manual/seed/seed-yaml.R index a582b95..94518a9 100644 --- a/tests/testthat/manual/seed/seed-yaml.R +++ b/tests/testthat/manual/seed/seed-yaml.R @@ -1,8 +1,10 @@ import::from("magrittr", "%>%") dv <- get_dataverse("dataverse-client-r") contents <- dataverse_contents(dv) -ds_1 <- dataset_files(contents[[1]]) +ds_1 <- dataset_files(contents[[1]]) %>% + rlang::set_names(c("roster", "image")) # Manually add friendly names to each file +# ---- seed-dataverses --------------------------------------------------- get_dataverse(":root") %>% base::append(c("testing_name" = ":root")) %>% yaml::write_yaml("inst/expected-dataverse-root.yml") @@ -11,10 +13,42 @@ dv %>% base::append(c("testing_name" = "dataverse-client-r")) %>% yaml::write_yaml("inst/expected-dataverse.yml") + +# ---- seed-basketball-files --------------------------------------------------- +file_csv <- + get_dataframe_by_name( + filename = "roster-bulls-1996.tab", + dataset = "doi:10.70122/FK2/HXJVJU", + original = TRUE, + FUN = readr::read_file + ) + +ds_1$roster$raw_value <- + get_dataframe_by_name( + # filename = "roster-bulls-1996.tab", + filename = ds_1$roster$label, + dataset = dirname(ds_1$roster$dataFile$persistentId), + original = TRUE, + FUN = readr::read_file + ) + +ds_1$image$raw_value <- + paste0( # The yaml needs a terminal new line to mirror the real content. 
+ get_dataframe_by_name( + # filename = "roster-bulls-1996.tab", + filename = ds_1$image$label, + dataset = dirname(ds_1$image$dataFile$persistentId), + original = TRUE, + FUN = readr::read_file + ), + "\n" + ) + ds_1 %>% - rlang::set_names(c("roster", "image")) %>% # Manually add friendly names to each file + # rlang::set_names(c("roster", "image")) %>% # Manually add friendly names to each file yaml::write_yaml("inst/dataset-basketball/expected-metadata.yml") + # retrieve-from-file ------------------------------------------------------ y <- yaml::read_yaml(system.file("dataset-basketball/expected-metadata.yml", package = "dataverse")) diff --git a/tests/testthat/tests-get_dataframe.R b/tests/testthat/tests-get_dataframe.R new file mode 100644 index 0000000..756c09e --- /dev/null +++ b/tests/testthat/tests-get_dataframe.R @@ -0,0 +1,38 @@ +# See https://demo.dataverse.org/dataverse/dataverse-client-r +# https://doi.org/10.70122/FK2/HXJVJU + +test_that("roster-original", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- expected_ds$roster$raw_value + + actual <- + get_dataframe_by_name( + filename = expected_ds$roster$label , #"roster-bulls-1996.tab", + dataset = dirname(expected_ds$roster$dataFile$persistentId), #"doi:10.70122/FK2/HXJVJU", + original = TRUE, + FUN = readr::read_file + ) + + expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) + expect_equal(nchar( actual ), nchar( expected_file )) + + expect_equal(actual, expected_file) +}) + +test_that("image-original", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- expected_ds$image$raw_value + + actual <- + get_dataframe_by_name( + filename = expected_ds$image$label , #"vector-basketball.svg", + dataset = dirname(expected_ds$image$dataFile$persistentId), #"doi:10.70122/FK2/HXJVJU", + original = TRUE, + FUN = readr::read_file + ) + + expect_equal(substr(actual, 1, 30), substr(expected_file, 
1, 30)) + expect_equal(nchar( actual ), nchar( expected_file )) + + expect_equal(actual, expected_file) +}) From 56feab169672caea8b9c9fd6d5ee17349be93822 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 17:52:38 -0600 Subject: [PATCH 54/75] include doi & id retrieval ref #70 --- tests/testthat/tests-get_dataframe.R | 76 ++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 4 deletions(-) diff --git a/tests/testthat/tests-get_dataframe.R b/tests/testthat/tests-get_dataframe.R index 756c09e..2c656aa 100644 --- a/tests/testthat/tests-get_dataframe.R +++ b/tests/testthat/tests-get_dataframe.R @@ -1,14 +1,14 @@ # See https://demo.dataverse.org/dataverse/dataverse-client-r # https://doi.org/10.70122/FK2/HXJVJU -test_that("roster-original", { +test_that("roster-original-by-name", { expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$roster$raw_value actual <- get_dataframe_by_name( - filename = expected_ds$roster$label , #"roster-bulls-1996.tab", - dataset = dirname(expected_ds$roster$dataFile$persistentId), #"doi:10.70122/FK2/HXJVJU", + filename = expected_ds$roster$label , # A value like "roster-bulls-1996.tab", + dataset = dirname(expected_ds$roster$dataFile$persistentId), # A value like "doi:10.70122/FK2/HXJVJU", original = TRUE, FUN = readr::read_file ) @@ -19,7 +19,41 @@ test_that("roster-original", { expect_equal(actual, expected_file) }) -test_that("image-original", { +test_that("roster-original-by-doi", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- expected_ds$roster$raw_value + + actual <- + get_dataframe_by_doi( + filedoi = expected_ds$roster$dataFile$persistentId, # A value like "doi:10.70122/FK2/HXJVJU/SA3Z2V", + original = TRUE, + FUN = readr::read_file + ) + + expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) + expect_equal(nchar( actual ), nchar( expected_file )) + + expect_equal(actual, 
expected_file) +}) + +test_that("roster-original-by-id", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- expected_ds$roster$raw_value + + actual <- + get_dataframe_by_id( + fileid = expected_ds$roster$dataFile$id, # A value like 1734005 + original = TRUE, + FUN = readr::read_file + ) + + expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) + expect_equal(nchar( actual ), nchar( expected_file )) + + expect_equal(actual, expected_file) +}) + +test_that("image-original-by-name", { expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$image$raw_value @@ -36,3 +70,37 @@ test_that("image-original", { expect_equal(actual, expected_file) }) + +test_that("image-original-by-doi", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- expected_ds$image$raw_value + + actual <- + get_dataframe_by_doi( + filedoi = expected_ds$image$dataFile$persistentId, # A value like "doi:10.70122/FK2/HXJVJU/FHV8ZB", + original = TRUE, + FUN = readr::read_file + ) + + expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) + expect_equal(nchar( actual ), nchar( expected_file )) + + expect_equal(actual, expected_file) +}) + +test_that("image-original-by-id", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- expected_ds$image$raw_value + + actual <- + get_dataframe_by_id( + fileid = expected_ds$image$dataFile$id, # A value like 1734006 + original = TRUE, + FUN = readr::read_file + ) + + expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) + expect_equal(nchar( actual ), nchar( expected_file )) + + expect_equal(actual, expected_file) +}) From f05feedac634c6395f9aa5c0fe7738c28a671948 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 18:09:07 -0600 Subject: [PATCH 55/75] produce a message instead of a warning with an ingested file ref 
#66 @kuriwaki, tell me if you object to the softer communication --- R/get_dataframe.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_dataframe.R b/R/get_dataframe.R index c8cbe82..434d636 100644 --- a/R/get_dataframe.R +++ b/R/get_dataframe.R @@ -103,7 +103,7 @@ get_dataframe_by_id <- function(fileid, } if (is.null(FUN) & isTRUE(ingested) & isFALSE(original)) { - warning("Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE.\n") + message("Downloading ingested version of data with read_tsv. To download the original version and remove this message, set original = TRUE.\n") FUN <- read_tsv } From 2eab927a85448c7dffb761f83cff5a4ae63bbeaf Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 22:42:17 -0600 Subject: [PATCH 56/75] name test file more specifically ref #66 --- ...e.R => tests-get_dataframe-original-basketball.R} | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) rename tests/testthat/{tests-get_dataframe.R => tests-get_dataframe-original-basketball.R} (93%) diff --git a/tests/testthat/tests-get_dataframe.R b/tests/testthat/tests-get_dataframe-original-basketball.R similarity index 93% rename from tests/testthat/tests-get_dataframe.R rename to tests/testthat/tests-get_dataframe-original-basketball.R index 2c656aa..6a83796 100644 --- a/tests/testthat/tests-get_dataframe.R +++ b/tests/testthat/tests-get_dataframe-original-basketball.R @@ -1,7 +1,7 @@ # See https://demo.dataverse.org/dataverse/dataverse-client-r # https://doi.org/10.70122/FK2/HXJVJU -test_that("roster-original-by-name", { +test_that("roster-by-name", { expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$roster$raw_value @@ -19,7 +19,7 @@ test_that("roster-original-by-name", { expect_equal(actual, expected_file) }) -test_that("roster-original-by-doi", { +test_that("roster-by-doi", { expected_ds <- 
retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$roster$raw_value @@ -36,7 +36,7 @@ test_that("roster-original-by-doi", { expect_equal(actual, expected_file) }) -test_that("roster-original-by-id", { +test_that("roster-by-id", { expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$roster$raw_value @@ -53,7 +53,7 @@ test_that("roster-original-by-id", { expect_equal(actual, expected_file) }) -test_that("image-original-by-name", { +test_that("image-by-name", { expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$image$raw_value @@ -71,7 +71,7 @@ test_that("image-original-by-name", { expect_equal(actual, expected_file) }) -test_that("image-original-by-doi", { +test_that("image-by-doi", { expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$image$raw_value @@ -88,7 +88,7 @@ test_that("image-original-by-doi", { expect_equal(actual, expected_file) }) -test_that("image-original-by-id", { +test_that("image-by-id", { expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") expected_file <- expected_ds$image$raw_value From 2a1e3558264cf41ebac58a87cb7951590127cf62 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 22:56:30 -0600 Subject: [PATCH 57/75] test data frame ref #66 --- .../dataset-basketball/dataframe-from-tab.rds | Bin 0 -> 3226 bytes tests/testthat/manual/seed/seed-yaml.R | 12 +++++- ...tests-get_dataframe-dataframe-basketball.R | 39 ++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 inst/dataset-basketball/dataframe-from-tab.rds create mode 100644 tests/testthat/tests-get_dataframe-dataframe-basketball.R diff --git a/inst/dataset-basketball/dataframe-from-tab.rds b/inst/dataset-basketball/dataframe-from-tab.rds new file mode 100644 index 
0000000000000000000000000000000000000000..2850adc96940c79811bf7a3881b55a72a066bed6 GIT binary patch literal 3226 zcmds3?{3>R5LaZciJdrh+kF^@p&*8#1BN=NokniHY;qE0gMw$&3J1mt@r1Q(N78hWoo)1^=8 zc$|k*IZL%7av?`ESbH^bTc;SJ-TT@|{qw&{y`0PSa?@O}`?-%?%iiE^VX)-(d@Zs< ztNsw_9~*Whh@5rmEmHt)@CHth zWw{f?fTTmq^)s+si8vvd;D*0woW0cOBeJ)8JiXTVqpMN8)+pa)pjyTqL>^$ zd_WqA`J~xO*_O%Rojwa=HiIf6jmIncm3F9jK%7k7yoXwc zii9d~o-OD+!i42~<2Z9-lD_Q3G34!YHdL9HMiz`?!d?nShv^)l|4Jtj4B}pia9RB* zn=$J9I`(E3eC{5dWNB%cHM3MT4m0z{T6XOu!u2;wzNz1hvov|1Z7_V_tVcJ_V)Jz$ zWkcPby6hGAB(!20uXIU1p#oXBW^Ukz87TLZyyPP$JfmOh zaysMWh2WP=MLeDx&4qtUH~2entfP+!id-th8HtfCF? zWDBGGK+qYBfJpDW5|Xz>l9sC(g#n%Cqn%JfLU?{J-16bZ^JAlXwLB_}q&b5XPbRRh z`C!OZY_8E7dj;1WPZK|wGph2PdHQ8rYq5F(jZCaXTyZxFnKy`s!C*8tL9C1lFv5xQ zh{p6_tYFJ)u1j!}ADC2BAQkpCJm84@v8Gr|m%~pL_-3*7CyV}c0gE)Z_6-PWI@3>^ z^dq2_ZO4nH8jNHz% literal 0 HcmV?d00001 diff --git a/tests/testthat/manual/seed/seed-yaml.R b/tests/testthat/manual/seed/seed-yaml.R index 94518a9..f4dca6b 100644 --- a/tests/testthat/manual/seed/seed-yaml.R +++ b/tests/testthat/manual/seed/seed-yaml.R @@ -49,7 +49,17 @@ ds_1 %>% yaml::write_yaml("inst/dataset-basketball/expected-metadata.yml") -# retrieve-from-file ------------------------------------------------------ +# ---- save-expected-dataframe ------------------------------------------------- +ds_1$roster %>% + { + get_dataframe_by_name( + filename = .$label, + dataset = dirname(.$dataFile$persistentId) + ) + } %>% + readr::write_rds("inst/dataset-basketball/dataframe-from-tab.rds") + +# ---- practice-retrieving-from-file ------------------------------------------------------ y <- yaml::read_yaml(system.file("dataset-basketball/expected-metadata.yml", package = "dataverse")) y$roster diff --git a/tests/testthat/tests-get_dataframe-dataframe-basketball.R b/tests/testthat/tests-get_dataframe-dataframe-basketball.R new file mode 100644 index 0000000..290559f --- /dev/null +++ b/tests/testthat/tests-get_dataframe-dataframe-basketball.R @@ -0,0 +1,39 @@ +# See 
https://demo.dataverse.org/dataverse/dataverse-client-r +# https://doi.org/10.70122/FK2/HXJVJU + +test_that("roster-by-name", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- readr::read_rds(system.file("dataset-basketball/dataframe-from-tab.rds", package = "dataverse")) + + actual <- + get_dataframe_by_name( + filename = expected_ds$roster$label , # A value like "roster-bulls-1996.tab", + dataset = dirname(expected_ds$roster$dataFile$persistentId)#, # A value like "doi:10.70122/FK2/HXJVJU", + ) + + expect_equal(actual, expected_file) +}) + +test_that("roster-by-doi", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- readr::read_rds(system.file("dataset-basketball/dataframe-from-tab.rds", package = "dataverse")) + + actual <- + get_dataframe_by_doi( + filedoi = expected_ds$roster$dataFile$persistentId, # A value like "doi:10.70122/FK2/HXJVJU/SA3Z2V", + ) + + expect_equal(actual, expected_file) +}) + +test_that("roster-by-id", { + expected_ds <- retrieve_info_dataset("dataset-basketball/expected-metadata.yml") + expected_file <- readr::read_rds(system.file("dataset-basketball/dataframe-from-tab.rds", package = "dataverse")) + + actual <- + get_dataframe_by_id( + fileid = expected_ds$roster$dataFile$id, # A value like 1734005 + ) + + expect_equal(actual, expected_file) +}) From 4e801803f27eb1e4df2de488f6e4b5d5afefe9e0 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 23:25:58 -0600 Subject: [PATCH 58/75] remove temp files afterwards --- NEWS.md | 1 + R/get_dataframe.R | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index a7589f3..a12718a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,7 @@ * Tests use https://demo.dataverse.org/dataverse/dataverse-client-r/. 
(#40) * Fixes most get_file errors by removing query argument (#33 @kuriwaki) * Fix getting multiple files by id in `get_file()` (#47 @adam3smith) +* Temporary files created by `get_file()` are automatically deleted. # CHANGES TO dataverse 0.2.1 diff --git a/R/get_dataframe.R b/R/get_dataframe.R index 434d636..31d14bc 100644 --- a/R/get_dataframe.R +++ b/R/get_dataframe.R @@ -140,11 +140,15 @@ get_dataframe_by_doi <- function(filedoi, #' #' @keywords internal get_dataframe_internal <- function(raw, filename, .f) { - tmp <- tempfile(filename) - writeBin(raw, tmp) - - do.call(.f, list(tmp)) - - # TODO: unlink/delete tmp file in a try/catch/finally block. + tryCatch( + { + tmp <- tempfile(filename) + writeBin(raw, tmp) + do.call(.f, list(tmp)) + }, + finally = { + if (file.exists(tmp)) unlink(tmp) + } + ) } From c6db66412142cb7fd66d4d55e3a5cb98a7bd96b8 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Thu, 31 Dec 2020 23:37:53 -0600 Subject: [PATCH 59/75] seq_len --address some lintr flags; ref #41 --- R/get_file.R | 2 +- for-developers/developer-tasks.R | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/R/get_file.R b/R/get_file.R index 4f9d819..566d979 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -112,7 +112,7 @@ get_file <- # Main function. Call get_file_by_id out <- vector("list", length(fileid)) - for (i in 1:length(fileid)) { + for (i in seq_len(fileid)) { out[[i]] <- get_file_by_id( fileid = fileid[i], dataset = dataset, diff --git a/for-developers/developer-tasks.R b/for-developers/developer-tasks.R index b308ce8..9b61d04 100644 --- a/for-developers/developer-tasks.R +++ b/for-developers/developer-tasks.R @@ -10,6 +10,17 @@ pkgdown::clean_site() pkgdown::build_site() system("R CMD Rd2pdf --no-preview --force --output=./documentation-peek.pdf ." ) +checks_to_exclude <- c( + "covr", + "lintr_line_length_linter" +) +gp <- + goodpractice::all_checks() %>% + purrr::discard(~(. 
%in% checks_to_exclude)) %>% + goodpractice::gp(checks = .) +goodpractice::results(gp) +gp + devtools::run_examples(); #dev.off() #This overwrites the NAMESPACE file too # devtools::run_examples(, "redcap_read.Rd") test_results_checked <- devtools::test() From 453f231509aace79847c1718a8dafe65e5c17f44 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 16:09:41 -0600 Subject: [PATCH 60/75] remove unused magrittr imports --address some lintr flags; ref #41 --- DESCRIPTION | 1 - R/get_file.R | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 778a441..0e2aec9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,7 +38,6 @@ Authors@R: c( Imports: httr, jsonlite, - magrittr, readr, stats, utils, diff --git a/R/get_file.R b/R/get_file.R index 566d979..629259a 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -112,7 +112,7 @@ get_file <- # Main function. Call get_file_by_id out <- vector("list", length(fileid)) - for (i in seq_len(fileid)) { + for (i in seq_along(fileid)) { out[[i]] <- get_file_by_id( fileid = fileid[i], dataset = dataset, From 2cfa95b057f5a5ab6da31c031368324fda99f4d3 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 20:30:15 -0600 Subject: [PATCH 61/75] adding some more error messages & validation ref #71 --- DESCRIPTION | 2 +- R/get_file_by_id.R | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0e2aec9..215b7e7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,6 +36,7 @@ Authors@R: c( "Jan", "Kanis", role = "ctb" )) Imports: + checkmate, httr, jsonlite, readr, @@ -43,7 +44,6 @@ Imports: utils, xml2 Suggests: - checkmate, covr, foreign, haven, diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index b797c68..94f6103 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -19,14 +19,18 @@ get_file_by_id <- ...) 
{ format <- match.arg(format) - # single file ID - stopifnot(length(fileid) == 1) + if (length(fileid) != 1L) + stop("The fileid parameter must be single element.") # must be a number OR doi string in the form of "doi:" - if (is.numeric(fileid)) - use_persistentID <- FALSE - if (grepl(x = fileid, pattern = "^doi:")) - use_persistentID <- TRUE + use_persistent_id <- !is.numeric(fileid) + if (use_persistent_id) { + if (!grepl(x = fileid, pattern = "^doi:")) + stop("A 'persistent' fileid must be prefixed with 'doi:'. It was `", fileid, "`.") + } else { + if (!checkmate::check_integerish(fileid)) + stop("A 'non-persistent' fileid must be a whole number. It was `", fileid, "`.") + } # ping get_file_metadata to see if file is ingested @@ -59,7 +63,7 @@ get_file_by_id <- if (format == "original") u_part <- "access/datafile/" - if (use_persistentID) + if (use_persistent_id) u_part <- "access/datafile/:persistentId/?persistentId=" # If not bundle, request single file in non-bundle format ---- From a705eefb13416bed6498ba4f6e224144b8db65da Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 22:02:19 -0600 Subject: [PATCH 62/75] make waterfall precedence clearer ref #71 --- R/get_file_by_id.R | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 94f6103..4611c7c 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -6,7 +6,6 @@ #' `original` is set to FALSE by default. Either can be changed. #' @param fileid A numeric ID internally used for `get_file_by_id` #' -#' #' @export get_file_by_id <- function(fileid, @@ -32,7 +31,6 @@ get_file_by_id <- stop("A 'non-persistent' fileid must be a whole number. 
It was `", fileid, "`.") } - # ping get_file_metadata to see if file is ingested is_ingested <- is_ingested(fileid, server = server) @@ -55,16 +53,16 @@ get_file_by_id <- if (is_ingested & (isFALSE(original) || is.na(original) || is.null(original))) query$format <- NULL - # part of URL depending on DOI, bundle, or file - if (format == "bundle") + if (use_persistent_id) { + u_part <- "access/datafile/:persistentId/?persistentId=" + } else if (format == "bundle") { u_part <- "access/datafile/bundle/" - - if (format == "original") + } else if (format == "original") { u_part <- "access/datafile/" - - if (use_persistent_id) - u_part <- "access/datafile/:persistentId/?persistentId=" + } else { + stop("The `format` value should be 'bundle' or 'original', or a doi needs to be passed to `fileid`.") + } # If not bundle, request single file in non-bundle format ---- u <- paste0(api_url(server), u_part, fileid) From c17c2ba542afa399601d577d530130cf9baf89e9 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 22:04:42 -0600 Subject: [PATCH 63/75] `server` parameter follows `key` to be consistent with most of the other dataverse functions --- R/get_file_by_id.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 4611c7c..f10b9d6 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -10,11 +10,11 @@ get_file_by_id <- function(fileid, dataset = NULL, - server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), ...) { format <- match.arg(format) @@ -83,11 +83,11 @@ get_file_by_id <- #' @export get_file_by_doi <- function(filedoi, dataset = NULL, - server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), ...) 
{ get_file_by_id( From eb206c2d73c12d2d73e6955553ee5e8580c74cf2 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 22:20:54 -0600 Subject: [PATCH 64/75] `server` parameter follows `key` to be consistent with most of the other dataverse functions --- R/get_file.R | 4 ++-- man/files.Rd | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index 629259a..e37e419 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -82,9 +82,9 @@ get_file <- function(file, dataset = NULL, format = c("original", "bundle"), - server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), original = TRUE, ...) { @@ -146,9 +146,9 @@ get_file <- get_file_by_name <- function(filename, dataset, format = c("original", "bundle"), - server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), original = TRUE, ... ) { diff --git a/man/files.Rd b/man/files.Rd index da42bf0..85e764b 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -11,9 +11,9 @@ get_file( file, dataset = NULL, format = c("original", "bundle"), - server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), original = TRUE, ... ) @@ -22,9 +22,9 @@ get_file_by_name( filename, dataset, format = c("original", "bundle"), - server = Sys.getenv("DATAVERSE_SERVER"), vars = NULL, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), original = TRUE, ... ) @@ -32,22 +32,22 @@ get_file_by_name( get_file_by_id( fileid, dataset = NULL, - server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), ... 
) get_file_by_doi( filedoi, dataset = NULL, - server = Sys.getenv("DATAVERSE_SERVER"), format = c("original", "bundle"), vars = NULL, original = TRUE, key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), ... ) } @@ -69,11 +69,6 @@ For tabular datasets, the option \dQuote{bundle} downloads the bundle of the original and archival versions, as well as the documentation. See \url{https://guides.dataverse.org/en/latest/api/dataaccess.html} for details.} -\item{server}{A character string specifying a Dataverse server. There are -multiple Dataverse installations, but the defaults is to use the Harvard -Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically -or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} - \item{vars}{A character vector specifying one or more variable names, used to extract a subset of the data.} @@ -82,6 +77,11 @@ is not specified, functions calling authenticated API endpoints will fail. Keys can be specified atomically or globally using \code{Sys.setenv("DATAVERSE_KEY" = "examplekey")}.} +\item{server}{A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the defaults is to use the Harvard +Dataverse (\code{server = "dataverse.harvard.edu"}). This can be modified atomically +or globally using \code{Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com")}.} + \item{original}{A logical, defaulting to TRUE. If a ingested (.tab) version is available, download the original version instead of the ingested? If there was no ingested version, is set to NA. 
Note in \verb{get_dataframe_*}, From 7660c0c04ae981060b42fd5d7d8dff0290661ed9 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 22:31:30 -0600 Subject: [PATCH 65/75] tidy --- R/get_file.R | 117 +++++++++++++++++++++++---------------------- R/get_file_by_id.R | 56 +++++++++++----------- man/files.Rd | 26 +++++----- 3 files changed, 103 insertions(+), 96 deletions(-) diff --git a/R/get_file.R b/R/get_file.R index e37e419..4d67cc9 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -1,6 +1,5 @@ #' @rdname files #' -#' #' @title Download File #' #' @description Download Dataverse File(s). `get_file` is a general wrapper, @@ -13,8 +12,6 @@ #' functions return a raw binary file, which cannot be readily analyzed in R. #' To use the objects as dataframes, see the `get_dataset_*` functions at \link{get_dataset} #' -#' -#' #' @details This function provides access to data files from a Dataverse entry. #' @param file An integer specifying a file identifier; or a vector of integers #' specifying file identifiers; or, if used with the prefix \code{"doi:"}, a @@ -46,24 +43,29 @@ #' \dontrun{ #' #' # 1. Using filename and dataverse -#' f1 <- get_file_by_name(filename = "nlsw88.tab", -#' dataset = "10.70122/FK2/PPIAXE", -#' server = "demo.dataverse.org") +#' f1 <- get_file_by_name( +#' filename = "nlsw88.tab", +#' dataset = "10.70122/FK2/PPIAXE", +#' server = "demo.dataverse.org" +#' ) #' #' # 2. Using file DOI -#' f2 <- get_file_by_doi(filedoi = "10.70122/FK2/PPIAXE/MHDB0O", -#' server = "demo.dataverse.org") +#' f2 <- get_file_by_doi( +#' filedoi = "10.70122/FK2/PPIAXE/MHDB0O", +#' server = "demo.dataverse.org" +#' ) #' #' # 3. Two-steps: Find ID from get_dataset #' d3 <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") #' f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") #' -#' #' # 4. 
Retrieve multiple raw data in list -#' f4_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", -#' server = "demo.dataverse.org")$files$id -#' f4 <- get_file(f4_vec, -#' server = "demo.dataverse.org") +#' f4_vec <- get_dataset( +#' "doi:10.70122/FK2/PPIAXE", +#' server = "demo.dataverse.org" +#' )$files$id +#' +#' f4 <- get_file(f4_vec, server = "demo.dataverse.org") #' length(f4) #' #' # Write binary files @@ -74,19 +76,19 @@ #' #' writeBin(f4[[1]], "nlsw88.rds") # originally a rds file #' writeBin(f4[[2]], "nlsw88.dta") # originally a dta file -#' #' } #' #' @export -get_file <- - function(file, - dataset = NULL, - format = c("original", "bundle"), - vars = NULL, - key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), - original = TRUE, - ...) { +get_file <- function( + file, + dataset = NULL, + format = c("original", "bundle"), + vars = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + original = TRUE, + ... +) { format <- match.arg(format) @@ -114,59 +116,60 @@ get_file <- for (i in seq_along(fileid)) { out[[i]] <- get_file_by_id( - fileid = fileid[i], - dataset = dataset, - format = format, - vars = vars, - key = key, - server = server, - original = original, + fileid = fileid[i], + dataset = dataset, + format = format, + vars = vars, + key = key, + server = server, + original = original, ... - ) + ) } - # return the raw vector if there's a single file - if (length(out) == 1) { + if (length(out) == 1L) { # return the raw vector if there's a single file return(out[[1]]) } else { - # return a list of raw vectors otherwise - return(out) + return(out) # return a list of raw vectors otherwise } } #' @rdname files #' -#' #' @param filename Filename of the dataset, with file extension as shown in Dataverse #' (for example, if nlsw88.dta was the original but is displayed as the ingested #' nlsw88.tab, use the ingested version.) 
#' #' @export -get_file_by_name <- function(filename, - dataset, - format = c("original", "bundle"), - vars = NULL, - key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), - original = TRUE, - ... - ) { +get_file_by_name <- function( + filename, + dataset, + format = c("original", "bundle"), + vars = NULL, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + original = TRUE, + ... +) { format <- match.arg(format) # retrieve ID - fileid <- get_fileid.character(x = dataset, - file = filename, - server = server, - ...) - - get_file_by_id(fileid, - format = format, - vars = vars, - key = key, - server = server, - original = original, - ...) + fileid <- get_fileid.character( + x = dataset, + file = filename, + server = server, + ... + ) + get_file_by_id( + fileid, + format = format, + vars = vars, + key = key, + server = server, + original = original, + ... + ) } diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index f10b9d6..249352b 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -7,15 +7,16 @@ #' @param fileid A numeric ID internally used for `get_file_by_id` #' #' @export -get_file_by_id <- - function(fileid, - dataset = NULL, - format = c("original", "bundle"), - vars = NULL, - original = TRUE, - key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), - ...) { +get_file_by_id <- function( + fileid, + dataset = NULL, + format = c("original", "bundle"), + vars = NULL, + original = TRUE, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + ... +) { format <- match.arg(format) if (length(fileid) != 1L) @@ -66,10 +67,7 @@ get_file_by_id <- # If not bundle, request single file in non-bundle format ---- u <- paste0(api_url(server), u_part, fileid) - r <- httr::GET(u, - httr::add_headers("X-Dataverse-key" = key), - query = query, - ...) + r <- httr::GET(u, httr::add_headers("X-Dataverse-key" = key), query = query, ...) 
httr::stop_for_status(r) httr::content(r, as = "raw") @@ -81,23 +79,25 @@ get_file_by_id <- #' `"10.70122/FK2/PPIAXE/MHDB0O"` or `"doi:10.70122/FK2/PPIAXE/MHDB0O"` #' #' @export -get_file_by_doi <- function(filedoi, - dataset = NULL, - format = c("original", "bundle"), - vars = NULL, - original = TRUE, - key = Sys.getenv("DATAVERSE_KEY"), - server = Sys.getenv("DATAVERSE_SERVER"), - ...) { +get_file_by_doi <- function( + filedoi, + dataset = NULL, + format = c("original", "bundle"), + vars = NULL, + original = TRUE, + key = Sys.getenv("DATAVERSE_KEY"), + server = Sys.getenv("DATAVERSE_SERVER"), + ... +) { get_file_by_id( - fileid = prepend_doi(filedoi), - dataset = dataset, - format = format, - vars = vars, - key = key, - server = server, - original = original, + fileid = prepend_doi(filedoi), + dataset = dataset, + format = format, + vars = vars, + key = key, + server = server, + original = original, ... ) diff --git a/man/files.Rd b/man/files.Rd index 85e764b..e24c4be 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -124,24 +124,29 @@ This function provides access to data files from a Dataverse entry. \dontrun{ # 1. Using filename and dataverse -f1 <- get_file_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +f1 <- get_file_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" +) # 2. Using file DOI -f2 <- get_file_by_doi(filedoi = "10.70122/FK2/PPIAXE/MHDB0O", - server = "demo.dataverse.org") +f2 <- get_file_by_doi( + filedoi = "10.70122/FK2/PPIAXE/MHDB0O", + server = "demo.dataverse.org" +) # 3. Two-steps: Find ID from get_dataset d3 <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") - # 4. 
Retrieve multiple raw data in list -f4_vec <- get_dataset("doi:10.70122/FK2/PPIAXE", - server = "demo.dataverse.org")$files$id -f4 <- get_file(f4_vec, - server = "demo.dataverse.org") +f4_vec <- get_dataset( + "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" +)$files$id + +f4 <- get_file(f4_vec, server = "demo.dataverse.org") length(f4) # Write binary files @@ -152,7 +157,6 @@ writeBin(f2, "nlsw88.dta") writeBin(f4[[1]], "nlsw88.rds") # originally a rds file writeBin(f4[[2]], "nlsw88.dta") # originally a dta file - } } From b72f9a94d8c1eb6fd2ba3528cc43409a3d0a248a Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 22:46:25 -0600 Subject: [PATCH 66/75] closer inspection of parameters ref #71 --- R/get_file_by_id.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index 249352b..f6d617b 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -19,8 +19,17 @@ get_file_by_id <- function( ) { format <- match.arg(format) - if (length(fileid) != 1L) - stop("The fileid parameter must be single element.") + if (length(fileid) != 1L) { + stop("The `fileid` parameter must be single element.") + } else if (!(inherits(fileid, "numeric") | inherits(fileid, "integer") | inherits(fileid, "character"))) { + stop("The `fileid` data type must be numeric, integer, or character.") + } + # `dataset` place holder. + checkmate::assert_character(format , any.missing = FALSE, len = 1) + # `vars` place holder. 
+ checkmate::assert_logical( original, any.missing = TRUE , len = 1) + checkmate::assert_character(key , any.missing = FALSE, len = 1) + checkmate::assert_character(server , any.missing = FALSE, len = 1) # must be a number OR doi string in the form of "doi:" use_persistent_id <- !is.numeric(fileid) From 72c9baa4d8025b621dce3ad5fa3d736e1be94e79 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 22:57:10 -0600 Subject: [PATCH 67/75] tidy --- R/get_dataframe.R | 58 ++++++++++++++++++++++------------------------ R/get_file.R | 50 ++++++++++++++++++++------------------- R/get_file_by_id.R | 3 --- man/files.Rd | 1 + 4 files changed, 55 insertions(+), 57 deletions(-) diff --git a/R/get_dataframe.R b/R/get_dataframe.R index 31d14bc..7c60c10 100644 --- a/R/get_dataframe.R +++ b/R/get_dataframe.R @@ -6,15 +6,16 @@ #' @rdname get_dataframe #' #' @param filename The name of the file of interest, with file extension, for example -#' `"roster-bulls-1996.tab"`. +#' `"roster-bulls-1996.tab"`. #' @param FUN The function to used for reading in the raw dataset. This user -#' must choose the appropriate function: for example if the target is a .rds -#' file, then `FUN` should be `readRDS` or `readr::read_rds`. +#' must choose the appropriate function: for example if the target is a .rds +#' file, then `FUN` should be `readRDS` or `readr::read_rds`. #' @param original A logical, defaulting to TRUE. Whether to read the ingested, #' archival version of the dataset if one exists. The archival versions are tab-delimited -#' `.tab` files so if `original = FALSE`, `FUN` is set to `readr::read_tsv`. -#' If functions to read the original version is available, then `original = TRUE` -#' with a specified `FUN` is better. +#' `.tab` files so if `original = FALSE`, `FUN` is set to `readr::read_tsv`. +#' If functions to read the original version is available, then `original = TRUE` +#' with a specified `FUN` is better. 
+#' #' @inheritDotParams get_file #' #' @importFrom readr read_tsv @@ -71,29 +72,28 @@ #' ) #' } #' @export -get_dataframe_by_name <- function(filename, - dataset = NULL, - FUN = NULL, - original = FALSE, - ...) { - +get_dataframe_by_name <- function ( + filename, + dataset = NULL, + FUN = NULL, + original = FALSE, + ... +) { # retrieve ID - fileid <- get_fileid.character(x = dataset, - file = filename, - ...) + fileid <- get_fileid.character(x = dataset, file = filename, ...) get_dataframe_by_id(fileid, FUN, original = original, ...) - } - #' @rdname get_dataframe #' @importFrom readr read_tsv #' @export -get_dataframe_by_id <- function(fileid, - FUN = NULL, - original = FALSE, - ...) { +get_dataframe_by_id <- function( + fileid, + FUN = NULL, + original = FALSE, + ... +) { # if not ingested, then whether to take the original is not relevant. ingested <- is_ingested(fileid, ...) @@ -120,14 +120,15 @@ get_dataframe_by_id <- function(fileid, } } - #' @rdname get_dataframe #' @inheritParams get_file_by_doi #' @export -get_dataframe_by_doi <- function(filedoi, - FUN = NULL, - original = FALSE, - ...) { +get_dataframe_by_doi <- function ( + filedoi, + FUN = NULL, + original = FALSE, + ... +) { filedoi <- prepend_doi(filedoi) # get_file can also take doi now @@ -136,10 +137,8 @@ get_dataframe_by_doi <- function(filedoi, #' Write to temp and apply function #' -# @importFrom stringr str_extract -#' #' @keywords internal -get_dataframe_internal <- function(raw, filename, .f) { +get_dataframe_internal <- function (raw, filename, .f) { tryCatch( { tmp <- tempfile(filename) @@ -151,4 +150,3 @@ get_dataframe_internal <- function(raw, filename, .f) { } ) } - diff --git a/R/get_file.R b/R/get_file.R index 4d67cc9..7964f42 100644 --- a/R/get_file.R +++ b/R/get_file.R @@ -3,39 +3,41 @@ #' @title Download File #' #' @description Download Dataverse File(s). `get_file` is a general wrapper, -#' and can take either dataverse objects, file IDs, or a filename and dataverse. 
-#' `get_file_by_name` is a shorthand for running `get_file` by -#' specifying a file name (`filename`) and dataset (`dataset`). -#' `get_file_by_doi` obtains a file by its file DOI, bypassing the -#' `dataset` argument. -#' Internally, all functions download each file by `get_file_by_id`. `get_file_*` -#' functions return a raw binary file, which cannot be readily analyzed in R. -#' To use the objects as dataframes, see the `get_dataset_*` functions at \link{get_dataset} +#' and can take either dataverse objects, file IDs, or a filename and dataverse. +#' `get_file_by_name` is a shorthand for running `get_file` by +#' specifying a file name (`filename`) and dataset (`dataset`). +#' `get_file_by_doi` obtains a file by its file DOI, bypassing the +#' `dataset` argument. +#' +#' Internally, all functions download each file by `get_file_by_id`. `get_file_*` +#' functions return a raw binary file, which cannot be readily analyzed in R. +#' To use the objects as dataframes, see the `get_dataset_*` functions at \link{get_dataset} #' #' @details This function provides access to data files from a Dataverse entry. +#' #' @param file An integer specifying a file identifier; or a vector of integers -#' specifying file identifiers; or, if used with the prefix \code{"doi:"}, a -#' character with the file-specific DOI; or, if used without the prefix, a -#' filename accompanied by a dataset DOI in the `dataset` argument, or an object of -#' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. +#' specifying file identifiers; or, if used with the prefix \code{"doi:"}, a +#' character with the file-specific DOI; or, if used without the prefix, a +#' filename accompanied by a dataset DOI in the `dataset` argument, or an object of +#' class \dQuote{dataverse_file} as returned by \code{\link{dataset_files}}. #' @param dataset @kuriwaki, can you please add a description for this parameter? #' @param format A character string specifying a file format for download. 
-#' by default, this is \dQuote{original} (the original file format). If `NULL`, -#' no query is added, so ingested files are returned in their ingested TSV form. -#' For tabular datasets, the option \dQuote{bundle} downloads the bundle -#' of the original and archival versions, as well as the documentation. -#' See for details. +#' by default, this is \dQuote{original} (the original file format). If `NULL`, +#' no query is added, so ingested files are returned in their ingested TSV form. +#' For tabular datasets, the option \dQuote{bundle} downloads the bundle +#' of the original and archival versions, as well as the documentation. +#' See for details. #' @param vars A character vector specifying one or more variable names, used to -#' extract a subset of the data. +#' extract a subset of the data. #' #' @template envvars #' @template dots #' @template ds #' #' @return \code{get_file} returns a raw vector (or list of raw vectors, -#' if \code{length(file) > 1}), which can be saved locally with the `writeBin` -#' function. To load datasets into the R environment dataframe, see -#' \link{get_dataframe_by_name}. +#' if \code{length(file) > 1}), which can be saved locally with the `writeBin` +#' function. To load datasets into the R environment dataframe, see +#' \link{get_dataframe_by_name}. #' #' @seealso To load the objects as datasets \link{get_dataframe_by_name}. #' @@ -138,11 +140,11 @@ get_file <- function( #' @rdname files #' #' @param filename Filename of the dataset, with file extension as shown in Dataverse -#' (for example, if nlsw88.dta was the original but is displayed as the ingested -#' nlsw88.tab, use the ingested version.) +#' (for example, if nlsw88.dta was the original but is displayed as the ingested +#' nlsw88.tab, use the ingested version.) 
#' #' @export -get_file_by_name <- function( +get_file_by_name <- function ( filename, dataset, format = c("original", "bundle"), diff --git a/R/get_file_by_id.R b/R/get_file_by_id.R index f6d617b..0e37c74 100644 --- a/R/get_file_by_id.R +++ b/R/get_file_by_id.R @@ -82,7 +82,6 @@ get_file_by_id <- function( httr::content(r, as = "raw") } - #' @rdname files #' @param filedoi A DOI for a single file (not the entire dataset), of the form #' `"10.70122/FK2/PPIAXE/MHDB0O"` or `"doi:10.70122/FK2/PPIAXE/MHDB0O"` @@ -98,7 +97,6 @@ get_file_by_doi <- function( server = Sys.getenv("DATAVERSE_SERVER"), ... ) { - get_file_by_id( fileid = prepend_doi(filedoi), dataset = dataset, @@ -109,5 +107,4 @@ get_file_by_doi <- function( original = original, ... ) - } diff --git a/man/files.Rd b/man/files.Rd index e24c4be..7137ad5 100644 --- a/man/files.Rd +++ b/man/files.Rd @@ -113,6 +113,7 @@ and can take either dataverse objects, file IDs, or a filename and dataverse. specifying a file name (\code{filename}) and dataset (\code{dataset}). \code{get_file_by_doi} obtains a file by its file DOI, bypassing the \code{dataset} argument. + Internally, all functions download each file by \code{get_file_by_id}. \verb{get_file_*} functions return a raw binary file, which cannot be readily analyzed in R. To use the objects as dataframes, see the \verb{get_dataset_*} functions at \link{get_dataset} From 7e09a810e72a0ad0dfde5f246da9ba488bc535b2 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 23:14:27 -0600 Subject: [PATCH 68/75] avoid `@importFrom` From https://r-pkgs.org/namespace.html#import-r > If you are using just a few functions from another package, my recommendation is to note the package name in the Imports: field of the DESCRIPTION file and call the function(s) explicitly using ::, e.g., pkg::fun(). 
ref #41 --- NAMESPACE | 3 --- R/SWORD.R | 2 +- R/get_dataframe.R | 7 ++----- R/get_dataset.R | 2 +- R/print.R | 1 - 5 files changed, 4 insertions(+), 11 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b3675db..25d8c4f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,3 @@ export(update_dataset) export(update_dataset_file) import(httr) import(xml2) -importFrom(readr,read_tsv) -importFrom(stats,setNames) -importFrom(utils,str) diff --git a/R/SWORD.R b/R/SWORD.R index ed4f3b2..d1427bb 100644 --- a/R/SWORD.R +++ b/R/SWORD.R @@ -13,7 +13,7 @@ #' list_datasets(d[[2]]) #' } #' @seealso Managing a Dataverse: \code{\link{publish_dataverse}}; Managing a dataset: \code{\link{dataset_atom}}, \code{\link{list_datasets}}, \code{\link{create_dataset}}, \code{\link{delete_dataset}}, \code{\link{publish_dataset}}; Managing files within a dataset: \code{\link{add_file}}, \code{\link{delete_file}} -#' @importFrom stats setNames +#' #' @export service_document <- function(key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) { u <- paste0(api_url(server, prefix="dvn/api/"), "data-deposit/v1.1/swordv2/service-document") diff --git a/R/get_dataframe.R b/R/get_dataframe.R index 7c60c10..d95a944 100644 --- a/R/get_dataframe.R +++ b/R/get_dataframe.R @@ -18,8 +18,6 @@ #' #' @inheritDotParams get_file #' -#' @importFrom readr read_tsv -#' #' @examples #' #' # Retrieve data.frame from dataverse DOI and file name @@ -86,7 +84,6 @@ get_dataframe_by_name <- function ( } #' @rdname get_dataframe -#' @importFrom readr read_tsv #' @export get_dataframe_by_id <- function( fileid, @@ -103,8 +100,8 @@ get_dataframe_by_id <- function( } if (is.null(FUN) & isTRUE(ingested) & isFALSE(original)) { - message("Downloading ingested version of data with read_tsv. To download the original version and remove this message, set original = TRUE.\n") - FUN <- read_tsv + message("Downloading ingested version of data with readr::read_tsv. 
To download the original version and remove this message, set original = TRUE.\n") + FUN <- readr::read_tsv } if (is.null(FUN) & (isFALSE(ingested) | isTRUE(original))) { diff --git a/R/get_dataset.R b/R/get_dataset.R index 039d423..09435e7 100644 --- a/R/get_dataset.R +++ b/R/get_dataset.R @@ -66,7 +66,7 @@ get_dataset <- function( #' @rdname get_dataset #' @param block A character string specifying a metadata block to retrieve. By default this is \dQuote{citation}. Other values may be available, depending on the dataset, such as \dQuote{geospatial} or \dQuote{socialscience}. -#' @importFrom utils str +#' #' @export dataset_metadata <- function( dataset, diff --git a/R/print.R b/R/print.R index 9ad8137..f514a1f 100644 --- a/R/print.R +++ b/R/print.R @@ -33,7 +33,6 @@ print.dataverse <- function(x, ...) { } # dataverse_dataset class -#' @importFrom utils str #' @export print.dataverse_dataset <- function(x, ...) { cat("Dataset (", x$id, "): ", x$persistentUrl, "\n", sep = "") From d5756e7127e00165b1b82b874babc6b99073e294 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Fri, 1 Jan 2021 23:37:38 -0600 Subject: [PATCH 69/75] remove unused "@import" roxygen commands Use `::` for all imported fxs ref #71 --- NAMESPACE | 2 -- R/SWORD_dataset.R | 1 - R/get_file_metadata.R | 1 - R/utils.R | 1 - 4 files changed, 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 25d8c4f..a007dab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,5 +48,3 @@ export(service_document) export(set_dataverse_metadata) export(update_dataset) export(update_dataset_file) -import(httr) -import(xml2) diff --git a/R/SWORD_dataset.R b/R/SWORD_dataset.R index 450ebd6..93e5c29 100644 --- a/R/SWORD_dataset.R +++ b/R/SWORD_dataset.R @@ -212,7 +212,6 @@ dataset_atom <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sy } #' @rdname dataset_atom -#' @import xml2 #' @export dataset_statement <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) 
{ if (inherits(dataset, "dataset_atom")) { diff --git a/R/get_file_metadata.R b/R/get_file_metadata.R index 8b114b0..c0fed61 100644 --- a/R/get_file_metadata.R +++ b/R/get_file_metadata.R @@ -6,7 +6,6 @@ #' @return A character vector containing a DDI #' metadata file. #' -#' @import xml2 #' @export get_file_metadata <- function(file, diff --git a/R/utils.R b/R/utils.R index df36f49..9379f54 100644 --- a/R/utils.R +++ b/R/utils.R @@ -133,7 +133,6 @@ prepend_doi <- function(dataset) { dataset } -#' @import httr api_url <- function(server = Sys.getenv("DATAVERSE_SERVER"), prefix = "api/") { if (is.null(server) || server == "") { stop("'server' is missing with no default set in DATAVERSE_SERVER environment variable.") From dab5f65f19ab96280125730656913ecfbfd4a6bb Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Sat, 2 Jan 2021 13:25:27 -0600 Subject: [PATCH 70/75] rename `FUN` parameters to `.f` @kuriwaki, let's choose the purrr/tidyverse convention over the `apply()` convention https://purrr.tidyverse.org/reference/map.html ref #66 --- R/get_dataframe.R | 36 +++++++++--------- README.Rmd | 22 +++++------ README.md | 38 +++++++++---------- man/get_dataframe.Rd | 20 +++++----- tests/testthat/manual/seed/seed-yaml.R | 6 +-- .../tests-get_dataframe-original-basketball.R | 12 +++--- 6 files changed, 67 insertions(+), 67 deletions(-) diff --git a/R/get_dataframe.R b/R/get_dataframe.R index d95a944..09c9101 100644 --- a/R/get_dataframe.R +++ b/R/get_dataframe.R @@ -7,14 +7,14 @@ #' #' @param filename The name of the file of interest, with file extension, for example #' `"roster-bulls-1996.tab"`. -#' @param FUN The function to used for reading in the raw dataset. This user +#' @param .f The function to used for reading in the raw dataset. This user #' must choose the appropriate function: for example if the target is a .rds -#' file, then `FUN` should be `readRDS` or `readr::read_rds`. +#' file, then `.f` should be `readRDS` or `readr::read_rds`. 
#' @param original A logical, defaulting to TRUE. Whether to read the ingested, #' archival version of the dataset if one exists. The archival versions are tab-delimited -#' `.tab` files so if `original = FALSE`, `FUN` is set to `readr::read_tsv`. +#' `.tab` files so if `original = FALSE`, `.f` is set to `readr::read_tsv`. #' If functions to read the original version is available, then `original = TRUE` -#' with a specified `FUN` is better. +#' with a specified `.f` is better. #' #' @inheritDotParams get_file #' @@ -45,7 +45,7 @@ #' #' #' # To use the original file version, or for non-ingested data, -#' # please specify `original = TRUE` and specify a function in FUN. +#' # please specify `original = TRUE` and specify a function in .f. #' #' # A data.frame is still returned, but the #' if (requireNamespace("readr", quietly = TRUE)) { @@ -55,7 +55,7 @@ #' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org", #' original = TRUE, -#' FUN = readr::read_rds +#' .f = readr::read_rds #' ) #' } #' @@ -66,28 +66,28 @@ #' dataset = "doi:10.70122/FK2/PPIAXE", #' server = "demo.dataverse.org", #' original = TRUE, -#' FUN = haven::read_dta +#' .f = haven::read_dta #' ) #' } #' @export get_dataframe_by_name <- function ( filename, dataset = NULL, - FUN = NULL, + .f = NULL, original = FALSE, ... ) { # retrieve ID fileid <- get_fileid.character(x = dataset, file = filename, ...) - get_dataframe_by_id(fileid, FUN, original = original, ...) + get_dataframe_by_id(fileid, .f, original = original, ...) } #' @rdname get_dataframe #' @export get_dataframe_by_id <- function( fileid, - FUN = NULL, + .f = NULL, original = FALSE, ... ) { @@ -99,21 +99,21 @@ get_dataframe_by_id <- function( original <- NA } - if (is.null(FUN) & isTRUE(ingested) & isFALSE(original)) { + if (is.null(.f) & isTRUE(ingested) & isFALSE(original)) { message("Downloading ingested version of data with readr::read_tsv. 
To download the original version and remove this message, set original = TRUE.\n") - FUN <- readr::read_tsv + .f <- readr::read_tsv } - if (is.null(FUN) & (isFALSE(ingested) | isTRUE(original))) { - stop("read-in function was left NULL, but the target file is not ingested or you asked for the original version. Please supply a FUN argument.\n") + if (is.null(.f) & (isFALSE(ingested) | isTRUE(original))) { + stop("read-in function was left NULL, but the target file is not ingested or you asked for the original version. Please supply a .f argument.\n") } # READ raw data raw <- get_file(file = fileid, original = original, ...) # save to temp and then read it in with supplied function - if (!is.null(FUN)) { - get_dataframe_internal(raw, filename = "foo", .f = FUN) + if (!is.null(.f)) { + get_dataframe_internal(raw, filename = "foo", .f = .f) } } @@ -122,14 +122,14 @@ get_dataframe_by_id <- function( #' @export get_dataframe_by_doi <- function ( filedoi, - FUN = NULL, + .f = NULL, original = FALSE, ... ) { filedoi <- prepend_doi(filedoi) # get_file can also take doi now - get_dataframe_by_id(fileid = filedoi, FUN = FUN, original = original, ...) + get_dataframe_by_id(fileid = filedoi, .f = .f, original = original, ...) } #' Write to temp and apply function diff --git a/README.Rmd b/README.Rmd index 7460f4e..a2168e3 100644 --- a/README.Rmd +++ b/README.Rmd @@ -65,7 +65,7 @@ Use the `get_dataframe_*` functions, depending on the input you have. For exampl With a file DOI, we can use the `get_dataframe_by_doi` function: ```{r get_dataframe_by_doi} -nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", +nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") ``` which by default reads in the ingested file (not the original dta) by the [`readr::read_tsv`](https://readr.tidyverse.org/reference/read_delim.html) function. 
@@ -74,27 +74,27 @@ Alternatively, we can download the same file by specifying the filename and the ```{r get_dataframe_by_name_tsv, message=FALSE} nlsw_tsv <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", + dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") ``` -Now, Dataverse often translates rectangular data into an ingested, or "archival" version, which is application-neutral and easily-readable. `read_dataframe_*` defaults to taking this ingested version rather than using the original, through the argument `original = FALSE`. +Now, Dataverse often translates rectangular data into an ingested, or "archival" version, which is application-neutral and easily-readable. `read_dataframe_*` defaults to taking this ingested version rather than using the original, through the argument `original = FALSE`. -This default is safe because you may not have the proprietary software that was originally used. On the other hand, the data may have lost information in the process of the ingestation. +This default is safe because you may not have the proprietary software that was originally used. On the other hand, the data may have lost information in the process of the ingestation. -Instead, to read the same file but its original version, specify `original = TRUE` and set a `FUN` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. +Instead, to read the same file but its original version, specify `original = TRUE` and set an `.f` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. 
```{r get_dataframe_by_name_original} nlsw_original <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - FUN = haven::read_dta, + dataset = "10.70122/FK2/PPIAXE", + .f = haven::read_dta, original = TRUE, server = "demo.dataverse.org") ``` -Note that even though the file prefix is ".tab", we use `read_dta`. +Note that even though the file prefix is ".tab", we use `read_dta`. -Of course, when the dataset is not ingested (such as a Rds file), users would always need to specify a `FUN` argument for the specific file. +Of course, when the dataset is not ingested (such as a Rds file), users would always need to specify an `.f` argument for the specific file. Note the difference between `nls_tsv` and `nls_original`. `nls_original` preserves the data attributes like value labels, whereas `nls_tsv` has dropped this or left this in file metadata. @@ -112,11 +112,11 @@ attr(nlsw_original$race, "labels") # original dta has value labels #### Reading a dataset as a binary file. -In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a `.docx` file), and you want to simply write these files your local disk. To do this, use the more primitive `get_file_*` commands. The arguments are equivalent, except we no longer need a `FUN` argument +In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a `.docx` file), and you want to simply write these files your local disk. To do this, use the more primitive `get_file_*` commands. 
The arguments are equivalent, except we no longer need an `.f` argument ```{r get_file_by_name} nlsw_raw <- get_file_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", + dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") class(nlsw_raw) ``` diff --git a/README.md b/README.md index 7eb8981..a54c15d 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,14 @@ Status](https://travis-ci.org/IQSS/dataverse-client-r.png?branch=master)](https: [![codecov.io](https://codecov.io/github/IQSS/dataverse-client-r/coverage.svg?branch=master)](https://codecov.io/github/IQSS/dataverse-client-r?branch=master) [![Dataverse Project -logo](http://dataverse.org/files/dataverseorg/files/dataverse_project_logo-hp.png "Dataverse Project")](http://dataverse.org) - -The **dataverse** package provides access to [Dataverse -4](http://dataverse.org/) APIs, enabling data search, retrieval, and -deposit, thus allowing R users to integrate public data sharing into the -reproducible research workflow. **dataverse** is the next-generation -iteration of [the **dvn** +logo](http://dataverse.org/files/dataverseorg/files/dataverse_project_logo-hp.png +"Dataverse Project")](http://dataverse.org) + +The **dataverse** package provides access to +[Dataverse 4](http://dataverse.org/) APIs, enabling data search, +retrieval, and deposit, thus allowing R users to integrate public data +sharing into the reproducible research workflow. **dataverse** is the +next-generation iteration of [the **dvn** package](https://cran.r-project.org/package=dvn), which works with Dataverse 3 (“Dataverse Network”) applications. 
**dataverse** includes numerous improvements for data search, retrieval, and deposit, including @@ -89,13 +90,14 @@ For example, we will read a survey dataset on Dataverse, With a file DOI, we can use the `get_dataframe_by_doi` function: ``` r -nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", +nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", server = "demo.dataverse.org") ``` - ## Warning in get_dataframe_by_id(fileid = filedoi, FUN = FUN, original = original, : Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE. + ## Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE. - ## Parsed with column specification: + ## + ## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────── ## cols( ## idcode = col_double(), ## age = col_double(), @@ -127,12 +129,10 @@ called a dataset). ``` r nlsw_tsv <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", + dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") ``` - ## Warning in get_dataframe_by_id(fileid, FUN, original = original, ...): Downloading ingested version of data with read_tsv. To download the original version and remove this warning, set original = TRUE. - Now, Dataverse often translates rectangular data into an ingested, or “archival” version, which is application-neutral and easily-readable. `read_dataframe_*` defaults to taking this ingested version rather than @@ -143,14 +143,14 @@ that was originally used. On the other hand, the data may have lost information in the process of the ingestation. Instead, to read the same file but its original version, specify -`original = TRUE` and set a `FUN` argument. In this case, we know that +`original = TRUE` and set an `.f` argument. 
In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. ``` r nlsw_original <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - FUN = haven::read_dta, + dataset = "10.70122/FK2/PPIAXE", + .f = haven::read_dta, original = TRUE, server = "demo.dataverse.org") ``` @@ -158,7 +158,7 @@ nlsw_original <- get_dataframe_by_name(filename = "nlsw88.tab", Note that even though the file prefix is “.tab”, we use `read_dta`. Of course, when the dataset is not ingested (such as a Rds file), users -would always need to specify a `FUN` argument for the specific file. +would always need to specify an `.f` argument for the specific file. Note the difference between `nls_tsv` and `nls_original`. `nls_original` preserves the data attributes like value labels, whereas `nls_tsv` has @@ -183,11 +183,11 @@ In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a `.docx` file), and you want to simply write these files your local disk. To do this, use the more primitive `get_file_*` commands. The arguments are equivalent, -except we no longer need a `FUN` argument +except we no longer need an `.f` argument ``` r nlsw_raw <- get_file_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", + dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org") class(nlsw_raw) ``` diff --git a/man/get_dataframe.Rd b/man/get_dataframe.Rd index 4e5f5d0..6f4f4c1 100644 --- a/man/get_dataframe.Rd +++ b/man/get_dataframe.Rd @@ -9,14 +9,14 @@ get_dataframe_by_name( filename, dataset = NULL, - FUN = NULL, + .f = NULL, original = FALSE, ... ) -get_dataframe_by_id(fileid, FUN = NULL, original = FALSE, ...) +get_dataframe_by_id(fileid, .f = NULL, original = FALSE, ...) -get_dataframe_by_doi(filedoi, FUN = NULL, original = FALSE, ...) +get_dataframe_by_doi(filedoi, .f = NULL, original = FALSE, ...) 
} \arguments{ \item{filename}{The name of the file of interest, with file extension, for example @@ -26,15 +26,15 @@ get_dataframe_by_doi(filedoi, FUN = NULL, original = FALSE, ...) for example \code{"doi:10.70122/FK2/HXJVJU"}. Alternatively, an object of class \dQuote{dataverse_dataset} obtained by \code{dataverse_contents()}.} -\item{FUN}{The function to used for reading in the raw dataset. This user +\item{.f}{The function to used for reading in the raw dataset. This user must choose the appropriate function: for example if the target is a .rds -file, then \code{FUN} should be \code{readRDS} or \code{readr::read_rds}.} +file, then \code{.f} should be \code{readRDS} or \code{readr::read_rds}.} \item{original}{A logical, defaulting to TRUE. Whether to read the ingested, archival version of the dataset if one exists. The archival versions are tab-delimited -\code{.tab} files so if \code{original = FALSE}, \code{FUN} is set to \code{readr::read_tsv}. +\code{.tab} files so if \code{original = FALSE}, \code{.f} is set to \code{readr::read_tsv}. If functions to read the original version is available, then \code{original = TRUE} -with a specified \code{FUN} is better.} +with a specified \code{.f} is better.} \item{...}{ Arguments passed on to \code{\link[=get_file]{get_file}} @@ -98,7 +98,7 @@ df_from_stata_ingested <- # To use the original file version, or for non-ingested data, -# please specify `original = TRUE` and specify a function in FUN. +# please specify `original = TRUE` and specify a function in .f. 
# A data.frame is still returned, but the if (requireNamespace("readr", quietly = TRUE)) { @@ -108,7 +108,7 @@ if (requireNamespace("readr", quietly = TRUE)) { dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org", original = TRUE, - FUN = readr::read_rds + .f = readr::read_rds ) } @@ -119,7 +119,7 @@ if (requireNamespace("haven", quietly = TRUE)) { dataset = "doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org", original = TRUE, - FUN = haven::read_dta + .f = haven::read_dta ) } } diff --git a/tests/testthat/manual/seed/seed-yaml.R b/tests/testthat/manual/seed/seed-yaml.R index f4dca6b..7b23fb8 100644 --- a/tests/testthat/manual/seed/seed-yaml.R +++ b/tests/testthat/manual/seed/seed-yaml.R @@ -20,7 +20,7 @@ file_csv <- filename = "roster-bulls-1996.tab", dataset = "doi:10.70122/FK2/HXJVJU", original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) ds_1$roster$raw_value <- @@ -29,7 +29,7 @@ ds_1$roster$raw_value <- filename = ds_1$roster$label, dataset = dirname(ds_1$roster$dataFile$persistentId), original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) ds_1$image$raw_value <- @@ -39,7 +39,7 @@ ds_1$image$raw_value <- filename = ds_1$image$label, dataset = dirname(ds_1$image$dataFile$persistentId), original = TRUE, - FUN = readr::read_file + .f = readr::read_file ), "\n" ) diff --git a/tests/testthat/tests-get_dataframe-original-basketball.R b/tests/testthat/tests-get_dataframe-original-basketball.R index 6a83796..c26c900 100644 --- a/tests/testthat/tests-get_dataframe-original-basketball.R +++ b/tests/testthat/tests-get_dataframe-original-basketball.R @@ -10,7 +10,7 @@ test_that("roster-by-name", { filename = expected_ds$roster$label , # A value like "roster-bulls-1996.tab", dataset = dirname(expected_ds$roster$dataFile$persistentId), # A value like "doi:10.70122/FK2/HXJVJU", original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) @@ -27,7 +27,7 @@ 
test_that("roster-by-doi", { get_dataframe_by_doi( filedoi = expected_ds$roster$dataFile$persistentId, # A value like "doi:10.70122/FK2/HXJVJU/SA3Z2V", original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) @@ -44,7 +44,7 @@ test_that("roster-by-id", { get_dataframe_by_id( fileid = expected_ds$roster$dataFile$id, # A value like 1734005 original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) @@ -62,7 +62,7 @@ test_that("image-by-name", { filename = expected_ds$image$label , #"vector-basketball.svg", dataset = dirname(expected_ds$image$dataFile$persistentId), #"doi:10.70122/FK2/HXJVJU", original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) @@ -79,7 +79,7 @@ test_that("image-by-doi", { get_dataframe_by_doi( filedoi = expected_ds$image$dataFile$persistentId, # A value like "doi:10.70122/FK2/HXJVJU/FHV8ZB", original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) @@ -96,7 +96,7 @@ test_that("image-by-id", { get_dataframe_by_id( fileid = expected_ds$image$dataFile$id, # A value like 1734006 original = TRUE, - FUN = readr::read_file + .f = readr::read_file ) expect_equal(substr(actual, 1, 30), substr(expected_file, 1, 30)) From cb0aa10b495e80b5dd12dbde5876e6f6e32a5b8e Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Sat, 2 Jan 2021 13:35:10 -0600 Subject: [PATCH 71/75] tidy --- README.Rmd | 67 ++++++++++++++++++++++++++++++++---------------------- README.md | 64 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 81 insertions(+), 50 deletions(-) diff --git a/README.Rmd b/README.Rmd index a2168e3..286bc6a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -3,7 +3,6 @@ title: "R Client for Dataverse 4 Repositories" output: github_document --- - ```{r 
knitr_options, echo=FALSE, results="hide"} options(width = 120) knitr::opts_chunk$set(results = "hold") @@ -12,8 +11,6 @@ Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") [![CRAN Version](https://www.r-pkg.org/badges/version/dataverse)](https://cran.r-project.org/package=dataverse) ![Downloads](https://cranlogs.r-pkg.org/badges/dataverse) [![Travis-CI Build Status](https://travis-ci.org/IQSS/dataverse-client-r.png?branch=master)](https://travis-ci.org/IQSS/dataverse-client-r) [![codecov.io](https://codecov.io/github/IQSS/dataverse-client-r/coverage.svg?branch=master)](https://codecov.io/github/IQSS/dataverse-client-r?branch=master) - - [![Dataverse Project logo](http://dataverse.org/files/dataverseorg/files/dataverse_project_logo-hp.png "Dataverse Project")](http://dataverse.org) The **dataverse** package provides access to [Dataverse 4](http://dataverse.org/) APIs, enabling data search, retrieval, and deposit, thus allowing R users to integrate public data sharing into the reproducible research workflow. **dataverse** is the next-generation iteration of [the **dvn** package](https://cran.r-project.org/package=dvn), which works with Dataverse 3 ("Dataverse Network") applications. **dataverse** includes numerous improvements for data search, retrieval, and deposit, including use of the (currently in development) **sword** package for data deposit and the **UNF** package for data fingerprinting. @@ -24,12 +21,11 @@ You can find a stable 2017 release on [CRAN](https://cran.r-project.org/package= ```{r, echo = FALSE, eval = FALSE} if (!require("remotes")) { - install.packages("remotes") + install.packages("remotes") } remotes::install_github("iqss/dataverse-client-r") ``` - ```{r} library("dataverse") ``` @@ -60,39 +56,48 @@ The dataverse package provides multiple interfaces to obtain data into R. Users #### Reading data as R objects -Use the `get_dataframe_*` functions, depending on the input you have. 
For example, we will read a survey dataset on Dataverse, [nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPKHI1/ZYATZZ) (`doi:10.70122/FK2/PPKHI1/ZYATZZ`), originally in Stata dta form. +Use the `get_dataframe_*()` functions, depending on the input you have. For example, we will read a survey dataset on Dataverse, [nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPKHI1/ZYATZZ) (`doi:10.70122/FK2/PPKHI1/ZYATZZ`), originally in Stata dta form. With a file DOI, we can use the `get_dataframe_by_doi` function: ```{r get_dataframe_by_doi} -nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", - server = "demo.dataverse.org") +nlsw <- + get_dataframe_by_doi( + filedoi = "10.70122/FK2/PPIAXE/MHDB0O", + server = "demo.dataverse.org" + ) ``` which by default reads in the ingested file (not the original dta) by the [`readr::read_tsv`](https://readr.tidyverse.org/reference/read_delim.html) function. Alternatively, we can download the same file by specifying the filename and the DOI of the "dataset" (in Dataverse, a collection of files is called a dataset). ```{r get_dataframe_by_name_tsv, message=FALSE} -nlsw_tsv <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +nlsw_tsv <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" + ) ``` -Now, Dataverse often translates rectangular data into an ingested, or "archival" version, which is application-neutral and easily-readable. `read_dataframe_*` defaults to taking this ingested version rather than using the original, through the argument `original = FALSE`. +Now, Dataverse often translates rectangular data into an ingested, or "archival" version, which is application-neutral and easily-readable. `read_dataframe_*()` defaults to taking this ingested version rather than using the original, through the argument `original = FALSE`. 
This default is safe because you may not have the proprietary software that was originally used. On the other hand, the data may have lost information in the process of the ingestation. Instead, to read the same file but its original version, specify `original = TRUE` and set an `.f` argument. In this case, we know that `nlsw88.tab` is a Stata `.dta` dataset, so we will use the `haven::read_dta` function. ```{r get_dataframe_by_name_original} -nlsw_original <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - .f = haven::read_dta, - original = TRUE, - server = "demo.dataverse.org") +nlsw_original <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + .f = haven::read_dta, + original = TRUE, + server = "demo.dataverse.org" + ) ``` -Note that even though the file prefix is ".tab", we use `read_dta`. +Note that even though the file prefix is ".tab", we use `haven::read_dta`. Of course, when the dataset is not ingested (such as a Rds file), users would always need to specify an `.f` argument for the specific file. @@ -115,19 +120,24 @@ attr(nlsw_original$race, "labels") # original dta has value labels In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a `.docx` file), and you want to simply write these files your local disk. To do this, use the more primitive `get_file_*` commands. The arguments are equivalent, except we no longer need an `.f` argument ```{r get_file_by_name} -nlsw_raw <- get_file_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +nlsw_raw <- + get_file_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" + ) class(nlsw_raw) ``` #### Reading file metadata -The function `get_file_metadata` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. 
The function `get_dataset` will retrieve the list of files in a dataset. +The function `get_file_metadata()` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. The function `get_dataset()` will retrieve the list of files in a dataset. ```{r, get_dataset} -get_dataset(dataset = "10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +get_dataset( + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" +) ``` ### Data Discovery @@ -161,9 +171,12 @@ Dataverse provides two - basically unrelated - workflows for managing (adding, d d <- service_document() # create a list of metadata -metadat <- list(title = "My Study", - creator = "Doe, John", - description = "An example study") +metadat <- + list( + title = "My Study", + creator = "Doe, John", + description = "An example study" + ) # create the dataset ds <- initiate_sword_dataset("mydataverse", body = metadat) diff --git a/README.md b/README.md index a54c15d..dc24620 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ dataset read in with the appropriate R function. #### Reading data as R objects -Use the `get_dataframe_*` functions, depending on the input you have. +Use the `get_dataframe_*()` functions, depending on the input you have. For example, we will read a survey dataset on Dataverse, [nlsw88.dta](https://demo.dataverse.org/file.xhtml?persistentId=doi:10.70122/FK2/PPKHI1/ZYATZZ) (`doi:10.70122/FK2/PPKHI1/ZYATZZ`), originally in Stata dta form. @@ -90,8 +90,11 @@ For example, we will read a survey dataset on Dataverse, With a file DOI, we can use the `get_dataframe_by_doi` function: ``` r -nlsw <- get_dataframe_by_doi("10.70122/FK2/PPIAXE/MHDB0O", - server = "demo.dataverse.org") +nlsw <- + get_dataframe_by_doi( + filedoi = "10.70122/FK2/PPIAXE/MHDB0O", + server = "demo.dataverse.org" + ) ``` ## Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE. 
@@ -128,15 +131,18 @@ and the DOI of the “dataset” (in Dataverse, a collection of files is called a dataset). ``` r -nlsw_tsv <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +nlsw_tsv <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" + ) ``` Now, Dataverse often translates rectangular data into an ingested, or “archival” version, which is application-neutral and easily-readable. -`read_dataframe_*` defaults to taking this ingested version rather than -using the original, through the argument `original = FALSE`. +`read_dataframe_*()` defaults to taking this ingested version rather +than using the original, through the argument `original = FALSE`. This default is safe because you may not have the proprietary software that was originally used. On the other hand, the data may have lost @@ -148,11 +154,14 @@ Instead, to read the same file but its original version, specify `haven::read_dta` function. ``` r -nlsw_original <- get_dataframe_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - .f = haven::read_dta, - original = TRUE, - server = "demo.dataverse.org") +nlsw_original <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + .f = haven::read_dta, + original = TRUE, + server = "demo.dataverse.org" + ) ``` Note that even though the file prefix is “.tab”, we use `read_dta`. @@ -186,9 +195,12 @@ more primitive `get_file_*` commands. 
The arguments are equivalent, except we no longer need an `.f` argument ``` r -nlsw_raw <- get_file_by_name(filename = "nlsw88.tab", - dataset = "10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +nlsw_raw <- + get_file_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" + ) class(nlsw_raw) ``` @@ -196,13 +208,16 @@ class(nlsw_raw) #### Reading file metadata -The function `get_file_metadata` can also be used similarly. This will +The function `get_file_metadata()` can also be used similarly. This will return a metadata format for ingested tabular files in the `ddi` format. -The function `get_dataset` will retrieve the list of files in a dataset. +The function `get_dataset()` will retrieve the list of files in a +dataset. ``` r -get_dataset(dataset = "10.70122/FK2/PPIAXE", - server = "demo.dataverse.org") +get_dataset( + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" +) ``` ## Dataset (182162): @@ -253,9 +268,12 @@ it. 
This looks something like the following: d <- service_document() # create a list of metadata -metadat <- list(title = "My Study", - creator = "Doe, John", - description = "An example study") +metadat <- + list( + title = "My Study", + creator = "Doe, John", + description = "An example study" + ) # create the dataset ds <- initiate_sword_dataset("mydataverse", body = metadat) From 97ba989893298035b6ae32b9680d55c5c6dd72b9 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Sun, 3 Jan 2021 12:47:18 -0600 Subject: [PATCH 72/75] 1st draft of pkgdown ref #72 --- docs/404.html | 169 ++++++++ docs/ISSUE_TEMPLATE.html | 200 ++++++++++ docs/PULL_REQUEST_TEMPLATE.html | 192 +++++++++ docs/articles/A-introduction.html | 272 +++++++++++++ .../empty-anchor.js | 15 + .../header-attrs-2.6/header-attrs.js | 12 + docs/articles/B-search.html | 204 ++++++++++ .../empty-anchor.js | 15 + .../header-attrs-2.6/header-attrs.js | 12 + docs/articles/C-retrieval.html | 339 ++++++++++++++++ .../empty-anchor.js | 15 + .../header-attrs-2.6/header-attrs.js | 12 + docs/articles/D-archiving.html | 183 +++++++++ .../empty-anchor.js | 15 + .../header-attrs-2.6/header-attrs.js | 12 + docs/articles/index.html | 174 +++++++++ docs/authors.html | 200 ++++++++++ docs/bootstrap-toc.css | 60 +++ docs/bootstrap-toc.js | 159 ++++++++ docs/docsearch.css | 148 +++++++ docs/docsearch.js | 85 ++++ docs/index.html | 358 +++++++++++++++++ docs/link.svg | 12 + docs/news/index.html | 167 ++++++++ docs/pkgdown.css | 367 ++++++++++++++++++ docs/pkgdown.js | 108 ++++++ docs/pkgdown.yml | 10 + docs/reference/Rplot001.png | Bin 0 -> 1011 bytes docs/reference/add_dataset_file.html | 271 +++++++++++++ docs/reference/add_file.html | 248 ++++++++++++ docs/reference/create_dataset.html | 249 ++++++++++++ docs/reference/create_dataverse.html | 226 +++++++++++ docs/reference/dataset_atom.html | 236 +++++++++++ docs/reference/dataset_versions.html | 230 +++++++++++ docs/reference/dataverse.html | 192 +++++++++ 
docs/reference/dataverse_metadata.html | 226 +++++++++++ docs/reference/dataverse_search.html | 282 ++++++++++++++ docs/reference/delete_dataset.html | 226 +++++++++++ docs/reference/delete_dataverse.html | 223 +++++++++++ docs/reference/delete_file.html | 219 +++++++++++ docs/reference/delete_sword_dataset.html | 234 +++++++++++ docs/reference/files.html | 359 +++++++++++++++++ docs/reference/get_dataframe.html | 348 +++++++++++++++++ docs/reference/get_dataframe_internal.html | 175 +++++++++ docs/reference/get_dataset.html | 278 +++++++++++++ docs/reference/get_dataverse.html | 254 ++++++++++++ docs/reference/get_facets.html | 228 +++++++++++ docs/reference/get_file_metadata.html | 228 +++++++++++ docs/reference/get_user_key.html | 208 ++++++++++ docs/reference/index.html | 356 +++++++++++++++++ docs/reference/initiate_sword_dataset.html | 266 +++++++++++++ docs/reference/is_ingested.html | 201 ++++++++++ docs/reference/list_datasets.html | 225 +++++++++++ docs/reference/publish_dataset.html | 232 +++++++++++ docs/reference/publish_dataverse.html | 217 +++++++++++ docs/reference/publish_sword_dataset.html | 237 +++++++++++ docs/reference/service_document.html | 221 +++++++++++ docs/reference/set_dataverse_metadata.html | 227 +++++++++++ 58 files changed, 10837 insertions(+) create mode 100644 docs/404.html create mode 100644 docs/ISSUE_TEMPLATE.html create mode 100644 docs/PULL_REQUEST_TEMPLATE.html create mode 100644 docs/articles/A-introduction.html create mode 100644 docs/articles/A-introduction_files/accessible-code-block-0.0.1/empty-anchor.js create mode 100644 docs/articles/A-introduction_files/header-attrs-2.6/header-attrs.js create mode 100644 docs/articles/B-search.html create mode 100644 docs/articles/B-search_files/accessible-code-block-0.0.1/empty-anchor.js create mode 100644 docs/articles/B-search_files/header-attrs-2.6/header-attrs.js create mode 100644 docs/articles/C-retrieval.html create mode 100644 
docs/articles/C-retrieval_files/accessible-code-block-0.0.1/empty-anchor.js create mode 100644 docs/articles/C-retrieval_files/header-attrs-2.6/header-attrs.js create mode 100644 docs/articles/D-archiving.html create mode 100644 docs/articles/D-archiving_files/accessible-code-block-0.0.1/empty-anchor.js create mode 100644 docs/articles/D-archiving_files/header-attrs-2.6/header-attrs.js create mode 100644 docs/articles/index.html create mode 100644 docs/authors.html create mode 100644 docs/bootstrap-toc.css create mode 100644 docs/bootstrap-toc.js create mode 100644 docs/docsearch.css create mode 100644 docs/docsearch.js create mode 100644 docs/index.html create mode 100644 docs/link.svg create mode 100644 docs/news/index.html create mode 100644 docs/pkgdown.css create mode 100644 docs/pkgdown.js create mode 100644 docs/pkgdown.yml create mode 100644 docs/reference/Rplot001.png create mode 100644 docs/reference/add_dataset_file.html create mode 100644 docs/reference/add_file.html create mode 100644 docs/reference/create_dataset.html create mode 100644 docs/reference/create_dataverse.html create mode 100644 docs/reference/dataset_atom.html create mode 100644 docs/reference/dataset_versions.html create mode 100644 docs/reference/dataverse.html create mode 100644 docs/reference/dataverse_metadata.html create mode 100644 docs/reference/dataverse_search.html create mode 100644 docs/reference/delete_dataset.html create mode 100644 docs/reference/delete_dataverse.html create mode 100644 docs/reference/delete_file.html create mode 100644 docs/reference/delete_sword_dataset.html create mode 100644 docs/reference/files.html create mode 100644 docs/reference/get_dataframe.html create mode 100644 docs/reference/get_dataframe_internal.html create mode 100644 docs/reference/get_dataset.html create mode 100644 docs/reference/get_dataverse.html create mode 100644 docs/reference/get_facets.html create mode 100644 docs/reference/get_file_metadata.html create mode 100644 
docs/reference/get_user_key.html create mode 100644 docs/reference/index.html create mode 100644 docs/reference/initiate_sword_dataset.html create mode 100644 docs/reference/is_ingested.html create mode 100644 docs/reference/list_datasets.html create mode 100644 docs/reference/publish_dataset.html create mode 100644 docs/reference/publish_dataverse.html create mode 100644 docs/reference/publish_sword_dataset.html create mode 100644 docs/reference/service_document.html create mode 100644 docs/reference/set_dataverse_metadata.html diff --git a/docs/404.html b/docs/404.html new file mode 100644 index 0000000..f33a2e6 --- /dev/null +++ b/docs/404.html @@ -0,0 +1,169 @@ + + + + + + + + +Page not found (404) • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +Content not found. Please use links in the navbar. + +
+ + + +
+ + + +
+ + +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + + + diff --git a/docs/ISSUE_TEMPLATE.html b/docs/ISSUE_TEMPLATE.html new file mode 100644 index 0000000..b595c17 --- /dev/null +++ b/docs/ISSUE_TEMPLATE.html @@ -0,0 +1,200 @@ + + + + + + + + +NA • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + + +

Please specify whether your issue is about:

+
    +
  • + +a possible bug
  • +
  • + +a question about package functionality
  • +
  • + +a suggested code or documentation change, improvement to the code, or feature request
  • +
+

If you are reporting (1) a bug or (2) a question about code, please supply:

+ +

Put your code here:

+
+## load package
+library("dataverse")
+
+## code goes here
+
+
+## session info for your system
+sessionInfo()
+ + +
+ + + +
+ + + +
+ + +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + + + diff --git a/docs/PULL_REQUEST_TEMPLATE.html b/docs/PULL_REQUEST_TEMPLATE.html new file mode 100644 index 0000000..0d3a629 --- /dev/null +++ b/docs/PULL_REQUEST_TEMPLATE.html @@ -0,0 +1,192 @@ + + + + + + + + +NA • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + + +

Please ensure the following before submitting a PR:

+
    +
  • + +if suggesting code changes or improvements, open an issue first
  • +
  • + +for all but trivial changes (e.g., typo fixes), add your name to DESCRIPTION +
  • +
  • + +for all but trivial changes (e.g., typo fixes), document your change in NEWS.md with a parenthetical reference to the issue number being addressed
  • +
  • + +if changing documentation, edit files in /R not /man and run devtools::document() to update documentation
  • +
  • + +add code or new test files to /tests for any new functionality or bug fix
  • +
  • + +make sure R CMD check runs without error before submitting the PR
  • +
+ + +
+ + + +
+ + + +
+ + +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/A-introduction.html b/docs/articles/A-introduction.html new file mode 100644 index 0000000..385d033 --- /dev/null +++ b/docs/articles/A-introduction.html @@ -0,0 +1,272 @@ + + + + + + + +Introduction to Dataverse • dataverse + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

The dataverse package is the official R client for Dataverse 4 data repositories. The package enables data search, retrieval, and deposit with any Dataverse installation, thus allowing R users to integrate public data sharing into the reproducible research workflow.

+

In addition to this introduction, the package contains three additional vignettes covering:

+ +

They can be accessed from CRAN or from within R using vignette(package = "dataverse").

+

The dataverse client package can be installed from CRAN, and you can find the latest development version and report any issues on GitHub:

+
+if (!require("remotes")) {
+    install.packages("remotes")
+}
+remotes::install_github("iqss/dataverse-client-r")
+library("dataverse")
+

(Note: dataverse is the next-generation iteration of the dvn package, which works with Dataverse 3 (“Dataverse Network”) applications. See the appendix of this vignette for a cross-walk of functionality between dvn and dataverse.)

+
+

+Quick Start

+

Dataverse has some terminology that is worth quickly reviewing before showing how to work with Dataverse in R. Dataverse is an application that can be installed in many places. As a result, dataverse can work with any installation, but you need to specify which installation you want to work with. This can be set by default with an environment variable, DATAVERSE_SERVER:

+
+library("dataverse")
+Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
+

This should be the Dataverse server, without the “https” prefix or the “/api” URL path, etc. The package attempts to compensate for any malformed values, though.

+

Within a given Dataverse installation, organizations or individuals can create objects that are also called “Dataverses”. These Dataverses can then contain other dataverses, which can contain other dataverses, and so on. They can also contain datasets which in turn contain files. You can think of Harvard’s Dataverse as a top-level installation, where an institution might have a dataverse that contains a subsidiary dataverse for each researcher at the organization, who in turn publishes all files relevant to a given study as a dataset.

+

You can search for and retrieve data without a Dataverse account for that specific Dataverse installation. For example, to search for data files or datasets that mention “ecological inference”, we can just do:

+
+dataverse_search("ecological inference")[c("name", "type", "description")]
+

The search vignette describes this functionality in more detail. To retrieve a data file, we need to investigate the dataset being returned and look at what files it contains using a variety of functions, the last of which - get_file() - can retrieve the files as raw vectors:

+ +

For “native” Dataverse features (such as user account controls) or to create and publish a dataset, you will need an API key linked to a Dataverse installation account. Instructions for obtaining an account and setting up an API key are available in the Dataverse User Guide. (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called DATAVERSE_KEY. It can be set within R using:

+
+Sys.setenv("DATAVERSE_KEY" = "examplekey12345")
+

With that set, you can easily create a new dataverse, create a dataset within that dataverse, push files to the dataset, and release it:

+
+# create a dataverse
+dat <- create_dataverse("mydataverse")
+
+# create a list of metadata
+metadat <- list(title = "My Study",
+                creator = "Doe, John",
+                description = "An example study")
+
+# create the dataset
+dat <- initiate_dataset("mydataverse", body = metadat)
+
+# add files to dataset
+tmp <- tempfile()
+write.csv(iris, file = tmp)
+f <- add_file(dat, file = tmp)
+
+# publish new dataset
+publish_dataset(dat)
+

Your data are now publicly accessible.

+
+
+

+Appendix: dvn to dataverse Crosswalk

+

The original Dataverse client for R was called dvn; it worked with Dataverse versions <= 3 and was removed from CRAN in favor of dataverse in 2018. dvn provided functionality for searching, retrieving, and depositing data. Here is a cross-walk of functionality in case you were already familiar with the dvn package:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
API Category +dataverse functions +dvn functions
Data Searchdataverse_search()dvSearch()
Data Retrievalget_file_metadata()dvMetadata()
get_file()
Data Depositcreate_dataverse()
initiate_dataset()dvCreateStudy()
update_dataset()dvEditStudy()
add_file()addFile()
delete_file()dvDeleteFile()
publish_sword_dataset()dvReleaseStudy()
delete_sword_dataset()
service_document()dvServiceDoc()
dataset_statement()dvStudyStatement()
list_datasets()dvUserStudies()
+
+
+ + + +
+ + + +
+ +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + diff --git a/docs/articles/A-introduction_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/A-introduction_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/A-introduction_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/A-introduction_files/header-attrs-2.6/header-attrs.js b/docs/articles/A-introduction_files/header-attrs-2.6/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/A-introduction_files/header-attrs-2.6/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). 
+document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/articles/B-search.html b/docs/articles/B-search.html new file mode 100644 index 0000000..0183446 --- /dev/null +++ b/docs/articles/B-search.html @@ -0,0 +1,204 @@ + + + + + + + +Data Search and Discovery • dataverse + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

Searching for data within Dataverse is quite easy using the dataverse_search() function. The simplest searches simply consist of a query string:

+
+library("dataverse")
+Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
+dataverse_search("Gary King")[c("name")]
+
## 10 of 1043 results retrieved
+
##                                                                          name
+## 1                                       00698McArthur-King-BoxCoverSheets.pdf
+## 2                                      00698McArthur-King-MemoOfAgreement.pdf
+## 3                                     00698McArthur-King-StudyDescription.pdf
+## 4                                                            077_mod1_s2m.tab
+## 5                                      10 Million International Dyadic Events
+## 6             1998 Jewish Community Study of the Coachella Valley, California
+## 7                                               2002 State Legislative Survey
+## 8  A Comparative Study between Gurukul System and Western System of Education
+## 9    A Demographic and Attitudinal Study of the Jewish Community of St. Louis
+## 10       A Demographic Study of the Jewish Community of Atlantic County, 1985
+

The results are paginated, so users can rely upon the per_page and start arguments to request subsequent pages of results. We’ll start at 6 to show that we retrieve the last four results from the previous query plus 16 more (due to per_page = 20):

+
+dataverse_search("Gary King", start = 6, per_page = 20)[c("name")]
+
## 20 of 1043 results retrieved
+
##                                                                          name
+## 1                                               2002 State Legislative Survey
+## 2  A Comparative Study between Gurukul System and Western System of Education
+## 3    A Demographic and Attitudinal Study of the Jewish Community of St. Louis
+## 4        A Demographic Study of the Jewish Community of Atlantic County, 1985
+## 5          A Demographic Study of the Jewish Community of Greater Kansas City
+## 6     A Demographic Study of the Jewish Community of Greater Washington, 1983
+## 7                                     A Lexicial Index of Electoral Democracy
+## 8         A Population Study of the Jewish Community of Metrowest, New Jersey
+## 9               A Population Study of the Jewish Community of Rochester, 1986
+## 10                    A Population Study of the Jewish Community of Worcester
+## 11                                  A Study of Jewish Culture in the Bay Area
+## 12        A Unified Model of Cabinet Dissolution in Parliamentary Democracies
+## 13                        ABC News / The Washington Post  Poll: January, 1988
+## 14 ABC News / The Washington Post poll # 7925:  Social Security/1984 Election
+## 15                        ABC News / The Washington Post Poll: December, 1987
+## 16                                     ABC News Gary Hart Poll, December 1987
+## 17                                     ABC News Gary Hart Poll, December 1987
+## 18                                            ABC News Iraq Poll, August 1990
+## 19                                   ABC News Kosovo Peace Poll #1, June 1999
+## 20                    ABC News New Hampshire Primary Voter Poll, January 2000
+

More complicated searches can specify metadata fields like title and restrict results to a specific type of Dataverse object (a “dataverse”, “dataset”, or “file”):

+
+ei <- dataverse_search(author = "Gary King", title = "Ecological Inference", type = "dataset", per_page = 20)
+
## 20 of 867 results retrieved
+
+# fields returned
+names(ei)
+# names of datasets
+ei$name
+
## [1] "name"         "type"         "url"          "global_id"    "description"  "published_at" "citationHtml"
+## [8] "citation"     "authors"
+##  [1] "10 Million International Dyadic Events"
+##  [2] "3D Dust map from Green et al. (2015)"
+##  [3] "[KRISNA02]³ New Religious Movements : Case of ISKCON"
+##  [4] "A Comparative Study between Gurukul System and Western System of Education"
+##  [5] "A Lexicial Index of Electoral Democracy"
+##  [6] "A Statistical Inference Engine for Small, Dependent Samples  [Version 2.310]"
+##  [7] "A Unified Model of Cabinet Dissolution in Parliamentary Democracies"
+##  [8] "ABC News / The Washington Post poll # 7925:  Social Security/1984 Election"
+##  [9] "ABC News Iraq Poll, August 1990"
+## [10] "ABC News/The Washington Post Poll:  Los Angeles Race Riots"
+## [11] "ABC News/The Washington Post Poll:  Race Relations"
+## [12] "ABC News/Washington Post Los Angeles Beating Poll, April 1992"
+## [13] "ABC News/Washington Post Poll #1, September 1990"
+## [14] "ABC News/Washington Post Race Relations Poll, May 1992"
+## [15] "ABC News/Washington Post Reagan 100 Days Poll, April 1981"
+## [16] "Afrobarometer Round 3: The Quality of Democracy and Governance in 18 African Countries, 2005-2006"
+## [17] "Afrobarometer Round 3: The Quality of Democracy and Governance in Benin, 2005"
+## [18] "Afrobarometer Round 3: The Quality of Democracy and Governance in Botswana, 2005"
+## [19] "Afrobarometer Round 3: The Quality of Democracy and Governance in Cape Verde, 2005"
+## [20] "Afrobarometer Round 3: The Quality of Democracy and Governance in Ghana, 2005"
+

Once datasets and files are identified, it is easy to download and use them directly in R. See the “Data Retrieval” vignette for details.

+
+ + + +
+ + + +
+ +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + diff --git a/docs/articles/B-search_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/B-search_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/B-search_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/B-search_files/header-attrs-2.6/header-attrs.js b/docs/articles/B-search_files/header-attrs-2.6/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/B-search_files/header-attrs-2.6/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). 
+document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/articles/C-retrieval.html b/docs/articles/C-retrieval.html new file mode 100644 index 0000000..a5335ca --- /dev/null +++ b/docs/articles/C-retrieval.html @@ -0,0 +1,339 @@ + + + + + + + +Data Retrieval and Reuse • dataverse + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

This vignette shows how to download data from Dataverse using the dataverse package. We’ll focus on a Dataverse repository that contains supplemental files for Jamie Monogan’s book Political Analysis Using R, which is stored at Harvard University’s IQSS Dataverse Network:

+
+

Monogan, Jamie, 2015, “Political Analysis Using R: Example Code and Data, Plus Data for Practice Problems”, doi:10.7910/DVN/ARKOTI, Harvard Dataverse, V1, UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==

+
+

This study is persistently retrievable by a “Digital Object Identifier (DOI)”: https://doi.org/10.7910/DVN/ARKOTI and the citation above (taken from the Dataverse page) includes a “Universal Numeric Fingerprint (UNF)”: UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==, which provides a versioned, multi-file hash for the entire study, which contains 32 files.

+

If you don’t already know what datasets and files you want to use from Dataverse, see the “Data Search” vignette for guidance on data search and discovery.

+
+

+Retrieving Dataset and File Metadata

+

We will download these files and examine them directly in R using the dataverse package. To begin, we need to load the package and use the get_dataset() function to retrieve some basic metadata about the dataset:

+
+library("dataverse")
+Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
+(dataset <- get_dataset("doi:10.7910/DVN/ARKOTI"))
+
## Dataset (75170):
+## Version: 1.0, RELEASED
+## Release Date: 2015-07-07T02:57:02Z
+## License: CC0
+## 17 Files:
+##                           label version      id                  contentType
+## 1                  alpl2013.tab       2 2692294    text/tab-separated-values
+## 2                   BPchap7.tab       2 2692295    text/tab-separated-values
+## 3                   chapter01.R       2 2692202 text/plain; charset=US-ASCII
+## 4                   chapter02.R       2 2692206 text/plain; charset=US-ASCII
+## 5                   chapter03.R       2 2692210 text/plain; charset=US-ASCII
+## 6                   chapter04.R       2 2692204 text/plain; charset=US-ASCII
+## 7                   chapter05.R       2 2692205 text/plain; charset=US-ASCII
+## 8                   chapter06.R       2 2692212 text/plain; charset=US-ASCII
+## 9                   chapter07.R       2 2692209 text/plain; charset=US-ASCII
+## 10                  chapter08.R       2 2692208 text/plain; charset=US-ASCII
+## 11                  chapter09.R       2 2692211 text/plain; charset=US-ASCII
+## 12                  chapter10.R       1 2692203 text/plain; charset=US-ASCII
+## 13                  chapter11.R       1 2692207 text/plain; charset=US-ASCII
+## 14 comprehensiveJapanEnergy.tab       2 2692296    text/tab-separated-values
+## 15         constructionData.tab       2 2692293    text/tab-separated-values
+## 16             drugCoverage.csv       1 2692233 text/plain; charset=US-ASCII
+## 17         hanmerKalkanANES.tab       2 2692290    text/tab-separated-values
+## 18                 hmnrghts.tab       2 2692298    text/tab-separated-values
+## 19                 hmnrghts.txt       1 2692238                   text/plain
+## 20                   levant.tab       2 2692289    text/tab-separated-values
+## 21                       LL.csv       1 2692228 text/plain; charset=US-ASCII
+## 22                 moneyDem.tab       2 2692292    text/tab-separated-values
+## 23            owsiakJOP2013.tab       2 2692297    text/tab-separated-values
+## 24                PESenergy.csv       1 2692230 text/plain; charset=US-ASCII
+## 25                  pts1994.csv       1 2692229 text/plain; charset=US-ASCII
+## 26                  pts1995.csv       1 2692231 text/plain; charset=US-ASCII
+## 27                 sen113kh.ord       1 2692239 text/plain; charset=US-ASCII
+## 28                SinghEJPR.tab       2 2692299    text/tab-separated-values
+## 29                 SinghJTP.tab       2 2692288    text/tab-separated-values
+## 30                 stdSingh.tab       2 2692291    text/tab-separated-values
+## 31                       UN.csv       1 2692232 text/plain; charset=US-ASCII
+## 32                  war1800.tab       2 2692300    text/tab-separated-values
+

The output prints some basic metadata and then the str() of the files data frame returned by the call. This lists all of the files in the dataset along with a considerable amount of metadata about each. We can take a quick glance at these files using:

+
dataset$files[c("filename", "contentType")]
+

This shows that there are indeed 32 files, a mix of .R code files and tab- and comma-separated data files.

+

You can also retrieve more extensive metadata using dataset_metadata():

+
+str(dataset_metadata("doi:10.7910/DVN/ARKOTI"), 1)
+
## List of 2
+##  $ displayName: chr "Citation Metadata"
+##  $ fields     :'data.frame': 7 obs. of  4 variables:
+

We’ll focus here on the code and data files for Chapter 2 from the book.

+
+
+

+Retrieving Files

+

Let’s start by grabbing the code using get_file() (note that this always returns a raw vector):

+
+code3 <- get_file("chapter03.R", "doi:10.7910/DVN/ARKOTI")
+writeBin(code3, "chapter03.R")
+

Now we’ll get the corresponding data and save it locally. For this code we need two data files:

+
+writeBin(get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI"),
+         "constructionData.dta")
+writeBin(get_file("PESenergy.csv", "doi:10.7910/DVN/ARKOTI"),
+         "PESenergy.csv")
+

To confirm that the data look the way we want, we can also (perhaps alternatively) load it directly into R:

+
+constructionData <- foreign::read.dta("constructionData.dta")
+str(constructionData)
+PESenergy <- utils::read.table("PESenergy.csv")
+str(PESenergy)
+
## 'data.frame':    50 obs. of  55 variables:
+##  $ year                      : int  1997 1997 1997 1997 1997 1997 1997 1997 1997 1997 ...
+##  $ stno                      : int  1 2 3 4 5 6 7 8 9 10 ...
+##  $ totalreg                  : int  329 500 314 963 2106 643 634 239 1996 880 ...
+##  $ totalhealth               : int  300 424 263 834 1859 554 501 204 1640 732 ...
+##  $ raneyfolded97             : num  0.58 0.69 0.85 0.63 0.5 ...
+##  $ healthagenda97            : int  49 180 137 220 1409 153 324 40 408 157 ...
+##  $ predictedtotalig          : num  51.8 99 81.8 111.2 224.1 ...
+##  $ supplytotalhealth         : int  1168 6991 4666 9194 70014 8847 7845 1438 35363 13471 ...
+##  $ totalhealthsupplysq       : int  136 4887 2177 8453 490196 7827 6154 207 125054 18147 ...
+##  $ partratetotalhealth       : num  2.48 1.09 1.09 1.4 0.35 ...
+##  $ ighealthcare              : int  29 76 51 129 247 89 133 35 356 148 ...
+##  $ supplydirectpatientcare   : int  1137 6687 4458 8785 66960 8320 7439 1365 33793 12760 ...
+##  $ dpcsupplysq               : int  129 4472 1987 7718 448364 6922 5534 186 114197 16282 ...
+##  $ partratedpc               : num  1.14 0.51 0.43 0.68 0.17 ...
+##  $ igdpcare                  : int  13 34 19 60 112 40 67 12 212 74 ...
+##  $ supplypharmprod           : int  0 174 78 229 2288 340 202 36 962 360 ...
+##  $ pharmsupplysq             : int  0 30276 6084 52441 5234944 115600 40804 1296 925444 129600 ...
+##  $ partratepharmprod         : num  0 10.34 19.23 5.24 2.05 ...
+##  $ igpharmprod               : int  4 18 15 12 47 23 22 12 46 32 ...
+##  $ supplybusiness            : int  0 51 28 93 315 55 36 14 317 78 ...
+##  $ businesssupplysq          : int  0 2601 784 8649 99225 3025 1296 196 100489 6084 ...
+##  $ partratebusness           : num  0 1.96 14.29 15.05 6.03 ...
+##  $ igbusiness                : int  2 1 4 14 19 5 4 2 25 6 ...
+##  $ supplygovt                : int  14 26 80 23 70 71 105 2 67 176 ...
+##  $ govsupplysq               : num  0.02 0.07 0.64 0.05 0.49 ...
+##  $ partrategov               : num  0 38.5 2.5 30.4 10 ...
+##  $ iggovt                    : int  0 10 2 7 7 1 8 0 12 2 ...
+##  $ supplyadvocacy            : int  16 37 14 57 344 54 51 18 206 76 ...
+##  $ advossq                   : int  256 1369 196 3249 118336 2916 2601 324 42436 5776 ...
+##  $ partrateadvo              : num  31.25 16.22 28.57 31.58 8.72 ...
+##  $ ig97advoc                 : int  5 6 4 18 30 7 9 4 26 17 ...
+##  $ rnmedschools              : int  1 16 8 7 37 7 12 3 18 21 ...
+##  $ rnmedschoolssq            : int  1 256 64 49 1369 49 144 9 324 441 ...
+##  $ rnmedschoolpartrate       : num  100 0 12.5 28.57 5.41 ...
+##  $ rnmedschooligs            : int  1 0 1 2 2 0 1 0 6 1 ...
+##  $ healthprofessionals       : int  12890 128980 82140 122760 749620 111550 121110 22740 471270 215670 ...
+##  $ healthprofessionalssquared: int  16615 1663584 674698 1507002 56193014 1244340 1466763 51711 22209541 4651355 ...
+##  $ partrateprofessionals     : num  0.03 0.01 0.01 0.01 0 ...
+##  $ ighealthprofessionals     : int  4 7 6 16 30 13 22 5 29 16 ...
+##  $ predictdpcpartrate        : num  1.175 0.915 1.016 0.826 0.348 ...
+##  $ predictdpcig              : num  23.1 49.7 39.4 58.8 103.5 ...
+##  $ predictprofpartrate       : num  0.02475 0.01383 0.01788 0.01434 0.00579 ...
+##  $ predictprofig             : num  7.59 12.58 10.69 12.34 22.47 ...
+##  $ predictmedschoolparttrate : num  17.39 8.08 12.3 12.95 5.02 ...
+##  $ predictmedschoolig        : num  0.355 1.269 0.774 0.713 2.65 ...
+##  $ predictadvopartrate       : num  31.9 26.4 32.5 21.6 13 ...
+##  $ predictadvoig             : num  5.96 7.98 5.76 9.83 28.53 ...
+##  $ predictbuspartrate        : num  25.78 18.08 21.33 13.1 7.27 ...
+##  $ predictbusig              : num  2.58 7.96 5.66 11.66 20.04 ...
+##  $ predictpharmpartrate      : num  21.38 15.22 18.52 13.44 4.14 ...
+##  $ predictpharmig            : num  11.3 18.1 14.4 20.1 45.1 ...
+##  $ predictgovpartrate        : num  14.41 12.61 5.84 13.03 6.93 ...
+##  $ predictgovig              : num  2.06 2.43 3.78 2.35 3.57 ...
+##  $ predicttotalpartrate      : num  2.41 1.823 2.047 1.623 0.752 ...
+##  $ predicttotalig            : num  54.2 99.2 81.9 114.8 228.3 ...
+##  - attr(*, "datalabel")= chr ""
+##  - attr(*, "time.stamp")= chr " 1 Jun 2013 16:59"
+##  - attr(*, "formats")= chr  "%8.0g" "%8.0g" "%8.0g" "%8.0g" ...
+##  - attr(*, "types")= int  252 251 252 252 254 252 254 253 253 254 ...
+##  - attr(*, "val.labels")= chr  "" "" "" "" ...
+##  - attr(*, "var.labels")= chr  "Year" "StNo." "97 TotalReg" "97Total-Health" ...
+##  - attr(*, "version")= int 12
+## 'data.frame':    181 obs. of  1 variable:
+##  $ V1: Factor w/ 181 levels "Apr-69,5,3.4,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,39.2",..: 31 62 47 107 1 122 92 77 16 167 ...
+

In addition to visual inspection, we can compare the UNF signatures for each dataset against what is reported by Dataverse to confirm that we received the correct files:

+
+library("UNF")
+unf(constructionData)
+unf(PESenergy)
+dataset$files[c("label", "UNF")]
+
## UNF6:+4pc5114xS0ryr1sSvdX6g==
+## UNF6:TD7TEMZyrX4iGTlTsUKQDg==
+##                           label                            UNF
+## 1                  alpl2013.tab UNF:6:d9ZNXvmiPfiunSAiXRpVfg==
+## 2                   BPchap7.tab UNF:6:B3/HJbnzktaX5eEJA2ItiA==
+## 3                   chapter01.R                           <NA>
+## 4                   chapter02.R                           <NA>
+## 5                   chapter03.R                           <NA>
+## 6                   chapter04.R                           <NA>
+## 7                   chapter05.R                           <NA>
+## 8                   chapter06.R                           <NA>
+## 9                   chapter07.R                           <NA>
+## 10                  chapter08.R                           <NA>
+## 11                  chapter09.R                           <NA>
+## 12                  chapter10.R                           <NA>
+## 13                  chapter11.R                           <NA>
+## 14 comprehensiveJapanEnergy.tab UNF:6:Vhb3oZb9m4Nk9N7s6UAHGg==
+## 15         constructionData.tab UNF:6:+4pc5114xS0ryr1sSvdX6g==
+## 16             drugCoverage.csv                           <NA>
+## 17         hanmerKalkanANES.tab UNF:6:lrQrhDAXFc8lSRP9muJslw==
+## 18                 hmnrghts.tab UNF:6:uEg24jBA2ht0P4WeNLjI+w==
+## 19                 hmnrghts.txt                           <NA>
+## 20                   levant.tab UNF:6:zlgG7+JXsIZYvS383eQOvA==
+## 21                       LL.csv                           <NA>
+## 22                 moneyDem.tab UNF:6:7M/QM5i6IM/VUM94UJjJUQ==
+## 23            owsiakJOP2013.tab UNF:6:0ZEvCFuUQms2zYD57hmwNQ==
+## 24                PESenergy.csv                           <NA>
+## 25                  pts1994.csv                           <NA>
+## 26                  pts1995.csv                           <NA>
+## 27                 sen113kh.ord                           <NA>
+## 28                SinghEJPR.tab UNF:6:iDGp9dXOl4SiR+rCBWo8Tw==
+## 29                 SinghJTP.tab UNF:6:lDCyZ7YQF5O++SRsxh2kGA==
+## 30                 stdSingh.tab UNF:6:A5gwtn5q/ewkTMpcQEQ73w==
+## 31                       UN.csv                           <NA>
+## 32                  war1800.tab UNF:6:jJ++mepKcv9JbJOOPLMf2Q==
+
+
+

+Reusing Files and Reproducing Analysis

+

To reproduce the analysis, we can simply run the code file either as a system() call or directly in R using source() (note this particular file begins with an rm() call so you may want to run it in a new environment):

+
+# Option 1
+system("Rscript chapter03.R")
+
+# Option 2
+source("chapter03.R", local=new.env())
+

Any well-produced set of analysis reproduction files, like this one, should run without error once the data and code are in-hand. Troubleshooting analysis files is beyond the scope of this vignette, but common sources of error are:

+
    +
  1. The working directory is not set to the one the author intended. This can prevent code files from finding datasets or other code files at their expected relative paths.
  2. +
  3. Your local machine hasn’t downloaded or installed all the necessary datasets and packages.
  4. +
  5. The functions called in the code have changed since the script was developed.
  6. +
+

To archive your own reproducible analyses using Dataverse, see the “Archiving Data” vignette.

+
+
+ + + +
+ + + +
+ +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + diff --git a/docs/articles/C-retrieval_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/C-retrieval_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/C-retrieval_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/C-retrieval_files/header-attrs-2.6/header-attrs.js b/docs/articles/C-retrieval_files/header-attrs-2.6/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/C-retrieval_files/header-attrs-2.6/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). 
+document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/articles/D-archiving.html b/docs/articles/D-archiving.html new file mode 100644 index 0000000..dbafd7f --- /dev/null +++ b/docs/articles/D-archiving.html @@ -0,0 +1,183 @@ + + + + + + + +Data Archiving • dataverse + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

This vignette describes how to archive data into Dataverse directly from R.

+
+library("dataverse")
+Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
+
+

+SWORD-based Data Archiving

+

The main data archiving (or “deposit”) workflow for Dataverse is built on SWORD v2.0. This means that to create a new dataset listing, you will first have to initialize a dataset entry with some metadata, add one or more files to the dataset, and then publish it. This looks something like the following:

+
+# retrieve your service document
+d <- service_document()
+
+# list current datasets in a dataverse
+list_datasets("mydataverse")
+
+# create a new dataset
+## create a list of metadata
+metadat <- list(title = "My Study",
+                creator = "Doe, John",
+                description = "An example study")
+## initiate the dataset
+dat <- initiate_sword_dataset("mydataverse", body = metadat)
+

Once the dataset is initiated, it is possible to add and delete files:

+
+tmp <- tempfile()
+write.csv(iris, file = tmp)
+f <- add_file(dat, file = tmp)
+

The add_file() function accepts, as its file argument, a character vector of file names, a data.frame, or a list of R objects. Files can be deleted using delete_file(). Once the dataset is finalized, it can be published using publish_dataset():

+ +

And it will then show up in the list of published datasets returned by list_datasets(dat).

+
+
+

+Native API

+

Dataverse also implements a second way to release datasets, called the “native” API. It is similar to the SWORD API:

+
+# create the dataset
+ds <- create_dataset("mydataverse")
+
+# add files
+tmp <- tempfile()
+write.csv(iris, file = tmp)
+f <- add_dataset_file(file = tmp, dataset = ds)
+
+# publish dataset
+publish_dataset(ds)
+
+# dataset will now be published
+get_dataverse("mydataverse")
+
+
+ + + +
+ + + +
+ +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + diff --git a/docs/articles/D-archiving_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/D-archiving_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/D-archiving_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/D-archiving_files/header-attrs-2.6/header-attrs.js b/docs/articles/D-archiving_files/header-attrs-2.6/header-attrs.js new file mode 100644 index 0000000..dd57d92 --- /dev/null +++ b/docs/articles/D-archiving_files/header-attrs-2.6/header-attrs.js @@ -0,0 +1,12 @@ +// Pandoc 2.9 adds attributes on both header and div. We remove the former (to +// be compatible with the behavior of Pandoc < 2.8). 
+document.addEventListener('DOMContentLoaded', function(e) { + var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); + var i, h, a; + for (i = 0; i < hs.length; i++) { + h = hs[i]; + if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 + a = h.attributes; + while (a.length > 0) h.removeAttribute(a[0].name); + } +}); diff --git a/docs/articles/index.html b/docs/articles/index.html new file mode 100644 index 0000000..87b696c --- /dev/null +++ b/docs/articles/index.html @@ -0,0 +1,174 @@ + + + + + + + + +Articles • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + + +
+
+ + +
+ + +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + + + diff --git a/docs/authors.html b/docs/authors.html new file mode 100644 index 0000000..dd23d48 --- /dev/null +++ b/docs/authors.html @@ -0,0 +1,200 @@ + + + + + + + + +Citation and Authors • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + +
+ +
+
+ + +

Thomas J. Leeper (). dataverse: R Client for Dataverse 4. R package version 0.2.1.9002.

+
@Manual{,
+  title = {dataverse: R Client for Dataverse 4},
+  author = {Thomas J. Leeper},
+  note = {R package version 0.2.1.9002},
+}
+ + + +
    +
  • +

    Will Beasley. Author, maintainer. +

    +
  • +
  • +

    Thomas J. Leeper. Author. +

    +
  • +
  • +

    Philip Durbin. Author. +

    +
  • +
  • +

    Shiro Kuriwaki. Author. +

    +
  • +
  • +

    Sebastian Karcher. Author. +

    +
  • +
  • +

    Jan Kanis. Contributor. +

    +
  • +
+ +
+ +
+ + + +
+ + +
+

Site built with pkgdown 1.6.1.

+
+ +
+
+ + + + + + + + diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css new file mode 100644 index 0000000..5a85941 --- /dev/null +++ b/docs/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 
+nav[data-toggle='toc'] .nav > .active > ul { + display: block; +} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js new file mode 100644 index 0000000..1cdd573 --- /dev/null +++ b/docs/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + 
Toc.init($nav); + }); + }); +})(); diff --git a/docs/docsearch.css b/docs/docsearch.css new file mode 100644 index 0000000..e5f1fe1 --- /dev/null +++ b/docs/docsearch.css @@ -0,0 +1,148 @@ +/* Docsearch -------------------------------------------------------------- */ +/* + Source: https://github.com/algolia/docsearch/ + License: MIT +*/ + +.algolia-autocomplete { + display: block; + -webkit-box-flex: 1; + -ms-flex: 1; + flex: 1 +} + +.algolia-autocomplete .ds-dropdown-menu { + width: 100%; + min-width: none; + max-width: none; + padding: .75rem 0; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, .1); + box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); +} + +@media (min-width:768px) { + .algolia-autocomplete .ds-dropdown-menu { + width: 175% + } +} + +.algolia-autocomplete .ds-dropdown-menu::before { + display: none +} + +.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { + padding: 0; + background-color: rgb(255,255,255); + border: 0; + max-height: 80vh; +} + +.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { + margin-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion { + padding: 0; + overflow: visible +} + +.algolia-autocomplete .algolia-docsearch-suggestion--category-header { + padding: .125rem 1rem; + margin-top: 0; + font-size: 1.3em; + font-weight: 500; + color: #00008B; + border-bottom: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { + float: none; + padding-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { + float: none; + width: auto; + padding: 0; + text-align: left +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content { + float: none; + width: auto; + padding: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content::before { + display: none +} + +.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { + padding-top: .75rem; + margin-top: .75rem; + 
border-top: 1px solid rgba(0, 0, 0, .1) +} + +.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { + display: block; + padding: .1rem 1rem; + margin-bottom: 0.1; + font-size: 1.0em; + font-weight: 400 + /* display: none */ +} + +.algolia-autocomplete .algolia-docsearch-suggestion--title { + display: block; + padding: .25rem 1rem; + margin-bottom: 0; + font-size: 0.9em; + font-weight: 400 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--text { + padding: 0 1rem .5rem; + margin-top: -.25rem; + font-size: 0.8em; + font-weight: 400; + line-height: 1.25 +} + +.algolia-autocomplete .algolia-docsearch-footer { + width: 110px; + height: 20px; + z-index: 3; + margin-top: 10.66667px; + float: right; + font-size: 0; + line-height: 0; +} + +.algolia-autocomplete .algolia-docsearch-footer--logo { + background-image: url("data:image/svg+xml;utf8,"); + background-repeat: no-repeat; + background-position: 50%; + background-size: 100%; + overflow: hidden; + text-indent: -9000px; + width: 100%; + height: 100%; + display: block; + transform: translate(-8px); +} + +.algolia-autocomplete .algolia-docsearch-suggestion--highlight { + color: #FF8C00; + background: rgba(232, 189, 54, 0.1) +} + + +.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { + box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) +} + +.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { + background-color: rgba(192, 192, 192, .15) +} diff --git a/docs/docsearch.js b/docs/docsearch.js new file mode 100644 index 0000000..b35504c --- /dev/null +++ b/docs/docsearch.js @@ -0,0 +1,85 @@ +$(function() { + + // register a handler to move the focus to the search bar + // upon pressing shift + "/" (i.e. 
"?") + $(document).on('keydown', function(e) { + if (e.shiftKey && e.keyCode == 191) { + e.preventDefault(); + $("#search-input").focus(); + } + }); + + $(document).ready(function() { + // do keyword highlighting + /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ + var mark = function() { + + var referrer = document.URL ; + var paramKey = "q" ; + + if (referrer.indexOf("?") !== -1) { + var qs = referrer.substr(referrer.indexOf('?') + 1); + var qs_noanchor = qs.split('#')[0]; + var qsa = qs_noanchor.split('&'); + var keyword = ""; + + for (var i = 0; i < qsa.length; i++) { + var currentParam = qsa[i].split('='); + + if (currentParam.length !== 2) { + continue; + } + + if (currentParam[0] == paramKey) { + keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); + } + } + + if (keyword !== "") { + $(".contents").unmark({ + done: function() { + $(".contents").mark(keyword); + } + }); + } + } + }; + + mark(); + }); +}); + +/* Search term highlighting ------------------------------*/ + +function matchedWords(hit) { + var words = []; + + var hierarchy = hit._highlightResult.hierarchy; + // loop to fetch from lvl0, lvl1, etc. + for (var idx in hierarchy) { + words = words.concat(hierarchy[idx].matchedWords); + } + + var content = hit._highlightResult.content; + if (content) { + words = words.concat(content.matchedWords); + } + + // return unique words + var words_uniq = [...new Set(words)]; + return words_uniq; +} + +function updateHitURL(hit) { + + var words = matchedWords(hit); + var url = ""; + + if (hit.anchor) { + url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; + } else { + url = hit.url + '?q=' + escape(words.join(" ")); + } + + return url; +} diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..896eb76 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,358 @@ + + + + + + + +Client for Dataverse 4 Repositories • dataverse + + + + + + + + + + +
    +
    + + + + +
    +
    +
    + + +

    Dataverse Project logo

    +

    The dataverse package provides access to Dataverse 4 APIs, enabling data search, retrieval, and deposit, thus allowing R users to integrate public data sharing into the reproducible research workflow. dataverse is the next-generation iteration of the dvn package, which works with Dataverse 3 (“Dataverse Network”) applications. dataverse includes numerous improvements for data search, retrieval, and deposit, including use of the (currently in development) sword package for data deposit and the UNF package for data fingerprinting.

    +
    +

    +Getting Started

    +

    You can find a stable 2017 release on CRAN, or install the latest development version from GitHub:

    + +
    +

    +Keys

    +

    Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the Dataverse User Guide. (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called DATAVERSE_KEY. It can be set within R using:

    +
    +Sys.setenv("DATAVERSE_KEY" = "examplekey12345")
    +
    +
    +

    +Server

    +

    Because there are many Dataverse installations, all functions in the R client require specifying what server installation you are interacting with. This can be set by default with an environment variable, DATAVERSE_SERVER. This should be the Dataverse server, without the “https” prefix or the “/api” URL path, etc. For example, the Harvard Dataverse can be used by setting:

    +
    +Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
    +

    Note: The package attempts to compensate for any malformed values, though.

    +

    Currently, the package wraps the data management features of the Dataverse API. Functions for other API features - related to user management and permissions - are not currently exported in the package (but are drafted in the source code).

    +
    +
    +
    +

    +Data and Metadata Retrieval

    +

    The dataverse package provides multiple interfaces to obtain data into R. Users can supply a file DOI, a dataset DOI combined with a filename, or a dataverse object. They can read in the file either as a raw binary or as a dataset parsed with the appropriate R function.

    +
    +

    +Reading data as R objects

    +

    Use the get_dataframe_*() functions, depending on the input you have. For example, we will read a survey dataset on Dataverse, nlsw88.dta (doi:10.70122/FK2/PPIAXE/MHDB0O), originally in Stata dta form.

    +

    With a file DOI, we can use the get_dataframe_by_doi function:

    +
    +nlsw <- 
    +  get_dataframe_by_doi(
    +    filedoi     = "10.70122/FK2/PPIAXE/MHDB0O",
    +    server      = "demo.dataverse.org"
    +  )
    +
    ## Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE.
    +
    +## 
    +## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
    +## cols(
    +##   idcode = col_double(),
    +##   age = col_double(),
    +##   race = col_double(),
    +##   married = col_double(),
    +##   never_married = col_double(),
    +##   grade = col_double(),
    +##   collgrad = col_double(),
    +##   south = col_double(),
    +##   smsa = col_double(),
    +##   c_city = col_double(),
    +##   industry = col_double(),
    +##   occupation = col_double(),
    +##   union = col_double(),
    +##   wage = col_double(),
    +##   hours = col_double(),
    +##   ttl_exp = col_double(),
    +##   tenure = col_double()
    +## )
    +

    which by default reads in the ingested file (not the original dta) by the readr::read_tsv function.

    +

    Alternatively, we can download the same file by specifying the filename and the DOI of the “dataset” (in Dataverse, a collection of files is called a dataset).

    +
    +nlsw_tsv <- 
    +  get_dataframe_by_name(
    +    filename  = "nlsw88.tab",
    +    dataset   = "10.70122/FK2/PPIAXE",
    +    server    = "demo.dataverse.org"
    +  )
    +

    Now, Dataverse often translates rectangular data into an ingested, or “archival” version, which is application-neutral and easily-readable. get_dataframe_*() defaults to taking this ingested version rather than using the original, through the argument original = FALSE.

    +

    This default is safe because you may not have the proprietary software that was originally used. On the other hand, the data may have lost information in the process of ingestion.

    +

    Instead, to read the same file but its original version, specify original = TRUE and set an .f argument. In this case, we know that nlsw88.tab is a Stata .dta dataset, so we will use the haven::read_dta function.

    +
    +nlsw_original <- 
    +  get_dataframe_by_name(
    +    filename    = "nlsw88.tab",
    +    dataset     = "10.70122/FK2/PPIAXE",
    +    .f          = haven::read_dta,
    +    original    = TRUE,
    +    server      = "demo.dataverse.org"
    +  )
    +

    Note that even though the file extension is “.tab”, we use read_dta.

    +

    Of course, when the dataset is not ingested (such as an Rds file), users would always need to specify an .f argument for the specific file.

    +

    Note the difference between nlsw_tsv and nlsw_original. nlsw_original preserves the data attributes like value labels, whereas nlsw_tsv has dropped these or left them in the file metadata.

    +
    +class(nlsw_tsv$race) # tab ingested version only has numeric data
    +
    ## [1] "numeric"
    +
    +attr(nlsw_original$race, "labels") # original dta has value labels
    +
    ## white black other 
    +##     1     2     3
    +
    +
    +

    +Reading a dataset as a binary file.

    +

    In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a .docx file), and you want to simply write these files to your local disk. To do this, use the more primitive get_file_* commands. The arguments are equivalent, except we no longer need an .f argument.

    +
    +nlsw_raw <- 
    +  get_file_by_name(
    +    filename    = "nlsw88.tab",
    +    dataset     = "10.70122/FK2/PPIAXE",
    +    server      = "demo.dataverse.org"
    +  )
    +class(nlsw_raw)
    +
    ## [1] "raw"
    +
    +
    +

    +Reading file metadata

    +

    The function get_file_metadata() can also be used similarly. This will return a metadata format for ingested tabular files in the ddi format. The function get_dataset() will retrieve the list of files in a dataset.

    +
    +get_dataset(
    +  dataset = "10.70122/FK2/PPIAXE",
    +  server  = "demo.dataverse.org"
    +)
    +
    ## Dataset (182162): 
    +## Version: 1.1, RELEASED
    +## Release Date: 2020-12-30T00:00:24Z
    +## License: CC0
    +## 22 Files:
    +##                   label version      id               contentType
    +## 1 nlsw88_rds-export.rds       1 1734016  application/octet-stream
    +## 2            nlsw88.tab       3 1734017 text/tab-separated-values
    +
    +
    +
    +

    +Data Discovery

    +

    Dataverse supplies a robust search API to discover Dataverses, datasets, and files. The simplest searches simply consist of a query string:

    +
    +dataverse_search("Gary King")
    +

    More complicated searches might specify metadata fields:

    +
    +dataverse_search(author = "Gary King", title = "Ecological Inference")
    +

    And searches can be restricted to specific types of objects (Dataverse, dataset, or file):

    +
    +dataverse_search(author = "Gary King", type = "dataset")
    +

    The results are paginated using the per_page argument. To retrieve subsequent pages, specify start.

    +
    +
    +

    +Data Archiving

    +

    Dataverse provides two - basically unrelated - workflows for managing (adding, documenting, and publishing) datasets. The first is built on SWORD v2.0. This means that to create a new dataset listing, you will first have to initialize a dataset entry with some metadata, add one or more files to the dataset, and then publish it. This looks something like the following:

    +
    +# retrieve your service document
    +d <- service_document()
    +
    +# create a list of metadata
    +metadat <- 
    +  list(
    +    title       = "My Study",
    +    creator     = "Doe, John",
    +    description = "An example study"
    +  )
    +
    +# create the dataset
    +ds <- initiate_sword_dataset("mydataverse", body = metadat)
    +
    +# add files to dataset
    +tmp <- tempfile()
    +write.csv(iris, file = tmp)
    +f <- add_file(ds, file = tmp)
    +
    +# publish new dataset
    +publish_sword_dataset(ds)
    +
    +# dataset will now be published
    +list_datasets("mydataverse")
    +

    The second workflow is called the “native” API and is similar but uses slightly different functions:

    +
    +# create the dataset
    +ds <- create_dataset("mydataverse")
    +
    +# add files
    +tmp <- tempfile()
    +write.csv(iris, file = tmp)
    +f <- add_dataset_file(file = tmp, dataset = ds)
    +
    +# publish dataset
    +publish_dataset(ds)
    +
    +# dataset will now be published
    +get_dataverse("mydataverse")
    +

    Through the native API it is possible to update a dataset by modifying its metadata with update_dataset() or file contents using update_dataset_file() and then republish a new version using publish_dataset().

    +
    +
    +

    +Other Installations

    +

    Users interested in downloading metadata from archives other than Dataverse may be interested in Kurt Hornik’s OAIHarvester and Scott Chamberlain’s oai, which offer metadata download from any web repository that is compliant with the Open Archives Initiative standards. Additionally, rdryad uses OAIHarvester to interface with Dryad. The rfigshare package works in a similar spirit to dataverse with https://figshare.com/.

    +
    +
    +
    + + +
    + + +
    + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + diff --git a/docs/link.svg b/docs/link.svg new file mode 100644 index 0000000..88ad827 --- /dev/null +++ b/docs/link.svg @@ -0,0 +1,12 @@ + + + + + + diff --git a/docs/news/index.html b/docs/news/index.html new file mode 100644 index 0000000..5835634 --- /dev/null +++ b/docs/news/index.html @@ -0,0 +1,167 @@ + + + + + + + + +Changelog • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    + + + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/pkgdown.css b/docs/pkgdown.css new file mode 100644 index 0000000..1273238 --- /dev/null +++ b/docs/pkgdown.css @@ -0,0 +1,367 @@ +/* Sticky footer */ + +/** + * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ + * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css + * + * .Site -> body > .container + * .Site-content -> body > .container .row + * .footer -> footer + * + * Key idea seems to be to ensure that .container and __all its parents__ + * have height set to 100% + * + */ + +html, body { + height: 100%; +} + +body { + position: relative; +} + +body > .container { + display: flex; + height: 100%; + flex-direction: column; +} + +body > .container .row { + flex: 1 0 auto; +} + +footer { + margin-top: 45px; + padding: 35px 0 36px; + border-top: 1px solid #e5e5e5; + color: #666; + display: flex; + flex-shrink: 0; +} +footer p { + margin-bottom: 0; +} +footer div { + flex: 1; +} +footer .pkgdown { + text-align: right; +} +footer p { + margin-bottom: 0; +} + +img.icon { + float: right; +} + +img { + max-width: 100%; +} + +/* Fix bug in bootstrap (only seen in firefox) */ +summary { + display: list-item; +} + +/* Typographic tweaking ---------------------------------*/ + +.contents .page-header { + margin-top: calc(-60px + 1em); +} + +dd { + margin-left: 3em; +} + +/* Section anchors ---------------------------------*/ + +a.anchor { + margin-left: -30px; + display:inline-block; + width: 30px; + height: 30px; + visibility: hidden; + + background-image: url(./link.svg); + background-repeat: no-repeat; + background-size: 20px 20px; + background-position: center center; +} + +.hasAnchor:hover a.anchor { + visibility: visible; +} + +@media (max-width: 767px) { + .hasAnchor:hover a.anchor { + visibility: hidden; + } +} + + +/* Fixes for fixed navbar --------------------------*/ + +.contents h1, .contents h2, .contents h3, .contents h4 { + padding-top: 
60px; + margin-top: -40px; +} + +/* Navbar submenu --------------------------*/ + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu>.dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover>.dropdown-menu { + display: block; +} + +.dropdown-submenu>a:after { + display: block; + content: " "; + float: right; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; + border-width: 5px 0 5px 5px; + border-left-color: #cccccc; + margin-top: 5px; + margin-right: -10px; +} + +.dropdown-submenu:hover>a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left>.dropdown-menu { + left: -100%; + margin-left: 10px; + border-radius: 6px 0 6px 6px; +} + +/* Sidebar --------------------------*/ + +#pkgdown-sidebar { + margin-top: 30px; + position: -webkit-sticky; + position: sticky; + top: 70px; +} + +#pkgdown-sidebar h2 { + font-size: 1.5em; + margin-top: 1em; +} + +#pkgdown-sidebar h2:first-child { + margin-top: 0; +} + +#pkgdown-sidebar .list-unstyled li { + margin-bottom: 0.5em; +} + +/* bootstrap-toc tweaks ------------------------------------------------------*/ + +/* All levels of nav */ + +nav[data-toggle='toc'] .nav > li > a { + padding: 4px 20px 4px 6px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; +} + +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 5px; + color: inherit; + border-left: 1px solid #878787; +} + +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 5px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; + border-left: 2px solid #878787; +} + +/* Nav: second level (shown on .active) */ + +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ 
+ padding-bottom: 10px; +} + +nav[data-toggle='toc'] .nav .nav > li > a { + padding-left: 16px; + font-size: 1.35rem; +} + +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 15px; +} + +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 15px; + font-weight: 500; + font-size: 1.35rem; +} + +/* orcid ------------------------------------------------------------------- */ + +.orcid { + font-size: 16px; + color: #A6CE39; + /* margins are required by official ORCID trademark and display guidelines */ + margin-left:4px; + margin-right:4px; + vertical-align: middle; +} + +/* Reference index & topics ----------------------------------------------- */ + +.ref-index th {font-weight: normal;} + +.ref-index td {vertical-align: top; min-width: 100px} +.ref-index .icon {width: 40px;} +.ref-index .alias {width: 40%;} +.ref-index-icons .alias {width: calc(40% - 40px);} +.ref-index .title {width: 60%;} + +.ref-arguments th {text-align: right; padding-right: 10px;} +.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} +.ref-arguments .name {width: 20%;} +.ref-arguments .desc {width: 80%;} + +/* Nice scrolling for wide elements --------------------------------------- */ + +table { + display: block; + overflow: auto; +} + +/* Syntax highlighting ---------------------------------------------------- */ + +pre { + word-wrap: normal; + word-break: normal; + border: 1px solid #eee; +} + +pre, code { + background-color: #f8f8f8; + color: #333; +} + +pre code { + overflow: auto; + word-wrap: normal; + white-space: pre; +} + +pre .img { + margin: 5px 0; +} + +pre .img img { + background-color: #fff; + display: block; + height: auto; +} + +code a, pre a { + color: #375f84; +} + +a.sourceLine:hover { + text-decoration: none; +} + +.fl {color: #1514b5;} +.fu {color: #000000;} /* 
function */ +.ch,.st {color: #036a07;} /* string */ +.kw {color: #264D66;} /* keyword */ +.co {color: #888888;} /* comment */ + +.message { color: black; font-weight: bolder;} +.error { color: orange; font-weight: bolder;} +.warning { color: #6A0366; font-weight: bolder;} + +/* Clipboard --------------------------*/ + +.hasCopyButton { + position: relative; +} + +.btn-copy-ex { + position: absolute; + right: 0; + top: 0; + visibility: hidden; +} + +.hasCopyButton:hover button.btn-copy-ex { + visibility: visible; +} + +/* headroom.js ------------------------ */ + +.headroom { + will-change: transform; + transition: transform 200ms linear; +} +.headroom--pinned { + transform: translateY(0%); +} +.headroom--unpinned { + transform: translateY(-100%); +} + +/* mark.js ----------------------------*/ + +mark { + background-color: rgba(255, 255, 51, 0.5); + border-bottom: 2px solid rgba(255, 153, 51, 0.3); + padding: 1px; +} + +/* vertical spacing after htmlwidgets */ +.html-widget { + margin-bottom: 10px; +} + +/* fontawesome ------------------------ */ + +.fab { + font-family: "Font Awesome 5 Brands" !important; +} + +/* don't display links in code chunks when printing */ +/* source: https://stackoverflow.com/a/10781533 */ +@media print { + code a:link:after, code a:visited:after { + content: ""; + } +} diff --git a/docs/pkgdown.js b/docs/pkgdown.js new file mode 100644 index 0000000..7e7048f --- /dev/null +++ b/docs/pkgdown.js @@ -0,0 +1,108 @@ +/* http://gregfranko.com/blog/jquery-best-practices/ */ +(function($) { + $(function() { + + $('.navbar-fixed-top').headroom(); + + $('body').css('padding-top', $('.navbar').height() + 10); + $(window).resize(function(){ + $('body').css('padding-top', $('.navbar').height() + 10); + }); + + $('[data-toggle="tooltip"]').tooltip(); + + var cur_path = paths(location.pathname); + var links = $("#navbar ul li a"); + var max_length = -1; + var pos = -1; + for (var i = 0; i < links.length; i++) { + if (links[i].getAttribute("href") === 
"#") + continue; + // Ignore external links + if (links[i].host !== location.host) + continue; + + var nav_path = paths(links[i].pathname); + + var length = prefix_length(nav_path, cur_path); + if (length > max_length) { + max_length = length; + pos = i; + } + } + + // Add class to parent
  • , and enclosing
  • if in dropdown + if (pos >= 0) { + var menu_anchor = $(links[pos]); + menu_anchor.parent().addClass("active"); + menu_anchor.closest("li.dropdown").addClass("active"); + } + }); + + function paths(pathname) { + var pieces = pathname.split("/"); + pieces.shift(); // always starts with / + + var end = pieces[pieces.length - 1]; + if (end === "index.html" || end === "") + pieces.pop(); + return(pieces); + } + + // Returns -1 if not found + function prefix_length(needle, haystack) { + if (needle.length > haystack.length) + return(-1); + + // Special case for length-0 haystack, since for loop won't run + if (haystack.length === 0) { + return(needle.length === 0 ? 0 : -1); + } + + for (var i = 0; i < haystack.length; i++) { + if (needle[i] != haystack[i]) + return(i); + } + + return(haystack.length); + } + + /* Clipboard --------------------------*/ + + function changeTooltipMessage(element, msg) { + var tooltipOriginalTitle=element.getAttribute('data-original-title'); + element.setAttribute('data-original-title', msg); + $(element).tooltip('show'); + element.setAttribute('data-original-title', tooltipOriginalTitle); + } + + if(ClipboardJS.isSupported()) { + $(document).ready(function() { + var copyButton = ""; + + $(".examples, div.sourceCode").addClass("hasCopyButton"); + + // Insert copy buttons: + $(copyButton).prependTo(".hasCopyButton"); + + // Initialize tooltips: + $('.btn-copy-ex').tooltip({container: 'body'}); + + // Initialize clipboard: + var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { + text: function(trigger) { + return trigger.parentNode.textContent; + } + }); + + clipboardBtnCopies.on('success', function(e) { + changeTooltipMessage(e.trigger, 'Copied!'); + e.clearSelection(); + }); + + clipboardBtnCopies.on('error', function() { + changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); + }); + }); + } +})(window.jQuery || window.$) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml new file mode 100644 index 
0000000..5eb33f8 --- /dev/null +++ b/docs/pkgdown.yml @@ -0,0 +1,10 @@ +pandoc: 2.9.2.1 +pkgdown: 1.6.1 +pkgdown_sha: ~ +articles: + A-introduction: A-introduction.html + B-search: B-search.html + C-retrieval: C-retrieval.html + D-archiving: D-archiving.html +last_built: 2021-01-03T18:35Z + diff --git a/docs/reference/Rplot001.png b/docs/reference/Rplot001.png new file mode 100644 index 0000000000000000000000000000000000000000..17a358060aed2a86950757bbd25c6f92c08c458f GIT binary patch literal 1011 zcmeAS@N?(olHy`uVBq!ia0y~yV0-|=9Be?5+AI5}0x7m6Z+90U4Fo@(ch>_c&H|6f zVg?3oArNM~bhqvg0|WD9PZ!6KiaBo&GBN^{G%5UFpXcEKVvd5*5Eu=C0SJK)8A6*F U7`aXvEC5;V>FVdQ&MBb@00SN#Z2$lO literal 0 HcmV?d00001 diff --git a/docs/reference/add_dataset_file.html b/docs/reference/add_dataset_file.html new file mode 100644 index 0000000..21b954d --- /dev/null +++ b/docs/reference/add_dataset_file.html @@ -0,0 +1,271 @@ + + + + + + + + +Add or update a file in a dataset — add_dataset_file • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Add or update a file in a dataset

    +
    + +
    add_dataset_file(
    +  file,
    +  dataset,
    +  description = NULL,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    +
    +update_dataset_file(
    +  file,
    +  dataset = NULL,
    +  id,
    +  description = NULL,
    +  force = TRUE,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    file

    A character string

    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    description

    Optionally, a character string providing a description of the file.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    id

    An integer specifying a file identifier; or, if doi is specified, a character string specifying a file name within the DOI-identified dataset; or an object of class “dataverse_file” as returned by dataset_files.

    force

    A logical indicating whether to force the update even if the file types differ. Default is TRUE.

    + +

    Value

    + +

    add_dataset_file returns the new file ID.

    +

    Details

    + +

    From Dataverse v4.6.1, the “native” API provides endpoints to add and update files without going through the SWORD workflow. To use SWORD instead, see add_file. add_dataset_file adds a new file to a specified dataset.

    +

    update_dataset_file can be used to replace/update a published file. Note that it only works on published files, so unpublished drafts cannot be updated - the dataset must first either be published (publish_dataset) or deleted (delete_dataset).

    +

    See also

    + + + +

    Examples

    +
    if (FALSE) { +meta <- list() +ds <- create_dataset("mydataverse", body = meta) + +saveRDS(mtcars, tmp <- tempfile(fileext = ".rds")) +f <- add_dataset_file(tmp, dataset = ds, description = "mtcars") + +# publish dataset +publish_dataset(ds) + +# update file and republish +saveRDS(iris, tmp) +update_dataset_file(tmp, dataset = ds, id = f, + description = "Actually iris") +publish_dataset(ds) + +# cleanup +unlink(tmp) +delete_dataset(ds) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/add_file.html b/docs/reference/add_file.html new file mode 100644 index 0000000..7574142 --- /dev/null +++ b/docs/reference/add_file.html @@ -0,0 +1,248 @@ + + + + + + + + +Add file (SWORD) — add_file • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Add one or more files to a SWORD (possibly unpublished) dataset

    +
    + +
    add_file(
    +  dataset,
    +  file,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + +
    dataset

    A dataset DOI (or other persistent identifier), an object of class “dataset_atom” or “dataset_statement”, or an appropriate and complete SWORD URL.

    file

    A character vector of file names, a data.frame, or a list of R objects.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    An object of class “dataset_atom”.

    +

    Details

    + +

    This function is used to add files to a dataset. It is part of the SWORD API, which is used to upload data to a Dataverse server. This means this can be used to view unpublished Dataverses and Datasets.

    +

    As of Dataverse v4.6.1, the “native” API also provides endpoints to add and update files without going through the SWORD workflow. This functionality is provided by add_dataset_file and update_dataset_file.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    if (FALSE) { +# retrieve your service document +d <- service_document() + +# create a list of metadata +metadat <- list(title = "My Study", + creator = "Doe, John", + description = "An example study") + +# create the dataset +dat <- initiate_sword_dataset("mydataverse", body = metadat) + +# add files to dataset +tmp <- tempfile() +write.csv(iris, file = tmp) +f <- add_file(dat, file = tmp) + +# publish dataset +publish_dataset(dat) + +# delete a dataset +delete_dataset(dat) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/create_dataset.html b/docs/reference/create_dataset.html new file mode 100644 index 0000000..c6eb3de --- /dev/null +++ b/docs/reference/create_dataset.html @@ -0,0 +1,249 @@ + + + + + + + + +Create or update a dataset — create_dataset • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Create or update dataset within a Dataverse

    +
    + +
    create_dataset(
    +  dataverse,
    +  body,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    +
    +update_dataset(
    +  dataset,
    +  body,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”.

    body

    A list describing the dataset.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    + +

    Value

    + +

    An object of class “dataverse_dataset”.

    +

    Details

    + +

    create_dataset creates a Dataverse dataset. In Dataverse, a “dataset” is the lowest-level structure in which to organize files. For example, a Dataverse dataset might contain the files used to reproduce a published article, including data, analysis code, and related materials. Datasets can be organized into “Dataverse” objects, which can be further nested within other Dataverses. For someone creating an archive, this would be the first step to producing said archive (after creating a Dataverse, if one does not already exist). Once files and metadata have been added, the dataset can be published (i.e., made public) using publish_dataset.

    +

    update_dataset updates a Dataverse dataset that has already been created using create_dataset. This creates a draft version of the dataset or modifies the current draft if one is already in-progress. It does not assign a new version number to the dataset nor does it make it publicly visible (which can be done with publish_dataset).

    +

    See also

    + + + +

    Examples

    +
    if (FALSE) { +meta <- list() +ds <- create_dataset("mydataverse", body = meta) + +meta2 <- list() +update_dataset(ds, body = meta2) + +# cleanup +delete_dataset(ds) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/create_dataverse.html b/docs/reference/create_dataverse.html new file mode 100644 index 0000000..64b9211 --- /dev/null +++ b/docs/reference/create_dataverse.html @@ -0,0 +1,226 @@ + + + + + + + + +Create Dataverse — create_dataverse • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Create a new Dataverse

    +
    + +
    create_dataverse(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”. If missing, a top-level Dataverse is created.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    This function can create a new Dataverse. In the language of Dataverse, a user has a “root” Dataverse into which they can create further nested Dataverses and/or “datasets” that contain, for example, a set of files for a specific project. Creating a new Dataverse can therefore be a useful way to organize other related Dataverses or sets of related datasets.

    +

    For example, if one were involved in an ongoing project that generated monthly data, one may want to store each month's data and related files in a separate “dataset”, so that each has its own persistent identifier (e.g., DOI), but keep all of these datasets within a named Dataverse so that the project's files are kept separate from the user's personal Dataverse records. The flexible nesting of Dataverses allows for a number of possible organizational approaches.

    +

    See also

    + +

    To manage Dataverses: delete_dataverse, publish_dataverse, dataverse_contents; to get datasets: get_dataset; to search for Dataverses, datasets, or files: dataverse_search

    + +

    Examples

    +
    if (FALSE) { +(dv <- create_dataverse("mydataverse")) + +# cleanup +delete_dataverse("mydataverse") +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/dataset_atom.html b/docs/reference/dataset_atom.html new file mode 100644 index 0000000..3dc45ea --- /dev/null +++ b/docs/reference/dataset_atom.html @@ -0,0 +1,236 @@ + + + + + + + + +View dataset (SWORD) — dataset_atom • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    View a SWORD (possibly unpublished) dataset “statement”

    +
    + +
    dataset_atom(
    +  dataset,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    +
    +dataset_statement(
    +  dataset,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataset

    A dataset DOI (or other persistent identifier), an object of class “dataset_atom” or “dataset_statement”, or an appropriate and complete SWORD URL.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list. For dataset_atom, an object of class “dataset_atom”.

    +

    Details

    + +

    These functions are used to view a dataset by its persistent identifier. dataset_statement will contain information about the contents of the dataset, whereas dataset_atom contains “metadata” relevant to the SWORD API.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_sword_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    if (FALSE) { +# retrieve your service document +d <- service_document() + +# retrieve dataset statement (list contents) +dataset_statement(d[[2]]) + +# retrieve dataset atom +dataset_atom(d[[2]]) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/dataset_versions.html b/docs/reference/dataset_versions.html new file mode 100644 index 0000000..58892cb --- /dev/null +++ b/docs/reference/dataset_versions.html @@ -0,0 +1,230 @@ + + + + + + + + +Dataset versions — dataset_versions • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    View versions of a dataset

    +
    + +
    dataset_versions(
    +  dataset,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list of class “dataverse_dataset_version”.

    +

    Details

    + +

    This returns a list of objects of all versions of a dataset, including metadata. This can be used as a first step for retrieving older versions of files or datasets.

    +

    See also

    + + + +

    Examples

    +
    if (FALSE) { +# download file from: +# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ARKOTI +monogan <- get_dataverse("monogan") +monogan_data <- dataverse_contents(monogan) +d1 <- get_dataset(monogan_data[[1]]) +dataset_versions(d1) +dataset_files(d1) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/dataverse.html b/docs/reference/dataverse.html new file mode 100644 index 0000000..331144e --- /dev/null +++ b/docs/reference/dataverse.html @@ -0,0 +1,192 @@ + + + + + + + + +Client for Dataverse 4 Repositories — dataverse • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Provides access to Dataverse 4 APIs, enabling data search, retrieval, and deposit.

    +
    + + + +

    Details

    + +

    Dataverse is open-source data repository management software developed by the Institute for Quantitative Social Science at Harvard University. This package provides an R interface to Dataverse version 4 repositories, including the principal Dataverse hosted at Harvard (https://dataverse.harvard.edu/). Users can use the package to search for data stored in a Dataverse repository, retrieve data and other files, and also use the package to directly create and archive their own research data and software.

    +

    A Dataverse is structured as a nested set of “dataverse” repositories, such that a single dataverse can contain “datasets” (a set of code files, data files, etc.) or other dataverses. Thus, users may want to search for dataverses (sets of dataverses and datasets), datasets (sets of files), or individual files, and retrieve those objects accordingly. To retrieve a given file, a user typically needs to know what dataset it is stored in. All datasets are identified by a persistent identifier (such as a DOI or Handle, depending on the age of the dataset and what Dataverse repository it is hosted in).

    +

    This package provides five main sets of functions to interact with Dataverse:

    + + +

    References

    + +

    Dataverse API Documentation

    +

    Dataverse Homepage

    +

    Harvard IQSS Dataverse

    + +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/dataverse_metadata.html b/docs/reference/dataverse_metadata.html new file mode 100644 index 0000000..08d602d --- /dev/null +++ b/docs/reference/dataverse_metadata.html @@ -0,0 +1,226 @@ + + + + + + + + +Dataverse metadata — dataverse_metadata • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Get metadata for a named Dataverse.

    +
    + +
    dataverse_metadata(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list

    +

    Details

    + +

    This function returns a list of metadata for a named Dataverse. Use dataverse_contents to list Dataverses and/or datasets contained within a Dataverse or use dataset_metadata to get metadata for a specific dataset.

    +

    See also

    + + + +

    Examples

    +
    if (FALSE) { +# download file from: +# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ARKOTI +monogan <- get_dataverse("monogan") +monogan_data <- dataverse_contents(monogan) +dataverse_metadata(monogan) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/dataverse_search.html b/docs/reference/dataverse_search.html new file mode 100644 index 0000000..d95684b --- /dev/null +++ b/docs/reference/dataverse_search.html @@ -0,0 +1,282 @@ + + + + + + + + +Search Dataverse server — dataverse_search • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Search for Dataverses and datasets

    +
    + +
    dataverse_search(
    +  ...,
    +  type = c("dataverse", "dataset", "file"),
    +  subtree = NULL,
    +  sort = c("name", "date"),
    +  order = c("asc", "desc"),
    +  per_page = 10,
    +  start = NULL,
    +  show_relevance = FALSE,
    +  show_facets = FALSE,
    +  fq = NULL,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  verbose = TRUE,
    +  http_opts = NULL
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ...

    A length-one character vector specifying a search query, a named character vector of search arguments, or a sequence of named character arguments. The specific fields available may vary by server installation.

    type

    A character vector specifying one or more of “dataverse”, “dataset”, and “file”, which is used to restrict the search results. By default, all three types of objects are searched for.

    subtree

    Currently ignored.

    sort

    A character vector specifying whether to sort results by “name” or “date”.

    order

    A character vector specifying either “asc” or “desc” results order.

    per_page

    An integer specifying the page size of results.

    start

    An integer specifying the starting position in the search results, used for pagination.

    show_relevance

    A logical indicating whether or not to show details of which fields were matched by the query

    show_facets

    A logical indicating whether or not to show facets that can be operated on by the fq parameter

    fq

    See API documentation.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    verbose

    A logical indicating whether to display information about the search query (default is TRUE).

    http_opts

    Currently ignored.

    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    This function provides an interface for searching for Dataverses, datasets, and/or files within a Dataverse server.

    +

    See also

    + + + +

    Examples

    +
    if (FALSE) { +# simple string search +dataverse_search("Gary King") + +# search using named arguments +dataverse_search(c(author = "Gary King", title = "Ecological Inference")) +dataverse_search(author = "Gary King", title = "Ecological Inference") + +# search only for datasets +dataverse_search(author = "Gary King", type = "dataset") +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/delete_dataset.html b/docs/reference/delete_dataset.html new file mode 100644 index 0000000..c542790 --- /dev/null +++ b/docs/reference/delete_dataset.html @@ -0,0 +1,226 @@ + + + + + + + + +Delete draft dataset — delete_dataset • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Delete a dataset draft

    +
    + +
    delete_dataset(
    +  dataset,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A logical.

    +

    Details

    + +

    This function can be used to delete a draft (unpublished) Dataverse dataset. Once published, a dataset cannot be deleted. An existing draft can instead be modified using update_dataset.

    +

    See also

    + + + +

    Examples

    +
    if (FALSE) { +meta <- list() +ds <- create_dataset("mydataverse", body = meta) +delete_dataset(ds) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/delete_dataverse.html b/docs/reference/delete_dataverse.html new file mode 100644 index 0000000..c9f0712 --- /dev/null +++ b/docs/reference/delete_dataverse.html @@ -0,0 +1,223 @@ + + + + + + + + +Delete Dataverse — delete_dataverse • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Delete a dataverse

    +
    + +
    delete_dataverse(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A logical.

    +

    Details

    + +

    This function deletes a Dataverse.

    +

    See also

    + +

    To manage Dataverses: create_dataverse, publish_dataverse, dataverse_contents; to get datasets: get_dataset; to search for Dataverses, datasets, or files: dataverse_search

    + +

    Examples

    +
    if (FALSE) { +dv <- create_dataverse("mydataverse") +delete_dataverse(dv) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/delete_file.html b/docs/reference/delete_file.html new file mode 100644 index 0000000..19d41bf --- /dev/null +++ b/docs/reference/delete_file.html @@ -0,0 +1,219 @@ + + + + + + + + +Delete file (SWORD) — delete_file • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Delete a file from a SWORD (possibly unpublished) dataset

    +
    + +
    delete_file(
    +  id,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    id

    A file ID, possibly returned by add_file, or a complete “edit-media/file” URL.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    If successful, a logical TRUE, else possibly some information.

    +

    Details

    + +

    This function is used to delete a file from a dataset by its file ID. It is part of the SWORD API, which is used to upload data to a Dataverse server.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    
    +  
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/delete_sword_dataset.html b/docs/reference/delete_sword_dataset.html new file mode 100644 index 0000000..5ca44cc --- /dev/null +++ b/docs/reference/delete_sword_dataset.html @@ -0,0 +1,234 @@ + + + + + + + + +Delete dataset (SWORD) — delete_sword_dataset • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Delete a SWORD (possibly unpublished) dataset

    +
    + +
    delete_sword_dataset(
    +  dataset,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataset

    A dataset DOI (or other persistent identifier).

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    If successful, a logical TRUE, else possibly some information.

    +

    Details

    + +

    This function is used to delete a dataset by its persistent identifier. It is part of the SWORD API, which is used to upload data to a Dataverse server.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    if (FALSE) { +# retrieve your service document +d <- service_document() + +# create a list of metadata +metadat <- list(title = "My Study", + creator = "Doe, John", + description = "An example study") + +# create the dataset in first dataverse +dat <- initiate_sword_dataset(d[[2]], body = metadat) + +# delete a dataset +delete_dataset(dat) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/files.html b/docs/reference/files.html new file mode 100644 index 0000000..a65f9bd --- /dev/null +++ b/docs/reference/files.html @@ -0,0 +1,359 @@ + + + + + + + + +Download File — get_file • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Download Dataverse File(s). get_file is a general wrapper, +and can take either dataverse objects, file IDs, or a filename and dataverse. +get_file_by_name is a shorthand for running get_file by +specifying a file name (filename) and dataset (dataset). +get_file_by_doi obtains a file by its file DOI, bypassing the +dataset argument.

    +

    Internally, all functions download each file by get_file_by_id. get_file_* +functions return a raw binary file, which cannot be readily analyzed in R. +To use the objects as dataframes, see the get_dataframe_* functions at get_dataframe_by_name.

    +
    + +
    get_file(
    +  file,
    +  dataset = NULL,
    +  format = c("original", "bundle"),
    +  vars = NULL,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  original = TRUE,
    +  ...
    +)
    +
    +get_file_by_name(
    +  filename,
    +  dataset,
    +  format = c("original", "bundle"),
    +  vars = NULL,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  original = TRUE,
    +  ...
    +)
    +
    +get_file_by_id(
    +  fileid,
    +  dataset = NULL,
    +  format = c("original", "bundle"),
    +  vars = NULL,
    +  original = TRUE,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    +
    +get_file_by_doi(
    +  filedoi,
    +  dataset = NULL,
    +  format = c("original", "bundle"),
    +  vars = NULL,
    +  original = TRUE,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    file

    An integer specifying a file identifier; or a vector of integers +specifying file identifiers; or, if used with the prefix "doi:", a +character with the file-specific DOI; or, if used without the prefix, a +filename accompanied by a dataset DOI in the dataset argument, or an object of +class “dataverse_file” as returned by dataset_files.

    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    format

    A character string specifying a file format for download. +By default, this is “original” (the original file format). If NULL, +no query is added, so ingested files are returned in their ingested TSV form. +For tabular datasets, the option “bundle” downloads the bundle +of the original and archival versions, as well as the documentation. +See https://guides.dataverse.org/en/latest/api/dataaccess.html for details.

    vars

    A character vector specifying one or more variable names, used to +extract a subset of the data.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    original

    A logical, defaulting to TRUE. If an ingested (.tab) version is +available, download the original version instead of the ingested one? If there is +no ingested version, this is set to NA. Note that in get_dataframe_*, +original is set to FALSE by default. Either can be changed.

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    filename

    Filename of the dataset, with file extension as shown in Dataverse +(for example, if nlsw88.dta was the original but is displayed as the ingested +nlsw88.tab, use the ingested version.)

    fileid

    A numeric ID internally used for get_file_by_id

    filedoi

    A DOI for a single file (not the entire dataset), of the form +"10.70122/FK2/PPIAXE/MHDB0O" or "doi:10.70122/FK2/PPIAXE/MHDB0O"

    + +

    Value

    + +

    get_file returns a raw vector (or list of raw vectors, +if length(file) > 1), which can be saved locally with the writeBin +function. To load datasets into the R environment as dataframes, see +get_dataframe_by_name.

    +

    Details

    + +

    This function provides access to data files from a Dataverse entry.

    +

    See also

    + +

    To load the objects as datasets get_dataframe_by_name.

    + +

    Examples

    +
    if (FALSE) { + +# 1. Using filename and dataverse +f1 <- get_file_by_name( + filename = "nlsw88.tab", + dataset = "10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" +) + +# 2. Using file DOI +f2 <- get_file_by_doi( + filedoi = "10.70122/FK2/PPIAXE/MHDB0O", + server = "demo.dataverse.org" +) + +# 3. Two-steps: Find ID from get_dataset +d3 <- get_dataset("doi:10.70122/FK2/PPIAXE", server = "demo.dataverse.org") +f3 <- get_file(d3$files$id[1], server = "demo.dataverse.org") + +# 4. Retrieve multiple raw data in list +f4_vec <- get_dataset( + "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" +)$files$id + +f4 <- get_file(f4_vec, server = "demo.dataverse.org") +length(f4) + +# Write binary files +# (see `get_dataframe_by_name` to load in environment) +# The appropriate file extension needs to be assigned by the user. +writeBin(f1, "nlsw88.dta") +writeBin(f2, "nlsw88.dta") + +writeBin(f4[[1]], "nlsw88.rds") # originally a rds file +writeBin(f4[[2]], "nlsw88.dta") # originally a dta file +} + +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/get_dataframe.html b/docs/reference/get_dataframe.html new file mode 100644 index 0000000..262cd1a --- /dev/null +++ b/docs/reference/get_dataframe.html @@ -0,0 +1,348 @@ + + + + + + + + +Get file from dataverse and convert it into a dataframe or tibble — get_dataframe_by_name • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Use get_dataframe_by_id if you know the numeric ID of the dataset, or instead +get_dataframe_by_name if you know the filename and DOI. The dataset is returned as a dataframe or tibble.

    +
    + +
    get_dataframe_by_name(
    +  filename,
    +  dataset = NULL,
    +  .f = NULL,
    +  original = FALSE,
    +  ...
    +)
    +
    +get_dataframe_by_id(fileid, .f = NULL, original = FALSE, ...)
    +
    +get_dataframe_by_doi(filedoi, .f = NULL, original = FALSE, ...)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    filename

    The name of the file of interest, with file extension, for example +"roster-bulls-1996.tab".

    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    .f

    The function to use for reading in the raw dataset. The user +must choose the appropriate function: for example, if the target is a .rds +file, then .f should be readRDS or readr::read_rds.

    original

    A logical, defaulting to FALSE. Whether to read the original version of the file instead of the ingested, +archival version of the dataset if one exists. The archival versions are tab-delimited +.tab files so if original = FALSE, .f is set to readr::read_tsv. +If a function to read the original version is available, then original = TRUE +with a specified .f is better.

    ...

    Arguments passed on to get_file

    +
    file

    An integer specifying a file identifier; or a vector of integers +specifying file identifiers; or, if used with the prefix "doi:", a +character with the file-specific DOI; or, if used without the prefix, a +filename accompanied by a dataset DOI in the dataset argument, or an object of +class “dataverse_file” as returned by dataset_files.

    +
    format

    A character string specifying a file format for download. +By default, this is “original” (the original file format). If NULL, +no query is added, so ingested files are returned in their ingested TSV form. +For tabular datasets, the option “bundle” downloads the bundle +of the original and archival versions, as well as the documentation. +See https://guides.dataverse.org/en/latest/api/dataaccess.html for details.

    +
    vars

    A character vector specifying one or more variable names, used to +extract a subset of the data.

    +
    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    +
    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    + +
    fileid

    A numeric ID internally used for get_file_by_id

    filedoi

    A DOI for a single file (not the entire dataset), of the form +"10.70122/FK2/PPIAXE/MHDB0O" or "doi:10.70122/FK2/PPIAXE/MHDB0O"

    + + +

    Examples

    +
    +# Retrieve data.frame from dataverse DOI and file name +df_from_rds_ingested <- + get_dataframe_by_name( + filename = "roster-bulls-1996.tab", + dataset = "doi:10.70122/FK2/HXJVJU", + server = "demo.dataverse.org" + ) +
    #> Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE.
    #> +#> ── Column specification ──────────────────────────────────────────────────────── +#> cols( +#> number = col_double(), +#> player = col_character(), +#> position = col_character(), +#> height = col_character(), +#> weight = col_double(), +#> dob = col_character(), +#> country_birth = col_character(), +#> experience_years = col_double(), +#> college = col_character() +#> )
    +# Retrieve the same data.frame from dataverse + file DOI +df_from_rds_ingested_by_doi <- + get_dataframe_by_doi( + filedoi = "10.70122/FK2/HXJVJU/SA3Z2V", + server = "demo.dataverse.org" + ) +
    #> Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE.
    #> +#> ── Column specification ──────────────────────────────────────────────────────── +#> cols( +#> number = col_double(), +#> player = col_character(), +#> position = col_character(), +#> height = col_character(), +#> weight = col_double(), +#> dob = col_character(), +#> country_birth = col_character(), +#> experience_years = col_double(), +#> college = col_character() +#> )
    +# Retrieve ingested file originally a Stata dta +df_from_stata_ingested <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org" + ) +
    #> Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE.
    #> +#> ── Column specification ──────────────────────────────────────────────────────── +#> cols( +#> idcode = col_double(), +#> age = col_double(), +#> race = col_double(), +#> married = col_double(), +#> never_married = col_double(), +#> grade = col_double(), +#> collgrad = col_double(), +#> south = col_double(), +#> smsa = col_double(), +#> c_city = col_double(), +#> industry = col_double(), +#> occupation = col_double(), +#> union = col_double(), +#> wage = col_double(), +#> hours = col_double(), +#> ttl_exp = col_double(), +#> tenure = col_double() +#> )
    + +# To use the original file version, or for non-ingested data, +# please specify `original = TRUE` and specify a function in .f. + +# A data.frame is still returned, but the +if (requireNamespace("readr", quietly = TRUE)) { + df_from_rds_original <- + get_dataframe_by_name( + filename = "nlsw88_rds-export.rds", + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org", + original = TRUE, + .f = readr::read_rds + ) +} + +if (requireNamespace("haven", quietly = TRUE)) { + df_from_stata_original <- + get_dataframe_by_name( + filename = "nlsw88.tab", + dataset = "doi:10.70122/FK2/PPIAXE", + server = "demo.dataverse.org", + original = TRUE, + .f = haven::read_dta + ) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/get_dataframe_internal.html b/docs/reference/get_dataframe_internal.html new file mode 100644 index 0000000..90fa686 --- /dev/null +++ b/docs/reference/get_dataframe_internal.html @@ -0,0 +1,175 @@ + + + + + + + + +Write to temp and apply function — get_dataframe_internal • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Write to temp and apply function

    +
    + +
    get_dataframe_internal(raw, filename, .f)
    + + + +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/get_dataset.html b/docs/reference/get_dataset.html new file mode 100644 index 0000000..dc3039b --- /dev/null +++ b/docs/reference/get_dataset.html @@ -0,0 +1,278 @@ + + + + + + + + +Get dataset — get_dataset • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Retrieve a Dataverse dataset or its metadata

    +
    + +
    get_dataset(
    +  dataset,
    +  version = ":latest",
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    +
    +dataset_metadata(
    +  dataset,
    +  version = ":latest",
    +  block = "citation",
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    +
    +dataset_files(
    +  dataset,
    +  version = ":latest",
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    version

    A character string specifying a version of the dataset. This can be one of “:draft” (the current draft), “:latest” (the latest draft, if it exists, or the latest published version), “:latest-published” (the latest published version, ignoring any draft), or “x.y” (where x is a major version and y is a minor version; the .y can be omitted to obtain a major version). In lieu of this, a dataset's version-specific identification number can be used for the dataset argument.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    block

    A character string specifying a metadata block to retrieve. By default this is “citation”. Other values may be available, depending on the dataset, such as “geospatial” or “socialscience”.

    + +

    Value

    + +

    A list of class “dataverse_dataset” or a list of a form dependent on the specific metadata block retrieved. dataset_files returns a list of objects of class “dataverse_file”.

    +

    Details

    + +

    get_dataset retrieves details about a Dataverse dataset.

    +

    dataset_metadata returns a named metadata block for a dataset. +This is already returned by get_dataset, but this function allows +you to retrieve just a specific block of metadata, such as citation information.

    +

    dataset_files returns a list of files in a dataset, similar to +get_dataset. The difference is that this returns only a list of +“dataverse_file” objects, whereas get_dataset returns +metadata and a data.frame of files (rather than a list of file objects).

    +

    See also

    + +

    create_dataset, update_dataset, delete_dataset, publish_dataset, dataset_files, dataset_metadata

    + +

    Examples

    +
    if (FALSE) { +Sys.setenv("DATAVERSE_SERVER" = "demo.dataverse.org") +Sys.setenv("DATAVERSE_KEY" = "c7208dd2-6ec5-469a-bec5-f57e164888d4") + +# Download file from: https://demo.dataverse.org/file.xhtml?fileId=769385 +dv <- get_dataverse("dataverse-client-r") +contents <- dataverse_contents(dv) + +dataset_files(contents[[1]]) # Dataset contains 2 files +dataset_metadata(contents[[1]]) # Easier to query later + +set <- get_dataset(contents[[1]]) # 1st dataset w/n dataverse +f <- get_file(set$files$id[2]) # 2nd file w/n dataset + +# Check the *binary* representation of the file. +length(f) +head(f) + +# Examine the plain-text representation. +tmp <- tempfile(fileext = "svg") +writeBin(as.vector(f), tmp) +svg_lines <- readLines(tmp) +head(svg_lines) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/get_dataverse.html b/docs/reference/get_dataverse.html new file mode 100644 index 0000000..07ee595 --- /dev/null +++ b/docs/reference/get_dataverse.html @@ -0,0 +1,254 @@ + + + + + + + + +Get Dataverse — get_dataverse • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Retrieve details of a Dataverse

    +
    + +
    get_dataverse(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  check = TRUE,
    +  ...
    +)
    +
    +dataverse_contents(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + +
    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    check

    A logical indicating whether to check that the value of dataverse is actually a numeric

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list of class “dataverse”.

    +

    Details

    + +

    get_dataverse function retrieves basic information about a Dataverse from a Dataverse server. To see the contents of the Dataverse, use dataverse_contents instead. Contents might include one or more “datasets” and/or further Dataverses that themselves contain Dataverses and/or datasets. To view the file contents of a single Dataset, use get_dataset.

    +

    See also

    + +

    To manage Dataverses: +create_dataverse, +delete_dataverse, +publish_dataverse, +dataverse_contents;

    +

    To get datasets: +get_dataset;

    +

    To search for Dataverses, datasets, or files: +dataverse_search

    + +

    Examples

    +
    if (FALSE) { +# view the root dataverse for a server +get_dataverse(":root") +dataverse_contents(":root") + +Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") +# download file from: +# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ARKOTI +dv <- get_dataverse("monogan") +(contents <- dataverse_contents(dv)) + +# get a dataset from the dataverse +d1 <- get_dataset(contents[[1]]) +f <- get_file(d1$files$id[3]) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/get_facets.html b/docs/reference/get_facets.html new file mode 100644 index 0000000..9e10695 --- /dev/null +++ b/docs/reference/get_facets.html @@ -0,0 +1,228 @@ + + + + + + + + +Get Dataverse facets — get_facets • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Dataverse metadata facets

    +
    + +
    get_facets(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    Retrieve a list of Dataverse metadata facets.

    +

    See also

    + +

    To manage Dataverses: create_dataverse, delete_dataverse, publish_dataverse, dataverse_contents; to get datasets: get_dataset; to search for Dataverses, datasets, or files: dataverse_search

    + +

    Examples

    +
    if (FALSE) { +# download file from: +# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ARKOTI +monogan <- get_dataverse("monogan") +(monogan_data <- dataverse_contents(monogan)) + +# get facets +get_facets(monogan) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/get_file_metadata.html b/docs/reference/get_file_metadata.html new file mode 100644 index 0000000..f145eee --- /dev/null +++ b/docs/reference/get_file_metadata.html @@ -0,0 +1,228 @@ + + + + + + + + +Retrieve a ddi metadata file — get_file_metadata • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Retrieve a ddi metadata file

    +
    + +
    get_file_metadata(
    +  file,
    +  dataset = NULL,
    +  format = c("ddi", "preprocessed"),
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    file

    An integer specifying a file identifier; or a vector of integers +specifying file identifiers; or, if used with the prefix "doi:", a +character with the file-specific DOI; or, if used without the prefix, a +filename accompanied by a dataset DOI in the dataset argument, or an object of +class “dataverse_file” as returned by dataset_files.

    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    format

    Defaults to “ddi” for metadata files

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A character vector containing a DDI +metadata file.

    + +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/get_user_key.html b/docs/reference/get_user_key.html new file mode 100644 index 0000000..97a426b --- /dev/null +++ b/docs/reference/get_user_key.html @@ -0,0 +1,208 @@ + + + + + + + + +Get API Key — get_user_key • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Get a user's API key

    +
    + +
    get_user_key(user, password, server = Sys.getenv("DATAVERSE_SERVER"), ...)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    user

    A character vector specifying a Dataverse server username.

    password

    A character vector specifying the password for this user.

    server

    A character string specifying a Dataverse server. There are multiple Dataverse installations, but the default is to use the Harvard Dataverse. This can be modified atomically or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    Use a Dataverse server's username and password login to obtain an API key for the user. This can be used if one does not yet have an API key, or desires to reset the key. This function does not require an API key argument to authenticate, but server must still be specified.

    + +

    Examples

    +
    if (FALSE) { +get_user_key("username", "password") +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/index.html b/docs/reference/index.html new file mode 100644 index 0000000..695a1c9 --- /dev/null +++ b/docs/reference/index.html @@ -0,0 +1,356 @@ + + + + + + + + +Function reference • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    All functions

    +

    +
    +

    add_dataset_file() update_dataset_file()

    +

    Add or update a file in a dataset

    +

    add_file()

    +

    Add file (SWORD)

    +

    create_dataset() update_dataset()

    +

    Create or update a dataset

    +

    create_dataverse()

    +

    Create Dataverse

    +

    dataset_atom() dataset_statement()

    +

    View dataset (SWORD)

    +

    dataset_versions()

    +

    Dataset versions

    +

    dataverse

    +

    Client for Dataverse 4 Repositories

    +

    dataverse_metadata()

    +

    Dataverse metadata

    +

    dataverse_search()

    +

    Search Dataverse server

    +

    delete_dataset()

    +

    Delete draft dataset

    +

    delete_dataverse()

    +

    Delete Dataverse

    +

    delete_file()

    +

    Delete file (SWORD)

    +

    delete_sword_dataset()

    +

    Delete dataset (SWORD)

    +

    get_file() get_file_by_name() get_file_by_id() get_file_by_doi()

    +

    Download File

    +

    get_dataframe_by_name() get_dataframe_by_id() get_dataframe_by_doi()

    +

    Get file from dataverse and convert it into a dataframe or tibble

    +

    get_dataset() dataset_metadata() dataset_files()

    +

    Get dataset

    +

    get_dataverse() dataverse_contents()

    +

    Get Dataverse

    +

    get_facets()

    +

    Get Dataverse facets

    +

    get_file_metadata()

    +

    Retrieve a ddi metadata file

    +

    get_user_key()

    +

    Get API Key

    +

    initiate_sword_dataset()

    +

    Initiate dataset (SWORD)

    +

    is_ingested()

    +

    Identify if file is an ingested file

    +

    list_datasets()

    +

    List datasets (SWORD)

    +

    publish_dataset()

    +

    Publish dataset

    +

    publish_dataverse()

    +

    Publish Dataverse (SWORD)

    +

    publish_sword_dataset()

    +

    Publish dataset (SWORD)

    +

    service_document()

    +

    SWORD Service Document

    +

    set_dataverse_metadata()

    +

    Set Dataverse metadata

    +
    + + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/initiate_sword_dataset.html b/docs/reference/initiate_sword_dataset.html new file mode 100644 index 0000000..8fed0e1 --- /dev/null +++ b/docs/reference/initiate_sword_dataset.html @@ -0,0 +1,266 @@ + + + + + + + + +Initiate dataset (SWORD) — initiate_sword_dataset • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Initiate a SWORD (possibly unpublished) dataset

    +
    + +
    initiate_sword_dataset(
    +  dataverse,
    +  body,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + +
    dataverse

    A Dataverse alias or ID number, or an object of class “dataverse”, perhaps as returned by service_document.

    body

    A list containing one or more metadata fields. Field names must be valid Dublin Core Terms labels (see details, below). The title, description, and creator fields are required.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    An object of class “dataset_atom”.

    +

    Details

    + +

    This function is used to initiate a dataset in a (SWORD) Dataverse by supplying relevant metadata. The function is part of the SWORD API (see Atom entry specification), which is used to upload data to a Dataverse server. +Allowed fields are: +“abstract”, “accessRights”, “accrualMethod”, +“accrualPeriodicity”, “accrualPolicy”, “alternative”, +“audience”, “available”, “bibliographicCitation”, +“conformsTo”, “contributor”, “coverage”, “created”, +“creator”, “date”, “dateAccepted”, “dateCopyrighted”, +“dateSubmitted”, “description”, “educationLevel”, “extent”, +“format”, “hasFormat”, “hasPart”, “hasVersion”, +“identifier”, “instructionalMethod”, “isFormatOf”, +“isPartOf”, “isReferencedBy”, “isReplacedBy”, “isRequiredBy”, +“issued”, “isVersionOf”, “language”, “license”, +“mediator”, “medium”, “modified”, “provenance”, +“publisher”, “references”, “relation”, “replaces”, +“requires”, “rights”, “rightsHolder”, “source”, +“spatial”, “subject”, “tableOfContents”, “temporal”, +“title”, “type”, and “valid”.

    +

    Note

    + +

    There are two ways to create a dataset: the native API (create_dataset) and the SWORD API (initiate_sword_dataset).

    +

    References

    + +

    Dublin Core Metadata Terms

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_sword_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    if (FALSE) { +# retrieve your service document (dataverse list) +d <- service_document() + +# create a list of metadata +metadat <- list(title = "My Study", + creator = "Doe, John", + description = "An example study") + +# create the dataset in first dataverse +dat <- initiate_sword_dataset(d[[2]], body = metadat) + +# add files to dataset +tmp <- tempfile(fileext = ".csv") +write.csv(iris, file = tmp) +add_file(dat, file = tmp) + +# publish dataset +publish_dataset(dat) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/is_ingested.html b/docs/reference/is_ingested.html new file mode 100644 index 0000000..38b91ec --- /dev/null +++ b/docs/reference/is_ingested.html @@ -0,0 +1,201 @@ + + + + + + + + +Identify if file is an ingested file — is_ingested • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Identify if file is an ingested file

    +
    + +
    is_ingested(
    +  fileid,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER")
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + +
    fileid

    A numeric fileid or file-specific DOI

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    + + +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/list_datasets.html b/docs/reference/list_datasets.html new file mode 100644 index 0000000..2508abd --- /dev/null +++ b/docs/reference/list_datasets.html @@ -0,0 +1,225 @@ + + + + + + + + +List datasets (SWORD) — list_datasets • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    List datasets in a SWORD (possibly unpublished) Dataverse

    +
    + +
    list_datasets(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataverse

    A Dataverse alias or ID number, or an object of class “dataverse”, perhaps as returned by service_document.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    This function is used to list datasets in a given Dataverse. It is part of the SWORD API, which is used to upload data to a Dataverse server. This means this can be used to view unpublished Dataverses and Datasets.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    if (FALSE) { +Sys.setenv("DATAVERSE_SERVER" = "demo.dataverse.org") +Sys.setenv("DATAVERSE_KEY" = "c7208dd2-6ec5-469a-bec5-f57e164888d4") +dv <- get_dataverse("dataverse-client-r") +list_datasets(dv) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/publish_dataset.html b/docs/reference/publish_dataset.html new file mode 100644 index 0000000..1de0a06 --- /dev/null +++ b/docs/reference/publish_dataset.html @@ -0,0 +1,232 @@ + + + + + + + + +Publish dataset — publish_dataset • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Publish/release Dataverse dataset

    +
    + +
    publish_dataset(
    +  dataset,
    +  minor = TRUE,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + +
    dataset

    A character specifying a persistent identification ID for a dataset, +for example "doi:10.70122/FK2/HXJVJU". Alternatively, an object of class +“dataverse_dataset” obtained by dataverse_contents().

    minor

    A logical specifying whether the new release of the dataset is a “minor” release (TRUE, by default), resulting in a minor version increase (e.g., from 1.1 to 1.2). If FALSE, the dataset is given a “major” release (e.g., from 1.1 to 2.0).

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    Use this function to “publish” (i.e., publicly release) a draft Dataverse dataset. This creates a publicly visible listing of the dataset, accessible by its DOI, with a numbered version. This action cannot be undone. +There are no requirements for what constitutes a major or minor release, but a minor release might be used to update metadata (e.g., a new linked publication) or to add supplemental files. A major release is best used to reflect a substantial change to the dataset, such as would require a published erratum or a substantial change to data or code.

    +

    See also

    + + + +

    Examples

    +
    if (FALSE) { +meta <- list() +ds <- create_dataset("mydataverse", body = meta) +publish_dataset(ds) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/publish_dataverse.html b/docs/reference/publish_dataverse.html new file mode 100644 index 0000000..8ac7b93 --- /dev/null +++ b/docs/reference/publish_dataverse.html @@ -0,0 +1,217 @@ + + + + + + + + +Publish Dataverse (SWORD) — publish_dataverse • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Publish/re-publish a Dataverse via SWORD

    +
    + +
    publish_dataverse(
    +  dataverse,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataverse

    An object of class “sword_collection”, as returned by service_document.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    This function is used to publish a (possibly already published) Dataverse. It is part of the SWORD API, which is used to upload data to a Dataverse server.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/publish_sword_dataset.html b/docs/reference/publish_sword_dataset.html new file mode 100644 index 0000000..93f0188 --- /dev/null +++ b/docs/reference/publish_sword_dataset.html @@ -0,0 +1,237 @@ + + + + + + + + +Publish dataset (SWORD) — publish_sword_dataset • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Publish a SWORD (possibly unpublished) dataset

    +
    + +
    publish_sword_dataset(
    +  dataset,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + +
    dataset

    A dataset DOI (or other persistent identifier), an object of class “dataset_atom” or “dataset_statement”, or an appropriate and complete SWORD URL.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list.

    +

    Details

    + +

    This function is used to publish a dataset by its persistent identifier. This cannot be undone. The function is part of the SWORD API, which is used to upload data to a Dataverse server.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_sword_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    if (FALSE) { +# retrieve your service document +d <- service_document() + +# create a list of metadata +metadat <- list(title = "My Study", + creator = "Doe, John", + description = "An example study") + +# create the dataset in first dataverse +dat <- initiate_sword_dataset(d[[2]], body = metadat) + +# publish dataset +publish_sword_dataset(dat) + +# delete a dataset +delete_dataset(dat) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/service_document.html b/docs/reference/service_document.html new file mode 100644 index 0000000..a7c7d39 --- /dev/null +++ b/docs/reference/service_document.html @@ -0,0 +1,221 @@ + + + + + + + + +SWORD Service Document — service_document • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Obtain a SWORD service document.

    +
    + +
    service_document(
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + +
    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list of class “sword_service_document”, possibly with one or more “sword_collection” entries. The latter are SWORD representations of a Dataverse. These can be passed to other SWORD API functions, e.g., for creating a new dataset.

    +

    Details

    + +

    This function can be used to check authentication against the Dataverse SWORD server. It is typically a first step when creating a new Dataverse, a new Dataset, or modifying an existing Dataverse or Dataset.

    +

    See also

    + +

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    + +

    Examples

    +
    if (FALSE) { +# retrieve your service document +d <- service_document() + +# list available datasets in first dataverse +list_datasets(d[[2]]) +} +
    +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/reference/set_dataverse_metadata.html b/docs/reference/set_dataverse_metadata.html new file mode 100644 index 0000000..9af611c --- /dev/null +++ b/docs/reference/set_dataverse_metadata.html @@ -0,0 +1,227 @@ + + + + + + + + +Set Dataverse metadata — set_dataverse_metadata • dataverse + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + + +
    + +
    +
    + + +
    +

    Set Dataverse metadata

    +
    + +
    set_dataverse_metadata(
    +  dataverse,
    +  body,
    +  root = TRUE,
    +  key = Sys.getenv("DATAVERSE_KEY"),
    +  server = Sys.getenv("DATAVERSE_SERVER"),
    +  ...
    +)
    + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    dataverse

    A character string specifying a Dataverse name or an object of class “dataverse”.

    body

    A list.

    root

    A logical.

    key

    A character string specifying a Dataverse server API key. If one +is not specified, functions calling authenticated API endpoints will fail. +Keys can be specified atomically or globally using +Sys.setenv("DATAVERSE_KEY" = "examplekey").

    server

    A character string specifying a Dataverse server. There are +multiple Dataverse installations, but the default is to use the Harvard +Dataverse (server = "dataverse.harvard.edu"). This can be modified atomically +or globally using Sys.setenv("DATAVERSE_SERVER" = "dataverse.example.com").

    ...

    Additional arguments passed to an HTTP request function, such as +GET, POST, or +DELETE.

    + +

    Value

    + +

    A list

    +

    Details

    + +

    This function sets the value of metadata fields for a Dataverse. Use update_dataset to set the metadata fields for a dataset instead.

    +

    See also

    + + + +
    + +
    + + +
    + + +
    +

    Site built with pkgdown 1.6.1.

    +
    + +
    +
    + + + + + + + + From 9eb7234db8fe9b1f1da6e15e87aabd4dc778b117 Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Sun, 3 Jan 2021 16:06:56 -0600 Subject: [PATCH 73/75] customize pkgdown site ref #72 --- .Rbuildignore | 3 + .gitignore | 1 + _pkgdown.yml | 72 ++++++++++ docs/404.html | 12 +- docs/ISSUE_TEMPLATE.html | 2 +- docs/PULL_REQUEST_TEMPLATE.html | 2 +- docs/articles/A-introduction.html | 2 +- docs/articles/B-search.html | 2 +- docs/articles/C-retrieval.html | 2 +- docs/articles/D-archiving.html | 2 +- docs/articles/index.html | 6 +- docs/authors.html | 2 +- docs/index.html | 2 +- docs/news/index.html | 2 +- docs/pkgdown.yml | 5 +- docs/reference/add_dataset_file.html | 2 +- docs/reference/add_file.html | 2 +- docs/reference/create_dataset.html | 2 +- docs/reference/create_dataverse.html | 2 +- docs/reference/dataset_atom.html | 2 +- docs/reference/dataset_versions.html | 2 +- docs/reference/dataverse.html | 2 +- docs/reference/dataverse_metadata.html | 2 +- docs/reference/dataverse_search.html | 2 +- docs/reference/delete_dataset.html | 2 +- docs/reference/delete_dataverse.html | 2 +- docs/reference/delete_file.html | 2 +- docs/reference/delete_sword_dataset.html | 2 +- docs/reference/files.html | 2 +- docs/reference/get_dataframe.html | 2 +- docs/reference/get_dataframe_internal.html | 2 +- docs/reference/get_dataset.html | 2 +- docs/reference/get_dataverse.html | 2 +- docs/reference/get_facets.html | 2 +- docs/reference/get_file_metadata.html | 2 +- docs/reference/get_user_key.html | 2 +- docs/reference/index.html | 153 +++++++++++++-------- docs/reference/initiate_sword_dataset.html | 2 +- docs/reference/is_ingested.html | 2 +- docs/reference/list_datasets.html | 2 +- docs/reference/publish_dataset.html | 2 +- docs/reference/publish_dataverse.html | 2 +- docs/reference/publish_sword_dataset.html | 2 +- docs/reference/service_document.html | 2 +- docs/reference/set_dataverse_metadata.html | 2 +- 45 files changed, 223 insertions(+), 105 deletions(-) 
create mode 100644 _pkgdown.yml diff --git a/.Rbuildignore b/.Rbuildignore index f45c63b..87bee57 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,3 +16,6 @@ man-roxygen/* ^codecov\.yml$ ^.*\.Rproj$ ^\.Rproj\.user$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ diff --git a/.gitignore b/.gitignore index eedcabb..9326638 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ README.html doc Meta .Rproj.user +docs diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..9695ab4 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,72 @@ +url: https://IQSS.github.io/dataverse-client-r + +template: + params: + bootswatch: sandstone + # docsearch: + # api_key: eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee + # index_name: dataverse-client-r + +home: + links: +# - text: Ask a question +# href: http://discourse.mc-stan.org/ + +development: + mode: release + +navbar: + title: "Dataverse R Client" + # type: inverse + +articles: +- title: "Getting Started" + navbar: ~ + desc: > + These vignettes provide an introduction to Dataverse with R. 
+ contents: + - 'A-introduction' + - 'B-search' + - 'C-retrieval' + - 'D-archiving' + +reference: +- title: "Retrieve" + contents: + - '`get_file`' + - '`get_dataframe_by_name`' + - '`get_dataset`' + - '`get_dataverse`' + - '`get_facets`' + - '`get_file_metadata`' + - '`get_user_key`' + +- title: "Create, Add, & Publish" + contents: + - '`create_dataset`' + - '`create_dataverse`' + - '`add_dataset_file`' + - '`add_file`' + - '`publish_dataset`' + - '`publish_dataverse`' + - '`publish_sword_dataset`' + +- title: "Delete" + contents: + - '`delete_dataset`' + - '`delete_dataverse`' + - '`delete_file`' + - '`delete_sword_dataset`' + +- title: Other + contents: + - '`dataset_atom`' + - '`dataset_versions`' + - '`dataverse`' + - '`dataverse_metadata`' + - '`dataverse_search`' + - '`initiate_sword_dataset`' + - '`is_ingested`' + - '`list_datasets`' + - '`service_document`' + - '`set_dataverse_metadata`' diff --git a/docs/404.html b/docs/404.html index f33a2e6..c20f9e2 100644 --- a/docs/404.html +++ b/docs/404.html @@ -12,14 +12,14 @@ + - - - + + @@ -33,8 +33,8 @@ - - + + @@ -70,7 +70,7 @@ - dataverse + dataverse 0.2.1.9002 diff --git a/docs/ISSUE_TEMPLATE.html b/docs/ISSUE_TEMPLATE.html index b595c17..c52c35a 100644 --- a/docs/ISSUE_TEMPLATE.html +++ b/docs/ISSUE_TEMPLATE.html @@ -12,8 +12,8 @@ + - diff --git a/docs/PULL_REQUEST_TEMPLATE.html b/docs/PULL_REQUEST_TEMPLATE.html index 0d3a629..6de5746 100644 --- a/docs/PULL_REQUEST_TEMPLATE.html +++ b/docs/PULL_REQUEST_TEMPLATE.html @@ -12,8 +12,8 @@ + - diff --git a/docs/articles/A-introduction.html b/docs/articles/A-introduction.html index 385d033..dfe717f 100644 --- a/docs/articles/A-introduction.html +++ b/docs/articles/A-introduction.html @@ -6,7 +6,7 @@ Introduction to Dataverse • dataverse - + diff --git a/docs/articles/B-search.html b/docs/articles/B-search.html index 0183446..dff2407 100644 --- a/docs/articles/B-search.html +++ b/docs/articles/B-search.html @@ -6,7 +6,7 @@ Data Search and Discovery • dataverse - + 
diff --git a/docs/articles/C-retrieval.html b/docs/articles/C-retrieval.html index a5335ca..bc55799 100644 --- a/docs/articles/C-retrieval.html +++ b/docs/articles/C-retrieval.html @@ -6,7 +6,7 @@ Data Retrieval and Reuse • dataverse - + diff --git a/docs/articles/D-archiving.html b/docs/articles/D-archiving.html index dbafd7f..4316f7c 100644 --- a/docs/articles/D-archiving.html +++ b/docs/articles/D-archiving.html @@ -6,7 +6,7 @@ Data Archiving • dataverse - + diff --git a/docs/articles/index.html b/docs/articles/index.html index 87b696c..682428b 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -12,8 +12,8 @@ + - @@ -135,8 +135,8 @@

    Articles

    -

    All vignettes

    -

    +

    Getting Started

    +

    These vignettes provide an introduction to Dataverse with R.

    Introduction to Dataverse
    diff --git a/docs/authors.html b/docs/authors.html index dd23d48..9ccabe4 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -12,8 +12,8 @@ + - diff --git a/docs/index.html b/docs/index.html index 896eb76..3306641 100644 --- a/docs/index.html +++ b/docs/index.html @@ -6,7 +6,7 @@ Client for Dataverse 4 Repositories • dataverse - + diff --git a/docs/news/index.html b/docs/news/index.html index 5835634..d9988b7 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -12,8 +12,8 @@ + - diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 5eb33f8..46f2d65 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -6,5 +6,8 @@ articles: B-search: B-search.html C-retrieval: C-retrieval.html D-archiving: D-archiving.html -last_built: 2021-01-03T18:35Z +last_built: 2021-01-03T22:03Z +urls: + reference: https://IQSS.github.io/dataverse-client-r/reference + article: https://IQSS.github.io/dataverse-client-r/articles diff --git a/docs/reference/add_dataset_file.html b/docs/reference/add_dataset_file.html index 21b954d..7400029 100644 --- a/docs/reference/add_dataset_file.html +++ b/docs/reference/add_dataset_file.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/add_file.html b/docs/reference/add_file.html index 7574142..cd64134 100644 --- a/docs/reference/add_file.html +++ b/docs/reference/add_file.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/create_dataset.html b/docs/reference/create_dataset.html index c6eb3de..2ffe7e9 100644 --- a/docs/reference/create_dataset.html +++ b/docs/reference/create_dataset.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/create_dataverse.html b/docs/reference/create_dataverse.html index 64b9211..5d5b83e 100644 --- a/docs/reference/create_dataverse.html +++ b/docs/reference/create_dataverse.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/dataset_atom.html b/docs/reference/dataset_atom.html index 3dc45ea..3936166 100644 --- a/docs/reference/dataset_atom.html +++ b/docs/reference/dataset_atom.html @@ 
-12,8 +12,8 @@ + - diff --git a/docs/reference/dataset_versions.html b/docs/reference/dataset_versions.html index 58892cb..67c54e1 100644 --- a/docs/reference/dataset_versions.html +++ b/docs/reference/dataset_versions.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/dataverse.html b/docs/reference/dataverse.html index 331144e..c942aa7 100644 --- a/docs/reference/dataverse.html +++ b/docs/reference/dataverse.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/dataverse_metadata.html b/docs/reference/dataverse_metadata.html index 08d602d..9d88281 100644 --- a/docs/reference/dataverse_metadata.html +++ b/docs/reference/dataverse_metadata.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/dataverse_search.html b/docs/reference/dataverse_search.html index d95684b..943ee2a 100644 --- a/docs/reference/dataverse_search.html +++ b/docs/reference/dataverse_search.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/delete_dataset.html b/docs/reference/delete_dataset.html index c542790..5c726c7 100644 --- a/docs/reference/delete_dataset.html +++ b/docs/reference/delete_dataset.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/delete_dataverse.html b/docs/reference/delete_dataverse.html index c9f0712..f8ebedf 100644 --- a/docs/reference/delete_dataverse.html +++ b/docs/reference/delete_dataverse.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/delete_file.html b/docs/reference/delete_file.html index 19d41bf..b4d224b 100644 --- a/docs/reference/delete_file.html +++ b/docs/reference/delete_file.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/delete_sword_dataset.html b/docs/reference/delete_sword_dataset.html index 5ca44cc..4b81c64 100644 --- a/docs/reference/delete_sword_dataset.html +++ b/docs/reference/delete_sword_dataset.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/files.html b/docs/reference/files.html index a65f9bd..add2fbe 100644 --- a/docs/reference/files.html +++ b/docs/reference/files.html @@ -12,8 +12,8 @@ + - diff --git 
a/docs/reference/get_dataframe.html b/docs/reference/get_dataframe.html index 262cd1a..2703a77 100644 --- a/docs/reference/get_dataframe.html +++ b/docs/reference/get_dataframe.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/get_dataframe_internal.html b/docs/reference/get_dataframe_internal.html index 90fa686..e449bdc 100644 --- a/docs/reference/get_dataframe_internal.html +++ b/docs/reference/get_dataframe_internal.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/get_dataset.html b/docs/reference/get_dataset.html index dc3039b..8907f9f 100644 --- a/docs/reference/get_dataset.html +++ b/docs/reference/get_dataset.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/get_dataverse.html b/docs/reference/get_dataverse.html index 07ee595..74a8ae7 100644 --- a/docs/reference/get_dataverse.html +++ b/docs/reference/get_dataverse.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/get_facets.html b/docs/reference/get_facets.html index 9e10695..694ae54 100644 --- a/docs/reference/get_facets.html +++ b/docs/reference/get_facets.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/get_file_metadata.html b/docs/reference/get_file_metadata.html index f145eee..9eaf7d6 100644 --- a/docs/reference/get_file_metadata.html +++ b/docs/reference/get_file_metadata.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/get_user_key.html b/docs/reference/get_user_key.html index 97a426b..c665dca 100644 --- a/docs/reference/get_user_key.html +++ b/docs/reference/get_user_key.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/index.html b/docs/reference/index.html index 695a1c9..af54628 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -12,8 +12,8 @@ + - @@ -145,7 +145,7 @@

    Reference

    -

    All functions

    +

    Retrieve

    @@ -157,159 +157,198 @@

    add_dataset_file() update_dataset_file()

    +

    get_file() get_file_by_name() get_file_by_id() get_file_by_doi()

    -

    Add or update a file in a dataset

    +

    Download File

    -

    add_file()

    +

    get_dataframe_by_name() get_dataframe_by_id() get_dataframe_by_doi()

    -

    Add file (SWORD)

    +

    Get file from dataverse and convert it into a dataframe or tibble

    -

    create_dataset() update_dataset()

    +

    get_dataset() dataset_metadata() dataset_files()

    -

    Create or update a dataset

    +

    Get dataset

    -

    create_dataverse()

    +

    get_dataverse() dataverse_contents()

    -

    Create Dataverse

    +

    Get Dataverse

    -

    dataset_atom() dataset_statement()

    +

    get_facets()

    -

    View dataset (SWORD)

    +

    Get Dataverse facets

    -

    dataset_versions()

    +

    get_file_metadata()

    -

    Dataset versions

    +

    Retrieve a ddi metadata file

    -

    dataverse

    +

    get_user_key()

    -

    Client for Dataverse 4 Repositories

    - +

    Get API Key

    + + + + +

    Create, Add, & Publish

    +

    + + + + + + + + -

    dataverse_metadata()

    +

    create_dataset() update_dataset()

    -

    Dataverse metadata

    +

    Create or update a dataset

    -

    dataverse_search()

    +

    create_dataverse()

    -

    Search Dataverse server

    +

    Create Dataverse

    -

    delete_dataset()

    +

    add_dataset_file() update_dataset_file()

    -

    Delete draft dataset

    +

    Add or update a file in a dataset

    -

    delete_dataverse()

    +

    add_file()

    -

    Delete Dataverse

    +

    Add file (SWORD)

    -

    delete_file()

    +

    publish_dataset()

    -

    Delete file (SWORD)

    +

    Publish dataset

    -

    delete_sword_dataset()

    +

    publish_dataverse()

    -

    Delete dataset (SWORD)

    +

    Publish Dataverse (SWORD)

    -

    get_file() get_file_by_name() get_file_by_id() get_file_by_doi()

    +

    publish_sword_dataset()

    -

    Download File

    - +

    Publish dataset (SWORD)

    + + + + +

    Delete

    +

    + + + + + + + + -

    get_dataframe_by_name() get_dataframe_by_id() get_dataframe_by_doi()

    +

    delete_dataset()

    -

    Get file from dataverse and convert it into a dataframe or tibble

    +

    Delete draft dataset

    -

    get_dataset() dataset_metadata() dataset_files()

    +

    delete_dataverse()

    -

    Get dataset

    +

    Delete Dataverse

    -

    get_dataverse() dataverse_contents()

    +

    delete_file()

    -

    Get Dataverse

    +

    Delete file (SWORD)

    -

    get_facets()

    +

    delete_sword_dataset()

    -

    Get Dataverse facets

    - +

    Delete dataset (SWORD)

    + + + + +

    Other

    +

    + + + + + + + + -

    get_file_metadata()

    +

    dataset_atom() dataset_statement()

    -

    Retrieve a ddi metadata file

    +

    View dataset (SWORD)

    -

    get_user_key()

    +

    dataset_versions()

    -

    Get API Key

    +

    Dataset versions

    -

    initiate_sword_dataset()

    +

    dataverse

    -

    Initiate dataset (SWORD)

    +

    Client for Dataverse 4 Repositories

    -

    is_ingested()

    +

    dataverse_metadata()

    -

    Identify if file is an ingested file

    +

    Dataverse metadata

    -

    list_datasets()

    +

    dataverse_search()

    -

    List datasets (SWORD)

    +

    Search Dataverse server

    -

    publish_dataset()

    +

    initiate_sword_dataset()

    -

    Publish dataset

    +

    Initiate dataset (SWORD)

    -

    publish_dataverse()

    +

    is_ingested()

    -

    Publish Dataverse (SWORD)

    +

    Identify if file is an ingested file

    -

    publish_sword_dataset()

    +

    list_datasets()

    -

    Publish dataset (SWORD)

    +

    List datasets (SWORD)

    diff --git a/docs/reference/initiate_sword_dataset.html b/docs/reference/initiate_sword_dataset.html index 8fed0e1..ee94fd5 100644 --- a/docs/reference/initiate_sword_dataset.html +++ b/docs/reference/initiate_sword_dataset.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/is_ingested.html b/docs/reference/is_ingested.html index 38b91ec..1ff70da 100644 --- a/docs/reference/is_ingested.html +++ b/docs/reference/is_ingested.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/list_datasets.html b/docs/reference/list_datasets.html index 2508abd..22424db 100644 --- a/docs/reference/list_datasets.html +++ b/docs/reference/list_datasets.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/publish_dataset.html b/docs/reference/publish_dataset.html index 1de0a06..aa139bc 100644 --- a/docs/reference/publish_dataset.html +++ b/docs/reference/publish_dataset.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/publish_dataverse.html b/docs/reference/publish_dataverse.html index 8ac7b93..5d1bd92 100644 --- a/docs/reference/publish_dataverse.html +++ b/docs/reference/publish_dataverse.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/publish_sword_dataset.html b/docs/reference/publish_sword_dataset.html index 93f0188..194f299 100644 --- a/docs/reference/publish_sword_dataset.html +++ b/docs/reference/publish_sword_dataset.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/service_document.html b/docs/reference/service_document.html index a7c7d39..695d136 100644 --- a/docs/reference/service_document.html +++ b/docs/reference/service_document.html @@ -12,8 +12,8 @@ + - diff --git a/docs/reference/set_dataverse_metadata.html b/docs/reference/set_dataverse_metadata.html index 9af611c..4f97254 100644 --- a/docs/reference/set_dataverse_metadata.html +++ b/docs/reference/set_dataverse_metadata.html @@ -12,8 +12,8 @@ + - From 926aef128ffdd1aa2d0f0e3f7d9669e506d102ad Mon Sep 17 00:00:00 2001 From: Will Beasley Date: Sun, 3 Jan 2021 20:41:56 -0600 Subject: [PATCH 
74/75] fix documentation parsing error close #72 --- R/SWORD_files.R | 2 +- docs/pkgdown.yml | 2 +- docs/reference/delete_file.html | 26 +++++++++++++++++++++++++- man/delete_file.Rd | 2 +- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/R/SWORD_files.R b/R/SWORD_files.R index 90dfea6..2b3d5d7 100644 --- a/R/SWORD_files.R +++ b/R/SWORD_files.R @@ -121,7 +121,7 @@ add_file <- function(dataset, file, key = Sys.getenv("DATAVERSE_KEY"), server = #' #' # delete a file #' ds <- dataset_statement(dat) -#' delete_file(ds$files[[1]]$id +#' delete_file(ds$files[[1]]$id) #' #' # delete a dataset #' delete_dataset(dat) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 46f2d65..cba2373 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -6,7 +6,7 @@ articles: B-search: B-search.html C-retrieval: C-retrieval.html D-archiving: D-archiving.html -last_built: 2021-01-03T22:03Z +last_built: 2021-01-04T02:40Z urls: reference: https://IQSS.github.io/dataverse-client-r/reference article: https://IQSS.github.io/dataverse-client-r/articles diff --git a/docs/reference/delete_file.html b/docs/reference/delete_file.html index b4d224b..580e178 100644 --- a/docs/reference/delete_file.html +++ b/docs/reference/delete_file.html @@ -188,7 +188,31 @@

    See a

    Managing a Dataverse: publish_dataverse; Managing a dataset: dataset_atom, list_datasets, create_dataset, delete_dataset, publish_dataset; Managing files within a dataset: add_file, delete_file

    Examples

    -
    
    +    
    if (FALSE) { +# retrieve your service document +d <- service_document() + +# create a list of metadata +metadat <- list(title = "My Study", + creator = "Doe, John", + description = "An example study") + +# create the dataset +dat <- initiate_sword_dataset("mydataverse", body = metadat) + +# add files to dataset +tmp <- tempfile() +write.csv(iris, file = tmp) +f <- add_file(dat, file = tmp) + +# delete a file +ds <- dataset_statement(dat) +delete_file(ds$files[[1]]$id) + +# delete a dataset +delete_dataset(dat) +} +
    -

    For “native” Dataverse features (such as user account controls) or to create and publish a dataset, you will need an API key linked to a Dataverse installation account. Instructions for obtaining an account and setting up an API key are available in the Dataverse User Guide. (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called DATAVERSE_KEY. It can be set within R using:

    +

    For “native” Dataverse features (such as user account controls) or to create and publish a dataset, you will need an API key linked to a Dataverse installation account. Instructions for obtaining an account and setting up an API key are available in the Dataverse User Guide. (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called DATAVERSE_KEY. It can be set within R using:

     Sys.setenv("DATAVERSE_KEY" = "examplekey12345")

    With that set, you can easily create a new dataverse, create a dataset within that dataverse, push files to the dataset, and release it:

    @@ -164,7 +164,7 @@

    Appendix: dvn to dataverse Crosswalk

    -

    The original Dataverse client for R was called dvn; it worked with Dataverse versions <= 3 and was removed from CRAN in favor of dataverse in 2018. dvn provided functionality for searching, retrieving, and depositing data. Here is a cross-walk of functionality in case you were already familiar with the dvn package:

    +

    The original Dataverse client for R was called dvn; it worked with Dataverse versions <= 3 and was removed from CRAN in favor of dataverse in 2018. dvn provided functionality for searching, retrieving, and depositing data. Here is a cross-walk of functionality in case you were already familiar with the dvn package:

    diff --git a/docs/articles/C-retrieval.html b/docs/articles/C-retrieval.html index bc55799..b5258d7 100644 --- a/docs/articles/C-retrieval.html +++ b/docs/articles/C-retrieval.html @@ -102,11 +102,11 @@

    2017-06-15

    -

    This vignette shows how to download data from Dataverse using the dataverse package. We’ll focus on a Dataverse repository that contains supplemental files for Jamie Monogan’s book Political Analysis Using R, which is stored at Harvard University’s IQSS Dataverse Network:

    +

    This vignette shows how to download data from Dataverse using the dataverse package. We’ll focus on a Dataverse repository that contains supplemental files for Jamie Monogan’s book Political Analysis Using R, which is stored at Harvard University’s IQSS Dataverse Network:

    Monogan, Jamie, 2015, “Political Analysis Using R: Example Code and Data, Plus Data for Practice Problems”, doi:10.7910/DVN/ARKOTI, Harvard Dataverse, V1, UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==

    -

    This study is persistently retrievable by a “Digital Object Identifier (DOI)”: https://doi.org/10.7910/DVN/ARKOTI and the citation above (taken from the Dataverse page) includes a “Universal Numeric Fingerprint (UNF)”: UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==, which provides a versioned, multi-file hash for the entire study, which contains 32 files.

    +

    This study is persistently retrievable by a “Digital Object Identifier (DOI)”: https://doi.org/10.7910/DVN/ARKOTI and the citation above (taken from the Dataverse page) includes a “Universal Numeric Fingerprint (UNF)”: UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==, which provides a versioned, multi-file hash for the entire study, which contains 32 files.

    If you don’t already know what datasets and files you want to use from Dataverse, see the “Data Search” vignette for guidance on data search and discovery.

    diff --git a/docs/index.html b/docs/index.html index 3306641..2559d3b 100644 --- a/docs/index.html +++ b/docs/index.html @@ -96,8 +96,8 @@ -

    Dataverse Project logo

    -

    The dataverse package provides access to Dataverse 4 APIs, enabling data search, retrieval, and deposit, thus allowing R users to integrate public data sharing into the reproducible research workflow. dataverse is the next-generation iteration of the dvn package, which works with Dataverse 3 (“Dataverse Network”) applications. dataverse includes numerous improvements for data search, retrieval, and deposit, including use of the (currently in development) sword package for data deposit and the UNF package for data fingerprinting.

    +

    Dataverse Project logo

    +

    The dataverse package provides access to Dataverse 4 APIs, enabling data search, retrieval, and deposit, thus allowing R users to integrate public data sharing into the reproducible research workflow. dataverse is the next-generation iteration of the dvn package, which works with Dataverse 3 (“Dataverse Network”) applications. dataverse includes numerous improvements for data search, retrieval, and deposit, including use of the (currently in development) sword package for data deposit and the UNF package for data fingerprinting.

    Getting Started

    @@ -107,14 +107,14 @@

    Keys

    -

    Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that a specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the Dataverse User Guide. (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called DATAVERSE_KEY. It can be set within R using:

    +

    Some features of the Dataverse 4 API are public and require no authentication. This means in many cases you can search for and retrieve data without a Dataverse account for that specific Dataverse installation. But, other features require a Dataverse account for the specific server installation of the Dataverse software, and an API key linked to that account. Instructions for obtaining an account and setting up an API key are available in the Dataverse User Guide. (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called DATAVERSE_KEY. It can be set within R using:

     Sys.setenv("DATAVERSE_KEY" = "examplekey12345")

    Server

    -

    Because there are many Dataverse installations, all functions in the R client require specifying what server installation you are interacting with. This can be set by default with an environment variable, DATAVERSE_SERVER. This should be the Dataverse server, without the “https” prefix or the “/api” URL path, etc. For example, the Harvard Dataverse can be used by setting:

    +

    Because there are many Dataverse installations, all functions in the R client require specifying what server installation you are interacting with. This can be set by default with an environment variable, DATAVERSE_SERVER. This should be the Dataverse server, without the “https” prefix or the “/api” URL path, etc. For example, the Harvard Dataverse can be used by setting:

     Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")

    Note: The package attempts to compensate for any malformed values, though.

    @@ -131,14 +131,14 @@

    Use the get_dataframe_*() functions, depending on the input you have. For example, we will read a survey dataset on Dataverse, nlsw88.dta (doi:10.70122/FK2/PPKHI1/ZYATZZ), originally in Stata dta form.

    With a file DOI, we can use the get_dataframe_by_doi function:

    -nlsw <- 
    +nlsw <-
       get_dataframe_by_doi(
         filedoi     = "10.70122/FK2/PPIAXE/MHDB0O",
         server      = "demo.dataverse.org"
       )
    ## Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE.
     
    -## 
    +##
     ## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────
     ## cols(
     ##   idcode = col_double(),
    @@ -162,7 +162,7 @@ 

    which by default reads in the ingested file (not the original dta) by the readr::read_tsv function.

    Alternatively, we can download the same file by specifying the filename and the DOI of the “dataset” (in Dataverse, a collection of files is called a dataset).

    -nlsw_tsv <- 
    +nlsw_tsv <-
       get_dataframe_by_name(
         filename  = "nlsw88.tab",
         dataset   = "10.70122/FK2/PPIAXE",
    @@ -172,7 +172,7 @@ 

    This default is safe because you may not have the proprietary software that was originally used. On the other hand, the data may have lost information in the process of ingestion.

    Instead, to read the same file but its original version, specify original = TRUE and set an .f argument. In this case, we know that nlsw88.tab is a Stata .dta dataset, so we will use the haven::read_dta function.

    -nlsw_original <- 
    +nlsw_original <-
       get_dataframe_by_name(
         filename    = "nlsw88.tab",
         dataset     = "10.70122/FK2/PPIAXE",
    @@ -188,7 +188,7 @@ 

    ## [1] "numeric"
     attr(nlsw_original$race, "labels") # original dta has value labels
    -
    ## white black other 
    +
    ## white black other
     ##     1     2     3
    @@ -196,7 +196,7 @@

    Reading a dataset as a binary file.

    In some cases, you may not want to read in the data in your environment, perhaps because that is not possible (e.g. for a .docx file), and you want to simply write these files to your local disk. To do this, use the more primitive get_file_* commands. The arguments are equivalent, except we no longer need an .f argument.

    -nlsw_raw <- 
    +nlsw_raw <-
       get_file_by_name(
         filename    = "nlsw88.tab",
         dataset     = "10.70122/FK2/PPIAXE",
    @@ -214,7 +214,7 @@ 

    dataset = "10.70122/FK2/PPIAXE", server = "demo.dataverse.org" )

    -
    ## Dataset (182162): 
    +
    ## Dataset (182162):
     ## Version: 1.1, RELEASED
     ## Release Date: 2020-12-30T00:00:24Z
     ## License: CC0
    @@ -247,7 +247,7 @@ 

    d <- service_document() # create a list of metadata -metadat <- +metadat <- list( title = "My Study", creator = "Doe, John", diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index cba2373..e7f6cff 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -6,7 +6,7 @@ articles: B-search: B-search.html C-retrieval: C-retrieval.html D-archiving: D-archiving.html -last_built: 2021-01-04T02:40Z +last_built: 2021-01-17T17:13Z urls: reference: https://IQSS.github.io/dataverse-client-r/reference article: https://IQSS.github.io/dataverse-client-r/articles diff --git a/docs/reference/dataverse.html b/docs/reference/dataverse.html index c942aa7..841223f 100644 --- a/docs/reference/dataverse.html +++ b/docs/reference/dataverse.html @@ -158,8 +158,8 @@

    Details

    References

    -

    Dataverse API Documentation

    -

    Dataverse Homepage

    +

    Dataverse API Documentation

    +

    Dataverse Homepage

    Harvard IQSS Dataverse

    diff --git a/docs/reference/initiate_sword_dataset.html b/docs/reference/initiate_sword_dataset.html index ee94fd5..77a6ea5 100644 --- a/docs/reference/initiate_sword_dataset.html +++ b/docs/reference/initiate_sword_dataset.html @@ -187,7 +187,7 @@

    Value

    An object of class “dataset_atom”.

    Details

    -

    This function is used to initiate a dataset in a (SWORD) Dataverse by supplying relevant metadata. The function is part of the SWORD API (see Atom entry specification), which is used to upload data to a Dataverse server. +

    This function is used to initiate a dataset in a (SWORD) Dataverse by supplying relevant metadata. The function is part of the SWORD API (see Atom entry specification), which is used to upload data to a Dataverse server. Allowed fields are: “abstract”, “accessRights”, “accrualMethod”, “accrualPeriodicity”, “accrualPolicy”, “alternative”, diff --git a/man/dataverse.Rd b/man/dataverse.Rd index 570a4c6..651cb6f 100644 --- a/man/dataverse.Rd +++ b/man/dataverse.Rd @@ -23,9 +23,9 @@ This package provides five main sets of functions to interact with Dataverse: } } \references{ -\href{http://guides.dataverse.org/en/latest/api/index.html}{Dataverse API Documentation} +\href{https://guides.dataverse.org/en/latest/api/index.html}{Dataverse API Documentation} -\href{http://dataverse.org/}{Dataverse Homepage} +\href{https://dataverse.org/}{Dataverse Homepage} \href{https://dataverse.harvard.edu/}{Harvard IQSS Dataverse} } diff --git a/man/initiate_sword_dataset.Rd b/man/initiate_sword_dataset.Rd index 9565759..b9e92c8 100644 --- a/man/initiate_sword_dataset.Rd +++ b/man/initiate_sword_dataset.Rd @@ -38,7 +38,7 @@ An object of class \dQuote{dataset_atom}. Initiate a SWORD (possibly unpublished) dataset } \details{ -This function is used to initiate a dataset in a (SWORD) Dataverse by supplying relevant metadata. The function is part of the SWORD API (see \href{http://www.ietf.org/rfc/rfc5023.txt}{Atom entry specification}), which is used to upload data to a Dataverse server. +This function is used to initiate a dataset in a (SWORD) Dataverse by supplying relevant metadata. The function is part of the SWORD API (see \href{https://www.ietf.org/rfc/rfc5023.txt}{Atom entry specification}), which is used to upload data to a Dataverse server. 
Allowed fields are: \dQuote{abstract}, \dQuote{accessRights}, \dQuote{accrualMethod}, \dQuote{accrualPeriodicity}, \dQuote{accrualPolicy}, \dQuote{alternative}, diff --git a/vignettes/A-introduction.Rmd b/vignettes/A-introduction.Rmd index 3f1501f..9b380cb 100644 --- a/vignettes/A-introduction.Rmd +++ b/vignettes/A-introduction.Rmd @@ -15,7 +15,7 @@ vignette: > %\VignetteEncoding{UTF-8} --- -The **dataverse** package is the official R client for [Dataverse 4](http://dataverse.org/) data repositories. The package enables data search, retrieval, and deposit with any Dataverse installation, thus allowing R users to integrate public data sharing into the reproducible research workflow. +The **dataverse** package is the official R client for [Dataverse 4](https://dataverse.org/) data repositories. The package enables data search, retrieval, and deposit with any Dataverse installation, thus allowing R users to integrate public data sharing into the reproducible research workflow. In addition to this introduction, the package contains three additional vignettes covering: @@ -66,7 +66,7 @@ get_file_metadata() get_file() ``` -For "native" Dataverse features (such as user account controls) or to create and publish a dataset, you will need an API key linked to a Dataverse installation account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](http://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: +For "native" Dataverse features (such as user account controls) or to create and publish a dataset, you will need an API key linked to a Dataverse installation account. 
Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](https://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: ```R Sys.setenv("DATAVERSE_KEY" = "examplekey12345") @@ -100,7 +100,7 @@ Your data are now publicly accessible. ## Appendix: dvn to dataverse Crosswalk -The original Dataverse client for R was called [dvn](https://cran.r-project.org/web/packages/dvn/index.html); it worked with Dataverse versions <= 3 and was removed from CRAN in favor of [dataverse](https://CRAN.R-project.org/package=dataverse) in 2018. dvn provided functionality for searching, retrieving, and depositing data. Here is a cross-walk of functionality in case you were already familiar with the dvn package: +The original Dataverse client for R was called [dvn](https://CRAN.R-project.org/package=dvn); it worked with Dataverse versions <= 3 and was removed from CRAN in favor of [dataverse](https://CRAN.R-project.org/package=dataverse) in 2018. dvn provided functionality for searching, retrieving, and depositing data. Here is a cross-walk of functionality in case you were already familiar with the dvn package: | API Category | **dataverse** functions | **dvn** functions | | ------------ | ----------------------- | ----------------- | diff --git a/vignettes/A-introduction.Rmd2 b/vignettes/A-introduction.Rmd2 index d33d8d3..22413ef 100644 --- a/vignettes/A-introduction.Rmd2 +++ b/vignettes/A-introduction.Rmd2 @@ -15,7 +15,7 @@ vignette: > %\VignetteEncoding{UTF-8} --- -The **dataverse** package is the official R client for [Dataverse 4](http://dataverse.org/) data repositories. 
The package enables data search, retrieval, and deposit with any Dataverse installation, thus allowing R users to integrate public data sharing into the reproducible research workflow. +The **dataverse** package is the official R client for [Dataverse 4](https://dataverse.org/) data repositories. The package enables data search, retrieval, and deposit with any Dataverse installation, thus allowing R users to integrate public data sharing into the reproducible research workflow. In addition to this introduction, the package contains three additional vignettes covering: @@ -46,7 +46,7 @@ library("dataverse") Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu") ``` -This should be the Dataverse server, without the "https" prefix or the "/api" URL path, etc. The package attempts to compensate for any malformed values, though. +This should be the Dataverse server, without the "https" prefix or the "/api" URL path, etc. The package attempts to compensate for any malformed values, though. Within a given Dataverse installation, organizations or individuals can create objects that are also called "Dataverses". These Dataverses can then contain other *dataverses*, which can contain other *dataverses*, and so on. They can also contain *datasets* which in turn contain files. You can think of Harvard's Dataverse as a top-level installation, where an institution might have a *dataverse* that contains a subsidiary *dataverse* for each researcher at the organization, who in turn publishes all files relevant to a given study as a *dataset*. @@ -65,7 +65,7 @@ get_file_metadata() get_file() ``` -For "native" Dataverse features (such as user account controls) or to create and publish a dataset, you will need an API key linked to a Dataverse installation account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](http://guides.dataverse.org/en/latest/user/account.html). 
(Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: +For "native" Dataverse features (such as user account controls) or to create and publish a dataset, you will need an API key linked to a Dataverse installation account. Instructions for obtaining an account and setting up an API key are available in the [Dataverse User Guide](https://guides.dataverse.org/en/latest/user/account.html). (Note: if your key is compromised, it can be regenerated to preserve security.) Once you have an API key, this should be stored as an environment variable called `DATAVERSE_KEY`. It can be set within R using: ```R Sys.setenv("DATAVERSE_KEY" = "examplekey12345") diff --git a/vignettes/C-retrieval.Rmd b/vignettes/C-retrieval.Rmd index 71d91ad..2b24ded 100644 --- a/vignettes/C-retrieval.Rmd +++ b/vignettes/C-retrieval.Rmd @@ -17,11 +17,11 @@ vignette: > -This vignette shows how to download data from Dataverse using the dataverse package. We'll focus on a Dataverse repository that contains supplemental files for [Jamie Monogan](http://spia.uga.edu/faculty-member/jamie-monogan/)'s book [*Political Analysis Using R*](http://www.springer.com/gb/book/9783319234458), which is stored at Harvard University's [IQSS Dataverse Network](https://dataverse.harvard.edu/): +This vignette shows how to download data from Dataverse using the dataverse package. 
We'll focus on a Dataverse repository that contains supplemental files for [Jamie Monogan](https://spia.uga.edu/faculty-member/jamie-monogan/)'s book [*Political Analysis Using R*](https://www.springer.com/gb/book/9783319234458), which is stored at Harvard University's [IQSS Dataverse Network](https://dataverse.harvard.edu/): > Monogan, Jamie, 2015, "Political Analysis Using R: Example Code and Data, Plus Data for Practice Problems", [doi:10.7910/DVN/ARKOTI](https://doi.org/10.7910/DVN/ARKOTI), Harvard Dataverse, V1, UNF:6:+itU9hcUJ8I9E0Kqv8HWHg== -This study is persistently retrievable by a "[Digital Object Identifier (DOI)](https://www.doi.org/)": https://doi.org/10.7910/DVN/ARKOTI and the citation above (taken from the Dataverse page) includes a "[Universal Numeric Fingerprint (UNF)](http://guides.dataverse.org/en/latest/developers/unf/index.html)": `UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==`, which provides a versioned, multi-file hash for the entire study, which contains 32 files. +This study is persistently retrievable by a "[Digital Object Identifier (DOI)](https://www.doi.org/)": https://doi.org/10.7910/DVN/ARKOTI and the citation above (taken from the Dataverse page) includes a "[Universal Numeric Fingerprint (UNF)](https://guides.dataverse.org/en/latest/developers/unf/index.html)": `UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==`, which provides a versioned, multi-file hash for the entire study, which contains 32 files. If you don't already know what datasets and files you want to use from Dataverse, see the ["Data Search" vignette](B-search.html) for guidance on data search and discovery. diff --git a/vignettes/C-retrieval.Rmd2 b/vignettes/C-retrieval.Rmd2 index 15c6b77..a6c5230 100644 --- a/vignettes/C-retrieval.Rmd2 +++ b/vignettes/C-retrieval.Rmd2 @@ -20,11 +20,11 @@ options(width = 120) knitr::opts_chunk$set(results = "hold") ``` -This vignette shows how to download data from Dataverse using the dataverse package. 
We'll focus on a Dataverse repository that contains supplemental files for [Jamie Monogan](http://spia.uga.edu/faculty-member/jamie-monogan/)'s book [*Political Analysis Using R*](http://www.springer.com/gb/book/9783319234458), which is stored at Harvard University's [IQSS Dataverse Network](https://dataverse.harvard.edu/): +This vignette shows how to download data from Dataverse using the dataverse package. We'll focus on a Dataverse repository that contains supplemental files for [Jamie Monogan](https://spia.uga.edu/faculty-member/jamie-monogan/)'s book [*Political Analysis Using R*](https://www.springer.com/gb/book/9783319234458), which is stored at Harvard University's [IQSS Dataverse Network](https://dataverse.harvard.edu/): > Monogan, Jamie, 2015, "Political Analysis Using R: Example Code and Data, Plus Data for Practice Problems", [doi:10.7910/DVN/ARKOTI](https://doi.org/10.7910/DVN/ARKOTI), Harvard Dataverse, V1, UNF:6:+itU9hcUJ8I9E0Kqv8HWHg== -This study is persistently retrievable by a "[Digital Object Identifier (DOI)](https://www.doi.org/)": https://doi.org/10.7910/DVN/ARKOTI and the citation above (taken from the Dataverse page) includes a "[Universal Numeric Fingerprint (UNF)](http://guides.dataverse.org/en/latest/developers/unf/index.html)": `UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==`, which provides a versioned, multi-file hash for the entire study, which contains 32 files. +This study is persistently retrievable by a "[Digital Object Identifier (DOI)](https://www.doi.org/)": https://doi.org/10.7910/DVN/ARKOTI and the citation above (taken from the Dataverse page) includes a "[Universal Numeric Fingerprint (UNF)](https://guides.dataverse.org/en/latest/developers/unf/index.html)": `UNF:6:+itU9hcUJ8I9E0Kqv8HWHg==`, which provides a versioned, multi-file hash for the entire study, which contains 32 files. 
If you don't already know what datasets and files you want to use from Dataverse, see the ["Data Search" vignette](B-search.html) for guidance on data search and discovery. @@ -44,7 +44,7 @@ The output prints some basic metadata and then the `str()` of the `files` data f dataset$files[c("filename", "contentType")] ``` -This shows that there are indeed 32 files, a mix of .R code files and tab- and comma-separated data files. +This shows that there are indeed 32 files, a mix of .R code files and tab- and comma-separated data files. You can also retrieve more extensive metadata using `dataset_metadata()`: @@ -67,9 +67,9 @@ writeBin(code3, "chapter03.R") Now we'll get the corresponding data and save it locally. For this code we need two data files: ```{r} -writeBin(get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI"), +writeBin(get_file("constructionData.tab", "doi:10.7910/DVN/ARKOTI"), "constructionData.dta") -writeBin(get_file("PESenergy.csv", "doi:10.7910/DVN/ARKOTI"), +writeBin(get_file("PESenergy.csv", "doi:10.7910/DVN/ARKOTI"), "PESenergy.csv") ```

    API Category