Merge pull request #66 from IQSS/dev-data-download
Reorganize get_file functions and add get_dataframe functions
wibeasley authored Jan 17, 2021
2 parents 96e5ed7 + 1881657 commit d4a6e31
Showing 120 changed files with 12,772 additions and 523 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
@@ -16,3 +16,6 @@ man-roxygen/*
^codecov\.yml$
^.*\.Rproj$
^\.Rproj\.user$
^_pkgdown\.yml$
^docs$
^pkgdown$
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@ README.html
doc
Meta
.Rproj.user
docs
7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -22,7 +22,7 @@ Authors@R: c(
),
person(
"Shiro", "Kuriwaki",
role = c("aut"),
role = c("aut"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-5687-2647")
),
@@ -36,17 +36,17 @@ Authors@R: c(
"Jan", "Kanis", role = "ctb"
))
Imports:
checkmate,
httr,
jsonlite,
magrittr,
readr,
stats,
utils,
xml2
Suggests:
checkmate,
covr,
foreign,
haven,
knitr,
purrr,
testthat,
@@ -61,3 +61,4 @@ BugReports: https://github.com/iqss/dataverse-client-r/issues
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.1.1
Roxygen: list(markdown = TRUE)
11 changes: 6 additions & 5 deletions NAMESPACE
@@ -27,10 +27,16 @@ export(delete_dataset)
export(delete_dataverse)
export(delete_file)
export(delete_sword_dataset)
export(get_dataframe_by_doi)
export(get_dataframe_by_id)
export(get_dataframe_by_name)
export(get_dataset)
export(get_dataverse)
export(get_facets)
export(get_file)
export(get_file_by_doi)
export(get_file_by_id)
export(get_file_by_name)
export(get_file_metadata)
export(get_user_key)
export(initiate_sword_dataset)
@@ -42,8 +48,3 @@ export(service_document)
export(set_dataverse_metadata)
export(update_dataset)
export(update_dataset_file)
import(httr)
import(xml2)
importFrom(stats,setNames)
importFrom(utils,str)
importFrom(utils,unzip)
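
As a quick orientation to the reorganized exports, here is a minimal sketch of the by-name access pattern. The demo server and DOI are reused from the roxygen examples later in this diff; argument names mirror get_dataframe_by_name() as defined below, and get_file_by_name() is assumed to return the raw file contents while get_dataframe_by_name() parses them into a data frame.

# raw bytes of the file
raw <- get_file_by_name(
  filename = "roster-bulls-1996.tab",
  dataset  = "doi:10.70122/FK2/HXJVJU",
  server   = "demo.dataverse.org"
)

# same file parsed into a data frame (readr::read_tsv is the default reader for ingested files)
df <- get_dataframe_by_name(
  filename = "roster-bulls-1996.tab",
  dataset  = "doi:10.70122/FK2/HXJVJU",
  server   = "demo.dataverse.org"
)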
1 change: 1 addition & 0 deletions NEWS.md
@@ -6,6 +6,7 @@
* Tests use https://demo.dataverse.org/dataverse/dataverse-client-r/. (#40)
* Fixes most get_file errors by removing query argument (#33 @kuriwaki)
* Fix getting multiple files by id in `get_file()` (#47 @adam3smith)
* Temporary files created by `get_file()` are automatically deleted.

# CHANGES TO dataverse 0.2.1

2 changes: 1 addition & 1 deletion R/SWORD.R
@@ -13,7 +13,7 @@
#' list_datasets(d[[2]])
#' }
#' @seealso Managing a Dataverse: \code{\link{publish_dataverse}}; Managing a dataset: \code{\link{dataset_atom}}, \code{\link{list_datasets}}, \code{\link{create_dataset}}, \code{\link{delete_dataset}}, \code{\link{publish_dataset}}; Managing files within a dataset: \code{\link{add_file}}, \code{\link{delete_file}}
#' @importFrom stats setNames
#'
#' @export
service_document <- function(key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) {
u <- paste0(api_url(server, prefix="dvn/api/"), "data-deposit/v1.1/swordv2/service-document")
3 changes: 1 addition & 2 deletions R/SWORD_dataset.R
@@ -4,7 +4,7 @@
#' @param body A list containing one or more metadata fields. Field names must be valid Dublin Core Terms labels (see details, below). The \samp{title}, \samp{description}, and \samp{creator} fields are required.
#' @template envvars
#' @template dots
#' @details This function is used to initiate a dataset in a (SWORD) Dataverse by supplying relevant metadata. The function is part of the SWORD API (see \href{http://www.ietf.org/rfc/rfc5023.txt}{Atom entry specification}), which is used to upload data to a Dataverse server.
#' @details This function is used to initiate a dataset in a (SWORD) Dataverse by supplying relevant metadata. The function is part of the SWORD API (see \href{https://www.ietf.org/rfc/rfc5023.txt}{Atom entry specification}), which is used to upload data to a Dataverse server.
#' Allowed fields are:
#' \dQuote{abstract}, \dQuote{accessRights}, \dQuote{accrualMethod},
#' \dQuote{accrualPeriodicity}, \dQuote{accrualPolicy}, \dQuote{alternative},
@@ -212,7 +212,6 @@ dataset_atom <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sy
}

#' @rdname dataset_atom
#' @import xml2
#' @export
dataset_statement <- function(dataset, key = Sys.getenv("DATAVERSE_KEY"), server = Sys.getenv("DATAVERSE_SERVER"), ...) {
if (inherits(dataset, "dataset_atom")) {
2 changes: 1 addition & 1 deletion R/SWORD_files.R
@@ -121,7 +121,7 @@ add_file <- function(dataset, file, key = Sys.getenv("DATAVERSE_KEY"), server =
#'
#' # delete a file
#' ds <- dataset_statement(dat)
#' delete_file(ds$files[[1]]$id
#' delete_file(ds$files[[1]]$id)
#'
#' # delete a dataset
#' delete_dataset(dat)
4 changes: 2 additions & 2 deletions R/dataverse-package.R
@@ -17,9 +17,9 @@
#' }
#'
#' @references
#' \href{http://guides.dataverse.org/en/latest/api/index.html}{Dataverse API Documentation}
#' \href{https://guides.dataverse.org/en/latest/api/index.html}{Dataverse API Documentation}
#'
#' \href{http://dataverse.org/}{Dataverse Homepage}
#' \href{https://dataverse.org/}{Dataverse Homepage}
#'
#' \href{https://dataverse.harvard.edu/}{Harvard IQSS Dataverse}
#'
149 changes: 149 additions & 0 deletions R/get_dataframe.R
@@ -0,0 +1,149 @@
#' Get file from dataverse and convert it into a dataframe or tibble
#'
#' Use `get_dataframe_by_id` if you know the numeric ID of the file, or
#' `get_dataframe_by_name` if you know the filename and the DOI of the dataset.
#'
#' @rdname get_dataframe
#'
#' @param filename The name of the file of interest, with file extension, for example
#' `"roster-bulls-1996.tab"`.
#' @param .f The function used for reading in the raw dataset. The user
#'   must choose the appropriate function: for example, if the target is a .rds
#'   file, then `.f` should be `readRDS` or `readr::read_rds`.
#' @param original A logical, defaulting to `FALSE`. Whether to read the original
#'   version of the file rather than the ingested, archival version. The archival
#'   versions are tab-delimited `.tab` files, so if `original = FALSE`, `.f` is set
#'   to `readr::read_tsv`. If a function to read the original version is available,
#'   then `original = TRUE` with a specified `.f` is preferable.
#'
#' @inheritDotParams get_file
#'
#' @examples
#'
#' # Retrieve data.frame from dataverse DOI and file name
#' df_from_rds_ingested <-
#'   get_dataframe_by_name(
#'     filename = "roster-bulls-1996.tab",
#'     dataset = "doi:10.70122/FK2/HXJVJU",
#'     server = "demo.dataverse.org"
#'   )
#'
#' # Retrieve the same data.frame from dataverse + file DOI
#' df_from_rds_ingested_by_doi <-
#'   get_dataframe_by_doi(
#'     filedoi = "10.70122/FK2/HXJVJU/SA3Z2V",
#'     server = "demo.dataverse.org"
#'   )
#'
#' # Retrieve ingested file originally a Stata dta
#' df_from_stata_ingested <-
#'   get_dataframe_by_name(
#'     filename = "nlsw88.tab",
#'     dataset = "doi:10.70122/FK2/PPIAXE",
#'     server = "demo.dataverse.org"
#'   )
#'
#'
#' # To use the original file version, or for non-ingested data,
#' # please specify `original = TRUE` and specify a function in .f.
#'
#' # A data.frame is still returned, but it is read in with the user-specified `.f`.
#' if (requireNamespace("readr", quietly = TRUE)) {
#'   df_from_rds_original <-
#'     get_dataframe_by_name(
#'       filename = "nlsw88_rds-export.rds",
#'       dataset = "doi:10.70122/FK2/PPIAXE",
#'       server = "demo.dataverse.org",
#'       original = TRUE,
#'       .f = readr::read_rds
#'     )
#' }
#'
#' if (requireNamespace("haven", quietly = TRUE)) {
#'   df_from_stata_original <-
#'     get_dataframe_by_name(
#'       filename = "nlsw88.tab",
#'       dataset = "doi:10.70122/FK2/PPIAXE",
#'       server = "demo.dataverse.org",
#'       original = TRUE,
#'       .f = haven::read_dta
#'     )
#' }
#' @export
get_dataframe_by_name <- function (
  filename,
  dataset = NULL,
  .f = NULL,
  original = FALSE,
  ...
) {
  # retrieve ID
  fileid <- get_fileid.character(x = dataset, file = filename, ...)

  get_dataframe_by_id(fileid, .f, original = original, ...)
}

#' @rdname get_dataframe
#' @export
get_dataframe_by_id <- function(
  fileid,
  .f = NULL,
  original = FALSE,
  ...
) {

  # if not ingested, then whether to take the original is not relevant.
  ingested <- is_ingested(fileid, ...)

  if (isFALSE(ingested)) {
    original <- NA
  }

  if (is.null(.f) & isTRUE(ingested) & isFALSE(original)) {
    message("Downloading ingested version of data with readr::read_tsv. To download the original version and remove this message, set original = TRUE.\n")
    .f <- readr::read_tsv
  }

  if (is.null(.f) & (isFALSE(ingested) | isTRUE(original))) {
    stop("read-in function was left NULL, but the target file is not ingested or you asked for the original version. Please supply a .f argument.\n")
  }

  # READ raw data
  raw <- get_file(file = fileid, original = original, ...)

  # save to temp and then read it in with supplied function
  if (!is.null(.f)) {
    get_dataframe_internal(raw, filename = "foo", .f = .f)
  }
}
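
A brief usage sketch of the branching above: for an ingested file with no `.f`, the function messages and falls back to `readr::read_tsv`. The file DOI and demo server are reused from the roxygen examples in this file, and passing a DOI as `fileid` relies on `get_file()` accepting DOIs, as noted in `get_dataframe_by_doi()` below.

# Ingested file, no .f supplied: read with readr::read_tsv after a message.
bulls <- get_dataframe_by_id(
  fileid = "doi:10.70122/FK2/HXJVJU/SA3Z2V",
  server = "demo.dataverse.org"
)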

#' @rdname get_dataframe
#' @inheritParams get_file_by_doi
#' @export
get_dataframe_by_doi <- function (
  filedoi,
  .f = NULL,
  original = FALSE,
  ...
) {
  filedoi <- prepend_doi(filedoi)

  # get_file can also take doi now
  get_dataframe_by_id(fileid = filedoi, .f = .f, original = original, ...)
}

#' Write to temp and apply function
#'
#' @keywords internal
get_dataframe_internal <- function (raw, filename, .f) {
  tryCatch(
    {
      tmp <- tempfile(filename)
      writeBin(raw, tmp)
      do.call(.f, list(tmp))
    },
    finally = {
      if (file.exists(tmp)) unlink(tmp)
    }
  )
}
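
A self-contained illustration of the helper above, assuming the package is installed from this commit (the helper is internal, hence the `:::`): the raw bytes are written to a temporary file, read with the supplied function, and the temporary file is removed in the `finally` clause.

raw_csv <- charToRaw("x,y\n1,2\n3,4\n")
dataverse:::get_dataframe_internal(raw_csv, filename = "toy", .f = read.csv)
# returns a two-row data.frame with columns x and y; the temp file is deleted afterwards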
2 changes: 1 addition & 1 deletion R/get_dataset.R
@@ -66,7 +66,7 @@ get_dataset <- function(

#' @rdname get_dataset
#' @param block A character string specifying a metadata block to retrieve. By default this is \dQuote{citation}. Other values may be available, depending on the dataset, such as \dQuote{geospatial} or \dQuote{socialscience}.
#' @importFrom utils str
#'
#' @export
dataset_metadata <- function(
dataset,
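
For reference, the `block` argument documented above can be exercised against the demo dataset used in the get_dataframe examples; the call shape follows the exported dataset_metadata() signature, with the DOI and server values reused from this diff.

meta <- dataset_metadata(
  dataset = "doi:10.70122/FK2/HXJVJU",
  block   = "citation",
  server  = "demo.dataverse.org"
)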