Skip to content

Commit

Permalink
Add gutenberg_get_all_mirrors() (#68)
Browse files Browse the repository at this point in the history
* Separate mirror functions

* Add `gutenberg_get_all_mirrors`
  • Loading branch information
jrdnbradford authored Sep 1, 2024
1 parent 4443a05 commit 2a73b8e
Show file tree
Hide file tree
Showing 10 changed files with 188 additions and 79 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(gutenberg_download)
export(gutenberg_get_all_mirrors)
export(gutenberg_get_mirror)
export(gutenberg_strip)
export(gutenberg_works)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# gutenbergr (development version)

* `gutenberg_get_all_mirrors()` has been added to retrieve mirror data (@jrdnbradford, #58)

# gutenbergr 0.2.4

* Update data scraping process to use R end-to-end (@jonthegeek, #36).
Expand Down
46 changes: 0 additions & 46 deletions R/gutenberg_download.R
Original file line number Diff line number Diff line change
Expand Up @@ -148,52 +148,6 @@ read_next <- function(possible_urls) {
}
}

#' Get the recommended mirror for Gutenberg files
#'
#' Get the recommended mirror for Gutenberg files and set the global
#' `gutenberg_mirror` options.
#'
#' @param verbose Whether to show messages about the Project Gutenberg mirror
#' that was chosen
#'
#' @return A character vector with the url for the chosen mirror.
#'
#' @examplesIf interactive()
#'
#' gutenberg_get_mirror()
#'
#' @export
gutenberg_get_mirror <- function(verbose = TRUE) {
mirror <- getOption("gutenberg_mirror")
if (!is.null(mirror)) {
return(mirror)
}

# figure out the mirror for this location from wget
harvest_url <- "https://www.gutenberg.org/robot/harvest"
maybe_message(
verbose,
"Determining mirror for Project Gutenberg from {harvest_url}."
)
wget_url <- glue::glue("{harvest_url}?filetypes[]=txt")
lines <- read_url(wget_url)
a <- stringr::str_subset(lines, stringr::fixed("<a href="))[1]
mirror_full_url <- stringr::str_match(a, "href=\"(.*?)\"")[2]

# parse and leave out the path
parsed <- urltools::url_parse(mirror_full_url)
if (parsed$domain == "www.gutenberg.lib.md.us") {
# Broken mirror. PG has been contacted. For now, replace:
parsed$domain <- "aleph.gutenberg.org" # nocov
}

mirror <- unclass(glue::glue_data(parsed, "{scheme}://{domain}"))
maybe_message(verbose, "Using mirror {mirror}.")

# set option for next time
options(gutenberg_mirror = mirror)
return(mirror)
}

gutenberg_add_metadata <- function(gutenberg_tbl, meta_fields) {
meta_fields <- union("gutenberg_id", meta_fields)
Expand Down
88 changes: 88 additions & 0 deletions R/gutenberg_mirrors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#' Get the recommended mirror for Gutenberg files
#'
#' Get the recommended mirror for Gutenberg files and set the global
#' `gutenberg_mirror` options.
#'
#' @param verbose Whether to show messages about the Project Gutenberg mirror
#' that was chosen
#'
#' @return A character vector with the url for the chosen mirror.
#'
#' @examplesIf interactive()
#'
#' gutenberg_get_mirror()
#'
#' @export
gutenberg_get_mirror <- function(verbose = TRUE) {
mirror <- getOption("gutenberg_mirror")
if (!is.null(mirror)) {
return(mirror)
}

# figure out the mirror for this location from wget
harvest_url <- "https://www.gutenberg.org/robot/harvest"
maybe_message(
verbose,
"Determining mirror for Project Gutenberg from {harvest_url}."
)
wget_url <- glue::glue("{harvest_url}?filetypes[]=txt")
lines <- read_url(wget_url)
a <- stringr::str_subset(lines, stringr::fixed("<a href="))[1]
mirror_full_url <- stringr::str_match(a, "href=\"(.*?)\"")[2]

# parse and leave out the path
parsed <- urltools::url_parse(mirror_full_url)
if (parsed$domain == "www.gutenberg.lib.md.us") {
# Broken mirror. PG has been contacted. For now, replace:
parsed$domain <- "aleph.gutenberg.org" # nocov
}

mirror <- unclass(glue::glue_data(parsed, "{scheme}://{domain}"))
maybe_message(verbose, "Using mirror {mirror}.")

# set option for next time
options(gutenberg_mirror = mirror)
return(mirror)
}


#' Get all mirror data from Project Gutenberg
#'
#' Get all the mirror data from \url{https://www.gutenberg.org/MIRRORS.ALL}
#'
#' @return A tbl_df of Project Gutenberg mirrors and related data
#' \describe{
#'
#' \item{continent}{Continent where the mirror is located}
#'
#' \item{nation}{Nation where the mirror is located}
#'
#' \item{location}{Location of the mirror}
#'
#' \item{provider}{Provider of the mirror}
#'
#' \item{url}{URL of the mirror}
#'
#' \item{note}{Special notes}
#' }
#' @examplesIf interactive()
#'
#' gutenberg_get_all_mirrors()
#'
#' @export
gutenberg_get_all_mirrors <- function() {
mirrors_url <- "https://www.gutenberg.org/MIRRORS.ALL"
mirrors_md <- read_url(mirrors_url)
tmp <- tempfile(fileext = ".md")
writeLines(mirrors_md, tmp)
mirrors <- suppressWarnings(
readr::read_delim(
tmp,
delim = "|",
trim_ws = TRUE
) |>
dplyr::slice(2:(dplyr::n() - 1))
)

return(mirrors)
}
34 changes: 34 additions & 0 deletions man/gutenberg_get_all_mirrors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/gutenberg_get_mirror.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions tests/testthat/fixtures/MIRRORS-ALL
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
continent | nation | location | provider | url | note
---------------+---------------+---------------------+----------------------------------------------+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------
Europe | Great Britain | Kent | UK Mirror Service | http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/ |
Europe | Great Britain | Kent | UK Mirror Service | ftp://ftp.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/ |
Europe | Great Britain | Kent | UK Mirror Service | rsync://rsync.mirrorservice.org/gutenberg/ |
Europe | Portugal | Braga | Universidade do Minho | http://eremita.di.uminho.pt/gutenberg/ |
Europe | Portugal | Braga | Universidade do Minho | ftp://eremita.di.uminho.pt/pub/gutenberg/ |
North America | Canada | Waterloo | University of Waterloo Computer Science Club | http://mirror.csclub.uwaterloo.ca/gutenberg/ |
North America | United States | Buffalo, NY | Jake Nabasny | https://gutenberg.nabasny.com/ |
North America | United States | Chapel Hill | iBiblio | https://www.gutenberg.org/dirs/ | Main Project Gutenberg Collection Site
North America | United States | Chapel Hill | iBiblio | ftp://ftp.ibiblio.org/pub/docs/books/gutenberg/ | Main Project Gutenberg FTP Site.
North America | United States | Pikeville, Kentucky | SandyRiver.NET | https://mirror2.sandyriver.net/pub/gutenberg | High speed mirror on a 10Gb network connection. Also available by http, and by rsync to rsync://mirror2.sandyriver.net/pub/gutenberg
North America | United States | Salt Lake City | Xmission ISP - FTP | ftp://mirrors.xmission.com/gutenberg/ |
North America | United States | Salt Lake City | Xmission ISP - HTTP | http://mirrors.xmission.com/gutenberg/ |
North America | United States | San Diego | Project Gutenberg | ftp://gutenberg.pglaf.org | High-speed mirror. Includes cache/generated files (epub, mobi, etc.).
North America | United States | San Diego | Project Gutenberg | https://aleph.gutenberg.org/ | High-speed mirror. Includes cache/generated files (epub, mobi, etc.). Also available via rsync and ftp.
North America | United States | San Diego | Project Gutenberg | https://gutenberg.pglaf.org/ | High-speed mirror. Includes cache/generated files (epub, mobi, etc.).
North America | United States | San Diego | Project Gutenberg | gopher://gopher.pglaf.org/ | Gopher server.
North America | United States | San Diego | Project Gutenberg | rsync://gutenberg.pglaf.org/gutenberg | High-speed mirror. Includes cache/generated files (epub, mobi, etc.).
(17 rows)

1 change: 1 addition & 0 deletions tests/testthat/fixtures/create_fixtures.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ dl_fixture("https://www.gutenberg.org/cache/epub/68283/pg68283.txt")
dl_fixture("https://www.gutenberg.org/robot/harvest?filetypes[]=txt")
dl_fixture("http://aleph.gutenberg.org/1/0/105/105-0.zip")
dl_fixture("http://aleph.gutenberg.org/1/0/109/109.zip")
dl_fixture("https://www.gutenberg.org/MIRRORS.ALL")
32 changes: 0 additions & 32 deletions tests/testthat/test-gutenberg_download.R
Original file line number Diff line number Diff line change
@@ -1,35 +1,3 @@
test_that("gutenberg_get_mirror works with no option set", {
local_dl_and_read()
withr::local_options(gutenberg_mirror = NULL)
expect_message(
expect_identical(
gutenberg_get_mirror(),
"http://aleph.gutenberg.org"
),
"Determining mirror"
)
expect_no_message(
expect_identical(
gutenberg_get_mirror(),
"http://aleph.gutenberg.org"
)
)
})

test_that("gutenberg_get_mirror respects verbose", {
local_dl_and_read()
withr::local_options(gutenberg_mirror = NULL)
expect_no_message(gutenberg_get_mirror(verbose = FALSE))
})

test_that("gutenberg_get_mirror uses existing option", {
local_dl_and_read()
withr::local_options(gutenberg_mirror = "mirror")
expect_identical(
gutenberg_get_mirror(), "mirror"
)
})

test_that("gutenberg_url constructs a url from an id", {
expect_identical(
gutenberg_url(c(1, 23, 456), mirror = "base", verbose = FALSE),
Expand Down
40 changes: 40 additions & 0 deletions tests/testthat/test-gutenberg_mirrors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
test_that("gutenberg_get_mirror works with no option set", {
local_dl_and_read()
withr::local_options(gutenberg_mirror = NULL)
expect_message(
expect_identical(
gutenberg_get_mirror(),
"http://aleph.gutenberg.org"
),
"Determining mirror"
)
expect_no_message(
expect_identical(
gutenberg_get_mirror(),
"http://aleph.gutenberg.org"
)
)
})

test_that("gutenberg_get_mirror respects verbose", {
local_dl_and_read()
withr::local_options(gutenberg_mirror = NULL)
expect_no_message(gutenberg_get_mirror(verbose = FALSE))
})

test_that("gutenberg_get_mirror uses existing option", {
local_dl_and_read()
withr::local_options(gutenberg_mirror = "mirror")
expect_identical(
gutenberg_get_mirror(), "mirror"
)
})

test_that("gutenberg_get_all_mirrors works", {
local_dl_and_read()
mirrors <- gutenberg_get_all_mirrors()
expect_true(inherits(mirrors, "data.frame"))
expect_true(inherits(mirrors, "tbl_df"))
expect_equal(ncol(mirrors), 6)
expect_true(nrow(mirrors) > 10)
})

0 comments on commit 2a73b8e

Please sign in to comment.