From 5f0e060682368ef22c7a1a488701a73d3729032c Mon Sep 17 00:00:00 2001
From: 401118 <401118@hdi.de>
Date: Tue, 16 Jul 2024 11:01:04 +0200
Subject: [PATCH] Add files_scrap.R and generic file scraping logic

---
 DESCRIPTION          |   3 +-
 NAMESPACE            |   3 +
 R/files_scrap.R      | 220 +++++++++++++++++++++++++++++++++++++++++++
 R/images_scrap.R     |  11 +++
 R/paragraphs_scrap.R |   1 +
 R/pdfs_scrap.R       | 101 --------------------
 man/csv_scrap.Rd     |  21 +++++
 man/pdfs_scrap.Rd    |  21 +++++
 man/xls_scrap.Rd     |  21 +++++
 man/xlsx_scrap.Rd    |  30 ++++++
 10 files changed, 330 insertions(+), 102 deletions(-)
 create mode 100644 R/files_scrap.R
 delete mode 100644 R/pdfs_scrap.R
 create mode 100644 man/csv_scrap.Rd
 create mode 100644 man/pdfs_scrap.Rd
 create mode 100644 man/xls_scrap.Rd
 create mode 100644 man/xlsx_scrap.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 2b4d781..a832c37 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -23,7 +23,8 @@ Imports:
     robotstxt,
     crayon,
     curl,
-    stringi
+    stringi,
+    urltools (>= 1.7.3)
 Suggests:
     knitr,
     testthat,
diff --git a/NAMESPACE b/NAMESPACE
index 49ab947..455de5d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(attribute_scrap)
+export(csv_scrap)
 export(images_noalt_scrap)
 export(images_preview)
 export(images_scrap)
@@ -11,6 +12,8 @@ export(table_scrap)
 export(tidy_scrap)
 export(titles_scrap)
 export(weblink_scrap)
+export(xls_scrap)
+export(xlsx_scrap)
 importFrom(crayon,bgRed)
 importFrom(crayon,green)
 importFrom(curl,has_internet)
diff --git a/R/files_scrap.R b/R/files_scrap.R
new file mode 100644
index 0000000..15d9527
--- /dev/null
+++ b/R/files_scrap.R
@@ -0,0 +1,220 @@
+
+.get_base_from_full_url <- function(url) {
+
+  scheme <- urltools::scheme(url)
+  domain <- urltools::domain(url)
+
+  base_url <- paste0(scheme, "://", domain)
+  base_url
+}
+
+
+
+.format_url <- function(file_url, link) {
+
+  if (grepl("^http", file_url)) {
+    return(file_url)
+  } else {
+    base_url <- .get_base_from_full_url(link)
+    file_url <- paste0(base_url, "/", file_url)
+    return(file_url)
+  }
+}
+
+.scrap_specific_file <- function(
+  link,
+  path,
+  ext,
+  askRobot
+) {
+
+  if (path != getwd() && !dir.exists(path)) {
+    stop("the path: ", path, " doesn't seem to exist!")
+  }
+
+  if (askRobot) {
+
+    if (paths_allowed(link) == TRUE) {
+      message(green("the robots.txt doesn't prohibit scraping this web page"))
+
+    } else {
+      message(bgRed(
+        "WARNING: the robots.txt doesn't allow scraping this web page"
+      ))
+
+    }
+  }
+
+  urls_containing_files <- weblink_scrap(
+    link,
+    contain = ext
+  )
+
+  files_to_consider <- urls_containing_files %>%
+    purrr::keep(function(x) {
+      tolower(tools::file_ext(x)) == ext
+    })
+
+  if (length(files_to_consider) == 0) {
+    message("No file has been found. Returning NULL.")
+    return(invisible(NULL))
+  }
+
+
+  files_to_consider <- purrr::map_chr(
+    files_to_consider,
+    .format_url,
+    link = link
+  )
+
+  for (i in seq_along(files_to_consider)) {
+
+    tryCatch(
+      expr = {
+        download.file(files_to_consider[i],
+                      destfile = paste0(path, "/", basename(files_to_consider[i])),
+                      mode = "wb"
+        )
+
+      },
+
+      error = function(cond) {
+
+        if (!has_internet()) {
+
+          message(paste0("Please check your internet connection: ", cond))
+
+          return(NA)
+
+        } else if (grepl("current working directory", cond) ||
+                   grepl("HTTP error 404", cond)) {
+
+          message(paste0("The URL doesn't seem to be a valid one: ", link))
+
+          message(paste0("Here is the original error message: ", cond))
+
+          return(NA)
+
+        } else {
+
+          message(paste0("Undefined Error: ", cond))
+          return(NA)
+
+        }
+      }
+
+    )
+
+  }
+
+}
+
+
+#' Scrape and download PDF files from a Web Page
+#'
+#' @param link the link of the web page
+#' @param path the path where to save the PDF files. Defaults to the current directory
+#' @param askRobot logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.
+#'
+#' @return called for the side effect of downloading PDF files from a website
+#' @export
+#'
+
+pdfs_scrap <- function(
+  link,
+  path = getwd(),
+  askRobot = FALSE
+) {
+
+  .scrap_specific_file(
+    link = link,
+    path = path,
+    ext = "pdf",
+    askRobot = askRobot
+  )
+
+}
+
+
+#' Scrape and download Excel xlsx files from a Web Page
+#'
+#' @param link the link of the web page
+#' @param path the path where to save the Excel xlsx files. Defaults to the current directory
+#' @param askRobot logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.
+#'
+#' @return called for the side effect of downloading Excel xlsx files from a website
+#' @export
+#' @examples \dontrun{
+#'
+#' xlsx_scrap(
+#' link = "https://www.rieter.com/investor-relations/results-and-presentations/financial-statements"
+#' )
+#'
+#' }
+
+xlsx_scrap <- function(
+  link,
+  path = getwd(),
+  askRobot = FALSE
+) {
+
+  .scrap_specific_file(
+    link = link,
+    path = path,
+    ext = "xlsx",
+    askRobot = askRobot
+  )
+
+}
+
+#' Scrape and download Excel xls files from a Web Page
+#'
+#' @param link the link of the web page
+#' @param path the path where to save the Excel xls files. Defaults to the current directory
+#' @param askRobot logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.
+#'
+#' @return called for the side effect of downloading Excel xls files from a website
+#' @export
+#'
+
+xls_scrap <- function(
+  link,
+  path = getwd(),
+  askRobot = FALSE
+) {
+
+  .scrap_specific_file(
+    link = link,
+    path = path,
+    ext = "xls",
+    askRobot = askRobot
+  )
+
+}
+
+
+
+#' Scrape and download CSV files from a Web Page
+#'
+#' @param link the link of the web page
+#' @param path the path where to save the CSV files. Defaults to the current directory
+#' @param askRobot logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.
+#'
+#' @return called for the side effect of downloading CSV files from a website
+#' @export
+#'
+
+csv_scrap <- function(
+  link,
+  path = getwd(),
+  askRobot = FALSE
+) {
+
+  .scrap_specific_file(
+    link = link,
+    path = path,
+    ext = "csv",
+    askRobot = askRobot
+  )
+
+}
diff --git a/R/images_scrap.R b/R/images_scrap.R
index 14d4690..ae00cb3 100644
--- a/R/images_scrap.R
+++ b/R/images_scrap.R
@@ -96,6 +96,17 @@ images_scrap <- function(link,
                                       x = img_urls_unlist,
                                       ignore.case = FALSE)]
 
+  if (length(img_urls_f) == 0) {
+    message("No image has been found. Returning NULL")
+    return(invisible(NULL))
+  }
+
+  img_urls_f <- purrr::map_chr(
+    img_urls_f,
+    .format_url,
+    link = link
+  )
+
   for (i in seq_along(img_urls_f)) {
 
     download.file(img_urls_f[i],
diff --git a/R/paragraphs_scrap.R b/R/paragraphs_scrap.R
index 54ce216..cf1a2a1 100644
--- a/R/paragraphs_scrap.R
+++ b/R/paragraphs_scrap.R
@@ -79,6 +79,7 @@ paragraphs_scrap <- function(link,
     return(paste(data, collapse = " "))
 
   } else if (!is.null(contain) & collapse == FALSE) {
+
     return(data[grepl(contain,
                       data,
                       ignore.case = !case_sensitive)])
diff --git a/R/pdfs_scrap.R b/R/pdfs_scrap.R
deleted file mode 100644
index 14e3367..0000000
--- a/R/pdfs_scrap.R
+++ /dev/null
@@ -1,101 +0,0 @@
-
-.scrap_specific_file <- function(
-  ext,
-  link,
-  path
-) {
-
-  urls_containing_files <- weblink_scrap(
-    link,
-    contain = ext
-  )
-
-  files_to_consider <- urls_containing_files %>%
-    purrr::keep(function(x) {
-      tools::file_ext(x) == ext
-    })
-
-  for (i in seq_along(files_to_consider)) {
-
-    tryCatch(
-      expr = {
-        download.file(files_to_consider[i],
-                      destfile = paste0(path, "/", basename(files_to_consider[i])),
-                      mode = "wb"
-        )
-
-      },
-
-      error = function(cond) {
-
-        if (!has_internet()) {
-
-          message(paste0("Please check your internet connexion: ", cond))
-
-          return(NA)
-
-        } else if (grepl("current working directory", cond) ||
-                   grepl("HTTP error 404", cond)) {
-
-          message(paste0("The URL doesn't seem to be a valid one: ", link))
-
-          message(paste0("Here the original error message: ", cond))
-
-          return(NA)
-
-        } else {
-
-          message(paste0("Undefined Error: ", cond))
-          return(NA)
-
-        }
-      }
-
-    )
-
-  }
-
-}
-
-
-#' Scrape and download pdf files from a Web Page
-#'
-#' @param link the link of the web page
-#' @param pdfpath the path where to save the PDF files. Defaults to the current directory
-#' @param askRobot logical. Should the function ask the robots.txt if we're allowed or not to scrape the web page ? Default is FALSE.
-#'
-#' @return called for the side effect of downloading PDF files
-#' @export
-#'
-
-pdfs_scrap <- function(
-  link,
-  pdfpath = getwd(),
-  askRobot = FALSE
-) {
-
-
-  if (pdfpath != getwd() && !dir.exists(pdfpath)) {
-    stop("the path: ", pdfpath, " doesn't seem to exist !")
-  }
-
-  if (askRobot) {
-
-    if (paths_allowed(link) == TRUE) {
-      message(green("the robot.txt doesn't prohibit scraping this web page"))
-
-    } else {
-      message(bgRed(
-        "WARNING: the robot.txt doesn't allow scraping this web page"
-      ))
-
-    }
-  }
-
-  .scrap_specific_file(
-    link = link,
-    path = pdfpath,
-    ext = "pdf"
-  )
-
-}
diff --git a/man/csv_scrap.Rd b/man/csv_scrap.Rd
new file mode 100644
index 0000000..830b1ef
--- /dev/null
+++ b/man/csv_scrap.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/files_scrap.R
+\name{csv_scrap}
+\alias{csv_scrap}
+\title{Scrape and download CSV files from a Web Page}
+\usage{
+csv_scrap(link, path = getwd(), askRobot = FALSE)
+}
+\arguments{
+\item{link}{the link of the web page}
+
+\item{path}{the path where to save the CSV files. Defaults to the current directory}
+
+\item{askRobot}{logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.}
+}
+\value{
+called for the side effect of downloading CSV files from a website
+}
+\description{
+Scrape and download CSV files from a Web Page
+}
diff --git a/man/pdfs_scrap.Rd b/man/pdfs_scrap.Rd
new file mode 100644
index 0000000..d7087e2
--- /dev/null
+++ b/man/pdfs_scrap.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/files_scrap.R
+\name{pdfs_scrap}
+\alias{pdfs_scrap}
+\title{Scrape and download PDF files from a Web Page}
+\usage{
+pdfs_scrap(link, path = getwd(), askRobot = FALSE)
+}
+\arguments{
+\item{link}{the link of the web page}
+
+\item{path}{the path where to save the PDF files. Defaults to the current directory}
+
+\item{askRobot}{logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.}
+}
+\value{
+called for the side effect of downloading PDF files from a website
+}
+\description{
+Scrape and download PDF files from a Web Page
+}
diff --git a/man/xls_scrap.Rd b/man/xls_scrap.Rd
new file mode 100644
index 0000000..4f20ffd
--- /dev/null
+++ b/man/xls_scrap.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/files_scrap.R
+\name{xls_scrap}
+\alias{xls_scrap}
+\title{Scrape and download Excel xls files from a Web Page}
+\usage{
+xls_scrap(link, path = getwd(), askRobot = FALSE)
+}
+\arguments{
+\item{link}{the link of the web page}
+
+\item{path}{the path where to save the Excel xls files. Defaults to the current directory}
+
+\item{askRobot}{logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.}
+}
+\value{
+called for the side effect of downloading Excel xls files from a website
+}
+\description{
+Scrape and download Excel xls files from a Web Page
+}
diff --git a/man/xlsx_scrap.Rd b/man/xlsx_scrap.Rd
new file mode 100644
index 0000000..3dbb306
--- /dev/null
+++ b/man/xlsx_scrap.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/files_scrap.R
+\name{xlsx_scrap}
+\alias{xlsx_scrap}
+\title{Scrape and download Excel xlsx files from a Web Page}
+\usage{
+xlsx_scrap(link, path = getwd(), askRobot = FALSE)
+}
+\arguments{
+\item{link}{the link of the web page}
+
+\item{path}{the path where to save the Excel xlsx files. Defaults to the current directory}
+
+\item{askRobot}{logical. Should the function check the robots.txt file to verify that scraping the web page is allowed? Defaults to FALSE.}
+}
+\value{
+called for the side effect of downloading Excel xlsx files from a website
+}
+\description{
+Scrape and download Excel xlsx files from a Web Page
+}
+\examples{
+\dontrun{
+
+xlsx_scrap(
+link = "https://www.rieter.com/investor-relations/results-and-presentations/financial-statements"
+)
+
+}
+}
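
A minimal usage sketch of the new exports (not part of the patch; the package name
`ralger` and the example URL are assumptions used purely for illustration). All four
wrappers share the same signature and delegate to .scrap_specific_file(), so exercising
one of them exercises the shared logic:

    library(ralger)  # assumed package name

    # download every linked PDF into a scratch directory, checking robots.txt first
    dest <- file.path(tempdir(), "downloads")
    dir.create(dest, showWarnings = FALSE)

    pdfs_scrap(
      link = "https://www.example.com/reports",  # placeholder URL
      path = dest,
      askRobot = TRUE
    )

    # the Excel and CSV wrappers follow the same pattern
    xlsx_scrap(link = "https://www.example.com/reports", path = dest)
    csv_scrap(link = "https://www.example.com/reports", path = dest)

Relative file links are resolved against the page's scheme://domain by the internal
.format_url() helper, so pages that mix absolute and relative hrefs should download
without extra handling.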