Merge pull request #14 from feddelegrand7/restyled/ralger_modified

Restyle Modification of Ralger package to suit rvest 1.0.2

feddelegrand7 authored Jun 18, 2022
2 parents ce182f4 + a52ee28 commit 38d3671
Showing 5 changed files with 335 additions and 277 deletions.
12 changes: 6 additions & 6 deletions DESCRIPTION
@@ -4,17 +4,17 @@ Title: Easy Web Scraping
 Version: 2.2.4
 Authors@R: c(
     person("Mohamed El Fodil", "Ihaddaden", email = "[email protected]", role = c("aut", "cre")),
-    person("Ezekiel", "Ogundepo", role = c("ctb")),
-    person("Romain", "François", email = "[email protected]", role = c("ctb")))
+    person("Ezekiel", "Ogundepo", role = c("ctb")),
+    person("Romain", "François", email = "[email protected]", role = c("ctb")))
 Maintainer: Mohamed El Fodil Ihaddaden <[email protected]>
-Description: The goal of 'ralger' is to facilitate web scraping in R. 
+Description: The goal of 'ralger' is to facilitate web scraping in R.
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 URL: https://github.com/feddelegrand7/ralger
 BugReports: https://github.com/feddelegrand7/ralger/issues
 VignetteBuilder: knitr
-Imports: 
+Imports:
 rvest,
 xml2,
 tidyr,
@@ -24,9 +24,9 @@ Imports:
 crayon,
 curl,
 stringi
-Suggests: 
+Suggests:
 knitr,
 testthat,
 rmarkdown,
 covr
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.0
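A side note, not stated in the diff itself: apart from the `RoxygenNote` bump, the DESCRIPTION hunks above differ only in trailing whitespace, which is the signature of an automated restyle. Restyled.io pull requests like this one are typically generated by running the tidyverse styler over the package, roughly:

```r
# Reformat every .R file in the package to the tidyverse style guide;
# this rewrites indentation and whitespace without changing behaviour.
# install.packages("styler")
library(styler)
style_pkg()
```

Running `style_pkg()` locally before pushing would produce the same whitespace-only hunks and avoid a follow-up restyle commit.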
75 changes: 36 additions & 39 deletions R/table_scrap.R
@@ -8,7 +8,6 @@
 #' @param choose an integer indicating which table to scrape
 #' @param header do you want the first line to be the leader (default to TRUE)
 #' @param askRobot logical. Should the function ask the robots.txt if we're allowed or not to scrape the web page ? Default is FALSE.
-#' @param fill logical. Should be set to TRUE when the table has an inconsistent number of columns.
 #' @return a data frame object.
 #' @examples \donttest{
 #' # Extracting premier ligue 2019/2020 top scorers
@@ -30,60 +29,58 @@
 table_scrap <- function(link,
                         choose = 1,
                         header = TRUE,
-                        fill = FALSE,
                         askRobot = FALSE) {
 
 
-if(missing(link)) {
-stop("'link' is a mandatory parameter")
-}
+  if (missing(link)) {
+    stop("'link' is a mandatory parameter")
+  }
 
 
-if(!is.character(link)) {
-stop("'link' parameter must be provided as a character string")
-}
+  if (!is.character(link)) {
+    stop("'link' parameter must be provided as a character string")
+  }
 
 
-if(!is.numeric(choose)){
-stop(paste0("the 'choose' parameter must be provided as numeric not as "),
-typeof(choose))
-}
+  if (!is.numeric(choose)) {
+    stop(paste0("the 'choose' parameter must be provided as numeric not as "),
+         typeof(choose))
+  }
 
 
-############################## Ask robot part ###################################################
+  ############################## Ask robot part ###################################################
 
-if (askRobot) {
-if (paths_allowed(link) == TRUE) {
-message(green("the robot.txt doesn't prohibit scraping this web page"))
+  if (askRobot) {
+    if (paths_allowed(link) == TRUE) {
+      message(green("the robot.txt doesn't prohibit scraping this web page"))
 
-} else {
-message(bgRed(
-"WARNING: the robot.txt doesn't allow scraping this web page"
-))
-
-}
+    } else {
+      message(bgRed(
+        "WARNING: the robot.txt doesn't allow scraping this web page"
+      ))
+
+    }
+  }
 
-#################################################################################################
+  #################################################################################################
 
-tryCatch(
+  tryCatch(
 
-expr = {
+    expr = {
 
-table <- link %>%
-read_html() %>%
-html_table(header, fill = fill)
+      table <- link %>%
+        read_html() %>%
+        html_table(header)
 
-chosen_table <- table[[choose]]
+      chosen_table <- table[[choose]]
 
-return(chosen_table)
+      return(chosen_table)
 
 
-},
+    },
 
-error = function(cond){
+    error = function(cond) {
 
 if(!has_internet()){
 
@@ -93,18 +90,18 @@ error = function(cond){
 
 } else if (grepl("current working directory", cond) || grepl("HTTP error 404", cond)) {
 
-message(paste0("The URL doesn't seem to be a valid one: ", link))
+        message(paste0("The URL doesn't seem to be a valid one: ", link))
 
-message(paste0("Here the original error message: ", cond))
+        message(paste0("Here the original error message: ", cond))
 
-return(NA)
+        return(NA)
 
 
 } else if(grepl("subscript out of bounds", cond)) {
 
 message(
-"Are you sure that your web page contains more than one HTML table ?"
-)
+          "Are you sure that your web page contains more than one HTML table ?"
+        )
 
 message(paste0("Here the original error message: ", cond))
 
@@ -117,6 +114,6 @@ error = function(cond){
 
 return(NA)
 
 }
-}
+  }
 
-)}
+  )}
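For context (this is not part of the commit, only an explanation of it): the hunks above drop `table_scrap()`'s `fill` argument because rvest 1.0.0 rewrote `html_table()` to always pad ragged rows, deprecating `fill`. A minimal sketch of the call before and after, using an illustrative Wikipedia URL:

```r
library(rvest)

# illustrative URL, not one used by the package
link <- "https://en.wikipedia.org/wiki/List_of_European_countries_by_area"

# rvest < 1.0.0: ragged tables needed an explicit fill = TRUE
# tables <- read_html(link) %>% html_table(header = TRUE, fill = TRUE)

# rvest >= 1.0.0: rows are always padded, so fill is no longer passed
tables <- read_html(link) %>%
  html_table(header = TRUE)

# same selection logic as table_scrap(choose = 1)
chosen_table <- tables[[1]]
```

This is why the diff can delete the parameter without changing the function's behaviour on well-formed tables.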
34 changes: 17 additions & 17 deletions README.Rmd
@@ -51,20 +51,21 @@ devtools::install_github("feddelegrand7/ralger")
 ```
 ## `scrap()`
 
-This is an example which shows how to extract [top ranked universities' names](http://www.shanghairanking.com/ARWU2020.html) according to the ShanghaiRanking Consultancy:
+This is an example which shows how to extract [top ranked universities' names](http://www.shanghairanking.com/rankings/arwu/2021) according to the ShanghaiRanking Consultancy:
 
 
 ```{r example}
 library(ralger)
-my_link <- "http://www.shanghairanking.com/ARWU2020.html"
+my_link <- "http://www.shanghairanking.com/rankings/arwu/2021"
-my_node <- "#UniversityRanking a" # The element ID , I recommend SelectorGadget if you're not familiar with CSS selectors
+my_node <- "a span" # The element ID , I recommend SelectorGadget if you're not familiar with CSS selectors
-best_uni <- scrap(link = my_link, node = my_node)
+clean <- TRUE # Should the function clean the extracted vector or not ? Default is FALSE
-head(best_uni, 10)
+best_uni <- scrap(link = my_link, node = my_node, clean = clean)
+head(best_uni, 10)
 ```

@@ -88,27 +89,27 @@ head(scrap(links, node), 10) # printing the first 10 speakers

 ## `attribute_scrap()`
 
-If you need to scrape some elements' attributes, you can use the `attribute_scrap()` function as in the following example: 
+If you need to scrape some elements' attributes, you can use the `attribute_scrap()` function as in the following example:
 
 
 ```{r}
 # Getting all classes' names from the anchor elements
-# from the ropensci website 
+# from the ropensci website
-attributes <- attribute_scrap(link = "https://ropensci.org/", 
+attributes <- attribute_scrap(link = "https://ropensci.org/",
 node = "a", # the a tag
 attr = "class" # getting the class attribute
-) 
+)
 head(attributes, 10) # NA values are a tags without a class attribute
 ```
 
-Another example, let's we want to get all javascript dependencies within the same web page: 
+Another example, let's we want to get all javascript dependencies within the same web page:
 
 ```{r}
-js_depend <- attribute_scrap(link = "https://ropensci.org/", 
-node = "script", 
+js_depend <- attribute_scrap(link = "https://ropensci.org/",
+node = "script",
 attr = "src")
 
 js_depend
@@ -282,21 +283,21 @@ images_scrap(link = "https://rstudio.com/",
 ```
 
 
-# Accessibility related functions 
+# Accessibility related functions
 
 
-## `images_noalt_scrap()` 
+## `images_noalt_scrap()`
 
 
-`images_noalt_scrap()` can be used to get the images within a specific web page that don't have an `alt` attribute which can be annoying for people using a screen reader: 
+`images_noalt_scrap()` can be used to get the images within a specific web page that don't have an `alt` attribute which can be annoying for people using a screen reader:
 
 
 ```{r}
 images_noalt_scrap(link = "https://www.r-consortium.org/")
 ```
-If no images without `alt` attributes are found, the function returns `NULL` and displays an indication message: 
+If no images without `alt` attributes are found, the function returns `NULL` and displays an indication message:
 
 
 ```{r}
@@ -310,4 +311,3 @@ images_noalt_scrap(link = "https://webaim.org/techniques/forms/controls")
 ## Code of Conduct
 
 Please note that the ralger project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms.
-