From dbb4f93aff00fa5151faa1766e7ea13867ab197d Mon Sep 17 00:00:00 2001
From: terrianne-zhang
Date: Mon, 8 Jul 2024 17:08:07 -0700
Subject: [PATCH 1/4] updated readme

---
 rag/scraper/README.md | 159 ++++++++++++++++++++++++++----------------
 1 file changed, 100 insertions(+), 59 deletions(-)

diff --git a/rag/scraper/README.md b/rag/scraper/README.md
index 3acf8d74..4ed2764b 100644
--- a/rag/scraper/README.md
+++ b/rag/scraper/README.md
@@ -1,63 +1,104 @@
-# scrapper
-
-## Folders for scraper
-- Scrapper usually takes a root url or pdf and scrapes the entire website or pdf.
-- In every scraping folder, there will be a code called `scrape.py` that is the place you will use to scrape your documents.
-  - `scrape.py` will scrape the documents from the root url recursively until the entire website is scraped.
-- [Scrape_header](Scrape_header/): For general websites
-- [Scrape_md](Scrape_md/): For websites that uses markdown
-- [Scrape_rst](Scrape_rst/): For websites that uses rst
-- [Scrape_pdf](Scrape_pdf/): For pdfs
-- [Scrape_vid](Scrape_vid/): For videos
+# scraper
+
+## Supported Formats
+The following scrapers can be found under the [Scraper_master](Scraper_master/) folder:
+- [scrape_header.py](Scraper_master/scrape_header.py): For general websites
+- [scrape_md.py](Scraper_master/scrape_md.py): For websites that use markdown
+- [scrape_rst.py](Scraper_master/scrape_rst.py): For websites that use rst
+- [scrape_pdf.py](Scraper_master/scrape_pdf.py): For PDFs
+- [scrape_vid.py](Scraper_master/scrape_vid.py): For videos
+
+## Scraper Structure and Usage Instructions
+All scrapers follow the base structure outlined in [base_scraper.py](Scraper_master/base_scraper.py), and common utilities can be found in [utils.py](Scraper_master/utils.py). These are some common features of the scrapers:
+- All links are compared to a root or base URL to ensure the webpage remains in a relevant domain (e.g., for a root URL of berkeley.edu, berkeley.edu/about will be scraped, but youtube.com will not).
+- Each conversion creates a metadata file in .yaml format that contains the URL of the page.
+
+## General websites: scrape_header.py
+Beginning from a root URL, the scraper runs a depth-first search. On the current page, it collects all unique links it finds into a set and processes each one, repeating this process for every link in the set. The scraper can handle both HTML and PDF content: if it encounters a webpage, it converts the HTML to Markdown, and if it encounters a PDF, it downloads the PDF file.
+
+### Usage Instructions:
+1. Update `url`, `root_regex`, and `root_filename` under the main function accordingly. `url` is the webpage where the scraping will begin, `root_regex` is the string that all further URLs will be compared to before being added to the traversal set, and `root_filename` will be the name of the root file.
+2. Next, update `content_tags`. `content_tags` is currently the result of the `match_tags` function, which takes a URL and maps it to the correct content tags specified by the dictionary `content_tags_dict`. The dictionary currently contains several examples of content tags for certain URLs, but the screenshots below illustrate how you can determine content tags for a website you need to scrape.
+
+First, open the developer tools on your page of interest (macOS: Command + Option + C, Windows: Control + Shift + C, or right-click --> inspect).
+Your page should look something like this:
+
+![My Image](images/page_inspect.png)
+
+Then, you can either hover over the section of the page that you want to scrape, or click open the dropdowns in the inspector panel until you find the correct area. It should look something like this:
+
+![My Image](images/content_inspect.png)
+
+Then, you will need to copy the element, in this case `<div id="page" lang="en" dir="ltr">` and put it into the following format:
+`[('div', {'id': 'page', 'lang': 'en', 'dir': 'ltr'})]`. If you have multiple elements that you want to scrape, you can add another tuple to the list in the same format. You can then add this to `content_tags_dict`, or directly change the `content_tags` variable.
+
+3. In the directory you want your root file to be, run `python3 scrape_header.py`
+
+## Markdown websites: scrape_md.py
+The markdown scraper uses the "nav" section of the mkdocs.yml file in a GitHub repo to run DFS. Based on the "nav" section structure and the base URL, it recursively fetches and saves Markdown files.
+
+### Usage Instructions:
+1. Update `root_filename` to the name you want for your root folder.
+2. Update `site_url` to the URL of the website.
+3. Update `github_url` to the URL of the GitHub repo, specifically the link to the mkdocs.yml file.
+4. In the directory you want your root file to be, run `python3 scrape_md.py`
+
+## RST websites: scrape_rst.py
+The RST scraper uses the toctree from a GitHub repo to run DFS. It fetches the content of the current file, extracts links from the toctree, and then recursively processes each link.
+
+### Usage Instructions:
+1. Update `filename` to the name you want for your root folder.
+2. Update `doc_url` to the URL of the website.
+3. Update `github_url` to the URL of the GitHub repo, specifically the link that contains the toctree (commonly found in `index.rst`).
+4. In the directory you want your root file to be, run `python3 scrape_rst.py`
+
+## PDFs: scrape_pdf.py
+Downloads a PDF file given the link.
+
+### Usage Instructions:
+1. Update `pdf_url` to the PDF URL that you want to download.
+2. Update the "name" string under the call to `content_extract` to what you want to name the PDF.
+3. In the directory you want your PDF file to be, run `python3 scrape_pdf.py`
+
+## Videos: scrape_vid.py
+Given a playlist URL, the scraper will retrieve all of the playlist videos' URLs and download them to the specified directory.
+
+### Usage Instructions:
+1. Update `base_path` to the name you want for your root folder.
+2. Update `playlist_url` to the URL of the YouTube playlist you want to scrape.
+3. In the directory you want your root folder to be, run `python3 scrape_vid.py`
+
+Note: Scraping is also supported for Ed; however, it works differently from the scrapers above, so please follow the separate instructions for scraping the Ed forum.
 
 ## End Results
-- After running the scrapper, you will get a folder with this following tree structure that will be used in `rag/embedding_crate.py`.
+- After running the scraper, you will get a folder with the following tree structure.
 ```
- (rag) bot@botPC:~/roarai/rag/scraper/Scrape_md/carla$ tree .
- .
- ├── CARLA Ecosystem - │   ├── ANSYS - │   │   ├── ecosys_ansys.md - │   │   ├── ecosys_ansys.pkl - │   │   ├── ecosys_ansys_segment.txt - │   │   └── ecosys_ansys_tree.txt - │   ├── AWS - │   │   ├── tuto_G_rllib_integration.md - │   │   ├── tuto_G_rllib_integration.pkl - │   │   ├── tuto_G_rllib_integration_segment.txt - │   │   └── tuto_G_rllib_integration_tree.txt - │   ├── CarSIM - │   │   ├── tuto_G_carsim_integration.md - │   │   ├── tuto_G_carsim_integration.pkl - │   │   ├── tuto_G_carsim_integration_segment.txt - │   │   └── tuto_G_carsim_integration_tree.txt + ├── MonashDataFluency + │   ├── A (brief) Python refresher + │   │   ├── section-0-brief-python-refresher.md + │   │   ├── section-0-brief-python-refresher.md_metadata.yml + │   ├── API based scraping + │   │   ├── section-3-API-based-scraping.md + │   │   ├── section-3-API-based-scraping.md_metadata.yaml + │   ├── Getting started + │   │   ├── index.md + │   │   ├── index.md_metadata.yaml + │   ├── HTML based scraping + │   │   ├── section-2-HTML-based-scraping.md + │   │   ├── section-2-HTML-based-scraping.md_metadata.yaml + │   ├── Introduction to Web scraping + │   │   ├── section-1-intro-to-web-scraping.md + │   │   ├── section-1-intro-to-web-scraping.md_metadata.yaml + │   ├── Legal and Ethical Considerations + │   │   ├── section-5-legal-and-ethical-considerations.md + │   │   ├── section-5-legal-and-ethical-considerations.md_metadata.yaml + │   ├── References + │   │   ├── section-7-references.md + │   │   ├── section-7-references.md_metadata.yaml + │   ├── Wrangling and Analysis + │   │   ├── section-4-wrangling-and-analysis.md + │   │   ├── section-4-wrangling-and-analysis.md_metadata.yaml ``` - This is an example of the result of running the entire code. It forms a tree structure of the entire website from the root webpage `Carla`. - - Each webpage will have an individual folder containing, `.md`, `_segment.txt`, and `_md_tree.txt`. - - `.md`: This is the entire content of the webpage in markdown format. - - `_segment.txt`: This file contains all the headers and it's contents. - - `_tree.txt`: This file contains the tree structure and the segments of the tree structure of the webpage. - - Here is what a tree structure looks like in a webpage. - ``` - (Table of Contents) - Quick start package installation (h1) - --Before you begin (h2) - ----Windows (h3) - ----Linux (h3) - --CARLA installation (h2) - ----A. Debian CARLA installation (h3) - ----B. Package installation (h3) - --Import additional assets (h2) - --Install client library (h2) - ----CARLA versions prior to 0.9.12 (h3) - ----CARLA 0.9.12+ (h3) - --Running CARLA (h2) - ------Command-line options (h4) - --Updating CARLA (h2) - --Follow-up (h2) - ``` - - Each segment would be the path from the root to the leaf node. - - For example, the path to the node Linux would be `(h1) Quick Start Package Installation -> (h2) Before You Begin -> (h3) Linux`. - - The purpose of grouping the documents in this tree structure is to allow the embedding model to understand the relationship between the headers. If the embedding model is only given the content of `(h3) Linux`, it would not know what it is related to nor how we get to the point `(h3) Linux`. By adding the segments from previous headers, it becomes complete information that explains: this information is about "installation", then continues with "steps you need to do before you begin", and how you begin in "Linux". - -Now that you already have your documents ready, it's time to convert them into embeddings. 
+ + This is an example of the result of running the entire code. It forms a tree structure of the entire website from the root webpage `MonashDataFluency`. Each subfolder will have some content extracted as well as a metadata file in the .yaml format. + +Now that you already have your documents ready, it's time to convert them into embeddings. \ No newline at end of file From 47ab43bda5f3863bb83fc87ee6ec693085a9ec18 Mon Sep 17 00:00:00 2001 From: terrianne-zhang Date: Thu, 18 Jul 2024 14:45:47 -0700 Subject: [PATCH 2/4] fixed video scraper, pdf scraper, updates to general scraper --- rag/scraper/Scraper_master/scrape_header.py | 12 ++++++------ rag/scraper/Scraper_master/scrape_pdf.py | 6 +++--- rag/scraper/Scraper_master/scrape_vid.py | 3 ++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py index e62015fa..53f7a78c 100644 --- a/rag/scraper/Scraper_master/scrape_header.py +++ b/rag/scraper/Scraper_master/scrape_header.py @@ -168,7 +168,7 @@ def download_pdf(self, url, filename): - filename(str): Name of the PDF file Returns: 1 on failure, file path on success. """ - file_path = os.path.join(os.getcwd(), filename) + file_path = os.path.join(os.getcwd(), filename + ".pdf") response = requests.get(url, headers=self.http_header) if response.status_code == 200: with open(file_path, 'wb') as f: @@ -181,7 +181,7 @@ def download_pdf(self, url, filename): # Override def content_extract(self, filename, url, **kwargs): - if url[-4] == ".pdf": + if ".pdf" in url: pdf_result = self.download_pdf(url, filename) return pdf_result else: @@ -201,10 +201,10 @@ def scrape(self): self.extract_unique_links(self.url,self.root,self.root_regex,self.root_filename,self.content_tags, self.delay) if __name__ == "__main__": - url = "https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html" - root_regex = r"^https://docs.opencv.org/4.x\/\w+\/\w+\/tutorial_py" - root = "https://docs.opencv.org/4.x/d6/d00/" - root_filename = "opencv" + url = "https://cs61a.org/" + root_regex = r"https://cs61a.org/" + root = "https://cs61a.org/" + root_filename = "61A_Website" content_tags = match_tags(url) scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags) diff --git a/rag/scraper/Scraper_master/scrape_pdf.py b/rag/scraper/Scraper_master/scrape_pdf.py index 20ee2074..d25b44e9 100644 --- a/rag/scraper/Scraper_master/scrape_pdf.py +++ b/rag/scraper/Scraper_master/scrape_pdf.py @@ -18,13 +18,13 @@ def content_extract(self, filename, url, **kwargs): if response.status_code == 200: with open(filename, 'wb') as f: f.write(response.content) - print(f"Download completed successfully and saved as {self.root_filename}") + print(f"Download completed successfully and saved as {filename}") else: print(f"Failed to download the PDF. 
Status code: {response.status_code}") # Example usage: if __name__ == "__main__": - pdf_url = "http://example.com/path/to/your/pdf/file.pdf" # Replace with the actual PDF URL + pdf_url = "https://ucb-ee106.github.io/106b-sp23site/assets/hw/Homework_5__Grasping.pdf" # Replace with the actual PDF URL pdf_saver = ScrapePdf(pdf_url) # Specify the filename to save as - pdf_saver.content_extract("name", pdf_url) # Start the download process + pdf_saver.content_extract("HW5", pdf_url) # Start the download process diff --git a/rag/scraper/Scraper_master/scrape_vid.py b/rag/scraper/Scraper_master/scrape_vid.py index 9ab79f8e..bd9748cc 100644 --- a/rag/scraper/Scraper_master/scrape_vid.py +++ b/rag/scraper/Scraper_master/scrape_vid.py @@ -1,7 +1,8 @@ from rag.scraper.Scraper_master.base_scraper import BaseScraper -from pytube import Playlist, YouTube +from pytubefix import Playlist, YouTube import os from utils import save_to_file + class ScrapeVid(BaseScraper): def __init__(self, url, root_filename): super().__init__(url) From b5bc1e275fbc6102de4aaeaf914049a1aa27a5ca Mon Sep 17 00:00:00 2001 From: terrianne-zhang Date: Thu, 18 Jul 2024 14:48:33 -0700 Subject: [PATCH 3/4] comment changes --- rag/scraper/Scraper_master/scrape_header.py | 8 ++++---- rag/scraper/Scraper_master/scrape_pdf.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py index 53f7a78c..2fb613d0 100644 --- a/rag/scraper/Scraper_master/scrape_header.py +++ b/rag/scraper/Scraper_master/scrape_header.py @@ -201,10 +201,10 @@ def scrape(self): self.extract_unique_links(self.url,self.root,self.root_regex,self.root_filename,self.content_tags, self.delay) if __name__ == "__main__": - url = "https://cs61a.org/" - root_regex = r"https://cs61a.org/" - root = "https://cs61a.org/" - root_filename = "61A_Website" + url = "https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html" + root_regex = r"^https://docs.opencv.org/4.x\/\w+\/\w+\/tutorial_py" + root = "https://docs.opencv.org/4.x/d6/d00/" + root_filename = "opencv" content_tags = match_tags(url) scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags) diff --git a/rag/scraper/Scraper_master/scrape_pdf.py b/rag/scraper/Scraper_master/scrape_pdf.py index d25b44e9..a41a6bb7 100644 --- a/rag/scraper/Scraper_master/scrape_pdf.py +++ b/rag/scraper/Scraper_master/scrape_pdf.py @@ -25,6 +25,6 @@ def content_extract(self, filename, url, **kwargs): # Example usage: if __name__ == "__main__": - pdf_url = "https://ucb-ee106.github.io/106b-sp23site/assets/hw/Homework_5__Grasping.pdf" # Replace with the actual PDF URL - pdf_saver = ScrapePdf(pdf_url) # Specify the filename to save as - pdf_saver.content_extract("HW5", pdf_url) # Start the download process + pdf_url = "pdflink" # Replace with the actual PDF URL + pdf_saver = ScrapePdf(pdf_url) + pdf_saver.content_extract("filename", pdf_url) # Change filename to save as and start the download process From 9ef2afef72c1ca755a8a1235877dd1ec9b0f451d Mon Sep 17 00:00:00 2001 From: terrianne-zhang Date: Fri, 19 Jul 2024 13:28:53 -0700 Subject: [PATCH 4/4] changes to requirements.txt and scrape_header URL parsing --- rag/requirements.txt | 2 +- rag/scraper/Scraper_master/scrape_header.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/rag/requirements.txt b/rag/requirements.txt index f56ba65a..befd7877 100644 --- a/rag/requirements.txt +++ b/rag/requirements.txt @@ -12,7 +12,7 @@ 
openai_whisper==20231117 packaging==24.0 pytest==8.1.1 python-dotenv==1.0.1 -pytube==15.0.0 +pytubefix==6.3.3 PyYAML==6.0.1 Requests==2.31.0 rst_to_myst==0.4.0 diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py index 07ca3c8b..39d50f28 100644 --- a/rag/scraper/Scraper_master/scrape_header.py +++ b/rag/scraper/Scraper_master/scrape_header.py @@ -4,7 +4,7 @@ import time import re from termcolor import colored -from urllib.parse import urljoin +from urllib.parse import urljoin, unquote from markdownify import markdownify as md from rag.scraper.Scraper_master.base_scraper import BaseScraper import yaml @@ -70,6 +70,7 @@ def process_links_and_save(self, links, dir_name, delay, content_tags): link = link[:-1] filename = link.split('/')[-1] filename = filename.split('.')[0] + filename = unquote(filename).replace(' ', '_') cur_dir = os.getcwd() create_and_enter_dir(filename) # if not os.path.exists(filename): @@ -227,13 +228,13 @@ def run_tasks(yaml_file): scrapper.scrape() if __name__ == "__main__": - # url = "https://guide.berkeley.edu/courses/" - # root_regex = r"^https://classes.berkeley.edu/" - # root = "https://classes.berkeley.edu/" - # root_filename = "courses" - # # + # url = "https://wiki.ros.org/ROS/Tutorials/" + # root_regex = r"^https://wiki.ros.org/ROS/Tutorials/" + # root = "https://wiki.ros.org/ROS/Tutorials/" + # root_filename = "ROS" + # content_tags = match_tags(url) - # + # scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags) # scrapper.scrape() run_tasks('106b_task.yaml')
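
The final hunk above leaves `run_tasks('106b_task.yaml')` as the entry point for scrape_header.py, but neither the body of `run_tasks` nor the task file itself appears in these patches. The sketch below is only one way such a task file might be structured and consumed, assuming a top-level `tasks` list whose entries mirror the `ScrapeHeader(url, root, root_regex, root_filename, content_tags)` constructor used throughout the patches; the key names, the `run_tasks_from_string` helper, and the importability of `match_tags` from `scrape_header` are assumptions, not the project's actual implementation. The example task values are copied from the opencv configuration shown in the `__main__` block above.

```
# Hypothetical sketch only: the YAML schema and helper name below are assumptions,
# not the real run_tasks() / 106b_task.yaml from the repository.
import yaml

from rag.scraper.Scraper_master.scrape_header import ScrapeHeader, match_tags

EXAMPLE_TASKS_YAML = r"""
tasks:
  - url: 'https://docs.opencv.org/4.x/d6/d00/tutorial_py_root.html'
    root: 'https://docs.opencv.org/4.x/d6/d00/'
    root_regex: '^https://docs.opencv.org/4.x/\w+/\w+/tutorial_py'
    root_filename: 'opencv'
"""


def run_tasks_from_string(yaml_text):
    """Build and run one ScrapeHeader per task entry (illustrative pattern only)."""
    config = yaml.safe_load(yaml_text)
    for task in config.get("tasks", []):
        # Reuse the same tag lookup that scrape_header.py's __main__ block calls.
        content_tags = match_tags(task["url"])
        scraper = ScrapeHeader(
            task["url"],
            task["root"],
            task["root_regex"],
            task["root_filename"],
            content_tags,
        )
        scraper.scrape()


if __name__ == "__main__":
    run_tasks_from_string(EXAMPLE_TASKS_YAML)
```

If the real `run_tasks` expects a different schema, only the field names in the YAML and the corresponding dictionary lookups would need to change.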