diff --git a/rag/requirements.txt b/rag/requirements.txt
index f56ba65a..befd7877 100644
--- a/rag/requirements.txt
+++ b/rag/requirements.txt
@@ -12,7 +12,7 @@ openai_whisper==20231117
 packaging==24.0
 pytest==8.1.1
 python-dotenv==1.0.1
-pytube==15.0.0
+pytubefix==6.3.3
 PyYAML==6.0.1
 Requests==2.31.0
 rst_to_myst==0.4.0
diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py
index 07ca3c8b..39d50f28 100644
--- a/rag/scraper/Scraper_master/scrape_header.py
+++ b/rag/scraper/Scraper_master/scrape_header.py
@@ -4,7 +4,7 @@
 import time
 import re
 from termcolor import colored
-from urllib.parse import urljoin
+from urllib.parse import urljoin, unquote
 from markdownify import markdownify as md
 from rag.scraper.Scraper_master.base_scraper import BaseScraper
 import yaml
@@ -70,6 +70,7 @@ def process_links_and_save(self, links, dir_name, delay, content_tags):
                 link = link[:-1]
             filename = link.split('/')[-1]
             filename = filename.split('.')[0]
+            filename = unquote(filename).replace(' ', '_')
             cur_dir = os.getcwd()
             create_and_enter_dir(filename)
             # if not os.path.exists(filename):
@@ -227,13 +228,13 @@ def run_tasks(yaml_file):
         scrapper.scrape()
 
 if __name__ == "__main__":
-    # url = "https://guide.berkeley.edu/courses/"
-    # root_regex = r"^https://classes.berkeley.edu/"
-    # root = "https://classes.berkeley.edu/"
-    # root_filename = "courses"
-    # #
+    # url = "https://wiki.ros.org/ROS/Tutorials/"
+    # root_regex = r"^https://wiki.ros.org/ROS/Tutorials/"
+    # root = "https://wiki.ros.org/ROS/Tutorials/"
+    # root_filename = "ROS"
+
     content_tags = match_tags(url)
-    #
+
     # scrapper = ScrapeHeader(url, root, root_regex, root_filename, content_tags)
     # scrapper.scrape()
     run_tasks('106b_task.yaml')
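
For reference, a minimal sketch (not part of the patch) of what the added `unquote` step does to a scraped link before the directory is created; the URL below is a hypothetical percent-encoded example, not taken from the PR:

```python
from urllib.parse import unquote

# Hypothetical link whose last segment is percent-encoded (contains spaces).
link = "https://wiki.ros.org/ROS/Tutorials/Installing%20and%20Configuring%20ROS"

filename = link.split('/')[-1]      # 'Installing%20and%20Configuring%20ROS'
filename = filename.split('.')[0]   # no file extension, so unchanged
filename = unquote(filename).replace(' ', '_')
print(filename)                     # 'Installing_and_Configuring_ROS'
```

Decoding with `unquote` and swapping spaces for underscores keeps the names passed to `create_and_enter_dir` free of percent-escapes and embedded spaces.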