From 42255eabd0ee5683237eac5e3a16c0f145e411b7 Mon Sep 17 00:00:00 2001
From: FranardoHuang
Date: Thu, 18 Jul 2024 18:44:38 -0700
Subject: [PATCH] scrape_header: clean up directories for failed extractions

Previously, when content_extract() reported an error, metadata_extract()
had already run, and the early `continue` skipped both the chdir back to
the parent directory and the crawl delay, leaving an orphaned per-link
folder behind. Extract metadata only on success, delete the failed
directory with a new delete_and_exit_dir() helper, keep honoring the
crawl delay on the error path, and resolve root_folder to an absolute
path so per-task folders do not depend on the current working directory.
Also drop a leftover debug print of root_folder.
---
 rag/scraper/Scraper_master/scrape_header.py | 14 +++++++++++---
 rag/scraper/Scraper_master/utils.py         | 11 +++++++++++
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/rag/scraper/Scraper_master/scrape_header.py b/rag/scraper/Scraper_master/scrape_header.py
index a7bee76b..07ca3c8b 100644
--- a/rag/scraper/Scraper_master/scrape_header.py
+++ b/rag/scraper/Scraper_master/scrape_header.py
@@ -9,7 +9,7 @@
 from rag.scraper.Scraper_master.base_scraper import BaseScraper
 import yaml
 
-from utils import create_and_enter_dir, remove_consecutive_empty_lines, save_to_file,remove_slash_and_hash, cd_home,get_crawl_delay
+from utils import create_and_enter_dir, delete_and_exit_dir, remove_consecutive_empty_lines, save_to_file, remove_slash_and_hash, cd_home, get_crawl_delay
 
 
 content_tags_dict = {
@@ -72,13 +72,19 @@ def process_links_and_save(self, links, dir_name, delay, content_tags):
         filename = filename.split('.')[0]
         cur_dir = os.getcwd()
         create_and_enter_dir(filename)
         error = self.content_extract(filename, link, content_tags=content_tags)
-        self.metadata_extract(filename, link)
         if error == 1:
+            # Content extraction failed: report it, delete the directory
+            # that was just created for this link, and honor the crawl
+            # delay before moving on to the next link.
+            print(f"content extraction failed for {filename}; removing its directory")
+            delete_and_exit_dir()
+            time.sleep(delay)
             continue
+        self.metadata_extract(filename, link)
         os.chdir(cur_dir)
         time.sleep(delay)
 
     def extract_unique_links(self, url, root, root_regex, root_filename, content_tags, delay=0, found_links=[]):
         print("extract_unique_links")
         """
@@ -208,6 +214,9 @@ def run_tasks(yaml_file):
     with open(yaml_file, 'r') as file:
         configuration=yaml.safe_load(file)
     root=configuration['root_folder']
+    # Resolve to an absolute path: the scraper os.chdir()s while saving
+    # files, so a relative root would drift between tasks.
+    root = os.path.abspath(root)
     for task in configuration['tasks']:
         url=task['url']
         base_url = url.split('/')
@@ -215,7 +224,6 @@ def run_tasks(yaml_file):
         base_regex = rf"^{base_url}"
         root_folder = root + '/' + task['name']
         content_tags = match_tags(url)
-        print(root_folder)
         scrapper = ScrapeHeader(url, base_url, base_regex, root_folder, content_tags)
         scrapper.scrape()
 
diff --git a/rag/scraper/Scraper_master/utils.py b/rag/scraper/Scraper_master/utils.py
index d72a5884..b5b27914 100644
--- a/rag/scraper/Scraper_master/utils.py
+++ b/rag/scraper/Scraper_master/utils.py
@@ -1,6 +1,7 @@
 import os
 import re
+import shutil
 import urllib.robotparser as robotparser
 
 
 # Scrape general
@@ -36,6 +37,16 @@ def create_and_enter_dir(directory_name):
     os.makedirs(directory_name, exist_ok=True)
     os.chdir(directory_name)
 
+def delete_and_exit_dir():
+    """
+    Delete the current working directory and its contents, then move into its parent.
+    """
+    cur_dir = os.getcwd()
+    # Step out before deleting: removing the process's working directory
+    # in place can fail on some platforms.
+    os.chdir('..')
+    shutil.rmtree(cur_dir)
+
 def cd_home(url):
     """
     Returns the home directory of a given URL.
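
Note: delete_and_exit_dir() relies on os.chdir() for cleanup, so an
exception raised inside content_extract() (rather than a returned error
code) would still strand the process in a half-written directory. A
minimal context-manager sketch of the same create/cleanup pattern,
assuming only the standard library; the name scraped_dir() and the usage
below are illustrative, not part of this patch:

    import os
    import shutil
    from contextlib import contextmanager

    @contextmanager
    def scraped_dir(name):
        """Create and enter `name`; on any exception, delete it and re-raise."""
        parent = os.getcwd()
        os.makedirs(name, exist_ok=True)
        os.chdir(name)
        try:
            yield
        except Exception:
            # Return to the parent before deleting, then re-raise so the
            # caller can decide whether to continue with the next link.
            os.chdir(parent)
            shutil.rmtree(os.path.join(parent, name))
            raise
        else:
            os.chdir(parent)

    # Usage inside the per-link loop:
    #     with scraped_dir(filename):
    #         ...extract content...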
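
Note: the root = os.path.abspath(root) change matters because the
scraper changes the working directory while saving files, so a relative
root_folder would be resolved against whatever directory the previous
task finished in. A small self-contained demonstration (the directory
names here are illustrative):

    import os
    import tempfile

    start = os.getcwd()
    os.makedirs("out", exist_ok=True)
    rel = "out"                       # resolved against the *current* cwd
    absolute = os.path.abspath(rel)   # pinned to the cwd at creation time

    os.chdir(tempfile.gettempdir())
    print(os.path.exists(rel))        # False, unless the temp dir has "out"
    print(os.path.exists(absolute))   # True: still the original directory
    os.chdir(start)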