Commit

fix
FranardoHuang committed Jul 19, 2024
1 parent 541a279 commit 42255ea
Showing 2 changed files with 18 additions and 3 deletions.
12 changes: 9 additions & 3 deletions rag/scraper/Scraper_master/scrape_header.py
@@ -9,7 +9,7 @@
from rag.scraper.Scraper_master.base_scraper import BaseScraper
import yaml

from utils import create_and_enter_dir, remove_consecutive_empty_lines, save_to_file,remove_slash_and_hash, cd_home,get_crawl_delay
from utils import create_and_enter_dir, delete_and_exit_dir, remove_consecutive_empty_lines, save_to_file,remove_slash_and_hash, cd_home,get_crawl_delay


content_tags_dict = {
@@ -72,14 +72,20 @@ def process_links_and_save(self, links, dir_name, delay, content_tags):
    filename = filename.split('.')[0]
    cur_dir = os.getcwd()
    create_and_enter_dir(filename)
    # if not os.path.exists(filename):
    # os.makedirs(filename, exist_ok=True)
    error = self.content_extract(filename, link, content_tags=content_tags)
    self.metadata_extract(filename, link)
    # print("error", error)
    if error == 1:
        print("error",filename)
        delete_and_exit_dir()
        continue
    self.metadata_extract(filename, link)
    os.chdir(cur_dir)
    time.sleep(delay)



def extract_unique_links(self, url, root, root_regex, root_filename, content_tags, delay=0, found_links=[]):
    print("extract_unique_links")
    """
@@ -208,14 +214,14 @@ def run_tasks(yaml_file):
    with open(yaml_file, 'r') as file:
        configuration=yaml.safe_load(file)
    root=configuration['root_folder']
    root=os.path.abspath(root)
    for task in configuration['tasks']:
        url=task['url']
        base_url = url.split('/')
        base_url = '/'.join(base_url[:3]) + '/'
        base_regex = rf"^{base_url}"
        root_folder = root + '/' + task['name']
        content_tags = match_tags(url)
        print(root_folder)

        scrapper = ScrapeHeader(url, base_url, base_regex, root_folder, content_tags)
        scrapper.scrape()
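
run_tasks reads root_folder and a list of tasks (each with a name and a url) from the YAML file, then derives the crawl scope and output directory per task. An illustrative walk-through with hypothetical values (the URL and folder names below are examples, not taken from the repository):

import os

# Values as they would come out of yaml.safe_load, e.g. from a config like:
#   root_folder: scraped_data
#   tasks:
#     - name: python_tutorial
#       url: https://docs.python.org/3/tutorial/index.html
url = "https://docs.python.org/3/tutorial/index.html"
root = os.path.abspath("scraped_data")            # root_folder resolved to an absolute path

base_url = '/'.join(url.split('/')[:3]) + '/'     # "https://docs.python.org/"
base_regex = rf"^{base_url}"                      # anchors link matching to that site prefix
root_folder = root + '/' + "python_tutorial"      # per-task output directory under the root
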
9 changes: 9 additions & 0 deletions rag/scraper/Scraper_master/utils.py
@@ -1,6 +1,7 @@
import os
import re
import urllib.robotparser as robotparser
import shutil

# Scrape general

@@ -36,6 +37,14 @@ def create_and_enter_dir(directory_name):
    os.makedirs(directory_name, exist_ok=True)
    os.chdir(directory_name)

def delete_and_exit_dir():
    """
    Deletes the current directory and files in it and exits it.
    """
    cur_dir = os.getcwd()
    os.chdir('..')
    shutil.rmtree(cur_dir)

def cd_home(url):
    """
    Returns the home directory of a given URL.
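
delete_and_exit_dir steps up to the parent before calling shutil.rmtree, so the process is never left sitting in a deleted working directory (and the removal also works on platforms that refuse to delete a directory that is currently in use). A small usage sketch with a hypothetical directory name, assuming it runs next to utils.py:

import os
from utils import create_and_enter_dir, delete_and_exit_dir

start = os.getcwd()
create_and_enter_dir("tmp_failed_page")   # hypothetical per-page output directory
open("partial.md", "w").close()           # simulate a partially written file
delete_and_exit_dir()                     # removes tmp_failed_page and everything in it

assert os.getcwd() == start               # back in the parent directory
assert not os.path.exists("tmp_failed_page")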
