Commit

fix
FranardoHuang committed Jul 19, 2024
1 parent 541a279 commit 42255ea
Showing 2 changed files with 18 additions and 3 deletions.
12 changes: 9 additions & 3 deletions rag/scraper/Scraper_master/scrape_header.py
@@ -9,7 +9,7 @@
from rag.scraper.Scraper_master.base_scraper import BaseScraper
import yaml

from utils import create_and_enter_dir, remove_consecutive_empty_lines, save_to_file,remove_slash_and_hash, cd_home,get_crawl_delay
from utils import create_and_enter_dir, delete_and_exit_dir, remove_consecutive_empty_lines, save_to_file,remove_slash_and_hash, cd_home,get_crawl_delay


content_tags_dict = {
@@ -72,14 +72,20 @@ def process_links_and_save(self, links, dir_name, delay, content_tags):
    filename = filename.split('.')[0]
    cur_dir = os.getcwd()
    create_and_enter_dir(filename)
    # if not os.path.exists(filename):
    # os.makedirs(filename, exist_ok=True)
    error = self.content_extract(filename, link, content_tags=content_tags)
    self.metadata_extract(filename, link)
    # print("error", error)
    if error == 1:
        print("error",filename)
        delete_and_exit_dir()
        continue
    self.metadata_extract(filename, link)
    os.chdir(cur_dir)
    time.sleep(delay)



def extract_unique_links(self, url, root, root_regex, root_filename, content_tags, delay=0, found_links=[]):
    print("extract_unique_links")
    """
@@ -208,14 +214,14 @@ def run_tasks(yaml_file):
    with open(yaml_file, 'r') as file:
        configuration=yaml.safe_load(file)
    root=configuration['root_folder']
    root=os.path.abspath(root)
    for task in configuration['tasks']:
        url=task['url']
        base_url = url.split('/')
        base_url = '/'.join(base_url[:3]) + '/'
        base_regex = rf"^{base_url}"
        root_folder = root + '/' + task['name']
        content_tags = match_tags(url)
        print(root_folder)

        scrapper = ScrapeHeader(url, base_url, base_regex, root_folder, content_tags)
        scrapper.scrape()
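
run_tasks reads root_folder and a list of tasks (each with a name and a url) from the YAML file, then derives the crawl scope and output directory per task. An illustrative walk-through with hypothetical values (the URL and folder names below are examples, not taken from the repository):

import os

# Values as they would come out of yaml.safe_load, e.g. from a config like:
#   root_folder: scraped_data
#   tasks:
#     - name: python_tutorial
#       url: https://docs.python.org/3/tutorial/index.html
url = "https://docs.python.org/3/tutorial/index.html"
root = os.path.abspath("scraped_data")            # root_folder resolved to an absolute path

base_url = '/'.join(url.split('/')[:3]) + '/'     # "https://docs.python.org/"
base_regex = rf"^{base_url}"                      # anchors link matching to that site prefix
root_folder = root + '/' + "python_tutorial"      # per-task output directory under the root
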
9 changes: 9 additions & 0 deletions rag/scraper/Scraper_master/utils.py
@@ -1,6 +1,7 @@
import os
import re
import urllib.robotparser as robotparser
import shutil

# Scrape general

@@ -36,6 +37,14 @@ def create_and_enter_dir(directory_name):
    os.makedirs(directory_name, exist_ok=True)
    os.chdir(directory_name)

def delete_and_exit_dir():
    """
    Deletes the current directory and files in it and exits it.
    """
    cur_dir = os.getcwd()
    os.chdir('..')
    shutil.rmtree(cur_dir)

def cd_home(url):
    """
    Returns the home directory of a given URL.
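
delete_and_exit_dir steps up to the parent before calling shutil.rmtree, so the process is never left sitting in a deleted working directory (and the removal also works on platforms that refuse to delete a directory that is currently in use). A small usage sketch with a hypothetical directory name, assuming it runs next to utils.py:

import os
from utils import create_and_enter_dir, delete_and_exit_dir

start = os.getcwd()
create_and_enter_dir("tmp_failed_page")   # hypothetical per-page output directory
open("partial.md", "w").close()           # simulate a partially written file
delete_and_exit_dir()                     # removes tmp_failed_page and everything in it

assert os.getcwd() == start               # back in the parent directory
assert not os.path.exists("tmp_failed_page")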
