Skip to content

Commit

Permalink
fix bug on to page
Browse files Browse the repository at this point in the history
  • Loading branch information
Catrunaround committed Jul 19, 2024
1 parent 010fdff commit bc2727f
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 41 deletions.
2 changes: 2 additions & 0 deletions rag/file_conversion_router/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ def convert_directory(input_dir: Union[str, Path], output_dir: Union[str, Path])
"""
process_folder(input_dir, output_dir)

if __name__ == "__main__":
    # Ad-hoc manual run against the integrated-test fixtures.
    # Forward slashes are used because they resolve on both Windows and POSIX;
    # the previous "\\" separators only formed a valid path on Windows.
    convert_directory(
        "tests/test_rag/data/integrated_tests/input_folder2_nested_folder_pdf+md",
        "tests/test_rag/data/integrated_tests/output_test",
    )
10 changes: 5 additions & 5 deletions rag/file_conversion_router/conversion/md_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,19 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
# NOTE(review): this is a diff hunk whose +/- markers and indentation were lost.
# The file header reports 5 additions & 5 deletions, so each changed line appears
# as the removed line immediately followed by its replacement.
try:
# Changed pair: the value returned by _to_markdown is now kept in md_file_path.
self._to_markdown(input_path, output_path)
md_file_path = self._to_markdown(input_path, output_path)
except Exception as e:
# Log the failure, then re-raise so the caller still sees the original exception.
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise

output_path.parent.mkdir(parents=True, exist_ok=True)

# Changed pairs: the file type and the text are now taken from md_file_path
# instead of input_path.
filetype = input_path.suffix.lstrip('.')
with open(input_path, "r") as input_file:
filetype = md_file_path.suffix.lstrip('.')
with open(md_file_path, "r") as input_file:
text = input_file.read()

# Changed pair: the "<stem>_metadata.yaml" sidecar is now looked up next to
# md_file_path rather than next to input_path.
metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_path = md_file_path.with_name(f"{md_file_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
# Changed pair: the page name now comes from md_file_path.stem.
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
return Page(pagename=md_file_path.stem, content={'text': text}, filetype=filetype, page_url=url)

12 changes: 6 additions & 6 deletions rag/file_conversion_router/conversion/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,21 +182,21 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
def _to_page(self, input_path: Path, output_path: Path) -> Page:
    """Convert ``input_path`` to Markdown and wrap the result in a ``Page``.

    Args:
        input_path: Source file handed to ``self._to_markdown``.
        output_path: Destination handed to ``self._to_markdown``; its parent
            directory is created if it does not exist.

    Returns:
        A ``Page`` named after the stem of the path returned by
        ``_to_markdown``, carrying that file's text, its suffix (without the
        dot) as ``filetype``, and the ``"URL"`` entry of the sidecar
        ``<stem>_metadata.yaml`` as ``page_url``.

    Raises:
        Exception: Whatever ``_to_markdown`` raises is logged and re-raised.
    """
    try:
        md_file_path = self._to_markdown(input_path, output_path)
    except Exception as e:
        self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
        raise

    output_path.parent.mkdir(parents=True, exist_ok=True)

    filetype = md_file_path.suffix.lstrip('.')
    with open(md_file_path, "r") as input_file:
        # Read through the open handle: pathlib.Path has no ``.read()`` method,
        # so ``md_file_path.read()`` raised AttributeError on every call.
        text = input_file.read()

    # The metadata sidecar is looked up next to the file returned by _to_markdown.
    metadata_path = md_file_path.with_name(f"{md_file_path.stem}_metadata.yaml")
    metadata_content = self._read_metadata(metadata_path)
    url = metadata_content.get("URL")
    print("PDF",url)
    return Page(pagename=md_file_path.stem, content={'text': text}, filetype=filetype, page_url=url)

28 changes: 15 additions & 13 deletions rag/file_conversion_router/conversion/rst_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from rst_to_myst import rst_to_myst
import yaml


class RstConverter(BaseConverter):
def __init__(self):
super().__init__()
Expand All @@ -29,20 +28,23 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:

def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
# NOTE(review): this is a diff hunk whose +/- markers and indentation were lost;
# removed lines and their replacements are interleaved below. The old/new
# attribution in these comments is inferred from the file header's
# "15 additions & 13 deletions" count — confirm against the real diff.
# Appears to be added: the _to_markdown call is wrapped so a failure is logged
# before being re-raised, and its return value is kept in md_file_path.
try:
md_file_path = self._to_markdown(input_path, output_path)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise

output_path.parent.mkdir(parents=True, exist_ok=True)
# Appears to be removed: the variant that derived everything from input_path
# and discarded the result of _to_markdown.
parent = input_path.parent
self._to_markdown(input_path, output_path)
stem = input_path.stem
filetype = input_path.suffix.split(".")[1]
with open(input_path, "r") as input_file:

# Appears to be added: the file type and text are taken from md_file_path.
filetype = md_file_path.suffix.lstrip('.')
with open(md_file_path, "r") as input_file:
text = input_file.read()
# Appears to be removed: the metadata file was opened and parsed here with
# yaml.safe_load, and the URL was read with metadata_content["URL"].
metadata = parent / (stem+"_metadata.yaml")
with open(metadata, "r") as metadata_file:
metadata_content = yaml.safe_load(metadata_file)
url = metadata_content["URL"]
page = Page(pagename=stem, content={'text': text}, filetype=filetype, page_url=url)
return page
#

# Appears to be added: metadata is read through self._read_metadata, and the URL
# is fetched with .get("URL") instead of indexing.
metadata_path = md_file_path.with_name(f"{md_file_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=md_file_path.stem, content={'text': text}, filetype=filetype, page_url=url)


# converter = RstConverter()
# converter._to_markdown(Path("/home/bot/roarai/rag/scraper/Scraper_master/Moveit/index.rst"), Path("/home/bot/roarai/rag/scraper/Scraper_master/"))
30 changes: 15 additions & 15 deletions rag/file_conversion_router/conversion/video_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,21 +168,21 @@ def _to_markdown(self, input_path, output_path):
return md_path

def _to_page(self, input_path: Path, output_path: Path) -> Page:
# NOTE(review): this is a diff hunk whose +/- markers and indentation were lost;
# removed lines and their replacements are interleaved below. The old/new
# attribution in these comments is inferred from the file header's
# "15 additions & 15 deletions" count — confirm against the real diff.
# The two docstrings below appear to be the removed one followed by its replacement.
"""Perform mp4 to Page conversion."""
"""Perform Markdown to Page conversion."""
# Appears to be added: the _to_markdown call is wrapped so a failure is logged
# before being re-raised.
try:
md_file_path = self._to_markdown(input_path, output_path)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise

output_path.parent.mkdir(parents=True, exist_ok=True)
# Appears to be removed: this variant read "<stem>_metadata.yml" next to the
# input, indexed metadata_content["URL"], and returned a VidPage whose content
# carried a "timestamp" list built from self.paragraphs.
parent = input_path.parent
stem = input_path.stem
filetype = input_path.suffix.split(".")[1]
md_path = self._to_markdown(input_path, output_path)
with open(md_path, "r") as md_file:
md_content = md_file.read()
metadata = parent / (stem+"_metadata.yml")
with open(metadata, "r") as metadata_file:
metadata_content = yaml.safe_load(metadata_file)
url = metadata_content["URL"]
timestamp = [i[1] for i in self.paragraphs]
page = VidPage(pagename=stem,content={"text": md_content, "timestamp": timestamp}, filetype=filetype, page_url=url)

return page

# Appears to be added: a plain Page with text only is returned, and the sidecar
# is now looked up as "<stem>_metadata.yaml" next to md_file_path.
# NOTE(review): the replacement no longer produces a VidPage or any timestamps,
# and the metadata extension changed from ".yml" to ".yaml" — confirm both are
# intended and that existing video metadata files are still found.
filetype = md_file_path.suffix.lstrip('.')
with open(md_file_path, "r") as input_file:
text = input_file.read()

metadata_path = md_file_path.with_name(f"{md_file_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=md_file_path.stem, content={'text': text}, filetype=filetype, page_url=url)

3 changes: 1 addition & 2 deletions rag/file_conversion_router/services/directory_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ def process_folder(input_dir: Union[str, Path], output_dir: Union[str, Path]) ->
# Construct the output subdirectory and file path
output_subdir = output_dir / input_file_path.relative_to(input_dir).parent
output_subdir.mkdir(parents=True, exist_ok=True)
# output_file_path = output_subdir / input_file_path.stem
output_file_path = output_subdir
output_file_path = output_subdir / input_file_path.stem

# Instantiate a new converter object for each file based on the file extension
converter_class = converter_mapping.get(input_file_path.suffix)
Expand Down

0 comments on commit bc2727f

Please sign in to comment.