Skip to content

Commit

Permalink
use base to_page instead of subclass
Browse files Browse the repository at this point in the history
  • Loading branch information
Catrunaround committed Jul 22, 2024
1 parent f2aef2a commit 1987188
Show file tree
Hide file tree
Showing 14 changed files with 104 additions and 94 deletions.
11 changes: 0 additions & 11 deletions output_tmp/expected_output/61a-sp24-mt2_sol_2_pages.md

This file was deleted.

Binary file not shown.
2 changes: 1 addition & 1 deletion rag/file_conversion_router/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ def convert_directory(input_dir: Union[str, Path], output_dir: Union[str, Path])
process_folder(input_dir, output_dir)

if __name__ == "__main__":
    # Ad-hoc manual runs against local sample folders, written with
    # backslash-separated paths. Every backslash is escaped explicitly: the
    # earlier "output_tmp\expected_output" relied on "\e" being an unknown
    # escape, which Python keeps verbatim but reports as an invalid escape
    # sequence. The resulting path strings are unchanged.
    convert_directory("output_tmp\\input", "output_tmp\\expected_output")
    convert_directory("tests\\test_rag\\data\\integrated_tests\\input_folder2_nested_folder_pdf+md\\mds\\mds_1", "tests\\test_rag\\data\\integrated_tests\\expected_output_folder2_nested_folder_pdf+md\\mds\\mds_1\\section-0-brief-python-refresher")
33 changes: 28 additions & 5 deletions rag/file_conversion_router/conversion/base_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from rag.file_conversion_router.utils.logger import conversion_logger, logger
from rag.file_conversion_router.utils.utils import calculate_hash, ensure_path
from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.classes.vidpage import VidPage


class BaseConverter(ABC):
Expand Down Expand Up @@ -158,11 +159,33 @@ def _perform_conversion(self, input_path: Path, output_folder: Path) -> None:
page.to_chunk()
page.chunks_to_pkl(str(pkl_output_path))

@abstractmethod
def _to_page(self, input_path: Path, output_path: Path) -> Page:
    """Turn ``input_path`` into a ``Page``; concrete converters supply the logic.

    Args:
        input_path: File to convert.
        output_path: Location the conversion may write to.

    Raises:
        NotImplementedError: Always, when the base implementation is reached.
    """
    message = "This method should be overridden by subclasses."
    raise NotImplementedError(message)

# @abstractmethod
# def _to_page(self, input_path: Path, output_path: Path) -> Page:
# """Convert the input file to Expected Page format. To be implemented by subclasses."""
# raise NotImplementedError("This method should be overridden by subclasses.")


def _to_page(self, input_path: Path, output_path: Path, file_type: str = "markdown") -> Page:
    """Convert ``input_path`` to Markdown and wrap the text in a page object.

    Args:
        input_path: Source file. A sibling ``<stem>_metadata.yaml`` is read
            for the page URL.
        output_path: Destination handed to ``_to_markdown``; its parent
            directory is created before the conversion runs.
        file_type: Kept for interface compatibility only. The value is not
            used: the file type is always taken from the extension of
            ``input_path`` (this matches the previous behaviour, where the
            argument was overwritten immediately).

    Returns:
        A ``VidPage`` holding the text and timestamps when the extension is
        ``mp4``; otherwise a ``Page`` holding the text only.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    stem = input_path.stem
    # Use a distinct local instead of clobbering the ``file_type`` parameter.
    detected_type = input_path.suffix.lstrip('.')

    md_path = self._to_markdown(input_path, output_path)
    # NOTE(review): no explicit encoding, so the locale default is used; keep
    # this in sync with however ``_to_markdown`` writes the file.
    with open(md_path, "r") as input_file:
        content_text = input_file.read()

    metadata_path = input_path.with_name(f"{stem}_metadata.yaml")
    # Presumably ``_read_metadata`` returns a dict; fall back to an empty one
    # if it yields None (e.g. an empty YAML file) so the lookup cannot crash.
    metadata_content = self._read_metadata(metadata_path) or {}
    url = metadata_content.get("URL")

    if detected_type == "mp4":
        # Assumes ``self.paragraphs`` was populated while producing the
        # Markdown and that item 1 of each entry is its timestamp - TODO confirm.
        timestamp = [entry[1] for entry in self.paragraphs]
        content = {"text": content_text, "timestamp": timestamp}
        return VidPage(pagename=stem, content=content, filetype=detected_type, page_url=url)

    content = {"text": content_text}
    return Page(pagename=stem, content=content, filetype=detected_type, page_url=url)

@abstractmethod
def _to_markdown(self, input_path: Path, output_path: Path) -> None:
"""Convert the input file to Expected Markdown format. To be implemented by subclasses."""
Expand Down
37 changes: 18 additions & 19 deletions rag/file_conversion_router/conversion/md_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,21 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
content = input_file.read()
output_file.write(content)
return output_path
def _to_page(self, input_path: Path, output_path: Path) -> Page:
    """Build a ``Page`` from the Markdown produced for ``input_path``.

    Args:
        input_path: Source file; ``<stem>_metadata.yaml`` next to it is read
            for the page URL.
        output_path: Target passed to ``_to_markdown``; its parent directory
            is created if it does not exist.

    Returns:
        A ``Page`` named after the input stem, carrying the Markdown text.

    Raises:
        Exception: Any error from ``_to_markdown`` is logged and re-raised.
    """
    try:
        markdown_path = self._to_markdown(input_path, output_path)
    except Exception as exc:
        self._logger.error(f"An error occurred during markdown conversion: {str(exc)}")
        raise

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The reported file type is the extension of the generated Markdown file.
    extension = markdown_path.suffix.lstrip('.')
    with open(markdown_path, "r") as handle:
        body = handle.read()

    page_name = input_path.stem
    sidecar = input_path.with_name(f"{page_name}_metadata.yaml")
    metadata = self._read_metadata(sidecar)
    return Page(
        pagename=page_name,
        content={'text': body},
        filetype=extension,
        page_url=metadata.get("URL"),
    )
# def _to_page(self, input_path: Path, output_path: Path) -> Page:
# """Perform Markdown to Page conversion."""
# try:
# self._to_markdown(input_path, output_path)
# except Exception as e:
# self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
# raise

# output_path.parent.mkdir(parents=True, exist_ok=True)

# filetype = input_path.suffix.lstrip('.')
# with open(input_path, "r") as input_file:
# text = input_file.read()

# metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
# metadata_content = self._read_metadata(metadata_path)
# url = metadata_content.get("URL")
# return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
29 changes: 14 additions & 15 deletions rag/file_conversion_router/conversion/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,22 +179,21 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
# self._logger.error(f"An error occurred {str(e)})")
# raise

def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Convert ``input_path`` to Markdown and wrap the resulting text in a ``Page``.

Errors raised by ``_to_markdown`` are logged and re-raised unchanged.
The page is named after the input file's stem; its URL comes from the
``URL`` entry of the sibling ``<stem>_metadata.yaml`` file.
"""
try:
md_file = self._to_markdown(input_path, output_path,)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise
# def _to_page(self, input_path: Path, output_path: Path) -> Page:
# """Perform Markdown to Page conversion."""
# md_file = self._to_markdown(input_path, output_path,)
# # except Exception as e:
# # self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
# # raise

# NOTE(review): the parent directory is created only after _to_markdown has
# already run - confirm the conversion does not need it to exist earlier.
output_path.parent.mkdir(parents=True, exist_ok=True)
# output_path.parent.mkdir(parents = True, exist_ok = True)

# filetype is the extension of the generated Markdown file, not of the input.
filetype = md_file.suffix.lstrip('.')
with open(md_file, "r") as input_file:
text = input_file.read()
# filetype = md_file.suffix.lstrip('.')
# with open(md_file, "r") as input_file:
# text = input_file.read()


# Metadata lives next to the input file as "<stem>_metadata.yaml".
metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
# presumably metadata_content is a dict, so a missing "URL" yields None - verify
url = metadata_content.get("URL")
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
# metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
# metadata_content = self._read_metadata(metadata_path)
# url = metadata_content.get("URL")
# return Page(pagename = input_path.stem, content={'text': text}, filetype = filetype, page_url=url)
43 changes: 20 additions & 23 deletions rag/file_conversion_router/conversion/rst_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,26 +26,23 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
output_file.write(content.text)
return output_path

def _to_page(self, input_path: Path, output_path: Path) -> Page:
    """Run the Markdown conversion for ``input_path`` and return it as a ``Page``.

    Args:
        input_path: Source file; its ``<stem>_metadata.yaml`` sibling supplies
            the page URL.
        output_path: Target given to ``_to_markdown``; the parent directory is
            created when missing.

    Returns:
        A ``Page`` whose name is the input stem and whose content is the
        Markdown text.

    Raises:
        Exception: Failures in ``_to_markdown`` are logged, then re-raised.
    """
    try:
        converted = self._to_markdown(input_path, output_path)
    except Exception as err:
        self._logger.error(f"An error occurred during markdown conversion: {str(err)}")
        raise

    target_dir = output_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    # File type is taken from the converted file's extension.
    kind = converted.suffix.lstrip('.')
    with open(converted, "r") as source:
        markdown_text = source.read()

    metadata_file = input_path.with_name(f"{input_path.stem}_metadata.yaml")
    page_url = self._read_metadata(metadata_file).get("URL")
    page_content = {'text': markdown_text}
    return Page(pagename=input_path.stem, content=page_content, filetype=kind, page_url=page_url)


# converter = RstConverter()
# converter._to_markdown(Path("/home/bot/roarai/rag/scraper/Scraper_master/Moveit/index.rst"), Path("/home/bot/roarai/rag/scraper/Scraper_master/"))
# def _to_page(self, input_path: Path, output_path: Path) -> Page:
# """Perform Markdown to Page conversion."""

# output_path.parent.mkdir(parents = True, exist_ok = True)

# parent = input_path.parent
# self._to_markdown(input_path, output_path)
# stem = input_path.stem
# filetype = input_path.suffix.split(".")[1]

# with open(input_path, "r") as input_file:
# text = input_file.read()
# metadata = parent / (stem+"_metadata.yaml")

# with open(metadata, "r") as metadata_file:
# metadata_content = yaml.safe_load(metadata_file)

# url = metadata_content["URL"]
# page = Page(pagename=stem, content={'text': text}, filetype=filetype, page_url=url)
# return page
40 changes: 21 additions & 19 deletions rag/file_conversion_router/conversion/video_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,22 +167,24 @@ def _to_markdown(self, input_path, output_path):
md_file.write(markdown_content)
return md_path

def _to_page(self, input_path: Path, output_path: Path) -> Page:
    """Produce a ``Page`` from the Markdown generated for ``input_path``.

    Args:
        input_path: Source file; the URL is looked up in the neighbouring
            ``<stem>_metadata.yaml``.
        output_path: Target forwarded to ``_to_markdown``; its parent
            directory is created if needed.

    Returns:
        A ``Page`` named after the input stem with the Markdown text as content.

    Raises:
        Exception: Anything raised by ``_to_markdown`` is logged and re-raised.
    """
    try:
        md_result = self._to_markdown(input_path, output_path)
    except Exception as failure:
        self._logger.error(f"An error occurred during markdown conversion: {str(failure)}")
        raise

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # The extension of the generated file (not of the input) is reported.
    result_type = md_result.suffix.lstrip('.')
    with open(md_result, "r") as reader:
        transcript = reader.read()

    name = input_path.stem
    meta = self._read_metadata(input_path.with_name(f"{name}_metadata.yaml"))
    link = meta.get("URL")
    return Page(pagename=name, content={'text': transcript}, filetype=result_type, page_url=link)
# def _to_page(self, input_path: Path, output_path: Path) -> Page:
# """Perform mp4 to Page conversion."""

# output_path.parent.mkdir(parents = True, exist_ok = True)

# parent = input_path.parent
# stem = input_path.stem
# filetype = input_path.suffix.split(".")[1]
# md_path = self._to_markdown(input_path, output_path)

# with open(md_path, "r") as md_file:
# md_content = md_file.read()
# metadata = parent / (stem+"_metadata.yml")

# with open(metadata, "r") as metadata_file:
# metadata_content = yaml.safe_load(metadata_file)
# url = metadata_content["URL"]

# timestamp = [i[1] for i in self.paragraphs]
# page = VidPage(pagename = stem,content={"text": md_content, "timestamp": timestamp}, filetype = filetype, page_url = url)
# return page
3 changes: 2 additions & 1 deletion rag/file_conversion_router/services/directory_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def process_folder(input_dir: Union[str, Path], output_dir: Union[str, Path]) ->
if input_file_path.suffix in valid_extensions and input_file_path.is_file():
# Construct the output subdirectory and file path
output_subdir = output_dir / input_file_path.relative_to(input_dir).parent
# output_subdir.mkdir(parents=True, exist_ok=True)
output_subdir.mkdir(parents=True, exist_ok=True)
output_file_path = output_subdir / input_file_path.stem
# output_file_path = output_subdir

# Instantiate a new converter object for each file based on the file extension
converter_class = converter_mapping.get(input_file_path.suffix)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 1987188

Please sign in to comment.