diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index b292239c..ae08ce37 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -70,8 +70,7 @@ def resolve_ocr_options() -> OcrOptions:
 
 
 def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
-    """Split document paths into a dict of lists based on their file extension.
-    """
+    """Split document paths into a dict of lists based on their file extension."""
     document_dict = defaultdict(list)
     for path in document_paths:
         filetype = path.suffix
@@ -83,7 +82,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
     return dict(document_dict)
 
 
 
-
 # class DocumentChunker:
 #     """A factory chunker class that instantiates the applicable chunker
@@ -226,8 +224,7 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
 #             return chunk_markdowns(self.document_contents, chunk_size)
 
 
-class DocumentChunker(): # pylint: disable=too-many-instance-attributes
-
+class DocumentChunker:  # pylint: disable=too-many-instance-attributes
     # def __new__(
     #     cls,
     #     leaf_node,
@@ -267,13 +264,12 @@ def __init__(
         self.tokenizer = self.create_tokenizer(tokenizer_model_name)
 
     def _init_docling_converter(self):
-        """Initialize docling converter with filetype-specific configurations
-        """
+        """Initialize docling converter with filetype-specific configurations"""
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
-        from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
         from docling.document_converter import DocumentConverter, PdfFormatOption
+        from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
         if self.docling_model_path is None:
             logger.info("Docling models not found on disk, downloading models...")
@@ -285,7 +281,7 @@ def _init_docling_converter(self):
             artifacts_path=self.docling_model_path,
             do_ocr=False,
         )
-        
+
         ocr_options = resolve_ocr_options()
         if ocr_options is not None:
             pipeline_options.do_ocr = True
@@ -402,7 +398,9 @@ def create_tokenizer(model_name: str):
         # Third Party
         from transformers import AutoTokenizer
 
-        model_path = Path(model_name) # TODO expect a path from the DocumentChunker constructor
+        model_path = Path(
+            model_name
+        )  # TODO expect a path from the DocumentChunker constructor
         error_info_message = (
             "Please run `ilab model download {download_args}` and try again"
         )
@@ -583,7 +581,9 @@ def build_chunks_from_docling_json(
                 )
                 book_text = self.get_table(json_book, book_element["$ref"])
             elif book_element["prov"]:
-                current_book_page_number = book_element["prov"][0]["page"] # TODO export to function to handle empty ["prov"]
+                current_book_page_number = book_element["prov"][0][
+                    "page"
+                ]  # TODO export to function to handle empty ["prov"]
                 book_text = book_element["text"]
             else:
                 current_book_page_number = None
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 5ac466e6..8a5e7dbc 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -111,12 +111,12 @@ def _get_taxonomy(repo="taxonomy"):
 
 def _string_contains_html(s: str) -> bool:
     """Detect HTML tags in a string.
-    
+
     We use this to catch markdown files that may contain html elements since
     docling does not support this."""
     # Define a regex to detect HTML tags
     html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>")
-    
+
     # Check for HTML tags in the content
     return bool(html_tag_pattern.search(s))
 
@@ -173,11 +173,13 @@ def _get_documents(
                 with open(file_path, "r", encoding="utf-8") as file:
                     content = file.read()
                     if _string_contains_html(content):
-                        raise ValueError(f"Provided markdown file {file_path} contains"
-                                         " HTML, which is currently unsupported. Please"
-                                         " format your markdown documents without the"
-                                         " use of HTML or use a different document"
-                                         " filetype.")
+                        raise ValueError(
+                            f"Provided markdown file {file_path} contains"
+                            " HTML, which is currently unsupported. Please"
+                            " format your markdown documents without the"
+                            " use of HTML or use a different document"
+                            " filetype."
+                        )
                     file_contents.append(content)
                     filepaths.append(Path(file_path))
                     logger.info(
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
index ff1079d5..42db1ac3 100644
--- a/tests/test_chunkers.py
+++ b/tests/test_chunkers.py
@@ -11,10 +11,7 @@
 import pytest
 
 # First Party
-from instructlab.sdg.utils.chunkers import (
-    DocumentChunker,
-    resolve_ocr_options,
-)
+from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options
 
 # Local
 from .testdata import testdata
@@ -32,7 +29,9 @@ def tokenizer_model_name():
     return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab")
 
 
-def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name):
+def test_init_document_chunker_unsupported_filetype(
+    documents_dir, tokenizer_model_name
+):
     """Test that the DocumentChunker factory class fails when provided an unsupported document"""
     document_paths = [documents_dir / "document.jpg"]
     with pytest.raises(ValueError):
diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py
index 4c296d62..9b8f2518 100644
--- a/tests/test_generate_data.py
+++ b/tests/test_generate_data.py
@@ -314,7 +314,9 @@ def test_generate(self):
             client=MagicMock(),
             logger=mocked_logger,
             model_family="granite",
-            model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
+            model_name=os.path.join(
+                TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
+            ),
             num_instructions_to_generate=10,
             taxonomy=self.test_taxonomy.root,
             taxonomy_base=TEST_TAXONOMY_BASE,
@@ -391,7 +393,9 @@ def test_generate(self):
             client=MagicMock(),
             logger=mocked_logger,
             model_family="granite",
-            model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
+            model_name=os.path.join(
+                TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
+            ),
             num_instructions_to_generate=10,
             taxonomy=self.test_taxonomy.root,
             taxonomy_base=TEST_TAXONOMY_BASE,
@@ -489,7 +493,9 @@ def test_generate(self):
             client=MagicMock(),
             logger=mocked_logger,
             model_family="granite",
-            model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
+            model_name=os.path.join(
+                TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
+            ),
             num_instructions_to_generate=10,
             taxonomy=self.test_taxonomy.root,
             taxonomy_base=TEST_TAXONOMY_BASE,
diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py
index 32114fcc..0b697bd7 100644
--- a/tests/test_taxonomy.py
+++ b/tests/test_taxonomy.py
@@ -86,13 +86,13 @@ def test_read_taxonomy_leaf_nodes(
             ):
                 seed_example_exists = True
         assert seed_example_exists is True
-    
+
     @pytest.mark.parametrize(
         "s, contains_html",
         [
            ("hello, world!", False),
            ("hello,
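
Reviewer note: for anyone who wants to exercise the HTML guard that this diff reformats without running the full test suite, here is a minimal standalone sketch. The regex is the one from _string_contains_html in taxonomy.py above; the sample strings are illustrative only and are not taken from the truncated parametrize case at the end of test_taxonomy.py.

# Standalone sketch (not part of the diff): mirrors the check performed by
# _string_contains_html in src/instructlab/sdg/utils/taxonomy.py.
import re

# Regex copied from the hunk above; matches an opening or closing HTML tag.
HTML_TAG_PATTERN = re.compile(r"<\/?[a-zA-Z][\s\S]*?>")


def string_contains_html(s: str) -> bool:
    """Return True if the string appears to contain an HTML tag."""
    return bool(HTML_TAG_PATTERN.search(s))


if __name__ == "__main__":
    # Illustrative inputs, not the repository's test data.
    assert string_contains_html("hello, world!") is False
    assert string_contains_html("hello, <b>world</b>!") is True
    print("HTML guard behaves as expected")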