Update for ruff and pylint issues

Signed-off-by: Aakanksha Duggal <[email protected]>
instructlab · Jan 6, 2025 · 309fd11 · 309fd11
1 parent 1722161
commit 309fd11
Show file tree

Hide file tree

Showing 5 changed files with 37 additions and 32 deletions.
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
@@ -1,9 +1,7 @@
 # Standard
-from abc import ABC, abstractmethod
 from collections import defaultdict
-from enum import Enum
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional
 import json
 import logging
 import re
@@ -70,8 +68,7 @@ def resolve_ocr_options() -> OcrOptions:
 
 
 def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
-    """Split document paths into a dict of lists based on their file extension.
-    """
+    """Split document paths into a dict of lists based on their file extension."""
     document_dict = defaultdict(list)
     for path in document_paths:
         filetype = path.suffix
@@ -83,7 +80,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
     return dict(document_dict)
 
 
-
 # class DocumentChunker:
 #     """A factory chunker class that instantiates the applicable chunker
 
@@ -226,8 +222,7 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
 #         return chunk_markdowns(self.document_contents, chunk_size)
 
 
-class DocumentChunker():  # pylint: disable=too-many-instance-attributes
-
+class DocumentChunker:  # pylint: disable=too-many-instance-attributes
     # def __new__(
     #     cls,
     #     leaf_node,
@@ -267,13 +262,12 @@ def __init__(
         self.tokenizer = self.create_tokenizer(tokenizer_model_name)
 
     def _init_docling_converter(self):
-        """Initialize docling converter with filetype-specific configurations
-        """
+        """Initialize docling converter with filetype-specific configurations"""
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
-        from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
         from docling.document_converter import DocumentConverter, PdfFormatOption
+        from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
         if self.docling_model_path is None:
             logger.info("Docling models not found on disk, downloading models...")
@@ -285,7 +279,7 @@ def _init_docling_converter(self):
             artifacts_path=self.docling_model_path,
             do_ocr=False,
         )
-        
+
         ocr_options = resolve_ocr_options()
         if ocr_options is not None:
             pipeline_options.do_ocr = True
@@ -402,7 +396,9 @@ def create_tokenizer(model_name: str):
         # Third Party
         from transformers import AutoTokenizer
 
-        model_path = Path(model_name)  # TODO expect a path from the DocumentChunker constructor
+        model_path = Path(
+            model_name
+        )  # TODO expect a path from the DocumentChunker constructor
         error_info_message = (
             "Please run `ilab model download {download_args}` and try again"
         )
@@ -583,7 +579,9 @@ def build_chunks_from_docling_json(
                     )
                     book_text = self.get_table(json_book, book_element["$ref"])
                 elif book_element["prov"]:
-                    current_book_page_number = book_element["prov"][0]["page"]  # TODO export to function to handle empty ["prov"]
+                    current_book_page_number = book_element["prov"][0][
+                        "page"
+                    ]  # TODO export to function to handle empty ["prov"]
                     book_text = book_element["text"]
                 else:
                     current_book_page_number = None

diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
@@ -111,12 +111,12 @@ def _get_taxonomy(repo="taxonomy"):
 
 def _string_contains_html(s: str) -> bool:
     """Detect HTML tags in a string.
-    
+
     We use this to catch markdown files that may contain html elements since
     docling does not support this."""
     # Define a regex to detect HTML tags
     html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>")
-    
+
     # Check for HTML tags in the content
     return bool(html_tag_pattern.search(s))
 
@@ -173,11 +173,13 @@ def _get_documents(
                             with open(file_path, "r", encoding="utf-8") as file:
                                 content = file.read()
                                 if _string_contains_html(content):
-                                    raise ValueError(f"Provided markdown file {file_path} contains"
-                                                     " HTML, which is currently unsupported. Please"
-                                                     " format your markdown documents without the"
-                                                     " use of HTML or use a different document"
-                                                     " filetype.")
+                                    raise ValueError(
+                                        f"Provided markdown file {file_path} contains"
+                                        " HTML, which is currently unsupported. Please"
+                                        " format your markdown documents without the"
+                                        " use of HTML or use a different document"
+                                        " filetype."
+                                    )
                                 file_contents.append(content)
                                 filepaths.append(Path(file_path))
                                 logger.info(
@@ -429,7 +431,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset:
 
 def _knowledge_leaf_node_to_samples(
     leaf_node,
-    taxonomy_path,
+    taxonomy_path,  # pylint: disable=unused-argument
     server_ctx_size,
     chunk_word_count,
     document_output_dir,

diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
@@ -11,10 +11,7 @@
 import pytest
 
 # First Party
-from instructlab.sdg.utils.chunkers import (
-    DocumentChunker,
-    resolve_ocr_options,
-)
+from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options
 
 # Local
 from .testdata import testdata
@@ -32,7 +29,9 @@ def tokenizer_model_name():
     return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab")
 
 
-def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name):
+def test_init_document_chunker_unsupported_filetype(
+    documents_dir, tokenizer_model_name
+):
     """Test that the DocumentChunker factory class fails when provided an unsupported document"""
     document_paths = [documents_dir / "document.jpg"]
     with pytest.raises(ValueError):

diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py
@@ -314,7 +314,9 @@ def test_generate(self):
                 client=MagicMock(),
                 logger=mocked_logger,
                 model_family="granite",
-                model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
+                model_name=os.path.join(
+                    TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
+                ),
                 num_instructions_to_generate=10,
                 taxonomy=self.test_taxonomy.root,
                 taxonomy_base=TEST_TAXONOMY_BASE,
@@ -391,7 +393,9 @@ def test_generate(self):
                 client=MagicMock(),
                 logger=mocked_logger,
                 model_family="granite",
-                model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
+                model_name=os.path.join(
+                    TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
+                ),
                 num_instructions_to_generate=10,
                 taxonomy=self.test_taxonomy.root,
                 taxonomy_base=TEST_TAXONOMY_BASE,
@@ -489,7 +493,9 @@ def test_generate(self):
                 client=MagicMock(),
                 logger=mocked_logger,
                 model_family="granite",
-                model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
+                model_name=os.path.join(
+                    TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
+                ),
                 num_instructions_to_generate=10,
                 taxonomy=self.test_taxonomy.root,
                 taxonomy_base=TEST_TAXONOMY_BASE,

diff --git a/tests/test_taxonomy.py b/tests/test_taxonomy.py
@@ -86,13 +86,13 @@ def test_read_taxonomy_leaf_nodes(
             ):
                 seed_example_exists = True
             assert seed_example_exists is True
-    
+
     @pytest.mark.parametrize(
         "s, contains_html",
         [
             ("hello, world!", False),
             ("hello, <div>world!</div>", True),
-        ]
+        ],
     )
     def test_string_contains_html(self, s, contains_html):
         print(taxonomy.__dict__)