Skip to content

Commit

Permalink
Update for ruff issues
Browse files Browse the repository at this point in the history
Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal committed Jan 6, 2025
1 parent 1722161 commit f62567d
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 28 deletions.
22 changes: 11 additions & 11 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ def resolve_ocr_options() -> OcrOptions:


def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
"""Split document paths into a dict of lists based on their file extension.
"""
"""Split document paths into a dict of lists based on their file extension."""
document_dict = defaultdict(list)
for path in document_paths:
filetype = path.suffix
Expand All @@ -83,7 +82,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
return dict(document_dict)



# class DocumentChunker:
# """A factory chunker class that instantiates the applicable chunker

Expand Down Expand Up @@ -226,8 +224,7 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
# return chunk_markdowns(self.document_contents, chunk_size)


class DocumentChunker(): # pylint: disable=too-many-instance-attributes

class DocumentChunker: # pylint: disable=too-many-instance-attributes
# def __new__(
# cls,
# leaf_node,
Expand Down Expand Up @@ -267,13 +264,12 @@ def __init__(
self.tokenizer = self.create_tokenizer(tokenizer_model_name)

def _init_docling_converter(self):
"""Initialize docling converter with filetype-specific configurations
"""
"""Initialize docling converter with filetype-specific configurations"""
# triggers torch loading, import lazily
# pylint: disable=import-outside-toplevel
# Third Party
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

if self.docling_model_path is None:
logger.info("Docling models not found on disk, downloading models...")
Expand All @@ -285,7 +281,7 @@ def _init_docling_converter(self):
artifacts_path=self.docling_model_path,
do_ocr=False,
)

ocr_options = resolve_ocr_options()
if ocr_options is not None:
pipeline_options.do_ocr = True
Expand Down Expand Up @@ -402,7 +398,9 @@ def create_tokenizer(model_name: str):
# Third Party
from transformers import AutoTokenizer

model_path = Path(model_name) # TODO expect a path from the DocumentChunker constructor
model_path = Path(
model_name
) # TODO expect a path from the DocumentChunker constructor
error_info_message = (
"Please run `ilab model download {download_args}` and try again"
)
Expand Down Expand Up @@ -583,7 +581,9 @@ def build_chunks_from_docling_json(
)
book_text = self.get_table(json_book, book_element["$ref"])
elif book_element["prov"]:
current_book_page_number = book_element["prov"][0]["page"] # TODO export to function to handle empty ["prov"]
current_book_page_number = book_element["prov"][0][
"page"
] # TODO export to function to handle empty ["prov"]
book_text = book_element["text"]
else:
current_book_page_number = None
Expand Down
16 changes: 9 additions & 7 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,12 @@ def _get_taxonomy(repo="taxonomy"):

def _string_contains_html(s: str) -> bool:
"""Detect HTML tags in a string.
We use this to catch markdown files that may contain html elements since
docling does not support this."""
# Define a regex to detect HTML tags
html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>")

# Check for HTML tags in the content
return bool(html_tag_pattern.search(s))

Expand Down Expand Up @@ -173,11 +173,13 @@ def _get_documents(
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
if _string_contains_html(content):
raise ValueError(f"Provided markdown file {file_path} contains"
" HTML, which is currently unsupported. Please"
" format your markdown documents without the"
" use of HTML or use a different document"
" filetype.")
raise ValueError(
f"Provided markdown file {file_path} contains"
" HTML, which is currently unsupported. Please"
" format your markdown documents without the"
" use of HTML or use a different document"
" filetype."
)
file_contents.append(content)
filepaths.append(Path(file_path))
logger.info(
Expand Down
9 changes: 4 additions & 5 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@
import pytest

# First Party
from instructlab.sdg.utils.chunkers import (
DocumentChunker,
resolve_ocr_options,
)
from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options

# Local
from .testdata import testdata
Expand All @@ -32,7 +29,9 @@ def tokenizer_model_name():
return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab")


def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name):
def test_init_document_chunker_unsupported_filetype(
documents_dir, tokenizer_model_name
):
"""Test that the DocumentChunker factory class fails when provided an unsupported document"""
document_paths = [documents_dir / "document.jpg"]
with pytest.raises(ValueError):
Expand Down
12 changes: 9 additions & 3 deletions tests/test_generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,9 @@ def test_generate(self):
client=MagicMock(),
logger=mocked_logger,
model_family="granite",
model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
model_name=os.path.join(
TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
),
num_instructions_to_generate=10,
taxonomy=self.test_taxonomy.root,
taxonomy_base=TEST_TAXONOMY_BASE,
Expand Down Expand Up @@ -391,7 +393,9 @@ def test_generate(self):
client=MagicMock(),
logger=mocked_logger,
model_family="granite",
model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
model_name=os.path.join(
TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
),
num_instructions_to_generate=10,
taxonomy=self.test_taxonomy.root,
taxonomy_base=TEST_TAXONOMY_BASE,
Expand Down Expand Up @@ -489,7 +493,9 @@ def test_generate(self):
client=MagicMock(),
logger=mocked_logger,
model_family="granite",
model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
model_name=os.path.join(
TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
),
num_instructions_to_generate=10,
taxonomy=self.test_taxonomy.root,
taxonomy_base=TEST_TAXONOMY_BASE,
Expand Down
4 changes: 2 additions & 2 deletions tests/test_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,13 @@ def test_read_taxonomy_leaf_nodes(
):
seed_example_exists = True
assert seed_example_exists is True

@pytest.mark.parametrize(
"s, contains_html",
[
("hello, world!", False),
("hello, <div>world!</div>", True),
]
],
)
def test_string_contains_html(self, s, contains_html):
print(taxonomy.__dict__)
Expand Down

0 comments on commit f62567d

Please sign in to comment.