Skip to content

Commit

Permalink
Update for ruff and pylint issues
Browse files Browse the repository at this point in the history
Signed-off-by: Aakanksha Duggal <[email protected]>
  • Loading branch information
aakankshaduggal committed Jan 6, 2025
1 parent 1722161 commit 309fd11
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 32 deletions.
26 changes: 12 additions & 14 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
# Standard
from abc import ABC, abstractmethod
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple
from typing import Dict, Iterable, List, Optional
import json
import logging
import re
Expand Down Expand Up @@ -70,8 +68,7 @@ def resolve_ocr_options() -> OcrOptions:


def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
"""Split document paths into a dict of lists based on their file extension.
"""
"""Split document paths into a dict of lists based on their file extension."""
document_dict = defaultdict(list)
for path in document_paths:
filetype = path.suffix
Expand All @@ -83,7 +80,6 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
return dict(document_dict)



# class DocumentChunker:
# """A factory chunker class that instantiates the applicable chunker

Expand Down Expand Up @@ -226,8 +222,7 @@ def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
# return chunk_markdowns(self.document_contents, chunk_size)


class DocumentChunker(): # pylint: disable=too-many-instance-attributes

class DocumentChunker: # pylint: disable=too-many-instance-attributes
# def __new__(
# cls,
# leaf_node,
Expand Down Expand Up @@ -267,13 +262,12 @@ def __init__(
self.tokenizer = self.create_tokenizer(tokenizer_model_name)

def _init_docling_converter(self):
"""Initialize docling converter with filetype-specific configurations
"""
"""Initialize docling converter with filetype-specific configurations"""
# triggers torch loading, import lazily
# pylint: disable=import-outside-toplevel
# Third Party
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

if self.docling_model_path is None:
logger.info("Docling models not found on disk, downloading models...")
Expand All @@ -285,7 +279,7 @@ def _init_docling_converter(self):
artifacts_path=self.docling_model_path,
do_ocr=False,
)

ocr_options = resolve_ocr_options()
if ocr_options is not None:
pipeline_options.do_ocr = True
Expand Down Expand Up @@ -402,7 +396,9 @@ def create_tokenizer(model_name: str):
# Third Party
from transformers import AutoTokenizer

model_path = Path(model_name) # TODO expect a path from the DocumentChunker constructor
model_path = Path(
model_name
) # TODO expect a path from the DocumentChunker constructor
error_info_message = (
"Please run `ilab model download {download_args}` and try again"
)
Expand Down Expand Up @@ -583,7 +579,9 @@ def build_chunks_from_docling_json(
)
book_text = self.get_table(json_book, book_element["$ref"])
elif book_element["prov"]:
current_book_page_number = book_element["prov"][0]["page"] # TODO export to function to handle empty ["prov"]
current_book_page_number = book_element["prov"][0][
"page"
] # TODO export to function to handle empty ["prov"]
book_text = book_element["text"]
else:
current_book_page_number = None
Expand Down
18 changes: 10 additions & 8 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,12 @@ def _get_taxonomy(repo="taxonomy"):

def _string_contains_html(s: str) -> bool:
"""Detect HTML tags in a string.
We use this to catch markdown files that may contain html elements since
docling does not support this."""
# Define a regex to detect HTML tags
html_tag_pattern = re.compile(r"<\/?[a-zA-Z][\s\S]*?>")

# Check for HTML tags in the content
return bool(html_tag_pattern.search(s))

Expand Down Expand Up @@ -173,11 +173,13 @@ def _get_documents(
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
if _string_contains_html(content):
raise ValueError(f"Provided markdown file {file_path} contains"
" HTML, which is currently unsupported. Please"
" format your markdown documents without the"
" use of HTML or use a different document"
" filetype.")
raise ValueError(
f"Provided markdown file {file_path} contains"
" HTML, which is currently unsupported. Please"
" format your markdown documents without the"
" use of HTML or use a different document"
" filetype."
)
file_contents.append(content)
filepaths.append(Path(file_path))
logger.info(
Expand Down Expand Up @@ -429,7 +431,7 @@ def map_chunks_to_icls(chunks: List, leaf_node: Dict) -> Dataset:

def _knowledge_leaf_node_to_samples(
leaf_node,
taxonomy_path,
taxonomy_path, # pylint: disable=unused-argument
server_ctx_size,
chunk_word_count,
document_output_dir,
Expand Down
9 changes: 4 additions & 5 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@
import pytest

# First Party
from instructlab.sdg.utils.chunkers import (
DocumentChunker,
resolve_ocr_options,
)
from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options

# Local
from .testdata import testdata
Expand All @@ -32,7 +29,9 @@ def tokenizer_model_name():
return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab")


def test_init_document_chunker_unsupported_filetype(documents_dir, tokenizer_model_name):
def test_init_document_chunker_unsupported_filetype(
documents_dir, tokenizer_model_name
):
"""Test that the DocumentChunker factory class fails when provided an unsupported document"""
document_paths = [documents_dir / "document.jpg"]
with pytest.raises(ValueError):
Expand Down
12 changes: 9 additions & 3 deletions tests/test_generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,9 @@ def test_generate(self):
client=MagicMock(),
logger=mocked_logger,
model_family="granite",
model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
model_name=os.path.join(
TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
),
num_instructions_to_generate=10,
taxonomy=self.test_taxonomy.root,
taxonomy_base=TEST_TAXONOMY_BASE,
Expand Down Expand Up @@ -391,7 +393,9 @@ def test_generate(self):
client=MagicMock(),
logger=mocked_logger,
model_family="granite",
model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
model_name=os.path.join(
TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
),
num_instructions_to_generate=10,
taxonomy=self.test_taxonomy.root,
taxonomy_base=TEST_TAXONOMY_BASE,
Expand Down Expand Up @@ -489,7 +493,9 @@ def test_generate(self):
client=MagicMock(),
logger=mocked_logger,
model_family="granite",
model_name=os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab"),
model_name=os.path.join(
TEST_DATA_DIR, "models/instructlab/granite-7b-lab"
),
num_instructions_to_generate=10,
taxonomy=self.test_taxonomy.root,
taxonomy_base=TEST_TAXONOMY_BASE,
Expand Down
4 changes: 2 additions & 2 deletions tests/test_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,13 @@ def test_read_taxonomy_leaf_nodes(
):
seed_example_exists = True
assert seed_example_exists is True

@pytest.mark.parametrize(
"s, contains_html",
[
("hello, world!", False),
("hello, <div>world!</div>", True),
]
],
)
def test_string_contains_html(self, s, contains_html):
print(taxonomy.__dict__)
Expand Down

0 comments on commit 309fd11

Please sign in to comment.