From 16dfb4d59dbae27848524085ac1aabb11788f6ba Mon Sep 17 00:00:00 2001
From: Umar Butler <umar@umar.au>
Date: Fri, 12 Jul 2024 20:54:30 +1000
Subject: [PATCH] Fixing #7.

---
 CHANGELOG.md             |  4 ++
 README.md                |  6 ++-
 pyproject.toml           |  2 +-
 src/semchunk/__init__.py |  2 +-
 src/semchunk/semchunk.py | 87 ++++++++++++++++++++++------------------
 5 files changed, 59 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 82bd7ff..563d3b6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,10 @@
 ## Changelog 🔄
 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [2.2.0] - 2024-07-12
+### Changed
+- Switched from having `chunkerify()` output a function to having it return an instance of the new `Chunker()` class, which should not alter functionality in any way but will allow for the preservation of type hints, fixing [#7](https://github.com/umarbutler/semchunk/pull/7).
+
 ## [2.1.0] - 2024-06-20
 ### Fixed
 - Ceased memoizing `chunk()` (but not token counters) due to the fact that cached outputs of memoized functions are shallow rather than deep copies of original outputs, meaning that if one were to chunk a text and then chunk that same text again and then modify one of the chunks outputted by the first call, the chunks outputted by the second call would also be modified. This behaviour is not expected and therefore undesirable. The memoization of token counters is not impacted as they output immutable objects, namely, integers.
diff --git a/README.md b/README.md
index 006f61b..9637504 100644
--- a/README.md
+++ b/README.md
@@ -66,12 +66,14 @@
 
 `memoize` flags whether to memoize the token counter. It defaults to `True`.
 
-This function returns a callable that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
+This function returns a chunker that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
 
-The resulting chunker function can also be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
+The resulting chunker can be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
 
 It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar.
 
+Technically, the chunker will be an instance of the `semchunk.Chunker` class to assist with type hinting, though this should have no impact on how it can be used.
+
 ### Chunk
 ```python
 def chunk(
diff --git a/pyproject.toml b/pyproject.toml
index 958c2e3..47c1911 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "2.1.0"
+version = "2.2.0"
 authors = [
     {name="Umar Butler", email="umar@umar.au"},
 ]
diff --git a/src/semchunk/__init__.py b/src/semchunk/__init__.py
index fb06e16..e192e54 100644
--- a/src/semchunk/__init__.py
+++ b/src/semchunk/__init__.py
@@ -1,3 +1,3 @@
 """A fast and lightweight Python library for splitting text into semantically meaningful chunks."""
 
-from .semchunk import chunk, chunkerify
\ No newline at end of file
+from .semchunk import chunk, Chunker, chunkerify
diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
index 8b2c96a..651f59f 100644
--- a/src/semchunk/semchunk.py
+++ b/src/semchunk/semchunk.py
@@ -18,6 +18,7 @@
     import tokenizers
     import transformers
 
+
 _memoized_token_counters = {}
 """A map of token counters to their memoized versions."""
 
@@ -29,6 +30,7 @@
 )
 """A tuple of semantically meaningful non-whitespace splitters that may be used to chunk texts, ordered from most desirable to least desirable."""
 
+
 def _split_text(text: str) -> tuple[str, bool, list[str]]:
     """Split text using the most semantically meaningful splitter possible."""
 
@@ -151,13 +153,51 @@
     return chunks
 
+
+class Chunker:
+    def __init__(self, chunk_size: int, token_counter: Callable[[str], int]) -> None:
+        self.chunk_size = chunk_size
+        self.token_counter = token_counter
+
+    def chunk(self, text: str) -> list[str]:
+        """Chunk a text."""
+
+        return chunk(text, self.chunk_size, self.token_counter, memoize = False)
+
+    def __call__(
+        self,
+        text_or_texts: str | Sequence[str],
+        processes: int = 1,
+        progress: bool = False,
+    ) -> list[str] | list[list[str]]:
+        """Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
+
+        Args:
+            text_or_texts (str | Sequence[str]): The text or texts to be chunked.
+            processes (int, optional): The number of processes to use when chunking multiple texts. Defaults to `1`, in which case chunking will occur in the main process.
+            progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`.
+
+        Returns:
+            list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts."""
+
+        if isinstance(text_or_texts, str):
+            return self.chunk(text_or_texts)
+
+        if progress and processes == 1:
+            text_or_texts = tqdm(text_or_texts)
+
+        if processes == 1:
+            return [self.chunk(text) for text in text_or_texts]
+
+        with mpire.WorkerPool(processes, use_dill = True) as pool:
+            return pool.map(self.chunk, text_or_texts, progress_bar = progress)
 
 def chunkerify(
     tokenizer_or_token_counter: str | tiktoken.Encoding | transformers.PreTrainedTokenizer \
         | tokenizers.Tokenizer | Callable[[str], int],
-    chunk_size: int = None,
-    max_token_chars: int = None,
+    chunk_size: int | None = None,
+    max_token_chars: int | None = None,
     memoize: bool = True,
-):  # NOTE The output of `chunkerify()` is not type hinted because it causes `vscode` to overwrite the signature and docstring of the outputted chunker with the type hint.
+) -> Chunker:
     """Construct a chunker that splits one or more texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
 
     Args:
@@ -167,11 +207,13 @@ def chunkerify(
         memoize (bool, optional): Whether to memoize the token counter. Defaults to `True`.
 
     Returns:
-        Callable[[str | Sequence[str], bool, bool], list[str] | list[list[str]]]: A function that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
+        Callable[[str | Sequence[str], bool, bool], list[str] | list[list[str]]]: A chunker that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
+
+        The resulting chunker can be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
 
-        The resulting chunker function can also be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
+        It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar.
 
-        It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar."""
+        Technically, the chunker will be an instance of the `semchunk.Chunker` class to assist with type hinting, though this should have no impact on how it can be used."""
 
     # If the provided tokenizer is a string, try to load it with either `tiktoken` or `transformers` or raise an error if neither is available.
     if isinstance(tokenizer_or_token_counter, str):
@@ -254,36 +296,5 @@ def faster_token_counter(text: str) -> int:
     if memoize:
         token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
 
-    # Construct a chunking function that passes the chunk size and token counter to `chunk()`.
-    def chunking_function(text: str) -> list[str]:
-        return chunk(text, chunk_size, token_counter, memoize = False)
-
     # Construct and return the chunker.
-    def chunker(
-        text_or_texts: str | Sequence[str],
-        processes: int = 1,
-        progress: bool = False,
-    ) -> list[str] | list[list[str]]:
-        """Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
-
-        Args:
-            text_or_texts (str | Sequence[str]): The text or texts to be chunked.
-
-        Returns:
-            list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
-            processes (int, optional): The number of processes to use when chunking multiple texts. Defaults to `1` in which case chunking will occur in the main process.
-            progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`."""
-
-        if isinstance(text_or_texts, str):
-            return chunking_function(text_or_texts)
-
-        if progress and processes == 1:
-            text_or_texts = tqdm(text_or_texts)
-
-        if processes == 1:
-            return [chunking_function(text) for text in text_or_texts]
-
-        with mpire.WorkerPool(processes, use_dill = True) as pool:
-            return pool.map(chunking_function, text_or_texts, progress_bar = progress)
-
-    return chunker
+    return Chunker(chunk_size, token_counter)
\ No newline at end of file
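
For reviewers, here is a minimal sketch of how the patched API behaves. The tokenizer name (`gpt-4`) and chunk size (`512`) are illustrative assumptions, not values drawn from the patch itself:

```python
import semchunk

# `chunkerify()` now returns an instance of the new `Chunker` class instead of
# a locally defined closure, so editors and type checkers can resolve the
# chunker's signature and docstring. How it is called is unchanged.
chunker = semchunk.chunkerify('gpt-4', chunk_size = 512)
assert isinstance(chunker, semchunk.Chunker)

# A single text still yields a list of chunks.
chunks: list[str] = chunker('The quick brown fox jumps over the lazy dog.')

# A sequence of texts still yields a list of lists of chunks, with optional
# multiprocessing (`processes`) and a progress bar (`progress`).
chunks_per_text: list[list[str]] = chunker(
    ['The first text.', 'The second text.'],
    processes = 2,
    progress = True,
)
```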