From 337a18615f991076b076262288b0408cb162b48c Mon Sep 17 00:00:00 2001 From: Rob Kopel Date: Wed, 3 Apr 2024 21:12:00 +1000 Subject: [PATCH 01/11] =?UTF-8?q?=F0=9F=93=88=20performance=20optimisation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + CHANGELOG.md | 5 ++ README.md | 2 +- pyproject.toml | 2 +- src/semchunk/__init__.py | 2 +- src/semchunk/semchunk.py | 99 +++++++++++++++++++++++++++++++++++++++- tests/bench.py | 59 ++++++++++++++++-------- 7 files changed, 146 insertions(+), 24 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..61f2dc9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +**/__pycache__/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 25c4020..158c255 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ ## Changelog 🔄 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.4] - 2024-04-03 +### Changed +- Improved chunking performance +- Added git ignore file + ## [0.2.3] - 2024-03-11 ### Fixed - Ensured that memoization does not overwrite `chunk()`'s function signature. diff --git a/README.md b/README.md index 0c3ff94..98ea0f9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ `semchunk` is a fast and lightweight pure Python library for splitting text into semantically meaningful chunks. -Owing to its complex yet highly efficient chunking algorithm, `semchunk` is both more semantically accurate than [`langchain.text_splitter.RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter) (see [How It Works 🔍](https://github.com/umarbutler/semchunk#how-it-works-)) and is also over 70% faster than [`semantic-text-splitter`](https://pypi.org/project/semantic-text-splitter/) (see the [Benchmarks 📊](https://github.com/umarbutler/semchunk#benchmarks-)). +Owing to its complex yet highly efficient chunking algorithm, `semchunk` is both more semantically accurate than [`langchain.text_splitter.RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter) (see [How It Works 🔍](https://github.com/umarbutler/semchunk#how-it-works-)) and is also over 80% faster than [`semantic-text-splitter`](https://pypi.org/project/semantic-text-splitter/) (see the [Benchmarks 📊](https://github.com/umarbutler/semchunk#benchmarks-)). 
 ## Installation 📦
 `semchunk` may be installed with `pip`:
diff --git a/pyproject.toml b/pyproject.toml
index f2c0158..734b0a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "semchunk"
-version = "0.2.3"
+version = "0.2.4"
 authors = [
     {name="Umar Butler", email="umar@umar.au"},
 ]
diff --git a/src/semchunk/__init__.py b/src/semchunk/__init__.py
index bf8b68c..8f33c5e 100644
--- a/src/semchunk/__init__.py
+++ b/src/semchunk/__init__.py
@@ -1,3 +1,3 @@
 """A fast and lightweight pure Python library for splitting text into semantically meaningful chunks."""
 
-from .semchunk import chunk
+from .semchunk import chunk, chunk_legacy
diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
index ae40557..291122e 100644
--- a/src/semchunk/semchunk.py
+++ b/src/semchunk/semchunk.py
@@ -45,7 +45,7 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
     # Return the splitter and the split text.
     return splitter, splitter_is_whitespace, text.split(splitter)
 
-def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]:
+def chunk_legacy(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]:
     """Split text into semantically meaningful chunks of a specified size as determined by the provided token counter.
 
     Args:
@@ -76,7 +76,7 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=Tru
 
         # If the split is over the chunk size, recursively chunk it.
         if token_counter(split) > chunk_size:
-            chunks.extend(chunk(split, chunk_size, token_counter=token_counter, memoize=memoize, _recursion_depth=_recursion_depth+1))
+            chunks.extend(chunk_legacy(split, chunk_size, token_counter=token_counter, memoize=memoize, _recursion_depth=_recursion_depth+1))
 
         # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached.
         else:
@@ -114,4 +114,99 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=Tru
 
     return chunks
 
+def count(text: str, max_size: int, counter: callable) -> int:
+    """Counts the number of tokens in a text, with a heuristic to accelerate long texts"""
+    heuristic = 6*max_size
+
+    # There is a rare failure case for the below heuristic where superfluous tokens
+    # may be added from a longer, existing token being split before it was finished.
+    # e.g. Australia -> 1 token
+    #      Australi  -> 3 tokens
+    #
+    # We mitigate this failure case by adding the len(longest token)-1 such that
+    # any ongoing token will be able to finish
+    #
+    # Using the cl100k tokenset, the length of the longest non-symbol token is 42
+    # See: https://gist.github.com/Yardanico/623b3092d0b707119f8c7d90a3596afe
+    max_token = 42 - 1
+
+    if len(text) > heuristic and counter(text[:heuristic+max_token]) > max_size:
+        return max_size+1
+    return counter(text)
+
+def find_split(splits: list[str], max_size: int, splitter: str, counter: callable) -> tuple[int, str]:
+    """Binary search for the optimal split point where the accumulated_token_count < max_size."""
+    low, high = 0, len(splits) + 1
+    while low < high:
+        # As the main performance hit comes from running the token_counter on long texts
+        # we can bias the binary search to favour guessing towards shorter sequences.
+        # This is done below by using a divisor greater than 2 (here, 8)
+        mid = low + (high - low) // 8
+        if count(splitter.join(splits[:mid]), max_size, counter) > max_size:
+            high = mid
+        else:
+            low = mid + 1
+
+    return low-1, splitter.join(splits[:low-1])
+
+def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]:
+    """Split text into semantically meaningful chunks of a specified size as determined by the provided token counter.
+
+    Args:
+        text (str): The text to be chunked.
+        chunk_size (int): The maximum number of tokens a chunk may contain.
+        token_counter (callable): A callable that takes a string and returns the number of tokens in it.
+        memoize (bool, optional): Whether to memoise the token counter. Defaults to True.
+
+    Returns:
+        list[str]: A list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed."""
+
+    # If this is not a recursive call and memoization is enabled, overwrite the `token_counter` with a memoised version of itself.
+    if not _recursion_depth and memoize:
+        token_counter = _memoised_token_counters.setdefault(token_counter, cache(token_counter))
+
+    # Split the text using the most semantically meaningful splitter possible.
+    splitter, splitter_is_whitespace, splits = _split_text(text)
+
+    chunks = []
+    skips = set()
+    """A list of indices of splits to skip because they have already been added to a chunk."""
+
+    # Iterate through the splits.
+    for i, split in enumerate(splits):
+        # Skip the split if it has already been added to a chunk.
+        if i in skips:
+            continue
+
+        # If the split is over the chunk size, recursively chunk it.
+        if count(split, chunk_size, token_counter) > chunk_size:
+            chunks.extend(chunk(split, chunk_size, token_counter, memoize, _recursion_depth+1))
+
+        # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached.
+        else:
+            # Use n-ary search to find the optimal split point.
+            optimal, new_chunk = find_split(splits[i:], chunk_size, splitter, token_counter)
+
+            # Update the skips set based on the splits included in the new chunk.
+            skips.update(range(i+1, i + optimal))
+
+            # Add the chunk.
+            chunks.append(new_chunk)
+
+        # If the splitter is not whitespace and the split is not the last split, add the splitter to the end of the last chunk if doing so would not cause it to exceed the chunk size otherwise add the splitter as a new chunk.
+        if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i+1, len(splits)))):
+            # We separately add tokens(prior chunk) and tokens(splitter) to ensure O(1) - (both will be in cache).
+            # There is a failure case where tokens(get_last_token(prior_chunk) + splitter) == 1 however this is
+            # quite uncommon and leads to a negligible impact
+            if token_counter(chunks[-1]) + token_counter(splitter) <= chunk_size:
+                chunks[-1] += splitter
+            else:
+                chunks.append(splitter)
+
+    # If this is not a recursive call, remove any empty chunks.
+ if not _recursion_depth: + chunks = list(filter(None, chunks)) + + return chunks + chunk = wraps(chunk)(cache(chunk)) \ No newline at end of file diff --git a/tests/bench.py b/tests/bench.py index f55a3c7..9660cf6 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -1,34 +1,55 @@ -import semchunk -import semantic_text_splitter -import test_semchunk import time -chunk_size = 512 -semantic_text_splitter_chunker = semantic_text_splitter.TiktokenTextSplitter('gpt-4') +import test_semchunk +import tiktoken +from semantic_text_splitter import TextSplitter + +import semchunk + +chunk_sizes = [8,16,32,64,128,256,512,1024] +semantic_text_splitter_chunker = TextSplitter.from_tiktoken_model('gpt-4') -def bench_semchunk(text: str) -> None: - semchunk.chunk(text, chunk_size=chunk_size, token_counter=test_semchunk._token_counter) +encoder = tiktoken.encoding_for_model('gpt-4') -def bench_semantic_text_splitter(text: str) -> None: +def _token_counterv1(text: str) -> int: + """Count the number of tokens in a text.""" + return len(encoder.encode(text)) + +def _token_counterv2(text: str) -> int: + """Count the number of tokens in a text.""" + return len(encoder.encode(text)) + +def bench_semchunkv1(text: str, chunk_size: int) -> list[str]: + return semchunk.chunk_legacy(text, chunk_size=chunk_size, token_counter=_token_counterv1) + +def bench_semchunkv2(text: str, chunk_size: int) -> list[str]: + return semchunk.chunk(text, chunk_size=chunk_size, token_counter=_token_counterv2) + +def bench_semantic_text_splitter(text: str, chunk_size: int) -> None: semantic_text_splitter_chunker.chunks(text, chunk_size) libraries = { - 'semchunk': bench_semchunk, + 'semchunk': bench_semchunkv1, + 'semchunkv2': bench_semchunkv2, 'semantic_text_splitter': bench_semantic_text_splitter, } def bench() -> dict[str, float]: - benchmarks = dict.fromkeys(libraries.keys(), 0) - - for fileid in test_semchunk.gutenberg.fileids(): - sample = test_semchunk.gutenberg.raw(fileid) - for library, function in libraries.items(): - start = time.time() - function(sample) - benchmarks[library] += time.time() - start + benchmarks = {k: [0]*len(chunk_sizes) for k in libraries.keys()} + for i, chunk_size in enumerate(chunk_sizes): + semchunk.semchunk._memoised_token_counters = {} + for fileid in test_semchunk.gutenberg.fileids(): + sample = test_semchunk.gutenberg.raw(fileid) + for library, function in libraries.items(): + start = time.time() + function(sample, chunk_size) + benchmarks[library][i] += time.time() - start + return benchmarks if __name__ == '__main__': - for library, time_taken in bench().items(): - print(f'{library}: {time_taken:.2f}s') \ No newline at end of file + print('\t\t' + '\t'.join(map(str, chunk_sizes))) + for library, times_taken in bench().items(): + times = '\t'.join(f'{time:.2f}s' for time in times_taken) + print(f'{library}:\t {times}') \ No newline at end of file From eca19bea63fdf5003fd59b144543155ea891f4ed Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Thu, 11 Apr 2024 15:08:07 +1000 Subject: [PATCH 02/11] Added a .gitignore. --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index aa35c27..fd4d2ff 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,3 @@ -<<<<<<< HEAD -**/__pycache__/ -======= # Exclude everything. 
 *
@@ -26,4 +23,3 @@ __pycache__/
 .pytest_cache/
 tests/profiler.py
 tests/test_bench.py
->>>>>>> main

From 33c81cced3d1c53deb3434058fd68e5e629e217e Mon Sep 17 00:00:00 2001
From: Umar Butler
Date: Thu, 11 Apr 2024 15:43:52 +1000
Subject: [PATCH 03/11] Fixed typo in .gitignore.

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index fd4d2ff..e23ba1b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,6 @@
 !.gitignore
 
 # Finally, exclude anything in the above inclusions that we don't want.
-# Exclude common Python files and folders.
 *.pyc
 *.pyo
 *.ipynb

From 69ff0666dded44cc79ebfe50b392f0a358134846 Mon Sep 17 00:00:00 2001
From: Rob Kopel
Date: Mon, 22 Apr 2024 22:27:57 +1000
Subject: [PATCH 04/11] Performance, reliably this time

---
 src/semchunk/semchunk.py | 51 ++++++++++++++++------------------------
 tests/bench.py           | 10 +++++---
 2 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py
index 291122e..914e92c 100644
--- a/src/semchunk/semchunk.py
+++ b/src/semchunk/semchunk.py
@@ -1,5 +1,7 @@
 import re
+from bisect import bisect_left
 from functools import cache, wraps
+from itertools import accumulate
 
 _memoised_token_counters = {}
 """A map of token counters to their memoised versions."""
@@ -114,41 +116,31 @@ def chunk_legacy(text: str, chunk_size: int, token_counter: callable, memoize: b
 
     return chunks
 
-def count(text: str, max_size: int, counter: callable) -> int:
-    """Counts the number of tokens in a text, with a heuristic to accelerate long texts"""
-    heuristic = 6*max_size
-
-    # There is a rare failure case for the below heuristic where superfluous tokens
-    # may be added from a longer, existing token being split before it was finished.
-    # e.g. Australia -> 1 token
-    #      Australi  -> 3 tokens
-    #
-    # We mitigate this failure case by adding the len(longest token)-1 such that
-    # any ongoing token will be able to finish
-    #
-    # Using the cl100k tokenset, the length of the longest non-symbol token is 42
-    # See: https://gist.github.com/Yardanico/623b3092d0b707119f8c7d90a3596afe
-    max_token = 42 - 1
-
-    if len(text) > heuristic and counter(text[:heuristic+max_token]) > max_size:
-        return max_size+1
-    return counter(text)
 
 def find_split(splits: list[str], max_size: int, splitter: str, counter: callable) -> tuple[int, str]:
     """Binary search for the optimal split point where the accumulated_token_count < max_size."""
-    low, high = 0, len(splits) + 1
+
+    # Start avg low for fast calc of first real avg
+    avg, low, high = 0.2, 0, len(splits) + 1
+    sums = list(accumulate(map(len, splits), initial=0))
+    sums.append(sums[-1])
+
     while low < high:
-        # As the main performance hit comes from running the token_counter on long texts
-        # we can bias the binary search to favour guessing towards shorter sequences.
-        # This is done below by using a divisor greater than 2 (here, 8)
-        mid = low + (high - low) // 8
-        if count(splitter.join(splits[:mid]), max_size, counter) > max_size:
+        idx = bisect_left(sums[low:high + 1], max_size * avg)
+        mid = min(idx + low, high - 1)
+
+        tokens = counter(splitter.join(splits[:mid]))
+
+        avg = sums[mid]/tokens if sums[mid] else avg
+
+        if tokens > max_size:
             high = mid
         else:
             low = mid + 1
 
     return low-1, splitter.join(splits[:low-1])
 
+
 def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]:
     """Split text into semantically meaningful chunks of a specified size as determined by the provided token counter.
@@ -179,7 +171,7 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=Tru
             continue
 
         # If the split is over the chunk size, recursively chunk it.
-        if count(split, chunk_size, token_counter) > chunk_size:
+        if token_counter(split) > chunk_size:
             chunks.extend(chunk(split, chunk_size, token_counter, memoize, _recursion_depth+1))
 
         # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached.
@@ -195,11 +187,8 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=Tru
 
         # If the splitter is not whitespace and the split is not the last split, add the splitter to the end of the last chunk if doing so would not cause it to exceed the chunk size otherwise add the splitter as a new chunk.
         if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i+1, len(splits)))):
-            # We separately add tokens(prior chunk) and tokens(splitter) to ensure O(1) - (both will be in cache).
-            # There is a failure case where tokens(get_last_token(prior_chunk) + splitter) == 1 however this is
-            # quite uncommon and leads to a negligible impact
-            if token_counter(chunks[-1]) + token_counter(splitter) <= chunk_size:
-                chunks[-1] += splitter
+            if token_counter(last_chunk_with_splitter:=chunks[-1]+splitter) <= chunk_size:
+                chunks[-1] = last_chunk_with_splitter
             else:
                 chunks.append(splitter)
 
diff --git a/tests/bench.py b/tests/bench.py
index 9660cf6..6e386ec 100644
--- a/tests/bench.py
+++ b/tests/bench.py
@@ -6,7 +6,8 @@ import semchunk
 
-chunk_sizes = [8,16,32,64,128,256,512,1024]
+chunk_sizes = [8,16,32,64,128,256,512,1024,2048,4096,8192]
+
 semantic_text_splitter_chunker = TextSplitter.from_tiktoken_model('gpt-4')
 
 encoder = tiktoken.encoding_for_model('gpt-4')
@@ -31,7 +32,7 @@ def bench_semantic_text_splitter(text: str, chunk_size: int) -> None:
 libraries = {
     'semchunk': bench_semchunkv1,
     'semchunkv2': bench_semchunkv2,
-    'semantic_text_splitter': bench_semantic_text_splitter,
+    # 'semantic_text_splitter': bench_semantic_text_splitter,
 }
 
 def bench() -> dict[str, float]:
@@ -41,10 +42,13 @@ def bench() -> dict[str, float]:
         semchunk.semchunk._memoised_token_counters = {}
         for fileid in test_semchunk.gutenberg.fileids():
             sample = test_semchunk.gutenberg.raw(fileid)
+            results = []
             for library, function in libraries.items():
                 start = time.time()
-                function(sample, chunk_size)
+                results.append(function(sample, chunk_size))
                 benchmarks[library][i] += time.time() - start
+            if len(results) > 1:
+                assert results[-1] == results[-2]
 
     return benchmarks
 
From 48a0b234af8dd7e85ff6eab8fde388f31fa9e7d9 Mon Sep 17 00:00:00 2001
From: Umar Butler
Date: Mon, 13 May 2024 15:39:10 +1000
Subject: [PATCH 05/11] Restoring the original benchmarking script.
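
The interim multi-size harness (with its per-size reset of
`semchunk.semchunk._memoised_token_counters` and its assertion that the old
and new chunkers produce identical output) served its purpose during
development and is reverted here to the original single-configuration script.

For anyone reproducing the headline numbers, the restored harness amounts to
the sketch below. It is illustrative only and assumes, as `test_semchunk`
does, that NLTK's Gutenberg corpus is available:

    import time

    import semchunk
    import tiktoken
    from nltk.corpus import gutenberg

    encoder = tiktoken.encoding_for_model('gpt-4')

    def _token_counter(text: str) -> int:
        # Count GPT-4 tokens, exactly as the test suite's counter does.
        return len(encoder.encode(text))

    total = 0.0
    for fileid in gutenberg.fileids():
        sample = gutenberg.raw(fileid)
        start = time.time()
        semchunk.chunk(sample, chunk_size=512, token_counter=_token_counter)
        total += time.time() - start

    print(f'semchunk: {total:.2f}s')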
--- tests/bench.py | 65 ++++++++++++++++---------------------------------- 1 file changed, 20 insertions(+), 45 deletions(-) diff --git a/tests/bench.py b/tests/bench.py index 6e386ec..f55a3c7 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -1,59 +1,34 @@ -import time - -import test_semchunk -import tiktoken -from semantic_text_splitter import TextSplitter - import semchunk +import semantic_text_splitter +import test_semchunk +import time -chunk_sizes = [8,16,32,64,128,256,512,1024,2048,4096,8192] - -semantic_text_splitter_chunker = TextSplitter.from_tiktoken_model('gpt-4') - -encoder = tiktoken.encoding_for_model('gpt-4') - -def _token_counterv1(text: str) -> int: - """Count the number of tokens in a text.""" - return len(encoder.encode(text)) - -def _token_counterv2(text: str) -> int: - """Count the number of tokens in a text.""" - return len(encoder.encode(text)) - -def bench_semchunkv1(text: str, chunk_size: int) -> list[str]: - return semchunk.chunk_legacy(text, chunk_size=chunk_size, token_counter=_token_counterv1) +chunk_size = 512 +semantic_text_splitter_chunker = semantic_text_splitter.TiktokenTextSplitter('gpt-4') -def bench_semchunkv2(text: str, chunk_size: int) -> list[str]: - return semchunk.chunk(text, chunk_size=chunk_size, token_counter=_token_counterv2) +def bench_semchunk(text: str) -> None: + semchunk.chunk(text, chunk_size=chunk_size, token_counter=test_semchunk._token_counter) -def bench_semantic_text_splitter(text: str, chunk_size: int) -> None: +def bench_semantic_text_splitter(text: str) -> None: semantic_text_splitter_chunker.chunks(text, chunk_size) libraries = { - 'semchunk': bench_semchunkv1, - 'semchunkv2': bench_semchunkv2, - # 'semantic_text_splitter': bench_semantic_text_splitter, + 'semchunk': bench_semchunk, + 'semantic_text_splitter': bench_semantic_text_splitter, } def bench() -> dict[str, float]: - benchmarks = {k: [0]*len(chunk_sizes) for k in libraries.keys()} + benchmarks = dict.fromkeys(libraries.keys(), 0) + + for fileid in test_semchunk.gutenberg.fileids(): + sample = test_semchunk.gutenberg.raw(fileid) + for library, function in libraries.items(): + start = time.time() + function(sample) + benchmarks[library] += time.time() - start - for i, chunk_size in enumerate(chunk_sizes): - semchunk.semchunk._memoised_token_counters = {} - for fileid in test_semchunk.gutenberg.fileids(): - sample = test_semchunk.gutenberg.raw(fileid) - results = [] - for library, function in libraries.items(): - start = time.time() - results.append(function(sample, chunk_size)) - benchmarks[library][i] += time.time() - start - if len(results) > 1: - assert results[-1] == results[-2] - return benchmarks if __name__ == '__main__': - print('\t\t' + '\t'.join(map(str, chunk_sizes))) - for library, times_taken in bench().items(): - times = '\t'.join(f'{time:.2f}s' for time in times_taken) - print(f'{library}:\t {times}') \ No newline at end of file + for library, time_taken in bench().items(): + print(f'{library}: {time_taken:.2f}s') \ No newline at end of file From 8683723f857841092314cfe1ca8f87f9d05a86b7 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 13 May 2024 15:48:16 +1000 Subject: [PATCH 06/11] Minor reformatting. 
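
Beyond the import reshuffle, this deletes the `chunk_legacy` implementation
that was kept alongside the rewrite while its output was being verified. That
verification was essentially the equality assertion from the interim
benchmark; a standalone sketch of the same check (illustrative only, and
runnable only while both functions still exist, i.e. before this patch):

    import semchunk

    def token_counter(text: str) -> int:
        # Whitespace tokenisation stands in for a real tokeniser here.
        return len(text.split())

    text = 'The quick brown fox jumps over the lazy dog. ' * 1000

    for chunk_size in (8, 64, 512):
        assert semchunk.chunk(text, chunk_size, token_counter) \
            == semchunk.chunk_legacy(text, chunk_size, token_counter)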
--- src/semchunk/semchunk.py | 72 ++-------------------------------------- 1 file changed, 3 insertions(+), 69 deletions(-) diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 914e92c..fdd3859 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -1,8 +1,10 @@ import re + from bisect import bisect_left from functools import cache, wraps from itertools import accumulate + _memoised_token_counters = {} """A map of token counters to their memoised versions.""" @@ -47,75 +49,6 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]: # Return the splitter and the split text. return splitter, splitter_is_whitespace, text.split(splitter) -def chunk_legacy(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]: - """Split text into semantically meaningful chunks of a specified size as determined by the provided token counter. - - Args: - text (str): The text to be chunked. - chunk_size (int): The maximum number of tokens a chunk may contain. - token_counter (callable): A callable that takes a string and returns the number of tokens in it. - memoize (bool, optional): Whether to memoise the token counter. Defaults to True. - - Returns: - list[str]: A list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed.""" - - # If this is not a recursive call and memoization is enabled, overwrite the `token_counter` with a memoised version of itself. - if not _recursion_depth and memoize: - token_counter = _memoised_token_counters.setdefault(token_counter, cache(token_counter)) - - # Split the text using the most semantically meaningful splitter possible. - splitter, splitter_is_whitespace, splits = _split_text(text) - - chunks = [] - skips = set() - """A list of indices of splits to skip because they have already been added to a chunk.""" - - # Iterate through the splits. - for i, split in enumerate(splits): - # Skip the split if it has already been added to a chunk. - if i in skips: - continue - - # If the split is over the chunk size, recursively chunk it. - if token_counter(split) > chunk_size: - chunks.extend(chunk_legacy(split, chunk_size, token_counter=token_counter, memoize=memoize, _recursion_depth=_recursion_depth+1)) - - # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached. - else: - # Initalise the new chunk. - new_chunk = split - - # Iterate through each subsequent split until the chunk size is reached. - for j, next_split in enumerate(splits[i+1:], start=i+1): - # Check whether the next split can be added to the chunk without exceeding the chunk size. - if token_counter(updated_chunk:=new_chunk+splitter+next_split) <= chunk_size: - # Add the next split to the new chunk. - new_chunk = updated_chunk - - # Add the index of the next split to the list of indices to skip. - skips.add(j) - - # If the next split cannot be added to the chunk without exceeding the chunk size, break. - else: - break - - # Add the chunk. - chunks.append(new_chunk) - - # If the splitter is not whitespace and the split is not the last split, add the splitter to the end of the last chunk if doing so would not cause it to exceed the chunk size otherwise add the splitter as a new chunk. 
- if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i+1, len(splits)))): - if token_counter(last_chunk_with_splitter:=chunks[-1]+splitter) <= chunk_size: - chunks[-1] = last_chunk_with_splitter - - else: - chunks.append(splitter) - - # If this is not a recursive call, remove any empty chunks. - if not _recursion_depth: - chunks = list(filter(None, chunks)) - - return chunks - def find_split(splits: list[str], max_size: int, splitter: str, counter: callable) -> tuple[int, str]: """Binary search for the optimal split point where the accumulated_token_count < max_size.""" @@ -198,4 +131,5 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=Tru return chunks + chunk = wraps(chunk)(cache(chunk)) \ No newline at end of file From 2cd1f6eba65f3b518843398e5ea8ac718cec5b35 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 13 May 2024 17:46:16 +1000 Subject: [PATCH 07/11] Reverted version number while new version is still in development. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 734b0a7..f2c0158 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "semchunk" -version = "0.2.4" +version = "0.2.3" authors = [ {name="Umar Butler", email="umar@umar.au"}, ] From 5a787585f179ffa998c141ee71b5607aaa7da5ca Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 13 May 2024 19:09:32 +1000 Subject: [PATCH 08/11] Removed legacy chunker. --- src/semchunk/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semchunk/__init__.py b/src/semchunk/__init__.py index 8f33c5e..bf8b68c 100644 --- a/src/semchunk/__init__.py +++ b/src/semchunk/__init__.py @@ -1,3 +1,3 @@ """A fast and lightweight pure Python library for splitting text into semantically meaningful chunks.""" -from .semchunk import chunk, chunk_legacy +from .semchunk import chunk From 6f800ed0750cab875bd927ccea217599fb1b1bbe Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 13 May 2024 19:09:50 +1000 Subject: [PATCH 09/11] Logged improvements to chunking performance. --- CHANGELOG.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 158c255..3bc3fcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,9 @@ ## Changelog 🔄 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.2.4] - 2024-04-03 +## [Unreleased] - 2024-XX-XX ### Changed -- Improved chunking performance -- Added git ignore file +- Improved chunking performance with larger chunk sizes by switching from linear to binary search for the identification of optimal chunk boundaries. ## [0.2.3] - 2024-03-11 ### Fixed From c989716d4c602ac96f76bba81b443fb19fa8e660 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 13 May 2024 19:10:42 +1000 Subject: [PATCH 10/11] Clarified documentation and naming of variables related to the binary search algorithm. 
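
For future readers, the algorithm being documented here works as follows:
`find_split` (renamed `merge_splits` below) keeps a running estimate of
characters per token, deliberately initialised low (0.2) so that the first
probe of the token counter covers only a short prefix, and uses `bisect_left`
over the cumulative character lengths of the splits to jump directly to the
index at which roughly `chunk_size` tokens' worth of characters have
accumulated. Each real call to the token counter then refines the estimate.
A minimal, standalone sketch of that estimation step:

    from bisect import bisect_left
    from itertools import accumulate

    splits = ['Lorem', 'ipsum', 'dolor', 'sit', 'amet'] * 200
    cumulative_lengths = list(accumulate(map(len, splits), initial=0))

    average = 0.2    # Characters per token; starts low so the first probe is cheap.
    chunk_size = 64  # Maximum tokens per chunk.

    # Guess the split index at which ~chunk_size tokens' worth of characters accumulate.
    guess = bisect_left(cumulative_lengths, chunk_size * average)

    # A real implementation would now count the tokens in splitter.join(splits[:guess]),
    # update average = cumulative_lengths[guess] / tokens and bisect again, narrowing
    # [low, high) exactly as merge_splits does below.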
--- src/semchunk/semchunk.py | 50 +++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index fdd3859..4a2b9c8 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -1,6 +1,7 @@ import re from bisect import bisect_left +from typing import Callable from functools import cache, wraps from itertools import accumulate @@ -50,31 +51,32 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]: return splitter, splitter_is_whitespace, text.split(splitter) -def find_split(splits: list[str], max_size: int, splitter: str, counter: callable) -> tuple[int, str]: - """Binary search for the optimal split point where the accumulated_token_count < max_size.""" +def merge_splits(splits: list[str], chunk_size: int, splitter: str, token_counter: Callable) -> tuple[int, str]: + """Merge splits until a chunk size is reached, returning the index of the last split included in the merged chunk along with the merged chunk itself.""" - # Start avg low for fast calc of first real avg - avg, low, high = 0.2, 0, len(splits) + 1 - sums = list(accumulate(map(len, splits), initial=0)) - sums.append(sums[-1]) + average = 0.2 + low = 0 + high = len(splits) + 1 + cumulative_lengths = tuple(accumulate(map(len, splits), initial=0)) + cumulative_lengths += (cumulative_lengths[-1],) while low < high: - idx = bisect_left(sums[low:high + 1], max_size * avg) - mid = min(idx + low, high - 1) + i = bisect_left(cumulative_lengths[low : high + 1], chunk_size * average) + midpoint = min(i + low, high - 1) - tokens = counter(splitter.join(splits[:mid])) + tokens = token_counter(splitter.join(splits[:midpoint])) - avg = sums[mid]/tokens if sums[mid] else avg + average = cumulative_lengths[midpoint] / tokens if cumulative_lengths[midpoint] else average - if tokens > max_size: - high = mid + if tokens > chunk_size: + high = midpoint else: - low = mid + 1 + low = midpoint + 1 - return low-1, splitter.join(splits[:low-1]) + return low - 1, splitter.join(splits[:low - 1]) -def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=True, _recursion_depth: int = 0) -> list[str]: +def chunk(text: str, chunk_size: int, token_counter: Callable, memoize: bool = True, _recursion_depth: int = 0) -> list[str]: """Split text into semantically meaningful chunks of a specified size as determined by the provided token counter. Args: @@ -105,22 +107,22 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=Tru # If the split is over the chunk size, recursively chunk it. if token_counter(split) > chunk_size: - chunks.extend(chunk(split, chunk_size, token_counter, memoize, _recursion_depth+1)) + chunks.extend(chunk(split, chunk_size, token_counter = token_counter, memoize = memoize, recursion_depth = _recursion_depth + 1)) - # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached. + # If the split is equal to or under the chunk size, add it and any subsequent splits to a new chunk until the chunk size is reached. else: - # Use n-ary search to find the optimal split point. - optimal, new_chunk = find_split(splits[i:], chunk_size, splitter, token_counter) - - # Update the skips set based on the splits included in the new chunk. - skips.update(range(i+1, i + optimal)) + # Merge the split with subsequent splits until the chunk size is reached. 
+ final_split_in_chunk_i, new_chunk = merge_splits(splits[i:], chunk_size, splitter, token_counter) + + # Mark any splits included in the new chunk for exclusion from future chunks. + skips.update(range(i + 1, i + final_split_in_chunk_i)) # Add the chunk. chunks.append(new_chunk) # If the splitter is not whitespace and the split is not the last split, add the splitter to the end of the last chunk if doing so would not cause it to exceed the chunk size otherwise add the splitter as a new chunk. - if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i+1, len(splits)))): - if token_counter(last_chunk_with_splitter:=chunks[-1]+splitter) <= chunk_size: + if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i + 1, len(splits)))): + if token_counter(last_chunk_with_splitter := chunks[-1] + splitter) <= chunk_size: chunks[-1] = last_chunk_with_splitter else: chunks.append(splitter) From 1e3ddb91698f072da1d8a7d809a66467e1d31ff8 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 13 May 2024 19:11:47 +1000 Subject: [PATCH 11/11] Fixing typos. --- src/semchunk/semchunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 4a2b9c8..7ee9515 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -107,7 +107,7 @@ def chunk(text: str, chunk_size: int, token_counter: Callable, memoize: bool = T # If the split is over the chunk size, recursively chunk it. if token_counter(split) > chunk_size: - chunks.extend(chunk(split, chunk_size, token_counter = token_counter, memoize = memoize, recursion_depth = _recursion_depth + 1)) + chunks.extend(chunk(split, chunk_size, token_counter = token_counter, memoize = memoize, _recursion_depth = _recursion_depth + 1)) # If the split is equal to or under the chunk size, add it and any subsequent splits to a new chunk until the chunk size is reached. else:
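
With the full series applied, the public API is unchanged from 0.2.3, namely
`chunk(text, chunk_size, token_counter, memoize=True)`; only the merging
strategy differs. An illustrative call, using a stand-in word-count tokeniser
(any callable from string to token count works):

    import semchunk

    chunks = semchunk.chunk(
        'The quick brown fox jumps over the lazy dog.',
        chunk_size=4,
        token_counter=lambda text: len(text.split()),
    )
    print(chunks)  # Chunks of at most four whitespace-delimited words each.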