From da0b25aba505606a1f08b77015298e5fc1700b12 Mon Sep 17 00:00:00 2001 From: Umar Butler Date: Mon, 5 Feb 2024 20:41:04 +1100 Subject: [PATCH] Ensured that the `memoize` argument is passed back to `chunk()` in recursive calls. --- CHANGELOG.md | 5 +++++ LICENCE | 2 +- README.md | 2 +- pyproject.toml | 2 +- src/semchunk/semchunk.py | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea79af0..aed70cc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ ## Changelog 🔄 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.2] - 2024-02-05 +### Fixed +- Ensured that the `memoize` argument is passed back to `chunk()` in recursive calls. + ## [0.2.1] - 2023-11-09 ### Added - Memoized `chunk()`. @@ -32,6 +36,7 @@ All notable changes to `semchunk` will be documented here. This project adheres ### Added - Added the `chunk()` function, which splits text into semantically meaningful chunks of a specified size as determined by a provided token counter. 
+[0.2.2]: https://github.com/umarbutler/semchunk/compare/v0.2.1...v0.2.2 [0.2.1]: https://github.com/umarbutler/semchunk/compare/v0.2.0...v0.2.1 [0.2.0]: https://github.com/umarbutler/semchunk/compare/v0.1.2...v0.2.0 [0.1.2]: https://github.com/umarbutler/semchunk/compare/v0.1.1...v0.1.2 diff --git a/LICENCE b/LICENCE index 627e8e0..fba0d79 100644 --- a/LICENCE +++ b/LICENCE @@ -1,4 +1,4 @@ -Copyright (c) 2023 Umar Butler +Copyright (c) 2024 Umar Butler Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 3dfe878..0c3ff94 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ To ensure that chunks are as semantically meaningful as possible, `semchunk` use `semchunk` also relies on memoization to cache the results of token counters and the `chunk()` function, thereby improving performance. ## Benchmarks 📊 -On a desktop with a Ryzen 3600, 64 GB of RAM, Windows 11 and Python 3.12.0, it takes `semchunk` 25.29 seconds to split every sample in [NLTK's Gutenberg Corpus](https://www.nltk.org/howto/corpus.html#plaintext-corpora) into 512-token-long chunks (for context, the Corpus contains 18 texts and 3,001,260 tokens). By comparison, it takes [`semantic-text-splitter`](https://pypi.org/project/semantic-text-splitter/) 1 minute and 51.65 seconds to chunk the same texts into 512-token-long chunks — a difference of 77.35%. +On a desktop with a Ryzen 3600, 64 GB of RAM, Windows 11 and Python 3.12.0, it takes `semchunk` 24.41 seconds to split every sample in [NLTK's Gutenberg Corpus](https://www.nltk.org/howto/corpus.html#plaintext-corpora) into 512-token-long chunks (for context, the Corpus contains 18 texts and 3,001,260 tokens). By comparison, it takes [`semantic-text-splitter`](https://pypi.org/project/semantic-text-splitter/) 1 minute and 48.01 seconds to chunk the same texts into 512-token-long chunks — a difference of 77.40%. 
The code used to benchmark `semchunk` and `semantic-text-splitter` is available [here](https://github.com/umarbutler/semchunk/blob/main/tests/bench.py). diff --git a/pyproject.toml b/pyproject.toml index c4f4408..4995291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "semchunk" -version = "0.2.1" +version = "0.2.2" authors = [ {name="Umar Butler", email="umar@umar.au"}, ] diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 1252bd8..c5a9b4e 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -77,7 +77,7 @@ def chunk(text: str, chunk_size: int, token_counter: callable, memoize: bool=Tru # If the split is over the chunk size, recursively chunk it. if token_counter(split) > chunk_size: - chunks.extend(chunk(split, chunk_size, token_counter=token_counter, _recursion_depth=_recursion_depth+1)) + chunks.extend(chunk(split, chunk_size, token_counter=token_counter, memoize=memoize, _recursion_depth=_recursion_depth+1)) # If the split is equal to or under the chunk size, merge it with all subsequent splits until the chunk size is reached. else: