From 9b77c054ef4297c7057fa8db875368c7c02f1bfc Mon Sep 17 00:00:00 2001
From: Ryan Landay
Date: Mon, 14 Oct 2024 02:40:17 -0400
Subject: [PATCH] Fix off-by-one error in tokenizer::normalizer::Range::len (#1638)

---
 tokenizers/src/tokenizer/normalizer.rs | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index a8a05c795..9cbbccee2 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -45,8 +45,8 @@ where
 
         match range.start_bound() {
             Bound::Unbounded => Some(end),
-            Bound::Included(i) => Some(end - (*i + 1)),
-            Bound::Excluded(i) => Some(end - *i),
+            Bound::Included(i) => Some(end - *i),
+            Bound::Excluded(i) => Some(end - (*i + 1)),
         }
     }
 
@@ -1013,6 +1013,20 @@ mod tests {
     use regex::Regex;
     use unicode_categories::UnicodeCategories;
 
+    #[test]
+    fn test_len_range_inclusive() {
+        let range = Range::Original(3..=7);
+        let len = range.len();
+        assert_eq!(len, Some(5)); // 7 - 3 + 1 = 5
+    }
+
+    #[test]
+    fn test_len_range_exclusive() {
+        let range = Range::Original(3..7);
+        let len = range.len();
+        assert_eq!(len, Some(4)); // 7 - 3 = 4
+    }
+
     #[test]
     fn nfd_adds_new_chars() {
         let mut n = NormalizedString::from("élégant");