Skip to content

Commit

Permalink
Fix off-by-one error in tokenizer::normalizer::Range::len (#1638)
Browse files Browse the repository at this point in the history
  • Loading branch information
rlanday authored Oct 14, 2024
1 parent bce68a6 commit 9b77c05
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions tokenizers/src/tokenizer/normalizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ where

match range.start_bound() {
Bound::Unbounded => Some(end),
Bound::Included(i) => Some(end - (*i + 1)),
Bound::Excluded(i) => Some(end - *i),
Bound::Included(i) => Some(end - *i),
Bound::Excluded(i) => Some(end - (*i + 1)),
}
}

Expand Down Expand Up @@ -1013,6 +1013,20 @@ mod tests {
use regex::Regex;
use unicode_categories::UnicodeCategories;

#[test]
fn test_len_range_inclusive() {
    // `3..=7` counts both endpoints (3, 4, 5, 6, 7), so the expected
    // length is 7 - 3 + 1 = 5.
    let inclusive = Range::Original(3..=7);
    assert_eq!(inclusive.len(), Some(5));
}

#[test]
fn test_len_range_exclusive() {
    // `3..7` stops before 7 (3, 4, 5, 6), so the expected length is
    // 7 - 3 = 4.
    let half_open = Range::Original(3..7);
    assert_eq!(half_open.len(), Some(4));
}

#[test]
fn nfd_adds_new_chars() {
let mut n = NormalizedString::from("élégant");
Expand Down

0 comments on commit 9b77c05

Please sign in to comment.