From 41d8b0363e7c18e3bc6dad08dbcfe5bc2e30ca94 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Mon, 30 Sep 2024 11:05:03 +0200
Subject: [PATCH] Fix the default offset create

---
 tokenizers/src/models/bpe/model.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 1585da761..4a534046a 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -462,7 +462,11 @@ impl BPE {
     fn tokenize_with_cache(&self, sequence: &str) -> Result<Vec<Token>> {
         if self.ignore_merges {
             if let Some(id) = self.vocab.get(sequence) {
-                return Ok(vec![Token::new(*id, sequence.to_string().clone(), (0, 0))]);
+                return Ok(vec![Token::new(
+                    *id,
+                    sequence.to_string().clone(),
+                    (0, sequence.len()),
+                )]);
             }
         }
         if let Some(ref hit) = self.cache.as_ref().and_then(|c| c.get(sequence)) {