Skip to content

Commit

Permalink
Disable caching for long strings. (#1676)
Browse files Browse the repository at this point in the history
  • Loading branch information
Narsil authored Nov 7, 2024
1 parent c6b5c3e commit 5aa9f6c
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 4 deletions.
6 changes: 4 additions & 2 deletions tokenizers/src/models/bpe/model.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::{super::OrderedVocabIter, trainer::BpeTrainer, Error, Pair, Word};
use crate::tokenizer::{Model, Result, Token};
use crate::utils::cache::{Cache, DEFAULT_CACHE_CAPACITY};
use crate::utils::cache::{Cache, DEFAULT_CACHE_CAPACITY, MAX_LENGTH};
use crate::utils::iter::ResultShunt;
use serde_json::Value;
use std::borrow::Cow;
Expand Down Expand Up @@ -482,7 +482,9 @@ impl BPE {
let word = self.merge_word(sequence)?;
let ret = self.word_to_tokens(&word).collect();
if let Some(ref cache) = self.cache {
cache.set(sequence.to_owned(), word);
if sequence.len() < MAX_LENGTH {
cache.set(sequence.to_owned(), word);
}
}
Ok(ret)
}
Expand Down
6 changes: 4 additions & 2 deletions tokenizers/src/models/unigram/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use super::{
trie::{Trie, TrieBuilder},
};
use crate::tokenizer::{Model, Result, Token};
use crate::utils::cache::Cache;
use crate::utils::cache::{Cache, MAX_LENGTH};

use std::collections::HashMap;
use std::convert::TryInto;
Expand Down Expand Up @@ -230,7 +230,9 @@ impl Unigram {
} else {
self.encode_unoptimized(sentence)?
};
self.cache.set(sentence.to_owned(), result.clone());
if sentence.len() < MAX_LENGTH {
self.cache.set(sentence.to_owned(), result.clone());
}
Ok(result)
}
}
Expand Down
3 changes: 3 additions & 0 deletions tokenizers/src/utils/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ use std::sync::RwLock;

/// The default capacity for a `BPE`'s internal cache.
pub static DEFAULT_CACHE_CAPACITY: usize = 10_000;
/// The maximum input length for which a model should cache its result.
///
/// The `BPE` and `Unigram` models only write to their cache when the input's
/// `len()` is strictly less than this value, so an input whose length equals
/// `MAX_LENGTH` is not cached. Strings that are too long have minimal chances
/// of producing a cache hit anyway.
///
/// NOTE(review): the length is presumably measured in bytes (`str::len`), not
/// in characters; confirm against the callers' argument types.
pub static MAX_LENGTH: usize = 256;

/// Provides a simple multithreaded cache to speed up BPE tokenization. It tries to read values
/// concurrently but won't block if another thread is writing.
Expand Down

0 comments on commit 5aa9f6c

Please sign in to comment.