diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs index 10d6080e6..992b1252a 100644 --- a/tokenizers/src/models/bpe/model.rs +++ b/tokenizers/src/models/bpe/model.rs @@ -21,7 +21,7 @@ struct Config { files: Option<(String, String)>, vocab: Vocab, merges: Merges, - cache_capacity: usize, + cache_capacity: i64, dropout: Option, unk_token: Option, continuing_subword_prefix: Option, @@ -43,7 +43,7 @@ impl Default for BpeBuilder { files: None, vocab: HashMap::new(), merges: vec![], - cache_capacity: DEFAULT_CACHE_CAPACITY, + cache_capacity: -1, dropout: None, unk_token: None, continuing_subword_prefix: None, @@ -80,7 +80,7 @@ impl BpeBuilder { /// Set the cache's capacity. Set to 0 if you want to disable caching. #[must_use] pub fn cache_capacity(mut self, capacity: usize) -> Self { - self.config.cache_capacity = capacity; + self.config.cache_capacity = capacity as i64; self } @@ -154,7 +154,11 @@ impl BpeBuilder { .iter() .map(|(key, val)| (*val, key.to_owned())) .collect(); - let cache =Some(Cache::new(self.config.cache_capacity)); + let cache = match self.config.cache_capacity { + 0 => None, + -1 => Some(Cache::new(0)), + capacity => Some(Cache::new(capacity as usize)), + }; let vocab = self.config.vocab; let prefix_len = if let Some(prefix) = &self.config.continuing_subword_prefix { diff --git a/tokenizers/src/utils/cache.rs b/tokenizers/src/utils/cache.rs index 796e26ffa..004c6259a 100644 --- a/tokenizers/src/utils/cache.rs +++ b/tokenizers/src/utils/cache.rs @@ -56,8 +56,6 @@ where } else{ use_default_capacity }; - let h_format = capacity / (1024 * 1024 * 1024); - println!("Using capacity {h_format} (nb of elements)"); let map = RwLock::new(HashMap::with_capacity(capacity)); Cache { map, capacity } } @@ -138,13 +136,9 @@ fn default_cache_capacity() -> usize { // Get the sizes of the key and value types in bytes let key_size = mem::size_of::(); let value_size = mem::size_of::(); - println!("{key_size}bytes, {value_size}bytes"); let entry_size = key_size + value_size; - // Total available memory in bytes (from KB to bytes) let available_memory_bytes = ((total_memory as f64* 0.90) as usize / 64) * entry_size ; - let h_format = available_memory_bytes/ (1024 * 1024 * 1024); - println!("Available memory: {h_format}GB"); return available_memory_bytes /entry_size }