Skip to content

Commit

Permalink
good defaults?
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Oct 21, 2024
1 parent b2c667c commit a56f73e
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 10 deletions.
12 changes: 8 additions & 4 deletions tokenizers/src/models/bpe/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct Config {
files: Option<(String, String)>,
vocab: Vocab,
merges: Merges,
cache_capacity: usize,
cache_capacity: i64,
dropout: Option<f32>,
unk_token: Option<String>,
continuing_subword_prefix: Option<String>,
Expand All @@ -43,7 +43,7 @@ impl Default for BpeBuilder {
files: None,
vocab: HashMap::new(),
merges: vec![],
cache_capacity: DEFAULT_CACHE_CAPACITY,
cache_capacity: -1,
dropout: None,
unk_token: None,
continuing_subword_prefix: None,
Expand Down Expand Up @@ -80,7 +80,7 @@ impl BpeBuilder {
/// Set the cache's capacity. Set to 0 if you want to disable caching.
///
/// Builder-style setter: takes the builder by value, records `capacity` in
/// `self.config.cache_capacity`, and returns the builder so calls can be
/// chained. `#[must_use]` warns when the returned builder is dropped.
#[must_use]
pub fn cache_capacity(mut self, capacity: usize) -> Self {
// NOTE(review): this text comes from a commit diff rendered without +/-
// markers, so the next two assignments look like the removed line followed
// by its replacement rather than two live statements — confirm against the
// actual file before relying on either.
self.config.cache_capacity = capacity;
// The config field appears as `i64` in this commit, hence the cast from the
// public `usize` argument.
// NOTE(review): `as` wraps silently for values above `i64::MAX`; presumably
// unreachable for realistic capacities — confirm, or use `i64::try_from`.
self.config.cache_capacity = capacity as i64;
self
}

Expand Down Expand Up @@ -154,7 +154,11 @@ impl BpeBuilder {
.iter()
.map(|(key, val)| (*val, key.to_owned()))
.collect();
let cache =Some(Cache::new(self.config.cache_capacity));
let cache = match self.config.cache_capacity {
0 => None,
-1 => Some(Cache::new(0)),
capacity => Some(Cache::new(capacity as usize)),
};

let vocab = self.config.vocab;
let prefix_len = if let Some(prefix) = &self.config.continuing_subword_prefix {
Expand Down
6 changes: 0 additions & 6 deletions tokenizers/src/utils/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ where
} else{
use_default_capacity
};
let h_format = capacity / (1024 * 1024 * 1024);
println!("Using capacity {h_format} (nb of elements)");
let map = RwLock::new(HashMap::with_capacity(capacity));
Cache { map, capacity }
}
Expand Down Expand Up @@ -138,13 +136,9 @@ fn default_cache_capacity<K, V>() -> usize {
// Get the sizes of the key and value types in bytes
let key_size = mem::size_of::<K>();
let value_size = mem::size_of::<V>();
println!("{key_size}bytes, {value_size}bytes");
let entry_size = key_size + value_size;

// Total available memory in bytes (from KB to bytes)
let available_memory_bytes = ((total_memory as f64* 0.90) as usize / 64) * entry_size ;
let h_format = available_memory_bytes/ (1024 * 1024 * 1024);
println!("Available memory: {h_format}GB");
return available_memory_bytes /entry_size
}

0 comments on commit a56f73e

Please sign in to comment.