Skip to content

Commit

Permalink
Merge branch 'main' into remove-deprecated-cached-download
Browse files Browse the repository at this point in the history
  • Loading branch information
Wauplin committed Jun 10, 2024
2 parents b40c763 + 8d28dbe commit fd7672f
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 8 deletions.
2 changes: 1 addition & 1 deletion tokenizers/src/models/bpe/trainer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ impl BpeTrainer {
.get(&new_token)
.copied()
.unwrap_or(id_to_word.len() as u32);
- if word_to_id.get(&new_token).is_none() {
+ if !word_to_id.contains_key(&new_token) {
id_to_word.push(new_token.clone());
word_to_id.insert(new_token.clone(), new_token_id);
}
Expand Down
4 changes: 2 additions & 2 deletions tokenizers/src/models/wordpiece/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,10 @@ impl WordPiece {
pub fn from_bpe(bpe: &BPE) -> Self {
let mut wp = Self::builder().vocab(bpe.get_vocab()).build().unwrap();
if let Some(unk) = bpe.get_unk_token() {
- wp.unk_token = unk.to_owned();
+ unk.clone_into(&mut wp.unk_token);
}
if let Some(prefix) = bpe.get_continuing_subword_prefix() {
- wp.continuing_subword_prefix = prefix.to_owned();
+ prefix.clone_into(&mut wp.continuing_subword_prefix);
}
wp
}
Expand Down
11 changes: 6 additions & 5 deletions tokenizers/src/utils/parallelism.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
use rayon::iter::IterBridge;
use rayon::prelude::*;
use rayon_cond::CondIterator;
+ use std::sync::atomic::AtomicBool;
+ use std::sync::atomic::Ordering;

// Re-export rayon current_num_threads
pub use rayon::current_num_threads;

pub const ENV_VARIABLE: &str = "TOKENIZERS_PARALLELISM";

- // Reading/Writing this variable should always happen on the main thread
- static mut USED_PARALLELISM: bool = false;
+ static USED_PARALLELISM: AtomicBool = AtomicBool::new(false);

/// Check if the TOKENIZERS_PARALLELISM env variable has been explicitly set
pub fn is_parallelism_configured() -> bool {
Expand All @@ -21,7 +22,7 @@ pub fn is_parallelism_configured() -> bool {

/// Check if at some point we used a parallel iterator
pub fn has_parallelism_been_used() -> bool {
- unsafe { USED_PARALLELISM }
+ USED_PARALLELISM.load(Ordering::SeqCst)
}

/// Get the currently set value for `TOKENIZERS_PARALLELISM` env variable
Expand Down Expand Up @@ -70,7 +71,7 @@ where
fn into_maybe_par_iter(self) -> CondIterator<P, S> {
let parallelism = get_parallelism();
if parallelism {
- unsafe { USED_PARALLELISM = true };
+ USED_PARALLELISM.store(true, Ordering::SeqCst);
}
CondIterator::new(self, parallelism)
}
Expand Down Expand Up @@ -159,7 +160,7 @@ where
let iter = CondIterator::from_serial(self);

if get_parallelism() {
- unsafe { USED_PARALLELISM = true };
+ USED_PARALLELISM.store(true, Ordering::SeqCst);
CondIterator::from_parallel(iter.into_parallel().right().unwrap())
} else {
iter
Expand Down

0 comments on commit fd7672f

Please sign in to comment.