
Commit

revert and cleanup
ArthurZucker committed Jul 24, 2024
1 parent 1ed26d7 commit 16af28b
Showing 5 changed files with 4 additions and 44 deletions.
tokenizers/src/lib.rs (2 changes: 0 additions & 2 deletions)
@@ -151,5 +151,3 @@ pub use utils::parallelism;
 // Re-export for from_pretrained
 #[cfg(feature = "http")]
 pub use utils::from_pretrained::FromPretrainedParameters;
-#[global_allocator]
-static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
tokenizers/src/tokenizer/added_vocabulary.rs (22 changes: 1 addition & 21 deletions)
@@ -453,26 +453,6 @@ impl AddedVocabulary {
         splits
     }

-    fn fast_split_with_indices(
-        &self,
-        sentence: NormalizedString,
-        split_re: &MatchingSet,
-    ) -> Vec<(NormalizedString, Option<Vec<Token>>)> {
-        self.find_matches(sentence.get(), split_re)
-            .into_iter()
-            .map(|(id, byte_offsets)| {
-                let slice = sentence
-                    .slice(Range::Normalized(byte_offsets.0..byte_offsets.1))
-                    .expect("AddedVocabulary bad split");
-                if let Some(id) = id {
-                    (slice, Some(vec![Token::new(id, String::new(), (0, 0))]))
-                } else {
-                    (slice, None)
-                }
-            })
-            .collect()
-    }
-
     /// Split the input sentence to extract anything we found from the `MatchingSet`, as well as
     /// the list of corresponding IDs
     /// The list of IDs have the exact same number of elements than the Iterator.
@@ -514,7 +494,7 @@ impl AddedVocabulary {
         // 1. We extract all the non-normalized tokens from the non-normalized string
         pretokenized
             .split(|_, sequence| {
-                Ok(self.fast_split_with_indices(
+                Ok(self.split_with_indices(
                     sequence,
                     &self.split_trie_vec[hash_current_thread() % MAX_NUM_THREADS],
                 ))
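Note (illustration, not part of the diff): the reverted fast path built added-vocabulary tokens with an empty value and zeroed offsets, `Token::new(id, String::new(), (0, 0))`, which discards the matched text and its byte positions, while the restored `split_with_indices` keeps them. A minimal sketch of the difference, assuming `Token` is reachable at the crate root and has the public fields used in the removed code; the id and offsets below are made-up example values:

use tokenizers::Token;

fn main() {
    // What the removed fast path produced: the id survives, but the matched
    // text and its byte offsets in the original input are lost.
    let placeholder = Token::new(28996, String::new(), (0, 0));

    // What a fully populated added token would carry instead, e.g. "[MASK]"
    // matched at bytes 6..12 of the input (illustrative values only).
    let full = Token::new(28996, "[MASK]".to_string(), (6, 12));

    assert!(placeholder.value.is_empty() && placeholder.offsets == (0, 0));
    assert_eq!(full.offsets, (6, 12));
    println!("{} @ {:?}", full.value, full.offsets);
}
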
tokenizers/src/tokenizer/mod.rs (3 changes: 2 additions & 1 deletion)
@@ -895,7 +895,7 @@ where
     ) -> Result<Encoding> {
         let mut pretokenized: PreTokenizedString = pretokenized.into();
         pretokenized.tokenize(|normalized| self.model.tokenize(normalized.get()))?;
-        pretokenized.fast_into_encoding()
+        pretokenized.into_encoding(word_idx, type_id, offsets_type)
     }
 }

@@ -1070,6 +1070,7 @@ where
             .num_threads(num_threads)
             .build()
             .unwrap();
+
         let mut encodings = pool.install(|| {
             let result = inputs
                 .into_maybe_par_iter()
tokenizers/src/tokenizer/pre_tokenizer.rs (19 changes: 0 additions & 19 deletions)
@@ -186,25 +186,6 @@ impl PreTokenizedString {
         }
     }

-    pub fn fast_into_encoding(self) -> Result<Encoding> {
-        if self.splits.is_empty() {
-            Ok(Encoding::default())
-        } else if !self.splits.iter().all(|split| split.tokens.is_some()) {
-            Err("Split has not been tokenized.".into())
-        } else {
-            let tokens = self
-                .splits
-                .into_iter()
-                .flat_map(|split| {
-                    split.tokens.unwrap().into_iter().map(|token| {
-                        // Replace this with the actual fields you need for the Encoding type
-                        (token.id, String::new(), (0, 0), None, 0)
-                    })
-                })
-                .collect();
-            Ok(tokens)
-        }
-    }
     /// Returns a list of splits, each of them being a slice of the normalized
     /// string, the associated offsets either in original or normalized
     /// referential, as well as the potention tokens
tokenizers/src/utils/parallelism.rs (2 changes: 1 addition & 1 deletion)
@@ -73,7 +73,7 @@ where
         if parallelism {
             USED_PARALLELISM.store(true, Ordering::SeqCst);
         }
-        CondIterator::new(self, true)
+        CondIterator::new(self, parallelism)
     }

     fn into_maybe_par_iter_cond(self, cond: bool) -> CondIterator<P, S> {
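Note (illustration, not part of the diff): the parallelism.rs change restores the intended behavior of the maybe-parallel iterator. The boolean passed to `CondIterator::new` (from the rayon-cond crate) selects between rayon's parallel backend and a plain serial iterator, so hard-coding `true` made the `parallelism` flag a no-op. A small standalone sketch of that behavior, using rayon-cond directly rather than the crate-internal `MaybeParallelIterator` trait; the dependency versions in the comment are assumptions:

// Assumed dependencies in Cargo.toml: rayon = "1", rayon-cond = "0.3"
use rayon_cond::CondIterator;

fn double_all(data: Vec<u64>, parallelism: bool) -> Vec<u64> {
    // The second argument picks the backend at runtime: `true` iterates on
    // rayon's thread pool, `false` stays on the current thread. Hard-coding
    // `true` here is the behavior the reverted line had introduced.
    CondIterator::new(data, parallelism).map(|x| x * 2).collect()
}

fn main() {
    let serial = double_all((0..8u64).collect(), false);
    let parallel = double_all((0..8u64).collect(), true);
    assert_eq!(serial, parallel); // same output, different execution strategy
    println!("{serial:?}");
}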
