Skip to content

Commit

Permalink
fix everything
Browse files: browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Jul 12, 2024
1 parent 8d4fae8 commit 7ff8935
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 15 deletions.
1 change: 0 additions & 1 deletion bindings/python/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1217,7 +1217,6 @@ impl PyTokenizer {
processed_old_tokens.push(old_token);
processed_new_tokens.push(new_token);
}

Ok(self
.tokenizer
.assign_tokens(&processed_old_tokens, &processed_new_tokens))
Expand Down
20 changes: 8 additions & 12 deletions tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ impl AddedVocabulary {
.lock()
.unwrap()
.entry(id)
.and_modify(|t| *t = new.clone());
.and_modify(|t| t.content = new.content.clone());
self.refresh_added_tokens(model, normalizer);
} else {
error!("Error: you tried to re-assign a token that does not exist in the added vocab. Make sure {:?} is first added to the vocab", old.content.clone())
Expand All @@ -336,17 +336,12 @@ impl AddedVocabulary {
/// non-normalized string, and one matching against the normalized one.
fn refresh_added_tokens<N: Normalizer>(&mut self, model: &impl Model, normalizer: Option<&N>) {
type TupleTokenId<'a> = (&'a AddedToken, u32);
let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) = self
.added_tokens
.iter()
.map(|token| {
(
token,
self.token_to_id(&token.content, model)
.expect("Missing additional token"),
)
})
.partition(|(token, _)| token.normalized);
let added_tokens_map_r = self.added_tokens_map_r.lock().unwrap().clone();
let (normalized, non_normalized): (Vec<TupleTokenId>, Vec<TupleTokenId>) =
added_tokens_map_r
.iter()
.map(|(id, token)| (token, *id))
.partition(|(token, _)| token.normalized);

let (tokens, ids): (Vec<&AddedToken>, Vec<u32>) = non_normalized.into_iter().unzip();
let trie = AhoCorasickBuilder::new()
Expand All @@ -363,6 +358,7 @@ impl AddedVocabulary {
if let Some(n) = normalizer {
n.normalize(&mut content).unwrap();
}
println!("{:?}", token);
content
})
.collect();
Expand Down
2 changes: 0 additions & 2 deletions tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -541,9 +541,7 @@ where
model,
post_processor: None,
decoder: None,

added_vocabulary: AddedVocabulary::new(),

truncation: None,
padding: None,
}
Expand Down

0 comments on commit 7ff8935

Please sign in to comment.