Skip to content

Commit

Permalink
nits
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Dec 18, 2023
1 parent ed302a8 commit a581a04
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 3 deletions.
4 changes: 2 additions & 2 deletions tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ impl AddedVocabulary {
special_tokens_set: HashSet::new(),
split_trie: (trie, vec![]),
split_normalized_trie: (normalized_trie, vec![]),
encode_special_tokens: false,
encode_special_tokens: true,
}
}
/// Size of the additional vocabulary
Expand Down Expand Up @@ -370,7 +370,7 @@ impl AddedVocabulary {
let id = split_re.1[aho_id];
let added_token = &self.added_tokens_map_r.get(&id).unwrap();

if self.encode_special_tokens && added_token.special {
if self.encode_special_tokens && self.special_tokens_set.contains(&added_token.content) {
continue;
}

Expand Down
6 changes: 5 additions & 1 deletion tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,11 @@ where
pub fn id_to_token(&self, id: u32) -> Option<String> {
    // The added vocabulary performs the lookup; it is handed the underlying
    // model alongside the id.
    let model = &self.model;
    self.added_vocabulary.id_to_token(id, model)
}


/// Set the added vocabulary's splitting scheme.
///
/// Forwards `value` unchanged to the added vocabulary's
/// `set_encode_special_tokens`.
pub fn set_encode_special_tokens(&mut self, value: bool) {
    self.added_vocabulary.set_encode_special_tokens(value);
}
/// Encode a single sequence
fn encode_single_sequence(
&self,
Expand Down

0 comments on commit a581a04

Please sign in to comment.