Skip to content

Commit

Permalink
don't repeat yourself
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Apr 15, 2024
1 parent 574b608 commit 78ce8be
Showing 1 changed file with 11 additions and 18 deletions.
29 changes: 11 additions & 18 deletions tokenizers/src/models/bpe/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -461,26 +461,19 @@ impl BPE {

fn tokenize_with_cache(&self, sequence: &str) -> Result<Vec<Token>> {
if let Some(ref hit) = self.cache.as_ref().and_then(|c| c.get(sequence)) {
Ok(self.word_to_tokens(hit).collect())
} else if let Some(id) = self.vocab.get(sequence) {
if self.ignore_merges {
Ok(vec![Token::new(*id, sequence.to_string().clone(), (0, 0))])
} else {
let word = self.merge_word(sequence)?;
let ret = self.word_to_tokens(&word).collect();
if let Some(ref cache) = self.cache {
cache.set(sequence.to_owned(), word);
}
Ok(ret)
}
} else {
let word = self.merge_word(sequence)?;
let ret = self.word_to_tokens(&word).collect();
if let Some(ref cache) = self.cache {
cache.set(sequence.to_owned(), word);
return Ok(self.word_to_tokens(hit).collect());
}
if self.ignore_merges {
if let Some(id) = self.vocab.get(sequence) {
return Ok(vec![Token::new(*id, sequence.to_string().clone(), (0, 0))]);
}
Ok(ret)
}
let word = self.merge_word(sequence)?;
let ret = self.word_to_tokens(&word).collect();
if let Some(ref cache) = self.cache {
cache.set(sequence.to_owned(), word);
}
Ok(ret)
}
}

Expand Down

0 comments on commit 78ce8be

Please sign in to comment.