
Commit

initial commit
ArthurZucker committed Aug 8, 2024
1 parent 38003ad commit b6d01b7
Showing 2 changed files with 20 additions and 0 deletions.
1 change: 1 addition & 0 deletions tokenizers/src/tokenizer/added_vocabulary.rs
@@ -453,6 +453,7 @@ impl AddedVocabulary {
        splits
    }


    fn fast_split_with_indices(
        &self,
        sentence: NormalizedString,
19 changes: 19 additions & 0 deletions tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -186,6 +186,25 @@ impl PreTokenizedString {
        }
    }

    pub fn fast_into_encoding(self) -> Result<Encoding> {
        if self.splits.is_empty() {
            Ok(Encoding::default())
        } else if !self.splits.iter().all(|split| split.tokens.is_some()) {
            Err("Split has not been tokenized.".into())
        } else {
            let tokens = self
                .splits
                .into_iter()
                .flat_map(|split| {
                    split.tokens.unwrap().into_iter().map(|token| {
                        // Keep only the token id and fill the remaining
                        // `Encoding` fields (value, offsets, word index,
                        // type id) with cheap placeholders, skipping the
                        // offset conversion that `into_encoding` performs.
                        (token.id, String::new(), (0, 0), None, 0)
                    })
                })
                .collect();
            Ok(tokens)
        }
    }

    /// Returns a list of splits, each of them being a slice of the normalized
    /// string, the associated offsets either in original or normalized
    /// referential, as well as the potential tokens
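For orientation, a minimal usage sketch of the new fast path follows. It is an illustration, not part of the commit: it assumes `PreTokenizedString`, `Token`, and `Result` are re-exported at the crate root as in the rest of the library, and the stand-in tokenizer closure (one token per split, id 0) is hypothetical.

use tokenizers::{PreTokenizedString, Result, Token};

fn main() -> Result<()> {
    // Build a PreTokenizedString from raw text; it starts as a single split.
    let mut pretok = PreTokenizedString::from("hello world");

    // Tokenize each split with a stand-in tokenizer: one token per split,
    // id 0, byte offsets spanning the whole split.
    pretok.tokenize(|normalized| {
        Ok(vec![Token::new(
            0,
            normalized.get().to_owned(),
            (0, normalized.get().len()),
        )])
    })?;

    // The fast path keeps only token ids; values, offsets, word indices,
    // and type ids come out as placeholders.
    let encoding = pretok.fast_into_encoding()?;
    println!("{:?}", encoding.get_ids());
    Ok(())
}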
