Skip to content

Commit

Permalink
Merges cannot handle tokens containing spaces.
Browse files Browse the repository at this point in the history
This change fixes that while keeping backward compatibility.
We don't want to merge that blindly.
  • Loading branch information
Narsil committed Nov 23, 2022
1 parent c74e9e6 commit 0f617cc
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions tokenizers/src/models/bpe/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ impl Serialize for BPE {
.map(|(pair, (rank, _))| (pair, rank))
.collect();
merges.sort_unstable_by_key(|k| *k.1);
let merges_str = merges
let merges = merges
.into_iter()
.map(|(pair, _)| format!("{} {}", self.vocab_r[&pair.0], self.vocab_r[&pair.1]))
.map(|(pair, _)| (self.vocab_r[&pair.0].clone(), self.vocab_r[&pair.1].clone()))
.collect::<Vec<_>>();
let ordered_vocab = OrderedVocabIter::new(&self.vocab_r);

model.serialize_field("vocab", &ordered_vocab)?;
model.serialize_field("merges", &merges_str)?;
model.serialize_field("merges", &merges)?;

model.end()
}
Expand Down Expand Up @@ -77,7 +77,14 @@ impl<'de> Visitor<'de> for BPEVisitor {
{
let mut builder = BpeBuilder::new();
let mut vocab: Option<HashMap<String, u32>> = None;
let mut merges: Option<Vec<String>> = None;

/// The encodings accepted for the serialized `merges` field.
///
/// Because the enum is `untagged`, serde tries the variants in declaration
/// order, so the tuple form is attempted before the legacy string form.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum MergeType {
/// Each merge is an explicit pair of tokens; this form exists so that
/// tokens containing spaces can be represented.
Tuple(Vec<(String, String)>),
/// Each merge is a single string; kept for backward compatibility and
/// converted through `convert_merges_to_hashmap` before use.
Legacy(Vec<String>),
}
let mut merges: Option<MergeType> = None;
while let Some(key) = map.next_key::<String>()? {
match key.as_ref() {
"dropout" => {
Expand Down Expand Up @@ -120,8 +127,12 @@ impl<'de> Visitor<'de> for BPEVisitor {
}
}
if let (Some(vocab), Some(merges)) = (vocab, merges) {
let merges =
convert_merges_to_hashmap(merges.into_iter(), &vocab).map_err(Error::custom)?;
let merges = match merges {
MergeType::Tuple(merges) => merges,
MergeType::Legacy(merges) => {
convert_merges_to_hashmap(merges.into_iter(), &vocab).map_err(Error::custom)?
}
};
builder = builder.vocab_and_merges(vocab, merges);
Ok(builder.build().map_err(Error::custom)?)
} else {
Expand Down

0 comments on commit 0f617cc

Please sign in to comment.