Skip to content

Commit

Permalink
Merges cannot handle tokens containing spaces.
Browse files Browse the repository at this point in the history
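This commit fixes that by serializing each merge as a pair of tokens, while remaining backward compatible with the legacy format in which each merge is a single space-separated string.
We don't want to merge this blindly.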
  • Loading branch information
Narsil committed Feb 16, 2022
1 parent 2fecdc1 commit 74a4c9e
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions tokenizers/src/models/bpe/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ impl Serialize for BPE {
.map(|(pair, (rank, _))| (pair, rank))
.collect();
merges.sort_unstable_by_key(|k| *k.1);
let merges_str = merges
let merges = merges
.into_iter()
.map(|(pair, _)| format!("{} {}", self.vocab_r[&pair.0], self.vocab_r[&pair.1]))
.map(|(pair, _)| (self.vocab_r[&pair.0].clone(), self.vocab_r[&pair.1].clone()))
.collect::<Vec<_>>();
let ordered_vocab = OrderedVocabIter::new(&self.vocab_r);

model.serialize_field("vocab", &ordered_vocab)?;
model.serialize_field("merges", &merges_str)?;
model.serialize_field("merges", &merges)?;

model.end()
}
Expand Down Expand Up @@ -77,7 +77,14 @@ impl<'de> Visitor<'de> for BPEVisitor {
{
let mut builder = BpeBuilder::new();
let mut vocab: Option<HashMap<String, u32>> = None;
let mut merges: Option<Vec<String>> = None;

/// The two shapes the serialized "merges" field may take when deserializing.
///
/// The enum is `untagged`, so serde tries the variants in declaration order
/// and keeps the first one that matches the input; `Tuple` is therefore
/// attempted before `Legacy`.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum MergeType {
/// Each merge is an explicit pair of token strings, used as-is.
Tuple(Vec<(String, String)>),
/// Each merge is a single string; these are passed through
/// `convert_merges_to_hashmap` together with the vocab before use.
Legacy(Vec<String>),
}
let mut merges: Option<MergeType> = None;
while let Some(key) = map.next_key::<String>()? {
match key.as_ref() {
"dropout" => {
Expand Down Expand Up @@ -120,8 +127,12 @@ impl<'de> Visitor<'de> for BPEVisitor {
}
}
if let (Some(vocab), Some(merges)) = (vocab, merges) {
let merges =
convert_merges_to_hashmap(merges.into_iter(), &vocab).map_err(Error::custom)?;
let merges = match merges {
MergeType::Tuple(merges) => merges,
MergeType::Legacy(merges) => {
convert_merges_to_hashmap(merges.into_iter(), &vocab).map_err(Error::custom)?
}
};
builder = builder.vocab_and_merges(vocab, merges);
Ok(builder.build().map_err(Error::custom)?)
} else {
Expand Down

0 comments on commit 74a4c9e

Please sign in to comment.