Skip to content

Commit

Permalink
Add a test covering vocab tokens and merge pairs that contain spaces.
Browse files Browse the repository at this point in the history
  • Loading branch information
Narsil committed Aug 7, 2024
1 parent d76c0cd commit 6fe1889
Showing 1 changed file with 23 additions and 0 deletions.
23 changes: 23 additions & 0 deletions tokenizers/src/models/bpe/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,30 @@ mod test {
r#"{"type":"BPE","dropout":null,"unk_token":"<unk>","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"byte_fallback":false,"ignore_merges":true,"vocab":{"<unk>":0,"a":1,"b":2,"ab":3},"merges":[["a","b"]]}"#
);
let reconstructed = serde_json::from_str(&data).unwrap();
assert_eq!(bpe, reconstructed);

// With a space in the token
let vocab: Vocab = [
("<unk>".into(), 0),
("a".into(), 1),
("b c d".into(), 2),
("ab c d".into(), 3),
]
.iter()
.cloned()
.collect();
let bpe = BpeBuilder::default()
.vocab_and_merges(vocab, vec![("a".to_string(), "b c d".to_string())])
.unk_token("<unk>".to_string())
.ignore_merges(true)
.build()
.unwrap();
let data = serde_json::to_string(&bpe).unwrap();
assert_eq!(
data,
r#"{"type":"BPE","dropout":null,"unk_token":"<unk>","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"byte_fallback":false,"ignore_merges":true,"vocab":{"<unk>":0,"a":1,"b c d":2,"ab c d":3},"merges":[["a","b c d"]]}"#
);
let reconstructed = serde_json::from_str(&data).unwrap();
assert_eq!(bpe, reconstructed);
}

Expand Down

0 comments on commit 6fe1889

Please sign in to comment.