Skip to content

Commit

Permalink
fix some tests
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Nov 13, 2023
1 parent eaf24bb commit 3ec7b54
Showing 1 changed file with 47 additions and 2 deletions.
49 changes: 47 additions & 2 deletions tokenizers/src/pre_tokenizers/metaspace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ impl Metaspace {
self.replacement = replacement;
self.str_rep = replacement.to_string();
}
pub fn set_prepend_scheme(&mut self, scheme: impl Into<PrependScheme>){
pub fn set_prepend_scheme(&mut self, scheme: impl Into<PrependScheme>) {
self.prepend_scheme = scheme.into();
}
}
Expand Down Expand Up @@ -302,7 +302,7 @@ mod tests {
("▁you", (35, 41))
]
);
pretok.set_prepend_scheme("never");
pretok.set_prepend_scheme("always");
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
Expand All @@ -321,6 +321,51 @@ mod tests {
("▁you", (41, 47))
]
);

pretok.set_prepend_scheme("first");
let mut pretokenized = PreTokenizedString::from(" Hey <s>how"); // test with prefix
pretokenized
.split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated))
.expect("Bad split");
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
vec![
("▁Hey", (0, 6)),
("▁", (6, 9)),
("<s>", (9, 12)),
("how", (12, 15))
]
);

let mut pretokenized = PreTokenizedString::from(" Hey <s>how <s>are <s> you"); // test with many splits
pretokenized
.split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated))
.expect("Bad split");
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
vec![
("▁Hey", (0, 6)),
("▁", (6, 9)),
("<s>", (9, 12)),
("how", (12, 15)),
("▁", (15, 18)),
("<s>", (18, 21)),
("are", (21, 24)),
("▁", (24, 27)),
("<s>", (27, 30)),
("▁you", (30, 36))
]
);
}
#[test]
fn decode() {
Expand Down

0 comments on commit 3ec7b54

Please sign in to comment.