
Commit: better tests
ArthurZucker committed Nov 10, 2023
1 parent b2bb369 commit 948d2dd
Showing 3 changed files with 46 additions and 3 deletions.
38 changes: 36 additions & 2 deletions tokenizers/src/pre_tokenizers/metaspace.rs
@@ -1,6 +1,7 @@
 use serde::{Deserialize, Deserializer, Serialize};

 use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
+use regex::Regex;

Check warning on line 4 in tokenizers/src/pre_tokenizers/metaspace.rs (GitHub Actions, reported identically by every build and test job): unused import: `regex::Regex`

 #[derive(Debug, Clone, PartialEq, Serialize, Eq)]
 /// Replaces all the whitespaces by the provided meta character and then
@@ -85,10 +86,10 @@ impl PreTokenizer for Metaspace {

         pretokenized.split(|_, mut normalized| {
             normalized.replace(' ', &self.str_rep)?;
-            if self.add_prefix_space {
+            if self.add_prefix_space && !normalized.get().starts_with(self.replacement) {
                 if self.legacy {
                     normalized.prepend(&self.str_rep);
-                } else if self.add_prefix_space && first_split {
+                } else if first_split {
                     normalized.prepend(&self.str_rep);
                     first_split = false; // Set the flag to false after the first split
                 }
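
For context on the hunk above: the new guard skips prepending the meta character when a split already begins with it, and outside legacy mode it prepends only on the very first split. A minimal standalone sketch of that gating logic (illustrative names, not the crate's actual API):

// Sketch only: mirrors the branch structure of the diff above.
// `replacement` stands in for self.replacement ('▁'); `legacy`,
// `add_prefix_space`, and `first_split` mirror the struct fields
// and the flag captured by the closure.
fn should_prepend(
    split: &str,
    replacement: char,
    add_prefix_space: bool,
    legacy: bool,
    first_split: bool,
) -> bool {
    // Never double up: a split that already starts with '▁' is left alone.
    if !add_prefix_space || split.starts_with(replacement) {
        return false;
    }
    // Legacy mode prepends on every remaining split; the new mode only on the first.
    legacy || first_split
}

fn main() {
    assert!(should_prepend("Hey", '▁', true, false, true)); // first split gets '▁'
    assert!(!should_prepend("how", '▁', true, false, false)); // later splits do not
    assert!(should_prepend("how", '▁', true, true, false)); // legacy prepends everywhere
    assert!(!should_prepend("▁are", '▁', true, true, false)); // already prefixed, skipped
}
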
@@ -210,6 +211,39 @@ mod tests {
         );
     }

+    #[test]
+    fn non_legacy_meta_space() {
+        let mut pretok = Metaspace::new('▁', true);
+        pretok.legacy = false;
+        let mut pretokenized = PreTokenizedString::from("Hey my friend <s>how▁are you");
+        let re_ref = Regex::new(r"(<s>)").unwrap();
+        pretokenized.split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated)).expect("AddedVocabulary bad split");
+        println!("{:?}", pretokenized);
+
+        pretok.pre_tokenize(&mut pretokenized).unwrap();
+        assert_eq!(
+            pretokenized
+                .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>(),
+            vec![
+                ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("<s>", (23, 26)), ("how", (26, 29)), ("▁are", (29, 35)), ("▁you", (35, 41))
+            ]
+        );
+        pretok.legacy = true;
+        pretok.pre_tokenize(&mut pretokenized).unwrap();
+        assert_eq!(
+            pretokenized
+                .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
+                .into_iter()
+                .map(|(s, o, _)| (s, o))
+                .collect::<Vec<_>>(),
+            vec![
+                ("▁Hey", (0, 6)), ("▁my", (6, 11)), ("▁friend", (11, 20)), ("▁", (20, 23)), ("▁<s>", (23, 26)), ("▁how", (26, 29)), ("▁are", (29, 35)), ("▁you", (35, 41))
+            ]
+        );
+    }
     #[test]
     fn decode() {
         let decoder = Metaspace::new('▁', true);
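
A detail that makes the offsets in the new test above easy to verify by hand: they are byte offsets (OffsetType::Byte), and '▁' (U+2581) encodes to three bytes in UTF-8, so "▁Hey" spans (0, 6) even though it is only four characters. A quick standalone check:

fn main() {
    // '▁' (U+2581) occupies 3 bytes in UTF-8, so each prepended meta
    // character advances the byte-offset spans in the test above by 3.
    assert_eq!('▁'.len_utf8(), 3);
    assert_eq!("▁Hey".len(), 6); // matches the (0, 6) span
    assert_eq!("▁my".len(), 5); // matches the (6, 11) span
}
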
10 changes: 10 additions & 0 deletions tokenizers/src/pre_tokenizers/mod.rs
@@ -116,6 +116,16 @@ mod tests {
             pre_tokenizer,
             PreTokenizerWrapper::Metaspace(expected_pre_tokenizer)
         );
+
+        let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str(
+            r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":true}"#,
+        )
+        .unwrap();
+
+        assert_eq!(
+            pre_tokenizer,
+            PreTokenizerWrapper::Metaspace(Metaspace::new('▁', true))
+        );
     }

     #[test]
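
The new assertion above pins down the deserialization behavior: a config carrying "legacy":true compares equal to Metaspace::new('▁', true), so the constructor evidently defaults to legacy mode. The crate's actual serde plumbing is not shown in this diff, but the usual pattern for such a backward-compatible field looks roughly like this (a standalone sketch, not the crate's code):

// Standalone sketch (not the crate's code): the common serde pattern for a
// backward-compatible boolean field such as `legacy`, so that older configs
// omitting the field still deserialize and keep the old behavior.
use serde::Deserialize;

fn default_true() -> bool {
    true
}

#[derive(Debug, Deserialize, PartialEq)]
struct MetaspaceConfig {
    replacement: char,
    add_prefix_space: bool,
    #[serde(default = "default_true")]
    legacy: bool,
}

fn main() {
    // Field present: honored as written.
    let with_field: MetaspaceConfig = serde_json::from_str(
        r#"{"replacement":"▁","add_prefix_space":true,"legacy":true}"#,
    )
    .unwrap();
    assert!(with_field.legacy);

    // Field absent (an older config): falls back to the legacy default.
    let without_field: MetaspaceConfig =
        serde_json::from_str(r#"{"replacement":"▁","add_prefix_space":true}"#).unwrap();
    assert!(without_field.legacy);
}
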
1 change: 0 additions & 1 deletion tokenizers/src/tokenizer/mod.rs
@@ -697,7 +697,6 @@ where
         offsets_type: OffsetType,
     ) -> Result<Encoding> {
         let encode = |is_pre_tokenized, subseq_idx, subseq| -> Result<Encoding> {
-            // FIXME FIXME FIXME all of our SPM issues most probably come from here. The addition of the space after special tokens
             let normalized = self
                 .added_vocabulary
                 .extract_and_normalize(self.normalizer.as_ref(), subseq);
