
[pre_tokenizers] Fix sentencepiece based Metaspace #1357

Merged 49 commits on Nov 14, 2023.
Changes from 20 commits.

Commits (49)
b77e698  nits (ArthurZucker, Oct 7, 2023)
02cc535  Merge branch 'main' of https://github.com/huggingface/tokenizers into… (ArthurZucker, Oct 7, 2023)
e7ca464  allow for legacy beahaviour without making any breaking changes (ArthurZucker, Oct 7, 2023)
b18b8d5  add a todo (ArthurZucker, Oct 7, 2023)
f634484  set to legacy by default (ArthurZucker, Oct 7, 2023)
f6bcb30  skip legacy serialization (ArthurZucker, Oct 7, 2023)
08b6ff7  push correct update (ArthurZucker, Oct 7, 2023)
2327cd1  lint (ArthurZucker, Nov 10, 2023)
4f82b0d  add deserialization test (ArthurZucker, Nov 10, 2023)
3e0f2be  add a python test as well (ArthurZucker, Nov 10, 2023)
0ef1d6f  updates (ArthurZucker, Nov 10, 2023)
ed24483  fix serialization tests (ArthurZucker, Nov 10, 2023)
ba5a284  nits (ArthurZucker, Nov 10, 2023)
b2bb369  python stylijng of the tests (ArthurZucker, Nov 10, 2023)
948d2dd  better tests (ArthurZucker, Nov 10, 2023)
dbe25cd  fix offsets (ArthurZucker, Nov 10, 2023)
4309667  fix imports (ArthurZucker, Nov 10, 2023)
ca048d5  fmt (ArthurZucker, Nov 10, 2023)
5705262  update metaspace (ArthurZucker, Nov 10, 2023)
2b16353  remove TODO (ArthurZucker, Nov 10, 2023)
eaf24bb  use enm (ArthurZucker, Nov 13, 2023)
3ec7b54  fix some tses (ArthurZucker, Nov 13, 2023)
34355e5  nits (ArthurZucker, Nov 13, 2023)
8697912  use enum (ArthurZucker, Nov 14, 2023)
562227b  update tests (ArthurZucker, Nov 14, 2023)
4e714db  syling (ArthurZucker, Nov 14, 2023)
6107351  remove impl from for PrependScheme (ArthurZucker, Nov 14, 2023)
d86b036  use simple getters and setters (ArthurZucker, Nov 14, 2023)
550304a  lint (ArthurZucker, Nov 14, 2023)
a1bf2f9  update tests (ArthurZucker, Nov 14, 2023)
24d72e0  add test new == new_with_prepend_scheme (ArthurZucker, Nov 14, 2023)
ab0e427  revert a change (ArthurZucker, Nov 14, 2023)
eb2d0d8  use setters and getterts (ArthurZucker, Nov 14, 2023)
cc55193  Update bindings/python/src/pre_tokenizers.rs (ArthurZucker, Nov 14, 2023)
e68bcc3  nits (ArthurZucker, Nov 14, 2023)
4b9840b  Merge branch 'update-stuff' of https://github.com/ArthurZucker/tokeni… (ArthurZucker, Nov 14, 2023)
20b0cf4  use copy rather than ref (ArthurZucker, Nov 14, 2023)
6547fd8  nits format (ArthurZucker, Nov 14, 2023)
3ad148d  more nits (ArthurZucker, Nov 14, 2023)
4fb2cb0  allow option string (ArthurZucker, Nov 14, 2023)
d5491a0  enforce First Never Always camel cased (ArthurZucker, Nov 14, 2023)
e4d1fd6  nits (ArthurZucker, Nov 14, 2023)
0ed8d4b  refactor (ArthurZucker, Nov 14, 2023)
cc4e4e6  update test as well (ArthurZucker, Nov 14, 2023)
2f67039  fmt (ArthurZucker, Nov 14, 2023)
88d862b  nits (ArthurZucker, Nov 14, 2023)
c646978  properly error out (ArthurZucker, Nov 14, 2023)
ac646fa  Update bindings/python/src/pre_tokenizers.rs (ArthurZucker, Nov 14, 2023)
e41955b  suggestion changes (ArthurZucker, Nov 14, 2023)
10 changes: 10 additions & 0 deletions bindings/python/src/pre_tokenizers.rs
@@ -489,6 +489,16 @@ impl PyMetaspace {
setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
}

#[getter]
fn get_legacy(self_: PyRef<Self>) -> bool {
getter!(self_, Metaspace, legacy)
}

#[setter]
fn set_legacy(self_: PyRef<Self>, legacy: bool) {
setter!(self_, Metaspace, legacy, legacy);
}

#[new]
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
fn new(
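
For readers unfamiliar with the getter!/setter! macros used above: they dispatch through the bindings' pre-tokenizer wrapper, but the Python-visible effect is the same as a plain pyo3 getter/setter pair. A minimal, self-contained sketch (an illustration of the pattern, not the actual macro expansion):

use pyo3::prelude::*;

#[pyclass]
struct Example {
    legacy: bool,
}

#[pymethods]
impl Example {
    // Read side: makes obj.legacy return the Rust field.
    #[getter]
    fn get_legacy(&self) -> bool {
        self.legacy
    }

    // Write side: makes obj.legacy = False update the Rust field.
    #[setter]
    fn set_legacy(&mut self, legacy: bool) {
        self.legacy = legacy;
    }
}
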
2 changes: 2 additions & 0 deletions bindings/python/tests/bindings/test_pre_tokenizers.py
@@ -110,6 +110,8 @@ def test_can_modify(self):
assert pretok.replacement == "%"
pretok.add_prefix_space = True
assert pretok.add_prefix_space == True
pretok.legacy = False
assert pretok.legacy == False


class TestCharDelimiterSplit:
1 change: 0 additions & 1 deletion tokenizers/src/decoders/mod.rs
@@ -78,7 +78,6 @@ mod tests {
let serialized = serde_json::to_string(&decoder).unwrap();
assert_eq!(serialized, json);
}

#[test]
fn decoder_serialization_other_no_arg() {
let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true}]}"#;
78 changes: 75 additions & 3 deletions tokenizers/src/pre_tokenizers/metaspace.rs
@@ -9,10 +9,20 @@ use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitD
pub struct Metaspace {
replacement: char,
pub add_prefix_space: bool,
#[serde(skip_serializing_if = "skip_legacy_serialization")]
pub legacy: bool,
#[serde(skip)]
str_rep: String,
}

fn skip_legacy_serialization(legacy: &bool) -> bool {
*legacy // Skip serialization if legacy is true
}

fn default_legacy_value() -> bool {
true
}
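
The two helpers above implement the compatibility scheme: legacy: true (the old behavior) is skipped when serializing, and a missing field deserializes back to true, so previously serialized tokenizers load unchanged and only legacy: false is ever written out. A minimal, self-contained sketch of the same serde pattern (the real code wires the default through the manual Deserialize impl below; Flagged, is_true, and yes are illustrative names):

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Flagged {
    #[serde(skip_serializing_if = "is_true", default = "yes")]
    legacy: bool,
}

fn is_true(b: &bool) -> bool {
    *b
}

fn yes() -> bool {
    true
}

fn main() {
    // The default (legacy == true) leaves no trace in the JSON...
    assert_eq!(serde_json::to_string(&Flagged { legacy: true }).unwrap(), "{}");
    // ...so files written before the field existed keep deserializing to true.
    let old: Flagged = serde_json::from_str("{}").unwrap();
    assert_eq!(old, Flagged { legacy: true });
    // Only opting out of the legacy behavior is written out explicitly.
    assert_eq!(
        serde_json::to_string(&Flagged { legacy: false }).unwrap(),
        r#"{"legacy":false}"#
    );
}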

impl<'de> Deserialize<'de> for Metaspace {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
@@ -29,12 +39,16 @@ impl<'de> Deserialize<'de> for Metaspace {
_type: Type,
replacement: char,
pub add_prefix_space: bool,
#[serde(default = "default_legacy_value")]
pub legacy: bool,
#[serde(skip, rename = "str_rep")]
_str_rep: String,
}

let helper = MetaspaceHelper::deserialize(deserializer)?;
Ok(Self::new(helper.replacement, helper.add_prefix_space))
let mut instance = Self::new(helper.replacement, helper.add_prefix_space);
instance.legacy = helper.legacy;
Ok(instance)
}
}

@@ -44,6 +58,7 @@ impl Metaspace {
replacement,
str_rep: replacement.to_string(),
add_prefix_space,
legacy: true,
}
}

@@ -65,12 +80,18 @@ impl Default for Metaspace {

impl PreTokenizer for Metaspace {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
let mut first_split = true;

pretokenized.split(|_, mut normalized| {
normalized.replace(' ', &self.str_rep)?;
if self.add_prefix_space && !normalized.get().starts_with(self.replacement) {
normalized.prepend(&self.str_rep);
if self.legacy {
normalized.prepend(&self.str_rep);
} else if first_split {
normalized.prepend(&self.str_rep);
first_split = false; // Set the flag to false after the first split
}
}

normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext)
})
}
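
For clarity, a worked example of what the flag changes (not part of the diff; the non_legacy_meta_space test added below pins both behaviors down):

// Given input that an earlier pre-tokenizer already split around "<s>":
//
//   splits in:          ["Hey ", "<s>", "how"]
//   legacy = true   ->  ["▁Hey", "▁", "▁<s>", "▁how"]  // "▁" prepended to every split
//   legacy = false  ->  ["▁Hey", "▁", "<s>", "how"]    // only the first split gets the
//                                                      // prefix, matching sentencepiece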
@@ -103,6 +124,8 @@ impl Decoder for Metaspace {

#[cfg(test)]
mod tests {
use regex::Regex;

use super::*;
use crate::{OffsetReferential, OffsetType};

@@ -188,6 +211,55 @@ mod tests {
);
}

#[test]
fn non_legacy_meta_space() {
let mut pretok = Metaspace::new('▁', true);
pretok.legacy = false;
let mut pretokenized = PreTokenizedString::from("Hey my friend <s>how▁are you");
let re_ref = Regex::new(r"(<s>)").unwrap();
pretokenized
.split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated))
.expect("Bad split");
println!("{:?}", pretokenized);

pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
vec![
("▁Hey", (0, 6)),
("▁my", (6, 11)),
("▁friend", (11, 20)),
("▁", (20, 23)),
("<s>", (23, 26)),
("how", (26, 29)),
("▁are", (29, 35)),
("▁you", (35, 41))
]
);
pretok.legacy = true;
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
vec![
("▁Hey", (0, 6)),
("▁my", (6, 11)),
("▁friend", (11, 20)),
("▁", (20, 23)),
("▁<s>", (23, 29)),
("▁how", (29, 35)),
("▁are", (35, 41)),
("▁you", (41, 47))
]
);
}
#[test]
fn decode() {
let decoder = Metaspace::new('▁', true);
22 changes: 22 additions & 0 deletions tokenizers/src/pre_tokenizers/mod.rs
@@ -104,6 +104,28 @@ mod tests {
PreTokenizerWrapper::Metaspace(Metaspace::new('▁', true))
]))
);

let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str(
r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":false}"#,
)
.unwrap();

let mut expected_pre_tokenizer = Metaspace::new('▁', true);
expected_pre_tokenizer.legacy = false;
assert_eq!(
pre_tokenizer,
PreTokenizerWrapper::Metaspace(expected_pre_tokenizer)
);

let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str(
r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true, "legacy":true}"#,
)
.unwrap();

assert_eq!(
pre_tokenizer,
PreTokenizerWrapper::Metaspace(Metaspace::new('▁', true))
);
}

#[test]
4 changes: 4 additions & 0 deletions tokenizers/src/tokenizer/mod.rs
@@ -658,6 +658,10 @@ where

final_vocab
}
/// Get the added tokens encoder
pub fn get_added_tokens_encoder(&self) -> HashMap<String, u32> {
self.added_vocabulary.get_vocab().clone()
}

/// Get the added tokens decoder
pub fn get_added_tokens_decoder(&self) -> HashMap<u32, AddedToken> {
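
A short usage sketch of the new getter next to its existing counterpart (assuming a Tokenizer with added tokens built elsewhere; dump_added_tokens is an illustrative name):

use tokenizers::Tokenizer;

fn dump_added_tokens(tokenizer: &Tokenizer) {
    // New in this PR: added token content -> id.
    for (content, id) in tokenizer.get_added_tokens_encoder() {
        println!("{content} -> {id}");
    }
    // Pre-existing counterpart: id -> AddedToken (content plus its flags).
    for (id, token) in tokenizer.get_added_tokens_decoder() {
        println!("{id} -> {}", token.content);
    }
}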