From 021eec2ad261bf6d71edffc8ba5942a1089ccc5f Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:02:26 +0200 Subject: [PATCH 01/14] add tests --- tokenizers/src/decoders/mod.rs | 7 +++++++ tokenizers/src/pre_tokenizers/mod.rs | 6 ++++++ tokenizers/src/processors/mod.rs | 6 ++++++ 3 files changed, 19 insertions(+) diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index 682e63b50..431ae8b82 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -96,4 +96,11 @@ mod tests { let json = r#"{"type":"Sequence","decoders":[{},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always"}]}"#; assert!(serde_json::from_str::<DecoderWrapper>(json).is_err()); } + + + #[test] + fn decoder_deserialization_no_type() { + let json = r#"{"replacement":"▁","prepend_scheme":"always"}"#; + assert!(serde_json::from_str::<DecoderWrapper>(json).is_err()); + } } diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index cf64fb876..d59ff39ef 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -144,4 +144,10 @@ mod tests { PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit {}) ); } + + #[test] + fn pre_tokenizer_deserialization_no_type() { + let json = r#"{"replacement":"▁","add_prefix_space":true, "prepend_scheme":"always"}}"#; + assert!(serde_json::from_str::<PreTokenizerWrapper>(json).is_err()); + } } diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index 130a537ba..06ac45d80 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -87,4 +87,10 @@ mod tests { PostProcessorWrapper::Bert(bert) ); } + + #[test] + fn post_processor_deserialization_no_type() { + let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}}"#; + assert!(serde_json::from_str::<PostProcessorWrapper>(json).is_err()); + } } From 6860310df1a0cfb4e1db0b91ae36aff8e0bd8cd2 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:02:36 +0200 Subject: [PATCH 02/14] 
decoder as well --- tokenizers/src/decoders/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index 431ae8b82..c5700ae1b 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -100,7 +100,7 @@ mod tests { #[test] fn decoder_deserialization_no_type() { - let json = r#"{"replacement":"▁","prepend_scheme":"always"}"#; + let json = r#"{"replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}"#; assert!(serde_json::from_str::<DecoderWrapper>(json).is_err()); } } From b580f96b6aeea962ca41ec605bb39ec6c02e8f82 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:05:32 +0200 Subject: [PATCH 03/14] check error --- tokenizers/src/decoders/mod.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index c5700ae1b..f1c3f7e78 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -101,6 +101,13 @@ mod tests { #[test] fn decoder_deserialization_no_type() { let json = r#"{"replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}"#; - assert!(serde_json::from_str::<DecoderWrapper>(json).is_err()); + let reconstructed = serde_json::from_str::<DecoderWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum DecoderWrapper" + ), + _ => panic!("Expected an error here"), + } } } From 0711bc6035c8a32ae673d10512ca7dd1883743af Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:06:28 +0200 Subject: [PATCH 04/14] propagate --- tokenizers/src/pre_tokenizers/mod.rs | 9 ++++++++- tokenizers/src/processors/mod.rs | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index d59ff39ef..1a414b5b3 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs 
@@ -148,6 +148,13 @@ mod tests { #[test] fn pre_tokenizer_deserialization_no_type() { let json = r#"{"replacement":"▁","add_prefix_space":true, "prepend_scheme":"always"}}"#; - assert!(serde_json::from_str::<PreTokenizerWrapper>(json).is_err()); + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PreTokenizerWrapper" + ), + _ => panic!("Expected an error here"), + } } } diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index 06ac45d80..c7224e909 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -91,6 +91,13 @@ mod tests { #[test] fn post_processor_deserialization_no_type() { let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}}"#; - assert!(serde_json::from_str::<PostProcessorWrapper>(json).is_err()); + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PostProcessorWrapper" + ), + _ => panic!("Expected an error here"), + } } } From f3067955111e7bf7d7bf88034212ac263222854d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:07:29 +0200 Subject: [PATCH 05/14] lint --- tokenizers/src/decoders/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index f1c3f7e78..8d767cfba 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -97,7 +97,6 @@ mod tests { assert!(serde_json::from_str::<DecoderWrapper>(json).is_err()); } - #[test] fn decoder_deserialization_no_type() { let json = r#"{"replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}"#; From 4ecd4d0b436b9904410805455dea470472baa5fb Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:16:53 +0200 Subject: [PATCH 06/14] refine the test --- tokenizers/src/processors/mod.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 
deletion(-) diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index c7224e909..5748233d9 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -90,7 +90,7 @@ mod tests { #[test] fn post_processor_deserialization_no_type() { - let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}}"#; + let json = r#"{"add_prefix_space": true, "trim_offsets": false, "use_regex": false}"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); match reconstructed { Err(err) => assert_eq!( @@ -99,5 +99,19 @@ mod tests { ), _ => panic!("Expected an error here"), } + + let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}"#; + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + match reconstructed { + Ok(processor) => assert!(matches!(processor, PostProcessorWrapper::Bert(_))), + Err(err) => panic!("{:?}",err) + } + + let json = r#"{"sep":["",2], "cls":["",0], "trim_offsets":true, "add_prefix_space":true}"#; + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + match reconstructed { + Ok(processor) => assert!(matches!(processor, PostProcessorWrapper::Roberta(_))), + Err(err) => panic!("{:?}",err) + } } } From 9a826147f1cf2175da75b4cff6af3e15c3820b99 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:21:05 +0200 Subject: [PATCH 07/14] lint --- tokenizers/src/processors/mod.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index 5748233d9..afdfa17f9 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -104,14 +104,25 @@ mod tests { let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); match reconstructed { Ok(processor) => assert!(matches!(processor, PostProcessorWrapper::Bert(_))), - Err(err) => panic!("{:?}",err) + Err(err) => panic!("{:?}", err), } - let json = r#"{"sep":["",2], "cls":["",0], "trim_offsets":true, "add_prefix_space":true}"#; + let json = + 
r#"{"sep":["",2], "cls":["",0], "trim_offsets":true, "add_prefix_space":true}"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); match reconstructed { Ok(processor) => assert!(matches!(processor, PostProcessorWrapper::Roberta(_))), - Err(err) => panic!("{:?}",err) + Err(err) => panic!("{:?}", err), + } + + let json = r#"{"type":"RobertaProcessing", "sep":["",2] }"#; + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PostProcessorWrapper" + ), + _ => panic!("Expected an error here"), } } } From d14cc7d9972911d2a792d93f2c8820408156f09d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:23:44 +0200 Subject: [PATCH 08/14] revert decoder changes --- tokenizers/src/decoders/mod.rs | 13 ------------- tokenizers/src/pre_tokenizers/mod.rs | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs index 8d767cfba..682e63b50 100644 --- a/tokenizers/src/decoders/mod.rs +++ b/tokenizers/src/decoders/mod.rs @@ -96,17 +96,4 @@ mod tests { let json = r#"{"type":"Sequence","decoders":[{},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always"}]}"#; assert!(serde_json::from_str::<DecoderWrapper>(json).is_err()); } - - #[test] - fn decoder_deserialization_no_type() { - let json = r#"{"replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}"#; - let reconstructed = serde_json::from_str::<DecoderWrapper>(json); - match reconstructed { - Err(err) => assert_eq!( - err.to_string(), - "data did not match any variant of untagged enum DecoderWrapper" - ), - _ => panic!("Expected an error here"), - } - } } diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 1a414b5b3..ea73aa7e3 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -156,5 +156,22 @@ mod tests { ), _ => panic!("Expected an 
error here"), } + + let json = r#"{"type":"Metaspace", "replacement":"▁" }"#; + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + match reconstructed { + Ok(processor) => assert!(matches!(processor, PreTokenizerWrapper::Metaspace(_))), + Err(err) => panic!("{:?}", err), + } + + let json = r#"{"type":"Metaspace", "add_prefix_space":true }"#; + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PreTokenizerWrapper" + ), + _ => panic!("Expected an error here"), + } } } From 669a9e8263e7ef2aafb4d2a45fbbb9d3062cd1ff Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:26:16 +0200 Subject: [PATCH 09/14] one more? --- tokenizers/src/pre_tokenizers/mod.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index ea73aa7e3..79df89b24 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -173,5 +173,14 @@ mod tests { ), _ => panic!("Expected an error here"), } + let json = r#"{"behavior":"default_split"}"#; + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PreTokenizerWrapper" + ), + _ => panic!("Expected an error here"), + } } } From 8c642ee78dbf786d75c25e83741d6a0ef7507f81 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:26:26 +0200 Subject: [PATCH 10/14] fmt --- tokenizers/src/pre_tokenizers/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 79df89b24..0d86191a5 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -181,6 +181,6 @@ mod tests { "data did not match any variant of untagged enum PreTokenizerWrapper" ), _ => 
panic!("Expected an error here"), - } + } } } From d5dc8ce75852dda6299b1b0957fa245052926fdb Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:45:47 +0200 Subject: [PATCH 11/14] Update tokenizers/src/pre_tokenizers/mod.rs Co-authored-by: Nicolas Patry --- tokenizers/src/pre_tokenizers/mod.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 0d86191a5..08506ed14 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -159,10 +159,7 @@ mod tests { let json = r#"{"type":"Metaspace", "replacement":"▁" }"#; let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); - match reconstructed { - Ok(processor) => assert!(matches!(processor, PreTokenizerWrapper::Metaspace(_))), - Err(err) => panic!("{:?}", err), - } +assert_eq!(reconstructed, PreTokenizerWrapper::Metaspace(Metaspace::default())); let json = r#"{"type":"Metaspace", "add_prefix_space":true }"#; let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); From 002bb4661824a5dea1cd2de65e7a522e91b7fb9c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:47:16 +0200 Subject: [PATCH 12/14] fix commit --- tokenizers/src/pre_tokenizers/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index 08506ed14..be257df0e 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -159,7 +159,7 @@ mod tests { let json = r#"{"type":"Metaspace", "replacement":"▁" }"#; let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); -assert_eq!(reconstructed, PreTokenizerWrapper::Metaspace(Metaspace::default())); + assert_eq!(reconstructed.unwrap(), PreTokenizerWrapper::Metaspace(Metaspace::default())); let json = r#"{"type":"Metaspace", "add_prefix_space":true }"#; let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); 
From 81b1ecbfe62449380d6653b71b7b6276923f3265 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:49:58 +0200 Subject: [PATCH 13/14] simplify err --- tokenizers/src/processors/mod.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index afdfa17f9..8fe6042c4 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -102,18 +102,12 @@ mod tests { let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); - match reconstructed { - Ok(processor) => assert!(matches!(processor, PostProcessorWrapper::Bert(_))), - Err(err) => panic!("{:?}", err), - } + assert!(matches!(reconstructed.unwrap(), PostProcessorWrapper::Bert(_))); let json = r#"{"sep":["",2], "cls":["",0], "trim_offsets":true, "add_prefix_space":true}"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); - match reconstructed { - Ok(processor) => assert!(matches!(processor, PostProcessorWrapper::Roberta(_))), - Err(err) => panic!("{:?}", err), - } + assert!(matches!(reconstructed.unwrap(), PostProcessorWrapper::Roberta(_))); let json = r#"{"type":"RobertaProcessing", "sep":["",2] }"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); From 14dba86643cded8b6a61a92f1652dd9914188590 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Tue, 6 Aug 2024 12:50:21 +0200 Subject: [PATCH 14/14] fmt --- tokenizers/src/pre_tokenizers/mod.rs | 5 ++++- tokenizers/src/processors/mod.rs | 10 ++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index be257df0e..c12646102 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -159,7 +159,10 @@ mod tests { let json = r#"{"type":"Metaspace", "replacement":"▁" }"#; let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); - assert_eq!(reconstructed.unwrap(), 
PreTokenizerWrapper::Metaspace(Metaspace::default())); + assert_eq!( + reconstructed.unwrap(), + PreTokenizerWrapper::Metaspace(Metaspace::default()) + ); let json = r#"{"type":"Metaspace", "add_prefix_space":true }"#; let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index 8fe6042c4..869cc6891 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -102,12 +102,18 @@ mod tests { let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); - assert!(matches!(reconstructed.unwrap(), PostProcessorWrapper::Bert(_))); + assert!(matches!( + reconstructed.unwrap(), + PostProcessorWrapper::Bert(_) + )); let json = r#"{"sep":["",2], "cls":["",0], "trim_offsets":true, "add_prefix_space":true}"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); - assert!(matches!(reconstructed.unwrap(), PostProcessorWrapper::Roberta(_))); + assert!(matches!( + reconstructed.unwrap(), + PostProcessorWrapper::Roberta(_) + )); let json = r#"{"type":"RobertaProcessing", "sep":["",2] }"#; let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json);