Add deserialization tests for JSON with a missing or incomplete "type" tag.

pre_tokenizers: checks that PreTokenizerWrapper rejects an untyped
Metaspace-like object, accepts {"type":"Metaspace","replacement":"▁"} as
Metaspace::default(), rejects a typed Metaspace object that lacks
"replacement", and rejects an untyped {"behavior":...} object.

processors: checks that PostProcessorWrapper rejects an untyped object with
only add_prefix_space/trim_offsets/use_regex, resolves untyped sep/cls objects
to the Bert variant (or Roberta when trim_offsets and add_prefix_space are
also present), and rejects a typed RobertaProcessing object that only has
"sep".

diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs
index cf64fb876..c12646102 100644
--- a/tokenizers/src/pre_tokenizers/mod.rs
+++ b/tokenizers/src/pre_tokenizers/mod.rs
@@ -144,4 +144,43 @@ mod tests {
             PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit {})
         );
     }
+
+    #[test]
+    fn pre_tokenizer_deserialization_no_type() {
+        let json = r#"{"replacement":"▁","add_prefix_space":true, "prepend_scheme":"always"}}"#;
+        let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json);
+        match reconstructed {
+            Err(err) => assert_eq!(
+                err.to_string(),
+                "data did not match any variant of untagged enum PreTokenizerWrapper"
+            ),
+            _ => panic!("Expected an error here"),
+        }
+
+        let json = r#"{"type":"Metaspace", "replacement":"▁" }"#;
+        let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json);
+        assert_eq!(
+            reconstructed.unwrap(),
+            PreTokenizerWrapper::Metaspace(Metaspace::default())
+        );
+
+        let json = r#"{"type":"Metaspace", "add_prefix_space":true }"#;
+        let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json);
+        match reconstructed {
+            Err(err) => assert_eq!(
+                err.to_string(),
+                "data did not match any variant of untagged enum PreTokenizerWrapper"
+            ),
+            _ => panic!("Expected an error here"),
+        }
+        let json = r#"{"behavior":"default_split"}"#;
+        let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json);
+        match reconstructed {
+            Err(err) => assert_eq!(
+                err.to_string(),
+                "data did not match any variant of untagged enum PreTokenizerWrapper"
+            ),
+            _ => panic!("Expected an error here"),
+        }
+    }
 }
diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs
index 130a537ba..869cc6891 100644
--- a/tokenizers/src/processors/mod.rs
+++ b/tokenizers/src/processors/mod.rs
@@ -87,4 +87,42 @@ mod tests {
             PostProcessorWrapper::Bert(bert)
         );
     }
+
+    #[test]
+    fn post_processor_deserialization_no_type() {
+        let json = r#"{"add_prefix_space": true, "trim_offsets": false, "use_regex": false}"#;
+        let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json);
+        match reconstructed {
+            Err(err) => assert_eq!(
+                err.to_string(),
+                "data did not match any variant of untagged enum PostProcessorWrapper"
+            ),
+            _ => panic!("Expected an error here"),
+        }
+
+        let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}"#;
+        let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json);
+        assert!(matches!(
+            reconstructed.unwrap(),
+            PostProcessorWrapper::Bert(_)
+        ));
+
+        let json =
+            r#"{"sep":["</s>",2], "cls":["<s>",0], "trim_offsets":true, "add_prefix_space":true}"#;
+        let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json);
+        assert!(matches!(
+            reconstructed.unwrap(),
+            PostProcessorWrapper::Roberta(_)
+        ));
+
+        let json = r#"{"type":"RobertaProcessing", "sep":["</s>",2] }"#;
+        let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json);
+        match reconstructed {
+            Err(err) => assert_eq!(
+                err.to_string(),
+                "data did not match any variant of untagged enum PostProcessorWrapper"
+            ),
+            _ => panic!("Expected an error here"),
+        }
+    }
 }