From 9441f7e8f7f5788aa56a00373a346d0bea71e282 Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Thu, 20 Jun 2024 14:33:21 +0200
Subject: [PATCH] make sure we don't warn on empty tokens (#1554)

* make sure we don't warn on empty tokens

* Testing the log is actually hard :sweat:

* mpty
---
 tokenizers/Cargo.toml                     |  2 ++
 tokenizers/src/tokenizer/serialization.rs | 30 +++++++++++++++--------
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 07cc85d1b..038486880 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -75,6 +75,8 @@ unstable_wasm = ["fancy-regex", "getrandom/js"]
 criterion = "0.5"
 tempfile = "3.10"
 assert_approx_eq = "1.1"
+tracing = "0.1"
+tracing-subscriber = "0.3.18"
 
 [profile.release]
 lto = "fat"
diff --git a/tokenizers/src/tokenizer/serialization.rs b/tokenizers/src/tokenizer/serialization.rs
index f08783dd3..c3ad5b410 100644
--- a/tokenizers/src/tokenizer/serialization.rs
+++ b/tokenizers/src/tokenizer/serialization.rs
@@ -155,17 +155,15 @@ where
             for token in &tokens {
                 // Warn the user if the id is different than expected
                 let received_id = tokenizer.token_to_id(&token.token.content);
-                if received_id != Some(token.id) {
-                    warn!(
-                        "Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
-                        token.token.content,
-                        token.id,
-                        if let Some(rid) = received_id {
+                if let Some(rid) = received_id {
+                    if rid != token.id {
+                        warn!(
+                            "Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
+                            token.token.content,
+                            token.id,
                             rid.to_string()
-                        } else {
-                            "None".to_string()
-                        }
-                    );
+                        );
+                    }
                 }
             }
             let added_tokens: Vec<_> = tokens.into_iter().map(|token| token.token).collect();
@@ -179,6 +177,7 @@ where
 mod tests {
     use crate::tokenizer::Tokenizer;
     use std::str::FromStr;
+    use tracing_subscriber::fmt;
 
     #[test]
     fn test_deserialization_serialization_invariant() {
@@ -233,4 +232,15 @@ mod tests {
         // It should be exactly the same as above
         assert_eq!(tok_str, tok_json);
     }
+
+    #[cfg(feature = "http")]
+    #[test]
+    fn test_from_pretrained() {
+        fmt()
+            .with_max_level(tracing::Level::DEBUG)
+            .with_target(false)
+            .init();
+        let _ = Tokenizer::from_pretrained("Qwen/Qwen2-7B-Instruct", None);
+        warn!("This should be the first warning");
+    }
 }