Skip to content

Commit

Permalink
Add qwen-specific tokenizer unit test
Browse files: browse the repository at this point in the history
  • Loading branch information
xenova committed Feb 5, 2024
1 parent 8fdeb78 commit ebd3bcd
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion tests/generate_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,13 @@
# 'Xenova/t5-tokenizer-new',
],
'bert': [
# Uses `Whitespace` pretokenizer
# Uses `Whitespace` pretokenizer
'Xenova/jina-embeddings-v2-base-zh-tokenizer',
],
'qwen2': [
# Uses a pretokenizer regex which is not compatible with JavaScript.
'Qwen/Qwen1.5-0.5B-Chat',
],
}

MODELS_TO_IGNORE = [
Expand Down Expand Up @@ -137,6 +141,10 @@
# Special treatment of characters in certain languages
"ț ţ",
],

"qwen2": [
"i'm i'M i've i've i'Ve i'vE i'VE",
],
},
"custom": {
"facebook/blenderbot_small-90M": [
Expand Down

0 comments on commit ebd3bcd

Please sign in to comment.