From 68ed7f6cbb94a16128237a92103a8705e1c4cb1b Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 21 Feb 2024 16:22:22 +0200 Subject: [PATCH] Add Gemma Tokenizer (#598) * Fix styling for whitespace tokens * Add `GemmaTokenizer` * Update minimum `@huggingface/jinja` version * Add Gemma to tokenizer playground * Add Gemma tokenizer unit test * Update tokenizer names in playground * Update Gemma tokenizer test --- examples/tokenizer-playground/src/App.jsx | 3 ++- examples/tokenizer-playground/src/components/Token.jsx | 2 +- package-lock.json | 8 ++++---- package.json | 2 +- src/tokenizers.js | 5 +++++ tests/generate_tests.py | 3 +++ 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/examples/tokenizer-playground/src/App.jsx b/examples/tokenizer-playground/src/App.jsx index fd6deb077..c6546d20f 100644 --- a/examples/tokenizer-playground/src/App.jsx +++ b/examples/tokenizer-playground/src/App.jsx @@ -70,7 +70,8 @@ function App() { - + + diff --git a/examples/tokenizer-playground/src/components/Token.jsx b/examples/tokenizer-playground/src/components/Token.jsx index b49fa7a07..579b000f9 100644 --- a/examples/tokenizer-playground/src/components/Token.jsx +++ b/examples/tokenizer-playground/src/components/Token.jsx @@ -14,7 +14,7 @@ export function Token({ text, position, margin }) { + className={`leading-5 ${COLOURS[position % COLOURS.length]}`}> {text} ) :
) diff --git a/package-lock.json b/package-lock.json index c953c7232..49fb65a51 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,7 @@ "version": "2.15.0", "license": "Apache-2.0", "dependencies": { - "@huggingface/jinja": "^0.1.0", + "@huggingface/jinja": "^0.1.3", "onnxruntime-web": "1.14.0", "sharp": "^0.32.0" }, @@ -745,9 +745,9 @@ } }, "node_modules/@huggingface/jinja": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.1.0.tgz", - "integrity": "sha512-NgZ0imvGPHblw+nFJN2eC+so0DmvLSEieldI7gjZZbBUDE80ypG1O+DibdeWne1vQuGBYV/pC3XL//SgxiXC7g==", + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.1.3.tgz", + "integrity": "sha512-9KsiorsdIK8+7VmlamAT7Uh90zxAhC/SeKaKc80v58JhtPYuwaJpmR/ST7XAUxrHAFqHTCoTH5aJnJDwSL6xIQ==", "engines": { "node": ">=18" } diff --git a/package.json b/package.json index ff14f5c97..6fe63e41f 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,7 @@ "dependencies": { "onnxruntime-web": "1.14.0", "sharp": "^0.32.0", - "@huggingface/jinja": "^0.1.0" + "@huggingface/jinja": "^0.1.3" }, "optionalDependencies": { "onnxruntime-node": "1.14.0" } diff --git a/src/tokenizers.js b/src/tokenizers.js index 563e39319..9692cf3b0 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -3204,6 +3204,10 @@ export class EsmTokenizer extends PreTrainedTokenizer { } export class Qwen2Tokenizer extends PreTrainedTokenizer { } +export class GemmaTokenizer extends PreTrainedTokenizer { + _default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | 
trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}" +} + /** * Helper function to build translation inputs for an `NllbTokenizer` or `M2M100Tokenizer`. * @param {PreTrainedTokenizer} self The tokenizer instance. @@ -4309,6 +4313,7 @@ export class AutoTokenizer { NougatTokenizer, VitsTokenizer, Qwen2Tokenizer, + GemmaTokenizer, // Base case: PreTrainedTokenizer, diff --git a/tests/generate_tests.py b/tests/generate_tests.py index 81836b901..c449d34be 100644 --- a/tests/generate_tests.py +++ b/tests/generate_tests.py @@ -36,6 +36,9 @@ # Uses a pretokenizer regex which is not compatible with JavaScript. 'Qwen/Qwen1.5-0.5B-Chat', ], + 'gemma': [ + 'Xenova/gemma-tokenizer', + ], } MODELS_TO_IGNORE = [