Skip to content

Commit

Permalink
Add Gemma Tokenizer (#598)
Browse files (browse the repository at this point in the history)
* Fix styling for whitespace tokens

* Add `GemmaTokenizer`

* Update minimum `@huggingface/jinja` version

* Add Gemma to tokenizer playground

* Add Gemma tokenizer unit test

* Update tokenizer names in playground

* Update Gemma tokenizer test
  • Loading branch information
xenova authored Feb 21, 2024
1 parent 6d2808b commit 68ed7f6
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 7 deletions.
3 changes: 2 additions & 1 deletion examples/tokenizer-playground/src/App.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ function App() {
<option value="Xenova/gpt-4">gpt-4 / gpt-3.5-turbo / text-embedding-ada-002</option>
<option value="Xenova/text-davinci-003">text-davinci-003 / text-davinci-002</option>
<option value="Xenova/gpt-3">gpt-3</option>
<option value="hf-internal-testing/llama-tokenizer">LLaMA / Llama 2</option>
<option value="Xenova/gemma-tokenizer">Gemma</option>
<option value="Xenova/llama-tokenizer">LLaMA / Llama 2</option>
<option value="Xenova/t5-small">T5</option>
<option value="Xenova/bert-base-cased">bert-base-cased</option>
</select>
Expand Down
2 changes: 1 addition & 1 deletion examples/tokenizer-playground/src/components/Token.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ export function Token({ text, position, margin }) {
<span
style={{marginLeft: margin}}

className={`leading-5 inline-block ${COLOURS[position % COLOURS.length]}`}>
className={`leading-5 ${COLOURS[position % COLOURS.length]}`}>
{text}
</span>) : <br />
)
Expand Down
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"dependencies": {
"onnxruntime-web": "1.14.0",
"sharp": "^0.32.0",
"@huggingface/jinja": "^0.1.0"
"@huggingface/jinja": "^0.1.3"
},
"optionalDependencies": {
"onnxruntime-node": "1.14.0"
Expand Down
5 changes: 5 additions & 0 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -3204,6 +3204,10 @@ export class EsmTokenizer extends PreTrainedTokenizer { }

/**
 * Tokenizer class for Qwen2. The body is empty, so everything it does comes from `PreTrainedTokenizer`.
 */
export class Qwen2Tokenizer extends PreTrainedTokenizer { }

/**
 * Tokenizer class for Gemma. The only thing it adds on top of `PreTrainedTokenizer`
 * is a default chat template, written in Jinja syntax.
 *
 * What the template does, read directly from the string below:
 * - Raises 'System role not supported' if the first message has role `system`.
 * - Raises 'Conversation roles must alternate user/assistant/user/assistant/...'
 *   unless every even-indexed message (0-based) has role `user` and every
 *   odd-indexed message has a role other than `user`.
 * - Renders role `assistant` as `model`; every other role is rendered unchanged.
 * - Emits each message as `<start_of_turn>` + role + newline + content + `<end_of_turn>` + newline,
 *   with the `trim` filter applied to the message content.
 * - Appends `<start_of_turn>model` + newline when `add_generation_prompt` is truthy.
 *
 * NOTE(review): how and when `_default_chat_template` is consumed is decided by
 * `PreTrainedTokenizer`, which is not visible here; presumably it is a fallback for
 * tokenizers whose config carries no chat template of its own. Confirm against the base class.
 */
export class GemmaTokenizer extends PreTrainedTokenizer {
// The `\n` escapes are resolved by JavaScript, so the template text itself contains real newline characters.
_default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
}

/**
* Helper function to build translation inputs for an `NllbTokenizer` or `M2M100Tokenizer`.
* @param {PreTrainedTokenizer} self The tokenizer instance.
Expand Down Expand Up @@ -4309,6 +4313,7 @@ export class AutoTokenizer {
NougatTokenizer,
VitsTokenizer,
Qwen2Tokenizer,
GemmaTokenizer,

// Base case:
PreTrainedTokenizer,
Expand Down
3 changes: 3 additions & 0 deletions tests/generate_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
# Uses a pretokenizer regex which is not compatible with JavaScript.
'Qwen/Qwen1.5-0.5B-Chat',
],
'gemma': [
'Xenova/gemma-tokenizer',
],
}

MODELS_TO_IGNORE = [
Expand Down

0 comments on commit 68ed7f6

Please sign in to comment.