Skip to content

Commit

Permalink
Remove default chat templates
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova committed Aug 7, 2024
1 parent 38a3bf6 commit 9df84c4
Showing 1 changed file with 22 additions and 53 deletions.
75 changes: 22 additions & 53 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -2502,8 +2502,6 @@ function truncateHelper(item, length) {
export class PreTrainedTokenizer extends Callable {
return_token_type_ids = false;

_default_chat_template = `{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}`;

padding_side = 'right';
/**
* Create a new PreTrainedTokenizer instance.
Expand Down Expand Up @@ -3059,26 +3057,10 @@ export class PreTrainedTokenizer extends Callable {

return decoded;
}

get default_chat_template() {
if (!this._warned_about_chat_template) {
console.warn(
"No chat template is defined for this tokenizer - using a default chat template " +
"that implements the ChatML format. If the default is not appropriate for " +
"your model, please set `tokenizer.chat_template` to an appropriate template. " +
"See https://huggingface.co/docs/transformers/main/chat_templating for more information."
)
this._warned_about_chat_template = true; // TODO move to logger.warning_once()
}

return this._default_chat_template;
}

/**
* Converts a list of message objects with `"role"` and `"content"` keys to a list of token
* ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
* determine the format and control tokens to use when converting. When chat_template is None, it will fall back
* to the default_chat_template specified at the class level.
* determine the format and control tokens to use when converting.
*
* See [here](https://huggingface.co/docs/transformers/chat_templating) for more information.
*
Expand All @@ -3105,7 +3087,7 @@ export class PreTrainedTokenizer extends Callable {
* @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys.
* @param {Object} options An optional object containing the following properties:
* @param {string} [options.chat_template=null] A Jinja template to use for this conversion. If
* this is not passed, the model's default chat template will be used instead.
* this is not passed, the model's chat template will be used instead.
* @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the token(s) that indicate
* the start of an assistant message. This is useful when you want to generate a response from the model.
* Note that this argument will be passed to the chat template, and so it must be supported in the
Expand Down Expand Up @@ -3135,10 +3117,10 @@ export class PreTrainedTokenizer extends Callable {

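// First, handle the cases when the model has a dict of multiple templates
// NOTE(review): the condition below also admits `this.chat_template === null`, in which case
// `template_dict` is null and `Object.hasOwn(template_dict, chat_template)` throws a TypeError
// whenever a `chat_template` argument is passed — confirm whether null should reach this branch.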
if (
(this.chat_template && typeof this.chat_template === 'object') ||
(this.chat_template === null && this.default_chat_template && typeof this.default_chat_template === 'object')
(this.chat_template && typeof this.chat_template === 'object')
|| this.chat_template === null
) {
const template_dict = this.chat_template ?? this.default_chat_template; // Guaranteed to be a non-null object
const template_dict = this.chat_template;

if (chat_template !== null && Object.hasOwn(template_dict, chat_template)) {
// The user can pass the name of a template to the chat template argument instead of an entire template
Expand All @@ -3154,8 +3136,17 @@ export class PreTrainedTokenizer extends Callable {
}
} else {
// These are the cases when the model has a single template
// priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template`
chat_template ??= this.chat_template ?? this.default_chat_template;
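// Intended priority: `chat_template` argument > `tokenizer.chat_template`
// NOTE(review): the assignment below runs even when a `chat_template` argument was passed,
// so the tokenizer's template overrides the argument (and the error is thrown even if an
// argument was supplied) — confirm and guard with `chat_template === null` if unintended.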
if (this.chat_template) {
chat_template = this.chat_template;
} else {
throw Error(
"Cannot use apply_chat_template() because tokenizer.chat_template is not set and no template " +
"argument was passed! For information about writing templates and setting the " +
"tokenizer.chat_template attribute, please see the documentation at " +
"https://huggingface.co/docs/transformers/main/en/chat_templating"
)
}
}
if (typeof chat_template !== 'string') {
throw Error(`chat_template must be a string, but got ${typeof chat_template}`);
Expand Down Expand Up @@ -3250,9 +3241,7 @@ export class ElectraTokenizer extends PreTrainedTokenizer {
}

export class T5Tokenizer extends PreTrainedTokenizer { }
export class GPT2Tokenizer extends PreTrainedTokenizer {
_default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`
}
export class GPT2Tokenizer extends PreTrainedTokenizer { }
export class BartTokenizer extends PreTrainedTokenizer { }
export class MBartTokenizer extends PreTrainedTokenizer {
constructor(tokenizerJSON, tokenizerConfig) {
Expand All @@ -3278,7 +3267,7 @@ export class MBart50Tokenizer extends MBartTokenizer { } // NOTE: extends MBartT

export class RobertaTokenizer extends PreTrainedTokenizer { }

export class BloomTokenizer extends GPT2Tokenizer { // NOTE: `GPT2Tokenizer` to get the correct chat template
export class BloomTokenizer extends PreTrainedTokenizer {

constructor(tokenizerJSON, tokenizerConfig) {
// Override the default (invalid) regex of the pretokenizer.
Expand All @@ -3295,20 +3284,11 @@ export class BloomTokenizer extends GPT2Tokenizer { // NOTE: `GPT2Tokenizer` to
const SPIECE_UNDERLINE = "▁";

export class LlamaTokenizer extends PreTrainedTokenizer {
_default_chat_template = `{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}`

DEFAULT_SYSTEM_PROMPT =
"You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " +
"answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure " +
"that your responses are socially unbiased and positive in nature.\n\n" +
"If a question does not make any sense, or is not factually coherent, explain why instead of answering something not " +
"correct. If you don't know the answer to a question, please don't share false information."

padding_side = 'left';

constructor(tokenizerJSON, tokenizerConfig) {
super(tokenizerJSON, tokenizerConfig);
this.use_default_system_prompt = tokenizerConfig.use_default_system_prompt ?? false;

this.legacy = tokenizerConfig.legacy ?? true;
if (!this.legacy) {
Expand Down Expand Up @@ -3341,14 +3321,8 @@ export class LlamaTokenizer extends PreTrainedTokenizer {
}
return tokens;
}

get default_chat_template() {
return super.default_chat_template
.replaceAll('USE_DEFAULT_PROMPT', this.use_default_system_prompt ? 'true' : 'false')
.replaceAll('DEFAULT_SYSTEM_MESSAGE', this.DEFAULT_SYSTEM_PROMPT.replaceAll("\n", "\\n").replaceAll("'", "\\'"));
}
}
export class CodeLlamaTokenizer extends LlamaTokenizer { } // NOTE: `LlamaTokenizer` to get the correct chat template
export class CodeLlamaTokenizer extends PreTrainedTokenizer { }

export class XLMRobertaTokenizer extends PreTrainedTokenizer { }
export class MPNetTokenizer extends PreTrainedTokenizer { }
Expand All @@ -3361,9 +3335,7 @@ export class EsmTokenizer extends PreTrainedTokenizer { }

export class Qwen2Tokenizer extends PreTrainedTokenizer { }

export class GemmaTokenizer extends PreTrainedTokenizer {
_default_chat_template = "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"
}
export class GemmaTokenizer extends PreTrainedTokenizer { }

export class Grok1Tokenizer extends PreTrainedTokenizer { }

Expand Down Expand Up @@ -3491,7 +3463,6 @@ export class M2M100Tokenizer extends PreTrainedTokenizer {
* @extends PreTrainedTokenizer
*/
export class WhisperTokenizer extends PreTrainedTokenizer {
_default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}`;

get timestamp_begin() {
return this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0] + 1;
Expand Down Expand Up @@ -4284,10 +4255,8 @@ export class MarianTokenizer extends PreTrainedTokenizer {

export class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer { }

export class BlenderbotTokenizer extends PreTrainedTokenizer {
_default_chat_template = `{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}`;
}
export class BlenderbotSmallTokenizer extends BlenderbotTokenizer { } // NOTE `BlenderbotTokenizer` to get the correct chat template
export class BlenderbotTokenizer extends PreTrainedTokenizer { }
export class BlenderbotSmallTokenizer extends PreTrainedTokenizer { }

export class SpeechT5Tokenizer extends PreTrainedTokenizer { }

Expand Down

0 comments on commit 9df84c4

Please sign in to comment.