// From src/tokenizers.js (patch hunk @@ -2926,6 +2926,24 @@).

/**
 * Tokenizer for Llama models.
 *
 * Adds an optional default system prompt to the chat template: when
 * `use_default_system_prompt` is enabled in the tokenizer config and the
 * conversation neither starts with a system message nor already contains a
 * `<<SYS>>` marker in its first message, `DEFAULT_SYSTEM_PROMPT` is injected.
 */
export class LlamaTokenizer extends PreTrainedTokenizer {
    /**
     * Raw Jinja chat template. `USE_DEFAULT_PROMPT` and `DEFAULT_SYSTEM_MESSAGE`
     * are placeholders that the `default_chat_template` getter substitutes.
     * The system prompt is wrapped in the Llama-2 `<<SYS>>` / `<</SYS>>` markers.
     */
    _default_chat_template = `{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}`;

    /**
     * System prompt substituted for `DEFAULT_SYSTEM_MESSAGE` in the chat template.
     * It only takes effect when `use_default_system_prompt` is true.
     */
    DEFAULT_SYSTEM_PROMPT =
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " +
        "answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure " +
        "that your responses are socially unbiased and positive in nature.\n\n" +
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not " +
        "correct. If you don't know the answer to a question, please don't share false information.";

    /**
     * @param {Object} tokenizerJSON Tokenizer definition, forwarded unchanged to the parent constructor.
     * @param {Object} tokenizerConfig Tokenizer configuration, forwarded to the parent constructor.
     * Its optional `use_default_system_prompt` flag (default `false`) controls whether
     * `DEFAULT_SYSTEM_PROMPT` is injected by the chat template.
     */
    constructor(tokenizerJSON, tokenizerConfig) {
        super(tokenizerJSON, tokenizerConfig);
        this.use_default_system_prompt = tokenizerConfig.use_default_system_prompt ?? false;
    }

    /**
     * The parent's default chat template with both placeholders filled in.
     * NOTE(review): presumably the parent getter returns `_default_chat_template` — confirm.
     *
     * @returns {string} The chat template with `USE_DEFAULT_PROMPT` replaced by
     * `true`/`false` and `DEFAULT_SYSTEM_MESSAGE` replaced by the escaped prompt.
     */
    get default_chat_template() {
        const useDefaultPrompt = this.use_default_system_prompt ? 'true' : 'false';

        // The prompt lands inside a single-quoted template string literal, so
        // newlines and single quotes must be escaped.
        const escapedPrompt = this.DEFAULT_SYSTEM_PROMPT
            .replaceAll("\n", "\\n")
            .replaceAll("'", "\\'");

        return super.default_chat_template
            .replaceAll('USE_DEFAULT_PROMPT', useDefaultPrompt)
            // Function replacer: inserts the prompt verbatim, so a `$` in a
            // (possibly overridden) prompt is never read as a replacement pattern.
            .replaceAll('DEFAULT_SYSTEM_MESSAGE', () => escapedPrompt);
    }
}
export class CodeLlamaTokenizer extends LlamaTokenizer { } // NOTE: `LlamaTokenizer` to get the correct chat template