diff --git a/docs/source/en/model_doc/telechat2.md b/docs/source/en/model_doc/telechat2.md
index 4a9f0ead453f82..a28e8e0a9a30bd 100644
--- a/docs/source/en/model_doc/telechat2.md
+++ b/docs/source/en/model_doc/telechat2.md
@@ -18,21 +18,54 @@ rendered properly in your Markdown viewer.
 
 ## Overview
 
-The TeleChat2 model was proposed in []() by .
-
+The TeleChat2 model was proposed in [TELECHAT TECHNICAL REPORT](https://arxiv.org/pdf/2401.03804) by TeleAI.
+
+### Summary
 
 The abstract from the paper is the following:
 
-*<INSERT PAPER ABSTRACT HERE>*
+TeleChat is a series of large language models, offering decoder-based language models in various sizes (3B, 7B, and 12B). For each size, we provide both the base pretrained model and the fine-tuned chat model aligned with human preferences. TeleChat leverages a Transformer architecture with features such as SwiGLU activation, advanced attention mechanisms (QKV bias, group query attention), and support for sliding window attention. The models are optimized for bilingual proficiency (English and Chinese) and include an enhanced tokenizer adaptable to diverse natural languages and coding formats.
+
+## Usage tips
+
+The original code for TeleChat2 can be found [here](https://huggingface.co/Tele-AI/TeleChat2-7B).
+
+In the following, we demonstrate how to use `TeleChat2-7B` for inference. Note that we use the ChatML format for dialogue; in this demo we show how to leverage `apply_chat_template` for this purpose.
+
+```python
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+>>> device = "cuda"  # the device to load the model onto
+
+>>> model = AutoModelForCausalLM.from_pretrained("Tele-AI/TeleChat2-7B", device_map="auto")
+>>> tokenizer = AutoTokenizer.from_pretrained("Tele-AI/TeleChat2-7B")
+
+>>> prompt = "Give me a short introduction to large language models."
-Tips:
+>>> messages = [{"role": "user", "content": prompt}]
-
-<INSERT TIPS ABOUT MODEL HERE>
+>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/).
-The original code can be found [here]().
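+>>> # With tokenize=False and add_generation_prompt=True, `text` is the chat-formatted
+>>> # prompt string ending with the assistant generation marker, ready to tokenize below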
+>>> model_inputs = tokenizer([text], return_tensors="pt").to(device)
+>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True)
+
+>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
+
+>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+```
 
 ## TeleChat2Config
 
 [[autodoc]] TeleChat2Config
+
+
+## TeleChat2Model
+
+[[autodoc]] TeleChat2Model
+    - forward
+
+## TeleChat2ForCausalLM
+
+[[autodoc]] TeleChat2ForCausalLM
+    - forward
+    - generate
diff --git a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
index 6955c6ec6a29e2..0e67af91855075 100755
--- a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
+++ b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
@@ -30,7 +30,9 @@
 
 TOKENIZER_CLASSES = {
     # Phi3 uses Llama tokenizer
-    name: getattr(transformers, "LlamaTokenizerFast" if name in ["Phi3Tokenizer", "TeleChat2Tokenizer"] else name + "Fast")
+    name: getattr(
+        transformers, "LlamaTokenizerFast" if name in ["Phi3Tokenizer", "TeleChat2Tokenizer"] else name + "Fast"
+    )
     for name in SLOW_TO_FAST_CONVERTERS
 }
 
diff --git a/src/transformers/models/telechat2/configuration_telechat2.py b/src/transformers/models/telechat2/configuration_telechat2.py
index 0005248b6e2e7e..b24d072c6c6c22 100644
--- a/src/transformers/models/telechat2/configuration_telechat2.py
+++ b/src/transformers/models/telechat2/configuration_telechat2.py
@@ -177,7 +177,7 @@ def __init__(
         mlp_bias=False,
         head_dim=None,
         sliding_window=None,
-        embed_layernorm = False,
+        embed_layernorm=False,
         use_sliding_window=False,
         max_window_layers=28,
         **kwargs,
diff --git a/src/transformers/models/telechat2/modeling_telechat2.py b/src/transformers/models/telechat2/modeling_telechat2.py
index 3d5e78337ad9f8..4010dabfd9cda8 100644
--- a/src/transformers/models/telechat2/modeling_telechat2.py
+++ b/src/transformers/models/telechat2/modeling_telechat2.py
@@ -261,12 +261,8 @@ def __init__(self, config: TeleChat2Config, layer_idx: int):
         self.attention_dropout = config.attention_dropout
         self.is_causal = True
 
-        self.query = nn.Linear(
-            config.hidden_size, config.num_attention_heads * self.head_dim, bias=False
-        )
-        self.key_value = nn.Linear(
-            config.hidden_size, self.head_dim * config.num_key_value_heads * 2, bias=False
-        )
+        self.query = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
+        self.key_value = nn.Linear(config.hidden_size, self.head_dim * config.num_key_value_heads * 2, bias=False)
         self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size)
 
     def forward(
@@ -378,7 +374,6 @@ def forward(
 
         return outputs
 
-
 TELECHAT2_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
@@ -518,9 +513,7 @@ def __init__(self, config: TeleChat2Config):
 
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
 
-        self.h = nn.ModuleList(
-            [TeleChat2DecoderLayer(config, i) for i in range(config.num_hidden_layers)]
-        )
+        self.h = nn.ModuleList([TeleChat2DecoderLayer(config, i) for i in range(config.num_hidden_layers)])
         self.ln_f = TeleChat2RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
         self.rotary_emb = TeleChat2RotaryEmbedding(config=config)
         self.gradient_checkpointing = False
diff --git a/tests/models/telechat2/test_modeling_telechat2.py b/tests/models/telechat2/test_modeling_telechat2.py
index c6b0bdc9881a75..c0b2fbbd088f03 100644
--- a/tests/models/telechat2/test_modeling_telechat2.py
+++ b/tests/models/telechat2/test_modeling_telechat2.py
@@ -435,7 +435,9 @@ def test_model_450m_long_prompt_sdpa(self):
         EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
         # An input with 4097 tokens that is above the size of the sliding window
         input_ids = [1] + [306, 338] * 2048
-        model = TeleChat2ForCausalLM.from_pretrained("TeleAI/TeleChat2-3B", device_map="auto", attn_implementation="sdpa")
+        model = TeleChat2ForCausalLM.from_pretrained(
+            "TeleAI/TeleChat2-3B", device_map="auto", attn_implementation="sdpa"
+        )
         input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
         generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
         self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
@@ -472,7 +474,9 @@ def test_speculative_generation(self):
         )
         prompt = "My favourite condiment is "
         tokenizer = AutoTokenizer.from_pretrained("TeleAI/TeleChat2-7B", use_fast=False)
-        model = TeleChat2ForCausalLM.from_pretrained("TeleAI/TeleChat2-3B", device_map="auto", torch_dtype=torch.float16)
+        model = TeleChat2ForCausalLM.from_pretrained(
+            "TeleAI/TeleChat2-3B", device_map="auto", torch_dtype=torch.float16
+        )
         assistant_model = TeleChat2ForCausalLM.from_pretrained(
             "TeleAI/TeleChat2-3B", device_map="auto", torch_dtype=torch.float16
         )
@@ -544,4 +548,3 @@ def test_export_static_cache(self):
         )
         ep_generated_text = tokenizer.batch_decode(ep_generated_ids, skip_special_tokens=True)
         self.assertEqual(EXPECTED_TEXT_COMPLETION, ep_generated_text)
-
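A note on the fused attention projections reformatted in `modeling_telechat2.py` above: unlike most Llama-style ports, which keep separate `k_proj`/`v_proj` layers, TeleChat2 packs keys and values into a single `key_value` projection of width `head_dim * num_key_value_heads * 2`, matching its grouped-query attention setup. The sketch below walks through the shape bookkeeping with toy dimensions; the unpacking order (a `view` onto a K/V axis followed by `unbind`) is an assumption for illustration only, and the model's actual `forward` defines the authoritative layout.

```python
import torch
from torch import nn

# Toy dimensions standing in for a TeleChat2Config; real values come from the
# checkpoint's config (hidden_size, num_attention_heads, num_key_value_heads, head_dim).
hidden_size, num_heads, num_kv_heads, head_dim = 256, 8, 2, 32
batch, seq_len = 1, 5

# Same projection shapes as in the diff: queries get head_dim per attention head,
# while keys and values share one fused projection (hence the trailing "* 2").
query = nn.Linear(hidden_size, num_heads * head_dim, bias=False)
key_value = nn.Linear(hidden_size, head_dim * num_kv_heads * 2, bias=False)

hidden_states = torch.randn(batch, seq_len, hidden_size)

# (batch, num_heads, seq_len, head_dim)
q = query(hidden_states).view(batch, seq_len, num_heads, head_dim).transpose(1, 2)

# Hypothetical unpacking: view the fused output as (..., num_kv_heads, 2, head_dim),
# split along the K/V axis, then move the head axis forward.
kv = key_value(hidden_states).view(batch, seq_len, num_kv_heads, 2, head_dim)
k, v = kv.unbind(dim=3)
k = k.transpose(1, 2)  # (batch, num_kv_heads, seq_len, head_dim)
v = v.transpose(1, 2)

print(q.shape, k.shape, v.shape)
# torch.Size([1, 8, 5, 32]) torch.Size([1, 2, 5, 32]) torch.Size([1, 2, 5, 32])
```

Because `num_key_value_heads` is smaller than `num_attention_heads`, each K/V head is shared by a group of query heads at attention time, which is why the fused `key_value` projection is narrower than `query`.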