diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py
index c784ca0eb4ca2c..f58bf330ce7db3 100644
--- a/src/transformers/modeling_gguf_pytorch_utils.py
+++ b/src/transformers/modeling_gguf_pytorch_utils.py
@@ -106,6 +106,17 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
     if "qwen2moe" in architecture:
         updated_architecture = "qwen2_moe"
 
+    # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors
+    # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors
+    # If `use_parallel_residual=False`, ffn_norm will be present in the tensors
+    if "stablelm" in architecture:
+        attn_bias_name = {"attn_q.bias", "attn_k.bias", "attn_v.bias"}
+        ffn_norm_name = "ffn_norm"
+        qkv_bias = any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name)
+        use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors)
+        parsed_parameters["config"]["qkv_bias"] = qkv_bias
+        parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual
+
     model_size = ""
     # extract the number of params from file name as architectures can differ ;
     # eg. for falcon : `...falcon-7b-...`
diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index da1af9bff8df90..84278e7032537b 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -673,10 +673,6 @@ def test_stablelm_fp16(self):
             self.stablelm2_model_id,
             gguf_file=self.fp16_stablelm2_model_id,
             torch_dtype=torch.float16,
-            # for precise comparison it is required to use the original model config
-            # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias
-            # and it highly influences on the output results
-            config=original_model.config,
         )
 
         tokenizer = AutoTokenizer.from_pretrained(self.stablelm2_model_id, gguf_file=self.fp16_stablelm2_model_id)
@@ -703,10 +699,6 @@ def test_stablelm_weights_conversion_fp16(self):
             gguf_file=self.fp16_stablelm2_model_id,
             device_map="auto",
             torch_dtype=torch.float16,
-            # for precise comparison it is required to use the original model config
-            # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias
-            # and it highly influences on the output results
-            config=original_model.config,
         )
 
         converted_state_dict = converted_model.state_dict()
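
With this change, the stablelm-specific flags `qkv_bias` and `use_parallel_residual` are inferred from the GGUF tensor names at load time, so callers no longer need to override the parsed config. Below is a minimal usage sketch of that behavior; the repo id and GGUF filename are illustrative assumptions (the actual ids used by the tests live in `test_ggml.py` and are not shown in this diff):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical stablelm GGUF checkpoint; substitute any stablelm GGUF repo/file.
model_id = "stabilityai/stablelm-2-1_6b"
gguf_file = "stablelm-2-1_6b-fp16.gguf"

# No `config=` override needed anymore: qkv_bias and use_parallel_residual
# are derived from the attn_*.bias / ffn_norm tensors in the GGUF file.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=gguf_file,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)

inputs = tokenizer("Hello", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
```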