From 5e9a3394eaed54aabab4f991f9847c30e69e2857 Mon Sep 17 00:00:00 2001
From: Fabian Degen <106864199+degenfabian@users.noreply.github.com>
Date: Sat, 28 Dec 2024 01:43:07 +0100
Subject: [PATCH] Set prepend_bos to false by default for Qwen models (#815)

* Set prepend_bos to false by default for Qwen

* Fix typo in warning for center_unembed when logit softcap is activated

---------

Co-authored-by: Fabian Degen
---
 transformer_lens/HookedTransformer.py       | 2 +-
 transformer_lens/loading_from_pretrained.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py
index 500098c32..9fa4571ae 100644
--- a/transformer_lens/HookedTransformer.py
+++ b/transformer_lens/HookedTransformer.py
@@ -1311,7 +1311,7 @@ def from_pretrained(
             center_writing_weights = False
         if center_unembed and cfg.output_logits_soft_cap > 0.0:
             logging.warning(
-                "You tried to specify center_unembed=True for a model using logit softcap, but this can't be done! Softcapping is not invariant upon adding a constant"
+                "You tried to specify center_unembed=True for a model using logit softcap, but this can't be done! Softcapping is not invariant upon adding a constant "
                 "Setting center_unembed=False instead."
             )
             center_unembed = False
diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
index b4ecc8d64..4d36c744f 100644
--- a/transformer_lens/loading_from_pretrained.py
+++ b/transformer_lens/loading_from_pretrained.py
@@ -1241,6 +1241,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "trust_remote_code": True,
             "final_rms": True,
             "gated_mlp": True,
+            "default_prepend_bos": False,
         }
     elif architecture == "Qwen2ForCausalLM":
         # Note that Qwen1.5 models have architecture type Qwen2ForCausalLM.
@@ -1265,6 +1266,7 @@ def convert_hf_model_config(model_name: str, **kwargs):
             "tokenizer_prepends_bos": True,
             "final_rms": True,
             "gated_mlp": True,
+            "default_prepend_bos": False,
         }
     elif architecture == "PhiForCausalLM":
         # Architecture for microsoft/phi models
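
Note on the HookedTransformer.py hunk: the "typo" is an implicit string concatenation bug. Python joins adjacent string literals with no separator, so the original warning rendered as "...not invariant upon adding a constantSetting center_unembed=False instead." A minimal standalone sketch of the pitfall (illustrative, not TransformerLens code):

    # Adjacent string literals are joined with nothing in between.
    msg = (
        "Softcapping is not invariant upon adding a constant"  # no trailing space
        "Setting center_unembed=False instead."
    )
    print(msg)  # -> "...adding a constantSetting center_unembed=False instead."

The patch adds a trailing space inside the first literal, keeping the two sentences apart without restructuring the logging call.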
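Note on the loading_from_pretrained.py hunks: tokenization helpers such as HookedTransformer.to_tokens fall back to cfg.default_prepend_bos when no per-call prepend_bos argument is given, so with this patch Qwen prompts are no longer prefixed with a BOS token by default — consistent with the Qwen tokenizers not defining a dedicated BOS token. A minimal usage sketch, assuming the public TransformerLens API; the checkpoint name is illustrative:

    from transformer_lens import HookedTransformer

    # Qwen configs already set trust_remote_code=True internally.
    model = HookedTransformer.from_pretrained("Qwen/Qwen-1_8B")
    assert model.cfg.default_prepend_bos is False  # set by this patch

    tokens = model.to_tokens("Hello world")  # no BOS prepended by default
    tokens_bos = model.to_tokens("Hello world", prepend_bos=True)  # explicit per-call override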