
[FA-2] Fix fa-2 issue when passing config to from_pretrained #28043

Merged (12 commits) on Dec 15, 2023
13 changes: 13 additions & 0 deletions src/transformers/modeling_utils.py
@@ -2955,6 +2955,19 @@ def from_pretrained(
**kwargs,
)
else:
# In case one passes both a config and "attn_implementation" to `from_pretrained`,
# override the config's `_attn_implementation` attribute with the `attn_implementation` kwarg.
# Please see: https://github.com/huggingface/transformers/issues/28038

# Overwrite `config._attn_implementation` with the one from the kwargs. The auto-factory
# pops `attn_implementation` from the kwargs, but this handles the case where users
# pass the config to `from_pretrained` manually.
config = copy.deepcopy(config)

if kwargs.get("attn_implementation", None) is not None and getattr(
config, "_attn_implementation", None
) != kwargs.get("attn_implementation", None):
Contributor Author:

This handles the case where users pass a config object to from_pretrained. Note that AutoModelxxx.from_pretrained pops attn_implementation from the kwargs when one does not pass a config, but does not pop it if a config is passed.

Therefore this handles that corner case as well (passing a config, so attn_implementation does not get popped, together with attn_implementation through the from_pretrained kwargs). In that case we should overwrite the config's attn_implementation with the one from the kwargs, assuming the user knows what they are doing.

https://github.com/huggingface/transformers/blob/main/src/transformers/models/auto/auto_factory.py#L516-L540
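The override described above can be sketched in isolation. This is a minimal, hypothetical sketch of the logic in the diff: `DummyConfig` and `resolve_attn_implementation` are stand-ins invented here for illustration, not the real `PretrainedConfig` or any actual transformers helper.

```python
import copy


class DummyConfig:
    """Hypothetical stand-in for a transformers PretrainedConfig."""

    def __init__(self, attn_implementation="eager"):
        self._attn_implementation = attn_implementation


def resolve_attn_implementation(config, **kwargs):
    # Deep-copy so the caller's config object is never mutated in place,
    # mirroring the `copy.deepcopy(config)` added in the diff.
    config = copy.deepcopy(config)
    requested = kwargs.get("attn_implementation", None)
    # Only overwrite when the kwarg is set and differs from the config value,
    # matching the condition in the diff above.
    if requested is not None and getattr(config, "_attn_implementation", None) != requested:
        config._attn_implementation = requested
    return config


cfg = DummyConfig()
resolved = resolve_attn_implementation(cfg, attn_implementation="flash_attention_2")
print(resolved._attn_implementation)  # flash_attention_2
print(cfg._attn_implementation)       # eager -- the original config is untouched
```

The deep copy is the key design choice: without it, a user-supplied config object would be silently mutated by `from_pretrained`.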

younesbelkada marked this conversation as resolved.
config._attn_implementation = kwargs.get("attn_implementation", None)
model_kwargs = kwargs

quantizer = None
10 changes: 10 additions & 0 deletions tests/test_modeling_utils.py
@@ -1823,6 +1823,16 @@ def test_error_no_flash_available(self):

self.assertTrue("does not support Flash Attention 2.0" in str(cm.exception))

def test_error_no_flash_available_with_config(self):
Contributor (@fxmarty, Dec 14, 2023):

Can you add a test, e.g. llama, passing a config and attn_implementation="flash_attention_2", checking that the correct class is loaded?

Contributor Author:

You mean without AutoModel?

Contributor (@fxmarty):

I mean a test for an architecture that does support FA2, passing both a config and attn_implementation="flash_attention_2".

Contributor Author:

Done!

with self.assertRaises(ValueError) as cm:
config = AutoConfig.from_pretrained("hf-tiny-model-private/tiny-random-MCTCTModel")

_ = AutoModel.from_pretrained(
"hf-tiny-model-private/tiny-random-MCTCTModel", config=config, attn_implementation="flash_attention_2"
)

self.assertTrue("does not support Flash Attention 2.0" in str(cm.exception))
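The error path this test exercises can be illustrated with a hypothetical, self-contained sketch: models that do not support Flash Attention 2.0 should raise a ValueError even when the request arrives via a user-supplied config. The `SUPPORTS_FLASH_ATTN_2` mapping and `check_attn_implementation` helper below are invented for illustration and do not exist in transformers.

```python
# Assumed support mapping for illustration only; the real library checks a
# per-model `_supports_flash_attn_2` class attribute instead.
SUPPORTS_FLASH_ATTN_2 = {"llama": True, "mctct": False}


def check_attn_implementation(model_type, attn_implementation):
    # Reject flash_attention_2 for architectures that do not support it,
    # regardless of whether the value came from a kwarg or a config.
    if attn_implementation == "flash_attention_2" and not SUPPORTS_FLASH_ATTN_2.get(model_type, False):
        raise ValueError(f"{model_type} does not support Flash Attention 2.0")
    return attn_implementation


try:
    check_attn_implementation("mctct", "flash_attention_2")
except ValueError as exc:
    print(exc)  # mctct does not support Flash Attention 2.0
```

Before this PR, passing the config manually could bypass this check because the kwarg was never merged into the config; the test above guards against that regression.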

def test_error_wrong_attn_implementation(self):
with self.assertRaises(ValueError) as cm:
_ = AutoModel.from_pretrained("hf-tiny-model-private/tiny-random-MCTCTModel", attn_implementation="foo")