Patch GPTNeoX to use adequate FA2 if position_ids is provided (#35318)
taha-yassine authored Dec 23, 2024
1 parent 5e7aede commit 2bb6098
Showing 1 changed file with 3 additions and 0 deletions.
src/transformers/models/gpt_neox/modeling_gpt_neox.py: 3 additions & 0 deletions
@@ -148,6 +148,7 @@ def flash_attention_forward(
     norm_factor,
     attention_dropout,
     training,
+    position_ids=None,
     target_dtype=None,
     **_kwargs,
 ):
@@ -173,6 +174,7 @@ def flash_attention_forward(
         attention_mask,
         query_length,
         dropout=attention_dropout,
+        position_ids=position_ids,
         softmax_scale=norm_factor,
         is_causal=True,
         use_top_left_mask=flash_attn_uses_top_left_mask,
@@ -353,6 +355,7 @@ def forward(
             key,
             value,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             head_mask=head_mask,
             norm_factor=self.norm_factor,
             attention_dropout=self.config.attention_dropout,
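For context (not part of the commit): the point of threading position_ids through to the FA2 path is that flash-attention's variable-length kernels can infer packed-sequence boundaries from position_ids that restart at 0, rather than relying on a padded attention mask. Below is a minimal sketch of that idea, assuming the common packing convention where each packed sequence's positions start again at 0; the tensor values and the boundary-recovery code are illustrative, not the library's implementation.

import torch

# Two sequences of lengths 3 and 4 packed into a single batch row.
# Under the packing convention assumed here, position_ids restart at 0
# at every sequence boundary.
position_ids = torch.tensor([[0, 1, 2, 0, 1, 2, 3]])

# Recover cumulative sequence lengths (cu_seqlens) from the restarts;
# a varlen flash-attention call consumes boundaries like these instead
# of a padded attention mask.
starts = (position_ids[0] == 0).nonzero(as_tuple=True)[0]
lengths = torch.diff(starts, append=torch.tensor([position_ids.shape[1]]))
cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.int64), lengths.cumsum(0)]).to(torch.int32)

print(cu_seqlens)  # tensor([0, 3, 7], dtype=torch.int32)

The three one-line additions in the diff simply forward a tensor the caller already has; without them, position_ids never reaches _flash_attention_forward and the FA2 path presumably has to fall back to mask-based handling for packed inputs.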
