
Commit 633f436: fix comment

nbroad1881 authored Sep 9, 2024
1 parent f745e7d commit 633f436
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions in src/transformers/models/llama/modeling_llama.py
@@ -490,8 +490,8 @@ def forward(
         value_states = self.v_proj(hidden_states)

         # Flash attention requires the input to have the shape
-        # batch_size x seq_length x head_dim x hidden_dim
-        # therefore we just need to keep the original shape
+        # batch_size x seq_length x num_heads x head_dim
+        # but rotary embeddings require batch_size x num_heads x seq_length x head_dim
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
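For context, the corrected comment distinguishes two tensor layouts: rotary embeddings (and the transposed code path shown above) operate on batch_size x num_heads x seq_length x head_dim, while flash attention expects batch_size x seq_length x num_heads x head_dim. Below is a minimal standalone sketch of the reshape performed by the changed lines, using made-up example sizes (bsz=2, q_len=16, num_heads=8, head_dim=64) that are not part of the commit; it only illustrates the layouts named in the fixed comment, not the library's full forward pass.

import torch

# Hypothetical example sizes, chosen for illustration only.
bsz, q_len, num_heads, head_dim = 2, 16, 8, 64
hidden_size = num_heads * head_dim

# Projection output, as produced by q_proj/k_proj/v_proj: (bsz, q_len, hidden_size).
query_states = torch.randn(bsz, q_len, hidden_size)

# Split heads and move the head axis forward: (bsz, num_heads, q_len, head_dim).
# This is the layout the rotary embeddings are applied on.
query_states = query_states.view(bsz, q_len, num_heads, head_dim).transpose(1, 2)
print(query_states.shape)  # torch.Size([2, 8, 16, 64])

# Flash attention instead takes (bsz, q_len, num_heads, head_dim), per the fixed
# comment, so a transpose back is needed before calling the flash kernel.
flash_input = query_states.transpose(1, 2)
print(flash_input.shape)  # torch.Size([2, 16, 8, 64])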
