diff --git a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
index 01bcea7d39a..4ea1f469c98 100644
--- a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
@@ -404,11 +404,14 @@ def forward(
             )
         # Decode
         else:
+            logger.warning(
+                f"paged attention -> query: {query.shape}, key: {key.shape}, value: {value_states.shape}"
+            )
             paged_attention(
                 attn_output,
                 query,
-                key,
-                value_states,
+                kv_cache[0],
+                kv_cache[1],
                 self.kv_head_mapping,
                 self.softmax_scale,
                 block_tables,
@@ -420,7 +423,6 @@ def forward(
 
         logger.warning(f"attention output: {attn_output.shape}")
         attn_output = attn_output[..., : self.v_head_dim]
         logger.warning(f"attention output after unpad: {attn_output.shape}")
-        logger.warning(f"v_head_dim: {self.v_head_dim}")
 
         return self.o_proj(attn_output.reshape(-1, self.num_heads * self.v_head_dim))
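
For context on why the decode path switches from `key`/`value_states` to `kv_cache[0]`/`kv_cache[1]`: at decode time the per-step tensors hold only the newest token's K/V, while the cache holds the full history that attention must see. Below is a minimal sketch of that distinction in plain PyTorch; the dense cache stand-in, shapes, and variable names are illustrative assumptions, not TGI's actual paged-cache layout or the real `paged_attention` signature.

```python
# Minimal sketch: decode-time attention must read K/V from the accumulated
# cache, not from the current step's key/value_states (hypothetical shapes).
import torch

num_heads, head_dim, past_len = 4, 64, 16

# Dense stand-in for the block-structured cache behind kv_cache[0]/kv_cache[1].
k_cache = torch.randn(past_len, num_heads, head_dim)
v_cache = torch.randn(past_len, num_heads, head_dim)

# The current decode step produces exactly one new token's K/V ...
key = torch.randn(1, num_heads, head_dim)
value_states = torch.randn(1, num_heads, head_dim)

# ... which is appended to the cache before attention runs.
k_cache = torch.cat([k_cache, key], dim=0)
v_cache = torch.cat([v_cache, value_states], dim=0)

query = torch.randn(1, num_heads, head_dim)
scale = head_dim ** -0.5

# Attending over the cache covers all past_len + 1 positions; attending over
# key/value_states alone would see only the single newest position.
scores = torch.einsum("qhd,khd->hqk", query, k_cache) * scale
attn_output = torch.einsum("hqk,khd->qhd", scores.softmax(dim=-1), v_cache)
print(attn_output.shape)  # torch.Size([1, 4, 64])
```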