From a1fe340c11198488a717cf48495d6247f41e8b28 Mon Sep 17 00:00:00 2001 From: niushengxiao Date: Thu, 28 Nov 2024 15:30:46 +0800 Subject: [PATCH] feat: add cc, acc method and ppl decoding mha kernel for deepseek2 --- .../layer_infer/transformer_layer_infer.py | 174 +++++++- .../layer_weights/transformer_layer_weight.py | 5 + .../context_flashattention_nopad.py | 422 ++++++++++++++++++ 3 files changed, 599 insertions(+), 2 deletions(-) diff --git a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py index bf5db846..4ce4d286 100644 --- a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py +++ b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py @@ -1,6 +1,7 @@ from typing import Tuple import torch import torch.functional as F +import torch.nn.functional as FN import torch.distributed as dist import numpy as np from lightllm.models.deepseek2.layer_weights.transformer_layer_weight import Deepseek2TransformerLayerWeight @@ -8,6 +9,8 @@ from lightllm.models.deepseek2.triton_kernel.context_flashattention_nopad import ( context_attention_fwd, context_attention_fwd_no_prompt_cache, + context_attention_fwd_with_v, + context_attention_fwd_no_prompt_cache_with_v, ) from lightllm.models.deepseek2.triton_kernel.gqa_flash_decoding import gqa_token_decode_attention_flash_decoding @@ -18,6 +21,7 @@ from lightllm.models.llama.infer_struct import LlamaInferStateInfo from functools import partial from lightllm.models.llama.yarn_rotary_utils import get_deepseek_mscale +import os class Deepseek2TransformerLayerInfer(LlamaTransformerLayerInfer): @@ -55,6 +59,11 @@ def __init__( self.softmax_scale = self.softmax_scale * mscale * mscale super().__init__(layer_num, tp_rank, world_size, network_config, mode) self.tp_o_head_num_ = self.tp_q_head_num_ + + self.num_heads = network_config["num_attention_heads"] + self.num_kv_heads = network_config["num_key_value_heads"] + self.enable_opt_decoding_mha = os.getenv("ENABLE_OPT_DECODE_MHA", "False").upper() in ["ON", "TRUE", "1"] + return def _bind_attention(self): @@ -97,7 +106,8 @@ def _get_qkv( q = q.view(-1, self.tp_q_head_num_, self.qk_nope_head_dim + self.qk_rope_head_dim) q_nope, q_rope = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) - q_nope = layer_weight.k_b_proj_.bmm(q_nope.transpose(0, 1)).transpose(0, 1) + if layer_weight.mla_type == "ACCM": + q_nope = layer_weight.k_b_proj_.bmm(q_nope.transpose(0, 1)).transpose(0, 1) layer_weight.kv_a_proj_with_mqa_.mm(input, out=cache_kv.view(-1, self.kv_lora_rank + self.qk_rope_head_dim)) @@ -123,11 +133,157 @@ def _get_o( input = input.view(-1, self.tp_q_head_num_ * self.kv_lora_rank) o_tensor = layer_weight.fuse_vo_weight_.mm(input) else: - input = layer_weight.v_b_proj_.bmm(input.transpose(0, 1)).transpose(0, 1) + if layer_weight.mla_type == "ACCM": + input = layer_weight.v_b_proj_.bmm(input.transpose(0, 1)).transpose(0, 1) o_tensor = layer_weight.o_weight_.mm(input.reshape(-1, self.tp_q_head_num_ * self.qk_nope_head_dim)) return o_tensor + def _CC_method( + self, q, compressed_kv, infer_state: LlamaInferStateInfo, layer_weight: Deepseek2TransformerLayerWeight + ): + num_local_heads = self.num_heads + num_local_kv_heads = self.num_kv_heads + if self.world_size_ > 1: + num_local_heads //= self.world_size_ + num_local_kv_heads //= self.world_size_ + if infer_state.use_dynamic_prompt_cache: + compressed_kv = infer_state.mem_manager.kv_buffer[self.layer_num_] + # CC + compressed_kv, k_pe = torch.split( # (b*s, 1, kv_lora + qk_r) + compressed_kv, [layer_weight.kv_lora_rank, layer_weight.qk_rope_head_dim], dim=-1 + ) + compressed_kv = compressed_kv.view(-1, layer_weight.kv_lora_rank) + k = torch.empty( + k_pe.shape[0], + num_local_kv_heads, + layer_weight.qk_nope_head_dim + layer_weight.qk_rope_head_dim, + dtype=q[0].dtype, + device=q[0].device, + ) + k[..., layer_weight.qk_nope_head_dim :] = k_pe + k[..., : layer_weight.qk_nope_head_dim] = FN.linear( + compressed_kv, layer_weight.k_b_proj_.weight.view(-1, layer_weight.k_b_proj_.weight.shape[-1]) + ).view(-1, num_local_kv_heads, layer_weight.qk_nope_head_dim) + trans_weight = layer_weight.v_b_proj_.weight.transpose(1, 2) + v = FN.linear(compressed_kv, trans_weight.view(-1, trans_weight.shape[-1])).view( + -1, num_local_kv_heads, layer_weight.qk_nope_head_dim + ) # (b*s, h, vo_d) + return self._context_attention_kernel_with_v(q, k, v, infer_state, layer_weight) + + def _ACC_method( + self, q, compressed_kv, infer_state: LlamaInferStateInfo, layer_weight: Deepseek2TransformerLayerWeight + ): + q_ne, q_pe = q + num_local_heads = self.num_heads + num_local_kv_heads = self.num_kv_heads + if self.world_size_ > 1: + num_local_heads //= self.world_size_ + num_local_kv_heads //= self.world_size_ + # ACC + q = torch.empty( + q_ne.shape[0], + num_local_heads, + self.kv_lora_rank + self.qk_rope_head_dim, + dtype=q_ne.dtype, + device=q_ne.device, + ) + q[..., self.kv_lora_rank :] = q_pe + q[..., : self.kv_lora_rank] = torch.bmm( # TODO: 转换成einsum 或者 cublas + q_ne.transpose(0, 1), # (h, b*s, qk_n) + layer_weight.k_b_proj_.weight.view( + num_local_kv_heads, self.qk_nope_head_dim, self.kv_lora_rank + ), # (h, qk_n, kv_lora) + ).transpose( + 0, 1 + ) # (b*s, h, kv_lora) + q_nope, q_rope = torch.split( # (b*s, h, qk_n + qk_r) -> (b*s, h, qk_n), (b*s, h, qk_r) + q, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + if self.enable_opt_decoding_mha: + import lightllm_ppl_mla + + o_tensor = self.alloc_tensor(q_nope.shape, dtype=q_nope.dtype) + kvstarts = torch.zeros(infer_state.batch_size + 1, dtype=torch.int, device=q.device) + kvstarts[1:] = infer_state.b_seq_len.clone().detach().cumsum(dim=0) + lightllm_ppl_mla.decode_mla( + o_tensor, + q, + compressed_kv[: infer_state.mem_end, :, :], + infer_state.b_start_loc, + kvstarts, + self.softmax_scale, + q.shape[-1], + q_nope.shape[-1], + ) + output_parallel = o_tensor + else: + output_parallel = self._token_gqa_decode_attention_flashdecoding_origin( + (q_nope, q_rope), infer_state, layer_weight + ) + trans_weight = layer_weight.v_b_proj_.weight.transpose(1, 2) + output_parallel = torch.bmm( # TODO: 转换成einsum 或者 cublas + output_parallel.transpose(0, 1), # (h, b*s, kv_lora) + trans_weight.view(num_local_kv_heads, layer_weight.qk_nope_head_dim, self.kv_lora_rank).transpose( + 1, 2 + ), # (h, kv_lora, vo_d) + ).transpose( + 0, 1 + ) # (b*s, h, vo_d) + return output_parallel + def _context_attention_kernel( + self, q, kv, infer_state: LlamaInferStateInfo, layer_weight: Deepseek2TransformerLayerWeight, out=None + ) -> torch.Tensor: + if layer_weight.mla_type == "MIX": + return self._context_attention_kernel_with_CC(q, kv, infer_state, layer_weight, out) + else: + return self._context_attention_kernel_origin(q, kv, infer_state, layer_weight, out) + + def _context_attention_kernel_with_CC( + self, q, kv, infer_state: LlamaInferStateInfo, layer_weight: Deepseek2TransformerLayerWeight, out=None + ) -> torch.Tensor: + return self._CC_method(q, kv, infer_state, layer_weight) + + def _context_attention_kernel_with_v( + self, q: Tuple[torch.Tensor, torch.Tensor], kv, v, infer_state: LlamaInferStateInfo, layer_weight, out=None + ) -> torch.Tensor: + q_nope, q_rope = q + nope_head_dim = q_nope.shape[-1] + o_tensor = self.alloc_tensor(q_nope.shape, dtype=q_nope.dtype) if out is None else out + if infer_state.use_dynamic_prompt_cache: + context_attention_fwd_with_v( + q_nope, + q_rope, + kv[:, :, :nope_head_dim], + kv[:, :, nope_head_dim:], + v, + o_tensor.view(-1, self.tp_q_head_num_, nope_head_dim), + infer_state.b_req_idx, + infer_state.b_start_loc, + infer_state.b_seq_len, + infer_state.b_ready_cache_len, + infer_state.max_len_in_batch, + infer_state.req_manager.req_to_token_indexs, + self.softmax_scale, + ) + else: + context_attention_fwd_no_prompt_cache_with_v( + q_nope, + q_rope, + kv[:, :, :nope_head_dim], + kv[:, :, nope_head_dim:], + v, + o_tensor.view(-1, self.tp_q_head_num_, nope_head_dim), + infer_state.b_start_loc, + infer_state.b_seq_len, + infer_state.max_len_in_batch, + self.softmax_scale, + ) + q_nope = None + q_rope = None + return o_tensor + + def _context_attention_kernel_origin( self, q: Tuple[torch.Tensor, torch.Tensor], kv, infer_state: LlamaInferStateInfo, layer_weight, out=None ) -> torch.Tensor: q_nope, q_rope = q @@ -166,6 +322,20 @@ def _context_attention_kernel( return o_tensor def _token_gqa_decode_attention_flashdecoding(self, q, infer_state: LlamaInferStateInfo, layer_weight, out=None): + if layer_weight.mla_type == "MIX": + return self._token_gqa_decode_attention_flashdecoding_with_ACC(q, infer_state, layer_weight, out) + else: + return self._token_gqa_decode_attention_flashdecoding_origin(q, infer_state, layer_weight, out) + + def _token_gqa_decode_attention_flashdecoding_with_ACC( + self, q, infer_state: LlamaInferStateInfo, layer_weight, out=None + ): + compressed_kv = infer_state.mem_manager.kv_buffer[self.layer_num_][: infer_state.mem_end, :, :] + return self._ACC_method(q, compressed_kv, infer_state, layer_weight) + + def _token_gqa_decode_attention_flashdecoding_origin( + self, q, infer_state: LlamaInferStateInfo, layer_weight, out=None + ): q_nope, q_rope = q kv = infer_state.mem_manager.kv_buffer[self.layer_num_][:, :, : self.kv_lora_rank] kv_rope = infer_state.mem_manager.kv_buffer[self.layer_num_][:, :, self.kv_lora_rank :] diff --git a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py index 614df1ce..995ad1f1 100644 --- a/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py +++ b/lightllm/models/deepseek2/layer_weights/transformer_layer_weight.py @@ -75,6 +75,11 @@ def __init__( self.disable_qk_absorb = disable_qk_absorb self.disable_vo_absorb = disable_vo_absorb super().__init__(layer_num, tp_rank, world_size, data_type, network_config, mode, quant_cfg) + # mla_type = "ACCM", "MIX" + # MIX是prefilled CC,decoding ACC + self.mla_type = "MIX" + if not disable_vo_absorb or not disable_qk_absorb: + self.mla_type = "ACCM" return def _parse_config(self): diff --git a/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad.py b/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad.py index 99e2b7b9..ed88ab08 100644 --- a/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad.py +++ b/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad.py @@ -9,6 +9,428 @@ CUDA_CAPABILITY = torch.cuda.get_device_capability() +@triton.jit +def _fwd_kernel_with_v( + Q_nope, + Q_rope, + K_nope, + K_rope, + V, + sm_scale, + B_Start_Loc, + B_Seqlen, # B_LOC 内部记录每个batch 输入的真实位置, B_SEQ_len 记录当前输入的真实长度 + Out, + Req_to_tokens, + B_req_idx, + stride_q_bs, + stride_q_h, + stride_q_d, + stride_q_rope_bs, + stride_q_rope_h, + stride_q_rope_d, + stride_k_bs, + stride_k_h, + stride_k_d, + stride_k_rope_bs, + stride_k_rope_h, + stride_k_rope_d, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + stride_req_to_tokens_b, + stride_req_to_tokens_s, + b_prompt_cache_len, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_ROPE_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_k_head = cur_head + + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + prompt_cache_len = tl.load(b_prompt_cache_len + cur_batch) + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - prompt_cache_len + cur_batch_req_idx = tl.load(B_req_idx + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_bs + + cur_head * stride_q_h + + offs_d[None, :] * stride_q_d + ) + off_q_rope = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_rope_bs + + cur_head * stride_q_rope_h + + offs_rope_d[None, :] * stride_q_rope_d + ) + + q = tl.load(Q_nope + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + q_rope = tl.load(Q_rope + off_q_rope, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) + block_end_loc = tl.minimum((start_m + 1) * BLOCK_M + prompt_cache_len, cur_batch_seq_len + prompt_cache_len) + + for start_n in range(0, block_mask * block_end_loc, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k_loc = tl.load( + Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + stride_req_to_tokens_s * (start_n + offs_n), + mask=(start_n + offs_n) < block_end_loc, + other=0, + ) + off_k = k_loc[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None] * stride_k_d + off_k_rope = ( + k_loc[None, :] * stride_k_rope_bs + cur_k_head * stride_k_rope_h + offs_rope_d[:, None] * stride_k_rope_d + ) + k = tl.load(K_nope + off_k, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0) + k_rope = tl.load(K_rope + off_k_rope, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk += tl.dot(q_rope, k_rope) + + qk *= sm_scale + qk = tl.where(offs_m[:, None] + prompt_cache_len >= start_n + offs_n[None, :], qk, float("-100000000.0")) + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + acc_scale = tl.where(offs_m + prompt_cache_len >= start_n, acc_scale, 1.0) + acc = acc * acc_scale[:, None] + # update acc + off_v = k_loc[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :] * stride_vd + v = tl.load(V + off_v, mask=(start_n + offs_n[:, None]) < block_end_loc, other=0.0) + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + # initialize pointers to output + off_o = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + + offs_d[None, :] * stride_od + ) + out_ptrs = Out + off_o + tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) + return + + +@torch.no_grad() +def context_attention_fwd_with_v( + q_nope, + q_rope, + k_nope, + k_rope, + v, + o, + b_req_idx, + b_start_loc, + b_seq_len, + b_prompt_cache_len, + max_input_len, + req_to_token_indexs, + softmax_scale, +): + + BLOCK = 128 if not TESLA else 64 + q_nope_dim = q_nope.shape[-1] + q_rope_dim = q_rope.shape[-1] + assert q_nope_dim == k_nope.shape[-1] + assert q_rope_dim == k_rope.shape[-1] + assert q_nope_dim in {16, 32, 64, 128, 256, 512} + assert q_rope_dim in {16, 32, 64, 128, 256} + assert q_nope_dim == v.shape[-1] + + if q_nope_dim >= 512: + BLOCK = 64 if not TESLA else 32 + else: + BLOCK = 128 if not TESLA else 64 + + if q_nope.dtype == torch.float32: + BLOCK = BLOCK // 4 + + sm_scale = softmax_scale + batch, head = b_seq_len.shape[0], q_nope.shape[1] + + grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, + num_warps = 4 if q_nope_dim <= 64 else 8 + + _fwd_kernel_with_v[grid]( + q_nope, + q_rope, + k_nope, + k_rope, + v, + sm_scale, + b_start_loc, + b_seq_len, + o, + req_to_token_indexs, + b_req_idx, + q_nope.stride(0), + q_nope.stride(1), + q_nope.stride(2), + q_rope.stride(0), + q_rope.stride(1), + q_rope.stride(2), + k_nope.stride(0), + k_nope.stride(1), + k_nope.stride(2), + k_rope.stride(0), + k_rope.stride(1), + k_rope.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + req_to_token_indexs.stride(0), + req_to_token_indexs.stride(1), + b_prompt_cache_len=b_prompt_cache_len, + BLOCK_M=BLOCK, + BLOCK_DMODEL=q_nope_dim, + BLOCK_ROPE_DMODEL=q_rope_dim, + BLOCK_N=BLOCK, + num_warps=num_warps, + num_stages=1, + ) + return + + +@triton.jit +def _fwd_kernel_no_prompt_cache_with_v( + Q_nope, + Q_rope, + K_nope, + K_rope, + V, + sm_scale, + B_Start_Loc, + B_Seqlen, # B_LOC 内部记录每个batch 输入的真实位置, B_SEQ_len 记录当前输入的真实长度 + Out, + stride_q_bs, + stride_q_h, + stride_q_d, + stride_q_rope_bs, + stride_q_rope_h, + stride_q_rope_d, + stride_k_bs, + stride_k_h, + stride_k_d, + stride_k_rope_bs, + stride_k_rope_h, + stride_k_rope_d, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_ROPE_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_k_head = cur_head + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_bs + + cur_head * stride_q_h + + offs_d[None, :] * stride_q_d + ) + off_rope_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_rope_bs + + cur_head * stride_q_rope_h + + offs_rope_d[None, :] * stride_q_rope_d + ) + off_k = offs_n[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None] * stride_k_d + off_rope_k = ( + offs_n[None, :] * stride_k_rope_bs + cur_k_head * stride_k_rope_h + offs_rope_d[:, None] * stride_k_rope_d + ) + off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :] * stride_vd + + q = tl.load(Q_nope + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + q_rope = tl.load(Q_rope + off_rope_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + + k_ptrs = K_nope + off_k + k_rope_ptrs = K_rope + off_rope_k + v_ptrs = V + off_v + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) + + for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load( + k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_k_bs, + mask=(start_n + offs_n[None, :]) < cur_batch_seq_len, + other=0.0, + ) + k_rope = tl.load( + k_rope_ptrs + (cur_batch_in_all_start_index + start_n) * stride_k_rope_bs, + mask=(start_n + offs_n[None, :]) < cur_batch_seq_len, + other=0.0, + ) + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k) + qk += tl.dot(q_rope, k_rope) + qk *= sm_scale + qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load( + v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=(start_n + offs_n[:, None]) < cur_batch_seq_len, + other=0.0, + ) + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + # initialize pointers to output + off_o = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + + offs_d[None, :] * stride_od + ) + out_ptrs = Out + off_o + tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) + return + + +@torch.no_grad() +def context_attention_fwd_no_prompt_cache_with_v( + q_nope, q_rope, k_nope, k_rope, v, o, b_start_loc, b_seq_len, max_input_len, softmax_scale +): + q_nope_dim = q_nope.shape[-1] + q_rope_dim = q_rope.shape[-1] + assert q_nope_dim == k_nope.shape[-1] + assert q_rope_dim == k_rope.shape[-1] + assert q_nope_dim in {16, 32, 64, 128, 256, 512} + assert q_rope_dim in {16, 32, 64, 128, 256} + assert q_nope_dim == v.shape[-1] + + if q_nope_dim >= 512: + BLOCK = 64 if not TESLA else 32 + else: + BLOCK = 128 if not TESLA else 64 + + if q_nope.dtype == torch.float32: + BLOCK = BLOCK // 4 + + sm_scale = softmax_scale + batch, head = b_seq_len.shape[0], q_nope.shape[1] + + grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, + + num_warps = 4 if q_nope_dim <= 64 else 8 + _fwd_kernel_no_prompt_cache_with_v[grid]( + q_nope, + q_rope, + k_nope, + k_rope, + v, + sm_scale, + b_start_loc, + b_seq_len, + o, + q_nope.stride(0), + q_nope.stride(1), + q_nope.stride(2), + q_rope.stride(0), + q_rope.stride(1), + q_rope.stride(2), + k_nope.stride(0), + k_nope.stride(1), + k_nope.stride(2), + k_rope.stride(0), + k_rope.stride(1), + k_rope.stride(2), + v.stride(0), + v.stride(1), + v.stride(2), + o.stride(0), + o.stride(1), + o.stride(2), + BLOCK_M=BLOCK, + BLOCK_DMODEL=q_nope_dim, + BLOCK_ROPE_DMODEL=q_rope_dim, + BLOCK_N=BLOCK, + num_warps=num_warps, + num_stages=1, + ) + return + + @triton.jit def _fwd_kernel( Q_nope,