diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py
index bdc39e10ee4eee..65f928f418599c 100644
--- a/src/transformers/models/mamba/modeling_mamba.py
+++ b/src/transformers/models/mamba/modeling_mamba.py
@@ -17,7 +17,7 @@
 import math
 from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional, Tuple, Union, Dict, Any
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.utils.checkpoint
@@ -349,20 +349,9 @@ def __init__(self, config, batch_size, conv_dtype=torch.float32, ssm_dtype=torc
         expand = config.expand
         d_conv = config.conv_kernel
-        self.conv_states = { i: torch.zeros(
-            batch_size, d_model * expand, d_conv, device=device, dtype=conv_dtype
-        ) for i in range(config.num_hidden_layers)}
-        self.ssm_states = { i: torch.zeros(
-            batch_size, d_model * expand, d_state, device=device, dtype=ssm_dtype
-        )for i in range(config.num_hidden_layers)}
+        self.conv_states = { i: torch.zeros(batch_size, d_model * expand, d_conv, device=device, dtype=conv_dtype) for i in range(config.num_hidden_layers)}
+        self.ssm_states = { i: torch.zeros(batch_size, d_model * expand, d_state, device=device, dtype=ssm_dtype)for i in range(config.num_hidden_layers)}
 
 
-    def update_conv_state(self, hidden_states):
-        self.conv_state.copy_(torch.roll(self.conv_state, shifts=-1, dims=-1))  # Update state (B D W)
-        self.conv_state[:, :, -1] = hidden_states
-        return self.conv_state
-
-    def update_ssm_state(self, ssm_state):
-        self.ssm_state.copy_(ssm_state)
 
 
 class MambaSlowMixer(MambaMixer):
@@ -391,34 +380,17 @@ def forward(self, hidden_states, inference_params=None):
         if inference_params.seqlen_offset > 0:
             conv_state = inference_params.conv_states[self.layer_idx]
             conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1))
-            conv_state[:, :, -1] = hidden_states[:,:,0]
-            # out, conv_state, ssm_state = self.step(hidden_states, conv_state, ssm_state)
-            # return out, conv_state, ssm_state
+            conv_state[:, :, -1].copy_(hidden_states[:,:,0])
+            hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1).unsqueeze(-1)
         else:
-            conv_state = hidden_states
             inference_params.conv_states[self.layer_idx].copy_(nn.functional.pad(hidden_states, (self.d_conv - hidden_states.shape[-1], 0)))
-
-        ssm_state = inference_params.ssm_states[self.layer_idx]
-
-        # conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1))  # Update state (B D W)
-        # conv_state[:, :, -1] = hidden_states
-
-        # when you have the first iter, use conv_state
-        hidden_states = self.act(self.conv1d(conv_state)[..., :seq_len])
-
-        # x = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1)  # (B D)
-        # if self.conv1d.bias is not None:
-        #     x = x + self.conv1d.bias
-        # x = self.act(x).to(dtype=dtype)
-
+            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
 
 
         # 3. State Space Model sequence transformation
         # 3.a. input varying initialization of time_step, B and C
         x_dbl = self.x_proj(hidden_states.transpose(1,2))
         time_step, B, C = torch.split(x_dbl, [self.time_step_rank, self.d_state, self.d_state], dim=-1)
         discrete_time_step = self.dt_proj(time_step)
-
-        # discrete_time_step = discrete_time_step.transpose(0,1)
         A = -torch.exp(self.A_log.float())  # (d_inner, d_state)
         # 3.b. discretize time_step, B and C: zero-order hold from (B,L,D) to (B,L,D,N)
@@ -429,6 +401,7 @@ def forward(self, hidden_states, inference_params=None):
         deltaB_u = (discrete_time_step[:, :, :, None] * hidden_states[:, :, :, None]) * B[:, None, :, :]
 
         # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        ssm_state = inference_params.ssm_states[self.layer_idx]
         ys = []
         for i in range(seq_len):
             ssm_state.copy_(ssm_state * dA[:, :, i, :] + deltaB_u[:, :, i, :])
@@ -436,13 +409,11 @@ def forward(self, hidden_states, inference_params=None):
             y = torch.matmul(ssm_state, C[:,i,:].unsqueeze(-1))
             ys.append(y[:,:,0])
         y = torch.stack(ys, dim=-1)  # shape (b, l, d)
-
         y = y + (hidden_states * self.D.to(hidden_states.dtype)[None,:,None])
         y = y * self.act(gate)  # (B D)
-
         # 4. Final linear projection
         attn_outputs = self.out_proj(y.transpose(1,2))
-        return attn_outputs, conv_state, ssm_state
+        return attn_outputs, None, ssm_state
 
 class MambaRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
@@ -502,8 +473,18 @@ def _init_weights(self, module):
             nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             nn.init.normal_(module.weight, std=self.config.initializer_range)
-
-
+        #
+        # # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
+        # dt = torch.exp(
+        #     torch.rand(self.d_inner, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
+        #     + math.log(dt_min)
+        # ).clamp(min=dt_init_floor)
+        # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+        # inv_dt = dt + torch.log(-torch.expm1(-dt))
+        # with torch.no_grad():
+        #     self.dt_proj.bias.copy_(inv_dt)
+        # # Our initialization would set all Linear.bias to zero, need to mark this one as _no_reinit
+        # self.dt_proj.bias._no_reinit = True
 
 
 @dataclass
@@ -690,16 +671,15 @@ def forward(
                 hidden_states, conv_state, ssm_state = self._gradient_checkpointing_func(layer.__call__, hidden_states, inference_params)
             else:
                 hidden_states, conv_state, ssm_state = layer(hidden_states, inference_params=inference_params)
-            # inference_params.update_conv_state(conv_state)
-            # inference_params.update_ssm_state(ssm_state)
-            inference_params.seqlen_offset += inputs_embeds.shape[1]
             inference_params.ssm_states[idx].copy_(ssm_state)
+            # inference_params.conv_states[idx].copy_(conv_state)
 
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
             if output_attentions:
                 all_last_states = all_last_states + (ssm_state,)
 
+        inference_params.seqlen_offset += inputs_embeds.shape[1]
 
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
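
For reference, here is a minimal standalone sketch of the caching pattern this diff moves to: per-layer `conv_states` / `ssm_states` dicts keyed by layer index, a rolled convolution window for single-token decoding (the `seqlen_offset > 0` branch), and the sequential `ssm_state * dA + deltaB_u` recurrence from step 3.c. The toy dimensions and the helper names `decode_conv_step` / `selective_scan` are illustrative assumptions only, not part of the `MambaSlowMixer` API.

```python
import torch

# Toy sizes standing in for hidden_size * expand, state_size and conv_kernel.
batch_size, d_inner, d_state, d_conv, num_layers = 2, 8, 4, 4, 3

# Per-layer caches keyed by layer index, mirroring the conv_states / ssm_states
# dicts built in the cache __init__ in the diff above.
conv_states = {i: torch.zeros(batch_size, d_inner, d_conv) for i in range(num_layers)}
ssm_states = {i: torch.zeros(batch_size, d_inner, d_state) for i in range(num_layers)}


def decode_conv_step(conv_state, hidden_states, conv_weight):
    """Single-token decode path: roll the causal-conv window left, write the
    newest token into the last slot, then apply the depthwise convolution as a
    weighted sum over the window."""
    # conv_state: (B, D, W), hidden_states: (B, D, 1), conv_weight: (D, 1, W)
    conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1))
    conv_state[:, :, -1].copy_(hidden_states[:, :, 0])
    return torch.sum(conv_state * conv_weight[:, 0, :], dim=-1).unsqueeze(-1)  # (B, D, 1)


def selective_scan(ssm_state, dA, deltaB_u, C):
    """Sequential recurrence from step 3.c: ssm_state <- ssm_state * dA + deltaB_u,
    with a readout y_i = ssm_state @ C_i at every position."""
    # ssm_state: (B, D, N), dA / deltaB_u: (B, D, L, N), C: (B, L, N)
    ys = []
    for i in range(dA.shape[2]):
        ssm_state.copy_(ssm_state * dA[:, :, i, :] + deltaB_u[:, :, i, :])
        y = torch.matmul(ssm_state, C[:, i, :].unsqueeze(-1))  # (B, D, 1)
        ys.append(y[:, :, 0])
    return torch.stack(ys, dim=-1)  # (B, D, L)


# Example: one decode step for layer 0 with a random depthwise conv weight.
conv_weight = torch.randn(d_inner, 1, d_conv)
x_t = torch.randn(batch_size, d_inner, 1)
y_t = decode_conv_step(conv_states[0], x_t, conv_weight)  # (B, D, 1)

# Example: a length-5 scan for layer 0 with random discretized parameters.
seq_len = 5
dA = torch.rand(batch_size, d_inner, seq_len, d_state)
deltaB_u = torch.randn(batch_size, d_inner, seq_len, d_state)
C = torch.randn(batch_size, seq_len, d_state)
y = selective_scan(ssm_states[0], dA, deltaB_u, C)  # (B, D, L)
```

As in the diff, both caches are preallocated tensors that are only ever updated in place with `copy_`, so the same storage persists across decoding steps without reallocation.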