Propagate multimodal_projector_bias change
Rocketknight1 committed Dec 23, 2024
1 parent 9509d02 commit 24d9ee5
Showing 7 changed files with 32 additions and 12 deletions.
src/transformers/models/llava_next/configuration_llava_next.py
@@ -55,6 +55,8 @@ class LlavaNextConfig(PretrainedConfig):
             Whether the model's input and output word embeddings should be tied.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
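The flag is set like any other config argument. A minimal sketch using only what the hunk above introduces (everything else is standard transformers config usage):

    from transformers import LlavaNextConfig

    # Defaults to True (biases on), matching the previously hard-coded behavior.
    config = LlavaNextConfig(multimodal_projector_bias=False)
    print(config.multimodal_projector_bias)  # False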
9 changes: 6 additions & 3 deletions src/transformers/models/llava_next/modeling_llava_next.py
@@ -194,10 +194,13 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput):
 class LlavaNextMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
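To see the effect of the change in isolation, here is a small sketch that builds the projector directly; the tiny hidden sizes are invented for illustration, while the class name, config fields, and the new flag all come from the diff:

    import torch

    from transformers import LlavaNextConfig
    from transformers.models.llava_next.modeling_llava_next import LlavaNextMultiModalProjector

    config = LlavaNextConfig(multimodal_projector_bias=False)
    config.vision_config.hidden_size = 32  # invented small sizes, illustration only
    config.text_config.hidden_size = 64

    projector = LlavaNextMultiModalProjector(config)
    # With the flag off, nn.Linear registers no bias parameters at all.
    assert projector.linear_1.bias is None and projector.linear_2.bias is None

    image_features = torch.randn(1, 576, 32)  # (batch, image tokens, vision hidden size)
    print(projector(image_features).shape)  # torch.Size([1, 576, 64])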
src/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -179,10 +179,13 @@ def _init_weights(self, module):
 class LlavaNextVideoMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextVideoConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
src/transformers/models/llava_onevision/configuration_llava_onevision.py
@@ -58,6 +58,8 @@ class LlavaOnevisionConfig(PretrainedConfig):
             of the form `(height, width)`.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether the model's input and output word embeddings should be tied.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
@@ -95,6 +97,7 @@ def __init__(
         vision_aspect_ratio="anyres_max_9",
         image_grid_pinpoints=None,
         tie_word_embeddings=False,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.image_token_index = image_token_index
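Because the new argument is stored on the config like any other field, it should survive serialization; a quick sketch assuming only what the hunks above add:

    from transformers import LlavaOnevisionConfig

    config = LlavaOnevisionConfig(multimodal_projector_bias=False)

    # to_dict()/from_dict() round-trip the flag like every other config attribute.
    restored = LlavaOnevisionConfig.from_dict(config.to_dict())
    print(restored.multimodal_projector_bias)  # False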
src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -201,10 +201,13 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
 class LlavaOnevisionMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaOnevisionConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
src/transformers/models/video_llava/configuration_video_llava.py
@@ -55,6 +55,8 @@ class VideoLlavaConfig(PretrainedConfig):
             Sequence length of one image embedding.
         video_seq_length (`int`, *optional*, defaults to 2056):
             Sequence length of one video embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
@@ -92,6 +94,7 @@ def __init__(
         vision_feature_layer=-2,
         image_seq_length=256,
         video_seq_length=2056,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.ignore_index = ignore_index
9 changes: 6 additions & 3 deletions src/transformers/models/video_llava/modeling_video_llava.py
@@ -88,10 +88,13 @@ class VideoLlavaCausalLMOutputWithPast(ModelOutput):
 class VideoLlavaMultiModalProjector(nn.Module):
     def __init__(self, config: VideoLlavaConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
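Since the flag defaults to True across all of these models, existing checkpoints are unaffected: the projector keeps creating its bias parameters unless a config explicitly opts out. A final sketch (small sizes invented for illustration):

    from transformers import VideoLlavaConfig
    from transformers.models.video_llava.modeling_video_llava import VideoLlavaMultiModalProjector

    config = VideoLlavaConfig()  # multimodal_projector_bias defaults to True
    config.vision_config.hidden_size = 16  # invented small sizes
    config.text_config.hidden_size = 16

    projector = VideoLlavaMultiModalProjector(config)
    assert projector.linear_1.bias is not None  # old behavior preserved by default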
