From aba20bf92a67a9cd061504d7a83e7cd61345b422 Mon Sep 17 00:00:00 2001
From: Matt
Date: Mon, 23 Dec 2024 12:04:25 +0000
Subject: [PATCH] Propagate multimodal_projector_bias change

---
 .../models/llava_next/configuration_llava_next.py        | 2 ++
 .../models/llava_next/modeling_llava_next.py             | 9 ++++++---
 .../models/llava_next_video/modeling_llava_next_video.py | 9 ++++++---
 .../llava_onevision/configuration_llava_onevision.py     | 3 +++
 .../models/llava_onevision/modeling_llava_onevision.py   | 9 ++++++---
 .../models/video_llava/configuration_video_llava.py      | 3 +++
 .../models/video_llava/modeling_video_llava.py           | 9 ++++++---
 7 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py
index 54616edbf96dce..2c4daf515785a4 100644
--- a/src/transformers/models/llava_next/configuration_llava_next.py
+++ b/src/transformers/models/llava_next/configuration_llava_next.py
@@ -55,6 +55,8 @@ class LlavaNextConfig(PretrainedConfig):
             Whether the model's input and output word embeddings should be tied.
         image_seq_length (`int`, *optional*, defaults to 576):
             Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
 
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index 269663c7d6141a..75acf2479a7e24 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -194,10 +194,13 @@ class LlavaNextCausalLMOutputWithPast(ModelOutput):
 class LlavaNextMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
index 7cd7e18abaf3e0..ac7dfffa664611 100644
--- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py
@@ -179,10 +179,13 @@ def _init_weights(self, module):
 class LlavaNextVideoMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaNextVideoConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py
index 46b65b35b1a5cb..2595d0f6cdee8c 100644
--- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py
@@ -58,6 +58,8 @@ class LlavaOnevisionConfig(PretrainedConfig):
             of the form `(height, width)`.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether the model's input and output word embeddings should be tied.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
 
@@ -95,6 +97,7 @@ def __init__(
         vision_aspect_ratio="anyres_max_9",
         image_grid_pinpoints=None,
         tie_word_embeddings=False,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.image_token_index = image_token_index
diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
index 626db4d96aae2e..856518ea315ef5 100644
--- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py
@@ -201,10 +201,13 @@ class LlavaOnevisionCausalLMOutputWithPast(ModelOutput):
 class LlavaOnevisionMultiModalProjector(nn.Module):
     def __init__(self, config: LlavaOnevisionConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py
index 87d96ca24ffdb4..47fd3a3a72b86f 100644
--- a/src/transformers/models/video_llava/configuration_video_llava.py
+++ b/src/transformers/models/video_llava/configuration_video_llava.py
@@ -55,6 +55,8 @@ class VideoLlavaConfig(PretrainedConfig):
             Sequence length of one image embedding.
         video_seq_length (`int`, *optional*, defaults to 2056):
             Sequence length of one video embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
 
     Example:
 
@@ -92,6 +94,7 @@ def __init__(
         vision_feature_layer=-2,
         image_seq_length=256,
         video_seq_length=2056,
+        multimodal_projector_bias=True,
         **kwargs,
     ):
         self.ignore_index = ignore_index
diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
index 30adcb6ab5c089..88065687b03fd3 100644
--- a/src/transformers/models/video_llava/modeling_video_llava.py
+++ b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -88,10 +88,13 @@ class VideoLlavaCausalLMOutputWithPast(ModelOutput):
 class VideoLlavaMultiModalProjector(nn.Module):
     def __init__(self, config: VideoLlavaConfig):
         super().__init__()
-
-        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
         self.act = ACT2FN[config.projector_hidden_act]
-        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
 
     def forward(self, image_features):
         hidden_states = self.linear_1(image_features)
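
Note (not part of the patch): a minimal sketch of what the change enables, assuming the
`multimodal_projector_bias` field is already accepted by the config classes (the config-side
change this commit propagates). Setting it to `False` should drop the bias terms from both
projector layers; shown here for LLaVA-NeXT, the same applies to the other models touched above.

    # Hypothetical sanity check, not included in this commit.
    from transformers import LlavaNextConfig
    from transformers.models.llava_next.modeling_llava_next import LlavaNextMultiModalProjector

    config = LlavaNextConfig(multimodal_projector_bias=False)
    projector = LlavaNextMultiModalProjector(config)
    assert projector.linear_1.bias is None  # nn.Linear(..., bias=False) leaves .bias as None
    assert projector.linear_2.bias is None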