From bdc243deb4feb6bd72f3a39bc68cf7943ec92465 Mon Sep 17 00:00:00 2001
From: Nilesh Kokane
Date: Thu, 25 Jan 2024 21:35:05 +0530
Subject: [PATCH 1/6] Fixed nll with label_smoothing to nll

---
 src/transformers/models/blip/modeling_blip_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py
index 353c0f486a5629..1ac95ea17054b2 100644
--- a/src/transformers/models/blip/modeling_blip_text.py
+++ b/src/transformers/models/blip/modeling_blip_text.py
@@ -889,7 +889,7 @@ def forward(
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
-            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.0)
             lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
             if reduction == "none":
                 lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

From 90d2459e5d782eb91fe2f65519d12d690740b92f Mon Sep 17 00:00:00 2001
From: Nilesh Kokane
Date: Tue, 6 Feb 2024 15:12:36 +0530
Subject: [PATCH 2/6] Resolved conflict by rebase

---
 src/transformers/models/blip/modeling_tf_blip_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py
index 19d8bc9b6ecfa0..c30c72fe50fa4b 100644
--- a/src/transformers/models/blip/modeling_tf_blip_text.py
+++ b/src/transformers/models/blip/modeling_tf_blip_text.py
@@ -1063,7 +1063,7 @@ def call(
             # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
             # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
             one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
-            loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
+            loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.0, reduction="none")
             masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
             lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
             lm_loss *= masked_positions
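For context on the change above: PyTorch's `CrossEntropyLoss` folds the smoothing term into the value it returns, so a loss computed with `label_smoothing=0.1` is a smoothed cross-entropy, not the negative log-likelihood of the labels. A minimal standalone sketch of the difference (toy shapes, illustrative only; not code from this series):

    import torch
    from torch.nn import CrossEntropyLoss

    # Toy logits for two token positions over a 5-token vocabulary.
    logits = torch.randn(2, 5)
    labels = torch.tensor([1, 3])

    nll = CrossEntropyLoss(label_smoothing=0.0)(logits, labels)
    smoothed = CrossEntropyLoss(label_smoothing=0.1)(logits, labels)

    # The smoothed value mixes in a uniform-target term, so metrics derived
    # from it (e.g. perplexity) are biased relative to the true NLL.
    print(nll.item(), smoothed.item())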
From ad2b756f81d49ab84281d0e330b2784054451aaa Mon Sep 17 00:00:00 2001
From: Nilesh Kokane
Date: Thu, 25 Jan 2024 21:35:05 +0530
Subject: [PATCH 3/6] Fixed nll with label_smoothing to nll

---
 src/transformers/models/blip/modeling_blip_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py
index f9ae08b667e3f5..23621a619edd01 100644
--- a/src/transformers/models/blip/modeling_blip_text.py
+++ b/src/transformers/models/blip/modeling_blip_text.py
@@ -893,7 +893,7 @@ def forward(
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
-            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.0)
             lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
             if reduction == "none":
                 lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

From faa05b9140f781a1b69cb0d4904cb25369786c6e Mon Sep 17 00:00:00 2001
From: Nilesh Kokane
Date: Tue, 6 Feb 2024 15:12:36 +0530
Subject: [PATCH 4/6] Resolved conflict by rebase

---
 src/transformers/models/blip/modeling_tf_blip_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py
index 19d8bc9b6ecfa0..c30c72fe50fa4b 100644
--- a/src/transformers/models/blip/modeling_tf_blip_text.py
+++ b/src/transformers/models/blip/modeling_tf_blip_text.py
@@ -1063,7 +1063,7 @@ def call(
            # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
            # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
            one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
-            loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
+            loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.0, reduction="none")
            masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
            lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
            lm_loss *= masked_positions
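The config docstrings added in the next patch describe smoothing as mixing the ground truth with a uniform distribution. Written out (the standard definition from the cited Szegedy et al. paper, not text taken from the patch), for vocabulary size K, true token y, smoothing factor epsilon, and predicted probabilities p_k:

    % Smoothed target distribution over classes k = 1..K:
    y'_k = (1 - \epsilon)\,\mathbf{1}[k = y] + \frac{\epsilon}{K}
    % Resulting cross-entropy loss:
    \mathcal{L} = -(1 - \epsilon)\log p_y - \frac{\epsilon}{K}\sum_{k=1}^{K}\log p_k

With epsilon = 0.0 this reduces to the plain NLL, which is what patches 1-4 restore as the default.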
From 03f72e68a4d3561a1c3f97ef8c2b2b37cd63e836 Mon Sep 17 00:00:00 2001
From: Nilesh Kokane
Date: Sat, 17 Feb 2024 11:50:51 +0530
Subject: [PATCH 5/6] Added label_smoothing to config file

---
 src/transformers/models/blip/configuration_blip.py | 12 ++++++++++++
 src/transformers/models/blip/modeling_blip_text.py |  3 ++-
 .../models/blip/modeling_tf_blip_text.py           |  5 ++++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py
index 0b3dfb4a121c97..976da3b1837f12 100644
--- a/src/transformers/models/blip/configuration_blip.py
+++ b/src/transformers/models/blip/configuration_blip.py
@@ -94,6 +94,10 @@ class BlipTextConfig(PretrainedConfig):
             Whether the model is used as a decoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
+        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
+            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+            become a mixture of the original ground truth and a uniform distribution as described in
+            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.

     Example:

@@ -133,6 +137,7 @@ def __init__(
         sep_token_id=102,
         is_decoder=True,
         use_cache=True,
+        label_smoothing=0.0,
         **kwargs,
     ):
         super().__init__(
@@ -158,6 +163,7 @@ def __init__(
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
         self.is_decoder = is_decoder
         self.use_cache = use_cache
+        self.label_smoothing = label_smoothing

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
@@ -298,6 +304,10 @@ class BlipConfig(PretrainedConfig):
             The inital value of the *logit_scale* paramter. Default is used as per the original BLIP implementation.
         image_text_hidden_size (`int`, *optional*, defaults to 256):
             Dimentionality of the hidden state of the image-text fusion layer.
+        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
+            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+            become a mixture of the original ground truth and a uniform distribution as described in
+            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
         kwargs (*optional*):
             Dictionary of keyword arguments.

@@ -333,6 +343,7 @@ def __init__(
         projection_dim=512,
         logit_scale_init_value=2.6592,
         image_text_hidden_size=256,
+        label_smoothing=0.0,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -355,6 +366,7 @@ def __init__(
         self.initializer_factor = 1.0
         self.initializer_range = 0.02
         self.image_text_hidden_size = image_text_hidden_size
+        self.label_smoothing = label_smoothing

     @classmethod
     def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs):

diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py
index 23621a619edd01..fa9b1e0e4fc476 100644
--- a/src/transformers/models/blip/modeling_blip_text.py
+++ b/src/transformers/models/blip/modeling_blip_text.py
@@ -813,6 +813,7 @@ def __init__(self, config):

         self.bert = BlipTextModel(config, add_pooling_layer=False)
         self.cls = BlipTextOnlyMLMHead(config)
+        self.label_smoothing = config.label_smoothing

     def get_output_embeddings(self):
         return self.cls.predictions.decoder
@@ -893,7 +894,7 @@ def forward(
             # we are doing next-token prediction; shift prediction scores and input ids by one
             shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
             labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
-            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.0)
+            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
             lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
             if reduction == "none":
                 lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py
index c30c72fe50fa4b..b605a25eeb4bcf 100644
--- a/src/transformers/models/blip/modeling_tf_blip_text.py
+++ b/src/transformers/models/blip/modeling_tf_blip_text.py
@@ -976,6 +976,7 @@ def __init__(self, config, **kwargs):

         self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert")
         self.cls = TFBlipTextOnlyMLMHead(config, name="cls")
+        self.label_smoothing = config.label_smoothing

     def get_output_embeddings(self):
         return self.cls.predictions.decoder
@@ -1063,7 +1064,9 @@ def call(
             # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
             # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
             one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
-            loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.0, reduction="none")
+            loss_fct = keras.losses.CategoricalCrossentropy(
+                from_logits=True, label_smoothing=self.label_smoothing, reduction="none"
+            )
             masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
             lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
             lm_loss *= masked_positions
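With the new config field in place, smoothing becomes opt-in rather than hard-coded. A hedged usage sketch (illustrative only: the diff above does not show whether the top-level `BlipConfig` value propagates into a nested text config, so both are set explicitly here):

    from transformers import BlipConfig

    # Restore the previous 0.1 smoothing for the text decoder by setting the
    # new field on both the composite config and the nested text config.
    config = BlipConfig(
        label_smoothing=0.1,
        text_config={"label_smoothing": 0.1},
    )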
From 527dc949bcff9b646c9189b8cda14453aace0377 Mon Sep 17 00:00:00 2001
From: Nilesh Kokane
Date: Mon, 19 Feb 2024 21:40:16 +0530
Subject: [PATCH 6/6] Fixed nits

---
 src/transformers/models/blip/configuration_blip.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py
index cc9cd348528d92..42e35958ced3cf 100644
--- a/src/transformers/models/blip/configuration_blip.py
+++ b/src/transformers/models/blip/configuration_blip.py
@@ -94,8 +94,8 @@ class BlipTextConfig(PretrainedConfig):
             Whether the model is used as a decoder.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        label_smoothing (float, optional): A float in [0.0, 1.0]. Specifies the amount
-            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+        label_smoothing (float, *optional*):
+            A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets
             become a mixture of the original ground truth and a uniform distribution as described in
             `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.

@@ -304,8 +304,8 @@ class BlipConfig(PretrainedConfig):
             The inital value of the *logit_scale* paramter. Default is used as per the original BLIP implementation.
         image_text_hidden_size (`int`, *optional*, defaults to 256):
             Dimentionality of the hidden state of the image-text fusion layer.
-        label_smoothing (float, optional, *optional*, defaults to 0.0): A float in [0.0, 1.0]. Specifies the amount
-            of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+        label_smoothing (float, optional, *optional*, defaults to 0.0):
+            A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets
             become a mixture of the original ground truth and a uniform distribution as described in
             `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__. Default: :math:`0.0`.
         kwargs (*optional*):
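A closing note on the TF path touched above: Keras's `CategoricalCrossentropy` only applies `label_smoothing` to dense targets, which is why the model one-hot encodes the labels and clamps the `-100` padding values with `relu` before masking them out. A standalone sketch of that pattern (toy shapes; not code from the patch):

    import tensorflow as tf

    labels = tf.constant([[5, 7, -100]])   # -100 marks positions to ignore
    logits = tf.random.normal((1, 3, 10))  # batch=1, seq=3, vocab=10

    # relu clamps -100 to 0 so tf.one_hot receives a valid index; the masked
    # positions are zeroed out after the per-token loss is computed.
    one_hot = tf.one_hot(tf.nn.relu(labels), depth=10, dtype=tf.float32)
    loss_fct = tf.keras.losses.CategoricalCrossentropy(
        from_logits=True, label_smoothing=0.0, reduction="none"
    )
    mask = tf.cast(tf.not_equal(labels, -100), tf.float32)
    per_token_loss = loss_fct(one_hot, logits) * mask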